Merge "Implement partial cache" from Tomasz and Piotr
"This series enables cache to keep partial partitions.
Reads no longer have to read whole partition from sstables
in order to cache the result.
The 10MB threshold for partition size in cache is lifted.
Known issues:
- There is no partial eviction yet; whole partitions are still evicted,
and partition snapshots held by active reads are not evictable at all
- Information about range continuity is not recorded if that
would require inserting a dummy entry, or if the previous entry
doesn't belong to the latest snapshot
- Cache update after memtable flush happening concurrently with reads
may inhibit those reads' ability to populate the cache (new issue)
- Cache update from flushed memtables has partition granularity,
so may cause latency problems with large partitions
- Schema is still tracked per-partition, so after schema changes
reads may incur high latency due to the whole partition needing
to be converted atomically
- Range tombstones are repeated in the stream for every range between
cache entries they cover (new issue)
- Populating scans for both small and large partitions (perf_fast_forward)
experienced a 40% reduction in throughput (CPU bound)
How was this tested:
- test.py --mode release
- row_cache_stress_test -c1 -m1G
- perf_fast_forward: passes, except for the test case checking range continuity
population, which would require inserting a dummy entry (mentioned above)
- perf_simple_query (-c1 -m1G --duration 32):
before: 90k [ops/s] stdev: 4k [ops/s]
after: 94k [ops/s] stdev: 2k [ops/s]"
* tag 'tgrabiec/introduce-partial-cache-v8' of github.com:cloudius-systems/seastar-dev: (130 commits)
tests: row_cache: Add test_tombstone_merging_in_partial_partition test case
tests: Introduce row_cache_stress_test
utils: Add helpers for dealing with nonwrapping_range<int>
tests: simple_schema: Allow passing the tombstone to make_range_tombstone()
tests: simple_schema: Accept value by reference
tests: simple_schema: Make add_row() accept optional timestamp
tests: simple_schema: Make new_timestamp() public
tests: simple_schema: Introduce make_ckeys()
tests: simple_schema: Introduce get_value(const clustered_row&) helper
tests: simple_schema: Fix comment
tests: simple_schema: Add missing include
row_cache: Introduce evict()
tests: Add cache_streamed_mutation_test
tests: mutation_assertions: Allow expecting fragments
mutation_fragment: Implement equality check
tests: row_cache: Add test for population of random partitions
tests: row_cache: Add test for partition tombstone population
tests: row_cache: Test reading randomly populated partition
tests: row_cache: Add test_single_partition_update()
tests: row_cache: Add test_scan_with_partial_partitions
...
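Before the diff itself, a rough sketch of the continuity model this series introduces. The names below are invented for the illustration; the real flags are the is_continuous/is_dummy bits on rows_entry, visible in the mutation_partition hunks further down. The idea: each cached row records whether the interval between it and the preceding entry is fully represented in cache, and a dummy entry carries that flag for intervals that contain no real row:

// Illustration only -- toy types, not the Scylla ones. Each cache entry says
// whether the gap back to the previous entry is fully cached, so a read can
// decide whether it may be served from cache alone or must go to sstables.
#include <cassert>
#include <map>
#include <string>

struct cached_row {
    std::string value;
    bool continuous = false; // interval back to the previous key is fully cached
    bool dummy = false;      // sentinel carrying only continuity information
};

bool can_serve_from_cache(const std::map<int, cached_row>& rows, int a, int b) {
    // Every entry with key in (a, b] must be continuous, and some entry at or
    // past b must prove the tail of the range. The real cache keeps a trailing
    // dummy entry (see ensure_last_dummy() in the mutation_partition.cc hunk)
    // so the range after the last real row also has a continuity flag.
    auto it = rows.upper_bound(a);
    for (; it != rows.end() && it->first < b; ++it) {
        if (!it->second.continuous) {
            return false;
        }
    }
    return it != rows.end() && it->second.continuous;
}

int main() {
    std::map<int, cached_row> rows;
    rows[1] = {"v1", true};       // (-inf, 1] fully cached
    rows[5] = {"v5", false};      // hole somewhere in (1, 5)
    rows[7] = {"v7", true};       // (5, 7] fully cached
    rows[999] = {"", true, true}; // trailing dummy: (7, 999] cached, no row there
    assert(can_serve_from_cache(rows, 5, 7));   // continuous all the way
    assert(!can_serve_from_cache(rows, 1, 7));  // hole before 5
    assert(can_serve_from_cache(rows, 6, 20));  // tail covered by the dummy
}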
cache_streamed_mutation.hh (new file)
@@ -0,0 +1,482 @@
/*
 * Copyright (C) 2017 ScyllaDB
 */

/*
 * This file is part of Scylla.
 *
 * Scylla is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Scylla is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Scylla. If not, see <http://www.gnu.org/licenses/>.
 */

#pragma once

#include <vector>
#include "row_cache.hh"
#include "mutation_reader.hh"
#include "streamed_mutation.hh"
#include "partition_version.hh"
#include "utils/logalloc.hh"
#include "query-request.hh"
#include "partition_snapshot_reader.hh"
#include "partition_snapshot_row_cursor.hh"
#include "read_context.hh"

namespace cache {

class lsa_manager {
    row_cache& _cache;
public:
    lsa_manager(row_cache& cache) : _cache(cache) { }
    template<typename Func>
    decltype(auto) run_in_read_section(const Func& func) {
        return _cache._read_section(_cache._tracker.region(), [&func] () {
            return with_linearized_managed_bytes([&func] () {
                return func();
            });
        });
    }
    template<typename Func>
    decltype(auto) run_in_update_section(const Func& func) {
        return _cache._update_section(_cache._tracker.region(), [&func] () {
            return with_linearized_managed_bytes([&func] () {
                return func();
            });
        });
    }
    template<typename Func>
    void run_in_update_section_with_allocator(Func&& func) {
        return _cache._update_section(_cache._tracker.region(), [this, &func] () {
            return with_linearized_managed_bytes([this, &func] () {
                return with_allocator(_cache._tracker.region().allocator(), [this, &func] () mutable {
                    return func();
                });
            });
        });
    }
    logalloc::region& region() { return _cache._tracker.region(); }
    logalloc::allocating_section& read_section() { return _cache._read_section; }
};

class cache_streamed_mutation final : public streamed_mutation::impl {
    lw_shared_ptr<partition_snapshot> _snp;
    position_in_partition::tri_compare _position_cmp;

    query::clustering_key_filter_ranges _ck_ranges;
    query::clustering_row_ranges::const_iterator _ck_ranges_curr;
    query::clustering_row_ranges::const_iterator _ck_ranges_end;

    lsa_manager _lsa_manager;

    stdx::optional<clustering_key> _last_row_key;

    // We need to be prepared that we may get overlapping and out of order
    // range tombstones. We must emit fragments with strictly monotonic positions,
    // so we can't just trim such tombstones to the position of the last fragment.
    // To solve that, range tombstones are accumulated first in a range_tombstone_stream
    // and emitted once we have a fragment with a larger position.
    range_tombstone_stream _tombstones;

    // Holds the lower bound of a position range which hasn't been processed yet.
    // Only fragments with positions < _lower_bound have been emitted.
    position_in_partition _lower_bound;
    position_in_partition_view _upper_bound;

    bool _static_row_done = false;
    bool _reading_underlying = false;
    lw_shared_ptr<read_context> _read_context;
    partition_snapshot_row_cursor _next_row;
    bool _next_row_in_range = false;

    future<> do_fill_buffer();
    future<> copy_from_cache_to_buffer();
    future<> process_static_row();
    void move_to_end();
    future<> move_to_next_range();
    future<> move_to_current_range();
    future<> move_to_next_entry();
    // Emits all delayed range tombstones with positions smaller than upper_bound.
    void drain_tombstones(position_in_partition_view upper_bound);
    // Emits all delayed range tombstones.
    void drain_tombstones();
    void add_to_buffer(const partition_snapshot_row_cursor&);
    void add_to_buffer(clustering_row&&);
    void add_to_buffer(range_tombstone&&);
    void add_to_buffer(mutation_fragment&&);
    future<> read_from_underlying();
    future<> start_reading_from_underlying();
    bool after_current_range(position_in_partition_view position);
    bool can_populate() const;
    void maybe_update_continuity();
    void maybe_add_to_cache(const mutation_fragment& mf);
    void maybe_add_to_cache(const clustering_row& cr);
    void maybe_add_to_cache(const range_tombstone& rt);
    void maybe_add_to_cache(const static_row& sr);
    void maybe_set_static_row_continuous();
public:
    cache_streamed_mutation(schema_ptr s,
                            dht::decorated_key dk,
                            query::clustering_key_filter_ranges&& crr,
                            lw_shared_ptr<read_context> ctx,
                            lw_shared_ptr<partition_snapshot> snp,
                            row_cache& cache)
        : streamed_mutation::impl(std::move(s), dk, snp->partition_tombstone())
        , _snp(std::move(snp))
        , _position_cmp(*_schema)
        , _ck_ranges(std::move(crr))
        , _ck_ranges_curr(_ck_ranges.begin())
        , _ck_ranges_end(_ck_ranges.end())
        , _lsa_manager(cache)
        , _tombstones(*_schema)
        , _lower_bound(position_in_partition::before_all_clustered_rows())
        , _upper_bound(position_in_partition_view::before_all_clustered_rows())
        , _read_context(std::move(ctx))
        , _next_row(*_schema, cache._tracker.region(), *_snp)
    { }
    cache_streamed_mutation(const cache_streamed_mutation&) = delete;
    cache_streamed_mutation(cache_streamed_mutation&&) = delete;
    virtual future<> fill_buffer() override;
    virtual ~cache_streamed_mutation() {
        maybe_merge_versions(_snp, _lsa_manager.region(), _lsa_manager.read_section());
    }
};

inline
future<> cache_streamed_mutation::process_static_row() {
    if (_snp->version()->partition().static_row_continuous()) {
        row sr = _snp->static_row();
        if (!sr.empty()) {
            push_mutation_fragment(mutation_fragment(static_row(std::move(sr))));
        }
        return make_ready_future<>();
    } else {
        return _read_context->get_next_fragment().then([this] (mutation_fragment_opt&& sr) {
            if (sr) {
                assert(sr->is_static_row());
                maybe_add_to_cache(sr->as_static_row());
                push_mutation_fragment(std::move(*sr));
            }
            maybe_set_static_row_continuous();
        });
    }
}

inline
future<> cache_streamed_mutation::fill_buffer() {
    if (!_static_row_done) {
        _static_row_done = true;
        return process_static_row().then([this] {
            return _lsa_manager.run_in_read_section([this] {
                return move_to_current_range();
            }).then([this] {
                return fill_buffer();
            });
        });
    }
    return do_until([this] { return _end_of_stream || is_buffer_full(); }, [this] {
        return do_fill_buffer();
    });
}

inline
future<> cache_streamed_mutation::do_fill_buffer() {
    if (_reading_underlying) {
        return read_from_underlying();
    }
    return _lsa_manager.run_in_read_section([this] {
        auto same_pos = _next_row.maybe_refresh();
        // FIXME: If continuity changed anywhere between _lower_bound and _next_row.position()
        // we need to redo the lookup with _lower_bound. There is no eviction yet, so not yet a problem.
        assert(same_pos);
        while (!is_buffer_full() && !_end_of_stream && !_reading_underlying) {
            future<> f = copy_from_cache_to_buffer();
            if (!f.available() || need_preempt()) {
                return f;
            }
        }
        return make_ready_future<>();
    });
}

inline
future<> cache_streamed_mutation::read_from_underlying() {
    return do_until([this] { return !_reading_underlying || is_buffer_full(); }, [this] {
        return _read_context->get_next_fragment().then([this] (auto&& mfopt) {
            if (!mfopt) {
                _reading_underlying = false;
                return _lsa_manager.run_in_update_section([this] {
                    auto same_pos = _next_row.maybe_refresh();
                    assert(same_pos); // FIXME: handle eviction
                    if (_next_row_in_range) {
                        this->maybe_update_continuity();
                        this->add_to_buffer(_next_row);
                        return this->move_to_next_entry();
                    } else {
                        if (no_clustering_row_between(*_schema, _upper_bound, _next_row.position())) {
                            this->maybe_update_continuity();
                        } else {
                            // FIXME: Insert dummy entry at _upper_bound.
                        }
                        return this->move_to_next_range();
                    }
                });
            } else {
                this->maybe_add_to_cache(*mfopt);
                this->add_to_buffer(std::move(*mfopt));
                return make_ready_future<>();
            }
        });
    });
}

inline
void cache_streamed_mutation::maybe_update_continuity() {
    if (can_populate() && _next_row.is_in_latest_version()) {
        if (_last_row_key) {
            if (_next_row.previous_row_in_latest_version_has_key(*_last_row_key)) {
                _next_row.set_continuous(true);
            }
        } else if (!_ck_ranges_curr->start()) {
            _next_row.set_continuous(true);
        }
    }
}

inline
void cache_streamed_mutation::maybe_add_to_cache(const mutation_fragment& mf) {
    if (mf.is_range_tombstone()) {
        maybe_add_to_cache(mf.as_range_tombstone());
    } else {
        assert(mf.is_clustering_row());
        const clustering_row& cr = mf.as_clustering_row();
        maybe_add_to_cache(cr);
    }
}

inline
void cache_streamed_mutation::maybe_add_to_cache(const clustering_row& cr) {
    if (!can_populate()) {
        return;
    }
    _lsa_manager.run_in_update_section_with_allocator([this, &cr] {
        mutation_partition& mp = _snp->version()->partition();
        rows_entry::compare less(*_schema);

        // FIXME: If _next_row is up to date, but latest version doesn't have iterator in
        // current row (could be far away, so we'd do this often), then this will do
        // the lookup in mp. This is not necessary, because _next_row has iterators for
        // next rows in each version, even if they're not part of the current row.
        // They're currently buried in the heap, but you could keep a vector of
        // iterators per each version in addition to the heap.
        auto new_entry = alloc_strategy_unique_ptr<rows_entry>(
            current_allocator().construct<rows_entry>(cr.key(), cr.tomb(), cr.marker(), cr.cells()));
        new_entry->set_continuous(false);
        auto it = _next_row.has_up_to_date_row_from_latest_version()
                  ? _next_row.get_iterator_in_latest_version() : mp.clustered_rows().lower_bound(cr.key(), less);
        auto insert_result = mp.clustered_rows().insert_check(it, *new_entry, less);
        if (insert_result.second) {
            new_entry.release();
        }
        it = insert_result.first;

        rows_entry& e = *it;
        if (_last_row_key) {
            if (it == mp.clustered_rows().begin()) {
                // FIXME: check whether entry for _last_row_key is in older versions and if so set
                // continuity to true.
            } else {
                auto prev_it = it;
                --prev_it;
                clustering_key_prefix::tri_compare tri_comp(*_schema);
                if (tri_comp(*_last_row_key, prev_it->key()) == 0) {
                    e.set_continuous(true);
                }
            }
        } else if (!_ck_ranges_curr->start()) {
            e.set_continuous(true);
        } else {
            // FIXME: Insert dummy entry at _ck_ranges_curr->start()
        }
    });
}

inline
bool cache_streamed_mutation::after_current_range(position_in_partition_view p) {
    return _position_cmp(p, _upper_bound) >= 0;
}

inline
future<> cache_streamed_mutation::start_reading_from_underlying() {
    _reading_underlying = true;
    auto end = _next_row_in_range ? position_in_partition(_next_row.position())
                                  : position_in_partition(_upper_bound);
    return _read_context->fast_forward_to(position_range{_lower_bound, std::move(end)});
}

inline
future<> cache_streamed_mutation::copy_from_cache_to_buffer() {
    position_in_partition_view next_lower_bound = _next_row.dummy() ? _next_row.position() : position_in_partition_view::after_key(_next_row.key());
    for (auto&& rts : _snp->range_tombstones(*_schema, _lower_bound, _next_row_in_range ? next_lower_bound : _upper_bound)) {
        add_to_buffer(std::move(rts));
        if (is_buffer_full()) {
            return make_ready_future<>();
        }
    }
    if (_next_row_in_range) {
        add_to_buffer(_next_row);
        return move_to_next_entry();
    } else {
        return move_to_next_range();
    }
}

inline
void cache_streamed_mutation::move_to_end() {
    drain_tombstones();
    _end_of_stream = true;
}

inline
future<> cache_streamed_mutation::move_to_next_range() {
    ++_ck_ranges_curr;
    if (_ck_ranges_curr == _ck_ranges_end) {
        move_to_end();
        return make_ready_future<>();
    } else {
        return move_to_current_range();
    }
}

inline
future<> cache_streamed_mutation::move_to_current_range() {
    _last_row_key = std::experimental::nullopt;
    _lower_bound = position_in_partition::for_range_start(*_ck_ranges_curr);
    _upper_bound = position_in_partition_view::for_range_end(*_ck_ranges_curr);
    auto complete_until_next = _next_row.advance_to(_lower_bound) || _next_row.continuous();
    _next_row_in_range = !after_current_range(_next_row.position());
    if (!complete_until_next) {
        return start_reading_from_underlying();
    }
    return make_ready_future<>();
}

// _next_row must be inside the range.
inline
future<> cache_streamed_mutation::move_to_next_entry() {
    if (no_clustering_row_between(*_schema, _next_row.position(), _upper_bound)) {
        return move_to_next_range();
    } else {
        if (!_next_row.next()) {
            move_to_end();
            return make_ready_future<>();
        }
        _next_row_in_range = !after_current_range(_next_row.position());
        if (!_next_row.continuous()) {
            return start_reading_from_underlying();
        }
        return make_ready_future<>();
    }
}

inline
void cache_streamed_mutation::drain_tombstones(position_in_partition_view pos) {
    while (auto mfo = _tombstones.get_next(pos)) {
        push_mutation_fragment(std::move(*mfo));
    }
}

inline
void cache_streamed_mutation::drain_tombstones() {
    while (auto mfo = _tombstones.get_next()) {
        push_mutation_fragment(std::move(*mfo));
    }
}

inline
void cache_streamed_mutation::add_to_buffer(mutation_fragment&& mf) {
    if (mf.is_clustering_row()) {
        add_to_buffer(std::move(mf).as_clustering_row());
    } else {
        assert(mf.is_range_tombstone());
        add_to_buffer(std::move(mf).as_range_tombstone());
    }
}

inline
void cache_streamed_mutation::add_to_buffer(const partition_snapshot_row_cursor& row) {
    if (!row.dummy()) {
        add_to_buffer(row.row());
    }
}

inline
void cache_streamed_mutation::add_to_buffer(clustering_row&& row) {
    drain_tombstones(row.position());
    _last_row_key = row.key();
    _lower_bound = position_in_partition::after_key(row.key());
    push_mutation_fragment(std::move(row));
}

inline
void cache_streamed_mutation::add_to_buffer(range_tombstone&& rt) {
    // This guarantees that rt starts after any emitted clustering_row
    if (!rt.trim_front(*_schema, _lower_bound)) {
        return;
    }
    _lower_bound = position_in_partition(rt.position());
    _tombstones.apply(std::move(rt));
    drain_tombstones(_lower_bound);
}

inline
void cache_streamed_mutation::maybe_add_to_cache(const range_tombstone& rt) {
    if (can_populate()) {
        _lsa_manager.run_in_update_section_with_allocator([&] {
            _snp->version()->partition().apply_row_tombstone(*_schema, rt);
        });
    }
}

inline
void cache_streamed_mutation::maybe_add_to_cache(const static_row& sr) {
    if (can_populate()) {
        _lsa_manager.run_in_update_section_with_allocator([&] {
            _snp->version()->partition().static_row().apply(*_schema, column_kind::static_column, sr.cells());
        });
    }
}

inline
void cache_streamed_mutation::maybe_set_static_row_continuous() {
    if (can_populate()) {
        _snp->version()->partition().set_static_row_continuous(true);
    }
}

inline
bool cache_streamed_mutation::can_populate() const {
    return _snp->at_latest_version() && _read_context->cache().phase_of(_read_context->key()) == _read_context->phase();
}

} // namespace cache

inline streamed_mutation make_cache_streamed_mutation(schema_ptr s,
                                                      dht::decorated_key dk,
                                                      query::clustering_key_filter_ranges crr,
                                                      row_cache& cache,
                                                      lw_shared_ptr<cache::read_context> ctx,
                                                      lw_shared_ptr<partition_snapshot> snp)
{
    return make_streamed_mutation<cache::cache_streamed_mutation>(
        std::move(s), std::move(dk), std::move(crr), std::move(ctx), std::move(snp), cache);
}
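The comment in cache_streamed_mutation above describes buffering out-of-order range tombstones and draining them only once a fragment with a larger position is reached. A self-contained toy of that reordering trick (simplified, hypothetical types -- the real code uses range_tombstone_stream and schema-aware position comparators):

// Illustration only: tombstones can arrive overlapping and out of order, but
// the output stream must have monotonically non-decreasing positions, so they
// are parked in a min-heap and drained up to the position of the next row.
#include <iostream>
#include <queue>
#include <vector>

struct tombstone { int start, end; };
struct by_start {
    bool operator()(const tombstone& a, const tombstone& b) const {
        return a.start > b.start; // min-heap on start position
    }
};

int main() {
    std::priority_queue<tombstone, std::vector<tombstone>, by_start> pending;
    // Arrives out of order, as merged reads from several sources may produce.
    for (auto t : { tombstone{5, 9}, tombstone{1, 3}, tombstone{2, 7} }) {
        pending.push(t);
    }
    int next_row_position = 6;
    // The analogue of drain_tombstones(pos): emit everything that must
    // precede the row, in sorted order, before emitting the row itself.
    while (!pending.empty() && pending.top().start < next_row_position) {
        auto t = pending.top();
        pending.pop();
        std::cout << "tombstone [" << t.start << ", " << t.end << ")\n";
    }
    std::cout << "row @" << next_row_position << "\n"; // positions stay monotonic
}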
@@ -54,8 +54,8 @@ static inline bound_kind flip_bound_kind(bound_kind bk)
 }
 
 class bound_view {
-    const static thread_local clustering_key empty_prefix;
 public:
+    const static thread_local clustering_key empty_prefix;
     const clustering_key_prefix& prefix;
     bound_kind kind;
     bound_view(const clustering_key_prefix& prefix, bound_kind kind)
@@ -184,6 +184,8 @@ scylla_tests = [
     'tests/perf/perf_cql_parser',
     'tests/perf/perf_simple_query',
     'tests/perf/perf_fast_forward',
+    'tests/cache_streamed_mutation_test',
+    'tests/row_cache_stress_test',
     'tests/memory_footprint',
     'tests/perf/perf_sstable',
     'tests/cql_query_test',

@@ -625,6 +627,7 @@ tests_not_using_seastar_test_framework = set([
     'tests/message',
     'tests/perf/perf_simple_query',
    'tests/perf/perf_fast_forward',
+    'tests/row_cache_stress_test',
     'tests/memory_footprint',
     'tests/gossip',
     'tests/perf/perf_sstable',
@@ -22,6 +22,7 @@
 #pragma once
 
+#include "mutation_partition_view.hh"
 #include "mutation_partition.hh"
 #include "schema.hh"
 
 // Mutation partition visitor which applies visited data into

@@ -37,12 +38,12 @@ private:
     static bool is_compatible(const column_definition& new_def, const data_type& old_type, column_kind kind) {
         return ::is_compatible(new_def.kind, kind) && new_def.type->is_value_compatible_with(*old_type);
     }
-    void accept_cell(row& dst, column_kind kind, const column_definition& new_def, const data_type& old_type, atomic_cell_view cell) {
+    static void accept_cell(row& dst, column_kind kind, const column_definition& new_def, const data_type& old_type, atomic_cell_view cell) {
         if (is_compatible(new_def, old_type, kind) && cell.timestamp() > new_def.dropped_at()) {
             dst.apply(new_def, atomic_cell_or_collection(cell));
         }
     }
-    void accept_cell(row& dst, column_kind kind, const column_definition& new_def, const data_type& old_type, collection_mutation_view cell) {
+    static void accept_cell(row& dst, column_kind kind, const column_definition& new_def, const data_type& old_type, collection_mutation_view cell) {
         if (!is_compatible(new_def, old_type, kind)) {
             return;
         }

@@ -94,8 +95,8 @@ public:
         _p.apply_row_tombstone(_p_schema, rt);
     }
 
-    virtual void accept_row(clustering_key_view key, const row_tombstone& deleted_at, const row_marker& rm) override {
-        deletable_row& r = _p.clustered_row(_p_schema, key);
+    virtual void accept_row(position_in_partition_view key, const row_tombstone& deleted_at, const row_marker& rm, is_dummy dummy, is_continuous continuous) override {
+        deletable_row& r = _p.clustered_row(_p_schema, key, dummy, continuous);
         r.apply(rm);
         r.apply(deleted_at);
         _current_row = &r;

@@ -116,4 +117,14 @@ public:
             accept_cell(_current_row->cells(), column_kind::regular_column, *def, col.type(), collection);
         }
     }
+
+    // Appends the cell to dst upgrading it to the new schema.
+    // Cells must have monotonic names.
+    static void append_cell(row& dst, column_kind kind, const column_definition& new_def, const data_type& old_type, const atomic_cell_or_collection& cell) {
+        if (new_def.is_atomic()) {
+            accept_cell(dst, kind, new_def, old_type, cell.as_atomic_cell());
+        } else {
+            accept_cell(dst, kind, new_def, old_type, cell.as_collection_mutation());
+        }
+    }
 };

@@ -233,7 +233,7 @@ void batch_statement::verify_batch_size(const std::vector<mutation>& mutations)
             size += v.data.size();
         }
         void accept_row_tombstone(const range_tombstone&) override {}
-        void accept_row(clustering_key_view, const row_tombstone&, const row_marker&) override {}
+        void accept_row(position_in_partition_view, const row_tombstone&, const row_marker&, is_dummy, is_continuous) override {}
         void accept_row_cell(column_id, atomic_cell_view v) override {
             size += v.value().size();
         }
database.cc
@@ -144,7 +144,7 @@ column_family::column_family(schema_ptr schema, config config, db::commitlog* cl
     , _streaming_memtables(_config.enable_disk_writes ? make_streaming_memtable_list() : make_memory_only_memtable_list())
     , _compaction_strategy(make_compaction_strategy(_schema->compaction_strategy(), _schema->compaction_strategy_options()))
     , _sstables(make_lw_shared(_compaction_strategy.make_sstable_set(_schema)))
-    , _cache(_schema, sstables_as_mutation_source(), global_cache_tracker(), _config.max_cached_partition_size_in_bytes)
+    , _cache(_schema, sstables_as_snapshot_source(), global_cache_tracker())
     , _commitlog(cl)
     , _compaction_manager(compaction_manager)
     , _flush_queue(std::make_unique<memtable_flush_queue>())

@@ -183,7 +183,24 @@ column_family::sstables_as_mutation_source() {
                            tracing::trace_state_ptr trace_state,
                            streamed_mutation::forwarding fwd,
                            mutation_reader::forwarding fwd_mr) {
-        return make_sstable_reader(std::move(s), r, slice, pc, std::move(trace_state), fwd, fwd_mr);
+        return make_sstable_reader(std::move(s), _sstables, r, slice, pc, std::move(trace_state), fwd, fwd_mr);
     });
 }
 
+snapshot_source
+column_family::sstables_as_snapshot_source() {
+    return snapshot_source([this] () {
+        // FIXME: Will keep sstables on disk until next memtable flush. Make compaction force cache refresh.
+        auto sst_set = _sstables;
+        return mutation_source([this, sst_set = std::move(sst_set)] (schema_ptr s,
+                const dht::partition_range& r,
+                const query::partition_slice& slice,
+                const io_priority_class& pc,
+                tracing::trace_state_ptr trace_state,
+                streamed_mutation::forwarding fwd,
+                mutation_reader::forwarding fwd_mr) {
+            return make_sstable_reader(std::move(s), sst_set, r, slice, pc, std::move(trace_state), fwd, fwd_mr);
+        });
+    });
+}
@@ -529,6 +546,7 @@ public:
 
 mutation_reader
 column_family::make_sstable_reader(schema_ptr s,
+                                   lw_shared_ptr<sstables::sstable_set> sstables,
                                    const dht::partition_range& pr,
                                    const query::partition_slice& slice,
                                    const io_priority_class& pc,

@@ -555,11 +573,11 @@ column_family::make_sstable_reader(schema_ptr s,
         if (dht::shard_of(pos.token()) != engine().cpu_id()) {
             return make_empty_reader(); // range doesn't belong to this shard
         }
-        return restrict_reader(make_mutation_reader<single_key_sstable_reader>(const_cast<column_family*>(this), std::move(s), _sstables,
+        return restrict_reader(make_mutation_reader<single_key_sstable_reader>(const_cast<column_family*>(this), std::move(s), std::move(sstables),
             _stats.estimated_sstable_per_read, pr, slice, pc, std::move(trace_state), fwd));
     } else {
         // range_sstable_reader is not movable so we need to wrap it
-        return restrict_reader(make_mutation_reader<range_sstable_reader>(std::move(s), _sstables, pr, slice, pc, std::move(trace_state), fwd, fwd_mr));
+        return restrict_reader(make_mutation_reader<range_sstable_reader>(std::move(s), std::move(sstables), pr, slice, pc, std::move(trace_state), fwd, fwd_mr));
     }
 }
@@ -643,7 +661,7 @@ column_family::make_reader(schema_ptr s,
     if (_config.enable_cache) {
         readers.emplace_back(_cache.make_reader(s, range, slice, pc, std::move(trace_state), fwd, fwd_mr));
     } else {
-        readers.emplace_back(make_sstable_reader(s, range, slice, pc, std::move(trace_state), fwd, fwd_mr));
+        readers.emplace_back(make_sstable_reader(s, _sstables, range, slice, pc, std::move(trace_state), fwd, fwd_mr));
     }
 
     return make_combined_reader(std::move(readers));

@@ -662,7 +680,7 @@ column_family::make_streaming_reader(schema_ptr s,
         readers.emplace_back(mt->make_reader(s, range, slice, pc, nullptr, streamed_mutation::forwarding::no, mutation_reader::forwarding::no));
     }
 
-    readers.emplace_back(make_sstable_reader(s, range, slice, pc, nullptr, streamed_mutation::forwarding::no, mutation_reader::forwarding::no));
+    readers.emplace_back(make_sstable_reader(s, _sstables, range, slice, pc, nullptr, streamed_mutation::forwarding::no, mutation_reader::forwarding::no));
 
     return make_combined_reader(std::move(readers));
 }

@@ -680,7 +698,7 @@ column_family::make_streaming_reader(schema_ptr s,
         for (auto&& mt : *_memtables) {
             readers.emplace_back(mt->make_reader(s, range, slice, pc, trace_state, fwd, fwd_mr));
         }
-        readers.emplace_back(make_sstable_reader(s, range, slice, pc, std::move(trace_state), fwd, fwd_mr));
+        readers.emplace_back(make_sstable_reader(s, _sstables, range, slice, pc, std::move(trace_state), fwd, fwd_mr));
         return make_combined_reader(std::move(readers));
     });
@@ -866,11 +884,6 @@ column_family::seal_active_streaming_memtable_immediate() {
     // If we ever need to, we'll keep them separate statistics, but we don't want to polute the
     // main stats about memtables with streaming memtables.
     //
-    // Second, we will not bother touching the cache after this flush. The current streaming code
-    // will invalidate the ranges it touches, so we won't do it twice. Even when that changes, the
-    // cache management code in here will have to differ from the main memtable's one. Please see
-    // the comment at flush_streaming_mutations() for details.
-    //
     // Lastly, we don't have any commitlog RP to update, and we don't need to deal manipulate the
     // memtable list, since this memtable was not available for reading up until this point.
     return write_memtable_to_sstable(*old, newtab, incremental_backups_enabled(), priority).then([this, newtab, old] {

@@ -878,7 +891,12 @@ column_family::seal_active_streaming_memtable_immediate() {
     }).then([this, old, newtab] () {
         add_sstable(newtab, {engine().cpu_id()});
         trigger_compaction();
-        return old->clear_gently();
+        // Cache synchronization must be started atomically with add_sstable()
+        if (_config.enable_cache) {
+            return _cache.update_invalidating(*old);
+        } else {
+            return old->clear_gently();
+        }
     }).handle_exception([old] (auto ep) {
         dblog.error("failed to write streamed sstable: {}", ep);
         return make_exception_future<>(ep);
@@ -1791,7 +1809,7 @@ future<> distributed_loader::load_new_sstables(distributed<database>& db, sstrin
             cf.trigger_compaction();
             // Drop entire cache for this column family because it may be populated
             // with stale data.
-            return cf.get_row_cache().clear();
+            return cf.get_row_cache().invalidate();
         });
     }).then([&db, ks, cf] () mutable {
         return smp::submit_to(0, [&db, ks = std::move(ks), cf = std::move(cf)] () mutable {

@@ -1824,6 +1842,7 @@ future<sstables::entry_descriptor> distributed_loader::probe_file(distributed<da
     return cf.open_sstable(std::move(info), sstdir, comps.generation, comps.version, comps.format).then([&cf] (sstables::shared_sstable sst) mutable {
         if (sst) {
             cf.load_sstable(sst);
+            return cf.get_row_cache().invalidate();
         }
         return make_ready_future<>();
     });

@@ -2566,7 +2585,6 @@ keyspace::make_column_family_config(const schema& s, const db::config& db_config
     cfg.streaming_read_concurrency_config = _config.streaming_read_concurrency_config;
     cfg.cf_stats = _config.cf_stats;
     cfg.enable_incremental_backups = _config.enable_incremental_backups;
-    cfg.max_cached_partition_size_in_bytes = db_config.max_cached_partition_size_in_kb() * 1024;
 
     return cfg;
 }
@@ -3797,28 +3815,26 @@ future<> column_family::flush_streaming_mutations(utils::UUID plan_id, dht::part
     // be to change seal_active_streaming_memtable_delayed to take a range parameter. However, we
     // need this code to go away as soon as we can (see FIXME above). So the double gate is a better
     // temporary counter measure.
-    return with_gate(_streaming_flush_gate, [this, plan_id, ranges = std::move(ranges)] {
-        return flush_streaming_big_mutations(plan_id).then([this] {
-            return _streaming_memtables->seal_active_memtable(memtable_list::flush_behavior::delayed);
-        }).finally([this] {
-            return _streaming_flush_phaser.advance_and_await();
-        }).finally([this, ranges = std::move(ranges)] {
-            if (!_config.enable_cache) {
-                return make_ready_future<>();
-            }
-            return do_with(std::move(ranges), [this] (auto& ranges) {
-                return parallel_for_each(ranges, [this](auto&& range) {
-                    return _cache.invalidate(range);
-                });
+    return with_gate(_streaming_flush_gate, [this, plan_id, ranges = std::move(ranges)] () mutable {
+        return flush_streaming_big_mutations(plan_id).then([this, ranges = std::move(ranges)] (auto sstables) mutable {
+            return _streaming_memtables->seal_active_memtable(memtable_list::flush_behavior::delayed).then([this] {
+                return _streaming_flush_phaser.advance_and_await();
+            }).then([this, sstables = std::move(sstables), ranges = std::move(ranges)] () mutable {
+                for (auto&& sst : sstables) {
+                    // seal_active_streaming_memtable_big() ensures sst is unshared.
+                    this->add_sstable(sst, {engine().cpu_id()});
+                }
+                this->trigger_compaction();
+                return _cache.invalidate(std::move(ranges));
             });
         });
     });
 }
 
-future<> column_family::flush_streaming_big_mutations(utils::UUID plan_id) {
+future<std::vector<sstables::shared_sstable>> column_family::flush_streaming_big_mutations(utils::UUID plan_id) {
     auto it = _streaming_memtables_big.find(plan_id);
     if (it == _streaming_memtables_big.end()) {
-        return make_ready_future<>();
+        return make_ready_future<std::vector<sstables::shared_sstable>>(std::vector<sstables::shared_sstable>());
     }
     auto entry = it->second;
     _streaming_memtables_big.erase(it);

@@ -3830,11 +3846,7 @@ future<> column_family::flush_streaming_big_mutations(utils::UUID plan_id) {
             return sst->open_data();
         });
     }).then([this, entry] {
-        for (auto&& sst : entry->sstables) {
-            // seal_active_streaming_memtable_big() ensures sst is unshared.
-            add_sstable(sst, {engine().cpu_id()});
-        }
-        trigger_compaction();
+        return std::move(entry->sstables);
     });
 });
 }

@@ -3862,7 +3874,7 @@ future<> column_family::clear() {
     _streaming_memtables->clear();
     _streaming_memtables->add_memtable();
     _streaming_memtables_big.clear();
-    return _cache.clear();
+    return _cache.invalidate();
 }
 
 // NOTE: does not need to be futurized, but might eventually, depending on

@@ -3888,7 +3900,7 @@ future<db::replay_position> column_family::discard_sstables(db_clock::time_point
 
     _sstables = std::move(pruned);
     dblog.debug("cleaning out row cache");
-    return _cache.clear().then([rp, remove = std::move(remove)] () mutable {
+    return _cache.invalidate().then([rp, remove = std::move(remove)] () mutable {
         return parallel_for_each(remove, [](sstables::shared_sstable s) {
             return sstables::delete_atomically({s});
         }).then([rp] {
database.hh
@@ -429,7 +429,6 @@ public:
     restricted_mutation_reader_config read_concurrency_config;
     restricted_mutation_reader_config streaming_read_concurrency_config;
     ::cf_stats* cf_stats = nullptr;
-    uint64_t max_cached_partition_size_in_bytes;
 };
 struct no_commitlog {};
 struct stats {

@@ -505,7 +504,7 @@ private:
     };
     std::unordered_map<utils::UUID, lw_shared_ptr<streaming_memtable_big>> _streaming_memtables_big;
 
-    future<> flush_streaming_big_mutations(utils::UUID plan_id);
+    future<std::vector<sstables::shared_sstable>> flush_streaming_big_mutations(utils::UUID plan_id);
     void apply_streaming_big_mutation(schema_ptr m_schema, utils::UUID plan_id, const frozen_mutation& m);
     future<> seal_active_streaming_memtable_big(streaming_memtable_big& smb);

@@ -575,7 +574,9 @@ private:
 private:
     void update_stats_for_new_sstable(uint64_t disk_space_used_by_sstable, std::vector<unsigned>&& shards_for_the_sstable);
     // Adds new sstable to the set of sstables
-    // Doesn't update the cache.
+    // Doesn't update the cache. The cache must be synchronized in order for reads to see
+    // the writes contained in this sstable.
+    // Cache must be synchronized atomically with this, otherwise write atomicity may not be respected.
     // Doesn't trigger compaction.
     void add_sstable(lw_shared_ptr<sstables::sstable> sstable, std::vector<unsigned>&& shards_for_the_sstable);
     // returns an empty pointer if sstable doesn't belong to current shard.

@@ -619,11 +620,12 @@ private:
     void remove_ancestors_needed_rewrite(std::unordered_set<uint64_t> ancestors);
 private:
     mutation_source_opt _virtual_reader;
-    // Creates a mutation reader which covers sstables.
+    // Creates a mutation reader which covers given sstables.
     // Caller needs to ensure that column_family remains live (FIXME: relax this).
     // The 'range' parameter must be live as long as the reader is used.
     // Mutations returned by the reader will all have given schema.
     mutation_reader make_sstable_reader(schema_ptr schema,
+                                        lw_shared_ptr<sstables::sstable_set> sstables,
                                         const dht::partition_range& range,
                                         const query::partition_slice& slice,
                                         const io_priority_class& pc,

@@ -632,6 +634,7 @@ private:
                                         mutation_reader::forwarding fwd_mr) const;
 
     mutation_source sstables_as_mutation_source();
+    snapshot_source sstables_as_snapshot_source();
     partition_presence_checker make_partition_presence_checker(lw_shared_ptr<sstables::sstable_set>);
     std::chrono::steady_clock::time_point _sstable_writes_disabled_at;
     void do_trigger_compaction();
@@ -373,9 +373,6 @@ public:
     val(reduce_cache_sizes_at, double, .85, Invalid, \
         "When Java heap usage (after a full concurrent mark sweep (CMS) garbage collection) exceeds this percentage, Cassandra reduces the cache capacity to the fraction of the current size as specified by reduce_cache_capacity_to. To disable, set the value to 1.0." \
     ) \
-    val(max_cached_partition_size_in_kb, uint64_t, 10240uLL, Used, \
-        "Partitions with size greater than this value won't be cached." \
-    ) \
     /* Disks settings */ \
     val(stream_throughput_outbound_megabits_per_sec, uint32_t, 400, Unused, \
         "Throttles all outbound streaming file transfers on a node to the specified throughput. Cassandra does mostly sequential I/O when streaming data during bootstrap or repair, which can lead to saturating the network connection and degrading client (RPC) performance." \
@@ -442,7 +442,7 @@ bool ring_position::less_compare(const schema& s, const ring_position& other) co
 }
 
 int ring_position_comparator::operator()(ring_position_view lh, ring_position_view rh) const {
-    auto token_cmp = tri_compare(lh._token, rh._token);
+    auto token_cmp = tri_compare(*lh._token, *rh._token);
     if (token_cmp) {
         return token_cmp;
     }

@@ -464,7 +464,7 @@ int ring_position_comparator::operator()(ring_position_view lh, ring_position_vi
 
 int ring_position_comparator::operator()(ring_position_view lh, sstables::key_view rh) const {
     auto rh_token = global_partitioner().get_token(rh);
-    auto token_cmp = tri_compare(lh._token, rh_token);
+    auto token_cmp = tri_compare(*lh._token, rh_token);
     if (token_cmp) {
         return token_cmp;
     }
@@ -374,6 +374,14 @@ private:
     token_bound _token_bound; // valid when !_key
     std::experimental::optional<partition_key> _key;
 public:
+    static ring_position min() {
+        return { minimum_token(), token_bound::start };
+    }
+
+    static ring_position max() {
+        return { maximum_token(), token_bound::end };
+    }
+
     static ring_position starting_at(dht::token token) {
         return { std::move(token), token_bound::start };
     }

@@ -463,7 +471,7 @@ class ring_position_view {
     // For example {_token=t1, _key=nullptr, _weight=1} is ordered after {_token=t1, _key=k1, _weight=0},
     // but {_token=t1, _key=nullptr, _weight=-1} is ordered before it.
     //
-    const dht::token& _token;
+    const dht::token* _token; // always not nullptr
     const partition_key* _key; // Can be nullptr
     int8_t _weight;
 public:

@@ -479,11 +487,11 @@ public:
     }
 
     bool is_min() const {
-        return _token.is_minimum();
+        return _token->is_minimum();
     }
 
     bool is_max() const {
-        return _token.is_maximum();
+        return _token->is_maximum();
     }
 
     static ring_position_view for_range_start(const partition_range& r) {

@@ -503,11 +511,14 @@ public:
     }
 
     ring_position_view(const dht::ring_position& pos, after_key after = after_key::no)
-        : _token(pos.token())
+        : _token(&pos.token())
         , _key(pos.has_key() ? &*pos.key() : nullptr)
         , _weight(pos.has_key() ? bool(after) : pos.relation_to_keys())
     { }
 
+    ring_position_view(const ring_position_view& pos) = default;
+    ring_position_view& operator=(const ring_position_view& other) = default;
+
     ring_position_view(after_key_tag, const ring_position_view& v)
         : _token(v._token)
         , _key(v._key)

@@ -515,13 +526,13 @@ public:
     { }
 
     ring_position_view(const dht::decorated_key& key, after_key after_key = after_key::no)
-        : _token(key.token())
+        : _token(&key.token())
         , _key(&key.key())
         , _weight(bool(after_key))
     { }
 
     ring_position_view(const dht::token& token, partition_key* key, int8_t weight)
-        : _token(token)
+        : _token(&token)
         , _key(key)
         , _weight(weight)
     { }
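The switch of _token from const dht::token& to const dht::token* in the hunks above is what makes the newly added defaulted copy assignment legal: a class with a reference member gets its copy assignment implicitly deleted, so a view that must be re-seatable needs a pointer. A minimal standalone illustration (toy types, not the Scylla ones):

// Illustration only: why a reference member had to become a pointer.
struct ref_view {
    const int& value; // reference member => operator= implicitly deleted
};
struct ptr_view {
    const int* value; // always non-null by convention, like _token above
    ptr_view(const int& v) : value(&v) {}
    ptr_view& operator=(const ptr_view&) = default; // fine with a pointer
};

int main() {
    int a = 1, b = 2;
    ptr_view p{a};
    p = ptr_view{b};    // ok: the pointer rebinds
    ref_view r{a};
    // r = ref_view{b}; // would not compile: copy assignment is deleted
    (void)r;
}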
@@ -63,8 +63,11 @@ public:
         rt.feed_hash(_h, _s);
     }
 
-    virtual void accept_row(clustering_key_view key, const row_tombstone& deleted_at, const row_marker& rm) {
-        key.feed_hash(_h, _s);
+    virtual void accept_row(position_in_partition_view pos, const row_tombstone& deleted_at, const row_marker& rm, is_dummy dummy, is_continuous continuous) override {
+        if (dummy) {
+            return;
+        }
+        pos.key().feed_hash(_h, _s);
         feed_hash(_h, deleted_at);
         feed_hash(_h, rm);
     }
@@ -208,6 +208,10 @@ public:
     }
     template<class ElemCompare>
     iterator insert(const_iterator hint, Elem& value, ElemCompare cmp) {
+        return insert_check(hint, value, std::move(cmp)).first;
+    }
+    template<class ElemCompare>
+    std::pair<iterator, bool> insert_check(const_iterator hint, Elem& value, ElemCompare cmp) {
         algo::insert_commit_data commit_data;
         std::pair<node_ptr, bool> ret =
             algo::insert_unique_check(_header.this_ptr(),

@@ -215,8 +219,8 @@ public:
                 key_of_value()(value),
                 key_node_comp(cmp),
                 commit_data);
-        return ret.second ? insert_unique_commit(value, commit_data)
-                          : iterator(ret.first, priv_value_traits_ptr());
+        return ret.second ? std::make_pair(insert_unique_commit(value, commit_data), true)
+                          : std::make_pair(iterator(ret.first, priv_value_traits_ptr()), false);
    }
 };
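The change above turns the hinted insert into insert_check(), which reports whether the node was actually linked in. That is what maybe_add_to_cache() in cache_streamed_mutation.hh relies on to release ownership of a freshly allocated rows_entry only on success. A toy of the same release-on-success idiom (std::set stands in for the intrusive tree; names are illustrative):

// Illustration only: insert(hint, ...) used to return just an iterator, so
// the caller could not tell whether its new entry was linked in or an equal
// key already existed. A std::pair<iterator, bool> result enables:
#include <cassert>
#include <memory>
#include <set>

struct entry {
    int key;
    bool operator<(const entry& o) const { return key < o.key; }
};

int main() {
    // std::set copies elements; an intrusive tree would link the node itself,
    // which is why the real code releases the unique_ptr on success.
    std::set<entry> rows = { {1}, {5} };
    auto new_entry = std::make_unique<entry>(entry{5});
    auto insert_result = rows.insert(*new_entry);
    if (insert_result.second) {
        new_entry.release(); // container took ownership of the node
    }
    // Key 5 already existed, so we still own the duplicate and free it here.
    assert(!insert_result.second);
}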
mutation.cc
@@ -206,37 +206,20 @@ mutation& mutation::operator+=(mutation&& other) {
     return *this;
 }
 
-enum class limit_mutation_size { yes, no };
+mutation mutation::sliced(const query::clustering_row_ranges& ranges) const {
+    auto m = mutation(schema(), decorated_key(), mutation_partition(partition(), *schema(), ranges));
+    m.partition().row_tombstones().trim(*schema(), ranges);
+    return m;
+}
 
-template <limit_mutation_size with_limit>
 class mutation_rebuilder {
     mutation _m;
     streamed_mutation& _sm;
-    size_t _remaining_limit;
-
-    template <typename T> bool check_remaining_limit(const T& e) {
-        if (with_limit == limit_mutation_size::no) {
-            return true;
-        }
-        size_t size = e.memory_usage();
-        if (_remaining_limit <= size) {
-            _remaining_limit = 0;
-        } else {
-            _remaining_limit -= size;
-        }
-        return _remaining_limit > 0;
-    }
 public:
     mutation_rebuilder(streamed_mutation& sm)
-        : _m(sm.decorated_key(), sm.schema()), _sm(sm), _remaining_limit(0) {
-        static_assert(with_limit == limit_mutation_size::no,
-                      "This constructor should be used only for mutation_rebuildeer with no limit");
-    }
-    mutation_rebuilder(streamed_mutation& sm, size_t limit)
-        : _m(sm.decorated_key(), sm.schema()), _sm(sm), _remaining_limit(limit) {
-        static_assert(with_limit == limit_mutation_size::yes,
-                      "This constructor should be used only for mutation_rebuildeer with limit");
-        check_remaining_limit(_m.key());
-    }
 
     stop_iteration consume(tombstone t) {

@@ -245,25 +228,16 @@ public:
     }
 
     stop_iteration consume(range_tombstone&& rt) {
-        if (!check_remaining_limit(rt)) {
-            return stop_iteration::yes;
-        }
         _m.partition().apply_row_tombstone(*_m.schema(), std::move(rt));
         return stop_iteration::no;
     }
 
     stop_iteration consume(static_row&& sr) {
-        if (!check_remaining_limit(sr)) {
-            return stop_iteration::yes;
-        }
         _m.partition().static_row().apply(*_m.schema(), column_kind::static_column, std::move(sr.cells()));
         return stop_iteration::no;
     }
 
     stop_iteration consume(clustering_row&& cr) {
-        if (!check_remaining_limit(cr)) {
-            return stop_iteration::yes;
-        }
         auto& dr = _m.partition().clustered_row(*_m.schema(), std::move(cr.key()));
         dr.apply(cr.tomb());
         dr.apply(cr.marker());

@@ -272,29 +246,21 @@ public:
     }
 
     mutation_opt consume_end_of_stream() {
-        return with_limit == limit_mutation_size::yes && _remaining_limit == 0 ? mutation_opt()
-                                                                               : mutation_opt(std::move(_m));
+        return mutation_opt(std::move(_m));
     }
 };
 
-future<mutation_opt>
-mutation_from_streamed_mutation_with_limit(streamed_mutation sm, size_t limit) {
-    return do_with(std::move(sm), [limit] (auto& sm) {
-        return consume(sm, mutation_rebuilder<limit_mutation_size::yes>(sm, limit));
-    });
-}
-
 future<mutation_opt> mutation_from_streamed_mutation(streamed_mutation_opt sm) {
     if (!sm) {
         return make_ready_future<mutation_opt>();
     }
     return do_with(std::move(*sm), [] (auto& sm) {
-        return consume(sm, mutation_rebuilder<limit_mutation_size::no>(sm));
+        return consume(sm, mutation_rebuilder(sm));
     });
 }
 
 future<mutation> mutation_from_streamed_mutation(streamed_mutation& sm) {
-    return consume(sm, mutation_rebuilder<limit_mutation_size::no>(sm)).then([] (mutation_opt&& mo) {
+    return consume(sm, mutation_rebuilder(sm)).then([] (mutation_opt&& mo) {
         return std::move(*mo);
     });
 }

@@ -133,6 +133,10 @@ public:
     mutation operator+(const mutation& other) const;
     mutation& operator+=(const mutation& other);
     mutation& operator+=(mutation&& other);
+
+    // Returns a subset of this mutation holding only information relevant for given clustering ranges.
+    // Range tombstones will be trimmed to the boundaries of the clustering ranges.
+    mutation sliced(const query::clustering_row_ranges&) const;
 private:
     friend std::ostream& operator<<(std::ostream& os, const mutation& m);
 };

@@ -185,4 +189,3 @@ boost::iterator_range<std::vector<mutation>::const_iterator> slice(
 
 future<mutation_opt> mutation_from_streamed_mutation(streamed_mutation_opt sm);
 future<mutation> mutation_from_streamed_mutation(streamed_mutation& sm);
-future<mutation_opt> mutation_from_streamed_mutation_with_limit(streamed_mutation sm, size_t limit);
@@ -175,7 +175,7 @@ void revert_intrusive_set_range(const schema& s, mutation_partition::rows_type&
         assert(i != dst.end());
         rows_entry& dst_e = *i;
 
-        if (e.empty()) {
+        if (e.erased()) {
             dst.erase(i);
             start = src.erase_and_dispose(start, deleter);
             start = src.insert_before(start, dst_e);

@@ -203,18 +203,10 @@ auto apply_reversibly_intrusive_set(const schema& s, mutation_partition::rows_ty
     while (src_i != src.end()) {
         rows_entry& src_e = *src_i;
 
-        // neutral entries will be given special meaning for the purpose of revert, so
-        // get rid of empty rows from the input as if they were not there. This doesn't change
-        // the value of src.
-        if (src_e.empty()) {
-            src_i = src.erase_and_dispose(src_i, current_deleter<rows_entry>());
-            continue;
-        }
-
         auto i = dst.lower_bound(src_e, cmp);
         if (i == dst.end() || cmp(src_e, *i)) {
-            // Construct neutral entry which will represent missing dst entry for revert.
-            rows_entry* empty_e = current_allocator().construct<rows_entry>(src_e.key());
+            // Construct erased entry which will represent missing dst entry for revert.
+            rows_entry* empty_e = current_allocator().construct<rows_entry>(rows_entry::erased_tag{}, src_e);
             [&] () noexcept {
                 src_i = src.erase(src_i);
                 src_i = src.insert_before(src_i, *empty_e);
@@ -235,6 +227,7 @@ auto apply_reversibly_intrusive_set(const schema& s, mutation_partition::rows_ty
 mutation_partition::mutation_partition(const mutation_partition& x)
         : _tombstone(x._tombstone)
         , _static_row(x._static_row)
+        , _static_row_continuous(x._static_row_continuous)
         , _rows()
         , _row_tombstones(x._row_tombstones) {
     auto cloner = [] (const auto& x) {

@@ -247,6 +240,7 @@ mutation_partition::mutation_partition(const mutation_partition& x, const schema
         query::clustering_key_filter_ranges ck_ranges)
         : _tombstone(x._tombstone)
         , _static_row(x._static_row)
+        , _static_row_continuous(x._static_row_continuous)
         , _rows()
         , _row_tombstones(x._row_tombstones, range_tombstone_list::copy_comparator_only()) {
     try {

@@ -271,6 +265,7 @@ mutation_partition::mutation_partition(mutation_partition&& x, const schema& sch
         query::clustering_key_filter_ranges ck_ranges)
         : _tombstone(x._tombstone)
         , _static_row(std::move(x._static_row))
+        , _static_row_continuous(x._static_row_continuous)
         , _rows(std::move(x._rows))
         , _row_tombstones(std::move(x._row_tombstones))
 {

@@ -319,6 +314,13 @@ mutation_partition::operator=(mutation_partition&& x) noexcept {
     return *this;
 }
 
+void mutation_partition::ensure_last_dummy(const schema& s) {
+    if (_rows.empty() || !_rows.rbegin()->position().is_after_all_clustered_rows(s)) {
+        _rows.insert_before(_rows.end(),
+            *current_allocator().construct<rows_entry>(s, position_in_partition_view::after_all_clustered_rows(), is_dummy::yes, is_continuous::yes));
+    }
+}
+
 void
 mutation_partition::apply(const schema& s, const mutation_partition& p, const schema& p_schema) {
     if (s.version() != p_schema.version()) {

@@ -507,7 +509,7 @@ mutation_partition::clustered_row(const schema& s, const clustering_key& key) {
 }
 
 deletable_row&
-mutation_partition::clustered_row(const schema& s, const clustering_key_view& key) {
+mutation_partition::clustered_row(const schema& s, clustering_key_view key) {
     auto i = _rows.find(key, rows_entry::compare(s));
     if (i == _rows.end()) {
         auto e = current_allocator().construct<rows_entry>(key);

@@ -517,6 +519,17 @@ mutation_partition::clustered_row(const schema& s, const clustering_key_view& ke
     return i->row();
 }
 
+deletable_row&
+mutation_partition::clustered_row(const schema& s, position_in_partition_view pos, is_dummy dummy, is_continuous continuous) {
+    auto i = _rows.find(pos, rows_entry::compare(s));
+    if (i == _rows.end()) {
+        auto e = current_allocator().construct<rows_entry>(s, pos, dummy, continuous);
+        _rows.insert(i, *e, rows_entry::compare(s));
+        return e->row();
+    }
+    return i->row();
+}
+
 mutation_partition::rows_type::const_iterator
 mutation_partition::lower_bound(const schema& schema, const query::clustering_range& r) const {
     auto cmp = rows_entry::key_comparator(clustering_key_prefix::prefix_equality_less_compare(schema));
@@ -759,6 +772,9 @@ mutation_partition::query_compacted(query::result::partition_writer& pw, const s
|
||||
auto is_reversed = slice.options.contains(query::partition_slice::option::reversed);
|
||||
auto send_ck = slice.options.contains(query::partition_slice::option::send_clustering_key);
|
||||
for_each_row(s, query::clustering_range::make_open_ended_both_sides(), is_reversed, [&] (const rows_entry& e) {
|
||||
if (e.dummy()) {
|
||||
return stop_iteration::no;
|
||||
}
|
||||
auto& row = e.row();
|
||||
auto row_tombstone = tombstone_for_row(s, e);
|
||||
|
||||
@@ -843,13 +859,13 @@ operator<<(std::ostream& os, const deletable_row& dr) {
|
||||
|
||||
std::ostream&
|
||||
operator<<(std::ostream& os, const rows_entry& re) {
|
||||
return fprint(os, "{rows_entry: %s %s}", re._key, re._row);
|
||||
return fprint(os, "{rows_entry: cont=%d dummy=%d %s %s}", re.continuous(), re.dummy(), re._key, re._row);
|
||||
}
|
||||
|
||||
std::ostream&
|
||||
operator<<(std::ostream& os, const mutation_partition& mp) {
|
||||
return fprint(os, "{mutation_partition: %s (%s) static %s clustered %s}",
|
||||
mp._tombstone, ::join(", ", mp._row_tombstones), mp._static_row,
|
||||
return fprint(os, "{mutation_partition: %s (%s) static cont=%d %s clustered %s}",
|
||||
mp._tombstone, ::join(", ", mp._row_tombstones), mp._static_row_continuous, mp._static_row,
|
||||
::join(", ", mp._rows));
|
||||
}
|
||||
|
||||
@@ -905,14 +921,30 @@ void deletable_row::revert(const schema& s, deletable_row& src) {
    _marker.revert(src._marker);
}

+void deletable_row::apply(const schema& s, deletable_row&& src) {
+    _cells.apply(s, column_kind::regular_column, std::move(src._cells));
+    _marker.apply(src._marker);
+    _deleted_at.apply(src._deleted_at, _marker);
+}
+
bool
rows_entry::equal(const schema& s, const rows_entry& other) const {
    return equal(s, other, s);
}

+position_in_partition_view rows_entry::position() const {
+    if (_flags._last) {
+        return position_in_partition_view::after_all_clustered_rows();
+    } else {
+        return position_in_partition_view(
+            position_in_partition_view::clustering_row_tag_t(), _key);
+    }
+}
+
bool
rows_entry::equal(const schema& s, const rows_entry& other, const schema& other_schema) const {
-    return key().equal(s, other.key()) // Only representation-compatible changes are allowed
+    position_in_partition::equal_compare eq(s);
+    return eq(position(), other.position())
        && row().equal(column_kind::regular_column, s, other.row(), other_schema);
}
@@ -925,7 +957,7 @@ bool mutation_partition::equal(const schema& this_schema, const mutation_partiti
        return false;
    }

-    if (!std::equal(_rows.begin(), _rows.end(), p._rows.begin(), p._rows.end(),
+    if (!boost::equal(non_dummy_rows(), p.non_dummy_rows(),
        [&] (const rows_entry& e1, const rows_entry& e2) {
            return e1.equal(this_schema, e2, p_schema);
        }
@@ -943,6 +975,16 @@ bool mutation_partition::equal(const schema& this_schema, const mutation_partiti
    return _static_row.equal(column_kind::static_column, this_schema, p._static_row, p_schema);
}

+bool mutation_partition::equal_continuity(const schema& s, const mutation_partition& p) const {
+    return _static_row_continuous == p._static_row_continuous
+        && boost::equal(_rows, p._rows, [&] (const rows_entry& e1, const rows_entry& e2) {
+            position_in_partition::equal_compare eq(s);
+            return eq(e1.position(), e2.position())
+                && e1.continuous() == e2.continuous()
+                && e1.dummy() == e2.dummy();
+        });
+}
+
void
apply_reversibly(const column_definition& def, atomic_cell_or_collection& dst, atomic_cell_or_collection& src) {
    // Must be run via with_linearized_managed_bytes() context, but assume it is
@@ -1216,8 +1258,10 @@ uint32_t mutation_partition::do_compact(const schema& s,
    uint32_t row_count = 0;

    auto row_callback = [&] (rows_entry& e) {
+        if (e.dummy()) {
+            return stop_iteration::no;
+        }
        deletable_row& row = e.row();

        row_tombstone tomb = tombstone_for_row(s, e);

        bool is_live = row.cells().compact_and_expire(s, column_kind::regular_column, tomb, query_time, can_gc, gc_before);
@@ -1315,7 +1359,7 @@ size_t
mutation_partition::live_row_count(const schema& s, gc_clock::time_point query_time) const {
    size_t count = 0;

-    for (const rows_entry& e : _rows) {
+    for (const rows_entry& e : non_dummy_rows()) {
        tombstone base_tombstone = range_tombstone_for_row(s, e.key());
        if (e.row().is_live(s, base_tombstone, query_time)) {
            ++count;
@@ -1333,6 +1377,7 @@ rows_entry::rows_entry(rows_entry&& o) noexcept
    : _link(std::move(o._link))
    , _key(std::move(o._key))
    , _row(std::move(o._row))
+    , _flags(std::move(o._flags))
{ }

row::row(const row& o)
@@ -1641,7 +1686,10 @@ mutation_partition mutation_partition::difference(schema_ptr s, const mutation_p
    auto it_r = other._rows.begin();
    rows_entry::compare cmp_r(*s);
    for (auto&& r : _rows) {
-        while (it_r != other._rows.end() && cmp_r(*it_r, r)) {
+        if (r.dummy()) {
+            continue;
+        }
+        while (it_r != other._rows.end() && (it_r->dummy() || cmp_r(*it_r, r))) {
            ++it_r;
        }
        if (it_r == other._rows.end() || !it_r->key().equal(*s, r.key())) {
@@ -1671,7 +1719,7 @@ void mutation_partition::accept(const schema& s, mutation_partition_visitor& v)
    }
    for (const rows_entry& e : _rows) {
        const deletable_row& dr = e.row();
-        v.accept_row(e.key(), dr.deleted_at(), dr.marker());
+        v.accept_row(e.position(), dr.deleted_at(), dr.marker(), e.dummy(), e.continuous());
        dr.cells().for_each_cell([&] (column_id id, const atomic_cell_or_collection& cell) {
            const column_definition& def = s.regular_column_at(id);
            if (def.is_atomic()) {
@@ -2069,6 +2117,41 @@ public:
    }
};

+mutation_partition::mutation_partition(mutation_partition::incomplete_tag, const schema& s, tombstone t)
+    : _tombstone(t)
+    , _static_row_continuous(false)
+    , _rows()
+    , _row_tombstones(s)
+{
+    _rows.insert_before(_rows.end(),
+        *current_allocator().construct<rows_entry>(s, position_in_partition_view::after_all_clustered_rows(), is_dummy::yes, is_continuous::no));
+}
+
+bool mutation_partition::is_fully_continuous() const {
+    if (!_static_row_continuous) {
+        return false;
+    }
+    for (auto&& row : _rows) {
+        if (!row.continuous()) {
+            return false;
+        }
+    }
+    return true;
+}
+
+void mutation_partition::make_fully_continuous() {
+    _static_row_continuous = true;
+    auto i = _rows.begin();
+    while (i != _rows.end()) {
+        if (i->dummy()) {
+            i = _rows.erase_and_dispose(i, alloc_strategy_deleter<rows_entry>());
+        } else {
+            i->set_continuous(true);
+            ++i;
+        }
+    }
+}
+
future<mutation_opt> counter_write_query(schema_ptr s, const mutation_source& source,
    const dht::decorated_key& dk,
    const query::partition_slice& slice,
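
A toy model of this continuity bookkeeping may help. It is only a sketch under simplified assumptions (plain bools and a std::list, no allocator, schema, or tombstones), not the real mutation_partition: an incomplete partition starts with a single non-continuous dummy sentinel after all rows, and making it fully continuous drops the dummies and marks every remaining gap continuous.

#include <cassert>
#include <list>

struct entry {
    bool dummy;      // carries no row data, only bounds a continuity range
    bool continuous; // continuity of the key range preceding this entry
};

struct partition_model {
    bool static_row_continuous = true;
    std::list<entry> rows;

    static partition_model make_incomplete() {
        partition_model p;
        p.static_row_continuous = false;
        p.rows.push_back({true /*dummy sentinel after all rows*/, false});
        return p;
    }

    bool is_fully_continuous() const {
        if (!static_row_continuous) return false;
        for (auto&& e : rows) {
            if (!e.continuous) return false;
        }
        return true;
    }

    void make_fully_continuous() {
        static_row_continuous = true;
        for (auto it = rows.begin(); it != rows.end();) {
            if (it->dummy) it = rows.erase(it);
            else { it->continuous = true; ++it; }
        }
    }
};

int main() {
    auto p = partition_model::make_incomplete();
    assert(!p.is_fully_continuous()); // everything unknown except the tombstone
    p.make_fully_continuous();
    assert(p.is_fully_continuous());
}
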
@@ -33,6 +33,7 @@
#include "schema.hh"
#include "tombstone.hh"
#include "keys.hh"
+#include "position_in_partition.hh"
#include "atomic_cell_or_collection.hh"
#include "query-result.hh"
#include "mutation_partition_view.hh"
@@ -598,6 +599,9 @@ class deletable_row final {
public:
    deletable_row() {}
    explicit deletable_row(clustering_row&&);
+    deletable_row(row_tombstone tomb, const row_marker& marker, const row& cells)
+        : _deleted_at(tomb), _marker(marker), _cells(cells)
+    {}

    void apply(tombstone deleted_at) {
        _deleted_at.apply(deleted_at);
@@ -624,6 +628,10 @@ public:
    void apply_reversibly(const schema& s, deletable_row& src);
    // See reversibly_mergeable.hh
    void revert(const schema& s, deletable_row& src);
+
+    // Weak exception guarantees. After exception, both src and this will commute to the same value as
+    // they would should the exception not happen.
+    void apply(const schema& s, deletable_row&& src);
public:
    row_tombstone deleted_at() const { return _deleted_at; }
    api::timestamp_type created_at() const { return _marker.timestamp(); }
@@ -642,28 +650,59 @@ class rows_entry {
    intrusive_set_external_comparator_member_hook _link;
    clustering_key _key;
    deletable_row _row;
+    struct flags {
+        bool _continuous : 1; // See doc of is_continuous.
+        bool _dummy : 1;
+        bool _last : 1;
+        bool _erased : 1; // Used only temporarily during apply_reversibly(). Refs #2012.
+        flags() : _continuous(true), _dummy(false), _last(false), _erased(false) { }
+    } _flags{};
    friend class mutation_partition;
public:
    struct erased_tag {};
    rows_entry(erased_tag, const rows_entry& e)
        : _key(e._key)
    {
+        _flags._erased = true;
+        _flags._last = e._flags._last;
    }
    explicit rows_entry(clustering_key&& key)
        : _key(std::move(key))
    { }
    explicit rows_entry(const clustering_key& key)
        : _key(key)
    { }
+    rows_entry(const schema& s, position_in_partition_view pos, is_dummy dummy, is_continuous continuous)
+        : _key(pos.key())
+    {
+        if (!pos.is_clustering_row()) {
+            assert(bool(dummy));
+            assert(pos.is_after_all_clustered_rows(s)); // FIXME: Support insertion at any position
+            _flags._last = true;
+        }
+        _flags._dummy = bool(dummy);
+        _flags._continuous = bool(continuous);
+    }
    rows_entry(const clustering_key& key, deletable_row&& row)
        : _key(key), _row(std::move(row))
    { }
    rows_entry(const clustering_key& key, const deletable_row& row)
        : _key(key), _row(row)
    { }
+    rows_entry(const clustering_key& key, row_tombstone tomb, const row_marker& marker, const row& row)
+        : _key(key), _row(tomb, marker, row)
+    { }
    rows_entry(rows_entry&& o) noexcept;
    rows_entry(const rows_entry& e)
        : _key(e._key)
        , _row(e._row)
+        , _flags(e._flags)
    { }
+    // Valid only if !dummy()
    clustering_key& key() {
        return _key;
    }
+    // Valid only if !dummy()
    const clustering_key& key() const {
        return _key;
    }
@@ -673,6 +712,11 @@ public:
    const deletable_row& row() const {
        return _row;
    }
+    position_in_partition_view position() const;
+    is_continuous continuous() const { return is_continuous(_flags._continuous); }
+    void set_continuous(bool value) { _flags._continuous = value; }
+    void set_continuous(is_continuous value) { set_continuous(bool(value)); }
+    is_dummy dummy() const { return is_dummy(_flags._dummy); }
    void apply(row_tombstone t) {
        _row.apply(t);
    }
@@ -687,23 +731,54 @@ public:
    bool empty() const {
        return _row.empty();
    }
    bool erased() const {
        return _flags._erased;
    }
+    struct tri_compare {
+        position_in_partition::tri_compare _c;
+        explicit tri_compare(const schema& s) : _c(s) {}
+        int operator()(const rows_entry& e1, const rows_entry& e2) const {
+            return _c(e1.position(), e2.position());
+        }
+        int operator()(const clustering_key& key, const rows_entry& e) const {
+            return _c(position_in_partition_view::for_key(key), e.position());
+        }
+        int operator()(const rows_entry& e, const clustering_key& key) const {
+            return _c(e.position(), position_in_partition_view::for_key(key));
+        }
+        int operator()(const rows_entry& e, position_in_partition_view p) const {
+            return _c(e.position(), p);
+        }
+        int operator()(position_in_partition_view p, const rows_entry& e) const {
+            return _c(p, e.position());
+        }
+        int operator()(position_in_partition_view p1, position_in_partition_view p2) const {
+            return _c(p1, p2);
+        }
+    };
    struct compare {
-        clustering_key::less_compare _c;
-        compare(const schema& s) : _c(s) {}
+        tri_compare _c;
+        explicit compare(const schema& s) : _c(s) {}
        bool operator()(const rows_entry& e1, const rows_entry& e2) const {
-            return _c(e1._key, e2._key);
+            return _c(e1, e2) < 0;
        }
        bool operator()(const clustering_key& key, const rows_entry& e) const {
-            return _c(key, e._key);
+            return _c(key, e) < 0;
        }
        bool operator()(const rows_entry& e, const clustering_key& key) const {
-            return _c(e._key, key);
+            return _c(e, key) < 0;
        }
        bool operator()(const clustering_key_view& key, const rows_entry& e) const {
-            return _c(key, e._key);
+            return _c(key, e) < 0;
        }
        bool operator()(const rows_entry& e, const clustering_key_view& key) const {
-            return _c(e._key, key);
+            return _c(e, key) < 0;
        }
+        bool operator()(const rows_entry& e, position_in_partition_view p) const {
+            return _c(e.position(), p) < 0;
+        }
+        bool operator()(position_in_partition_view p, const rows_entry& e) const {
+            return _c(p, e.position()) < 0;
+        }
    };
    template <typename Comparator>
@@ -712,10 +787,16 @@ public:
        delegating_compare(Comparator&& c) : _c(std::move(c)) {}
        template <typename Comparable>
        bool operator()(const Comparable& v, const rows_entry& e) const {
+            if (e._flags._last) {
+                return true;
+            }
            return _c(v, e._key);
        }
        template <typename Comparable>
        bool operator()(const rows_entry& e, const Comparable& v) const {
+            if (e._flags._last) {
+                return false;
+            }
            return _c(e._key, v);
        }
    };
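
Note how compare above became a thin adapter that maps tri_compare's three-way result onto bool: one three-way comparison is the single source of truth for the ordering, and every boolean comparator is derived from it. The same pattern in a self-contained form (illustrative only, using strcmp as the three-way comparison):

#include <cassert>
#include <cstring>

struct tri_compare_cstr {
    // Three-way comparison: negative, zero, or positive.
    int operator()(const char* a, const char* b) const { return std::strcmp(a, b); }
};

template <typename Tri>
struct less_from_tri {
    Tri tri;
    // All boolean orderings derive from the one three-way comparison.
    template <typename A, typename B>
    bool operator()(const A& a, const B& b) const { return tri(a, b) < 0; }
};

int main() {
    less_from_tri<tri_compare_cstr> less{};
    assert(less("abc", "abd"));
    assert(!less("abd", "abc"));
}
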
@@ -728,6 +809,47 @@ public:
    bool equal(const schema& s, const rows_entry& other, const schema& other_schema) const;
};

+// Represents a set of writes made to a single partition.
+//
+// The object is schema-dependent. Each instance is governed by some
+// specific schema version. Accessors require a reference to the schema object
+// of that version.
+//
+// There is an operation of addition defined on mutation_partition objects
+// (also called "apply"), which gives as a result an object representing the
+// sum of writes contained in the addends. For instances governed by the same
+// schema, addition is commutative and associative.
+//
+// In addition to representing writes, the object supports specifying a set of
+// partition elements called "continuity". This set can be used to represent
+// lack of information about certain parts of the partition. It can be
+// specified which ranges of clustering keys belong to that set. We say that a
+// key range is continuous if all keys in that range belong to the continuity
+// set, and discontinuous otherwise. By default everything is continuous.
+// The static row may also be continuous or not.
+// The partition tombstone is always continuous.
+//
+// Continuity is ignored by instance equality. It's also transient, not
+// preserved by serialization.
+//
+// Continuity is represented internally using flags on row entries. The key
+// range between two consecutive entries (both ends exclusive) is continuous
+// if and only if rows_entry::continuous() is true for the later entry. The
+// range starting after the last entry is assumed to be continuous. The range
+// corresponding to the key of an entry is continuous if and only if
+// rows_entry::dummy() is false.
+//
+// Adding two fully-continuous instances gives a fully-continuous instance.
+// Continuity doesn't affect how the write part is added.
+//
+// Addition of continuity is not commutative in general, but it is associative.
+// Continuity flags on objects representing the same thing (e.g. rows_entry
+// with the same key) are merged such that the information stored in the left-
+// hand operand wins. Flags on objects which are present in only one of the
+// operands are transferred as-is. Such merging rules are useful for layering
+// information in MVCC, where newer versions specify continuity with respect
+// to the combined set of rows in all prior versions, not just in their own
+// versions.
class mutation_partition final {
public:
    using rows_type = intrusive_set_external_comparator<rows_entry, &rows_entry::_link>;
@@ -736,6 +858,7 @@ public:
private:
    tombstone _tombstone;
    row _static_row;
+    bool _static_row_continuous = true;
    rows_type _rows;
    // Contains only strict prefixes so that we don't have to lookup full keys
    // in both _row_tombstones and _rows.
@@ -745,6 +868,12 @@ private:
    friend class converting_mutation_partition_applier;
public:
    struct copy_comparators_only {};
+    struct incomplete_tag {};
+    // Constructs an empty instance which is fully discontinuous except for the partition tombstone.
+    mutation_partition(incomplete_tag, const schema& s, tombstone);
+    static mutation_partition make_incomplete(const schema& s, tombstone t = {}) {
+        return mutation_partition(incomplete_tag(), s, t);
+    }
    mutation_partition(schema_ptr s)
        : _rows()
        , _row_tombstones(*s)
@@ -762,6 +891,7 @@ public:
    mutation_partition& operator=(mutation_partition&& x) noexcept;
    bool equal(const schema&, const mutation_partition&) const;
    bool equal(const schema& this_schema, const mutation_partition& p, const schema& p_schema) const;
+    bool equal_continuity(const schema&, const mutation_partition&) const;
    // Consistent with equal()
    template<typename Hasher>
    void feed_hash(Hasher& h, const schema& s) const {
@@ -770,6 +900,13 @@ public:
    }
    friend std::ostream& operator<<(std::ostream& os, const mutation_partition& mp);
public:
+    // Makes sure there is a dummy entry after all clustered rows. Doesn't affect continuity.
+    // Doesn't invalidate iterators.
+    void ensure_last_dummy(const schema&);
+    bool static_row_continuous() const { return _static_row_continuous; }
+    void set_static_row_continuous(bool value) { _static_row_continuous = value; }
+    bool is_fully_continuous() const;
+    void make_fully_continuous();
    void apply(tombstone t) { _tombstone.apply(t); }
    void apply_delete(const schema& schema, const clustering_key_prefix& prefix, tombstone t);
    void apply_delete(const schema& schema, range_tombstone rt);
@@ -866,7 +1003,8 @@ public:
public:
    deletable_row& clustered_row(const schema& s, const clustering_key& key);
    deletable_row& clustered_row(const schema& s, clustering_key&& key);
-    deletable_row& clustered_row(const schema& s, const clustering_key_view& key);
+    deletable_row& clustered_row(const schema& s, clustering_key_view key);
+    deletable_row& clustered_row(const schema& s, position_in_partition_view pos, is_dummy, is_continuous);
public:
    tombstone partition_tombstone() const { return _tombstone; }
    row& static_row() { return _static_row; }
@@ -879,6 +1017,7 @@ public:
    const row* find_row(const schema& s, const clustering_key& key) const;
    tombstone range_tombstone_for_row(const schema& schema, const clustering_key& key) const;
    row_tombstone tombstone_for_row(const schema& schema, const clustering_key& key) const;
+    // Can be called only for non-dummy entries
    row_tombstone tombstone_for_row(const schema& schema, const rows_entry& e) const;
    boost::iterator_range<rows_type::const_iterator> range(const schema& schema, const query::clustering_range& r) const;
    rows_type::const_iterator lower_bound(const schema& schema, const query::clustering_range& r) const;
@@ -886,6 +1025,11 @@ public:
    rows_type::iterator lower_bound(const schema& schema, const query::clustering_range& r);
    rows_type::iterator upper_bound(const schema& schema, const query::clustering_range& r);
    boost::iterator_range<rows_type::iterator> range(const schema& schema, const query::clustering_range& r);
+    // Returns an iterator range of rows_entry, with only non-dummy entries.
+    auto non_dummy_rows() const {
+        return boost::make_iterator_range(_rows.begin(), _rows.end())
+            | boost::adaptors::filtered([] (const rows_entry& e) { return bool(!e.dummy()); });
+    }
    // Writes this partition using supplied query result writer.
    // The partition should be first compacted with compact_for_query(), otherwise
    // results may include data which is deleted/expired.
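
non_dummy_rows() is how callers that only care about data rows skip the bound markers without knowing they exist. The same lazy-filtering idea in a standalone form, assuming C++20 ranges in place of the boost::adaptors::filtered used above:

#include <cassert>
#include <ranges>
#include <vector>

struct entry { int key; bool dummy; };

int main() {
    std::vector<entry> rows{{1, false}, {2, true /*dummy bound*/}, {3, false}};
    // A lazy view: the dummy entry is invisible to consumers, nothing is copied.
    auto non_dummy = rows | std::views::filter([](const entry& e) { return !e.dummy; });
    int count = 0;
    for (auto&& e : non_dummy) {
        ++count;
    }
    assert(count == 2);
}
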
@@ -50,8 +50,8 @@ public:
        _p.apply_row_tombstone(_schema, rt);
    }

-    virtual void accept_row(clustering_key_view key, const row_tombstone& deleted_at, const row_marker& rm) override {
-        deletable_row& r = _p.clustered_row(_schema, key);
+    virtual void accept_row(position_in_partition_view key, const row_tombstone& deleted_at, const row_marker& rm, is_dummy dummy, is_continuous continuous) override {
+        deletable_row& r = _p.clustered_row(_schema, key, dummy, continuous);
        r.apply(rm);
        r.apply(deleted_at);
        _current_row = &r;
@@ -196,7 +196,7 @@ void mutation_partition_serializer::write_serialized(Writer&& writer, const sche
    auto row_tombstones = write_row_cells(std::move(srow_writer), mp.static_row(), s, column_kind::static_column).end_static_row().start_range_tombstones();
    write_tombstones(s, row_tombstones, mp.row_tombstones());
    auto clustering_rows = std::move(row_tombstones).end_range_tombstones().start_rows();
-    for (auto&& cr : mp.clustered_rows()) {
+    for (auto&& cr : mp.non_dummy_rows()) {
        write_row(clustering_rows.add(), s, cr.key(), cr.row().cells(), cr.row().marker(), cr.row().deleted_at());
    }
    std::move(clustering_rows).end_rows().end_mutation_partition();
@@ -210,7 +210,7 @@ mutation_partition_view::accept(const column_mapping& cm, mutation_partition_vis

    for (auto&& cr : mpv.rows()) {
        auto t = row_tombstone(cr.deleted_at(), shadowable_tombstone(cr.shadowable_deleted_at()));
-        visitor.accept_row(cr.key(), t, read_row_marker(cr.marker()));
+        visitor.accept_row(position_in_partition_view::for_key(cr.key()), t, read_row_marker(cr.marker()));

        struct cell_visitor {
            mutation_partition_visitor& _visitor;
@@ -29,6 +29,19 @@
class row_marker;
class row_tombstone;

+// When used on an entry, marks the range between this entry and the previous
+// one as continuous or discontinuous, excluding the keys of both entries.
+// This information doesn't apply to the continuity of the entries themselves;
+// that is specified by the is_dummy flag.
+// See the class doc of mutation_partition.
+using is_continuous = bool_class<class continuous_tag>;
+
+// A dummy entry is an entry which is incomplete.
+// Typically used for marking bounds of continuity ranges.
+// See the class doc of mutation_partition.
+class dummy_tag {};
+using is_dummy = bool_class<dummy_tag>;

// Guarantees:
//
// - any tombstones which affect cell's liveness are visited before that cell
@@ -56,7 +69,8 @@ public:

    virtual void accept_row_tombstone(const range_tombstone&) = 0;

-    virtual void accept_row(clustering_key_view key, const row_tombstone& deleted_at, const row_marker& rm) = 0;
+    virtual void accept_row(position_in_partition_view key, const row_tombstone& deleted_at, const row_marker& rm,
+                            is_dummy = is_dummy::no, is_continuous = is_continuous::yes) = 0;

    virtual void accept_row_cell(column_id id, atomic_cell_view) = 0;
@@ -153,8 +153,8 @@ public:
    }
};

-mutation_reader make_reader_returning(mutation m) {
-    return make_mutation_reader<reader_returning>(streamed_mutation_from_mutation(std::move(m)));
+mutation_reader make_reader_returning(mutation m, streamed_mutation::forwarding fwd) {
+    return make_mutation_reader<reader_returning>(streamed_mutation_from_mutation(std::move(m), std::move(fwd)));
}

mutation_reader make_reader_returning(streamed_mutation m) {
@@ -324,3 +324,36 @@ make_multi_range_reader(schema_ptr s, mutation_source source, const dht::partiti
    return make_mutation_reader<multi_range_mutation_reader>(std::move(s), std::move(source), ranges,
        slice, pc, std::move(trace_state), fwd, fwd_mr);
}
+
+snapshot_source make_empty_snapshot_source() {
+    return snapshot_source([] {
+        return make_empty_mutation_source();
+    });
+}
+
+mutation_source make_empty_mutation_source() {
+    return mutation_source([](schema_ptr s,
+            const dht::partition_range& pr,
+            const query::partition_slice& slice,
+            const io_priority_class& pc,
+            tracing::trace_state_ptr tr,
+            streamed_mutation::forwarding fwd) {
+        return make_empty_reader();
+    });
+}
+
+mutation_source make_combined_mutation_source(std::vector<mutation_source> addends) {
+    return mutation_source([addends = std::move(addends)] (schema_ptr s,
+            const dht::partition_range& pr,
+            const query::partition_slice& slice,
+            const io_priority_class& pc,
+            tracing::trace_state_ptr tr,
+            streamed_mutation::forwarding fwd) {
+        std::vector<mutation_reader> rd;
+        rd.reserve(addends.size());
+        for (auto&& ms : addends) {
+            rd.emplace_back(ms(s, pr, slice, pc, tr, fwd));
+        }
+        return make_combined_reader(std::move(rd));
+    });
+}
@@ -159,7 +159,7 @@ public:
mutation_reader make_combined_reader(std::vector<mutation_reader>);
mutation_reader make_combined_reader(mutation_reader&& a, mutation_reader&& b);
// reads from the input readers, in order
-mutation_reader make_reader_returning(mutation);
+mutation_reader make_reader_returning(mutation, streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no);
mutation_reader make_reader_returning(streamed_mutation);
mutation_reader make_reader_returning_many(std::vector<mutation>,
    const query::partition_slice& slice = query::full_slice,
@@ -279,34 +279,36 @@ class mutation_source {
    // We could have our own version of std::function<> that is nothrow
    // move constructible and save some indirection and allocation.
    // Probably not worth the effort though.
-    std::unique_ptr<func_type> _fn;
+    lw_shared_ptr<func_type> _fn;
private:
    mutation_source() = default;
    explicit operator bool() const { return bool(_fn); }
    friend class optimized_optional<mutation_source>;
public:
-    mutation_source(func_type fn) : _fn(std::make_unique<func_type>(std::move(fn))) {}
+    mutation_source(func_type fn) : _fn(make_lw_shared<func_type>(std::move(fn))) {}
+    // For sources which don't care about the mutation_reader::forwarding flag (always fast forwardable)
+    mutation_source(std::function<mutation_reader(schema_ptr s, partition_range range, const query::partition_slice& slice, io_priority pc, tracing::trace_state_ptr, streamed_mutation::forwarding)> fn)
+        : _fn(make_lw_shared<func_type>([fn = std::move(fn)] (schema_ptr s, partition_range range, const query::partition_slice& slice, io_priority pc, tracing::trace_state_ptr tr, streamed_mutation::forwarding fwd, mutation_reader::forwarding) {
+            return fn(s, range, slice, pc, std::move(tr), fwd);
+        })) {}
    mutation_source(std::function<mutation_reader(schema_ptr, partition_range, const query::partition_slice&, io_priority)> fn)
-        : _fn(std::make_unique<func_type>([fn = std::move(fn)] (schema_ptr s, partition_range range, const query::partition_slice& slice, io_priority pc, tracing::trace_state_ptr, streamed_mutation::forwarding, mutation_reader::forwarding) {
+        : _fn(make_lw_shared<func_type>([fn = std::move(fn)] (schema_ptr s, partition_range range, const query::partition_slice& slice, io_priority pc, tracing::trace_state_ptr, streamed_mutation::forwarding fwd, mutation_reader::forwarding) {
+            assert(!fwd);
            return fn(s, range, slice, pc);
        })) {}
    mutation_source(std::function<mutation_reader(schema_ptr, partition_range, const query::partition_slice&)> fn)
-        : _fn(std::make_unique<func_type>([fn = std::move(fn)] (schema_ptr s, partition_range range, const query::partition_slice& slice, io_priority, tracing::trace_state_ptr, streamed_mutation::forwarding, mutation_reader::forwarding) {
+        : _fn(make_lw_shared<func_type>([fn = std::move(fn)] (schema_ptr s, partition_range range, const query::partition_slice& slice, io_priority, tracing::trace_state_ptr, streamed_mutation::forwarding fwd, mutation_reader::forwarding) {
+            assert(!fwd);
            return fn(s, range, slice);
        })) {}
    mutation_source(std::function<mutation_reader(schema_ptr, partition_range range)> fn)
-        : _fn(std::make_unique<func_type>([fn = std::move(fn)] (schema_ptr s, partition_range range, const query::partition_slice&, io_priority, tracing::trace_state_ptr, streamed_mutation::forwarding, mutation_reader::forwarding) {
+        : _fn(make_lw_shared<func_type>([fn = std::move(fn)] (schema_ptr s, partition_range range, const query::partition_slice&, io_priority, tracing::trace_state_ptr, streamed_mutation::forwarding fwd, mutation_reader::forwarding) {
+            assert(!fwd);
            return fn(s, range);
        })) {}

-    mutation_source(const mutation_source& other)
-        : _fn(std::make_unique<func_type>(*other._fn)) { }
-
-    mutation_source& operator=(const mutation_source& other) {
-        _fn = std::make_unique<func_type>(*other._fn);
-        return *this;
-    }
-
+    mutation_source(const mutation_source& other) = default;
+    mutation_source& operator=(const mutation_source& other) = default;
    mutation_source(mutation_source&&) = default;
    mutation_source& operator=(mutation_source&&) = default;
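
Moving the callable behind lw_shared_ptr turns mutation_source copies into reference-count bumps instead of deep copies of the wrapped std::function, which matters now that sources get handed around freely. A sketch of the ownership change, with std::shared_ptr standing in for seastar's lw_shared_ptr (the exact seastar type is not needed to show the point):

#include <cassert>
#include <functional>
#include <memory>

class source {
    // Shared ownership: all copies refer to one function object.
    std::shared_ptr<std::function<int(int)>> _fn;
public:
    explicit source(std::function<int(int)> fn)
        : _fn(std::make_shared<std::function<int(int)>>(std::move(fn))) {}
    source(const source&) = default;            // cheap: bumps a refcount
    source& operator=(const source&) = default; // no reallocation of the callable
    int operator()(int x) const { return (*_fn)(x); }
};

int main() {
    source a([](int x) { return x + 1; });
    source b = a; // previously this would deep-copy the std::function
    assert(a(1) == 2 && b(1) == 2);
}
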
@@ -326,6 +328,32 @@ public:
    }
};

+// Returns a mutation_source which is the sum of the given mutation_sources.
+//
+// Adding two mutation sources gives a mutation source which contains
+// the sum of writes contained in the addends.
+mutation_source make_combined_mutation_source(std::vector<mutation_source>);
+
+// Represents a mutation_source which can be snapshotted.
+class snapshot_source {
+private:
+    std::function<mutation_source()> _func;
+public:
+    snapshot_source(std::function<mutation_source()> func)
+        : _func(std::move(func))
+    { }
+
+    // Creates a new snapshot.
+    // The returned mutation_source represents all earlier writes and only those.
+    // Note though that the mutations in the snapshot may get compacted over time.
+    mutation_source operator()() {
+        return _func();
+    }
+};
+
+mutation_source make_empty_mutation_source();
+snapshot_source make_empty_snapshot_source();
+
template<>
struct move_constructor_disengages<mutation_source> {
    enum { value = true };
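
A small model of the snapshot_source contract may clarify it: invoking the factory freezes the state visible at that instant, and later writes do not leak into an already-taken snapshot. This is only a sketch with plain std::function, not the real reader machinery:

#include <cassert>
#include <functional>

int main() {
    int data = 1; // stands in for a memtable's contents

    // The factory captures the current state by value when invoked.
    std::function<std::function<int()>()> snapshot_src = [&data] {
        int frozen = data;
        return [frozen] { return frozen; };
    };

    auto snap = snapshot_src(); // take a snapshot
    data = 2;                   // a later write...
    assert(snap() == 1);        // ...is not visible through the snapshot
}
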
@@ -56,8 +56,8 @@ public:
        _partition.apply_row_tombstone(_schema, rt);
    }

-    virtual void accept_row(clustering_key_view key, const row_tombstone& deleted_at, const row_marker& rm) override {
-        deletable_row& r = _partition.clustered_row(_schema, key);
+    virtual void accept_row(position_in_partition_view key, const row_tombstone& deleted_at, const row_marker& rm, is_dummy dummy, is_continuous continuous) override {
+        deletable_row& r = _partition.clustered_row(_schema, key, dummy, continuous);
        r.apply(rm);
        r.apply(deleted_at);
        _current_row = &r;
@@ -30,6 +30,26 @@ struct partition_snapshot_reader_dummy_accounter {
};
extern partition_snapshot_reader_dummy_accounter no_accounter;

+inline void maybe_merge_versions(lw_shared_ptr<partition_snapshot>& snp,
+                                 logalloc::region& lsa_region,
+                                 logalloc::allocating_section& read_section) {
+    if (!snp.owned()) {
+        return;
+    }
+    // If no one else is using this particular snapshot, try to merge the
+    // partition versions.
+    with_allocator(lsa_region.allocator(), [&snp, &lsa_region, &read_section] {
+        return with_linearized_managed_bytes([&snp, &lsa_region, &read_section] {
+            try {
+                read_section(lsa_region, [&snp] {
+                    snp->merge_partition_versions();
+                });
+            } catch (...) { }
+            snp = {};
+        });
+    });
+}
+
template <typename MemoryAccounter = partition_snapshot_reader_dummy_accounter>
class partition_snapshot_reader : public streamed_mutation::impl, public MemoryAccounter {
    struct rows_position {
@@ -45,21 +65,6 @@ class partition_snapshot_reader : public streamed_mutation::impl, public MemoryA
            return _cmp(*b._position, *a._position);
        }
    };
-    class rows_entry_compare {
-        position_in_partition::less_compare _cmp;
-    public:
-        explicit rows_entry_compare(const schema& s) : _cmp(s) { }
-        bool operator()(const rows_entry& a, const position_in_partition& b) const {
-            position_in_partition_view a_view(position_in_partition_view::clustering_row_tag_t(),
-                a.key());
-            return _cmp(a_view, b);
-        }
-        bool operator()(const position_in_partition& a, const rows_entry& b) const {
-            position_in_partition_view b_view(position_in_partition_view::clustering_row_tag_t(),
-                b.key());
-            return _cmp(a, b_view);
-        }
-    };
private:
    // Keeps shared pointer to the container we read mutation from to make sure
    // that its lifetime is appropriately extended.
@@ -70,8 +75,8 @@ private:
    query::clustering_row_ranges::const_iterator _ck_range_end;
    bool _in_ck_range = false;

-    rows_entry_compare _cmp;
-    clustering_key_prefix::equality _eq;
+    rows_entry::compare _cmp;
+    position_in_partition::equal_compare _eq;
    heap_compare _heap_cmp;

    lw_shared_ptr<partition_snapshot> _snapshot;
@@ -94,8 +99,14 @@ private:
    void refresh_iterators() {
        _clustering_rows.clear();

-        if (!_in_ck_range && _current_ck_range == _ck_range_end) {
-            return;
+        if (!_in_ck_range) {
+            if (_current_ck_range == _ck_range_end) {
+                _end_of_stream = true;
+                return;
+            }
+            for (auto&& v : _snapshot->versions()) {
+                _range_tombstones.apply(v.partition().row_tombstones(), *_current_ck_range);
+            }
        }

        for (auto&& v : _snapshot->versions()) {
@@ -117,14 +128,27 @@ private:
        boost::range::make_heap(_clustering_rows, _heap_cmp);
    }

-    void pop_clustering_row() {
+    // Valid if has_more_rows()
+    const rows_entry& pop_clustering_row() {
        boost::range::pop_heap(_clustering_rows, _heap_cmp);
        auto& current = _clustering_rows.back();
+        const rows_entry& e = *current._position;
        current._position = std::next(current._position);
        if (current._position == current._end) {
            _clustering_rows.pop_back();
        } else {
            boost::range::push_heap(_clustering_rows, _heap_cmp);
        }
+        return e;
    }

+    // Valid if has_more_rows()
+    const rows_entry& peek_row() const {
+        return *_clustering_rows.front()._position;
+    }
+
    bool has_more_rows() const {
        return !_clustering_rows.empty();
    }

    mutation_fragment_opt read_static_row() {
@@ -143,20 +167,18 @@ private:
    }

    mutation_fragment_opt read_next() {
-        if (!_clustering_rows.empty()) {
-            auto mf = _range_tombstones.get_next(*_clustering_rows.front()._position);
+        while (has_more_rows()) {
+            auto mf = _range_tombstones.get_next(peek_row());
            if (mf) {
                return mf;
            }
-
-            boost::range::pop_heap(_clustering_rows, _heap_cmp);
-            clustering_row result = *_clustering_rows.back()._position;
-            pop_clustering_row();
-            while (!_clustering_rows.empty() && _eq(_clustering_rows.front()._position->key(), result.key())) {
-                boost::range::pop_heap(_clustering_rows, _heap_cmp);
-                auto& current = _clustering_rows.back();
-                result.apply(*_schema, *current._position);
-                pop_clustering_row();
+            const rows_entry& e = pop_clustering_row();
+            if (e.dummy()) {
+                continue;
+            }
+            clustering_row result = e;
+            while (has_more_rows() && _eq(peek_row().position(), result.position())) {
+                result.apply(*_schema, pop_clustering_row());
            }
            _last_entry = position_in_partition(result.position());
            return mutation_fragment(std::move(result));
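
read_next() is a k-way merge: each version contributes a sorted stream of entries, a heap yields the globally smallest position, and entries with equal positions are combined into one result row. A standalone sketch of that merge with plain ints (a simplification: deduplication stands in for clustering_row::apply):

#include <algorithm>
#include <cassert>
#include <vector>

using stream = std::vector<int>;
struct pos { stream::const_iterator it, end; };

int main() {
    stream v0{1, 3}, v1{1, 2}; // two sorted per-version streams
    std::vector<pos> heap{{v0.begin(), v0.end()}, {v1.begin(), v1.end()}};
    // Inverting the comparison turns std::make_heap's max-heap into a min-heap.
    auto greater = [](const pos& a, const pos& b) { return *a.it > *b.it; };
    std::make_heap(heap.begin(), heap.end(), greater);

    std::vector<int> merged;
    while (!heap.empty()) {
        std::pop_heap(heap.begin(), heap.end(), greater);
        int key = *heap.back().it;
        if (++heap.back().it == heap.back().end) {
            heap.pop_back();
        } else {
            std::push_heap(heap.begin(), heap.end(), greater);
        }
        // Combine duplicates of the same key coming from other versions.
        if (merged.empty() || merged.back() != key) {
            merged.push_back(key);
        }
    }
    assert((merged == std::vector<int>{1, 2, 3}));
}
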
@@ -184,18 +206,13 @@ private:
    }

    while (!is_end_of_stream() && !is_buffer_full()) {
-        if (_in_ck_range && _clustering_rows.empty()) {
-            _in_ck_range = false;
-            _current_ck_range = std::next(_current_ck_range);
-            refresh_iterators();
-            continue;
-        }
-
        auto mfopt = read_next();
        if (mfopt) {
            emplace_mutation_fragment(std::move(*mfopt));
        } else {
-            _end_of_stream = true;
+            _in_ck_range = false;
+            _current_ck_range = std::next(_current_ck_range);
+            refresh_iterators();
        }
    }
}
@@ -226,31 +243,11 @@ public:
        , _range_tombstones(*s)
        , _lsa_region(region)
        , _read_section(read_section) {
-        for (auto&& v : _snapshot->versions()) {
-            auto&& rt_list = v.partition().row_tombstones();
-            for (auto&& range : _ck_ranges.ranges()) {
-                _range_tombstones.apply(rt_list, range);
-            }
-        }
        do_fill_buffer();
    }

    ~partition_snapshot_reader() {
-        if (!_snapshot.owned()) {
-            return;
-        }
-        // If no one else is using this particular snapshot try to merge partition
-        // versions.
-        with_allocator(_lsa_region.allocator(), [this] {
-            return with_linearized_managed_bytes([this] {
-                try {
-                    _read_section(_lsa_region, [this] {
-                        _snapshot->merge_partition_versions();
-                    });
-                } catch (...) { }
-                _snapshot = {};
-            });
-        });
+        maybe_merge_versions(_snapshot, _lsa_region, _read_section);
    }

    virtual future<> fill_buffer() override {
208
partition_snapshot_row_cursor.hh
Normal file
@@ -0,0 +1,208 @@
/*
 * Copyright (C) 2017 ScyllaDB
 */

/*
 * This file is part of Scylla.
 *
 * Scylla is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Scylla is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Scylla. If not, see <http://www.gnu.org/licenses/>.
 */

#pragma once

#include "partition_version.hh"

// Allows iterating over rows of the mutation_partition represented by a given partition_snapshot.
//
// The cursor initially has a position before all rows and is not pointing at any row.
// To position the cursor, use advance_to().
//
// All methods should be called with the region of the snapshot locked. The cursor is invalidated
// when that lock section is left, or if the snapshot is modified.
//
// When the cursor is invalidated, it still maintains its previous position. It can be brought
// back to validity by calling maybe_refresh() or advance_to().
//
class partition_snapshot_row_cursor final {
    struct position_in_version {
        mutation_partition::rows_type::iterator it;
        mutation_partition::rows_type::iterator end;
        int version_no;

        struct less_compare {
            rows_entry::tri_compare _cmp;
        public:
            explicit less_compare(const schema& s) : _cmp(s) { }
            bool operator()(const position_in_version& a, const position_in_version& b) {
                auto res = _cmp(*a.it, *b.it);
                return res > 0 || (res == 0 && a.version_no > b.version_no);
            }
        };
    };

    const schema& _schema;
    logalloc::region& _region;
    partition_snapshot& _snp;
    std::vector<position_in_version> _heap;
    std::vector<position_in_version> _current_row;
    position_in_partition _position;
    uint64_t _last_reclaim_count = 0;
    size_t _last_versions_count = 0;

    // Removes the next row from _heap and puts it into _current_row
    void recreate_current_row() {
        position_in_version::less_compare heap_less(_schema);
        position_in_partition::equal_compare eq(_schema);
        do {
            boost::range::pop_heap(_heap, heap_less);
            _current_row.push_back(_heap.back());
            _heap.pop_back();
        } while (!_heap.empty() && eq(_current_row[0].it->position(), _heap[0].it->position()));
        _position = position_in_partition(_current_row[0].it->position());
    }
public:
    partition_snapshot_row_cursor(const schema& s, logalloc::region& region, partition_snapshot& snp)
        : _schema(s)
        , _region(region)
        , _snp(snp)
        , _position(position_in_partition::static_row_tag_t{})
    { }
    bool has_up_to_date_row_from_latest_version() const {
        return up_to_date() && _current_row[0].version_no == 0;
    }
    mutation_partition::rows_type::iterator get_iterator_in_latest_version() const {
        return _current_row[0].it;
    }
    bool up_to_date() const {
        return _region.reclaim_counter() == _last_reclaim_count && _last_versions_count == _snp.version_count();
    }

    // Brings the cursor back to validity.
    // Can only be called when the cursor is pointing at a row.
    //
    // Semantically equivalent to:
    //
    //   advance_to(position());
    //
    // but avoids work if not necessary.
    bool maybe_refresh() {
        if (!up_to_date()) {
            return advance_to(_position);
        }
        return true;
    }

    // Moves the cursor to the first entry with position >= pos.
    //
    // The caller must ensure that such an entry exists.
    //
    // Returns true iff there can't be any clustering row entries
    // between lower_bound (inclusive) and the entry to which the cursor
    // was advanced.
    //
    // May be called when the cursor is not valid.
    // The cursor is valid after the call.
    // Must be called under the reclaim lock.
    bool advance_to(position_in_partition_view lower_bound) {
        rows_entry::compare less(_schema);
        position_in_version::less_compare heap_less(_schema);
        _heap.clear();
        _current_row.clear();
        int version_no = 0;
        for (auto&& v : _snp.versions()) {
            auto& rows = v.partition().clustered_rows();
            auto pos = rows.lower_bound(lower_bound, less);
            auto end = rows.end();
            if (pos != end) {
                _heap.push_back({pos, end, version_no});
            }
            ++version_no;
        }
        boost::range::make_heap(_heap, heap_less);
        _last_reclaim_count = _region.reclaim_counter();
        _last_versions_count = _snp.version_count();
        bool found = no_clustering_row_between(_schema, lower_bound, _heap[0].it->position());
        recreate_current_row();
        return found;
    }

    // Advances the cursor to the next row.
    // If there is no next row, returns false and the cursor is no longer pointing at a row.
    // Can only be called on a valid cursor pointing at a row.
    bool next() {
        position_in_version::less_compare heap_less(_schema);
        assert(up_to_date());
        for (auto&& curr : _current_row) {
            ++curr.it;
            if (curr.it != curr.end) {
                _heap.push_back(curr);
                boost::range::push_heap(_heap, heap_less);
            }
        }
        _current_row.clear();
        if (_heap.empty()) {
            return false;
        }
        recreate_current_row();
        return true;
    }

    // Can be called only when the cursor is valid and pointing at a row.
    bool continuous() const { return bool(_current_row[0].it->continuous()); }

    // Can be called only when the cursor is valid and pointing at a row.
    bool dummy() const { return bool(_current_row[0].it->dummy()); }

    // Can be called only when the cursor is valid and pointing at a row, and !dummy().
    const clustering_key& key() const { return _current_row[0].it->key(); }

    // Can be called only when the cursor is valid and pointing at a row.
    clustering_row row() const {
        clustering_row result(key());
        for (auto&& v : _current_row) {
            result.apply(_schema, *v.it);
        }
        return result;
    }

    // Can be called when the cursor is pointing at a row, even when invalid.
    const position_in_partition& position() const {
        return _position;
    }

    bool is_in_latest_version() const;
    bool previous_row_in_latest_version_has_key(const clustering_key_prefix& key) const;
    void set_continuous(bool val);
};

inline
bool partition_snapshot_row_cursor::is_in_latest_version() const {
    return _current_row[0].version_no == 0;
}

inline
bool partition_snapshot_row_cursor::previous_row_in_latest_version_has_key(const clustering_key_prefix& key) const {
    if (_current_row[0].it == _snp.version()->partition().clustered_rows().begin()) {
        return false;
    }
    auto prev_it = _current_row[0].it;
    --prev_it;
    clustering_key_prefix::tri_compare tri_comp(_schema);
    return tri_comp(prev_it->key(), key) == 0;
}

inline
void partition_snapshot_row_cursor::set_continuous(bool val) {
    _current_row[0].it->set_continuous(val);
}
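
The protocol the cursor documents above — a remembered position that survives invalidation, with maybe_refresh() re-seeking only when the underlying state changed — can be modeled standalone. A sketch under simplified assumptions (a std::set of int keys and one generation counter in place of reclaim counts and version counts; cursor_model is hypothetical):

#include <cassert>
#include <cstdint>
#include <set>

struct cursor_model {
    const std::set<int>& rows;
    uint64_t seen_generation;
    const uint64_t& generation; // bumped on any modification of rows
    int position = -1;

    bool up_to_date() const { return seen_generation == generation; }

    // Seek to the first key >= lower_bound; this (re)validates the cursor.
    bool advance_to(int lower_bound) {
        auto it = rows.lower_bound(lower_bound);
        if (it == rows.end()) return false;
        position = *it;
        seen_generation = generation;
        return true;
    }

    // Cheap when still valid; re-seeks to the remembered position otherwise.
    bool maybe_refresh() {
        return up_to_date() ? true : advance_to(position);
    }
};

int main() {
    std::set<int> rows{1, 3, 5};
    uint64_t generation = 0;
    cursor_model c{rows, generation, generation};
    c.advance_to(2);
    assert(c.position == 3);
    rows.insert(4);            // a concurrent update...
    ++generation;              // ...invalidates the cursor
    assert(c.maybe_refresh()); // re-seeks using the remembered position
    assert(c.position == 3);
}
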
@@ -20,6 +20,7 @@
 */

#include <boost/range/algorithm/heap_algorithm.hpp>
+#include <seastar/util/defer.hh>

#include "partition_version.hh"
@@ -62,6 +63,72 @@ partition_version::~partition_version()
    }
}

+namespace {
+
+GCC6_CONCEPT(
+
+// A functor which transforms objects from Domain into objects from CoDomain
+template<typename U, typename Domain, typename CoDomain>
+concept bool Mapper() {
+    return requires(U obj, const Domain& src) {
+        { obj(src) } -> const CoDomain&;
+    };
+}
+
+// A functor which merges two objects from Domain into one. The result is stored in the first argument.
+template<typename U, typename Domain>
+concept bool Reducer() {
+    return requires(U obj, Domain& dst, const Domain& src) {
+        { obj(dst, src) } -> void;
+    };
+}
+
+)
+
+// Calculates the value of a particular part of the mutation_partition represented by
+// the version chain starting at v.
+// |map| extracts the part from each version.
+// |reduce| combines parts from two versions.
+template <typename Result, typename Map, typename Reduce>
+GCC6_CONCEPT(
+    requires Mapper<Map, mutation_partition, Result>() && Reducer<Reduce, Result>()
+)
+inline Result squashed(const partition_version_ref& v, Map&& map, Reduce&& reduce) {
+    Result r = map(v->partition());
+    auto it = v->next();
+    while (it) {
+        reduce(r, map(it->partition()));
+        it = it->next();
+    }
+    return r;
+}
+
+}
+
+row partition_snapshot::static_row() const {
+    return ::squashed<row>(version(),
+        [] (const mutation_partition& mp) -> const row& { return mp.static_row(); },
+        [this] (row& a, const row& b) { a.apply(*_schema, column_kind::static_column, b); });
+}
+
+tombstone partition_snapshot::partition_tombstone() const {
+    return ::squashed<tombstone>(version(),
+        [] (const mutation_partition& mp) { return mp.partition_tombstone(); },
+        [] (tombstone& a, tombstone b) { a.apply(b); });
+}
+
+mutation_partition partition_snapshot::squashed() const {
+    return ::squashed<mutation_partition>(version(),
+        [] (const mutation_partition& mp) -> const mutation_partition& { return mp; },
+        [this] (mutation_partition& a, const mutation_partition& b) { a.apply(*_schema, b, *_schema); });
+}
+
+tombstone partition_entry::partition_tombstone() const {
+    return ::squashed<tombstone>(_version,
+        [] (const mutation_partition& mp) { return mp.partition_tombstone(); },
+        [] (tombstone& a, tombstone b) { a.apply(b); });
+}
+
partition_snapshot::~partition_snapshot() {
    if (_version && _version.is_unique_owner()) {
        auto v = &*_version;
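
squashed() is a map/reduce fold over the singly-linked version chain, as the four callers above show. The same shape in a self-contained form (toy node type, not partition_version):

#include <cassert>

struct node {
    int value;
    node* next;
};

// Walk a singly-linked chain of versions: map each node to a partial value,
// then reduce left to right into one result.
template <typename Result, typename Map, typename Reduce>
Result squashed(const node* head, Map&& map, Reduce&& reduce) {
    Result r = map(*head);
    for (const node* it = head->next; it; it = it->next) {
        reduce(r, map(*it));
    }
    return r;
}

int main() {
    node c{3, nullptr}, b{2, &c}, a{1, &b}; // newest version first
    auto sum = squashed<int>(&a,
        [](const node& n) { return n.value; },
        [](int& acc, int v) { acc += v; });
    assert(sum == 6); // 1 + 2 + 3, folded newest-to-oldest
}
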
@@ -139,20 +206,6 @@ void partition_entry::set_version(partition_version* new_version)
    _version = partition_version_ref(*new_version);
}

-void partition_entry::apply(const schema& s, partition_version* pv, const schema& pv_schema)
-{
-    if (!_snapshot) {
-        _version->partition().apply(s, std::move(pv->partition()), pv_schema);
-        current_allocator().destroy(pv);
-    } else {
-        if (s.version() != pv_schema.version()) {
-            pv->partition().upgrade(pv_schema, s);
-        }
-        pv->insert_before(*_version);
-        set_version(pv);
-    }
-}
-
void partition_entry::apply(const schema& s, const mutation_partition& mp, const schema& mp_schema)
{
    if (!_snapshot) {
@@ -169,22 +222,6 @@ void partition_entry::apply(const schema& s, const mutation_partition& mp, const
    }
}

-void partition_entry::apply(const schema& s, mutation_partition&& mp, const schema& mp_schema)
-{
-    if (!_snapshot) {
-        _version->partition().apply(s, std::move(mp), mp_schema);
-    } else {
-        if (s.version() != mp_schema.version()) {
-            apply(s, mp, mp_schema);
-        } else {
-            auto new_version = current_allocator().construct<partition_version>(std::move(mp));
-            new_version->insert_before(*_version);
-
-            set_version(new_version);
-        }
-    }
-}
-
void partition_entry::apply(const schema& s, mutation_partition_view mpv, const schema& mp_schema)
{
    if (!_snapshot) {
@@ -199,75 +236,286 @@ void partition_entry::apply(const schema& s, mutation_partition_view mpv, const
|
||||
}
|
||||
}
|
||||
|
||||
void partition_entry::apply(const schema& s, partition_entry&& pe, const schema& mp_schema)
|
||||
{
|
||||
auto begin = &*pe._version;
|
||||
auto snapshot = pe._snapshot;
|
||||
if (pe._snapshot) {
|
||||
pe._snapshot->_version = std::move(pe._version);
|
||||
pe._snapshot->_entry = nullptr;
|
||||
pe._snapshot = nullptr;
|
||||
// Iterates over all rows in mutation represented by partition_entry.
|
||||
// It abstracts away the fact that rows may be spread across multiple versions.
|
||||
class partition_entry::rows_iterator final {
|
||||
struct version {
|
||||
mutation_partition::rows_type::iterator current_row;
|
||||
mutation_partition::rows_type* rows;
|
||||
bool can_move;
|
||||
struct compare {
|
||||
const rows_entry::tri_compare& _cmp;
|
||||
public:
|
||||
explicit compare(const rows_entry::tri_compare& cmp) : _cmp(cmp) { }
|
||||
bool operator()(const version& a, const version& b) const {
|
||||
return _cmp(*a.current_row, *b.current_row) > 0;
|
||||
}
|
||||
};
|
||||
};
|
||||
const schema& _schema;
|
||||
rows_entry::tri_compare _rows_cmp;
|
||||
rows_entry::compare _rows_less_cmp;
|
||||
version::compare _version_cmp;
|
||||
std::vector<version> _heap;
|
||||
std::vector<version> _current_row;
|
||||
public:
|
||||
rows_iterator(partition_version* version, const schema& schema)
|
||||
: _schema(schema)
|
||||
, _rows_cmp(schema)
|
||||
, _rows_less_cmp(schema)
|
||||
, _version_cmp(_rows_cmp)
|
||||
{
|
||||
bool can_move = true;
|
||||
while (version) {
|
||||
can_move &= !version->is_referenced();
|
||||
auto& rows = version->partition().clustered_rows();
|
||||
if (!rows.empty()) {
|
||||
_heap.push_back({rows.begin(), &rows, can_move});
|
||||
}
|
||||
version = version->next();
|
||||
}
|
||||
boost::range::make_heap(_heap, _version_cmp);
|
||||
move_to_next_row();
|
||||
}
|
||||
pe._version = { };
|
||||
|
||||
auto current = begin;
|
||||
if (!current->next() && !current->is_referenced()) {
|
||||
try {
|
||||
apply(s, current, mp_schema);
|
||||
} catch (...) {
|
||||
pe._version = partition_version_ref(*current);
|
||||
throw;
|
||||
}
|
||||
return;
|
||||
bool done() const {
|
||||
return _current_row.empty();
|
||||
}
|
||||
// Return clustering key of the current row in source.
|
||||
// Valid only when !is_dummy().
|
||||
const clustering_key& key() const {
|
||||
return _current_row[0].current_row->key();
|
||||
}
|
||||
bool is_dummy() const {
|
||||
return bool(_current_row[0].current_row->dummy());
|
||||
}
|
||||
template<typename RowConsumer>
|
||||
void consume_row(RowConsumer&& consumer) {
|
||||
assert(!_current_row.empty());
|
||||
// versions in _current_row are not ordered but it is not a problem
|
||||
// due to the fact that all rows are continuous.
|
||||
for (version& v : _current_row) {
|
||||
if (!v.can_move) {
|
||||
consumer(deletable_row(v.current_row->row()));
|
||||
} else {
|
||||
consumer(std::move(v.current_row->row()));
|
||||
}
|
||||
}
|
||||
}
|
||||
void remove_current_row_when_possible() {
|
||||
assert(!_current_row.empty());
|
||||
auto deleter = current_deleter<rows_entry>();
|
||||
for (version& v : _current_row) {
|
||||
if (v.can_move) {
|
||||
v.rows->erase_and_dispose(v.current_row, deleter);
|
||||
}
|
||||
}
|
||||
}
|
||||
void move_to_next_row() {
|
||||
_current_row.clear();
|
||||
while (!_heap.empty() &&
|
||||
(_current_row.empty() || _rows_cmp(*_current_row[0].current_row, *_heap[0].current_row) == 0)) {
|
||||
boost::range::pop_heap(_heap, _version_cmp);
|
||||
auto& curr = _heap.back();
|
||||
_current_row.push_back({curr.current_row, curr.rows, curr.can_move});
|
||||
++curr.current_row;
|
||||
if (curr.current_row == curr.rows->end()) {
|
||||
_heap.pop_back();
|
||||
} else {
|
||||
boost::range::push_heap(_heap, _version_cmp);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
try {
|
||||
while (current && !current->is_referenced()) {
|
||||
auto next = current->next();
|
||||
apply(s, std::move(current->partition()), mp_schema);
|
||||
// Leave current->partition() valid (albeit empty) in case we throw later.
|
||||
current->partition() = mutation_partition(mp_schema.shared_from_this());
|
||||
current = next;
|
||||
namespace {
|
||||
|
||||
// When applying partition_entry to an incomplete partition_entry this class is used to represent
|
||||
// the target incomplete partition_entry. It encapsulates the logic needed for handling multiple versions.
|
||||
class apply_incomplete_target final {
|
||||
struct version {
|
||||
mutation_partition::rows_type::iterator current_row;
|
||||
mutation_partition::rows_type* rows;
|
||||
size_t version_no;
|
||||
|
||||
struct compare {
|
||||
const rows_entry::tri_compare& _cmp;
|
||||
public:
|
||||
explicit compare(const rows_entry::tri_compare& cmp) : _cmp(cmp) { }
|
||||
bool operator()(const version& a, const version& b) const {
|
||||
auto res = _cmp(*a.current_row, *b.current_row);
|
||||
return res > 0 || (res == 0 && a.version_no > b.version_no);
|
||||
}
|
||||
};
|
||||
};
|
||||
const schema& _schema;
|
||||
partition_entry& _pe;
|
||||
rows_entry::tri_compare _rows_cmp;
|
||||
rows_entry::compare _rows_less_cmp;
|
||||
version::compare _version_cmp;
|
||||
std::vector<version> _heap;
|
||||
mutation_partition::rows_type::iterator _next_in_latest_version;
|
||||
public:
|
||||
apply_incomplete_target(partition_entry& pe, const schema& schema)
|
||||
: _schema(schema)
|
||||
, _pe(pe)
|
||||
, _rows_cmp(schema)
|
||||
, _rows_less_cmp(schema)
|
||||
, _version_cmp(_rows_cmp)
|
||||
{
|
||||
size_t version_no = 0;
|
||||
_next_in_latest_version = pe.version()->partition().clustered_rows().begin();
|
||||
for (auto&& v : pe.version()->elements_from_this()) {
|
||||
if (!v.partition().clustered_rows().empty()) {
|
||||
_heap.push_back({v.partition().clustered_rows().begin(), &v.partition().clustered_rows(), version_no});
|
||||
}
|
||||
++version_no;
|
||||
}
|
||||
while (current) {
|
||||
auto next = current->next();
|
||||
apply(s, current->partition(), mp_schema);
|
||||
current = next;
|
||||
boost::range::make_heap(_heap, _version_cmp);
|
||||
}
|
||||
    // Applies the row from source.
    // Must be called for rows with monotonic keys.
    // Weak exception guarantees. The target and source partitions are left
    // in a state such that the two still commute to the same value on retry.
    void apply(partition_entry::rows_iterator& src) {
        auto&& key = src.key();
        while (!_heap.empty() && _rows_less_cmp(*_heap[0].current_row, key)) {
            boost::range::pop_heap(_heap, _version_cmp);
            auto& curr = _heap.back();
            curr.current_row = curr.rows->lower_bound(key, _rows_less_cmp);
            if (curr.version_no == 0) {
                _next_in_latest_version = curr.current_row;
            }
            if (curr.current_row == curr.rows->end()) {
                _heap.pop_back();
            } else {
                boost::range::push_heap(_heap, _version_cmp);
            }
        }
        if (!_heap.empty()) {
            rows_entry& next_row = *_heap[0].current_row;
            if (_rows_cmp(key, next_row) == 0) {
                if (next_row.dummy()) {
                    return;
                }
            } else if (!next_row.continuous()) {
                return;
            }
        }

        mutation_partition::rows_type& rows = _pe.version()->partition().clustered_rows();
        if (_next_in_latest_version != rows.end() && _rows_cmp(key, *_next_in_latest_version) == 0) {
            src.consume_row([&] (deletable_row&& row) {
                _next_in_latest_version->row().apply(_schema, std::move(row));
            });
        } else {
            auto e = current_allocator().construct<rows_entry>(key);
            e->set_continuous(_heap.empty() ? is_continuous::yes : _heap[0].current_row->continuous());
            rows.insert_before(_next_in_latest_version, *e);
            src.consume_row([&] (deletable_row&& row) {
                e->row().apply(_schema, std::move(row));
            });
        }
    }
};

} // namespace

template<typename Func>
void partition_entry::with_detached_versions(Func&& func) {
    partition_version* current = &*_version;
    auto snapshot = _snapshot;
    if (snapshot) {
        snapshot->_version = std::move(_version);
        snapshot->_entry = nullptr;
        _snapshot = nullptr;
    }
    _version = { };

    auto revert = defer([&] {
        if (snapshot) {
            _snapshot = snapshot;
            snapshot->_entry = this;
            _version = std::move(snapshot->_version);
        } else {
            _version = partition_version_ref(*current);
        }
    });

    func(current);
}
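with_detached_versions() leans on defer() to guarantee the entry is re-attached even when func throws. A minimal sketch of that scope-guard idiom follows; it is an illustrative stand-in, not Seastar's defer().

#include <cassert>
#include <utility>

template <typename Func>
class scope_guard {
    Func _f;
    bool _armed = true;
public:
    explicit scope_guard(Func f) : _f(std::move(f)) {}
    ~scope_guard() { if (_armed) _f(); }
    void cancel() { _armed = false; }  // analogous to cancelling a reversible op
};

int main() {
    int state = 0;
    {
        state = 1;                               // "detach"
        scope_guard revert([&] { state = 0; });  // always "re-attach" on scope exit
        // ... work that may throw; revert runs either way ...
    }
    assert(state == 0);
}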

void partition_entry::apply_to_incomplete(const schema& s, partition_entry&& pe, const schema& pe_schema)
{
    if (s.version() != pe_schema.version()) {
        partition_entry entry(pe.squashed(pe_schema.shared_from_this(), s.shared_from_this()));
        entry.with_detached_versions([&] (partition_version* v) {
            apply_to_incomplete(s, v);
        });
    } else {
        pe.with_detached_versions([&] (partition_version* v) {
            apply_to_incomplete(s, v);
        });
    }
}

void partition_entry::apply_to_incomplete(const schema& s, partition_version* version) {
    partition_version& dst = open_version(s);

    bool can_move = true;
    auto current = version;
    bool static_row_continuous = dst.partition().static_row_continuous();
    while (current) {
        can_move &= !current->is_referenced();
        dst.partition().apply(current->partition().partition_tombstone());
        if (static_row_continuous) {
            row& static_row = dst.partition().static_row();
            if (can_move) {
                static_row.apply(s, column_kind::static_column, std::move(current->partition().static_row()));
            } else {
                static_row.apply(s, column_kind::static_column, current->partition().static_row());
            }
        }
        range_tombstone_list& tombstones = dst.partition().row_tombstones();
        if (can_move) {
            tombstones.apply_reversibly(s, current->partition().row_tombstones()).cancel();
        } else {
            tombstones.apply(s, current->partition().row_tombstones());
        }
        current = current->next();
    }

    partition_entry::rows_iterator source(version, s);
    apply_incomplete_target target(*this, s);

    while (!source.done()) {
        if (!source.is_dummy()) {
            target.apply(source);
        }
        source.remove_current_row_when_possible();
        source.move_to_next_row();
    }
}
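The can_move flag above is sticky: it is the AND of !is_referenced() over all versions seen so far, so data is moved out of a version only while no snapshot can still observe it; from the first referenced version onward everything is copied. A toy model of that rule, with hypothetical simplified types:

#include <iostream>
#include <string>
#include <vector>

struct ver { std::string data; bool referenced; };

int main() {
    std::vector<ver> chain = {{"v0", false}, {"v1", true}, {"v2", false}};
    std::vector<std::string> merged;
    bool can_move = true;
    for (auto& v : chain) {
        can_move &= !v.referenced;
        if (can_move) {
            merged.push_back(std::move(v.data)); // steal: no snapshot can see v anymore
        } else {
            merged.push_back(v.data);            // copy: a snapshot may still read v
        }
    }
    std::cout << merged.size() << " versions merged\n"; // v0 moved, v1 and v2 copied
}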

mutation_partition partition_entry::squashed(schema_ptr from, schema_ptr to)
{
    mutation_partition mp(to);
    mp.set_static_row_continuous(_version->partition().static_row_continuous());
    for (auto&& v : _version->all_elements()) {
        mp.apply(*to, v.partition(), *from);
    }
    return mp;
}

mutation_partition partition_entry::squashed(const schema& s)
{
    return squashed(s.shared_from_this(), s.shared_from_this());
}
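squashed() reduces the version chain to one logical value by folding apply() across versions. A toy version of that fold, with std::map standing in for mutation_partition and later versions winning on key conflicts; illustrative only:

#include <iostream>
#include <map>
#include <vector>

using partition = std::map<int, char>;

partition squash(const std::vector<partition>& versions) {
    partition result;
    // versions[0] is the latest; map::insert keeps existing (newer) entries.
    for (const auto& v : versions) {
        result.insert(v.begin(), v.end());
    }
    return result;
}

int main() {
    std::vector<partition> chain = {{{1, 'A'}}, {{1, 'a'}, {2, 'b'}}};
    for (auto&& [k, c] : squash(chain)) {
        std::cout << k << "=" << c << " "; // prints: 1=A 2=b
    }
}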

void partition_entry::upgrade(schema_ptr from, schema_ptr to)
{
    auto new_version = current_allocator().construct<partition_version>(mutation_partition(to));
    new_version->partition().set_static_row_continuous(_version->partition().static_row_continuous());
    try {
        for (auto&& v : _version->all_elements()) {
            new_version->partition().apply(*to, v.partition(), *from);
@@ -282,13 +530,45 @@ void partition_entry::upgrade(schema_ptr from, schema_ptr to)
    remove_or_mark_as_unique_owner(old_version);
}
lw_shared_ptr<partition_snapshot> partition_entry::read(schema_ptr entry_schema)
lw_shared_ptr<partition_snapshot> partition_entry::read(schema_ptr entry_schema, partition_snapshot::phase_type phase)
{
    open_version(*entry_schema, phase);
    if (_snapshot) {
        return _snapshot->shared_from_this();
    } else {
        auto snp = make_lw_shared<partition_snapshot>(entry_schema, this);
        auto snp = make_lw_shared<partition_snapshot>(entry_schema, this, phase);
        _snapshot = snp.get();
        return snp;
    }
}

std::vector<range_tombstone>
partition_snapshot::range_tombstones(const schema& s, position_in_partition_view start, position_in_partition_view end)
{
    range_tombstone_list list(s);
    for (auto&& v : versions()) {
        for (auto&& rt : v.partition().row_tombstones().slice(s, start, end)) {
            list.apply(s, rt);
        }
    }
    return boost::copy_range<std::vector<range_tombstone>>(list);
}

std::ostream& operator<<(std::ostream& out, partition_entry& e) {
    out << "{";
    bool first = true;
    if (e._version) {
        for (const partition_version& v : e.versions()) {
            if (!first) {
                out << ", ";
            }
            if (v.is_referenced()) {
                out << "(*) ";
            }
            out << v.partition();
            first = false;
        }
    }
    out << "}";
    return out;
}

@@ -117,6 +117,8 @@ class partition_version : public anchorless_list_base_hook<partition_version> {

    friend class partition_version_ref;
public:
    explicit partition_version(schema_ptr s) noexcept
        : _partition(std::move(s)) { }
    explicit partition_version(mutation_partition mp) noexcept
        : _partition(std::move(mp)) { }
    partition_version(partition_version&& pv) noexcept;
@@ -126,10 +128,12 @@ public:
    mutation_partition& partition() { return _partition; }
    const mutation_partition& partition() const { return _partition; }

    bool is_referenced() { return _backref; }
    bool is_referenced() const { return _backref; }
    partition_version_ref& back_reference() { return *_backref; }
};

using partition_version_range = anchorless_list_base_hook<partition_version>::range;

class partition_version_ref {
    partition_version* _version = nullptr;
    bool _unique_owner = false;
@@ -160,7 +164,7 @@ public:
        return *this;
    }

    explicit operator bool() { return _version; }
    explicit operator bool() const { return _version; }

    partition_version& operator*() {
        assert(_version);
@@ -170,6 +174,10 @@ public:
        assert(_version);
        return _version;
    }
    const partition_version* operator->() const {
        assert(_version);
        return _version;
    }

    bool is_unique_owner() const { return _unique_owner; }
    void mark_as_unique_owner() { _unique_owner = true; }
@@ -178,15 +186,24 @@ public:
class partition_entry;
class partition_snapshot : public enable_lw_shared_from_this<partition_snapshot> {
public:
    // Only snapshots created with the same value of phase can point to the same version.
    using phase_type = uint64_t;
    static constexpr phase_type default_phase = 0;
    static constexpr phase_type max_phase = std::numeric_limits<phase_type>::max();
private:
    schema_ptr _schema;
    // Either _version or _entry is non-null.
    partition_version_ref _version;
    partition_entry* _entry;
    phase_type _phase;

    friend class partition_entry;
public:
    explicit partition_snapshot(schema_ptr s, partition_entry* entry)
        : _schema(std::move(s)), _entry(entry) { }
    explicit partition_snapshot(schema_ptr s,
                                partition_entry* entry,
                                phase_type phase = default_phase)
        : _schema(std::move(s)), _entry(entry), _phase(phase) { }
    partition_snapshot(const partition_snapshot&) = delete;
    partition_snapshot(partition_snapshot&&) = delete;
    partition_snapshot& operator=(const partition_snapshot&) = delete;
@@ -201,23 +218,48 @@ public:

    partition_version_ref& version();

    auto versions() {
    const partition_version_ref& version() const;

    partition_version_range versions() {
        return version()->elements_from_this();
    }

    unsigned version_count();

    bool at_latest_version() const {
        return _entry != nullptr;
    }

    tombstone partition_tombstone() const;
    row static_row() const;
    mutation_partition squashed() const;
    // Returns range tombstones overlapping with [start, end)
    std::vector<range_tombstone> range_tombstones(const schema& s, position_in_partition_view start, position_in_partition_view end);
};

// Represents mutation_partition with snapshotting support a la MVCC.
//
// Internally the state is represented by an ordered list of mutation_partition
// objects called versions. The logical mutation_partition state represented
// by that chain is equal to reducing the chain using mutation_partition::apply()
// from left (latest version) to right.
class partition_entry {
    partition_snapshot* _snapshot = nullptr;
    partition_version_ref _version;

    friend class partition_snapshot;
    friend class cache_entry;
private:
    // Detaches all versions temporarily around execution of the function.
    // The function receives a partition_version* pointing to the latest version.
    template<typename Func>
    void with_detached_versions(Func&&);

    void set_version(partition_version*);

    void apply(const schema& s, partition_version* pv, const schema& pv_schema);
    void apply_to_incomplete(const schema& s, partition_version* other);
public:
    class rows_iterator;
    partition_entry() = default;
    explicit partition_entry(mutation_partition mp);
    ~partition_entry();
@@ -238,28 +280,68 @@ public:
        return *this;
    }

    partition_version_ref& version() {
        return _version;
    }

    partition_version_range versions() {
        return _version->elements_from_this();
    }

    // Strong exception guarantees.
    // Assumes this instance and mp are fully continuous.
    void apply(const schema& s, const mutation_partition& mp, const schema& mp_schema);

    // Same exception guarantees as:
    //   mutation_partition::apply(const schema&, mutation_partition&&, const schema&)
    void apply(const schema& s, mutation_partition&& mp, const schema& mp_schema);

    // Strong exception guarantees.
    // Assumes this instance and mpv are fully continuous.
    void apply(const schema& s, mutation_partition_view mpv, const schema& mp_schema);

    // Adds the mutation_partition represented by "other" to the one represented
    // by this entry.
    //
    // The argument must be fully continuous.
    //
    // The rules of addition differ from those used by regular
    // mutation_partition addition with regard to continuity. The continuity
    // of the result is the same as in this instance. Information from "other"
    // which is incomplete in this instance is dropped. In other words, this
    // performs set intersection on continuity information, drops information
    // which falls outside of the continuity range, and applies regular merging
    // rules for the rest.
    //
    // Weak exception guarantees.
    // If an exception is thrown, this and pe will be left in some valid states
    // such that if the operation is retried (possibly many times) and eventually
    // succeeds, the result will be as if the first attempt hadn't failed.
    void apply(const schema& s, partition_entry&& pe, const schema& pe_schema);
    void apply_to_incomplete(const schema& s, partition_entry&& pe, const schema& pe_schema);
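A toy model of the continuity rule documented above: the target keeps its own continuity, and information from "other" survives only where the target is continuous (set intersection on continuity); the rest is dropped. Hypothetical simplified types, with a set of keys standing in for continuous ranges:

#include <iostream>
#include <map>
#include <set>

int main() {
    std::map<int, char> target = {{1, 'a'}, {5, 'e'}};
    std::set<int> continuous_keys = {1, 2, 5};  // where the target's info is complete
    std::map<int, char> other = {{2, 'B'}, {3, 'C'}, {5, 'E'}}; // fully continuous

    for (auto&& [k, v] : other) {
        if (continuous_keys.count(k)) {
            target[k] = v;   // merge; the newer value wins in this toy model
        }                    // k == 3 is dropped: the target is incomplete there
    }
    for (auto&& [k, v] : target) {
        std::cout << k << "=" << v << " "; // prints: 1=a 2=B 5=E
    }
}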

    // Ensures that the latest version can be populated with data from the given phase
    // by inserting a new version if necessary.
    // Doesn't affect the value or continuity of the partition.
    // Returns a reference to the new latest version.
    partition_version& open_version(const schema& s, partition_snapshot::phase_type phase = partition_snapshot::max_phase) {
        if (_snapshot && _snapshot->_phase != phase) {
            auto new_version = current_allocator().construct<partition_version>(mutation_partition(s.shared_from_this()));
            new_version->partition().set_static_row_continuous(_version->partition().static_row_continuous());
            new_version->insert_before(*_version);
            set_version(new_version);
            return *new_version;
        }
        return *_version;
    }

    mutation_partition squashed(schema_ptr from, schema_ptr to);
    mutation_partition squashed(const schema&);
    tombstone partition_tombstone() const;

    // Needs to be called with reclaiming disabled.
    void upgrade(schema_ptr from, schema_ptr to);

    lw_shared_ptr<partition_snapshot> read(schema_ptr entry_schema);
    // Snapshots with different values of phase will point to different partition_version objects.
    lw_shared_ptr<partition_snapshot> read(schema_ptr entry_schema,
        partition_snapshot::phase_type phase = partition_snapshot::default_phase);

    friend std::ostream& operator<<(std::ostream& out, partition_entry& e);
};
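A rough model of the phase mechanism above: a snapshot taken in phase N pins the current head version, and a later population in phase M != N first inserts a fresh head version, so the pinned one is never written to. This is a hypothetical simplification; the real code keeps the snapshot attached to the entry rather than dropping it.

#include <cassert>
#include <cstdint>
#include <list>
#include <string>

struct entry {
    std::list<std::string> versions = {"v0"};  // front() is the latest version
    uint64_t snapshot_phase = 0;
    bool has_snapshot = false;

    std::string& open_version(uint64_t phase) {
        if (has_snapshot && snapshot_phase != phase) {
            versions.emplace_front();          // new head; the old one stays frozen
            has_snapshot = false;              // toy shortcut: snapshot no longer pins the head
        }
        return versions.front();
    }
    void read(uint64_t phase) {                // take a snapshot in the given phase
        open_version(phase);
        has_snapshot = true;
        snapshot_phase = phase;
    }
};

int main() {
    entry e;
    e.read(1);                 // snapshot in phase 1 pins "v0"
    e.open_version(2) = "v1";  // write in phase 2: new head, "v0" untouched
    assert(e.versions.size() == 2 && e.versions.back() == "v0");
}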

inline partition_version_ref& partition_snapshot::version()
@@ -270,3 +352,12 @@ inline partition_version_ref& partition_snapshot::version()
        return _entry->_version;
    }
}

inline const partition_version_ref& partition_snapshot::version() const
{
    if (_version) {
        return _version;
    } else {
        return _entry->_version;
    }
}
462
position_in_partition.hh
Normal file
@@ -0,0 +1,462 @@
/*
 * Copyright (C) 2017 ScyllaDB
 */

/*
 * This file is part of Scylla.
 *
 * Scylla is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Scylla is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Scylla. If not, see <http://www.gnu.org/licenses/>.
 */

#pragma once

#include "types.hh"
#include "keys.hh"
#include "clustering_bounds_comparator.hh"
#include "query-request.hh"
inline
lexicographical_relation relation_for_lower_bound(composite_view v) {
    switch (v.last_eoc()) {
    case composite::eoc::start:
    case composite::eoc::none:
        return lexicographical_relation::before_all_prefixed;
    case composite::eoc::end:
        return lexicographical_relation::after_all_prefixed;
    default:
        assert(0);
    }
}

inline
lexicographical_relation relation_for_upper_bound(composite_view v) {
    switch (v.last_eoc()) {
    case composite::eoc::start:
        return lexicographical_relation::before_all_prefixed;
    case composite::eoc::none:
        return lexicographical_relation::before_all_strictly_prefixed;
    case composite::eoc::end:
        return lexicographical_relation::after_all_prefixed;
    default:
        assert(0);
    }
}
class position_in_partition_view {
    friend class position_in_partition;

    int _bound_weight = 0;
    const clustering_key_prefix* _ck; // nullptr for static row
private:
    position_in_partition_view(int bound_weight, const clustering_key_prefix* ck)
        : _bound_weight(bound_weight)
        , _ck(ck)
    { }
    // Returns placement of this position_in_partition relative to *_ck,
    // or lexicographical_relation::at_prefix if !_ck.
    lexicographical_relation relation() const {
        // FIXME: Currently position_range cannot represent a range end bound which
        // includes just the prefix key or a range start which excludes just a prefix key.
        // In both cases we should return lexicographical_relation::before_all_strictly_prefixed here.
        // Refs #1446.
        if (_bound_weight <= 0) {
            return lexicographical_relation::before_all_prefixed;
        } else {
            return lexicographical_relation::after_all_prefixed;
        }
    }
public:
    struct static_row_tag_t { };
    struct clustering_row_tag_t { };
    struct range_tag_t { };
    using range_tombstone_tag_t = range_tag_t;

    position_in_partition_view(static_row_tag_t) : _ck(nullptr) { }
    position_in_partition_view(clustering_row_tag_t, const clustering_key_prefix& ck)
        : _ck(&ck) { }
    position_in_partition_view(const clustering_key_prefix& ck)
        : _ck(&ck) { }
    position_in_partition_view(range_tag_t, bound_view bv)
        : _bound_weight(weight(bv.kind)), _ck(&bv.prefix) { }

    static position_in_partition_view for_range_start(const query::clustering_range& r) {
        return {position_in_partition_view::range_tag_t(), bound_view::from_range_start(r)};
    }

    static position_in_partition_view for_range_end(const query::clustering_range& r) {
        return {position_in_partition_view::range_tag_t(), bound_view::from_range_end(r)};
    }

    static position_in_partition_view before_all_clustered_rows() {
        return {range_tag_t(), bound_view::bottom()};
    }

    static position_in_partition_view after_all_clustered_rows() {
        return {position_in_partition_view::range_tag_t(), bound_view::top()};
    }

    static position_in_partition_view for_static_row() {
        return {static_row_tag_t()};
    }

    static position_in_partition_view for_key(const clustering_key& ck) {
        return {clustering_row_tag_t(), ck};
    }

    static position_in_partition_view after_key(const clustering_key& ck) {
        return {1, &ck};
    }

    bool is_static_row() const { return !_ck; }
    bool is_clustering_row() const { return _ck && !_bound_weight; }

    // Returns true if all fragments that can be seen for the given schema have
    // positions >= this.
    bool is_before_all_fragments(const schema& s) const {
        return !_ck || (!s.has_static_columns() && _bound_weight < 0 && _ck->is_empty(s));
    }

    bool is_after_all_clustered_rows(const schema& s) const {
        return _ck && _ck->is_empty(s) && _bound_weight > 0;
    }

    // Valid when >= before_all_clustered_rows()
    const clustering_key_prefix& key() const {
        return *_ck;
    }

    // Can be called only when !is_static_row() && !is_clustering_row().
    bound_view as_start_bound_view() const {
        assert(_bound_weight != 0);
        return bound_view(*_ck, _bound_weight < 0 ? bound_kind::incl_start : bound_kind::excl_start);
    }

    friend std::ostream& operator<<(std::ostream&, position_in_partition_view);
    friend bool no_clustering_row_between(const schema&, position_in_partition_view, position_in_partition_view);
};
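The bound weight encodes where a position sorts relative to its key: -1 (just before), 0 (at the row), +1 (just after). A toy comparator showing the resulting order, static row < before(k) < row(k) < after(k) < row(k+1), with simplified stand-in types; illustrative only:

#include <cassert>
#include <optional>
#include <vector>

struct pos {
    std::optional<int> key;   // nullopt == static row
    int bound_weight = 0;     // -1 before key, 0 at key, +1 after key
};

int tri_compare(const pos& a, const pos& b) {
    bool aw = bool(a.key), bw = bool(b.key);
    if (!aw || !bw) {
        return int(aw) - int(bw);         // the static row sorts first
    }
    if (*a.key != *b.key) {
        return *a.key < *b.key ? -1 : 1;  // then by clustering key
    }
    return a.bound_weight - b.bound_weight; // then by bound weight
}

int main() {
    std::vector<pos> order = {
        {std::nullopt, 0}, {5, -1}, {5, 0}, {5, 1}, {6, 0}
    };
    for (size_t i = 0; i + 1 < order.size(); ++i) {
        assert(tri_compare(order[i], order[i + 1]) < 0);
    }
}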
class position_in_partition {
    int _bound_weight = 0;
    stdx::optional<clustering_key_prefix> _ck;
public:
    struct static_row_tag_t { };
    struct after_static_row_tag_t { };
    struct clustering_row_tag_t { };
    struct after_clustering_row_tag_t { };
    struct range_tag_t { };
    using range_tombstone_tag_t = range_tag_t;

    explicit position_in_partition(static_row_tag_t) { }
    position_in_partition(clustering_row_tag_t, clustering_key_prefix ck)
        : _ck(std::move(ck)) { }
    position_in_partition(after_clustering_row_tag_t, clustering_key_prefix ck)
        // FIXME: Use lexicographical_relation::before_strictly_prefixed here. Refs #1446
        : _bound_weight(1), _ck(std::move(ck)) { }
    position_in_partition(range_tag_t, bound_view bv)
        : _bound_weight(weight(bv.kind)), _ck(bv.prefix) { }
    position_in_partition(after_static_row_tag_t)
        : position_in_partition(range_tag_t(), bound_view::bottom()) { }
    explicit position_in_partition(position_in_partition_view view)
        : _bound_weight(view._bound_weight)
    {
        if (view._ck) {
            _ck = *view._ck;
        }
    }

    static position_in_partition before_all_clustered_rows() {
        return {position_in_partition::range_tag_t(), bound_view::bottom()};
    }

    static position_in_partition after_all_clustered_rows() {
        return {position_in_partition::range_tag_t(), bound_view::top()};
    }

    static position_in_partition after_key(clustering_key ck) {
        return {after_clustering_row_tag_t(), std::move(ck)};
    }

    static position_in_partition for_key(clustering_key ck) {
        return {clustering_row_tag_t(), std::move(ck)};
    }

    static position_in_partition for_range_start(const query::clustering_range&);
    static position_in_partition for_range_end(const query::clustering_range&);

    bool is_static_row() const { return !_ck; }
    bool is_clustering_row() const { return _ck && !_bound_weight; }

    bool is_after_all_clustered_rows(const schema& s) const {
        return _ck && _ck->is_empty(s) && _bound_weight > 0;
    }

    template<typename Hasher>
    void feed_hash(Hasher& hasher, const schema& s) const {
        ::feed_hash(hasher, _bound_weight);
        if (_ck) {
            ::feed_hash(hasher, true);
            _ck->feed_hash(hasher, s);
        } else {
            ::feed_hash(hasher, false);
        }
    }

    clustering_key_prefix& key() {
        return *_ck;
    }
    const clustering_key_prefix& key() const {
        return *_ck;
    }
    operator position_in_partition_view() const {
        return { _bound_weight, _ck ? &*_ck : nullptr };
    }
    // Defines a total order on the union of position_in_partition and composite objects.
    //
    // The ordering is compatible with position_range (r). The following is satisfied for
    // all cells with name c included by the range:
    //
    //   r.start() <= c < r.end()
    //
    // The ordering on composites given by this is compatible with but weaker than the cell name order.
    //
    // The ordering on position_in_partition given by this is compatible with but weaker than the ordering
    // given by position_in_partition::tri_compare.
    //
    class composite_tri_compare {
        const schema& _s;
    public:
        composite_tri_compare(const schema& s) : _s(s) {}

        int operator()(position_in_partition_view a, position_in_partition_view b) const {
            if (a.is_static_row() || b.is_static_row()) {
                return b.is_static_row() - a.is_static_row();
            }
            auto&& types = _s.clustering_key_type()->types();
            auto cmp = [&] (const data_type& t, bytes_view c1, bytes_view c2) { return t->compare(c1, c2); };
            return lexicographical_tri_compare(types.begin(), types.end(),
                a._ck->begin(_s), a._ck->end(_s),
                b._ck->begin(_s), b._ck->end(_s),
                cmp, a.relation(), b.relation());
        }

        int operator()(position_in_partition_view a, composite_view b) const {
            if (b.empty()) {
                return 1; // a cannot be empty.
            }
            if (a.is_static_row() || b.is_static()) {
                return b.is_static() - a.is_static_row();
            }
            auto&& types = _s.clustering_key_type()->types();
            auto b_values = b.values();
            auto cmp = [&] (const data_type& t, bytes_view c1, bytes_view c2) { return t->compare(c1, c2); };
            return lexicographical_tri_compare(types.begin(), types.end(),
                a._ck->begin(_s), a._ck->end(_s),
                b_values.begin(), b_values.end(),
                cmp, a.relation(), relation_for_lower_bound(b));
        }

        int operator()(composite_view a, position_in_partition_view b) const {
            return -(*this)(b, a);
        }

        int operator()(composite_view a, composite_view b) const {
            if (a.is_static() != b.is_static()) {
                return a.is_static() ? -1 : 1;
            }
            auto&& types = _s.clustering_key_type()->types();
            auto a_values = a.values();
            auto b_values = b.values();
            auto cmp = [&] (const data_type& t, bytes_view c1, bytes_view c2) { return t->compare(c1, c2); };
            return lexicographical_tri_compare(types.begin(), types.end(),
                a_values.begin(), a_values.end(),
                b_values.begin(), b_values.end(),
                cmp,
                relation_for_lower_bound(a),
                relation_for_lower_bound(b));
        }
    };
    // Less comparator giving the same order as composite_tri_compare.
    class composite_less_compare {
        composite_tri_compare _cmp;
    public:
        composite_less_compare(const schema& s) : _cmp(s) {}

        template<typename T, typename U>
        bool operator()(const T& a, const U& b) const {
            return _cmp(a, b) < 0;
        }
    };

    class tri_compare {
        bound_view::tri_compare _cmp;
    private:
        template<typename T, typename U>
        int compare(const T& a, const U& b) const {
            bool a_rt_weight = bool(a._ck);
            bool b_rt_weight = bool(b._ck);
            if (!a_rt_weight || !b_rt_weight) {
                return a_rt_weight - b_rt_weight;
            }
            return _cmp(*a._ck, a._bound_weight, *b._ck, b._bound_weight);
        }
    public:
        tri_compare(const schema& s) : _cmp(s) { }
        int operator()(const position_in_partition& a, const position_in_partition& b) const {
            return compare(a, b);
        }
        int operator()(const position_in_partition_view& a, const position_in_partition_view& b) const {
            return compare(a, b);
        }
        int operator()(const position_in_partition& a, const position_in_partition_view& b) const {
            return compare(a, b);
        }
        int operator()(const position_in_partition_view& a, const position_in_partition& b) const {
            return compare(a, b);
        }
    };
    class less_compare {
        tri_compare _cmp;
    public:
        less_compare(const schema& s) : _cmp(s) { }
        bool operator()(const position_in_partition& a, const position_in_partition& b) const {
            return _cmp(a, b) < 0;
        }
        bool operator()(const position_in_partition_view& a, const position_in_partition_view& b) const {
            return _cmp(a, b) < 0;
        }
        bool operator()(const position_in_partition& a, const position_in_partition_view& b) const {
            return _cmp(a, b) < 0;
        }
        bool operator()(const position_in_partition_view& a, const position_in_partition& b) const {
            return _cmp(a, b) < 0;
        }
    };
    class equal_compare {
        clustering_key_prefix::equality _equal;
        template<typename T, typename U>
        bool compare(const T& a, const U& b) const {
            bool a_rt_weight = bool(a._ck);
            bool b_rt_weight = bool(b._ck);
            return a_rt_weight == b_rt_weight
                   && (!a_rt_weight || (_equal(*a._ck, *b._ck)
                                        && a._bound_weight == b._bound_weight));
        }
    public:
        equal_compare(const schema& s) : _equal(s) { }
        bool operator()(const position_in_partition& a, const position_in_partition& b) const {
            return compare(a, b);
        }
        bool operator()(const position_in_partition_view& a, const position_in_partition_view& b) const {
            return compare(a, b);
        }
        bool operator()(const position_in_partition_view& a, const position_in_partition& b) const {
            return compare(a, b);
        }
        bool operator()(const position_in_partition& a, const position_in_partition_view& b) const {
            return compare(a, b);
        }
    };
    friend std::ostream& operator<<(std::ostream&, const position_in_partition&);
};

inline
position_in_partition position_in_partition::for_range_start(const query::clustering_range& r) {
    return {position_in_partition::range_tag_t(), bound_view::from_range_start(r)};
}

inline
position_in_partition position_in_partition::for_range_end(const query::clustering_range& r) {
    return {position_in_partition::range_tag_t(), bound_view::from_range_end(r)};
}
// Returns true if and only if there can't be any clustering_row with position > a and < b.
// It is assumed that a <= b.
inline
bool no_clustering_row_between(const schema& s, position_in_partition_view a, position_in_partition_view b) {
    clustering_key_prefix::equality eq(s);
    if (a._ck && b._ck) {
        return eq(*a._ck, *b._ck) && (a._bound_weight >= 0 || b._bound_weight <= 0);
    } else {
        return !a._ck && !b._ck;
    }
}

// Includes all position_in_partition objects "p" for which: start <= p < end.
// And only those.
class position_range {
private:
    position_in_partition _start;
    position_in_partition _end;
public:
    static position_range from_range(const query::clustering_range&);

    static position_range for_static_row() {
        return {
            position_in_partition(position_in_partition::static_row_tag_t()),
            position_in_partition(position_in_partition::after_static_row_tag_t())
        };
    }

    static position_range full() {
        return {
            position_in_partition(position_in_partition::static_row_tag_t()),
            position_in_partition::after_all_clustered_rows()
        };
    }

    static position_range all_clustered_rows() {
        return {
            position_in_partition::before_all_clustered_rows(),
            position_in_partition::after_all_clustered_rows()
        };
    }

    position_range(position_range&&) = default;
    position_range& operator=(position_range&&) = default;
    position_range(const position_range&) = default;
    position_range& operator=(const position_range&) = default;

    // Constructs a position_range which covers the same rows as the given clustering_range.
    // A position_range includes a fragment if it includes the position of that fragment.
    position_range(const query::clustering_range&);
    position_range(query::clustering_range&&);

    position_range(position_in_partition start, position_in_partition end)
        : _start(std::move(start))
        , _end(std::move(end))
    { }

    const position_in_partition& start() const& { return _start; }
    position_in_partition&& start() && { return std::move(_start); }
    const position_in_partition& end() const& { return _end; }
    position_in_partition&& end() && { return std::move(_end); }
    bool contains(const schema& s, position_in_partition_view pos) const;
    bool overlaps(const schema& s, position_in_partition_view start, position_in_partition_view end) const;

    friend std::ostream& operator<<(std::ostream&, const position_range&);
};

inline
bool position_range::contains(const schema& s, position_in_partition_view pos) const {
    position_in_partition::less_compare less(s);
    return !less(pos, _start) && less(pos, _end);
}

inline
bool position_range::overlaps(const schema& s, position_in_partition_view start, position_in_partition_view end) const {
    position_in_partition::less_compare less(s);
    return !less(end, _start) && less(start, _end);
}
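A quick check of the half-open semantics above ([start, end)), expressed with the same !less(...) && less(...) pattern; plain ints stand in for positions, illustrative only:

#include <cassert>

bool contains(int start, int end, int pos) {
    return !(pos < start) && (pos < end);
}
bool overlaps(int s1, int e1, int s2, int e2) {
    return !(e2 < s1) && (s2 < e1);
}

int main() {
    assert(contains(2, 5, 2));    // start is included
    assert(!contains(2, 5, 5));   // end is excluded
    assert(overlaps(2, 5, 4, 9)); // shared positions [4, 5)
    assert(!overlaps(2, 5, 6, 9));
}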
6
range.hh
@@ -548,6 +548,12 @@ public:
|
||||
return nonwrapping_range(range_bound<T>(split_point, false), end());
|
||||
}
|
||||
}
|
||||
// Creates a new sub-range which is the intersection of this range and a range starting with "start".
|
||||
// If there is no overlap, returns stdx::nullopt.
|
||||
template<typename Comparator>
|
||||
stdx::optional<nonwrapping_range> trim_front(stdx::optional<bound>&& start, Comparator&& cmp) const {
|
||||
return intersection(nonwrapping_range(std::move(start), {}), cmp);
|
||||
}
|
||||
// Transforms this range into a new range of a different value type
|
||||
// Supplied transformer should transform value of type T (the old type) into value of type U (the new type).
|
||||
template<typename Transformer, typename U = typename std::result_of<Transformer(T)>::type>
|
||||
|
||||
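A small model of trim_front() above: intersecting a range with [start, +inf) keeps only the part at or after start. Plain int intervals stand in for nonwrapping_range<T>; illustrative only.

#include <algorithm>
#include <cassert>
#include <optional>

struct range { int lo, hi; }; // closed interval [lo, hi]

std::optional<range> trim_front(range r, int start) {
    if (start > r.hi) {
        return std::nullopt;               // no overlap
    }
    return range{std::max(r.lo, start), r.hi};
}

int main() {
    auto r = trim_front({1, 10}, 4);
    assert(r && r->lo == 4 && r->hi == 10);
    assert(!trim_front({1, 10}, 11));
}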
@@ -29,11 +29,10 @@
#include "tombstone.hh"
#include "clustering_bounds_comparator.hh"
#include "stdx.hh"
#include "position_in_partition.hh"

namespace bi = boost::intrusive;

class position_in_partition_view;

/**
 * Represents a ranged deletion operation. Can be empty.
 */
@@ -149,6 +148,27 @@ public:
    // is larger than the end bound of this.
    stdx::optional<range_tombstone> apply(const schema& s, range_tombstone&& src);

    // Intersects the range of this tombstone with [pos, +inf) and replaces
    // the range of the tombstone if there is an overlap.
    // Returns true if there is an overlap. When it returns false, the tombstone
    // is not modified.
    //
    // pos must satisfy:
    //   1) before_all_clustered_rows() <= pos
    //   2) !pos.is_clustering_row() - because range_tombstone bounds can't represent such positions
    bool trim_front(const schema& s, position_in_partition_view pos) {
        position_in_partition::less_compare less(s);
        if (!less(pos, end_position())) {
            return false;
        }
        if (less(position(), pos)) {
            bound_view new_start = pos.as_start_bound_view();
            start = new_start.prefix;
            start_kind = new_start.kind;
        }
        return true;
    }

    size_t external_memory_usage() const {
        return start.external_memory_usage() + end.external_memory_usage();
    }
@@ -307,11 +307,46 @@ range_tombstone_list::slice(const schema& s, const query::clustering_range& r) c
        _tombstones.upper_bound(bv_range.second, order_by_start{s}));
}

boost::iterator_range<range_tombstone_list::const_iterator>
range_tombstone_list::slice(const schema& s, position_in_partition_view start, position_in_partition_view end) const {
    struct order_by_end {
        position_in_partition::less_compare less;
        order_by_end(const schema& s) : less(s) {}
        bool operator()(position_in_partition_view v, const range_tombstone& rt) const { return less(v, rt.end_position()); }
        bool operator()(const range_tombstone& rt, position_in_partition_view v) const { return less(rt.end_position(), v); }
    };
    struct order_by_start {
        position_in_partition::less_compare less;
        order_by_start(const schema& s) : less(s) {}
        bool operator()(position_in_partition_view v, const range_tombstone& rt) const { return less(v, rt.position()); }
        bool operator()(const range_tombstone& rt, position_in_partition_view v) const { return less(rt.position(), v); }
    };
    return boost::make_iterator_range(
        _tombstones.upper_bound(start, order_by_end{s}), // end_position() is exclusive, hence upper_bound()
        _tombstones.lower_bound(end, order_by_start{s}));
}
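Why upper_bound on end_position() and lower_bound on position() above: a tombstone overlaps [start, end) iff its exclusive end is greater than start and its start is less than end, and with the list sorted the two binary searches bracket exactly those elements. A toy version with int intervals; illustrative only.

#include <algorithm>
#include <cassert>
#include <vector>

struct rt { int start, end; }; // covers [start, end)

int main() {
    std::vector<rt> list = {{0, 2}, {2, 6}, {6, 9}, {12, 14}}; // sorted by start
    int qs = 3, qe = 12; // query [3, 12)
    auto first = std::upper_bound(list.begin(), list.end(), qs,
        [] (int v, const rt& t) { return v < t.end; });      // first with end > qs
    auto last = std::lower_bound(list.begin(), list.end(), qe,
        [] (const rt& t, int v) { return t.start < v; });    // first with start >= qe
    assert(first == list.begin() + 1 && last == list.begin() + 3); // {2,6} and {6,9}
}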

range_tombstone_list::iterator
range_tombstone_list::erase(const_iterator a, const_iterator b) {
    return _tombstones.erase_and_dispose(a, b, current_deleter<range_tombstone>());
}

void range_tombstone_list::trim(const schema& s, const query::clustering_row_ranges& ranges) {
    range_tombstone_list list(s);
    bound_view::compare less(s);
    for (auto&& range : ranges) {
        auto start = bound_view::from_range_start(range);
        auto end = bound_view::from_range_end(range);
        for (const range_tombstone& rt : slice(s, range)) {
            list.apply(s, range_tombstone(
                std::max(rt.start_bound(), start, less),
                std::min(rt.end_bound(), end, less),
                rt.tomb));
        }
    }
    *this = std::move(list);
}

range_tombstone_list::range_tombstones_type::iterator
range_tombstone_list::reverter::insert(range_tombstones_type::iterator it, range_tombstone& new_rt) {
    _ops.emplace_back(insert_undo_op(new_rt));
@@ -368,3 +403,13 @@ void range_tombstone_list::update_undo_op::undo(const schema& s, range_tombstone
    assert(it != rt_list.end());
    *it = std::move(_old_rt);
}

std::ostream& operator<<(std::ostream& out, const range_tombstone_list& list) {
    return out << "{" << ::join(", ", list) << "}";
}

bool range_tombstone_list::equal(const schema& s, const range_tombstone_list& other) const {
    return boost::equal(_tombstones, other._tombstones, [&s] (auto&& rt1, auto&& rt2) {
        return rt1.equal(s, rt2);
    });
}
@@ -23,6 +23,8 @@

#include "range_tombstone.hh"
#include "query-request.hh"
#include "position_in_partition.hh"
#include <iosfwd>

class range_tombstone_list final {
    using range_tombstones_type = range_tombstone::container_type;
@@ -139,7 +141,12 @@ public:
    tombstone search_tombstone_covering(const schema& s, const clustering_key_prefix& key) const;
    // Returns the range of tombstones which overlap with the given range.
    boost::iterator_range<const_iterator> slice(const schema& s, const query::clustering_range&) const;
    // Returns range tombstones which overlap with [start, end).
    boost::iterator_range<const_iterator> slice(const schema& s, position_in_partition_view start, position_in_partition_view end) const;
    iterator erase(const_iterator, const_iterator);
    // Ensures that every range tombstone is strictly contained within the given clustering ranges.
    // Preserves all information which may be relevant for rows from those ranges.
    void trim(const schema& s, const query::clustering_row_ranges&);
    range_tombstone_list difference(const schema& s, const range_tombstone_list& rt_list) const;
    // Erases the range tombstones for which filter returns true.
    template <typename Pred>
@@ -161,6 +168,9 @@ public:
    void apply(const schema& s, const range_tombstone_list& rt_list);
    // See reversibly_mergeable.hh
    reverter apply_reversibly(const schema& s, range_tombstone_list& rt_list);

    friend std::ostream& operator<<(std::ostream& out, const range_tombstone_list&);
    bool equal(const schema&, const range_tombstone_list&) const;
private:
    void apply_reversibly(const schema& s, clustering_key_prefix start, bound_kind start_kind,
        clustering_key_prefix end, bound_kind end_kind, tombstone tomb, reverter& rev);
212
read_context.hh
Normal file
@@ -0,0 +1,212 @@
/*
 * Copyright (C) 2017 ScyllaDB
 */

/*
 * This file is part of Scylla.
 *
 * Scylla is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Scylla is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Scylla. If not, see <http://www.gnu.org/licenses/>.
 */

#pragma once

#include "schema.hh"
#include "query-request.hh"
#include "streamed_mutation.hh"
#include "partition_version.hh"
#include "tracing/tracing.hh"
#include "row_cache.hh"

namespace cache {
/*
 * Represents a reader over the underlying source.
 * This reader automatically makes sure that it's up to date with all cache updates.
 */
class autoupdating_underlying_reader final {
    row_cache& _cache;
    read_context& _read_context;
    stdx::optional<mutation_reader> _reader;
    utils::phased_barrier::phase_type _reader_creation_phase;
    dht::partition_range _range = { };
    stdx::optional<dht::decorated_key> _last_key;
    stdx::optional<dht::decorated_key> _new_last_key;
public:
    autoupdating_underlying_reader(row_cache& cache, read_context& context)
        : _cache(cache)
        , _read_context(context)
    { }
    // Reads the next partition without changing the mutation source snapshot.
    future<streamed_mutation_opt> read_next_same_phase() {
        _last_key = std::move(_new_last_key);
        return (*_reader)().then([this] (auto&& smopt) {
            if (smopt) {
                _new_last_key = smopt->decorated_key();
            }
            return std::move(smopt);
        });
    }
    future<streamed_mutation_opt> operator()() {
        _last_key = std::move(_new_last_key);
        auto start = population_range_start();
        auto phase = _cache.phase_of(start);
        if (!_reader || _reader_creation_phase != phase) {
            if (_last_key) {
                auto cmp = dht::ring_position_comparator(*_cache._schema);
                auto&& new_range = _range.split_after(*_last_key, cmp);
                if (!new_range) {
                    return make_ready_future<streamed_mutation_opt>(streamed_mutation_opt());
                }
                _range = std::move(*new_range);
                _last_key = {};
            }
            auto& snap = _cache.snapshot_for_phase(phase);
            _reader = _cache.create_underlying_reader(_read_context, snap, _range);
            _reader_creation_phase = phase;
        }
        return (*_reader)().then([this] (auto&& smopt) {
            if (smopt) {
                _new_last_key = smopt->decorated_key();
            }
            return std::move(smopt);
        });
    }
    future<> fast_forward_to(dht::partition_range&& range) {
        auto snapshot_and_phase = _cache.snapshot_of(dht::ring_position_view::for_range_start(_range));
        return fast_forward_to(std::move(range), snapshot_and_phase.snapshot, snapshot_and_phase.phase);
    }
    future<> fast_forward_to(dht::partition_range&& range, mutation_source& snapshot, row_cache::phase_type phase) {
        _range = std::move(range);
        _last_key = { };
        _new_last_key = { };
        if (_reader && _reader_creation_phase == phase) {
            return _reader->fast_forward_to(_range);
        }
        _reader = _cache.create_underlying_reader(_read_context, snapshot, _range);
        _reader_creation_phase = phase;
        return make_ready_future<>();
    }
    utils::phased_barrier::phase_type creation_phase() const {
        assert(_reader);
        return _reader_creation_phase;
    }
    const dht::partition_range& range() const {
        return _range;
    }
    dht::ring_position_view population_range_start() const {
        return _last_key ? dht::ring_position_view::for_after_key(*_last_key)
                         : dht::ring_position_view::for_range_start(_range);
    }
};
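The gist of operator()() above: when the cache has moved to a new phase, the underlying reader is recreated over the remainder of the range, resuming just after the last key already emitted. A synchronous toy model of that behaviour; all names here are hypothetical.

#include <cstdint>
#include <iostream>
#include <optional>
#include <vector>

struct toy_reader {
    const std::vector<int>* data;
    size_t idx = 0;
};

int main() {
    std::vector<int> snapshot_v1 = {1, 2, 3, 4};
    std::vector<int> snapshot_v2 = {1, 2, 30, 40};
    uint64_t current_phase = 1;
    toy_reader r{&snapshot_v1};
    uint64_t reader_phase = 1;
    std::optional<int> last_key;

    auto next = [&] () -> std::optional<int> {
        if (reader_phase != current_phase) {
            // Re-open against the new snapshot, skipping everything already read.
            r = toy_reader{&snapshot_v2};
            while (r.idx < r.data->size() && last_key && (*r.data)[r.idx] <= *last_key) {
                ++r.idx;
            }
            reader_phase = current_phase;
        }
        if (r.idx == r.data->size()) {
            return std::nullopt;
        }
        return last_key = (*r.data)[r.idx++];
    };

    std::cout << *next() << " " << *next() << " ";  // 1 2 (from the old snapshot)
    current_phase = 2;                              // cache updated
    std::cout << *next() << " " << *next() << "\n"; // 30 40 (from the new one)
}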

class read_context final : public enable_lw_shared_from_this<read_context> {
    row_cache& _cache;
    schema_ptr _schema;
    const dht::partition_range& _range;
    const query::partition_slice& _slice;
    const io_priority_class& _pc;
    tracing::trace_state_ptr _trace_state;
    streamed_mutation::forwarding _fwd;
    mutation_reader::forwarding _fwd_mr;
    bool _range_query;
    autoupdating_underlying_reader _underlying;

    // When the reader enters a partition, it must be set up for reading that
    // partition from the underlying mutation source (_sm) in one of two ways:
    //
    //  1) either _underlying is already in that partition, in which case _sm is
    //     set to the stream obtained from it,
    //
    //  2) or _underlying is before the partition, in which case _underlying_snapshot
    //     and _key are set so that _sm can be created on demand.
    //
    streamed_mutation_opt _sm;
    mutation_source_opt _underlying_snapshot;
    dht::partition_range _sm_range;
    stdx::optional<dht::decorated_key> _key;
    row_cache::phase_type _phase;
public:
    read_context(row_cache& cache,
            schema_ptr schema,
            const dht::partition_range& range,
            const query::partition_slice& slice,
            const io_priority_class& pc,
            tracing::trace_state_ptr trace_state,
            streamed_mutation::forwarding fwd,
            mutation_reader::forwarding fwd_mr)
        : _cache(cache)
        , _schema(std::move(schema))
        , _range(range)
        , _slice(slice)
        , _pc(pc)
        , _trace_state(std::move(trace_state))
        , _fwd(fwd)
        , _fwd_mr(fwd_mr)
        , _range_query(!range.is_singular() || !range.start()->value().has_key())
        , _underlying(_cache, *this)
    { }
    read_context(const read_context&) = delete;
    row_cache& cache() { return _cache; }
    const schema_ptr& schema() const { return _schema; }
    const dht::partition_range& range() const { return _range; }
    const query::partition_slice& slice() const { return _slice; }
    const io_priority_class& pc() const { return _pc; }
    tracing::trace_state_ptr trace_state() const { return _trace_state; }
    streamed_mutation::forwarding fwd() const { return _fwd; }
    mutation_reader::forwarding fwd_mr() const { return _fwd_mr; }
    bool is_range_query() const { return _range_query; }
    autoupdating_underlying_reader& underlying() { return _underlying; }
    row_cache::phase_type phase() const { return _phase; }
    const dht::decorated_key& key() const { return _sm->decorated_key(); }
private:
    future<> create_sm();
    future<> ensure_sm_created() {
        if (_sm) {
            return make_ready_future<>();
        }
        return create_sm();
    }
public:
    // Prepares the underlying streamed_mutation to represent dk in the given snapshot.
    // Partitions must be entered with strictly monotonic keys.
    // The key must be after the current range of the underlying() reader.
    // The phase argument must match the snapshot's phase.
    void enter_partition(const dht::decorated_key& dk, mutation_source& snapshot, row_cache::phase_type phase) {
        _phase = phase;
        _sm = {};
        _underlying_snapshot = snapshot;
        _key = dk;
    }
    // Prepares the underlying streamed_mutation to be sm.
    // The phase argument must match the phase of the snapshot used to obtain sm.
    void enter_partition(streamed_mutation&& sm, row_cache::phase_type phase) {
        _phase = phase;
        _sm = std::move(sm);
        _underlying_snapshot = {};
    }
    // Fast forwards the underlying streamed_mutation to the given range.
    future<> fast_forward_to(position_range range) {
        return ensure_sm_created().then([this, range = std::move(range)] () mutable {
            return _sm->fast_forward_to(std::move(range));
        });
    }
    // Gets the next fragment from the underlying streamed_mutation.
    future<mutation_fragment_opt> get_next_fragment() {
        return ensure_sm_created().then([this] {
            return (*_sm)();
        });
    }
};

}
964
row_cache.cc
File diff suppressed because it is too large
289
row_cache.hh
@@ -40,6 +40,16 @@
namespace bi = boost::intrusive;

class row_cache;
class memtable_entry;

namespace cache {

class autoupdating_underlying_reader;
class cache_streamed_mutation;
class read_context;
class lsa_manager;

}
// Intrusive set entry which holds partition data.
//
@@ -60,119 +70,113 @@ class cache_entry {
    // True when we know that there is nothing between this entry and the next one in cache
    struct {
        bool _continuous : 1;
        bool _wide_partition : 1;
        bool _dummy_entry : 1;
    } _flags{};
    lru_link_type _lru_link;
    cache_link_type _cache_link;
    friend class size_calculator;

    streamed_mutation do_read(row_cache&, cache::read_context& reader);
public:
    friend class row_cache;
    friend class cache_tracker;

    struct dummy_entry_tag{};
    struct incomplete_tag{};

    cache_entry(dummy_entry_tag)
        : _key{dht::token(), partition_key::make_empty()}
    {
        _flags._dummy_entry = true;
    }

    struct wide_partition_tag{};

    cache_entry(schema_ptr s, const dht::decorated_key& key, wide_partition_tag)
        : _schema(std::move(s))
        , _key(key)
    {
        _flags._wide_partition = true;
    }
    // Creates an entry which is fully discontinuous, except for the partition tombstone.
    cache_entry(incomplete_tag, schema_ptr s, const dht::decorated_key& key, tombstone t)
        : cache_entry(s, key, mutation_partition::make_incomplete(*s, t))
    { }

    cache_entry(schema_ptr s, const dht::decorated_key& key, const mutation_partition& p)
        : _schema(std::move(s))
        , _key(key)
        , _pe(p)
    { }
    {
        _pe.version()->partition().ensure_last_dummy(*_schema);
    }

    cache_entry(schema_ptr s, dht::decorated_key&& key, mutation_partition&& p) noexcept
        : _schema(std::move(s))
        , _key(std::move(key))
        , _pe(std::move(p))
    { }
    {
        _pe.version()->partition().ensure_last_dummy(*_schema);
    }

    // It is assumed that pe is fully continuous.
    cache_entry(schema_ptr s, dht::decorated_key&& key, partition_entry&& pe) noexcept
        : _schema(std::move(s))
        , _key(std::move(key))
        , _pe(std::move(pe))
    { }
    {
        // If we can assume that _pe is fully continuous, we don't need to check all versions
        // to determine what the continuity is.
        // This doesn't change the value and doesn't invalidate iterators, so it can be called even with a snapshot.
        _pe.version()->partition().ensure_last_dummy(*_schema);
    }

    cache_entry(cache_entry&&) noexcept;

    bool is_evictable() { return _lru_link.is_linked(); }
    const dht::decorated_key& key() const { return _key; }
    dht::ring_position_view position() const {
        if (is_dummy_entry()) {
            return dht::ring_position_view::max();
        }
        return _key;
    }
    const partition_entry& partition() const { return _pe; }
    partition_entry& partition() { return _pe; }
    const schema_ptr& schema() const { return _schema; }
    schema_ptr& schema() { return _schema; }
    // Requires: !wide_partition()
    streamed_mutation read(row_cache&, const schema_ptr&, streamed_mutation::forwarding);
    // Requires: !wide_partition()
    streamed_mutation read(row_cache&, const schema_ptr&, const query::partition_slice&, streamed_mutation::forwarding);
    // May return a disengaged optional if the partition is empty.
    future<streamed_mutation_opt> read_wide(row_cache&, schema_ptr, const query::partition_slice&, const io_priority_class&, streamed_mutation::forwarding);
    streamed_mutation read(row_cache&, cache::read_context& reader);
    streamed_mutation read(row_cache&, cache::read_context& reader, streamed_mutation&& underlying, utils::phased_barrier::phase_type);
    bool continuous() const { return _flags._continuous; }
    void set_continuous(bool value) { _flags._continuous = value; }
    bool wide_partition() const { return _flags._wide_partition; }
    void set_wide_partition() {
        _flags._wide_partition = true;
        _pe = {};
    }

    bool is_dummy_entry() const { return _flags._dummy_entry; }

    struct compare {
        dht::decorated_key::less_comparator _c;
        dht::ring_position_less_comparator _c;

        compare(schema_ptr s)
            : _c(std::move(s))
            : _c(*s)
        {}

        bool operator()(const dht::decorated_key& k1, const cache_entry& k2) const {
            if (k2.is_dummy_entry()) {
                return true;
            }
            return _c(k1, k2._key);
            return _c(k1, k2.position());
        }

        bool operator()(const dht::ring_position& k1, const cache_entry& k2) const {
            if (k2.is_dummy_entry()) {
                return true;
            }
            return _c(k1, k2._key);
        bool operator()(dht::ring_position_view k1, const cache_entry& k2) const {
            return _c(k1, k2.position());
        }

        bool operator()(const cache_entry& k1, const cache_entry& k2) const {
            if (k1.is_dummy_entry()) {
                return false;
            }
            if (k2.is_dummy_entry()) {
                return true;
            }
            return _c(k1._key, k2._key);
            return _c(k1.position(), k2.position());
        }

        bool operator()(const cache_entry& k1, const dht::decorated_key& k2) const {
            if (k1.is_dummy_entry()) {
                return false;
            }
            return _c(k1._key, k2);
            return _c(k1.position(), k2);
        }

        bool operator()(const cache_entry& k1, const dht::ring_position& k2) const {
            if (k1.is_dummy_entry()) {
                return false;
            }
            return _c(k1._key, k2);
        bool operator()(const cache_entry& k1, dht::ring_position_view k2) const {
            return _c(k1.position(), k2);
        }

        bool operator()(dht::ring_position_view k1, dht::ring_position_view k2) const {
            return _c(k1, k2);
        }
    };

    friend std::ostream& operator<<(std::ostream&, cache_entry&);
};
// Tracks accesses and performs eviction of cache entries.
|
||||
@@ -190,23 +194,20 @@ public:
|
||||
struct stats {
|
||||
uint64_t hits;
|
||||
uint64_t misses;
|
||||
uint64_t uncached_wide_partitions;
|
||||
uint64_t wide_partition_mispopulations;
|
||||
uint64_t insertions;
|
||||
uint64_t concurrent_misses_same_key;
|
||||
uint64_t merges;
|
||||
uint64_t evictions;
|
||||
uint64_t wide_partition_evictions;
|
||||
uint64_t removals;
|
||||
uint64_t partitions;
|
||||
uint64_t modification_count;
|
||||
uint64_t mispopulations;
|
||||
};
|
||||
private:
|
||||
stats _stats{};
|
||||
seastar::metrics::metric_groups _metrics;
|
||||
logalloc::region _region;
|
||||
lru_type _lru;
|
||||
lru_type _wide_partition_lru;
|
||||
private:
|
||||
void setup_metrics();
|
||||
public:
|
||||
@@ -215,21 +216,18 @@ public:
|
||||
void clear();
|
||||
void touch(cache_entry&);
|
||||
void insert(cache_entry&);
|
||||
void mark_wide(cache_entry&);
|
||||
void clear_continuity(cache_entry& ce);
|
||||
void on_erase();
|
||||
void on_merge();
|
||||
void on_hit();
|
||||
void on_miss();
|
||||
void on_miss_already_populated();
|
||||
void on_uncached_wide_partition();
|
||||
void on_wide_partition_mispopulation();
|
||||
void on_mispopulate();
|
||||
allocation_strategy& allocator();
|
||||
logalloc::region& region();
|
||||
const logalloc::region& region() const;
|
||||
uint64_t modification_count() const { return _stats.modification_count; }
|
||||
uint64_t partitions() const { return _stats.partitions; }
|
||||
uint64_t uncached_wide_partitions() const { return _stats.uncached_wide_partitions; }
|
||||
const stats& get_stats() const { return _stats; }
|
||||
};
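touch() and the two intrusive lists above (_lru and _wide_partition_lru) implement the usual LRU discipline: an access moves the entry to the front, and eviction reclaims from the back. A minimal sketch of that discipline with boost::intrusive (illustrative only; item is a stand-in for cache_entry):

#include <boost/intrusive/list.hpp>
#include <cassert>

namespace bi = boost::intrusive;

struct item : bi::list_base_hook<bi::link_mode<bi::auto_unlink>> {
    int id;
    explicit item(int i) : id(i) {}
};

using lru_type = bi::list<item, bi::constant_time_size<false>>;

// touch(): move to the front on access.
void touch(lru_type& lru, item& e) {
    e.unlink();
    lru.push_front(e);
}

// evict(): reclaim from the back, which holds the least recently used entry.
item* evict_one(lru_type& lru) {
    if (lru.empty()) {
        return nullptr;
    }
    item& victim = lru.back();
    victim.unlink();
    return &victim;
}

int main() {
    item a(1), b(2);
    lru_type lru;
    lru.push_front(a);
    lru.push_front(b);
    touch(lru, a);              // a becomes most recently used
    assert(evict_one(lru) == &b);
}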

@@ -240,21 +238,27 @@ cache_tracker& global_cache_tracker();

// A data source which wraps another data source such that data obtained from the underlying data source
// is cached in-memory in order to serve queries faster.
//
// To query the underlying data source through cache, use make_reader().
//
// Cache populates itself automatically during misses.
//
// Cache needs to be maintained externally so that it remains consistent with the underlying data source.
// Any incremental change to the underlying data source should result in update() being called on cache.
// Cache represents a snapshot of the underlying mutation source. When the
// underlying mutation source changes, cache needs to be explicitly synchronized
// to the latest snapshot. This is done by calling update() or invalidate().
//
class row_cache final {
public:
    using phase_type = utils::phased_barrier::phase_type;
    using partitions_type = bi::set<cache_entry,
        bi::member_hook<cache_entry, cache_entry::cache_link_type, &cache_entry::_cache_link>,
        bi::constant_time_size<false>, // we need this to have bi::auto_unlink on hooks
        bi::compare<cache_entry::compare>>;
    friend class cache::autoupdating_underlying_reader;
    friend class single_partition_populating_reader;
    friend class cache_entry;
    friend class cache::cache_streamed_mutation;
    friend class cache::lsa_manager;
    friend class cache::read_context;
    friend class partition_range_cursor;
    friend class cache_tester;
public:
    struct stats {
        utils::timed_rate_moving_average hits;
@@ -265,32 +269,52 @@ private:
    stats _stats{};
    schema_ptr _schema;
    partitions_type _partitions; // Cached partitions are complete.
    mutation_source _underlying;
    uint64_t _max_cached_partition_size_in_bytes;

    // Synchronizes populating reads with updates of the underlying data source to ensure that cache
    // remains consistent across flushes with the underlying data source.
    // Readers obtained from the underlying data source in earlier than
    // current phases must not be used to populate the cache, unless they hold
    // a phaser::operation created in the reader's phase of origin. Readers
    // should hold on to a phase only briefly because this inhibits progress of
    // updates. Phase changes occur in update()/clear(), which can be assumed to
    // be asynchronous wrt invoking of the underlying data source.
    utils::phased_barrier _populate_phaser;
    // The snapshots used by cache are versioned. The version number of a snapshot is
    // called the "population phase", or simply "phase". Between updates, cache
    // represents the same snapshot.
    //
    // Update doesn't happen atomically. Before it completes, some entries reflect
    // the old snapshot, while others reflect the new snapshot. After update
    // completes, all entries must reflect the new snapshot. There is a race between the
    // update process and populating reads. Since after the update all entries must
    // reflect the new snapshot, reads using the old snapshot cannot be allowed to
    // insert data which will no longer be reached by the update process. The whole
    // range can therefore be divided into two sub-ranges, one which was already
    // processed by the update and one which hasn't been. Each key can be assigned a
    // population phase which determines to which range it belongs, as well as which
    // snapshot it reflects. The methods snapshot_of() and phase_of() can
    // be used to determine this.
    //
    // In general, reads are allowed to populate a given range only if the phase
    // of the snapshot they use matches the phase of all keys in that range
    // when the population is committed. This guarantees that the range will
    // be reached by the update process or already has been in its entirety.
    // In case of a phase conflict, the current solution is to give up on
    // population. Since the update process is a scan, it's sufficient to
    // check when committing the population that the start and end of the range
    // have the same phase and that it's the same phase as that of the start
    // of the range at the time when reading began.

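The commit rule described above can be stated compactly: a populating read remembers the phase of the snapshot it started from, and commits only if both ends of the populated range still carry that phase. A minimal, self-contained model (not Scylla code; the function name is hypothetical):

#include <cassert>
#include <cstdint>

using phase_type = uint64_t;

// A populating read records the phase of the snapshot it started from and may
// commit its data only if both ends of the populated range still carry that
// phase at commit time, i.e. the update scan has not crossed the range.
bool can_commit_population(phase_type phase_at_read_start,
                           phase_type start_phase_at_commit,
                           phase_type end_phase_at_commit) {
    return start_phase_at_commit == phase_at_read_start
        && end_phase_at_commit == phase_at_read_start;
}

int main() {
    assert(can_commit_population(3, 3, 3));   // range untouched by update(): commit
    assert(!can_commit_population(3, 4, 3));  // update() crossed the range start: give up
}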
    mutation_source _underlying;
    phase_type _underlying_phase = 0;
    mutation_source_opt _prev_snapshot;

    // Positions >= this are using _prev_snapshot, the rest are using _underlying.
    stdx::optional<dht::ring_position> _prev_snapshot_pos;

    snapshot_source _snapshot_source;

    // There can be at most one update in progress.
    seastar::semaphore _update_sem = {1};

    logalloc::allocating_section _update_section;
    logalloc::allocating_section _populate_section;
    logalloc::allocating_section _read_section;
    mutation_reader make_scanning_reader(schema_ptr,
        const dht::partition_range&,
        const io_priority_class& pc,
        const query::partition_slice& slice,
        tracing::trace_state_ptr trace_state,
        streamed_mutation::forwarding,
        mutation_reader::forwarding);
    mutation_reader create_underlying_reader(cache::read_context&, mutation_source&, const dht::partition_range&);
    mutation_reader make_scanning_reader(const dht::partition_range&, lw_shared_ptr<cache::read_context>);
    void on_hit();
    void on_miss();
    void on_uncached_wide_partition();
    void upgrade_entry(cache_entry&);
    void invalidate_locked(const dht::decorated_key&);
    void invalidate_unwrapped(const dht::partition_range&);
@@ -298,13 +322,10 @@ private:
    static thread_local seastar::thread_scheduling_group _update_thread_scheduling_group;

    struct previous_entry_pointer {
        utils::phased_barrier::phase_type _populate_phase;
        stdx::optional<dht::decorated_key> _key;

        void reset(stdx::optional<dht::decorated_key> key, utils::phased_barrier::phase_type populate_phase) {
            _populate_phase = populate_phase;
            _key = std::move(key);
        }
        previous_entry_pointer() = default; // Represents dht::ring_position_view::min()
        previous_entry_pointer(dht::decorated_key key) : _key(std::move(key)) {}

        // TODO: Currently inserting an entry to the cache increases
        // modification counter. That doesn't seem to be necessary and if we
@@ -317,15 +338,55 @@ private:
    // { create(it) } -> partitions_type::iterator;
    // { visit(it) } -> void;
    // }
    void do_find_or_create_entry(const dht::decorated_key& key, const previous_entry_pointer* previous,
    //
    // Must be run under reclaim lock
    cache_entry& do_find_or_create_entry(const dht::decorated_key& key, const previous_entry_pointer* previous,
        CreateEntry&& create_entry, VisitEntry&& visit_entry);

    // Ensures that a partition entry for the given key exists in cache and returns a reference to it.
    // Prepares the entry for reading. "phase" must match the current phase of the entry.
    //
    // Since currently every entry has to have a complete tombstone, it has to be provided here.
    // The entry which is returned will have the tombstone applied to it.
    //
    // Must be run under reclaim lock
    cache_entry& find_or_create(const dht::decorated_key& key, tombstone t, row_cache::phase_type phase, const previous_entry_pointer* previous = nullptr);

    partitions_type::iterator partitions_end() {
        return std::prev(_partitions.end());
    }

    // Only active phases are accepted.
    // Reference valid only until the next deferring point.
    mutation_source& snapshot_for_phase(phase_type);

    // Returns the population phase for the given position in the ring.
    // snapshot_for_phase() can be called to obtain the mutation_source for a given phase, but
    // only until the next deferring point.
    // Should only be called outside update().
    phase_type phase_of(dht::ring_position_view);

    struct snapshot_and_phase {
        mutation_source& snapshot;
        phase_type phase;
    };

    // Optimized version of:
    //
    //   { snapshot_for_phase(phase_of(pos)), phase_of(pos) };
    //
    snapshot_and_phase snapshot_of(dht::ring_position_view pos);

    // Merges the memtable into cache with configurable logic for handling memtable entries.
    // The Updater gets invoked for every entry in the memtable with a lower bound iterator
    // into _partitions (cache_i), and the memtable entry.
    // It is invoked inside an allocating section and in the context of the cache's allocator.
    // All memtable entries will be removed.
    template <typename Updater>
    future<> do_update(memtable& m, Updater func);
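The shape of do_update() can be illustrated with ordinary containers: walk the memtable in key order, find the cache lower bound for each entry, and let the Updater decide what to do, draining the memtable as a side effect. A toy model under those assumptions (std::map stands in for both structures; not Scylla code):

#include <cassert>
#include <map>
#include <string>

// The updater is called with the cache lower-bound iterator and the memtable
// entry, mirroring the Updater contract described above; entries are consumed
// from the memtable as they are processed.
template <typename Updater>
void do_update(std::map<int, std::string>& cache,
               std::map<int, std::string>& memtable,
               Updater update) {
    for (auto it = memtable.begin(); it != memtable.end(); it = memtable.erase(it)) {
        auto cache_i = cache.lower_bound(it->first);
        update(cache, cache_i, *it);
    }
}

int main() {
    std::map<int, std::string> cache{{1, "a"}}, mt{{1, "b"}, {2, "c"}};
    // An updater which merges into existing entries and inserts new ones,
    // roughly what update() does after a memtable flush.
    do_update(cache, mt, [] (auto& c, auto cache_i, auto& e) {
        if (cache_i != c.end() && cache_i->first == e.first) {
            cache_i->second += e.second;    // merge
        } else {
            c.emplace_hint(cache_i, e);     // insert, preserving order
        }
    });
    assert(cache.at(1) == "ab" && cache.at(2) == "c");
    assert(mt.empty());                     // all memtable entries removed
}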
public:
    ~row_cache();
    row_cache(schema_ptr, mutation_source underlying, cache_tracker&, uint64_t _max_cached_partition_size_in_bytes = 10 * 1024 * 1024);
    row_cache(schema_ptr, snapshot_source, cache_tracker&);
    row_cache(row_cache&&) = default;
    row_cache(const row_cache&) = delete;
    row_cache& operator=(row_cache&&) = default;
@@ -344,43 +405,49 @@ public:

    const stats& stats() const { return _stats; }
public:
    // Populate cache from given mutation. The mutation must contain all
    // information there is for its partition in the underlying data sources.
    // Populate cache from the given mutation, which must be fully continuous.
    // Intended to be used only in tests.
    // Can only be called prior to any reads.
    void populate(const mutation& m, const previous_entry_pointer* previous = nullptr);

    // Caches the information that a partition with the given key is wide.
    void mark_partition_as_wide(const dht::decorated_key& key, const previous_entry_pointer* previous = nullptr);

    // Clears the cache.
    // Guarantees that cache will not be populated using readers created
    // before this method was invoked.
    future<> clear();

    // Synchronizes cache with the underlying data source from a memtable which
    // has just been flushed to the underlying data source.
    // The memtable can be queried during the process, but must not be written.
    // After the update is complete, the memtable is empty.
    future<> update(memtable&, partition_presence_checker underlying_negative);

    // Like update(), synchronizes cache with an incremental change to the underlying
    // mutation source, but instead of inserting and merging data, invalidates affected ranges.
    // Can be thought of as a more fine-grained version of invalidate(), which invalidates
    // as few elements as possible.
    future<> update_invalidating(memtable&);

    // Moves the given partition to the front of the LRU if present in cache.
    void touch(const dht::decorated_key&);

    // Removes the given partition from cache.
    // Synchronizes cache with the underlying mutation source
    // by invalidating ranges which were modified. This will force
    // them to be re-read from the underlying mutation source
    // during the next read overlapping with the invalidated ranges.
    //
    // Guarantees that cache will not be populated with the given key
    // using readers created before this method was invoked.
    // The ranges passed to invalidate() must include all
    // data which changed since the last synchronization. Failure
    // to do so may result in reads seeing partial writes,
    // which would violate write atomicity.
    //
    // The key must be kept alive until the method resolves.
    future<> invalidate(const dht::decorated_key& key);
    // Guarantees that readers created after invalidate()
    // completes will see all writes from the underlying
    // mutation source made prior to the call to invalidate().
    future<> invalidate(const dht::decorated_key&);
    future<> invalidate(const dht::partition_range& = query::full_partition_range);
    future<> invalidate(dht::partition_range_vector&&);

    // Removes the given range of partitions from cache.
    // The range can be a wrap-around.
    // Evicts entries from the given range in cache.
    //
    // Guarantees that cache will not be populated with partitions from that range
    // using readers created before this method was invoked.
    //
    // The range must be kept alive until the method resolves.
    future<> invalidate(const dht::partition_range&);
    // Note that this does not synchronize with the underlying source;
    // it is assumed that the underlying source didn't change.
    // If it did, use invalidate() instead.
    void evict(const dht::partition_range& = query::full_partition_range);
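The distinction drawn above is that both evict() and invalidate() drop entries, but only invalidate() fences readers that started against the old underlying source. A toy model of that fencing (not Scylla code; the phase counter stands in for the population phase machinery):

#include <cassert>
#include <cstdint>
#include <map>
#include <string>

struct toy_cache {
    std::map<int, std::string> entries;
    uint64_t phase = 0;

    void evict(int key) { entries.erase(key); }  // source unchanged, no fencing
    void invalidate(int key) {                   // source changed
        entries.erase(key);
        ++phase;                                 // fence stale readers
    }
    // A read may populate only with the phase it started in.
    bool try_populate(int key, std::string value, uint64_t read_phase) {
        if (read_phase != phase) {
            return false;
        }
        entries.emplace(key, std::move(value));
        return true;
    }
};

int main() {
    toy_cache c;
    auto read_phase = c.phase;   // a read starts and snapshots the phase
    c.invalidate(7);             // underlying source changed meanwhile
    assert(!c.try_populate(7, "stale", read_phase));  // stale read is fenced
    c.evict(7);                  // eviction alone does not fence anything
    assert(c.try_populate(7, "fresh", c.phase));
}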

    auto num_entries() const {
        return _partitions.size();
@@ -392,6 +459,8 @@ public:
    void set_schema(schema_ptr) noexcept;
    const schema_ptr& schema() const;

    friend std::ostream& operator<<(std::ostream&, row_cache&);

    friend class just_cache_scanning_reader;
    friend class scanning_and_populating_reader;
    friend class range_populating_reader;

68
schema_upgrader.hh
Normal file
@@ -0,0 +1,68 @@
/*
 * Copyright (C) 2017 ScyllaDB
 */

/*
 * This file is part of Scylla.
 *
 * Scylla is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Scylla is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Scylla. If not, see <http://www.gnu.org/licenses/>.
 */

#pragma once

#include "streamed_mutation.hh"
#include "converting_mutation_partition_applier.hh"

// A StreamedMutationTransformer which transforms the stream to a different schema
class schema_upgrader {
    schema_ptr _prev;
    schema_ptr _new;
private:
    row transform(row&& r, column_kind kind) {
        row new_row;
        r.for_each_cell([&] (column_id id, atomic_cell_or_collection& cell) {
            const column_definition& col = _prev->column_at(kind, id);
            const column_definition* new_col = _new->get_column_definition(col.name());
            if (new_col) {
                converting_mutation_partition_applier::append_cell(new_row, kind, *new_col, col.type, std::move(cell));
            }
        });
        return new_row;
    }
public:
    schema_upgrader(schema_ptr s)
        : _new(std::move(s))
    { }
    schema_ptr operator()(schema_ptr old) {
        _prev = std::move(old);
        return _new;
    }
    mutation_fragment consume(static_row&& row) {
        return mutation_fragment(static_row(transform(std::move(row.cells()), column_kind::static_column)));
    }
    mutation_fragment consume(clustering_row&& row) {
        return mutation_fragment(clustering_row(row.key(), row.tomb(), row.marker(),
            transform(std::move(row.cells()), column_kind::regular_column)));
    }
    mutation_fragment consume(range_tombstone&& rt) {
        return std::move(rt);
    }
    mutation_fragment operator()(mutation_fragment&& mf) {
        return std::move(mf).consume(*this);
    }
};

GCC6_CONCEPT(
static_assert(StreamedMutationTranformer<schema_upgrader>());
)
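A likely usage, given the transform() helper added later in this series: the upgrade is applied lazily, fragment by fragment, as the stream is consumed. Here sm is an assumed streamed_mutation read from cache in its old schema and new_schema an assumed target schema_ptr:

// Illustrative usage only; sm and new_schema are assumptions.
streamed_mutation upgraded = transform(std::move(sm), schema_upgrader(new_schema));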
@@ -2042,9 +2042,10 @@ private:
    virtual void accept_static_cell(column_id, atomic_cell_view) override { }
    virtual void accept_static_cell(column_id, collection_mutation_view) override { }
    virtual void accept_row_tombstone(const range_tombstone&) override { }
    virtual void accept_row(clustering_key_view key, const row_tombstone&, const row_marker&) override {
    virtual void accept_row(position_in_partition_view pos, const row_tombstone&, const row_marker&, is_dummy dummy, is_continuous) override {
        assert(!dummy);
        if (!_is_reversed || !_last_ck) {
            _last_ck = clustering_key(key);
            _last_ck = pos.key();
        }
    }
    virtual void accept_row_cell(column_id id, atomic_cell_view) override { }

@@ -154,6 +154,10 @@ std::ostream& operator<<(std::ostream& os, const mutation_fragment& mf) {
    return os;
}

streamed_mutation make_empty_streamed_mutation(schema_ptr s, dht::decorated_key key, streamed_mutation::forwarding fwd) {
    return streamed_mutation_from_mutation(mutation(std::move(key), std::move(s)), fwd);
}

streamed_mutation streamed_mutation_from_mutation(mutation m, streamed_mutation::forwarding fwd)
{
    class reader final : public streamed_mutation::impl {
@@ -165,10 +169,16 @@ streamed_mutation streamed_mutation_from_mutation(mutation m, streamed_mutation:
    private:
        void prepare_next_clustering_row() {
            auto& crs = _mutation.partition().clustered_rows();
            auto re = crs.unlink_leftmost_without_rebalance();
            if (re) {
            while (true) {
                auto re = crs.unlink_leftmost_without_rebalance();
                if (!re) {
                    break;
                }
                auto re_deleter = defer([re] { current_deleter<rows_entry>()(re); });
                _cr = mutation_fragment(std::move(*re));
                if (!re->dummy()) {
                    _cr = mutation_fragment(std::move(*re));
                    break;
                }
            }
        }
        void prepare_next_range_tombstone() {
@@ -262,6 +272,44 @@ streamed_mutation streamed_mutation_from_mutation(mutation m, streamed_mutation:
    return std::move(sm);
}

streamed_mutation streamed_mutation_from_forwarding_streamed_mutation(streamed_mutation&& sm)
{
    class reader final : public streamed_mutation::impl {
        streamed_mutation _sm;
        bool _static_row_done = false;
    public:
        explicit reader(streamed_mutation&& sm)
            : streamed_mutation::impl(sm.schema(), sm.decorated_key(), sm.partition_tombstone())
            , _sm(std::move(sm))
        { }

        virtual future<> fill_buffer() override {
            if (!_static_row_done) {
                _static_row_done = true;
                return _sm().then([this] (auto&& mf) {
                    if (mf) {
                        this->push_mutation_fragment(std::move(*mf));
                    }
                    return _sm.fast_forward_to(query::clustering_range{}).then([this] {
                        return this->fill_buffer();
                    });
                });
            }
            return do_until([this] { return is_end_of_stream() || is_buffer_full(); }, [this] {
                return _sm().then([this] (auto&& mf) {
                    if (mf) {
                        this->push_mutation_fragment(std::move(*mf));
                    } else {
                        _end_of_stream = true;
                    }
                });
            });
        }
    };

    return make_streamed_mutation<reader>(std::move(sm));
}

streamed_mutation make_forwardable(streamed_mutation m) {
    class reader : public streamed_mutation::impl {
        streamed_mutation _sm;
@@ -472,8 +520,7 @@ mutation_fragment_opt range_tombstone_stream::do_get_next()
mutation_fragment_opt range_tombstone_stream::get_next(const rows_entry& re)
{
    if (!_list.empty()) {
        position_in_partition_view view(position_in_partition_view::clustering_row_tag_t(), re.key());
        return !_cmp(view, _list.begin()->position()) ? do_get_next() : mutation_fragment_opt();
        return !_cmp(re.position(), _list.begin()->position()) ? do_get_next() : mutation_fragment_opt();
    }
    return { };
}
@@ -632,3 +679,7 @@ bool mutation_fragment::relevant_for_range_assuming_after(const schema& s, posit
    // Range tombstones overlapping with the new range are let in
    return is_range_tombstone() && cmp(pos, as_range_tombstone().end_position());
}

std::ostream& operator<<(std::ostream& out, const range_tombstone_stream& rtl) {
    return out << rtl._list;
}

@@ -23,6 +23,7 @@

#include "mutation_partition.hh"
#include "utils/optimized_optional.hh"
#include "position_in_partition.hh"

#include <experimental/optional>

@@ -38,8 +39,6 @@
// mutation_fragment objects. It reflects the order in which content of
// partition appears in the sstables.

class position_in_partition_view;

class clustering_row {
    clustering_key_prefix _ck;
    row_tombstone _t;
@@ -111,6 +110,13 @@ public:
        return sizeof(clustering_row) + external_memory_usage();
    }

    bool equal(const schema& s, const clustering_row& other) const {
        return _ck.equal(s, other._ck)
            && _t == other._t
            && _marker == other._marker
            && _cells.equal(column_kind::regular_column, s, other._cells, s);
    }

    friend std::ostream& operator<<(std::ostream& os, const clustering_row& row);
};

@@ -148,6 +154,10 @@ public:
        return sizeof(static_row) + external_memory_usage();
    }

    bool equal(const schema& s, const static_row& other) const {
        return _cells.equal(column_kind::static_column, s, other._cells, s);
    }

    friend std::ostream& operator<<(std::ostream& is, const static_row& row);
};

@@ -185,9 +195,29 @@ public:
    mutation_fragment(clustering_row&& r);
    mutation_fragment(range_tombstone&& r);

    mutation_fragment(const mutation_fragment&) = delete;
    mutation_fragment(const mutation_fragment& o)
        : _kind(o._kind), _data(std::make_unique<data>()) {
        switch(_kind) {
        case kind::static_row:
            new (&_data->_static_row) static_row(o._data->_static_row);
            break;
        case kind::clustering_row:
            new (&_data->_clustering_row) clustering_row(o._data->_clustering_row);
            break;
        case kind::range_tombstone:
            new (&_data->_range_tombstone) range_tombstone(o._data->_range_tombstone);
            break;
        }
    }
    mutation_fragment(mutation_fragment&& other) = default;
    mutation_fragment& operator=(const mutation_fragment&) = delete;
    mutation_fragment& operator=(const mutation_fragment& other) {
        if (this != &other) {
            mutation_fragment copy(other);
            this->~mutation_fragment();
            new (this) mutation_fragment(std::move(copy));
        }
        return *this;
    }
    mutation_fragment& operator=(mutation_fragment&& other) noexcept {
        if (this != &other) {
            this->~mutation_fragment();
@@ -297,395 +327,24 @@ public:
        return *_data->_size_in_bytes;
    }

    bool equal(const schema& s, const mutation_fragment& other) const {
        if (other._kind != _kind) {
            return false;
        }
        switch(_kind) {
        case kind::static_row:
            return as_static_row().equal(s, other.as_static_row());
        case kind::clustering_row:
            return as_clustering_row().equal(s, other.as_clustering_row());
        case kind::range_tombstone:
            return as_range_tombstone().equal(s, other.as_range_tombstone());
        }
        abort();
    }

    friend std::ostream& operator<<(std::ostream&, const mutation_fragment& mf);
};

std::ostream& operator<<(std::ostream&, mutation_fragment::kind);

std::ostream& operator<<(std::ostream&, const mutation_fragment& mf);

class position_in_partition;

inline
lexicographical_relation relation_for_lower_bound(composite_view v) {
    switch (v.last_eoc()) {
    case composite::eoc::start:
    case composite::eoc::none:
        return lexicographical_relation::before_all_prefixed;
    case composite::eoc::end:
        return lexicographical_relation::after_all_prefixed;
    default:
        assert(0);
    }
}

inline
lexicographical_relation relation_for_upper_bound(composite_view v) {
    switch (v.last_eoc()) {
    case composite::eoc::start:
        return lexicographical_relation::before_all_prefixed;
    case composite::eoc::none:
        return lexicographical_relation::before_all_strictly_prefixed;
    case composite::eoc::end:
        return lexicographical_relation::after_all_prefixed;
    default:
        assert(0);
    }
}

class position_in_partition_view {
    friend class position_in_partition;

    int _bound_weight = 0;
    const clustering_key_prefix* _ck; // nullptr for static row
private:
    position_in_partition_view(int bound_weight, const clustering_key_prefix* ck)
        : _bound_weight(bound_weight)
        , _ck(ck)
    { }
    // Returns placement of this position_in_partition relative to *_ck,
    // or lexicographical_relation::at_prefix if !_ck.
    lexicographical_relation relation() const {
        // FIXME: Currently position_range cannot represent a range end bound which
        // includes just the prefix key or a range start which excludes just a prefix key.
        // In both cases we should return lexicographical_relation::before_all_strictly_prefixed here.
        // Refs #1446.
        if (_bound_weight <= 0) {
            return lexicographical_relation::before_all_prefixed;
        } else {
            return lexicographical_relation::after_all_prefixed;
        }
    }
public:
    struct static_row_tag_t { };
    struct clustering_row_tag_t { };
    struct range_tag_t { };
    using range_tombstone_tag_t = range_tag_t;

    position_in_partition_view(static_row_tag_t) : _ck(nullptr) { }
    position_in_partition_view(clustering_row_tag_t, const clustering_key_prefix& ck)
        : _ck(&ck) { }
    position_in_partition_view(range_tag_t, bound_view bv)
        : _bound_weight(weight(bv.kind)), _ck(&bv.prefix) { }

    static position_in_partition_view for_range_start(const query::clustering_range&);
    static position_in_partition_view for_range_end(const query::clustering_range&);

    static position_in_partition_view before_all_clustered_rows() {
        return {range_tag_t(), bound_view::bottom()};
    }

    static position_in_partition_view for_static_row() {
        return {static_row_tag_t()};
    }

    bool is_static_row() const { return !_ck; }

    // Returns true if all fragments that can be seen for the given schema have
    // positions >= this.
    bool is_before_all_fragments(const schema& s) const {
        return !_ck || (!s.has_static_columns() && _bound_weight < 0 && _ck->is_empty(s));
    }

    friend std::ostream& operator<<(std::ostream&, position_in_partition_view);
};

inline
position_in_partition_view position_in_partition_view::for_range_start(const query::clustering_range& r) {
    return {position_in_partition_view::range_tag_t(), bound_view::from_range_start(r)};
}

inline
position_in_partition_view position_in_partition_view::for_range_end(const query::clustering_range& r) {
    return {position_in_partition_view::range_tag_t(), bound_view::from_range_end(r)};
}

class position_in_partition {
    int _bound_weight = 0;
    stdx::optional<clustering_key_prefix> _ck;
public:
    struct static_row_tag_t { };
    struct after_static_row_tag_t { };
    struct clustering_row_tag_t { };
    struct after_clustering_row_tag_t { };
    struct range_tag_t { };
    using range_tombstone_tag_t = range_tag_t;

    explicit position_in_partition(static_row_tag_t) { }
    position_in_partition(clustering_row_tag_t, clustering_key_prefix ck)
        : _ck(std::move(ck)) { }
    position_in_partition(after_clustering_row_tag_t, clustering_key_prefix ck)
        // FIXME: Use lexicographical_relation::before_strictly_prefixed here. Refs #1446
        : _bound_weight(1), _ck(std::move(ck)) { }
    position_in_partition(range_tag_t, bound_view bv)
        : _bound_weight(weight(bv.kind)), _ck(bv.prefix) { }
    position_in_partition(after_static_row_tag_t) :
        position_in_partition(range_tag_t(), bound_view::bottom()) { }
    explicit position_in_partition(position_in_partition_view view)
        : _bound_weight(view._bound_weight)
    {
        if (view._ck) {
            _ck = *view._ck;
        }
    }

    static position_in_partition before_all_clustered_rows() {
        return {position_in_partition::range_tag_t(), bound_view::bottom()};
    }

    static position_in_partition after_all_clustered_rows() {
        return {position_in_partition::range_tag_t(), bound_view::top()};
    }

    static position_in_partition after_key(clustering_key ck) {
        return {after_clustering_row_tag_t(), std::move(ck)};
    }

    static position_in_partition for_key(clustering_key ck) {
        return {clustering_row_tag_t(), std::move(ck)};
    }

    bool is_static_row() const { return !_ck; }
    bool is_clustering_row() const { return _ck && !_bound_weight; }

    template<typename Hasher>
    void feed_hash(Hasher& hasher, const schema& s) const {
        ::feed_hash(hasher, _bound_weight);
        if (_ck) {
            ::feed_hash(hasher, true);
            _ck->feed_hash(hasher, s);
        } else {
            ::feed_hash(hasher, false);
        }
    }

    clustering_key_prefix& key() {
        return *_ck;
    }
    const clustering_key_prefix& key() const {
        return *_ck;
    }
    operator position_in_partition_view() const {
        return { _bound_weight, _ck ? &*_ck : nullptr };
    }

    // Defines total order on the union of position_in_partition and composite objects.
    //
    // The ordering is compatible with position_range (r). The following is satisfied for
    // all cells with name c included by the range:
    //
    //   r.start() <= c < r.end()
    //
    // The ordering on composites given by this is compatible with but weaker than the cell name order.
    //
    // The ordering on position_in_partition given by this is compatible with but weaker than the ordering
    // given by position_in_partition::tri_compare.
    //
    class composite_tri_compare {
        const schema& _s;
    public:
        composite_tri_compare(const schema& s) : _s(s) {}

        int operator()(position_in_partition_view a, position_in_partition_view b) const {
            if (a.is_static_row() || b.is_static_row()) {
                return b.is_static_row() - a.is_static_row();
            }
            auto&& types = _s.clustering_key_type()->types();
            auto cmp = [&] (const data_type& t, bytes_view c1, bytes_view c2) { return t->compare(c1, c2); };
            return lexicographical_tri_compare(types.begin(), types.end(),
                a._ck->begin(_s), a._ck->end(_s),
                b._ck->begin(_s), b._ck->end(_s),
                cmp, a.relation(), b.relation());
        }

        int operator()(position_in_partition_view a, composite_view b) const {
            if (b.empty()) {
                return 1; // a cannot be empty.
            }
            if (a.is_static_row() || b.is_static()) {
                return b.is_static() - a.is_static_row();
            }
            auto&& types = _s.clustering_key_type()->types();
            auto b_values = b.values();
            auto cmp = [&] (const data_type& t, bytes_view c1, bytes_view c2) { return t->compare(c1, c2); };
            return lexicographical_tri_compare(types.begin(), types.end(),
                a._ck->begin(_s), a._ck->end(_s),
                b_values.begin(), b_values.end(),
                cmp, a.relation(), relation_for_lower_bound(b));
        }

        int operator()(composite_view a, position_in_partition_view b) const {
            return -(*this)(b, a);
        }

        int operator()(composite_view a, composite_view b) const {
            if (a.is_static() != b.is_static()) {
                return a.is_static() ? -1 : 1;
            }
            auto&& types = _s.clustering_key_type()->types();
            auto a_values = a.values();
            auto b_values = b.values();
            auto cmp = [&] (const data_type& t, bytes_view c1, bytes_view c2) { return t->compare(c1, c2); };
            return lexicographical_tri_compare(types.begin(), types.end(),
                a_values.begin(), a_values.end(),
                b_values.begin(), b_values.end(),
                cmp,
                relation_for_lower_bound(a),
                relation_for_lower_bound(b));
        }
    };

    // Less comparator giving the same order as composite_tri_compare.
    class composite_less_compare {
        composite_tri_compare _cmp;
    public:
        composite_less_compare(const schema& s) : _cmp(s) {}

        template<typename T, typename U>
        bool operator()(const T& a, const U& b) const {
            return _cmp(a, b) < 0;
        }
    };

    class tri_compare {
        bound_view::tri_compare _cmp;
    private:
        template<typename T, typename U>
        int compare(const T& a, const U& b) const {
            bool a_rt_weight = bool(a._ck);
            bool b_rt_weight = bool(b._ck);
            if (!a_rt_weight || !b_rt_weight) {
                return a_rt_weight - b_rt_weight;
            }
            return _cmp(*a._ck, a._bound_weight, *b._ck, b._bound_weight);
        }
    public:
        tri_compare(const schema& s) : _cmp(s) { }
        int operator()(const position_in_partition& a, const position_in_partition& b) const {
            return compare(a, b);
        }
        int operator()(const position_in_partition_view& a, const position_in_partition_view& b) const {
            return compare(a, b);
        }
        int operator()(const position_in_partition& a, const position_in_partition_view& b) const {
            return compare(a, b);
        }
        int operator()(const position_in_partition_view& a, const position_in_partition& b) const {
            return compare(a, b);
        }
    };
    class less_compare {
        tri_compare _cmp;
    public:
        less_compare(const schema& s) : _cmp(s) { }
        bool operator()(const position_in_partition& a, const position_in_partition& b) const {
            return _cmp(a, b) < 0;
        }
        bool operator()(const position_in_partition_view& a, const position_in_partition_view& b) const {
            return _cmp(a, b) < 0;
        }
        bool operator()(const position_in_partition& a, const position_in_partition_view& b) const {
            return _cmp(a, b) < 0;
        }
        bool operator()(const position_in_partition_view& a, const position_in_partition& b) const {
            return _cmp(a, b) < 0;
        }
    };
    class equal_compare {
        clustering_key_prefix::equality _equal;
        template<typename T, typename U>
        bool compare(const T& a, const U& b) const {
            bool a_rt_weight = bool(a._ck);
            bool b_rt_weight = bool(b._ck);
            return a_rt_weight == b_rt_weight
                && (!a_rt_weight || (_equal(*a._ck, *b._ck)
                    && a._bound_weight == b._bound_weight));
        }
    public:
        equal_compare(const schema& s) : _equal(s) { }
        bool operator()(const position_in_partition& a, const position_in_partition& b) const {
            return compare(a, b);
        }
        bool operator()(const position_in_partition_view& a, const position_in_partition_view& b) const {
            return compare(a, b);
        }
        bool operator()(const position_in_partition_view& a, const position_in_partition& b) const {
            return compare(a, b);
        }
        bool operator()(const position_in_partition& a, const position_in_partition_view& b) const {
            return compare(a, b);
        }
    };
    friend std::ostream& operator<<(std::ostream&, const position_in_partition&);
};
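The ordering implemented by tri_compare above reduces to comparing (key, bound weight) pairs, with the key-less static-row position sorting first. A self-contained model of just that ordering (not Scylla code; ints stand in for clustering keys):

#include <cassert>
#include <optional>

// A position is (optional key, bound weight); no key means the static row,
// which sorts before everything; weight -1/0/+1 places the position
// before/at/after its key.
struct pos {
    std::optional<int> key;
    int weight = 0;   // meaningful only when key is engaged
};

int tri_compare(const pos& a, const pos& b) {
    bool a_has_key = bool(a.key);
    bool b_has_key = bool(b.key);
    if (!a_has_key || !b_has_key) {
        return a_has_key - b_has_key;   // static row first
    }
    if (*a.key != *b.key) {
        return *a.key < *b.key ? -1 : 1;
    }
    return a.weight - b.weight;         // before < at < after
}

int main() {
    pos static_row{};                   // no key
    pos before_5{5, -1}, at_5{5, 0}, after_5{5, 1};
    assert(tri_compare(static_row, before_5) < 0);
    assert(tri_compare(before_5, at_5) < 0);
    assert(tri_compare(at_5, after_5) < 0);
    assert(tri_compare(after_5, pos{6, -1}) < 0);
}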

// Includes all position_in_partition objects "p" for which: start <= p < end
// And only those.
class position_range {
private:
    position_in_partition _start;
    position_in_partition _end;
public:
    static position_range from_range(const query::clustering_range&);

    static position_range for_static_row() {
        return {
            position_in_partition(position_in_partition::static_row_tag_t()),
            position_in_partition(position_in_partition::after_static_row_tag_t())
        };
    }

    static position_range full() {
        return {
            position_in_partition(position_in_partition::static_row_tag_t()),
            position_in_partition::after_all_clustered_rows()
        };
    }

    static position_range all_clustered_rows() {
        return {
            position_in_partition::before_all_clustered_rows(),
            position_in_partition::after_all_clustered_rows()
        };
    }

    position_range(position_range&&) = default;
    position_range& operator=(position_range&&) = default;
    position_range(const position_range&) = default;
    position_range& operator=(const position_range&) = default;

    // Constructs position_range which covers the same rows as given clustering_range.
    // position_range includes a fragment if it includes position of that fragment.
    position_range(const query::clustering_range&);
    position_range(query::clustering_range&&);

    position_range(position_in_partition start, position_in_partition end)
        : _start(std::move(start))
        , _end(std::move(end))
    { }

    const position_in_partition& start() const& { return _start; }
    position_in_partition&& start() && { return std::move(_start); }
    const position_in_partition& end() const& { return _end; }
    position_in_partition&& end() && { return std::move(_end); }
    bool contains(const schema& s, position_in_partition_view pos) const;
    bool overlaps(const schema& s, position_in_partition_view start, position_in_partition_view end) const;

    friend std::ostream& operator<<(std::ostream&, const position_range&);
};

inline
bool position_range::contains(const schema& s, position_in_partition_view pos) const {
    position_in_partition::less_compare less(s);
    return !less(pos, _start) && less(pos, _end);
}

inline
bool position_range::overlaps(const schema& s, position_in_partition_view start, position_in_partition_view end) const {
    position_in_partition::less_compare less(s);
    return !less(end, _start) && less(start, _end);
}
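contains() above makes position_range half-open, start <= p < end, which is why a range ending at after_key(k) still includes row k itself. Continuing the toy model from the previous sketch (definitions repeated so this compiles alone; not Scylla code):

#include <cassert>
#include <optional>

// pos and tri_compare as in the previous sketch.
struct pos { std::optional<int> key; int weight = 0; };
int tri_compare(const pos& a, const pos& b) {
    if (!a.key || !b.key) { return bool(a.key) - bool(b.key); }
    if (*a.key != *b.key) { return *a.key < *b.key ? -1 : 1; }
    return a.weight - b.weight;
}

// Half-open containment, mirroring position_range::contains().
bool contains(const pos& start, const pos& end, const pos& p) {
    return tri_compare(p, start) >= 0 && tri_compare(p, end) < 0;
}

int main() {
    pos start{2, -1}, end{5, 1};                // [before row 2, after row 5)
    assert(contains(start, end, pos{2, 0}));    // row 2 included
    assert(contains(start, end, pos{5, 0}));    // row 5 included
    assert(!contains(start, end, pos{{}, 0}));  // static row is before the range
    assert(!contains(start, end, pos{6, 0}));   // past the end
}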

inline position_in_partition_view static_row::position() const
{
    return position_in_partition_view(position_in_partition_view::static_row_tag_t());
@@ -696,6 +355,10 @@ inline position_in_partition_view clustering_row::position() const
    return position_in_partition_view(position_in_partition_view::clustering_row_tag_t(), _ck);
}

std::ostream& operator<<(std::ostream&, mutation_fragment::kind);

std::ostream& operator<<(std::ostream&, const mutation_fragment& mf);

template<>
struct move_constructor_disengages<mutation_fragment> {
    enum { value = true };
@@ -889,11 +552,14 @@ class mutation;

streamed_mutation streamed_mutation_from_mutation(mutation, streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no);
streamed_mutation streamed_mutation_returning(schema_ptr, dht::decorated_key, std::vector<mutation_fragment>, tombstone t = {});
streamed_mutation streamed_mutation_from_forwarding_streamed_mutation(streamed_mutation&&);

// Requires all streamed_mutations to have the same schema.
streamed_mutation merge_mutations(std::vector<streamed_mutation>);
streamed_mutation reverse_streamed_mutation(streamed_mutation);

streamed_mutation make_empty_streamed_mutation(schema_ptr, dht::decorated_key, streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no);

// range_tombstone_stream is a helper object that simplifies producing a stream
// of range tombstones and merging it with a stream of clustering rows.
// Tombstones are added using apply() and retrieved using get_next().
@@ -935,6 +601,7 @@ public:
    }
    void apply(const range_tombstone_list&, const query::clustering_range&);
    void reset();
    friend std::ostream& operator<<(std::ostream& out, const range_tombstone_stream&);
};

// mutation_hasher is an equivalent of hashing_partition_visitor for
@@ -1049,3 +716,50 @@ public:
        consume_range_tombstones_until_end();
    }
};


GCC6_CONCEPT(
// F gets a stream element as an argument and returns the new value which replaces that element
// in the transformed stream.
template<typename F>
concept bool StreamedMutationTranformer() {
    return requires(F f, mutation_fragment mf, schema_ptr s) {
        { f(std::move(mf)) } -> mutation_fragment;
        { f(s) } -> schema_ptr;
    };
}
)

// Creates a stream which is like sm but with the transformation applied to the elements.
template<typename T>
GCC6_CONCEPT(
    requires StreamedMutationTranformer<T>()
)
streamed_mutation transform(streamed_mutation sm, T t) {
    class reader : public streamed_mutation::impl {
        streamed_mutation _sm;
        T _t;
    public:
        explicit reader(streamed_mutation sm, T&& t)
            : impl(t(sm.schema()), sm.decorated_key(), sm.partition_tombstone())
            , _sm(std::move(sm))
            , _t(std::move(t))
        { }

        virtual future<> fill_buffer() override {
            return _sm.fill_buffer().then([this] {
                while (!_sm.is_buffer_empty()) {
                    push_mutation_fragment(_t(_sm.pop_mutation_fragment()));
                }
                _end_of_stream = _sm.is_end_of_stream();
            });
        }

        virtual future<> fast_forward_to(position_range pr) override {
            _end_of_stream = false;
            forward_buffer_to(pr.start());
            return _sm.fast_forward_to(std::move(pr));
        }
    };
    return make_streamed_mutation<reader>(std::move(sm), std::move(t));
}

2
test.py
@@ -53,6 +53,7 @@ boost_tests = [
    'canonical_mutation_test',
    'gossiping_property_file_snitch_test',
    'row_cache_test',
    'cache_streamed_mutation_test',
    'network_topology_strategy_test',
    'query_processor_test',
    'batchlog_manager_test',
@@ -150,6 +151,7 @@ if __name__ == "__main__":
                                 '-c1 -m1G'.split()))
        test_to_run.append(('build/release/tests/sstable_test', 'boost', ['-c1']))
        test_to_run.append(('build/release/tests/view_schema_test', 'boost', ['-c1']))
        test_to_run.append(('build/release/tests/row_cache_stress_test', 'other', '-c1 -m1G --seconds 10'.split()))
    if 'debug' in modes_to_run:
        test_to_run.append(('build/debug/tests/sstable_test', 'boost', ['-c1']))
        test_to_run.append(('build/debug/tests/view_schema_test', 'boost', ['-c1']))

1272
tests/cache_streamed_mutation_test.cc
Normal file
File diff suppressed because it is too large
@@ -175,7 +175,7 @@ static sizes calculate_sizes(const mutation& m) {
    auto s = m.schema();
    auto mt = make_lw_shared<memtable>(s);
    cache_tracker tracker;
    row_cache cache(s, mt->as_data_source(), tracker);
    row_cache cache(s, make_empty_snapshot_source(), tracker);

    auto cache_initial_occupancy = tracker.region().occupancy().used_space();

129
tests/memtable_snapshot_source.hh
Normal file
@@ -0,0 +1,129 @@
/*
 * Copyright (C) 2017 ScyllaDB
 */

/*
 * This file is part of Scylla.
 *
 * Scylla is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Scylla is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Scylla. If not, see <http://www.gnu.org/licenses/>.
 */

#pragma once

#include "mutation_reader.hh"
#include "memtable.hh"
#include "utils/phased_barrier.hh"
#include <seastar/core/circular_buffer.hh>
#include <seastar/core/thread.hh>
#include <seastar/core/condition-variable.hh>

// In-memory snapshottable mutation source.
// Must be destroyed in a seastar thread.
class memtable_snapshot_source {
    schema_ptr _s;
    circular_buffer<lw_shared_ptr<memtable>> _memtables;
    utils::phased_barrier _apply;
    bool _closed = false;
    seastar::condition_variable _should_compact;
    future<> _compactor;
private:
    bool should_compact() const {
        return !_closed && _memtables.size() >= 3;
    }
    lw_shared_ptr<memtable> new_memtable() {
        return make_lw_shared<memtable>(_s);
    }
    lw_shared_ptr<memtable> pending() {
        if (_memtables.empty()) {
            _memtables.push_back(new_memtable());
            on_new_memtable();
        }
        return _memtables.back();
    }
    void on_new_memtable() {
        if (should_compact()) {
            _should_compact.signal();
        }
    }
    void compact() {
        if (_memtables.empty()) {
            return;
        }
        auto count = _memtables.size();
        auto op = _apply.start();
        auto new_mt = make_lw_shared<memtable>(_memtables.back()->schema());
        std::vector<mutation_reader> readers;
        for (auto&& mt : _memtables) {
            readers.push_back(mt->make_reader(new_mt->schema()));
        }
        auto&& rd = make_combined_reader(std::move(readers));
        consume(rd, [&] (mutation&& m) {
            new_mt->apply(std::move(m));
            return stop_iteration::no;
        }).get();
        _memtables.erase(_memtables.begin(), _memtables.begin() + count);
        _memtables.push_back(new_mt);
    }
public:
    memtable_snapshot_source(schema_ptr s)
        : _s(s)
        , _compactor(seastar::async([this] {
            while (!_closed) {
                _should_compact.wait().get();
                while (should_compact()) {
                    compact();
                }
            }
        }))
    { }
    memtable_snapshot_source(memtable_snapshot_source&&) = delete; // 'this' captured.
    ~memtable_snapshot_source() {
        _closed = true;
        _should_compact.broadcast();
        _compactor.get();
    }
    // Must run in a seastar thread
    void clear() {
        _memtables.erase(_memtables.begin(), _memtables.end());
        _apply.advance_and_await().get();
        _memtables.erase(_memtables.begin(), _memtables.end());
    }
    // Must run in a seastar thread
    void apply(const mutation& mt) {
        pending()->apply(mt);
    }
    // Must run in a seastar thread
    void apply(memtable& mt) {
        auto op = _apply.start();
        auto new_mt = new_memtable();
        new_mt->apply(mt).get();
        _memtables.push_back(new_mt);
    }
    // Must run in a seastar thread
    // mt must not change from now on.
    void apply(lw_shared_ptr<memtable> mt) {
        auto op = _apply.start();
        _memtables.push_back(std::move(mt));
        on_new_memtable();
    }
    mutation_source operator()() {
        std::vector<mutation_source> src;
        for (auto&& mt : _memtables) {
            src.push_back(mt->as_data_source());
        }
        _memtables.push_back(new_memtable()); // so that src won't change any more.
        on_new_memtable();
        return make_combined_mutation_source(std::move(src));
    }
};
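Typical usage, inside a seastar thread and using only the methods shown above (s, m1 and m2 are an assumed schema_ptr and mutations; names are illustrative). Each call to operator() seals the current memtables into a fixed set of sources, so later writes do not leak into earlier snapshots:

memtable_snapshot_source source(s);
source.apply(m1);                    // buffered in the pending memtable
mutation_source snap = source();     // snapshot of everything applied so far
source.apply(m2);                    // later writes don't affect `snap`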
@@ -30,7 +30,12 @@ public:
|
||||
: _m(std::move(m))
|
||||
{ }
|
||||
|
||||
mutation_assertion& is_equal_to(const mutation& other) {
|
||||
// If ck_ranges is passed, verifies only that information relevant for ck_ranges matches.
|
||||
mutation_assertion& is_equal_to(const mutation& other, const query::clustering_row_ranges& ck_ranges = {}) {
|
||||
if (!ck_ranges.empty()) {
|
||||
mutation_assertion(_m.sliced(ck_ranges)).is_equal_to(other.sliced(ck_ranges));
|
||||
return *this;
|
||||
}
|
||||
if (_m != other) {
|
||||
BOOST_FAIL(sprint("Mutations differ, expected %s\n ...but got: %s", other, _m));
|
||||
}
|
||||
@@ -54,6 +59,13 @@ public:
|
||||
return *this;
|
||||
}
|
||||
|
||||
mutation_assertion& has_same_continuity(const mutation& other) {
|
||||
if (!_m.partition().equal_continuity(*_m.schema(), other.partition())) {
|
||||
BOOST_FAIL(sprint("Continuity doesn't match: %s\n ...and: %s", other, _m));
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
// Verifies that mutation data remains unchanged when upgraded to the new schema
|
||||
void is_upgrade_equivalent(schema_ptr new_schema) {
|
||||
mutation m2 = _m;
|
||||
@@ -148,6 +160,25 @@ public:
|
||||
return *this;
|
||||
}
|
||||
|
||||
streamed_mutation_assertions& produces(mutation_fragment mf) {
|
||||
auto mfopt = _sm().get0();
|
||||
if (!mfopt) {
|
||||
BOOST_FAIL(sprint("Expected mutation fragment %s, got end of stream", mf));
|
||||
}
|
||||
if (!mfopt->equal(*_sm.schema(), mf)) {
|
||||
BOOST_FAIL(sprint("Expected %s, but got %s", mf, *mfopt));
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
streamed_mutation_assertions& produces_only(const std::deque<mutation_fragment>& fragments) {
|
||||
for (auto&& f : fragments) {
|
||||
produces(f);
|
||||
}
|
||||
produces_end_of_stream();
|
||||
return *this;
|
||||
}
|
||||
|
||||
streamed_mutation_assertions& produces_row_with_key(const clustering_key& ck) {
|
||||
BOOST_TEST_MESSAGE(sprint("Expect %s", ck));
|
||||
auto mfo = _sm().get0();
|
||||
@@ -164,7 +195,8 @@ public:
|
||||
return *this;
|
||||
}
|
||||
|
||||
streamed_mutation_assertions& produces_range_tombstone(const range_tombstone& rt) {
|
||||
// If ck_ranges is passed, verifies only that information relevant for ck_ranges matches.
|
||||
streamed_mutation_assertions& produces_range_tombstone(const range_tombstone& rt, const query::clustering_row_ranges& ck_ranges = {}) {
|
||||
BOOST_TEST_MESSAGE(sprint("Expect %s", rt));
|
||||
auto mfo = _sm().get0();
|
||||
if (!mfo) {
|
||||
@@ -174,7 +206,18 @@ public:
|
||||
BOOST_FAIL(sprint("Expected range tombstone %s, but got %s", rt, *mfo));
|
||||
}
|
||||
auto& actual = mfo->as_range_tombstone();
|
||||
if (!actual.equal(*_sm.schema(), rt)) {
|
||||
const schema& s = *_sm.schema();
|
||||
if (!ck_ranges.empty()) {
|
||||
range_tombstone_list actual_list(s);
|
||||
range_tombstone_list expected_list(s);
|
||||
actual_list.apply(s, actual);
|
||||
expected_list.apply(s, rt);
|
||||
actual_list.trim(s, ck_ranges);
|
||||
expected_list.trim(s, ck_ranges);
|
||||
if (!actual_list.equal(s, expected_list)) {
|
||||
BOOST_FAIL(sprint("Expected %s, but got %s", expected_list, actual_list));
|
||||
}
|
||||
} else if (!actual.equal(s, rt)) {
|
||||
BOOST_FAIL(sprint("Expected range tombstone %s, but got %s", rt, actual));
|
||||
}
|
||||
return *this;
|
||||
|
||||
@@ -29,6 +29,11 @@
|
||||
class reader_assertions {
|
||||
mutation_reader _reader;
|
||||
dht::partition_range _pr;
|
||||
private:
|
||||
mutation_opt read_next() {
|
||||
auto smo = _reader().get0();
|
||||
return mutation_from_streamed_mutation(std::move(smo)).get0();
|
||||
}
public:
    reader_assertions(mutation_reader reader)
        : _reader(std::move(reader))
@@ -36,35 +41,28 @@ public:

    reader_assertions& produces(const dht::decorated_key& dk) {
        BOOST_TEST_MESSAGE(sprint("Expecting key %s", dk));
        _reader().then([&] (auto sm) {
            if (!sm) {
                BOOST_FAIL(sprint("Expected: %s, got end of stream", dk));
            }
            if (!sm->decorated_key().equal(*sm->schema(), dk)) {
                BOOST_FAIL(sprint("Expected: %s, got: %s", dk, sm->decorated_key()));
            }
        }).get0();
        auto mo = read_next();
        if (!mo) {
            BOOST_FAIL(sprint("Expected: %s, got end of stream", dk));
        }
        if (!mo->decorated_key().equal(*mo->schema(), dk)) {
            BOOST_FAIL(sprint("Expected: %s, got: %s", dk, mo->decorated_key()));
        }
        return *this;
    }

    reader_assertions& produces(mutation m) {
    reader_assertions& produces(mutation m, const query::clustering_row_ranges& ck_ranges = {}) {
        BOOST_TEST_MESSAGE(sprint("Expecting %s", m));
        _reader().then([] (auto sm) {
            return mutation_from_streamed_mutation(std::move(sm));
        }).then([this, m = std::move(m)] (mutation_opt&& mo) mutable {
            BOOST_REQUIRE(bool(mo));
            assert_that(*mo).is_equal_to(m);
        }).get0();
        auto mo = read_next();
        BOOST_REQUIRE(bool(mo));
        assert_that(*mo).is_equal_to(m, ck_ranges);
        return *this;
    }

    mutation_assertion next_mutation() {
        return _reader().then([] (auto sm) {
            return mutation_from_streamed_mutation(std::move(sm));
        }).then([] (mutation_opt&& mo) mutable {
            BOOST_REQUIRE(bool(mo));
            return mutation_assertion(std::move(*mo));
        }).get0();
        auto mo = read_next();
        BOOST_REQUIRE(bool(mo));
        return mutation_assertion(std::move(*mo));
    }

    template<typename RangeOfMutations>
@@ -77,20 +75,16 @@ public:

    reader_assertions& produces_end_of_stream() {
        BOOST_TEST_MESSAGE("Expecting end of stream");
        _reader().then([] (auto sm) {
            return mutation_from_streamed_mutation(std::move(sm));
        }).then([this] (mutation_opt&& mo) mutable {
            if (bool(mo)) {
                BOOST_FAIL(sprint("Expected end of stream, got %s", *mo));
            }
        }).get0();
        auto mo = read_next();
        if (bool(mo)) {
            BOOST_FAIL(sprint("Expected end of stream, got %s", *mo));
        }
        return *this;
    }

    reader_assertions& produces_eos_or_empty_mutation() {
        BOOST_TEST_MESSAGE("Expecting eos or empty mutation");
        auto sm = _reader().get0();
        mutation_opt mo = mutation_from_streamed_mutation(std::move(sm)).get0();
        auto mo = read_next();
        if (mo) {
            if (!mo->partition().empty()) {
                BOOST_FAIL(sprint("Mutation is not empty: %s", *mo));

@@ -90,7 +90,7 @@ static void test_streamed_mutation_forwarding_is_consistent_with_slicing(populat
        }

        mutation sliced_m = mutation_from_streamed_mutation(sliced_sm).get0();
        assert_that(sliced_m).is_equal_to(fwd_m);
        assert_that(sliced_m).is_equal_to(fwd_m, slice_with_ranges.row_ranges(*m.schema(), m.key()));
    }
}

@@ -295,9 +295,9 @@ static void test_streamed_mutation_slicing_returns_only_relevant_tombstones(popu
        auto sm = assert_that_stream(std::move(*smo));

        sm.produces_row_with_key(keys[2]);
        sm.produces_range_tombstone(rt3);
        sm.produces_range_tombstone(rt3, slice.row_ranges(*s, m.key()));
        sm.produces_row_with_key(keys[8]);
        sm.produces_range_tombstone(rt4);
        sm.produces_range_tombstone(rt4, slice.row_ranges(*s, m.key()));
        sm.produces_end_of_stream();
    }

@@ -314,9 +314,9 @@ static void test_streamed_mutation_slicing_returns_only_relevant_tombstones(popu
        streamed_mutation_opt smo = rd().get0();
        BOOST_REQUIRE(bool(smo));
        assert_that_stream(std::move(*smo))
            .produces_range_tombstone(rt3)
            .produces_range_tombstone(rt3, slice.row_ranges(*s, m.key()))
            .produces_row_with_key(keys[8])
            .produces_range_tombstone(rt4)
            .produces_range_tombstone(rt4, slice.row_ranges(*s, m.key()))
            .produces_end_of_stream();
    }
}
@@ -676,7 +676,7 @@ static void test_clustering_slices(populate_fn populate) {
            .with_range(query::clustering_range::make_singular(make_ck(2)))
            .build();
        assert_that(ds(s, pr, slice))
            .produces(row6 + row7 + del_1 + del_2)
            .produces(row6 + row7 + del_1 + del_2, slice.row_ranges(*s, pk.key()))
            .produces_end_of_stream();
    }

@@ -761,7 +761,6 @@ static mutation_sets generate_mutation_sets() {

    auto m1 = mutation(partition_key::from_single_value(*s1, to_bytes("key1")), s1);
    auto m2 = mutation(partition_key::from_single_value(*s2, to_bytes("key1")), s2);

    result.equal.emplace_back(mutations{m1, m2});

    clustering_key ck1 = clustering_key::from_deeply_exploded(*s1, {data_value(bytes("ck1_0")), data_value(bytes("ck1_1"))});
@@ -841,6 +840,14 @@ static mutation_sets generate_mutation_sets() {
        result.equal.emplace_back(mutations{m1, m2});
    }

    {
        m1.partition().ensure_last_dummy(*m1.schema());
        result.equal.emplace_back(mutations{m1, m2});

        m2.partition().ensure_last_dummy(*m2.schema());
        result.equal.emplace_back(mutations{m1, m2});
    }

    {
        auto ts = new_timestamp();
        m1.set_clustered_cell(ck2, "regular_col_1_s1", data_value(bytes("x")), ts);
@@ -934,6 +941,7 @@ class random_mutation_generator::impl {
    std::vector<bytes> _blobs;
    std::uniform_int_distribution<size_t> _ck_index_dist{0, n_blobs - 1};
    std::uniform_int_distribution<int> _bool_dist{0, 1};
    std::uniform_int_distribution<int> _not_dummy_dist{0, 19};

    template <typename Generator>
    static gc_clock::time_point expiry_dist(Generator& gen) {
@@ -1164,9 +1172,14 @@ public:
        size_t row_count = row_count_dist(_gen);
        for (size_t i = 0; i < row_count; ++i) {
            auto ckey = make_random_key();
            deletable_row& row = m.partition().clustered_row(*_schema, ckey);
            set_random_cells(row.cells(), column_kind::regular_column);
            row.marker() = random_row_marker();
            is_continuous continuous = is_continuous(_bool_dist(_gen));
            if (_not_dummy_dist(_gen)) {
                deletable_row& row = m.partition().clustered_row(*_schema, ckey, is_dummy::no, continuous);
                set_random_cells(row.cells(), column_kind::regular_column);
                row.marker() = random_row_marker();
            } else {
                m.partition().clustered_row(*_schema, ckey, is_dummy::yes, continuous);
            }
        }

        size_t range_tombstone_count = row_count_dist(_gen);
@@ -1180,6 +1193,12 @@ public:
            m.partition().apply_row_tombstone(*_schema,
                range_tombstone(std::move(start), std::move(end), random_tombstone()));
        }

        if (_bool_dist(_gen)) {
            m.partition().ensure_last_dummy(*_schema);
            m.partition().clustered_rows().rbegin()->set_continuous(is_continuous(_bool_dist(_gen)));
        }

        return m;
    }
};

@@ -48,6 +48,7 @@
#include "cell_locking.hh"

#include "disk-error-handler.hh"
#include "simple_schema.hh"

thread_local disk_error_signal_type commit_error;
thread_local disk_error_signal_type general_disk_error;
@@ -830,7 +831,8 @@ SEASTAR_TEST_CASE(test_apply_is_atomic_in_case_of_allocation_failures) {
            break; // we exhausted all allocation points
        } catch (const std::bad_alloc&) {
            BOOST_TEST_MESSAGE("Checking that apply was reverted");
            assert_that(m).is_equal_to(target);
            assert_that(m).is_equal_to(target)
                .has_same_continuity(target);
        }
    }
}
@@ -851,7 +853,8 @@ SEASTAR_TEST_CASE(test_apply_is_atomic_in_case_of_allocation_failures) {
            assert_that(m).is_equal_to(target);
            // they should still commute
            m.apply(copy_of_second);
            assert_that(m).is_equal_to(expected_apply_result);
            assert_that(m).is_equal_to(expected_apply_result)
                .has_same_continuity(expected_apply_result);
        }
    }
}
@@ -1513,3 +1516,193 @@ SEASTAR_TEST_CASE(test_mutation_diff_with_random_generator) {
        });
    });
}

SEASTAR_TEST_CASE(test_continuity_merging) {
    return seastar::async([] {
        simple_schema table;
        auto&& s = *table.schema();

        auto new_mutation = [&] {
            return mutation(table.make_pkey(0), table.schema());
        };

        {
            auto left = new_mutation();
            auto right = new_mutation();
            auto result = new_mutation();

            left.partition().clustered_row(s, table.make_ckey(0), is_dummy::no, is_continuous::yes);
            right.partition().clustered_row(s, table.make_ckey(0), is_dummy::no, is_continuous::no);
            result.partition().clustered_row(s, table.make_ckey(0), is_dummy::no, is_continuous::yes);

            left.partition().clustered_row(s, table.make_ckey(1), is_dummy::yes, is_continuous::yes);
            right.partition().clustered_row(s, table.make_ckey(2), is_dummy::yes, is_continuous::no);
            result.partition().clustered_row(s, table.make_ckey(1), is_dummy::yes, is_continuous::yes);
            result.partition().clustered_row(s, table.make_ckey(2), is_dummy::yes, is_continuous::no);

            left.partition().clustered_row(s, table.make_ckey(3), is_dummy::yes, is_continuous::yes);
            right.partition().clustered_row(s, table.make_ckey(3), is_dummy::no, is_continuous::no);
            result.partition().clustered_row(s, table.make_ckey(3), is_dummy::yes, is_continuous::yes);

            left.partition().clustered_row(s, table.make_ckey(4), is_dummy::no, is_continuous::no);
            right.partition().clustered_row(s, table.make_ckey(4), is_dummy::no, is_continuous::yes);
            result.partition().clustered_row(s, table.make_ckey(4), is_dummy::no, is_continuous::no);

            left.partition().clustered_row(s, table.make_ckey(5), is_dummy::no, is_continuous::no);
            right.partition().clustered_row(s, table.make_ckey(5), is_dummy::yes, is_continuous::yes);
            result.partition().clustered_row(s, table.make_ckey(5), is_dummy::no, is_continuous::no);

            left.partition().clustered_row(s, table.make_ckey(6), is_dummy::no, is_continuous::yes);
            right.partition().clustered_row(s, table.make_ckey(6), is_dummy::yes, is_continuous::no);
            result.partition().clustered_row(s, table.make_ckey(6), is_dummy::no, is_continuous::yes);

            left.partition().clustered_row(s, table.make_ckey(7), is_dummy::yes, is_continuous::yes);
            right.partition().clustered_row(s, table.make_ckey(7), is_dummy::yes, is_continuous::no);
            result.partition().clustered_row(s, table.make_ckey(7), is_dummy::yes, is_continuous::yes);

            left.partition().clustered_row(s, table.make_ckey(8), is_dummy::yes, is_continuous::no);
            right.partition().clustered_row(s, table.make_ckey(8), is_dummy::yes, is_continuous::yes);
            result.partition().clustered_row(s, table.make_ckey(8), is_dummy::yes, is_continuous::no);
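            // In every pairing above, the continuity flag of the left-hand
            // operand wins: left + right takes row continuity from left.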

            assert_that(left + right).has_same_continuity(result);
        }

        // static row continuity
        {
            auto complete = mutation(table.make_pkey(0), table.schema());
            auto incomplete = mutation(table.make_pkey(0), table.schema());
            incomplete.partition().set_static_row_continuous(false);

            assert_that(complete + complete).has_same_continuity(complete);
            assert_that(complete + incomplete).has_same_continuity(complete);
            assert_that(incomplete + complete).has_same_continuity(incomplete);
            assert_that(incomplete + incomplete).has_same_continuity(incomplete);
        }
    });

}

SEASTAR_TEST_CASE(test_apply_to_incomplete) {
    return seastar::async([] {
        simple_schema table;
        auto&& s = *table.schema();

        auto new_mutation = [&] {
            return mutation(table.make_pkey(0), table.schema());
        };

        auto mutation_with_row = [&] (clustering_key ck) {
            auto m = new_mutation();
            table.add_row(m, ck, "v");
            return m;
        };

        // FIXME: There is no assert_that() for mutation_partition
        auto assert_equal = [&] (mutation_partition mp1, mutation_partition mp2) {
            auto key = table.make_pkey(0);
            assert_that(mutation(table.schema(), key, std::move(mp1)))
                .is_equal_to(mutation(table.schema(), key, std::move(mp2)));
        };

        auto apply = [&] (partition_entry& e, const mutation& m) {
            e.apply_to_incomplete(s, partition_entry(m.partition()), s);
        };

        auto ck1 = table.make_ckey(1);
        auto ck2 = table.make_ckey(2);

        BOOST_TEST_MESSAGE("Check that insert falling into discontinuous range is dropped");
        {
            auto e = partition_entry(mutation_partition::make_incomplete(s));
            auto m = new_mutation();
            table.add_row(m, ck1, "v");
            apply(e, m);
            assert_equal(e.squashed(s), mutation_partition::make_incomplete(s));
        }

        BOOST_TEST_MESSAGE("Check that continuity from latest version wins");
        {
            auto m1 = mutation_with_row(ck2);
            auto e = partition_entry(m1.partition());

            auto snap1 = e.read(table.schema());

            auto m2 = mutation_with_row(ck2);
            apply(e, m2);

            partition_version* latest = &*e.version();
            partition_version* prev = latest->next();

            for (rows_entry& row : prev->partition().clustered_rows()) {
                row.set_continuous(is_continuous::no);
            }

            auto m3 = mutation_with_row(ck1);
            apply(e, m3);
            assert_equal(e.squashed(s), (m2 + m3).partition());

            // Check that snapshot data is not stolen when its entry is applied
            auto e2 = partition_entry(mutation_partition(table.schema()));
            e2.apply_to_incomplete(s, std::move(e), s);
            assert_equal(snap1->squashed(), m1.partition());
            assert_equal(e2.squashed(s), (m2 + m3).partition());
        }
    });

}

SEASTAR_TEST_CASE(test_schema_upgrade_preserves_continuity) {
    return seastar::async([] {
        simple_schema table;

        auto new_mutation = [&] {
            return mutation(table.make_pkey(0), table.schema());
        };

        auto mutation_with_row = [&] (clustering_key ck) {
            auto m = new_mutation();
            table.add_row(m, ck, "v");
            return m;
        };

        // FIXME: There is no assert_that() for mutation_partition
        auto assert_entry_equal = [&] (schema_ptr e_schema, partition_entry& e, mutation m) {
            auto key = table.make_pkey(0);
            assert_that(mutation(e_schema, key, e.squashed(*e_schema)))
                .is_equal_to(m)
                .has_same_continuity(m);
        };

        auto apply = [&] (schema_ptr e_schema, partition_entry& e, const mutation& m) {
            e.apply_to_incomplete(*e_schema, partition_entry(m.partition()), *m.schema());
        };

        auto m1 = mutation_with_row(table.make_ckey(1));
        m1.partition().clustered_rows().begin()->set_continuous(is_continuous::no);
        m1.partition().set_static_row_continuous(false);
        m1.partition().ensure_last_dummy(*m1.schema());

        auto e = partition_entry(m1.partition());
        auto rd1 = e.read(table.schema());

        auto m2 = mutation_with_row(table.make_ckey(3));
        m2.partition().ensure_last_dummy(*m2.schema());
        apply(table.schema(), e, m2);

        auto new_schema = schema_builder(table.schema()).with_column("__new_column", utf8_type).build();

        e.upgrade(table.schema(), new_schema);
        rd1 = {};

        assert_entry_equal(new_schema, e, m1 + m2);

        auto m3 = mutation_with_row(table.make_ckey(2));
        apply(new_schema, e, m3);

        auto m4 = mutation_with_row(table.make_ckey(0));
        table.add_static_row(m4, "s_val");
        apply(new_schema, e, m4);

        assert_entry_equal(new_schema, e, m1 + m2 + m3);
    });
}

@@ -73,7 +73,7 @@ int main(int argc, char** argv) {
        .build();

    cache_tracker tracker;
    row_cache cache(s, mutation_source([] (schema_ptr, auto&&) { return make_empty_reader(); }), tracker);
    row_cache cache(s, make_empty_snapshot_source(), tracker);

    size_t partitions = app.configuration()["partitions"].as<unsigned>();
    size_t cell_size = app.configuration()["cell-size"].as<unsigned>();

@@ -70,10 +70,8 @@ int main(int argc, char** argv) {
        .with_column("v", bytes_type, column_kind::regular_column)
        .build();

    auto mt0 = make_lw_shared<memtable>(s);

    cache_tracker tracker;
    row_cache cache(s, mt0->as_data_source(), tracker);
    row_cache cache(s, make_empty_snapshot_source(), tracker);

    auto mt = make_lw_shared<memtable>(s);
    std::vector<dht::decorated_key> keys;
@@ -134,16 +132,16 @@ int main(int argc, char** argv) {
    auto fill_cache_to_the_top = [&] {
        std::cout << "Filling up memory with evictable data\n";
        while (true) {
            auto evictions_before = tracker.get_stats().evictions;
            // Ensure that entries matching memtable partitions are evicted
            // last, we want to hit the merge path in row_cache::update()
            for (auto&& key : keys) {
                cache.touch(key);
            }
            auto occupancy_before = tracker.region().occupancy().used_space();
            auto m = make_small_mutation();
            cache_stuffing.push_back(m.decorated_key());
            cache.populate(m);
            if (tracker.region().occupancy().used_space() <= occupancy_before) {
            if (tracker.get_stats().evictions > evictions_before) {
                break;
            }
        }
366
tests/row_cache_stress_test.cc
Normal file
@@ -0,0 +1,366 @@
/*
 * Copyright (C) 2017 ScyllaDB
 */

/*
 * This file is part of Scylla.
 *
 * Scylla is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Scylla is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Scylla. If not, see <http://www.gnu.org/licenses/>.
 */

#include <boost/range/irange.hpp>
#include "seastarx.hh"
#include "tests/simple_schema.hh"
#include "core/app-template.hh"
#include "memtable.hh"
#include "row_cache.hh"
#include "partition_slice_builder.hh"
#include "utils/int_range.hh"
#include "utils/div_ceil.hh"
#include "tests/memtable_snapshot_source.hh"
#include <seastar/core/reactor.hh>

#include "disk-error-handler.hh"

logging::logger test_log("test");

thread_local disk_error_signal_type commit_error;
thread_local disk_error_signal_type general_disk_error;

static thread_local bool cancelled = false;

using namespace std::chrono_literals;

struct table {
    simple_schema s;
    std::vector<dht::decorated_key> p_keys;
    std::vector<api::timestamp_type> p_writetime; // committed writes
    std::vector<clustering_key> c_keys;
    uint64_t mutation_phase = 0;
    uint64_t mutations = 0;
    uint64_t reads_started = 0;
    uint64_t scans_started = 0;

    lw_shared_ptr<memtable> mt;
    lw_shared_ptr<memtable> prev_mt;
    memtable_snapshot_source underlying;
    row_cache cache;

    table(unsigned partitions, unsigned rows)
        : mt(make_lw_shared<memtable>(s.schema()))
        , underlying(s.schema())
        , cache(s.schema(), snapshot_source([this] { return underlying(); }), global_cache_tracker())
    {
        p_keys = s.make_pkeys(partitions);
        p_writetime.resize(p_keys.size());
        c_keys = s.make_ckeys(rows);
    }

    size_t index_of_key(const dht::decorated_key& dk) {
        for (auto i : boost::irange<size_t>(0, p_keys.size())) {
            if (p_keys[i].equal(*s.schema(), dk)) {
                return i;
            }
        }
        throw std::runtime_error(sprint("key not found: %s", dk));
    }

    sstring value_tag(int key, uint64_t phase) {
        return sprint("k_0x%x_p_0x%x", key, phase);
    }

    mutation get_mutation(int key, api::timestamp_type t, const sstring& tag) {
        mutation m(p_keys[key], s.schema());
        for (auto ck : c_keys) {
            s.add_row(m, ck, tag, t);
        }
        return m;
    }

    // Must not be called concurrently
    void flush() {
        test_log.trace("flushing");
        prev_mt = std::exchange(mt, make_lw_shared<memtable>(s.schema()));
        auto flushed = make_lw_shared<memtable>(s.schema());
        flushed->apply(*prev_mt).get();
        prev_mt->mark_flushed(flushed->as_data_source());
        underlying.apply(flushed);
        test_log.trace("updating cache");
        cache.update(*prev_mt, [] (const dht::decorated_key& dk) {
            return partition_presence_checker_result::maybe_exists;
        }).get();
        test_log.trace("flush done");
        prev_mt = {};
    }
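
    // Note on ordering: flush() publishes the flushed data to the underlying
    // snapshot source before updating the cache from the old memtable, and
    // make_reader() below keeps including prev_mt until the update completes,
    // so readers never observe a gap.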

    void mutate_next_phase() {
        test_log.trace("mutating, phase={}", mutation_phase);
        for (auto i : boost::irange<int>(0, p_keys.size())) {
            auto t = s.new_timestamp();
            auto tag = value_tag(i, mutation_phase);
            auto m = get_mutation(i, t, tag);
            mt->apply(std::move(m));
            p_writetime[i] = t;
            test_log.trace("updated key {}, {} @{}", i, tag, t);
            ++mutations;
            later().get();
        }
        test_log.trace("mutated whole ring");
        ++mutation_phase;
        // FIXME: mutate concurrently with flush
        flush();
    }
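
    // The later().get() call in the loop above yields to the reactor after
    // each partition update, letting concurrent readers and timers run while
    // a whole ring pass is in progress.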

    struct reader {
        dht::partition_range pr;
        query::partition_slice slice;
        mutation_reader rd;
    };

    std::unique_ptr<reader> make_reader(dht::partition_range pr, query::partition_slice slice) {
        test_log.trace("making reader, pk={} ck={}", pr, slice);
        auto r = std::make_unique<reader>(reader{std::move(pr), std::move(slice)});
        std::vector<mutation_reader> rd;
        if (prev_mt) {
            rd.push_back(prev_mt->make_reader(s.schema(), r->pr, r->slice));
        }
        rd.push_back(mt->make_reader(s.schema(), r->pr, r->slice));
        rd.push_back(cache.make_reader(s.schema(), r->pr, r->slice));
        r->rd = make_combined_reader(std::move(rd));
        return r;
    }

    std::unique_ptr<reader> make_single_key_reader(int pk, int_range ck_range) {
        ++reads_started;
        auto slice = partition_slice_builder(*s.schema())
            .with_range(ck_range.transform([this] (int key) { return c_keys[key]; }))
            .build();
        auto pr = dht::partition_range::make_singular(p_keys[pk]);
        return make_reader(std::move(pr), std::move(slice));
    }

    std::unique_ptr<reader> make_scanning_reader() {
        ++scans_started;
        return make_reader(query::full_partition_range, query::full_slice);
    }
};

struct reader_id {
    sstring name;

    friend std::ostream& operator<<(std::ostream& out, reader_id id) {
        return out << id.name;
    }
};
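
// Validates the cache's consistency guarantees: within a single partition a
// read must observe the values of exactly one write phase (never a mix of
// snapshots), and the observed write must not be older than the last write
// committed before the read started.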

class validating_consumer {
    table& _t;
    reader_id _id;
    stdx::optional<sstring> _value;
    size_t _row_count = 0;
    size_t _key = 0;
    std::vector<api::timestamp_type> _writetimes;
public:
    validating_consumer(table& t, reader_id id)
        : _t(t)
        , _id(id)
        , _writetimes(t.p_writetime)
    { }

    void consume_new_partition(const dht::decorated_key& key) {
        test_log.trace("reader {}: enters partition {}", _id, key);
        _value = {};
        _key = _t.index_of_key(key);
    }

    stop_iteration consume_end_of_partition() { return stop_iteration::no; }
    stop_iteration consume(tombstone) { return stop_iteration::no; }
    stop_iteration consume(const static_row&) { return stop_iteration::no; }
    stop_iteration consume(const range_tombstone&) { return stop_iteration::no; }

    stop_iteration consume(const clustering_row& row) {
        ++_row_count;
        sstring value;
        api::timestamp_type t;
        std::tie(value, t) = _t.s.get_value(row);
        test_log.trace("reader {}: {} @{}, {}", _id, value, t, row);
        if (_value && value != _value) {
            throw std::runtime_error(sprint("Saw values from two different writes in partition %d: %s and %s", _key, _value, value));
        }
        auto lowest_timestamp = _writetimes[_key];
        if (t < lowest_timestamp) {
            throw std::runtime_error(sprint("Expected to see the write @%d, but saw @%d (%s), c_key=%s", lowest_timestamp, t, value, row.key()));
        }
        _value = std::move(value);
        return stop_iteration::no;
    }

    size_t consume_end_of_stream() {
        test_log.trace("reader {}: done, {} rows", _id, _row_count);
        return _row_count;
    }
};

template<typename T>
class monotonic_counter {
    std::function<T()> _getter;
    T _prev;
public:
    monotonic_counter(std::function<T()> getter)
        : _getter(std::move(getter)) {
        _prev = _getter();
    }
    // Returns the change in value since the last call to change().
    auto change() {
        auto now = _getter();
        return now - std::exchange(_prev, now);
    }
};
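
// Used below by the stats printer to turn the cumulative counters in `table`
// (reads_started, scans_started, ...) into per-second deltas.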

int main(int argc, char** argv) {
    namespace bpo = boost::program_options;
    app_template app;
    app.add_options()
        ("trace", "Enables trace-level logging for the test actions")
        ("concurrency", bpo::value<unsigned>()->default_value(10), "Number of concurrent single partition readers")
        ("scan-concurrency", bpo::value<unsigned>()->default_value(2), "Number of concurrent ring scanners")
        ("partitions", bpo::value<unsigned>()->default_value(10), "Number of partitions")
        ("rows", bpo::value<unsigned>()->default_value(10000), "Number of rows in each partition")
        ("seconds", bpo::value<unsigned>()->default_value(600), "Duration [s] after which the test terminates with a success")
        ;

    return app.run(argc, argv, [&app] {
        if (app.configuration().count("trace")) {
            test_log.set_level(seastar::log_level::trace);
        }

        return seastar::async([&app] {
            auto concurrency = app.configuration()["concurrency"].as<unsigned>();
            auto scan_concurrency = app.configuration()["scan-concurrency"].as<unsigned>();
            auto partitions = app.configuration()["partitions"].as<unsigned>();
            auto rows = app.configuration()["rows"].as<unsigned>();
            auto seconds = app.configuration()["seconds"].as<unsigned>();

            table t(partitions, rows);

            engine().at_exit([] {
                cancelled = true;
                return make_ready_future();
            });

            timer<> completion_timer;
            completion_timer.set_callback([&] {
                test_log.info("Test done.");
                cancelled = true;
            });
            completion_timer.arm(std::chrono::seconds(seconds));

            auto fail = [&] (sstring msg) {
                test_log.error("{}", msg);
                cancelled = true;
                completion_timer.cancel();
            };

            // Stats printer
            timer<> stats_printer;
            monotonic_counter<uint64_t> reads([&] { return t.reads_started; });
            monotonic_counter<uint64_t> scans([&] { return t.scans_started; });
            monotonic_counter<uint64_t> mutations([&] { return t.mutations; });
            monotonic_counter<uint64_t> flushes([&] { return t.mutation_phase; });
            stats_printer.set_callback([&] {
                auto MB = 1024 * 1024;
                test_log.info("reads/s: {}, scans/s: {}, mutations/s: {}, flushes/s: {}, Cache: {}/{} [MB], LSA: {}/{} [MB], std free: {} [MB]",
                    reads.change(), scans.change(), mutations.change(), flushes.change(),
                    global_cache_tracker().region().occupancy().used_space() / MB,
                    global_cache_tracker().region().occupancy().total_space() / MB,
                    logalloc::shard_tracker().region_occupancy().used_space() / MB,
                    logalloc::shard_tracker().region_occupancy().total_space() / MB,
                    seastar::memory::stats().free_memory() / MB);
            });
            stats_printer.arm_periodic(1s);

            auto single_partition_reader = [&] (int i, reader_id id) {
                auto n_keys = t.c_keys.size();

                // Assign ranges so that there is ~30% overlap between adjacent readers.
                auto len = div_ceil(n_keys, concurrency);
                len = std::min(n_keys, len + div_ceil(len, 3)); // so that read ranges overlap
                auto start = (n_keys - len) * i / (std::max(concurrency - 1, 1u));
                int_range ck_range = make_int_range(start, start + len);
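                // Worked example: with the defaults (rows = 10000, concurrency = 10),
                // len = 1000 + 334 = 1334 and the starts spread over [0, 8666], so
                // adjacent windows overlap by about 370 keys, roughly 30% of a window.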

                int pk = t.p_keys.size() / 2; // FIXME: spread over 3 consecutive partitions
                test_log.info("{} is using pk={} ck={}", id, pk, ck_range);
                while (!cancelled) {
                    test_log.trace("{}: starting read", id);
                    auto rd = t.make_single_key_reader(pk, ck_range);
                    auto row_count = consume_flattened(std::move(rd->rd), validating_consumer(t, id)).get0();
                    if (row_count != len) {
                        throw std::runtime_error(sprint("Expected %d fragments, got %d", len, row_count));
                    }
                }
            };

            auto scanning_reader = [&] (reader_id id) {
                auto expected_row_count = t.p_keys.size() * t.c_keys.size();
                while (!cancelled) {
                    test_log.trace("{}: starting read", id);
                    auto rd = t.make_scanning_reader();
                    auto row_count = consume_flattened(std::move(rd->rd), validating_consumer(t, id)).get0();
                    if (row_count != expected_row_count) {
                        throw std::runtime_error(sprint("Expected %d fragments, got %d", expected_row_count, row_count));
                    }
                }
            };

            // Populate the initial phase; readers expect a constant fragment count.
            t.mutate_next_phase();

            auto readers = parallel_for_each(boost::irange(0u, concurrency), [&] (auto i) {
                reader_id id{sprint("single-%d", i)};
                return seastar::async([&, i, id] {
                    single_partition_reader(i, id);
                }).handle_exception([&, id] (auto e) {
                    fail(sprint("%s failed: %s", id, e));
                });
            });

            auto scanning_readers = parallel_for_each(boost::irange(0u, scan_concurrency), [&] (auto i) {
                reader_id id{sprint("scan-%d", i)};
                return seastar::async([&, id] {
                    scanning_reader(id);
                }).handle_exception([&, id] (auto e) {
                    fail(sprint("%s failed: %s", id, e));
                });
            });

            timer<> evictor;
            evictor.set_callback([&] {
                test_log.trace("evicting");
                t.cache.evict();
            });
            evictor.arm_periodic(3s);

            // Mutator
            while (!cancelled) {
                t.mutate_next_phase();
            }

            stats_printer.cancel();
            completion_timer.cancel();
            evictor.cancel();
            readers.get();
            scanning_readers.get();
        });
    });
}
File diff suppressed because it is too large
@@ -28,15 +28,18 @@
#include "keys.hh"
#include "streamed_mutation.hh"
#include "mutation.hh"
#include "schema_builder.hh"
#include "streamed_mutation.hh"

// Helper for working with the following table:
//
// CREATE TABLE ks.cf (pk utf8, ck utf8, v utf8, s1 utf8 static, PRIMARY KEY (pk, ck));
// CREATE TABLE ks.cf (pk text, ck text, v text, s1 text static, PRIMARY KEY (pk, ck));
//
class simple_schema {
    schema_ptr _s;
    api::timestamp_type _timestamp = api::min_timestamp;
private:
    const column_definition& _v_def;
public:
    api::timestamp_type new_timestamp() {
        return _timestamp++;
    }
@@ -48,6 +51,7 @@ public:
            .with_column("s1", utf8_type, column_kind::static_column)
            .with_column("v", utf8_type)
            .build())
        , _v_def(*_s->get_column_definition(to_bytes("v")))
    { }

    clustering_key make_ckey(sstring ck) {
@@ -70,8 +74,23 @@ public:
        return dht::global_partitioner().decorate_key(*_s, key);
    }

    void add_row(mutation& m, const clustering_key& key, sstring v) {
        m.set_clustered_cell(key, to_bytes("v"), data_value(v), new_timestamp());
    void add_row(mutation& m, const clustering_key& key, const sstring& v, api::timestamp_type t = api::missing_timestamp) {
        if (t == api::missing_timestamp) {
            t = new_timestamp();
        }
        m.set_clustered_cell(key, _v_def, atomic_cell::make_live(t, data_value(v).serialize()));
    }
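
    // A caller can pass an explicit timestamp to tag writes from a particular
    // phase (as row_cache_stress_test does); by default each row gets a fresh
    // timestamp from new_timestamp().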

    std::pair<sstring, api::timestamp_type> get_value(const clustering_row& row) {
        auto cell = row.cells().find_cell(_v_def.id);
        if (!cell) {
            throw std::runtime_error("cell not found");
        }
        atomic_cell_view ac = cell->as_atomic_cell();
        if (!ac.is_live()) {
            throw std::runtime_error("cell is dead");
        }
        return std::make_pair(value_cast<sstring>(utf8_type->deserialize(ac.value())), ac.timestamp());
    }

    mutation_fragment make_row(const clustering_key& key, sstring v) {
@@ -91,9 +110,12 @@ public:
        return rt;
    }

    range_tombstone make_range_tombstone(const query::clustering_range& range) {
    range_tombstone make_range_tombstone(const query::clustering_range& range, tombstone t = {}) {
        auto bv_range = bound_view::from_range(range);
        range_tombstone rt(bv_range.first, bv_range.second, tombstone(new_timestamp(), gc_clock::now()));
        if (!t) {
            t = tombstone(new_timestamp(), gc_clock::now());
        }
        range_tombstone rt(bv_range.first, bv_range.second, t);
        return rt;
    }

@@ -114,4 +136,13 @@ public:
        std::sort(keys.begin(), keys.end(), dht::decorated_key::less_comparator(_s));
        return keys;
    }

    // Returns n clustering keys in their natural order
    std::vector<clustering_key> make_ckeys(int n) {
        std::vector<clustering_key> keys;
        for (int i = 0; i < n; ++i) {
            keys.push_back(make_ckey(i));
        }
        return keys;
    }
};

@@ -29,8 +29,10 @@
#include "tests/test_services.hh"
#include "schema_builder.hh"
#include "total_order_check.hh"
#include "schema_upgrader.hh"

#include "disk-error-handler.hh"
#include "mutation_assertions.hh"

thread_local disk_error_signal_type commit_error;
thread_local disk_error_signal_type general_disk_error;
@@ -241,7 +243,7 @@ SEASTAR_TEST_CASE(test_fragmenting_and_freezing_streamed_mutations) {
            return make_ready_future<>();
        }, 1).get0();

        auto expected_fragments = m.partition().clustered_rows().calculate_size()
        auto expected_fragments = boost::size(m.partition().non_dummy_rows())
            + m.partition().row_tombstones().size()
            + !m.partition().static_row().empty();
        BOOST_REQUIRE_EQUAL(fms.size(), std::max(expected_fragments, size_t(1)));
@@ -538,3 +540,21 @@ SEASTAR_TEST_CASE(test_ordering_of_position_in_partition_and_composite_view_in_a
            .check();
    });
}

SEASTAR_TEST_CASE(test_schema_upgrader_is_equivalent_with_mutation_upgrade) {
    return seastar::async([] {
        for_each_mutation_pair([](const mutation& m1, const mutation& m2, are_equal eq) {
            if (m1.schema()->version() != m2.schema()->version()) {
                // upgrade m1 to m2's schema

                auto from_upgrader = mutation_from_streamed_mutation(
                    transform(streamed_mutation_from_mutation(m1), schema_upgrader(m2.schema()))).get0();

                auto regular = m1;
                regular.upgrade(m2.schema());

                assert_that(from_upgrader).has_mutation().is_equal_to(regular);
            }
        });
    });
}

55
utils/int_range.hh
Normal file
@@ -0,0 +1,55 @@
/*
 * Copyright (C) 2017 ScyllaDB
 */

/*
 * This file is part of Scylla.
 *
 * Scylla is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Scylla is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Scylla. If not, see <http://www.gnu.org/licenses/>.
 */

#pragma once

#include "range.hh"
#include <seastar/core/print.hh>

using int_range = nonwrapping_range<int>;

inline
unsigned cardinality(const int_range& r) {
    assert(r.start());
    assert(r.end());
    return r.end()->value() - r.start()->value() + r.start()->is_inclusive() + r.end()->is_inclusive() - 1;
}
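
// Worked example: cardinality of [2, 5] with both bounds inclusive is
// 5 - 2 + 1 + 1 - 1 = 4, while [2, 5) gives 5 - 2 + 1 + 0 - 1 = 3.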

inline
unsigned cardinality(const stdx::optional<int_range>& ropt) {
    return ropt ? cardinality(*ropt) : 0;
}

inline
stdx::optional<int_range> intersection(const int_range& a, const int_range& b) {
    auto int_tri_cmp = [] (int x, int y) {
        return x < y ? -1 : (x > y ? 1 : 0);
    };
    return a.intersection(b, int_tri_cmp);
}
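
// For example, intersection([0, 5], [3, 9]) yields [3, 5], and disjoint
// ranges yield an empty optional.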

inline
int_range make_int_range(int start_inclusive, int end_exclusive) {
    if (end_exclusive <= start_inclusive) {
        throw std::runtime_error(sprint("invalid range: [%d, %d)", start_inclusive, end_exclusive));
    }
    return int_range({start_inclusive}, {end_exclusive - 1});
}
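
// For example, make_int_range(2, 5) yields the inclusive range [2, 4], whose
// cardinality is 3; an empty input such as make_int_range(5, 5) throws.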