scylladb/mutation_partition.hh

/*
 * Copyright (C) 2014 ScyllaDB
 */

/*
 * This file is part of Scylla.
 *
 * Scylla is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Scylla is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
 */

#pragma once

#include <iosfwd>
#include <map>
#include <boost/intrusive/set.hpp>
#include <boost/range/iterator_range.hpp>
#include <boost/range/adaptor/indexed.hpp>
#include <boost/range/adaptor/filtered.hpp>
#include <boost/intrusive/parent_from_member.hpp>

#include <seastar/core/bitset-iter.hh>
#include <seastar/util/optimized_optional.hh>

#include "schema.hh"
#include "tombstone.hh"
#include "keys.hh"
#include "position_in_partition.hh"
#include "atomic_cell_or_collection.hh"
#include "query-result.hh"
#include "mutation_partition_view.hh"
#include "mutation_partition_visitor.hh"
#include "utils/managed_vector.hh"
#include "hashing_partition_visitor.hh"
#include "range_tombstone_list.hh"
#include "clustering_key_filter.hh"
#include "intrusive_set_external_comparator.hh"
#include "utils/with_relational_operators.hh"
#include "utils/preempt.hh"

class mutation_fragment;
class clustering_row;

struct cell_hash {
    using size_type = uint64_t;
    static constexpr size_type no_hash = 0;

    size_type hash = no_hash;

    explicit operator bool() const noexcept {
        return hash != no_hash;
    }
};

template<>
struct appending_hash<cell_hash> {
    template<typename Hasher>
    void operator()(Hasher& h, const cell_hash& ch) const {
        feed_hash(h, ch.hash);
    }
};

using cell_hash_opt = seastar::optimized_optional<cell_hash>;

struct cell_and_hash {
    atomic_cell_or_collection cell;
    mutable cell_hash_opt hash;

    cell_and_hash() = default;
    cell_and_hash(cell_and_hash&&) noexcept = default;
    cell_and_hash& operator=(cell_and_hash&&) noexcept = default;

    cell_and_hash(atomic_cell_or_collection&& cell, cell_hash_opt hash)
        : cell(std::move(cell))
        , hash(hash)
    { }
};

class compaction_garbage_collector;

//
// Container for cells of a row. Cells are identified by column_id.
//
// All cells must belong to a single column_kind. The kind is not stored
// for space-efficiency reasons. Whenever a method accepts a column_kind,
// the caller must always supply the same column_kind.
//
// Can be used as a range of row::cell_entry.
//
class row {

    class cell_entry {
        boost::intrusive::set_member_hook<> _link;
        column_id _id;
        cell_and_hash _cell_and_hash;
        friend class row;
    public:
        cell_entry(column_id id, cell_and_hash c_a_h)
            : _id(id)
            , _cell_and_hash(std::move(c_a_h))
        { }
        cell_entry(column_id id, atomic_cell_or_collection cell)
            : cell_entry(id, cell_and_hash{std::move(cell), cell_hash_opt()})
        { }
        cell_entry(column_id id)
            : _id(id)
        { }
        cell_entry(cell_entry&&) noexcept;
        cell_entry(const abstract_type&, const cell_entry&);

        column_id id() const { return _id; }
        const atomic_cell_or_collection& cell() const { return _cell_and_hash.cell; }
        atomic_cell_or_collection& cell() { return _cell_and_hash.cell; }
        const cell_hash_opt& hash() const { return _cell_and_hash.hash; }
        const cell_and_hash& get_cell_and_hash() const { return _cell_and_hash; }
        cell_and_hash& get_cell_and_hash() { return _cell_and_hash; }

        struct compare {
            bool operator()(const cell_entry& e1, const cell_entry& e2) const {
                return e1._id < e2._id;
            }
            bool operator()(column_id id1, const cell_entry& e2) const {
                return id1 < e2._id;
            }
            bool operator()(const cell_entry& e1, column_id id2) const {
                return e1._id < id2;
            }
        };
    };

    using size_type = std::make_unsigned_t<column_id>;

    enum class storage_type {
        vector,
        set,
    };
    storage_type _type = storage_type::vector;
    size_type _size = 0;

    using map_type = boost::intrusive::set<cell_entry,
        boost::intrusive::member_hook<cell_entry, boost::intrusive::set_member_hook<>, &cell_entry::_link>,
        boost::intrusive::compare<cell_entry::compare>, boost::intrusive::constant_time_size<false>>;
public:
    static constexpr size_t max_vector_size = 32;
    static constexpr size_t internal_count = 5;
private:
    using vector_type = managed_vector<cell_and_hash, internal_count, size_type>;

    struct vector_storage {
        std::bitset<max_vector_size> present;
        vector_type v;

        vector_storage() = default;
        vector_storage(const vector_storage&) = default;
        vector_storage(vector_storage&& other) noexcept
                : present(other.present)
                , v(std::move(other.v)) {
            other.present = {};
        }
    };

    union storage {
        storage() { }
        ~storage() { }
        map_type set;
        vector_storage vector;
    } _storage;
public:
    row();
    ~row();
    row(const schema&, column_kind, const row&);
    row(row&& other) noexcept;
    row& operator=(row&& other) noexcept;
    size_t size() const { return _size; }
    bool empty() const { return _size == 0; }

    void reserve(column_id);

    const atomic_cell_or_collection& cell_at(column_id id) const;

    // Returns a pointer to cell's value or nullptr if column is not set.
    const atomic_cell_or_collection* find_cell(column_id id) const;
    // Returns a pointer to cell's value and hash or nullptr if column is not set.
    const cell_and_hash* find_cell_and_hash(column_id id) const;
private:
    template<typename Func>
    void remove_if(Func&& func) {
        if (_type == storage_type::vector) {
            for (unsigned i = 0; i < _storage.vector.v.size(); i++) {
                if (!_storage.vector.present.test(i)) {
                    continue;
                }
                auto& c = _storage.vector.v[i].cell;
                if (func(i, c)) {
                    c = atomic_cell_or_collection();
                    _storage.vector.present.reset(i);
                    _size--;
                }
            }
        } else {
            for (auto it = _storage.set.begin(); it != _storage.set.end();) {
                if (func(it->id(), it->cell())) {
                    auto& entry = *it;
                    it = _storage.set.erase(it);
                    current_allocator().destroy(&entry);
                    _size--;
                } else {
                    ++it;
                }
            }
        }
    }

private:
    auto get_range_vector() const {
        auto id_range = boost::irange<column_id>(0, _storage.vector.v.size());
        return boost::combine(id_range, _storage.vector.v)
        | boost::adaptors::filtered([this] (const boost::tuple<const column_id&, const cell_and_hash&>& t) {
            return _storage.vector.present.test(t.get<0>());
        }) | boost::adaptors::transformed([] (const boost::tuple<const column_id&, const cell_and_hash&>& t) {
            return std::pair<column_id, const atomic_cell_or_collection&>(t.get<0>(), t.get<1>().cell);
        });
    }
    auto get_range_set() const {
        auto range = boost::make_iterator_range(_storage.set.begin(), _storage.set.end());
        return range | boost::adaptors::transformed([] (const cell_entry& c) {
            return std::pair<column_id, const atomic_cell_or_collection&>(c.id(), c.cell());
        });
    }
    template<typename Func>
    auto with_both_ranges(const row& other, Func&& func) const;

    void vector_to_set();

    template<typename Func>
    void consume_with(Func&&);

    // Func obeys the same requirements as for for_each_cell below.
    template<typename Func, typename MaybeConstCellAndHash>
    static constexpr auto maybe_invoke_with_hash(Func& func, column_id id, MaybeConstCellAndHash& c_a_h) {
        if constexpr (std::is_invocable_v<Func, column_id, const cell_and_hash&>) {
            return func(id, c_a_h);
        } else {
            return func(id, c_a_h.cell);
        }
    }

public:
    // Calls Func(column_id, cell_and_hash&) or Func(column_id, atomic_cell_and_collection&)
    // for each cell in this row, depending on the concrete Func type.
    // noexcept if Func doesn't throw.
    template<typename Func>
    void for_each_cell(Func&& func) {
        if (_type == storage_type::vector) {
            for (auto i : bitsets::for_each_set(_storage.vector.present)) {
                maybe_invoke_with_hash(func, i, _storage.vector.v[i]);
            }
        } else {
            for (auto& cell : _storage.set) {
                maybe_invoke_with_hash(func, cell.id(), cell.get_cell_and_hash());
            }
        }
    }

    template<typename Func>
    void for_each_cell(Func&& func) const {
        if (_type == storage_type::vector) {
            for (auto i : bitsets::for_each_set(_storage.vector.present)) {
                maybe_invoke_with_hash(func, i, _storage.vector.v[i]);
            }
        } else {
            for (auto& cell : _storage.set) {
                maybe_invoke_with_hash(func, cell.id(), cell.get_cell_and_hash());
            }
        }
    }

    template<typename Func>
    void for_each_cell_until(Func&& func) const {
        if (_type == storage_type::vector) {
            for (auto i : bitsets::for_each_set(_storage.vector.present)) {
                if (maybe_invoke_with_hash(func, i, _storage.vector.v[i]) == stop_iteration::yes) {
                    break;
                }
            }
        } else {
            for (auto& cell : _storage.set) {
                if (maybe_invoke_with_hash(func, cell.id(), cell.get_cell_and_hash()) == stop_iteration::yes) {
                    break;
                }
            }
        }
    }

    // Merges cell's value into the row.
    // Weak exception guarantees.
    void apply(const column_definition& column, const atomic_cell_or_collection& cell, cell_hash_opt hash = cell_hash_opt());

    // Merges cell's value into the row.
    // Weak exception guarantees.
    void apply(const column_definition& column, atomic_cell_or_collection&& cell, cell_hash_opt hash = cell_hash_opt());

    // Monotonic exception guarantees. In case of exception the sum of cell and this remains the same as before the exception.
    void apply_monotonically(const column_definition& column, atomic_cell_or_collection&& cell, cell_hash_opt hash = cell_hash_opt());

    // Adds cell to the row. The column must not be already set.
    void append_cell(column_id id, atomic_cell_or_collection cell);

    // Weak exception guarantees
    void apply(const schema&, column_kind, const row& src);
    // Weak exception guarantees
    void apply(const schema&, column_kind, row&& src);
    // Monotonic exception guarantees
    void apply_monotonically(const schema&, column_kind, row&& src);

    // Expires cells based on query_time. Expires tombstones based on gc_before
    // and max_purgeable. Removes cells covered by tomb.
    // Returns true iff there are any live cells left.
    bool compact_and_expire(
            const schema& s,
            column_kind kind,
            row_tombstone tomb,
            gc_clock::time_point query_time,
            can_gc_fn&,
            gc_clock::time_point gc_before,
            const row_marker& marker,
            compaction_garbage_collector* collector = nullptr);

    bool compact_and_expire(
            const schema& s,
            column_kind kind,
            row_tombstone tomb,
            gc_clock::time_point query_time,
            can_gc_fn&,
            gc_clock::time_point gc_before,
            compaction_garbage_collector* collector = nullptr);

    row difference(const schema&, column_kind, const row& other) const;

    bool equal(column_kind kind, const schema& this_schema, const row& other, const schema& other_schema) const;

    size_t external_memory_usage(const schema&, column_kind) const;

    cell_hash_opt cell_hash_for(column_id id) const;

    void prepare_hash(const schema& s, column_kind kind) const;
    void clear_hash() const;

    bool is_live(const schema&, column_kind kind, tombstone tomb = tombstone(), gc_clock::time_point now = gc_clock::time_point::min()) const;

    class printer {
        const schema& _schema;
        column_kind _kind;
        const row& _row;
    public:
        printer(const schema& s, column_kind k, const row& r) : _schema(s), _kind(k), _row(r) { }
        printer(const printer&) = delete;
        printer(printer&&) = delete;

        friend std::ostream& operator<<(std::ostream& os, const printer& p);
    };
    friend std::ostream& operator<<(std::ostream& os, const printer& p);
};

class row_marker;
int compare_row_marker_for_merge(const row_marker& left, const row_marker& right) noexcept;

class row_marker {
    static constexpr gc_clock::duration no_ttl { 0 };
    static constexpr gc_clock::duration dead { -1 };
    static constexpr gc_clock::time_point no_expiry { gc_clock::duration(0) };
    api::timestamp_type _timestamp = api::missing_timestamp;
    gc_clock::duration _ttl = no_ttl;
    gc_clock::time_point _expiry = no_expiry;
public:
    row_marker() = default;
    explicit row_marker(api::timestamp_type created_at) : _timestamp(created_at) { }
    row_marker(api::timestamp_type created_at, gc_clock::duration ttl, gc_clock::time_point expiry)
        : _timestamp(created_at), _ttl(ttl), _expiry(expiry)
    { }
    explicit row_marker(tombstone deleted_at)
        : _timestamp(deleted_at.timestamp), _ttl(dead), _expiry(deleted_at.deletion_time)
    { }
    bool is_missing() const {
        return _timestamp == api::missing_timestamp;
    }
    bool is_live() const {
        return !is_missing() && _ttl != dead;
    }
    bool is_live(tombstone t, gc_clock::time_point now) const {
        if (is_missing() || _ttl == dead) {
            return false;
        }
        if (_ttl != no_ttl && _expiry < now) {
            return false;
        }
        return _timestamp > t.timestamp;
    }
    // Can be called only when !is_missing().
    bool is_dead(gc_clock::time_point now) const {
        if (_ttl == dead) {
            return true;
        }
        return _ttl != no_ttl && _expiry < now;
    }
    // Can be called only when is_live().
    bool is_expiring() const {
        return _ttl != no_ttl;
    }
    // Can be called only when is_expiring().
    gc_clock::duration ttl() const {
        return _ttl;
    }
    // Can be called only when is_expiring().
    gc_clock::time_point expiry() const {
        return _expiry;
    }
    // Should be called when is_dead() or is_expiring().
    // Safe to be called when is_missing().
    // When is_expiring(), returns the the deletion time of the marker when it finally expires.
    gc_clock::time_point deletion_time() const {
        return _ttl == dead ? _expiry : _expiry - _ttl;
    }
    api::timestamp_type timestamp() const {
        return _timestamp;
    }
    void apply(const row_marker& rm) {
        if (compare_row_marker_for_merge(*this, rm) < 0) {
            *this = rm;
        }
    }
    // Expires cells and tombstones. Removes items covered by higher level
    // tombstones.
    // Returns true if row marker is live.
    bool compact_and_expire(tombstone tomb, gc_clock::time_point now,
            can_gc_fn& can_gc, gc_clock::time_point gc_before, compaction_garbage_collector* collector = nullptr);
    // Consistent with feed_hash()
    bool operator==(const row_marker& other) const {
        if (_timestamp != other._timestamp) {
            return false;
        }
        if (is_missing()) {
            return true;
        }
        if (_ttl != other._ttl) {
            return false;
        }
        return _ttl == no_ttl || _expiry == other._expiry;
    }
    bool operator!=(const row_marker& other) const {
        return !(*this == other);
    }
    // Consistent with operator==()
    template<typename Hasher>
    void feed_hash(Hasher& h) const {
        ::feed_hash(h, _timestamp);
        if (!is_missing()) {
            ::feed_hash(h, _ttl);
            if (_ttl != no_ttl) {
                ::feed_hash(h, _expiry);
            }
        }
    }
    friend std::ostream& operator<<(std::ostream& os, const row_marker& rm);
};

template<>
struct appending_hash<row_marker> {
    template<typename Hasher>
    void operator()(Hasher& h, const row_marker& m) const {
        m.feed_hash(h);
    }
};

class clustering_row;

class shadowable_tombstone : public with_relational_operators<shadowable_tombstone> {
    tombstone _tomb;
public:

    explicit shadowable_tombstone(api::timestamp_type timestamp, gc_clock::time_point deletion_time)
            : _tomb(timestamp, deletion_time) {
    }

    explicit shadowable_tombstone(tombstone tomb = tombstone())
            : _tomb(std::move(tomb)) {
    }

    int compare(const shadowable_tombstone& t) const {
        return _tomb.compare(t._tomb);
    }

    explicit operator bool() const {
        return bool(_tomb);
    }

    const tombstone& tomb() const {
        return _tomb;
    }

    // A shadowable row tombstone is valid only if the row has no live marker. In other words,
    // the row tombstone is only valid as long as no newer insert is done (thus setting a
    // live row marker; note that if the row timestamp set is lower than the tombstone's,
    // then the tombstone remains in effect as usual). If a row has a shadowable tombstone
    // with timestamp Ti and that row is updated with a timestamp Tj, such that Tj > Ti
    // (and that update sets the row marker), then the shadowable tombstone is shadowed by
    // that update. A concrete consequence is that if the update has cells with timestamp
    // lower than Ti, then those cells are preserved (since the deletion is removed), and
    // this is contrary to a regular, non-shadowable row tombstone where the tombstone is
    // preserved and such cells are removed.
    bool is_shadowed_by(const row_marker& marker) const {
        return marker.is_live() && marker.timestamp() > _tomb.timestamp;
    }

    void maybe_shadow(tombstone t, row_marker marker) noexcept {
        if (is_shadowed_by(marker)) {
            _tomb = std::move(t);
        }
    }

    void apply(tombstone t) noexcept {
        _tomb.apply(t);
    }

    void apply(shadowable_tombstone t) noexcept {
        _tomb.apply(t._tomb);
    }

    friend std::ostream& operator<<(std::ostream& out, const shadowable_tombstone& t) {
        if (t) {
            return out << "{shadowable tombstone: timestamp=" << t.tomb().timestamp
                   << ", deletion_time=" << t.tomb().deletion_time.time_since_epoch().count()
                   << "}";
        } else {
            return out << "{shadowable tombstone: none}";
        }
    }
};

template<>
struct appending_hash<shadowable_tombstone> {
    template<typename Hasher>
    void operator()(Hasher& h, const shadowable_tombstone& t) const {
        feed_hash(h, t.tomb());
    }
};

/*
The rules for row_tombstones are as follows:
  - The shadowable tombstone is always >= than the regular one;
  - The regular tombstone works as expected;
  - The shadowable tombstone doesn't erase or compact away the regular
    row tombstone, nor dead cells;
  - The shadowable tombstone can erase live cells, but only provided they
    can be recovered (e.g., by including all cells in a MV update, both
    updated cells and pre-existing ones);
  - The shadowable tombstone can be erased or compacted away by a newer
    row marker.
*/
class row_tombstone : public with_relational_operators<row_tombstone> {
    tombstone _regular;
    shadowable_tombstone _shadowable; // _shadowable is always >= _regular
public:
    explicit row_tombstone(tombstone regular, shadowable_tombstone shadowable)
            : _regular(std::move(regular))
            , _shadowable(std::move(shadowable)) {
    }

    explicit row_tombstone(tombstone regular)
            : row_tombstone(regular, shadowable_tombstone(regular)) {
    }

    row_tombstone() = default;

    int compare(const row_tombstone& t) const {
        return _shadowable.compare(t._shadowable);
    }

    explicit operator bool() const {
        return bool(_shadowable);
    }

    const tombstone& tomb() const {
        return _shadowable.tomb();
    }

    const gc_clock::time_point max_deletion_time() const {
        return std::max(_regular.deletion_time, _shadowable.tomb().deletion_time);
    }

    const tombstone& regular() const {
        return _regular;
    }

    const shadowable_tombstone& shadowable() const {
        return _shadowable;
    }

    bool is_shadowable() const {
        return _shadowable.tomb() > _regular;
    }

    void maybe_shadow(const row_marker& marker) noexcept {
        _shadowable.maybe_shadow(_regular, marker);
    }

    void apply(tombstone regular) noexcept {
        _shadowable.apply(regular);
        _regular.apply(regular);
    }

    void apply(shadowable_tombstone shadowable, row_marker marker) noexcept {
        _shadowable.apply(shadowable.tomb());
        _shadowable.maybe_shadow(_regular, marker);
    }

    void apply(row_tombstone t, row_marker marker) noexcept {
        _regular.apply(t._regular);
        _shadowable.apply(t._shadowable);
        _shadowable.maybe_shadow(_regular, marker);
    }

    friend std::ostream& operator<<(std::ostream& out, const row_tombstone& t) {
        if (t) {
            return out << "{row_tombstone: " << t._regular << (t.is_shadowable() ? t._shadowable : shadowable_tombstone()) << "}";
        } else {
            return out << "{row_tombstone: none}";
        }
    }
};

template<>
struct appending_hash<row_tombstone> {
    template<typename Hasher>
    void operator()(Hasher& h, const row_tombstone& t) const {
        feed_hash(h, t.regular());
        if (t.is_shadowable()) {
            feed_hash(h, t.shadowable());
        }
    }
};

class deletable_row final {
    row_tombstone _deleted_at;
    row_marker _marker;
    row _cells;
public:
    deletable_row() {}
    explicit deletable_row(clustering_row&&);
    deletable_row(const schema& s, const deletable_row& other)
        : _deleted_at(other._deleted_at)
        , _marker(other._marker)
        , _cells(s, column_kind::regular_column, other._cells)
    { }
    deletable_row(const schema& s, row_tombstone tomb, const row_marker& marker, const row& cells)
        : _deleted_at(tomb), _marker(marker), _cells(s, column_kind::regular_column, cells)
    {}

    void apply(const schema&, clustering_row);

    void apply(tombstone deleted_at) {
        _deleted_at.apply(deleted_at);
    }

    void apply(shadowable_tombstone deleted_at) {
        _deleted_at.apply(deleted_at, _marker);
    }

    void apply(row_tombstone deleted_at) {
        _deleted_at.apply(deleted_at, _marker);
    }

    void apply(const row_marker& rm) {
        _marker.apply(rm);
        _deleted_at.maybe_shadow(_marker);
    }

    void remove_tombstone() {
        _deleted_at = {};
    }

    // Weak exception guarantees. After exception, both src and this will commute to the same value as
    // they would should the exception not happen.
    void apply(const schema& s, deletable_row&& src);
    void apply_monotonically(const schema& s, deletable_row&& src);
public:
    row_tombstone deleted_at() const { return _deleted_at; }
    api::timestamp_type created_at() const { return _marker.timestamp(); }
    row_marker& marker() { return _marker; }
    const row_marker& marker() const { return _marker; }
    const row& cells() const { return _cells; }
    row& cells() { return _cells; }
    bool equal(column_kind, const schema& s, const deletable_row& other, const schema& other_schema) const;
    bool is_live(const schema& s, tombstone base_tombstone = tombstone(), gc_clock::time_point query_time = gc_clock::time_point::min()) const;
    bool empty() const { return !_deleted_at && _marker.is_missing() && !_cells.size(); }
    deletable_row difference(const schema&, column_kind, const deletable_row& other) const;

    class printer {
        const schema& _schema;
        const deletable_row& _deletable_row;
    public:
        printer(const schema& s, const deletable_row& r) : _schema(s), _deletable_row(r) { }
        printer(const printer&) = delete;
        printer(printer&&) = delete;

        friend std::ostream& operator<<(std::ostream& os, const printer& p);
    };
    friend std::ostream& operator<<(std::ostream& os, const printer& p);
};

class cache_tracker;

class rows_entry {
    using lru_link_type = bi::list_member_hook<bi::link_mode<bi::auto_unlink>>;
    friend class cache_tracker;
    friend class size_calculator;
    intrusive_set_external_comparator_member_hook _link;
    clustering_key _key;
    deletable_row _row;
    lru_link_type _lru_link;
    struct flags {
        // _before_ck and _after_ck encode position_in_partition::weight
        bool _before_ck : 1;
        bool _after_ck : 1;
        bool _continuous : 1; // See doc of is_continuous.
        bool _dummy : 1;
        // Marks a dummy entry which is after_all_clustered_rows() position.
        // Needed so that eviction, which can't use comparators, can check if it's dealing with it.
        bool _last_dummy : 1;
        flags() : _before_ck(0), _after_ck(0), _continuous(true), _dummy(false), _last_dummy(false) { }
    } _flags{};
    friend class mutation_partition;
public:
    struct last_dummy_tag {};
    explicit rows_entry(clustering_key&& key)
        : _key(std::move(key))
    { }
    explicit rows_entry(const clustering_key& key)
        : _key(key)
    { }
    rows_entry(const schema& s, position_in_partition_view pos, is_dummy dummy, is_continuous continuous)
        : _key(pos.key())
    {
        _flags._last_dummy = bool(dummy) && pos.is_after_all_clustered_rows(s);
        _flags._dummy = bool(dummy);
        _flags._continuous = bool(continuous);
        _flags._before_ck = pos.is_before_key();
        _flags._after_ck = pos.is_after_key();
    }
    rows_entry(const schema& s, last_dummy_tag, is_continuous continuous)
        : rows_entry(s, position_in_partition_view::after_all_clustered_rows(), is_dummy::yes, continuous)
    { }
    rows_entry(const clustering_key& key, deletable_row&& row)
        : _key(key), _row(std::move(row))
    { }
    rows_entry(const schema& s, const clustering_key& key, const deletable_row& row)
        : _key(key), _row(s, row)
    { }
    rows_entry(const schema& s, const clustering_key& key, row_tombstone tomb, const row_marker& marker, const row& row)
        : _key(key), _row(s, tomb, marker, row)
    { }
    rows_entry(rows_entry&& o) noexcept;
    rows_entry(const schema& s, const rows_entry& e)
        : _key(e._key)
        , _row(s, e._row)
        , _flags(e._flags)
    { }
    // Valid only if !dummy()
    clustering_key& key() {
        return _key;
    }
    // Valid only if !dummy()
    const clustering_key& key() const {
        return _key;
    }
    deletable_row& row() {
        return _row;
    }
    const deletable_row& row() const {
        return _row;
    }
    position_in_partition_view position() const {
        return position_in_partition_view(partition_region::clustered, bound_weight(_flags._after_ck - _flags._before_ck), &_key);
    }

    is_continuous continuous() const { return is_continuous(_flags._continuous); }
    void set_continuous(bool value) { _flags._continuous = value; }
    void set_continuous(is_continuous value) { set_continuous(bool(value)); }
    is_dummy dummy() const { return is_dummy(_flags._dummy); }
    bool is_last_dummy() const { return _flags._last_dummy; }
    void set_dummy(bool value) { _flags._dummy = value; }
    void set_dummy(is_dummy value) { _flags._dummy = bool(value); }
    void apply(row_tombstone t) {
        _row.apply(t);
    }
    void apply_monotonically(const schema& s, rows_entry&& e) {
        _row.apply(s, std::move(e._row));
    }
    bool empty() const {
        return _row.empty();
    }
    struct tri_compare {
        position_in_partition::tri_compare _c;
        explicit tri_compare(const schema& s) : _c(s) {}
        int operator()(const rows_entry& e1, const rows_entry& e2) const {
            return _c(e1.position(), e2.position());
        }
        int operator()(const clustering_key& key, const rows_entry& e) const {
            return _c(position_in_partition_view::for_key(key), e.position());
        }
        int operator()(const rows_entry& e, const clustering_key& key) const {
            return _c(e.position(), position_in_partition_view::for_key(key));
        }
        int operator()(const rows_entry& e, position_in_partition_view p) const {
            return _c(e.position(), p);
        }
        int operator()(position_in_partition_view p, const rows_entry& e) const {
            return _c(p, e.position());
        }
        int operator()(position_in_partition_view p1, position_in_partition_view p2) const {
            return _c(p1, p2);
        }
    };
    struct compare {
        tri_compare _c;
        explicit compare(const schema& s) : _c(s) {}
        bool operator()(const rows_entry& e1, const rows_entry& e2) const {
            return _c(e1, e2) < 0;
        }
        bool operator()(const clustering_key& key, const rows_entry& e) const {
            return _c(key, e) < 0;
        }
        bool operator()(const rows_entry& e, const clustering_key& key) const {
            return _c(e, key) < 0;
        }
        bool operator()(const clustering_key_view& key, const rows_entry& e) const {
            return _c(key, e) < 0;
        }
        bool operator()(const rows_entry& e, const clustering_key_view& key) const {
            return _c(e, key) < 0;
        }
        bool operator()(const rows_entry& e, position_in_partition_view p) const {
            return _c(e.position(), p) < 0;
        }
        bool operator()(position_in_partition_view p, const rows_entry& e) const {
            return _c(p, e.position()) < 0;
        }
        bool operator()(position_in_partition_view p1, position_in_partition_view p2) const {
            return _c(p1, p2) < 0;
        }
    };
    bool equal(const schema& s, const rows_entry& other) const;
    bool equal(const schema& s, const rows_entry& other, const schema& other_schema) const;

    size_t memory_usage(const schema&) const;
    void on_evicted(cache_tracker&) noexcept;

    class printer {
        const schema& _schema;
        const rows_entry& _rows_entry;
    public:
        printer(const schema& s, const rows_entry& r) : _schema(s), _rows_entry(r) { }
        printer(const printer&) = delete;
        printer(printer&&) = delete;

        friend std::ostream& operator<<(std::ostream& os, const printer& p);
    };
    friend std::ostream& operator<<(std::ostream& os, const printer& p);
};

// Represents a set of writes made to a single partition.
//
// The object is schema-dependent. Each instance is governed by some
// specific schema version. Accessors require a reference to the schema object
// of that version.
//
// There is an operation of addition defined on mutation_partition objects
// (also called "apply"), which gives as a result an object representing the
// sum of writes contained in the addends. For instances governed by the same
// schema, addition is commutative and associative.
//
// In addition to representing writes, the object supports specifying a set of
// partition elements called "continuity". This set can be used to represent
// lack of information about certain parts of the partition. It can be
// specified which ranges of clustering keys belong to that set. We say that a
// key range is continuous if all keys in that range belong to the continuity
// set, and discontinuous otherwise. By default everything is continuous.
// The static row may be also continuous or not.
// Partition tombstone is always continuous.
//
// Continuity is ignored by instance equality. It's also transient, not
// preserved by serialization.
//
// Continuity is represented internally using flags on row entries. The key
// range between two consecutive entries (both ends exclusive) is continuous
// if and only if rows_entry::continuous() is true for the later entry. The
// range starting after the last entry is assumed to be continuous. The range
// corresponding to the key of the entry is continuous if and only if
// rows_entry::dummy() is false.
//
// Adding two fully-continuous instances gives a fully-continuous instance.
// Continuity doesn't affect how the write part is added.
//
// Addition of continuity is not commutative in general, but is associative.
// The default continuity merging rules are those required by MVCC to
// preserve its invariants. For details, refer to "Continuity merging rules" section
// in the doc in partition_version.hh.
class mutation_partition final {
public:
    using rows_type = intrusive_set_external_comparator<rows_entry, &rows_entry::_link>;
    friend class rows_entry;
    friend class size_calculator;
private:
    tombstone _tombstone;
    row _static_row;
    bool _static_row_continuous = true;
    rows_type _rows;
    // Contains only strict prefixes so that we don't have to lookup full keys
    // in both _row_tombstones and _rows.
    range_tombstone_list _row_tombstones;

    friend class mutation_partition_applier;
    friend class converting_mutation_partition_applier;
public:
    struct copy_comparators_only {};
    struct incomplete_tag {};
    // Constructs an empty instance which is fully discontinuous except for the partition tombstone.
    mutation_partition(incomplete_tag, const schema& s, tombstone);
    static mutation_partition make_incomplete(const schema& s, tombstone t = {}) {
        return mutation_partition(incomplete_tag(), s, t);
    }
    mutation_partition(schema_ptr s)
        : _rows()
        , _row_tombstones(*s)
    { }
    mutation_partition(mutation_partition& other, copy_comparators_only)
        : _rows()
        , _row_tombstones(other._row_tombstones, range_tombstone_list::copy_comparator_only())
    { }
    mutation_partition(mutation_partition&&) = default;
    mutation_partition(const schema& s, const mutation_partition&);
    mutation_partition(const mutation_partition&, const schema&, query::clustering_key_filter_ranges);
    mutation_partition(mutation_partition&&, const schema&, query::clustering_key_filter_ranges);
    ~mutation_partition();
    static mutation_partition& container_of(rows_type&);
    mutation_partition& operator=(mutation_partition&& x) noexcept;
    bool equal(const schema&, const mutation_partition&) const;
    bool equal(const schema& this_schema, const mutation_partition& p, const schema& p_schema) const;
    bool equal_continuity(const schema&, const mutation_partition&) const;
    // Consistent with equal()
    template<typename Hasher>
    void feed_hash(Hasher& h, const schema& s) const {
        hashing_partition_visitor<Hasher> v(h, s);
        accept(s, v);
    }

    class printer {
        const schema& _schema;
        const mutation_partition& _mutation_partition;
    public:
        printer(const schema& s, const mutation_partition& mp) : _schema(s), _mutation_partition(mp) { }
        printer(const printer&) = delete;
        printer(printer&&) = delete;

        friend std::ostream& operator<<(std::ostream& os, const printer& p);
    };
    friend std::ostream& operator<<(std::ostream& os, const printer& p);
public:
    // Makes sure there is a dummy entry after all clustered rows. Doesn't affect continuity.
    // Doesn't invalidate iterators.
    void ensure_last_dummy(const schema&);
    bool static_row_continuous() const { return _static_row_continuous; }
    void set_static_row_continuous(bool value) { _static_row_continuous = value; }
    bool is_fully_continuous() const;
    void make_fully_continuous();
    // Sets or clears continuity of clustering ranges between existing rows.
    void set_continuity(const schema&, const position_range& pr, is_continuous);
    // Returns clustering row ranges which have continuity matching the is_continuous argument.
    clustering_interval_set get_continuity(const schema&, is_continuous = is_continuous::yes) const;
    // Returns true iff all keys from given range are marked as continuous, or range is empty.
    bool fully_continuous(const schema&, const position_range&);
    // Returns true iff all keys from given range are marked as not continuous and range is not empty.
    bool fully_discontinuous(const schema&, const position_range&);
    // Returns true iff all keys from given range have continuity membership as specified by is_continuous.
    bool check_continuity(const schema&, const position_range&, is_continuous) const;
    // Frees elements of the partition in batches.
    // Returns stop_iteration::yes iff there are no more elements to free.
    // Continuity is unspecified after this.
    stop_iteration clear_gently(cache_tracker*) noexcept;
    // Applies mutation_fragment.
    // The fragment must be goverened by the same schema as this object.
    void apply(const schema& s, const mutation_fragment&);
    void apply(tombstone t) { _tombstone.apply(t); }
    void apply_delete(const schema& schema, const clustering_key_prefix& prefix, tombstone t);
    void apply_delete(const schema& schema, range_tombstone rt);
    void apply_delete(const schema& schema, clustering_key_prefix&& prefix, tombstone t);
    void apply_delete(const schema& schema, clustering_key_prefix_view prefix, tombstone t);
    // Equivalent to applying a mutation with an empty row, created with given timestamp
    void apply_insert(const schema& s, clustering_key_view, api::timestamp_type created_at);
    void apply_insert(const schema& s, clustering_key_view, api::timestamp_type created_at,
                      gc_clock::duration ttl, gc_clock::time_point expiry);
    // prefix must not be full
    void apply_row_tombstone(const schema& schema, clustering_key_prefix prefix, tombstone t);
    void apply_row_tombstone(const schema& schema, range_tombstone rt);
    //
    // Applies p to current object.
    //
    // Commutative when this_schema == p_schema. If schemas differ, data in p which
    // is not representable in this_schema is dropped, thus apply() loses commutativity.
    //
    // Weak exception guarantees.
    void apply(const schema& this_schema, const mutation_partition& p, const schema& p_schema);
    // Use in case this instance and p share the same schema.
    // Same guarantees as apply(const schema&, mutation_partition&&, const schema&);
    void apply(const schema& s, mutation_partition&& p);
    // Same guarantees and constraints as for apply(const schema&, const mutation_partition&, const schema&).
    void apply(const schema& this_schema, mutation_partition_view p, const schema& p_schema);

    // Applies p to this instance.
    //
    // Monotonic exception guarantees. In case of exception the sum of p and this remains the same as before the exception.
    // This instance and p are governed by the same schema.
    //
    // Must be provided with a pointer to the cache_tracker, which owns both this and p.
    //
    // Returns stop_iteration::no if the operation was preempted before finished, and stop_iteration::yes otherwise.
    // On preemption the sum of this and p stays the same (represents the same set of writes), and the state of this
    // object contains at least all the writes it contained before the call (monotonicity). It may contain partial writes.
    // Also, some progress is always guaranteed (liveness).
    //
    // The operation can be drien to completion like this:
    //
    //   while (apply_monotonically(..., is_preemtable::yes) == stop_iteration::no) { }
    //
    // If is_preemptible::no is passed as argument then stop_iteration::no is never returned.
    stop_iteration apply_monotonically(const schema& s, mutation_partition&& p, cache_tracker*, is_preemptible = is_preemptible::no);
    stop_iteration apply_monotonically(const schema& s, mutation_partition&& p, const schema& p_schema, is_preemptible = is_preemptible::no);

    // Weak exception guarantees.
    // Assumes this and p are not owned by a cache_tracker.
    void apply_weak(const schema& s, const mutation_partition& p, const schema& p_schema);
    void apply_weak(const schema& s, mutation_partition&&);
    void apply_weak(const schema& s, mutation_partition_view p, const schema& p_schema);

    // Converts partition to the new schema. When succeeds the partition should only be accessed
    // using the new schema.
    //
    // Strong exception guarantees.
    void upgrade(const schema& old_schema, const schema& new_schema);
private:
    void insert_row(const schema& s, const clustering_key& key, deletable_row&& row);
    void insert_row(const schema& s, const clustering_key& key, const deletable_row& row);

    uint32_t do_compact(const schema& s,
        gc_clock::time_point now,
        const std::vector<query::clustering_range>& row_ranges,
        bool reverse,
        uint32_t row_limit,
        can_gc_fn&);

    // Calls func for each row entry inside row_ranges until func returns stop_iteration::yes.
    // Removes all entries for which func didn't return stop_iteration::no or wasn't called at all.
    // Removes all entries that are empty, check rows_entry::empty().
    // If reversed is true, func will be called on entries in reverse order. In that case row_ranges
    // must be already in reverse order.
    template<bool reversed, typename Func>
    void trim_rows(const schema& s,
        const std::vector<query::clustering_range>& row_ranges,
        Func&& func);
public:
    // Performs the following:
    //   - throws out data which doesn't belong to row_ranges
    //   - expires cells and tombstones based on query_time
    //   - drops cells covered by higher-level tombstones (compaction)
    //   - leaves at most row_limit live rows
    //
    // Note: a partition with a static row which has any cell live but no
    // clustered rows still counts as one row, according to the CQL row
    // counting rules.
    //
    // Returns the count of CQL rows which remained. If the returned number is
    // smaller than the row_limit it means that there was no more data
    // satisfying the query left.
    //
    // The row_limit parameter must be > 0.
    //
    uint32_t compact_for_query(const schema& s, gc_clock::time_point query_time,
        const std::vector<query::clustering_range>& row_ranges, bool reversed, uint32_t row_limit);

    // Performs the following:
    //   - expires cells based on compaction_time
    //   - drops cells covered by higher-level tombstones
    //   - drops expired tombstones which timestamp is before max_purgeable
    void compact_for_compaction(const schema& s, can_gc_fn&,
        gc_clock::time_point compaction_time);

    // Returns the minimal mutation_partition that when applied to "other" will
    // create a mutation_partition equal to the sum of other and this one.
    // This and other must both be governed by the same schema s.
    mutation_partition difference(schema_ptr s, const mutation_partition& other) const;

    // Returns a subset of this mutation holding only information relevant for given clustering ranges.
    // Range tombstones will be trimmed to the boundaries of the clustering ranges.
    mutation_partition sliced(const schema& s, const query::clustering_row_ranges&) const;

    // Returns true if the mutation_partition represents no writes.
    bool empty() const;
public:
    deletable_row& clustered_row(const schema& s, const clustering_key& key);
    deletable_row& clustered_row(const schema& s, clustering_key&& key);
    deletable_row& clustered_row(const schema& s, clustering_key_view key);
    deletable_row& clustered_row(const schema& s, position_in_partition_view pos, is_dummy, is_continuous);
public:
    tombstone partition_tombstone() const { return _tombstone; }
    row& static_row() { return _static_row; }
    const row& static_row() const { return _static_row; }
    // return a set of rows_entry where each entry represents a CQL row sharing the same clustering key.
    const rows_type& clustered_rows() const { return _rows; }
    const range_tombstone_list& row_tombstones() const { return _row_tombstones; }
    rows_type& clustered_rows() { return _rows; }
    range_tombstone_list& row_tombstones() { return _row_tombstones; }
    const row* find_row(const schema& s, const clustering_key& key) const;
    tombstone range_tombstone_for_row(const schema& schema, const clustering_key& key) const;
    row_tombstone tombstone_for_row(const schema& schema, const clustering_key& key) const;
    // Can be called only for non-dummy entries
    row_tombstone tombstone_for_row(const schema& schema, const rows_entry& e) const;
    boost::iterator_range<rows_type::const_iterator> range(const schema& schema, const query::clustering_range& r) const;
    rows_type::const_iterator lower_bound(const schema& schema, const query::clustering_range& r) const;
    rows_type::const_iterator upper_bound(const schema& schema, const query::clustering_range& r) const;
    rows_type::iterator lower_bound(const schema& schema, const query::clustering_range& r);
    rows_type::iterator upper_bound(const schema& schema, const query::clustering_range& r);
    boost::iterator_range<rows_type::iterator> range(const schema& schema, const query::clustering_range& r);
    // Returns an iterator range of rows_entry, with only non-dummy entries.
    auto non_dummy_rows() const {
        return boost::make_iterator_range(_rows.begin(), _rows.end())
            | boost::adaptors::filtered([] (const rows_entry& e) { return bool(!e.dummy()); });
    }
    // Writes this partition using supplied query result writer.
    // The partition should be first compacted with compact_for_query(), otherwise
    // results may include data which is deleted/expired.
    // At most row_limit CQL rows will be written and digested.
    void query_compacted(query::result::partition_writer& pw, const schema& s, uint32_t row_limit) const;
    void accept(const schema&, mutation_partition_visitor&) const;

    // Returns the number of live CQL rows in this partition.
    //
    // Note: If no regular rows are live, but there's something live in the
    // static row, the static row counts as one row. If there is at least one
    // regular row live, static row doesn't count.
    //
    size_t live_row_count(const schema&,
        gc_clock::time_point query_time = gc_clock::time_point::min()) const;

    bool is_static_row_live(const schema&,
        gc_clock::time_point query_time = gc_clock::time_point::min()) const;

    size_t external_memory_usage(const schema&) const;
private:
    template<typename Func>
    void for_each_row(const schema& schema, const query::clustering_range& row_range, bool reversed, Func&& func) const;
    friend class counter_write_query_result_builder;
};

inline
mutation_partition& mutation_partition::container_of(rows_type& rows) {
    return *boost::intrusive::get_parent_from_member(&rows, &mutation_partition::_rows);
}