Files
scylladb/mutation/mutation_partition_v2.cc
Michał Chojnowski eb5ccb7356 mutation_partition_v2: fix a minor bug in printer
Commit 1cb95b8cf caused a small regression in the debug printer.
After that commit, range tombstones are printed to stdout,
instead of the target stream.
In practice, this causes range tombstones to appear in test logs
out of order with respect to other parts of the debug message.

Fix that.

Closes #13766
2023-05-04 16:56:40 +03:00

1208 lines
48 KiB
C++

/*
* Copyright (C) 2014-present ScyllaDB
*/
/*
* SPDX-License-Identifier: AGPL-3.0-or-later
*/
#include <seastar/core/coroutine.hh>
#include <seastar/coroutine/maybe_yield.hh>
#include <boost/range/adaptor/reversed.hpp>
#include "mutation_partition_v2.hh"
#include "clustering_interval_set.hh"
#include "converting_mutation_partition_applier.hh"
#include "partition_builder.hh"
#include "query-result-writer.hh"
#include "atomic_cell_hash.hh"
#include "reversibly_mergeable.hh"
#include "mutation_fragment.hh"
#include "mutation_query.hh"
#include "service/priority_manager.hh"
#include "mutation_compactor.hh"
#include "counters.hh"
#include "row_cache.hh"
#include "view_info.hh"
#include "mutation_cleaner.hh"
#include <seastar/core/execution_stage.hh>
#include "types/map.hh"
#include "compaction/compaction_garbage_collector.hh"
#include "utils/exceptions.hh"
#include "clustering_key_filter.hh"
#include "mutation_partition_view.hh"
#include "tombstone_gc.hh"
#include "utils/unconst.hh"
// Logger used by this file for trace and error messages.
// Only declared here; the definition lives in another translation unit.
extern logging::logger mplog;
// Copy-constructs a partition from x, which must be described by schema s
// (asserted in SEASTAR_DEBUG builds). Partition-level members are copied in
// the initializer list; row entries are cloned one by one using the current
// allocator.
mutation_partition_v2::mutation_partition_v2(const schema& s, const mutation_partition_v2& x)
    : _tombstone(x._tombstone)
    , _static_row(s, column_kind::static_column, x._static_row)
    , _static_row_continuous(x._static_row_continuous)
    , _rows()
#ifdef SEASTAR_DEBUG
    , _schema_version(s.version())
#endif
{
#ifdef SEASTAR_DEBUG
    assert(x._schema_version == _schema_version);
#endif
    // Produces a deep copy of a single source entry.
    auto clone_entry = [&s] (const rows_entry* source_entry) -> rows_entry* {
        return current_allocator().construct<rows_entry>(s, *source_entry);
    };
    _rows.clone_from(x._rows, clone_entry, current_deleter<rows_entry>());
}
// Builds a v2 partition out of a legacy mutation_partition, consuming x.
// The partition tombstone, static row, static-row continuity and clustered
// rows are taken over in the initializer list. Each range tombstone of x is
// then turned into a pair of dummy entries (start: not continuous, end:
// continuous and carrying the tombstone) in a temporary partition, which is
// merged into this one.
mutation_partition_v2::mutation_partition_v2(const schema& s, mutation_partition&& x)
    : _tombstone(x.partition_tombstone())
    , _static_row(std::move(x.static_row()))
    , _static_row_continuous(x.static_row_continuous())
    , _rows(std::move(x.mutable_clustered_rows()))
#ifdef SEASTAR_DEBUG
    , _schema_version(s.version())
#endif
{
    auto&& range_tombstones = x.mutable_row_tombstones();
    if (range_tombstones.empty()) {
        return;
    }
    try {
        mutation_partition_v2 converted(s.shared_from_this());
        for (auto&& rt_entry : range_tombstones) {
            range_tombstone& rt = rt_entry.tombstone();
            converted.clustered_rows_entry(s, rt.position(), is_dummy::yes, is_continuous::no);
            rows_entry& end_marker =
                converted.clustered_rows_entry(s, rt.end_position(), is_dummy::yes, is_continuous::yes);
            end_marker.set_range_tombstone(rt.tomb);
        }
        mutation_application_stats app_stats;
        apply_monotonically(s, std::move(converted), s, app_stats);
    } catch (...) {
        // The destructor will not run for a constructor that throws, so the
        // rows already taken over are disposed of here before rethrowing.
        _rows.clear_and_dispose(current_deleter<rows_entry>());
        throw;
    }
}
// Builds a v2 partition from a legacy mutation_partition without modifying x:
// x is first copied into a temporary mutation_partition, and that temporary is
// handed to the rvalue-taking constructor.
mutation_partition_v2::mutation_partition_v2(const schema& s, const mutation_partition& x)
: mutation_partition_v2(s, mutation_partition(s, x))
{ }
// Destroys the partition, disposing of every row entry through the
// deleter obtained from current_deleter<rows_entry>().
mutation_partition_v2::~mutation_partition_v2() {
_rows.clear_and_dispose(current_deleter<rows_entry>());
}
// Move assignment implemented as destroy-then-move-construct in place.
// Self-assignment is a no-op.
mutation_partition_v2&
mutation_partition_v2::operator=(mutation_partition_v2&& x) noexcept {
    if (this == &x) {
        return *this;
    }
    this->~mutation_partition_v2();
    new (this) mutation_partition_v2(std::move(x));
    return *this;
}
// Makes sure the row set ends with a "last dummy" entry.
// If the final entry already is one, nothing changes; otherwise a new
// continuous last-dummy entry is appended.
void mutation_partition_v2::ensure_last_dummy(const schema& s) {
    check_schema(s);
    const bool already_terminated = !_rows.empty() && _rows.rbegin()->is_last_dummy();
    if (already_terminated) {
        return;
    }
    auto last_dummy = alloc_strategy_unique_ptr<rows_entry>(
        current_allocator().construct<rows_entry>(s, rows_entry::last_dummy_tag(), is_continuous::yes));
    _rows.insert_before(_rows.end(), std::move(last_dummy));
}
// fmt support for apply_resume: renders as "{<stage as int>, <position>}".
template <>
struct fmt::formatter<apply_resume> : fmt::formatter<std::string_view> {
    template <typename FormatContext>
    auto format(const apply_resume& res, FormatContext& ctx) const {
        const auto stage_number = static_cast<int>(res._stage);
        return fmt::format_to(ctx.out(), "{{{}, {}}}", stage_number, res._pos);
    }
};
// Convenience overload: translates the is_preemptible flag into a
// preemption_check and forwards to the overload that does the actual merge.
stop_iteration mutation_partition_v2::apply_monotonically(const schema& s, mutation_partition_v2&& p, cache_tracker* tracker,
        mutation_application_stats& app_stats, is_preemptible preemptible, apply_resume& res, is_evictable evictable) {
    preemption_check should_preempt = preemptible ? default_preemption_check() : never_preempt();
    return apply_monotonically(s, std::move(p), tracker, app_stats, std::move(should_preempt), res, evictable);
}
// Merges the contents of p into this partition, consuming p as it goes.
//
// The work is organised as follows:
//  1. p's partition tombstone, static row and static-row continuity are merged
//     into this instance.
//  2. If p carries a partition tombstone, every entry of _rows is compacted
//     against the merged tombstone (stage partition_tombstone_compaction).
//  3. Entries of p._rows are visited in order; each is either moved into _rows
//     (no entry at that position yet) or merged into the existing entry and
//     erased from p (stage merging_rows).
//
// Parameters:
//  s            - schema of both partitions (asserted in SEASTAR_DEBUG builds).
//  p            - source partition; entries are moved out of / erased from it.
//  tracker      - may be null; when set it is told about inserted and removed
//                 entries and is passed on to maybe_drop().
//  app_stats    - rows_compacted_with_tombstones, row_hits and row_writes are
//                 incremented here.
//  need_preempt - polled inside steps 2 and 3; when it returns true the function
//                 may return early.
//  res          - in/out resume state. Read at entry to decide where step 2
//                 continues and whether step 3 was already entered; updated
//                 when a stage is entered or preempted.
//  evictable    - selects how "interval loaded" is decided: by continuity for
//                 evictable snapshots, by presence of a range tombstone otherwise.
//
// Returns stop_iteration::yes when all of p._rows has been consumed, and
// stop_iteration::no when the call stopped early because of preemption.
stop_iteration mutation_partition_v2::apply_monotonically(const schema& s, mutation_partition_v2&& p, cache_tracker* tracker,
mutation_application_stats& app_stats, preemption_check need_preempt, apply_resume& res, is_evictable evictable) {
#ifdef SEASTAR_DEBUG
assert(s.version() == _schema_version);
assert(p._schema_version == _schema_version);
#endif
// Partition-level state is merged first, before any row work.
_tombstone.apply(p._tombstone);
_static_row.apply_monotonically(s, column_kind::static_column, std::move(p._static_row));
_static_row_continuous |= p._static_row_continuous;
rows_entry::tri_compare cmp(s);
position_in_partition::equal_compare eq(s);
auto del = current_deleter<rows_entry>();
// Compacts one entry against the (already merged) partition tombstone and counts it.
auto compact = [&] (rows_entry& e) {
++app_stats.rows_compacted_with_tombstones;
e.compact(s, _tombstone);
};
if (p._tombstone) {
rows_type::iterator i;
if (res._stage == apply_resume::stage::partition_tombstone_compaction) {
// Resuming: continue right after the position recorded by the preempted call.
i = _rows.upper_bound(res._pos, cmp);
} else {
i = _rows.begin();
}
auto prev_i = (i == _rows.begin()) ? rows_type::iterator() : std::prev(i);
while (i != _rows.end()) {
compact(*i);
if (prev_i) {
maybe_drop(s, tracker, prev_i, app_stats);
}
if (need_preempt() && i != _rows.end()) {
res = apply_resume(apply_resume::stage::partition_tombstone_compaction, i->position());
return stop_iteration::no;
}
prev_i = i;
++i;
}
if (prev_i != _rows.end()) {
maybe_drop(s, tracker, prev_i, app_stats);
}
// TODO: Drop redundant range tombstones
// Reset p's tombstone so that this block is not entered again if the
// function is called once more with the same p.
p._tombstone = {};
}
// Inserting new entries into LRU here is generally unsafe because
// it may violate the "older versions are evicted first" rule (see row_cache.md).
// It could happen, that there are newer versions in the MVCC chain with the same
// key, not involved in this merge. Inserting an entry here would put this
// entry ahead in the LRU, and the newer entry could get evicted earlier leading
// to apparent loss of writes.
// To avoid this, when inserting sentinels we must use lru::add_before() so that
// they are put right before in the same place in the LRU.
// Note: This procedure is not violating the "older versions are evicted first" rule.
// It may move some entries from the newer version into the old version,
// so the older version may have entries while the new version is already experiencing
// eviction. However, the original information which was there in the old version
// is guaranteed to be evicted prior to that, so there is no way for old information
// to be exposed by such eviction.
auto p_i = p._rows.begin();
auto i = _rows.begin();
rows_type::iterator lb_i; // iterator into _rows for previously inserted entry.
// When resuming, the predecessor of the sentinel may have been compacted.
bool prev_compacted = true;
if (res._stage < apply_resume::stage::merging_rows) {
// First time the row-merging stage is entered for this p: record it in res.
prev_compacted = false;
res = apply_resume::merging_rows();
}
// Guards the preemption check at the bottom of the main loop: at least one
// entry of p is processed per call before that check may return early.
bool made_progress = false;
// Engaged p_sentinel indicates that information in p up to sentinel->position() was
// merged into this instance and that flags on the entry pointed to by p_i are
// only valid for the key range up to sentinel->position().
// We should insert the sentinel back before returning so that the sum of p and this instance
// remains consistent, and attributes like continuity and range_tombstone do not
// extend to before_all_clustering_keys() in p.
// If this_sentinel is engaged then it will be inserted into this instance at
// the same position as p_sentinel, and reflects information about the interval
// preceding the sentinel.
// We need two sentinels so that there is no gap in continuity in case there is no entry
// in this instance at the position of p_sentinel.
// The sentinel never has a clustering key position, so it carries no row information.
alloc_strategy_unique_ptr<rows_entry> p_sentinel;
alloc_strategy_unique_ptr<rows_entry> this_sentinel;
// Registered with defer(), so this is expected to run when the function exits
// on any path; it puts whichever sentinels are still engaged back in place.
auto insert_sentinel_back = defer([&] {
// Insert this_sentinel before sentinel so that the former lands before the latter in LRU.
if (this_sentinel) {
assert(p_i != p._rows.end());
auto rt = this_sentinel->range_tombstone();
auto insert_result = _rows.insert_before_hint(i, std::move(this_sentinel), cmp);
auto i2 = insert_result.first;
if (insert_result.second) {
mplog.trace("{}: inserting sentinel at {}", fmt::ptr(this), i2->position());
if (tracker) {
tracker->insert(*std::prev(i2), *i2);
}
} else {
// An entry already exists at the sentinel's position: fold the
// sentinel's attributes into it instead of inserting.
mplog.trace("{}: merging sentinel at {}", fmt::ptr(this), i2->position());
i2->set_continuous(true);
i2->set_range_tombstone(rt);
}
}
if (p_sentinel) {
assert(p_i != p._rows.end());
if (cmp(p_i->position(), p_sentinel->position()) == 0) {
mplog.trace("{}: clearing attributes on {}", fmt::ptr(&p), p_i->position());
assert(p_i->dummy());
p_i->set_continuous(false);
p_i->set_range_tombstone({});
} else {
mplog.trace("{}: inserting sentinel at {}", fmt::ptr(&p), p_sentinel->position());
auto insert_result = p._rows.insert_before_hint(p_i, std::move(p_sentinel), cmp);
if (tracker) {
tracker->insert(*p_i, *insert_result.first);
}
}
}
});
// Main merge loop: consumes p._rows from front to back.
while (p_i != p._rows.end()) {
rows_entry& src_e = *p_i;
// miss stays true when _rows has no entry at src_e's position.
bool miss = true;
if (i != _rows.end()) {
auto x = cmp(*i, src_e);
if (x < 0) {
bool match;
i = _rows.lower_bound(src_e, match, cmp);
miss = !match;
} else {
miss = x > 0;
}
}
// Invariants:
// i->position() >= p_i->position()
// The block below reflects the information from interval (lb_i->position(), p_i->position()) to _rows,
// up to the last entry in _rows which has position() < p_i->position(). The remainder is reflected by the act of
// moving p_i itself.
bool prev_interval_loaded = (evictable && src_e.continuous()) || (!evictable && src_e.range_tombstone());
if (prev_interval_loaded) {
// lb_i is only valid if prev_interval_loaded.
rows_type::iterator prev_lb_i;
if (lb_i) {
// If there is lb_i, it means the interval starts exactly at lb_i->position() in p.
// Increment is needed, we don't want to set attributes on the lower bound of the interval.
prev_lb_i = lb_i;
++lb_i;
} else {
lb_i = _rows.begin();
}
while (lb_i != i) {
bool compaction_worthwhile = src_e.range_tombstone() > lb_i->range_tombstone();
// This works for both evictable and non-evictable snapshots.
// For evictable snapshots we could replace the tombstone with newer, but due to
// the "information monotonicity" rule, adding tombstone works too.
lb_i->set_range_tombstone(lb_i->range_tombstone() + src_e.range_tombstone());
lb_i->set_continuous(true);
if (prev_compacted && prev_lb_i) {
maybe_drop(s, tracker, prev_lb_i, app_stats);
}
prev_compacted = false;
if (lb_i->dummy()) {
prev_compacted = true;
} else if (compaction_worthwhile) {
compact(*lb_i);
prev_compacted = true;
}
if (need_preempt()) {
// Preempted in the middle of the interval: prepare sentinels positioned
// right after lb_i; the deferred action above inserts them on return.
auto s1 = alloc_strategy_unique_ptr<rows_entry>(
current_allocator().construct<rows_entry>(s,
position_in_partition::after_key(s, lb_i->position()), is_dummy::yes, is_continuous::no));
alloc_strategy_unique_ptr<rows_entry> s2;
if (lb_i->position().is_clustering_row()) {
s2 = alloc_strategy_unique_ptr<rows_entry>(
current_allocator().construct<rows_entry>(s, s1->position(), is_dummy::yes, is_continuous::yes));
auto lb_i_next = std::next(lb_i);
if (lb_i_next != _rows.end() && lb_i_next->continuous()) {
s2->set_range_tombstone(lb_i_next->range_tombstone() + src_e.range_tombstone());
} else {
s2->set_range_tombstone(src_e.range_tombstone());
}
}
p_sentinel = std::move(s1);
this_sentinel = std::move(s2);
mplog.trace("preempted, res={}", res);
return stop_iteration::no;
}
prev_lb_i = lb_i;
++lb_i;
}
}
auto next_p_i = std::next(p_i);
// next_interval_loaded is true iff there are attributes on next_p_i which apply
// to the interval (p_i->position(), next_p_i->position), and we
// have to prepare a sentinel when removing p_i from p in case merging
// needs to stop before next_p_i is moved.
bool next_interval_loaded = next_p_i != p._rows.end()
&& ((evictable && next_p_i->continuous()) || (!evictable && next_p_i->range_tombstone()));
bool do_compact = false;
if (miss) {
// No entry at this position in _rows: the entry is transferred from p.
alloc_strategy_unique_ptr<rows_entry> s1;
alloc_strategy_unique_ptr<rows_entry> s2;
if (next_interval_loaded) {
// FIXME: Avoid reallocation
s1 = alloc_strategy_unique_ptr<rows_entry>(
current_allocator().construct<rows_entry>(s,
position_in_partition::after_key(s, src_e.position()), is_dummy::yes, is_continuous::no));
if (src_e.position().is_clustering_row()) {
s2 = alloc_strategy_unique_ptr<rows_entry>(
current_allocator().construct<rows_entry>(s,
s1->position(), is_dummy::yes, is_continuous::yes));
if (i != _rows.end() && i->continuous()) {
s2->set_range_tombstone(i->range_tombstone() + src_e.range_tombstone());
} else {
s2->set_range_tombstone(src_e.range_tombstone());
}
}
}
// key_grabber hands the entry at p_i over to _rows
// (presumably also advancing p_i past it - TODO confirm against the container).
rows_type::key_grabber pi_kg(p_i);
lb_i = _rows.insert_before(i, std::move(pi_kg));
p_sentinel = std::move(s1);
this_sentinel = std::move(s2);
// Check if src_e falls into a continuous range.
// The range past the last entry is also always implicitly continuous.
if (i == _rows.end() || i->continuous()) {
tombstone i_rt = i != _rows.end() ? i->range_tombstone() : tombstone();
// Cannot apply only-row range tombstone falling into a continuous range without inserting extra entry.
// Should not occur in practice due to the "older versions are evicted first" rule.
// Never occurs in non-evictable snapshots because they are continuous.
if (!src_e.continuous() && src_e.range_tombstone() > i_rt) {
if (src_e.dummy()) {
lb_i->set_range_tombstone(i_rt);
} else {
position_in_partition_view i_pos = i != _rows.end() ? i->position()
: position_in_partition_view::after_all_clustered_rows();
// See the "no singular tombstones" rule.
mplog.error("Cannot merge entry {} with rt={}, cont=0 into continuous range before {} with rt={}",
src_e.position(), src_e.range_tombstone(), i_pos, i_rt);
abort();
}
} else {
lb_i->set_range_tombstone(src_e.range_tombstone() + i_rt);
}
lb_i->set_continuous(true);
}
} else {
// An entry exists at the same position: src_e is merged into *i and then erased from p.
assert(i->dummy() == src_e.dummy());
alloc_strategy_unique_ptr<rows_entry> s1;
alloc_strategy_unique_ptr<rows_entry> s2;
if (next_interval_loaded) {
// FIXME: Avoid reallocation
s1 = alloc_strategy_unique_ptr<rows_entry>(
current_allocator().construct<rows_entry>(s,
position_in_partition::after_key(s, src_e.position()), is_dummy::yes, is_continuous::no));
if (src_e.position().is_clustering_row()) {
s2 = alloc_strategy_unique_ptr<rows_entry>(
current_allocator().construct<rows_entry>(s, s1->position(), is_dummy::yes, is_continuous::yes));
auto next_i = std::next(i);
if (next_i != _rows.end() && next_i->continuous()) {
s2->set_range_tombstone(next_i->range_tombstone() + src_e.range_tombstone());
} else {
s2->set_range_tombstone(src_e.range_tombstone());
}
}
}
{
// FIXME: This can be an evictable snapshot even if !tracker, see partition_entry::squashed()
// So we need to handle continuity as if it was an evictable snapshot.
if (i->continuous()) {
if (src_e.range_tombstone() > i->range_tombstone()) {
// Cannot apply range tombstone in such a case.
// Should not occur in practice due to the "older versions are evicted first" rule.
if (!src_e.continuous()) {
// range tombstone on a discontinuous dummy does not matter
if (!src_e.dummy()) {
// See the "no singular tombstones" rule.
mplog.error("Cannot merge entry {} with rt={}, cont=0 into an entry which has rt={}, cont=1",
src_e.position(), src_e.range_tombstone(), i->range_tombstone());
abort();
}
} else {
i->set_range_tombstone(i->range_tombstone() + src_e.range_tombstone());
}
}
} else {
i->set_continuous(src_e.continuous());
i->set_range_tombstone(i->range_tombstone() + src_e.range_tombstone());
}
}
if (tracker) {
// Newer evictable versions store complete rows
i->row() = std::move(src_e.row());
// Need to preserve the LRU link of the later version in case it's
// the last dummy entry which holds the partition entry linked in LRU.
i->swap(src_e);
tracker->remove(src_e);
} else {
// Avoid row compaction if no newer range tombstone.
do_compact = (src_e.range_tombstone() + src_e.row().deleted_at().regular()) >
(i->range_tombstone() + i->row().deleted_at().regular());
memory::on_alloc_point();
i->apply_monotonically(s, std::move(src_e));
}
++app_stats.row_hits;
p_i = p._rows.erase_and_dispose(p_i, del);
lb_i = i;
++i;
p_sentinel = std::move(s1);
this_sentinel = std::move(s2);
}
// All operations above up to each insert_before() must be noexcept.
if (prev_compacted && lb_i != _rows.begin()) {
maybe_drop(s, tracker, std::prev(lb_i), app_stats);
}
if (lb_i->dummy()) {
prev_compacted = true;
} else if (do_compact) {
compact(*lb_i);
prev_compacted = true;
} else {
prev_compacted = false;
}
if (prev_compacted && !next_interval_loaded) {
// next_p_i will not see prev_interval_loaded so will not attempt to drop predecessors.
// We have to do it now.
maybe_drop(s, tracker, lb_i, app_stats);
lb_i = {};
}
++app_stats.row_writes;
// We must not return stop_iteration::no if we removed the last element from p._rows.
// Otherwise, p_i will be left empty, and thus fully continuous, violating the
// invariant that the sum of this and p has the same continuity as before merging.
if (made_progress && need_preempt() && p_i != p._rows.end()) {
return stop_iteration::no;
}
made_progress = true;
}
if (prev_compacted && lb_i != _rows.end()) {
maybe_drop(s, tracker, lb_i, app_stats);
}
return stop_iteration::yes;
}
// Merges p, which is described by p_schema, into this partition (schema s)
// without a cache tracker.
// When the schema versions differ, a copy of p is upgraded to s and merged
// with preemption disabled; otherwise p is merged directly, honouring the
// preemptible flag.
stop_iteration mutation_partition_v2::apply_monotonically(const schema& s, mutation_partition_v2&& p, const schema& p_schema,
        mutation_application_stats& app_stats, is_preemptible preemptible, apply_resume& res, is_evictable evictable) {
    if (s.version() != p_schema.version()) {
        mutation_partition_v2 p2(s, p);
        p2.upgrade(p_schema, s);
        return apply_monotonically(s, std::move(p2), no_cache_tracker, app_stats, never_preempt(), res, evictable); // FIXME: make preemptible
    }
    return apply_monotonically(s, std::move(p), no_cache_tracker, app_stats,
            preemptible ? default_preemption_check() : never_preempt(), res, evictable);
}
// Non-preemptible convenience overload: merges p in a single call, starting
// from a default-constructed resume state that is discarded afterwards.
stop_iteration mutation_partition_v2::apply_monotonically(const schema& s, mutation_partition_v2&& p, cache_tracker *tracker,
mutation_application_stats& app_stats, is_evictable evictable) {
apply_resume res;
return apply_monotonically(s, std::move(p), tracker, app_stats, is_preemptible::no, res, evictable);
}
// Non-preemptible, non-evictable convenience overload for a source partition
// described by p_schema. Uses a throw-away, default-constructed resume state.
stop_iteration mutation_partition_v2::apply_monotonically(const schema& s, mutation_partition_v2&& p, const schema& p_schema,
mutation_application_stats& app_stats) {
apply_resume res;
return apply_monotonically(s, std::move(p), p_schema, app_stats, is_preemptible::no, res, is_evictable::no);
}
// Applies the contents of p (described by p_schema) to this partition
// (described by s), leaving p untouched.
// A copy of p is made and that copy is consumed by apply_monotonically().
void mutation_partition_v2::apply(const schema& s, const mutation_partition_v2& p, const schema& p_schema,
        mutation_application_stats& app_stats) {
    // p is a const reference, so it can only ever be copied. The previous
    // std::move(p) had no effect and only suggested a move that never happened.
    apply_monotonically(s, mutation_partition_v2(p_schema, p), p_schema, app_stats);
}
// Applies p to this partition; both are described by schema s.
// The merge runs without a cache tracker, non-preemptibly, and treats the
// data as non-evictable.
void mutation_partition_v2::apply(const schema& s, mutation_partition_v2&& p, mutation_application_stats& app_stats) {
apply_monotonically(s, mutation_partition_v2(s, std::move(p)), no_cache_tracker, app_stats, is_evictable::no);
}
void
mutation_partition_v2::apply_weak(const schema& s, mutation_partition_view p,
const schema& p_schema, mutation_application_stats& app_stats) {
// FIXME: Optimize
mutation_partition p2(p_schema.shared_from_this());
partition_builder b(p_schema, p2);
p.accept(p_schema, b);
apply_monotonically(s, mutation_partition_v2(p_schema, std::move(p2)), p_schema, app_stats);
}
// Applies a legacy mutation_partition (described by p_schema) to this
// partition without modifying p: p is copied into a temporary v2 partition
// which is then merged.
// NOTE(review): the temporary is constructed with s, although p is described
// by p_schema; the sibling overloads construct it with p_schema. Confirm that
// this is intended when the two schemas differ.
void mutation_partition_v2::apply_weak(const schema& s, const mutation_partition& p,
const schema& p_schema, mutation_application_stats& app_stats) {
// FIXME: Optimize
apply_monotonically(s, mutation_partition_v2(s, p), p_schema, app_stats);
}
// Applies a legacy mutation_partition with the same schema s, consuming p.
// p is converted to a temporary v2 partition and merged without a cache
// tracker, as non-evictable data.
void mutation_partition_v2::apply_weak(const schema& s, mutation_partition&& p, mutation_application_stats& app_stats) {
apply_monotonically(s, mutation_partition_v2(s, std::move(p)), no_cache_tracker, app_stats, is_evictable::no);
}
void
mutation_partition_v2::apply_row_tombstone(const schema& schema, clustering_key_prefix prefix, tombstone t) {
check_schema(schema);
assert(!prefix.is_full(schema));
auto start = prefix;
apply_row_tombstone(schema, range_tombstone{std::move(start), std::move(prefix), std::move(t)});
}
void
mutation_partition_v2::apply_row_tombstone(const schema& schema, range_tombstone rt) {
check_schema(schema);
mutation_partition mp(schema.shared_from_this());
mp.apply_row_tombstone(schema, std::move(rt));
mutation_application_stats stats;
apply_weak(schema, std::move(mp), stats);
}
// Applies a deletion identified by a clustering key prefix:
//  - empty prefix: partition tombstone,
//  - full prefix:  tombstone on that single clustered row,
//  - otherwise:    row tombstone covering the prefix.
void
mutation_partition_v2::apply_delete(const schema& schema, const clustering_key_prefix& prefix, tombstone t) {
    check_schema(schema);
    if (prefix.is_empty(schema)) {
        apply(t);
        return;
    }
    if (prefix.is_full(schema)) {
        clustered_row(schema, prefix).apply(t);
        return;
    }
    apply_row_tombstone(schema, prefix, t);
}
// Applies a deletion expressed as a range tombstone.
// A tombstone that covers exactly one clustering row is routed to the
// key-based overload; anything else is applied as a row tombstone.
void
mutation_partition_v2::apply_delete(const schema& schema, range_tombstone rt) {
    check_schema(schema);
    if (range_tombstone::is_single_clustering_row_tombstone(schema, rt.start, rt.start_kind, rt.end, rt.end_kind)) {
        apply_delete(schema, std::move(rt.start), std::move(rt.tomb));
    } else {
        apply_row_tombstone(schema, std::move(rt));
    }
}
// Rvalue-key variant of apply_delete(): same dispatch on the shape of the
// prefix (empty / full / partial), moving the key into the callee where one
// is needed.
void
mutation_partition_v2::apply_delete(const schema& schema, clustering_key&& prefix, tombstone t) {
    check_schema(schema);
    if (prefix.is_empty(schema)) {
        apply(t);
        return;
    }
    if (prefix.is_full(schema)) {
        clustered_row(schema, std::move(prefix)).apply(t);
        return;
    }
    apply_row_tombstone(schema, std::move(prefix), t);
}
// View-key variant of apply_delete(): same dispatch on the shape of the
// prefix (empty / full / partial).
void
mutation_partition_v2::apply_delete(const schema& schema, clustering_key_prefix_view prefix, tombstone t) {
    check_schema(schema);
    if (prefix.is_empty(schema)) {
        apply(t);
        return;
    }
    if (prefix.is_full(schema)) {
        clustered_row(schema, prefix).apply(t);
        return;
    }
    apply_row_tombstone(schema, prefix, t);
}
void
mutation_partition_v2::apply_insert(const schema& s, clustering_key_view key, api::timestamp_type created_at) {
clustered_row(s, key).apply(row_marker(created_at));
}
void mutation_partition_v2::apply_insert(const schema& s, clustering_key_view key, api::timestamp_type created_at,
gc_clock::duration ttl, gc_clock::time_point expiry) {
clustered_row(s, key).apply(row_marker(created_at, ttl, expiry));
}
void mutation_partition_v2::insert_row(const schema& s, const clustering_key& key, deletable_row&& row) {
auto e = alloc_strategy_unique_ptr<rows_entry>(
current_allocator().construct<rows_entry>(key, std::move(row)));
_rows.insert_before_hint(_rows.end(), std::move(e), rows_entry::tri_compare(s));
}
void mutation_partition_v2::insert_row(const schema& s, const clustering_key& key, const deletable_row& row) {
check_schema(s);
auto e = alloc_strategy_unique_ptr<rows_entry>(
current_allocator().construct<rows_entry>(s, key, row));
_rows.insert_before_hint(_rows.end(), std::move(e), rows_entry::tri_compare(s));
}
// Looks up the row stored under key.
// Returns a pointer to its cells, or nullptr when there is no such entry.
const row*
mutation_partition_v2::find_row(const schema& s, const clustering_key& key) const {
    check_schema(s);
    const auto found = _rows.find(key, rows_entry::tri_compare(s));
    return (found == _rows.end()) ? nullptr : &found->row().cells();
}
// Returns the row stored under key, creating an empty entry (and moving the
// key into it) when none exists yet.
deletable_row&
mutation_partition_v2::clustered_row(const schema& s, clustering_key&& key) {
    check_schema(s);
    auto pos = _rows.find(key, rows_entry::tri_compare(s));
    if (pos != _rows.end()) {
        return pos->row();
    }
    auto fresh_entry = alloc_strategy_unique_ptr<rows_entry>(
        current_allocator().construct<rows_entry>(std::move(key)));
    pos = _rows.insert_before_hint(pos, std::move(fresh_entry), rows_entry::tri_compare(s)).first;
    return pos->row();
}
// Returns the row stored under key, creating an empty entry holding a copy
// of the key when none exists yet.
deletable_row&
mutation_partition_v2::clustered_row(const schema& s, const clustering_key& key) {
    check_schema(s);
    auto pos = _rows.find(key, rows_entry::tri_compare(s));
    if (pos != _rows.end()) {
        return pos->row();
    }
    auto fresh_entry = alloc_strategy_unique_ptr<rows_entry>(
        current_allocator().construct<rows_entry>(key));
    pos = _rows.insert_before_hint(pos, std::move(fresh_entry), rows_entry::tri_compare(s)).first;
    return pos->row();
}
// Returns the row stored under the key referred to by the view, creating an
// empty entry for it when none exists yet.
deletable_row&
mutation_partition_v2::clustered_row(const schema& s, clustering_key_view key) {
    check_schema(s);
    auto pos = _rows.find(key, rows_entry::tri_compare(s));
    if (pos != _rows.end()) {
        return pos->row();
    }
    auto fresh_entry = alloc_strategy_unique_ptr<rows_entry>(
        current_allocator().construct<rows_entry>(key));
    pos = _rows.insert_before_hint(pos, std::move(fresh_entry), rows_entry::tri_compare(s)).first;
    return pos->row();
}
// Returns the entry at position pos. When no entry exists there, one is
// created with the given dummy and continuity flags; an existing entry is
// returned unchanged.
rows_entry&
mutation_partition_v2::clustered_rows_entry(const schema& s, position_in_partition_view pos, is_dummy dummy, is_continuous continuous) {
    check_schema(s);
    auto where = _rows.find(pos, rows_entry::tri_compare(s));
    if (where != _rows.end()) {
        return *where;
    }
    auto fresh_entry = alloc_strategy_unique_ptr<rows_entry>(
        current_allocator().construct<rows_entry>(s, pos, dummy, continuous));
    where = _rows.insert_before_hint(where, std::move(fresh_entry), rows_entry::tri_compare(s)).first;
    return *where;
}
// Same as clustered_rows_entry(), but returns the row held by the entry
// rather than the entry itself.
deletable_row&
mutation_partition_v2::clustered_row(const schema& s, position_in_partition_view pos, is_dummy dummy, is_continuous continuous) {
    rows_entry& entry = clustered_rows_entry(s, pos, dummy, continuous);
    return entry.row();
}
// Returns the entry at position pos, inserting one when absent.
// A newly inserted entry copies continuity and range tombstone from its
// successor if there is one; with no successor it is created non-continuous.
rows_entry&
mutation_partition_v2::clustered_row(const schema& s, position_in_partition_view pos, is_dummy dummy) {
    check_schema(s);
    auto cmp = rows_entry::tri_compare(s);
    auto successor = _rows.lower_bound(pos, cmp);
    const bool at_end = successor == _rows.end();
    if (!at_end && cmp(successor->position(), pos) == 0) {
        // An entry already sits exactly at pos.
        return *successor;
    }
    auto fresh_entry = alloc_strategy_unique_ptr<rows_entry>(
        current_allocator().construct<rows_entry>(s, pos, dummy, is_continuous::no));
    if (!at_end) {
        fresh_entry->set_continuous(successor->continuous());
        fresh_entry->set_range_tombstone(successor->range_tombstone());
    }
    return *_rows.insert_before_hint(successor, std::move(fresh_entry), rows_entry::tri_compare(s)).first;
}
// Appends a new entry at pos, which must order strictly after the current
// last entry. Throws std::runtime_error otherwise.
// Returns the row of the newly appended entry.
deletable_row&
mutation_partition_v2::append_clustered_row(const schema& s, position_in_partition_view pos, is_dummy dummy, is_continuous continuous) {
    check_schema(s);
    const auto cmp = rows_entry::tri_compare(s);
    auto tail = _rows.end();
    if (!_rows.empty()) {
        auto last = std::prev(tail);
        if (cmp(*last, pos) >= 0) {
            throw std::runtime_error(format("mutation_partition_v2::append_clustered_row(): cannot append clustering row with key {} to the partition"
                    ", last clustering row is equal or greater: {}", pos, last->key()));
        }
    }
    auto fresh_entry = alloc_strategy_unique_ptr<rows_entry>(
        current_allocator().construct<rows_entry>(s, pos, dummy, continuous));
    auto inserted = _rows.insert_before_hint(tail, std::move(fresh_entry), cmp).first;
    return inserted->row();
}
// Returns an iterator to the first entry at or after the start of range r.
// A range with no start bound begins at the first entry.
mutation_partition_v2::rows_type::const_iterator
mutation_partition_v2::lower_bound(const schema& schema, const query::clustering_range& r) const {
    check_schema(schema);
    if (r.start()) {
        return _rows.lower_bound(position_in_partition_view::for_range_start(r), rows_entry::tri_compare(schema));
    }
    return std::cbegin(_rows);
}
// Returns an iterator to the first entry at or after the end position of
// range r. A range with no end bound extends to the end of the row set.
mutation_partition_v2::rows_type::const_iterator
mutation_partition_v2::upper_bound(const schema& schema, const query::clustering_range& r) const {
    check_schema(schema);
    if (r.end()) {
        return _rows.lower_bound(position_in_partition_view::for_range_end(r), rows_entry::tri_compare(schema));
    }
    return std::cend(_rows);
}
// Returns the entries delimited by lower_bound() and upper_bound() of r as
// a read-only iterator range.
boost::iterator_range<mutation_partition_v2::rows_type::const_iterator>
mutation_partition_v2::range(const schema& schema, const query::clustering_range& r) const {
    check_schema(schema);
    auto first = lower_bound(schema, r);
    auto last = upper_bound(schema, r);
    return boost::make_iterator_range(first, last);
}
// Mutable counterpart of range(): computes the range through the const
// overload and converts the result with unconst().
boost::iterator_range<mutation_partition_v2::rows_type::iterator>
mutation_partition_v2::range(const schema& schema, const query::clustering_range& r) {
    const mutation_partition_v2& self = *this;
    return unconst(_rows, self.range(schema, r));
}
// Mutable counterpart of lower_bound(): delegates to the const overload and
// converts the resulting iterator with unconst().
mutation_partition_v2::rows_type::iterator
mutation_partition_v2::lower_bound(const schema& schema, const query::clustering_range& r) {
    const mutation_partition_v2& self = *this;
    return unconst(_rows, self.lower_bound(schema, r));
}
// Mutable counterpart of upper_bound(): delegates to the const overload and
// converts the resulting iterator with unconst().
mutation_partition_v2::rows_type::iterator
mutation_partition_v2::upper_bound(const schema& schema, const query::clustering_range& r) {
    const mutation_partition_v2& self = *this;
    return unconst(_rows, self.upper_bound(schema, r));
}
// Invokes func on every entry within row_range, in forward order or, when
// reversed is true, in reverse order. Iteration stops early as soon as func
// returns stop_iteration::yes.
template<typename Func>
void mutation_partition_v2::for_each_row(const schema& schema, const query::clustering_range& row_range, bool reversed, Func&& func) const
{
    check_schema(schema);
    auto entries = range(schema, row_range);
    // Walks a sequence of entries until func asks to stop.
    auto visit_all = [&func] (auto&& sequence) {
        for (const auto& entry : sequence) {
            if (func(entry) == stop_iteration::yes) {
                break;
            }
        }
    };
    if (reversed) {
        visit_all(entries | boost::adaptors::reversed);
    } else {
        visit_all(entries);
    }
}
// Transforms given range of printable into a range of strings where each element
// in the original range is prefixed with given string.
// The transformation lambda captures `prefix` by reference, so the returned
// range must be consumed while `prefix` (and `r`) are still alive.
template<typename RangeOfPrintable>
static auto prefixed(const sstring& prefix, const RangeOfPrintable& r) {
return r | boost::adaptors::transformed([&] (auto&& e) { return format("{}{}", prefix, e); });
}
// Debug printer: writes a human-readable, brace-structured dump of the
// partition wrapped by p to os and returns os.
//
// Sections are emitted in this order:
//  - the partition tombstone, only if set;
//  - the static row with one line per cell, only if non-empty;
//  - "rows": one block per clustered entry with its continuity and dummy
//    flags, then (when present) marker, row tombstone and range tombstone,
//    then the position (when it has a clustering key prefix) and the cells.
// All output, including the fmt::print() calls, goes to os.
std::ostream&
operator<<(std::ostream& os, const mutation_partition_v2::printer& p) {
const auto indent = " ";
auto& mp = p._mutation_partition;
os << "mutation_partition_v2: {\n";
if (mp._tombstone) {
fmt::print(os, "{:2}tombstone: {},\n", "", mp._tombstone);
}
if (!mp.static_row().empty()) {
os << indent << "static_row: {\n";
const auto& srow = mp.static_row().get();
srow.for_each_cell([&] (column_id& c_id, const atomic_cell_or_collection& cell) {
// Column definitions are looked up in the schema carried by the printer.
auto& column_def = p._schema.column_at(column_kind::static_column, c_id);
os << indent << indent << "'" << column_def.name_as_text()
<< "': " << atomic_cell_or_collection::printer(column_def, cell) << ",\n";
});
os << indent << "},\n";
}
os << indent << "rows: [\n";
for (const auto& re : mp.clustered_rows()) {
os << indent << indent << "{\n";
const auto& row = re.row();
os << indent << indent << indent << "cont: " << re.continuous() << ",\n";
os << indent << indent << indent << "dummy: " << re.dummy() << ",\n";
if (!row.marker().is_missing()) {
os << indent << indent << indent << "marker: " << row.marker() << ",\n";
}
if (row.deleted_at()) {
os << indent << indent << indent << "tombstone: " << row.deleted_at() << ",\n";
}
if (re.range_tombstone()) {
fmt::print(os, "{:6}rt: {},\n", "", re.range_tombstone());
}
position_in_partition pip(re.position());
if (pip.get_clustering_key_prefix()) {
os << indent << indent << indent << "position: {\n";
auto ck = *pip.get_clustering_key_prefix();
// The type and column iterators advance in step with the key components below.
auto type_iterator = ck.get_compound_type(p._schema)->types().begin();
auto column_iterator = p._schema.clustering_key_columns().begin();
os << indent << indent << indent << indent << "bound_weight: " << int32_t(pip.get_bound_weight()) << ",\n";
for (auto&& e : ck.components(p._schema)) {
os << indent << indent << indent << indent << "'" << column_iterator->name_as_text()
<< "': " << (*type_iterator)->to_string(to_bytes(e)) << ",\n";
++type_iterator;
++column_iterator;
}
os << indent << indent << indent << "},\n";
}
row.cells().for_each_cell([&] (column_id& c_id, const atomic_cell_or_collection& cell) {
auto& column_def = p._schema.column_at(column_kind::regular_column, c_id);
os << indent << indent << indent << "'" << column_def.name_as_text()
<< "': " << atomic_cell_or_collection::printer(column_def, cell) << ",\n";
});
os << indent << indent << "},\n";
}
os << indent << "]\n}";
return os;
}
// Compares this partition with p when both are described by the same schema s.
// Delegates to the two-schema overload, passing s for both sides.
bool mutation_partition_v2::equal(const schema& s, const mutation_partition_v2& p) const {
return equal(s, p, s);
}
bool mutation_partition_v2::equal(const schema& this_schema, const mutation_partition_v2& p, const schema& p_schema) const {
#ifdef SEASTAR_DEBUG
assert(_schema_version == this_schema.version());
assert(p._schema_version == p_schema.version());
#endif
if (_tombstone != p._tombstone) {
return false;
}
if (!boost::equal(non_dummy_rows(), p.non_dummy_rows(),
[&] (const rows_entry& e1, const rows_entry& e2) {
return e1.equal(this_schema, e2, p_schema);
}
)) {
return false;
}
return _static_row.equal(column_kind::static_column, this_schema, p._static_row, p_schema);
}
// Returns true when this partition and p have the same static row continuity flag
// and their clustering continuity sets (as produced by get_continuity()) are equal.
bool mutation_partition_v2::equal_continuity(const schema& s, const mutation_partition_v2& p) const {
    if (_static_row_continuous != p._static_row_continuous) {
        // Continuity sets are not computed when the static row flags already differ.
        return false;
    }
    return get_continuity(s).equals(s, p.get_continuity(s));
}
// Sums the memory reported by the static row, by the clustered rows container itself,
// and by every entry stored in that container.
size_t mutation_partition_v2::external_memory_usage(const schema& s) const {
    check_schema(s);

    // Static row plus the container's own footprint.
    size_t total = static_row().external_memory_usage(s, column_kind::static_column)
                 + clustered_rows().external_memory_usage();

    // Per-entry usage, dummies included.
    for (const auto& entry : clustered_rows()) {
        total += entry.memory_usage(s);
    }
    return total;
}
// Returns true if the mutation_partition_v2 represents no writes:
// the partition tombstone has no timestamp, the static row has no cells
// and there are no row entries at all.
bool mutation_partition_v2::empty() const
{
    const bool no_partition_tombstone = _tombstone.timestamp == api::missing_timestamp;
    return no_partition_tombstone && !_static_row.size() && _rows.empty();
}
// Reports whether the static row has any live data at query_time.
// The check is delegated to has_any_live_data(), which is given the static row's
// cells and the partition tombstone.
bool
mutation_partition_v2::is_static_row_live(const schema& s, gc_clock::time_point query_time) const {
    check_schema(s);
    const auto& static_cells = static_row().get();
    return has_any_live_data(s, column_kind::static_column, static_cells, _tombstone, query_time);
}
// Returns the number of entries in the rows container, as computed by
// its calculate_size() member.
uint64_t
mutation_partition_v2::row_count() const {
    const uint64_t entries = _rows.calculate_size();
    return entries;
}
// Feeds the contents of this partition to the visitor v, in this order:
//   1. the partition tombstone,
//   2. every static cell,
//   3. for each row entry in container order: its range tombstone (if set),
//      the row itself, then every regular cell of the row.
//
// A range tombstone attached to an entry is reported as a range ending right after
// the entry's position. For a continuous entry the range starts right after the
// previously visited entry (or before all clustered rows when there is none);
// for a non-continuous entry it starts right before the entry's own position.
void mutation_partition_v2::accept(const schema& s, mutation_partition_visitor& v) const {
    check_schema(s);
    v.accept_partition_tombstone(_tombstone);

    _static_row.for_each_cell([&] (column_id id, const atomic_cell_or_collection& cell) {
        const column_definition& def = s.static_column_at(id);
        if (!def.is_atomic()) {
            v.accept_static_cell(id, cell.as_collection_mutation());
            return;
        }
        v.accept_static_cell(id, cell.as_atomic_cell(def));
    });

    // Position of the entry visited in the previous iteration, if any.
    std::optional<position_in_partition> last_seen;
    for (const rows_entry& entry : _rows) {
        const deletable_row& content = entry.row();

        if (entry.range_tombstone()) {
            if (entry.continuous()) {
                v.accept_row_tombstone(range_tombstone(last_seen ? position_in_partition::after_key(s, *last_seen)
                                                                 : position_in_partition::before_all_clustered_rows(),
                                                       position_in_partition::after_key(s, entry.position()),
                                                       entry.range_tombstone()));
            } else {
                v.accept_row_tombstone(range_tombstone(position_in_partition::before_key(entry.position()),
                                                       position_in_partition::after_key(s, entry.position()),
                                                       entry.range_tombstone()));
            }
        }

        v.accept_row(entry.position(), content.deleted_at(), content.marker(), entry.dummy(), entry.continuous());

        content.cells().for_each_cell([&] (column_id id, const atomic_cell_or_collection& cell) {
            const column_definition& def = s.regular_column_at(id);
            if (!def.is_atomic()) {
                v.accept_row_cell(id, cell.as_collection_mutation());
                return;
            }
            v.accept_row_cell(id, cell.as_atomic_cell(def));
        });

        last_seen = entry.position();
    }
}
void
mutation_partition_v2::upgrade(const schema& old_schema, const schema& new_schema) {
// We need to copy to provide strong exception guarantees.
mutation_partition tmp(new_schema.shared_from_this());
tmp.set_static_row_continuous(_static_row_continuous);
converting_mutation_partition_applier v(old_schema.get_column_mapping(), new_schema, tmp);
accept(old_schema, v);
*this = mutation_partition_v2(new_schema, std::move(tmp));
}
// Builds a mutation_partition from this partition by replaying its contents
// into a partition_builder. The static row continuity flag is copied over first.
mutation_partition mutation_partition_v2::as_mutation_partition(const schema& s) const {
    mutation_partition result(s.shared_from_this());
    result.set_static_row_continuous(_static_row_continuous);

    partition_builder builder(s, result);
    accept(s, builder);

    return result;
}
// Constructs a partition carrying tombstone t whose only row entry is a
// non-continuous last dummy. The static row is marked continuous only when
// the schema has no static columns.
mutation_partition_v2::mutation_partition_v2(mutation_partition_v2::incomplete_tag, const schema& s, tombstone t)
    : _tombstone(t)
    , _static_row_continuous(!s.has_static_columns())
    , _rows()
#ifdef SEASTAR_DEBUG
    , _schema_version(s.version())
#endif
{
    // Allocate the trailing dummy with the current allocator and hand ownership
    // to the container through the allocator-aware unique pointer.
    rows_entry* raw_dummy = current_allocator().construct<rows_entry>(s, rows_entry::last_dummy_tag(), is_continuous::no);
    alloc_strategy_unique_ptr<rows_entry> last_dummy(raw_dummy);
    _rows.insert_before(_rows.end(), std::move(last_dummy));
}
bool mutation_partition_v2::is_fully_continuous() const {
if (!_static_row_continuous) {
return false;
}
for (auto&& row : _rows) {
if (!row.continuous()) {
return false;
}
}
return true;
}
// Marks the static row and every row entry as continuous.
// No entries are added or removed.
void mutation_partition_v2::make_fully_continuous() {
    _static_row_continuous = true;
    for (auto&& entry : _rows) {
        entry.set_continuous(true);
    }
}
// Sets the continuity flag to `cont` on every entry positioned after pr.start()
// up to and including the entry at pr.end().
//
// Entries are guaranteed to exist exactly at pr.start() and pr.end(): when one is
// missing, a dummy entry is inserted at that position. An empty range
// (start >= end) is a no-op.
//
// The entry at pr.start() itself is not modified. Per get_continuity(), an entry's
// flag applies to the interval between the previous entry and this one, so the
// flag at pr.start() belongs to the interval preceding the range.
void mutation_partition_v2::set_continuity(const schema& s, const position_range& pr, is_continuous cont) {
auto cmp = rows_entry::tri_compare(s);
if (cmp(pr.start(), pr.end()) >= 0) {
return; // empty range
}
// Make sure there is an entry exactly at pr.end(). A newly inserted dummy copies
// the continuity flag of the entry it is inserted before, or is_continuous::yes
// when it is inserted at the end of the container.
auto end = _rows.lower_bound(pr.end(), cmp);
if (end == _rows.end() || cmp(pr.end(), end->position()) < 0) {
auto e = alloc_strategy_unique_ptr<rows_entry>(
current_allocator().construct<rows_entry>(s, pr.end(), is_dummy::yes,
end == _rows.end() ? is_continuous::yes : end->continuous()));
end = _rows.insert_before(end, std::move(e));
}
// Make sure there is an entry exactly at pr.start(). `i` can be dereferenced
// without an end check: an entry at pr.end() exists by now and pr.start() < pr.end(),
// so the lookup always finds an entry.
auto i = _rows.lower_bound(pr.start(), cmp);
if (cmp(pr.start(), i->position()) < 0) {
auto e = alloc_strategy_unique_ptr<rows_entry>(
current_allocator().construct<rows_entry>(s, pr.start(), is_dummy::yes, i->continuous()));
i = _rows.insert_before(i, std::move(e));
}
assert(i != end);
// Skip the entry at pr.start(); only entries after it get the new flag.
++i;
// Inclusive walk: the entry at pr.end() is updated too, then the loop stops.
while (1) {
i->set_continuous(cont);
if (i == end) {
break;
}
++i;
}
}
// Collects the clustering intervals whose continuity equals `cont`.
//
// For each entry, the interval between the end of the previous entry (or the
// start of the clustering range) and the position right before this entry is
// added when the entry's continuity flag equals `cont`. When `cont` is set,
// the positions of clustering rows themselves and the interval following the
// last entry are added as well.
clustering_interval_set mutation_partition_v2::get_continuity(const schema& s, is_continuous cont) const {
    check_schema(s);
    clustering_interval_set intervals;

    // Lower bound of the gap preceding the entry currently being examined.
    auto gap_start = position_in_partition::before_all_clustered_rows();
    for (const rows_entry& entry : _rows) {
        if (entry.continuous() == cont) {
            intervals.add(s, position_range(std::move(gap_start), position_in_partition::before_key(entry.position())));
        }
        if (entry.position().is_clustering_row() && cont) {
            intervals.add(s, position_range(position_in_partition::before_key(entry.position()),
                position_in_partition::after_key(s, entry.position())));
        }
        // Re-seed the (possibly moved-from) bound for the next iteration.
        gap_start = position_in_partition::after_key(s, entry.position());
    }

    if (cont) {
        intervals.add(s, position_range(std::move(gap_start), position_in_partition::after_all_clustered_rows()));
    }
    return intervals;
}
// Removes and disposes row entries one at a time, unregistering each from
// `tracker` first when a tracker is given.
//
// Returns stop_iteration::no when preemption is needed and entries remain,
// so the caller can call again; returns stop_iteration::yes once the
// container has been emptied.
stop_iteration mutation_partition_v2::clear_gently(cache_tracker* tracker) noexcept {
    auto dispose = current_deleter<rows_entry>();
    auto cur = _rows.begin();
    auto stop = _rows.end();
    for (; cur != stop; ) {
        if (tracker) {
            tracker->remove(*cur);
        }
        cur = _rows.erase_and_dispose(cur, dispose);

        // Comparing against the end iterator avoids deferring destruction of a
        // mutation_partition_v2 that has just become empty. Deferring it would make
        // eviction leave garbage versions behind unnecessarily.
        if (need_preempt() && cur != stop) {
            return stop_iteration::no;
        }
    }
    return stop_iteration::yes;
}
bool
mutation_partition_v2::check_continuity(const schema& s, const position_range& r, is_continuous cont) const {
check_schema(s);
auto cmp = rows_entry::tri_compare(s);
auto i = _rows.lower_bound(r.start(), cmp);
auto end = _rows.lower_bound(r.end(), cmp);
if (cmp(r.start(), r.end()) >= 0) {
return bool(cont);
}
if (i != end) {
if (no_clustering_row_between(s, r.start(), i->position())) {
++i;
}
while (i != end) {
if (i->continuous() != cont) {
return false;
}
++i;
}
if (end != _rows.begin() && no_clustering_row_between(s, std::prev(end)->position(), r.end())) {
return true;
}
}
return (end == _rows.end() ? is_continuous::yes : end->continuous()) == cont;
}
// Returns true when check_continuity() reports the range r as continuous.
bool
mutation_partition_v2::fully_continuous(const schema& s, const position_range& r) {
    const auto expected = is_continuous::yes;
    return check_continuity(s, r, expected);
}
// Returns true when check_continuity() reports the range r as not continuous.
bool
mutation_partition_v2::fully_discontinuous(const schema& s, const position_range& r) {
    const auto expected = is_continuous::no;
    return check_continuity(s, r, expected);
}
// Erases the entry at `i` when it is redundant, and returns an iterator to the
// entry that followed it (the same iterator is returned when nothing is erased).
//
// An entry is kept when its row is non-empty or it is the last dummy. Otherwise it
// is dropped only if continuity is the same on both sides of it and, for continuous
// intervals, the range tombstones on both sides match.
//
// Parameters:
//   s         - schema; not referenced in the body.
//   tracker   - optional cache tracker; when non-null the erased entry is removed from it.
//   i         - iterator to the candidate entry; must be dereferenceable.
//   app_stats - rows_dropped_by_tombstones is incremented when an entry is erased.
mutation_partition_v2::rows_type::iterator
mutation_partition_v2::maybe_drop(const schema& s,
cache_tracker* tracker,
mutation_partition_v2::rows_type::iterator i,
mutation_application_stats& app_stats)
{
rows_entry& e = *i;
// Captured before any erase so it can be returned on every path.
auto next_i = std::next(i);
if (!e.row().empty() || e.is_last_dummy()) {
return next_i;
}
// Pass only if continuity is the same on both sides and
// range tombstones for the intervals are the same on both sides (if intervals are continuous).
// The region past the last entry counts as continuous with an empty range tombstone.
bool next_continuous = next_i == _rows.end() || next_i->continuous();
if (e.continuous() && next_continuous) {
tombstone next_range_tombstone = (next_i == _rows.end() ? tombstone{} : next_i->range_tombstone());
if (e.range_tombstone() != next_range_tombstone) {
return next_i;
}
} else if (!e.continuous() && !next_continuous) {
// Both sides discontinuous: a non-dummy entry that has a range tombstone is kept.
if (!e.dummy() && e.range_tombstone()) {
return next_i;
}
} else {
// Continuity differs across the entry, so it marks a boundary and is kept.
return next_i;
}
++app_stats.rows_dropped_by_tombstones; // FIXME: it's more general than that now
auto del = current_deleter<rows_entry>();
// NOTE(review): `e` is still used after erase(), so erase() presumably only unlinks
// the entry without destroying it; destruction happens in del(&e) below — confirm.
i = _rows.erase(i);
if (tracker) {
tracker->remove(e);
}
del(&e);
return next_i;
}
void mutation_partition_v2::compact(const schema& s, cache_tracker* tracker) {
mutation_application_stats stats;
auto i = _rows.begin();
rows_type::iterator prev_i;
while (i != _rows.end()) {
i->compact(s, _tombstone);
if (prev_i) {
// We cannot call maybe_drop() on i because the entry may become redundant
// only after the next entry is compacted, e.g. when next entry's range tombstone is dropped.
maybe_drop(s, tracker, prev_i, stats);
}
prev_i = i++;
}
if (prev_i) {
maybe_drop(s, tracker, prev_i, stats);
}
}
bool has_redundant_dummies(const mutation_partition_v2& p) {
bool last_dummy = false;
bool last_cont = false;
tombstone last_rt;
auto i = p.clustered_rows().begin();
while (i != p.clustered_rows().end()) {
const rows_entry& e = *i;
if (last_dummy) {
bool redundant = last_cont == bool(e.continuous()) && last_rt == e.range_tombstone();
if (redundant) {
return true;
}
}
last_dummy = bool(e.dummy());
last_rt = e.range_tombstone();
last_cont = bool(e.continuous());
++i;
}
return false;
}