mirror of
https://github.com/scylladb/scylladb.git
synced 2026-04-22 17:40:34 +00:00
When compacting a mutation fragment stream (e.g. for sstable compaction, data query, repair), the compactor needs to accumulate range tombstones which are relevant for the yet-to-be-processed range. See range_tombstone_accumulator. One problem is that it has an unbounded memory footprint, because the accumulator needs to keep track of all the tombstoned ranges which are still active. Another, although more benign, problem is the computational complexity needed to maintain that data structure. The fix is to get rid of the overlap of range tombstones in the mutation fragment stream. In v2 of the stream, there is no longer a range_tombstone fragment. Deletions of ranges of rows within a given partition are represented with range_tombstone_change fragments. At any point in the stream there is a single active clustered tombstone. It is initially equal to the neutral tombstone when the stream of each partition starts. The range_tombstone_change fragment type signifies changes of the active clustered tombstone. All fragments emitted while a given clustered tombstone is active are affected by that tombstone. Like with the old range_tombstone fragments, the clustered tombstone is independent from the partition tombstone carried in partition_start. The v2 stream is strict about range tombstone trimming. It emits range tombstone changes which reflect range tombstones trimmed to query restrictions and fast-forwarding ranges. This makes the stream more canonical, meaning that for a given set of writes, querying the database should produce the same stream of fragments for given restrictions. There is less ambiguity in how the writes are represented in the fragment stream. This wasn't the case with v1. For example, a given set of deletions could be produced either as one range_tombstone, or as many, split and/or deoverlapped with other fragments. A canonical stream makes diff calculation easier.
The classes related to mutation fragment streams were cloned: flat_mutation_reader_v2, mutation_fragment_v2, and related concepts. Refs #8625.
112 lines
3.8 KiB
C++
112 lines
3.8 KiB
C++
/*
|
|
* Copyright (C) 2016-present ScyllaDB
|
|
*/
|
|
|
|
/*
|
|
* This file is part of Scylla.
|
|
*
|
|
* Scylla is free software: you can redistribute it and/or modify
|
|
* it under the terms of the GNU Affero General Public License as published by
|
|
* the Free Software Foundation, either version 3 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* Scylla is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
#include "range_tombstone.hh"
|
|
#include "mutation_fragment.hh"
|
|
#include "mutation_fragment_v2.hh"
|
|
|
|
#include <boost/range/algorithm/upper_bound.hpp>
|
|
|
|
std::ostream& operator<<(std::ostream& out, const range_tombstone& rt) {
|
|
if (rt) {
|
|
return out << "{range_tombstone: start=" << rt.start_bound() << ", end=" << rt.end_bound() << ", " << rt.tomb << "}";
|
|
} else {
|
|
return out << "{range_tombstone: none}";
|
|
}
|
|
}
|
|
|
|
std::ostream& operator<<(std::ostream& out, const range_tombstone_change& rt) {
|
|
return out << "{range_tombstone_change: pos=" << rt.position() << ", " << rt.tombstone() << "}";
|
|
}
|
|
|
|
std::optional<range_tombstone> range_tombstone::apply(const schema& s, range_tombstone&& src)
|
|
{
|
|
bound_view::compare cmp(s);
|
|
if (tomb == src.tomb) {
|
|
if (cmp(end_bound(), src.end_bound())) {
|
|
end = std::move(src.end);
|
|
end_kind = src.end_kind;
|
|
}
|
|
return { };
|
|
}
|
|
if (tomb < src.tomb) {
|
|
std::swap(*this, src);
|
|
}
|
|
if (cmp(end_bound(), src.end_bound())) {
|
|
return range_tombstone(end, invert_kind(end_kind), std::move(src.end), src.end_kind, src.tomb);
|
|
}
|
|
return { };
|
|
}
|
|
|
|
// Returns the position in the partition built from this tombstone's start bound.
position_in_partition_view range_tombstone::position() const {
    using tag_type = position_in_partition_view::range_tombstone_tag_t;
    return position_in_partition_view(tag_type(), start_bound());
}
|
|
|
|
// Returns the position in the partition built from this tombstone's end bound.
position_in_partition_view range_tombstone::end_position() const {
    using tag_type = position_in_partition_view::range_tombstone_tag_t;
    return position_in_partition_view(tag_type(), end_bound());
}
|
|
|
|
void range_tombstone_accumulator::update_current_tombstone() {
|
|
_current_tombstone = boost::accumulate(_range_tombstones, _partition_tombstone, [] (tombstone t, const range_tombstone& rt) {
|
|
t.apply(rt.tomb);
|
|
return t;
|
|
});
|
|
}
|
|
|
|
void range_tombstone_accumulator::drop_unneeded_tombstones(const clustering_key_prefix& ck, int w) {
|
|
auto cmp = [&] (const range_tombstone& rt, const clustering_key_prefix& ck, int w) {
|
|
if (_reversed) {
|
|
auto bv = rt.start_bound();
|
|
return _cmp(ck, w, bv.prefix(), weight(bv.kind()));
|
|
}
|
|
auto bv = rt.end_bound();
|
|
return _cmp(bv.prefix(), weight(bv.kind()), ck, w);
|
|
};
|
|
bool dropped = false;
|
|
while (!_range_tombstones.empty() && cmp(*_range_tombstones.begin(), ck, w)) {
|
|
dropped = true;
|
|
_range_tombstones.pop_front();
|
|
}
|
|
if (dropped) {
|
|
update_current_tombstone();
|
|
}
|
|
}
|
|
|
|
// Adds a range tombstone to the accumulator.
//
// First drops the accumulated tombstones that are no longer needed at the
// boundary where `rt` begins in iteration order (its end when reversed, its
// start otherwise), then folds rt's tombstone into the current tombstone and
// inserts `rt` at the position found by upper_bound, i.e. after any
// equivalent elements. The ordering key is the end bound in forward mode and
// the start bound (compared the other way round) in reversed mode.
void range_tombstone_accumulator::apply(range_tombstone rt) {
    const auto& boundary_key = _reversed ? rt.end : rt.start;
    const auto boundary_kind = _reversed ? rt.end_kind : rt.start_kind;
    drop_unneeded_tombstones(boundary_key, weight(boundary_kind));

    _current_tombstone.apply(rt.tomb);

    auto ordered_before = [&] (const range_tombstone& lhs, const range_tombstone& rhs) {
        if (_reversed) {
            return _cmp(rhs.start_bound(), lhs.start_bound());
        }
        return _cmp(lhs.end_bound(), rhs.end_bound());
    };

    // Locate the slot before moving from rt, which the search reads.
    auto insert_at = boost::upper_bound(_range_tombstones, rt, ordered_before);
    _range_tombstones.insert(insert_at, std::move(rt));
}
|
|
|
|
// Resets the accumulator: the current and partition tombstones are set back
// to their default-constructed values and all accumulated range tombstones
// are discarded.
void range_tombstone_accumulator::clear() {
    _current_tombstone = { };
    _partition_tombstone = { };
    _range_tombstones.clear();
}
|