diff --git a/cache_flat_mutation_reader.hh b/cache_flat_mutation_reader.hh index 945281b019..0b69af520c 100644 --- a/cache_flat_mutation_reader.hh +++ b/cache_flat_mutation_reader.hh @@ -362,13 +362,10 @@ future<> cache_flat_mutation_reader::read_from_underlying(db::timeout_clock::tim } if (_next_row_in_range) { maybe_update_continuity(); - add_to_buffer(_next_row); - try { - move_to_next_entry(); - } catch (const std::bad_alloc&) { - // We cannot reenter the section, since we may have moved to the new range, and - // because add_to_buffer() should not be repeated. - _snp->region().allocator().invalidate_references(); // Invalidates _next_row + if (!_next_row.dummy()) { + _lower_bound = position_in_partition::before_key(_next_row.key()); + } else { + _lower_bound = _next_row.position(); } } else { if (no_clustering_row_between(*_schema, _upper_bound, _next_row.position())) { diff --git a/clustering_ranges_walker.hh b/clustering_ranges_walker.hh index 0051476fdf..d18fbc94be 100644 --- a/clustering_ranges_walker.hh +++ b/clustering_ranges_walker.hh @@ -26,6 +26,7 @@ #include "schema.hh" #include "query-request.hh" #include "mutation_fragment.hh" +#include "mutation_fragment_v2.hh" // Utility for in-order checking of overlap with position ranges. 
class clustering_ranges_walker { @@ -33,21 +34,26 @@ class clustering_ranges_walker { const query::clustering_row_ranges& _ranges; boost::iterator_range _current_range; bool _in_current; // next position is known to be >= _current_start + bool _past_current; // next position is known to be >= _current_end + bool _using_clustering_range; // Whether current range comes from _current_range bool _with_static_row; position_in_partition_view _current_start; position_in_partition_view _current_end; std::optional _trim; size_t _change_counter = 1; + tombstone _tombstone; private: bool advance_to_next_range() { _in_current = false; - if (!_current_start.is_static_row()) { + _past_current = false; + if (_using_clustering_range) { if (!_current_range) { return false; } _current_range.advance_begin(1); } ++_change_counter; + _using_clustering_range = true; if (!_current_range) { _current_end = _current_start = position_in_partition_view::after_all_clustered_rows(); return false; @@ -58,17 +64,20 @@ private: } void set_current_positions() { + _using_clustering_range = false; if (!_with_static_row) { if (!_current_range) { _current_start = position_in_partition_view::before_all_clustered_rows(); } else { _current_start = position_in_partition_view::for_range_start(_current_range.front()); _current_end = position_in_partition_view::for_range_end(_current_range.front()); + _using_clustering_range = true; } } else { // If the first range is contiguous with the static row, then advance _current_end as much as we can if (_current_range && !_current_range.front().start()) { _current_end = position_in_partition_view::for_range_end(_current_range.front()); + _using_clustering_range = true; } } } @@ -79,6 +88,7 @@ public: , _ranges(ranges) , _current_range(ranges) , _in_current(with_static_row) + , _past_current(false) , _with_static_row(with_static_row) , _current_start(position_in_partition_view::for_static_row()) , _current_end(position_in_partition_view::before_all_clustered_rows()) 
{ @@ -91,11 +101,33 @@ public: clustering_ranges_walker& operator=(const clustering_ranges_walker&) = delete; clustering_ranges_walker& operator=(clustering_ranges_walker&&) = delete; + using range_tombstones = utils::small_vector; + + // Result of advancing to a given position. + struct progress { + // True iff the position is contained in requested ranges. + bool contained; + + // Range tombstone changes to emit which reflect current range tombstone + // trimmed to requested ranges, up to the advanced-to position (inclusive). + // + // It is guaranteed that the sequence of tombstones returned from all + // advance_to() calls will be the same for a given ranges no matter at + // which positions you call advance_to(), provided that you change + // the current tombstone at the same positions. + // Redundant changes will not be generated. + // This is to support the guarantees of flat_mutation_reader_v2. + range_tombstones rts; + }; + // Excludes positions smaller than pos from the ranges. // pos should be monotonic. // No constraints between pos and positions passed to advance_to(). // // After the invocation, when !out_of_range(), lower_bound() returns the smallest position still contained. + // + // After this, advance_to(lower_bound()) will always emit a range tombstone change for pos + // if there is an active range tombstone and !out_of_range(). void trim_front(position_in_partition pos) { position_in_partition::less_compare less(_schema); @@ -117,22 +149,51 @@ public: // Must be called with monotonic positions. // Idempotent. bool advance_to(position_in_partition_view pos) { + return advance_to(pos, _tombstone).contained; + } + + // Returns result of advancing over clustering restrictions. + // Must be called with monotonic positions. + // + // The walker tracks current clustered tombstone. + // The new_tombstone will be the current clustered tombstone after advancing, starting from pos (inclusive). 
+ // The returned progress object contains range_tombstone_change fragments which reflect changes of + // the current clustered tombstone trimmed to the boundaries of requested ranges, up to the + // advanced-to position (inclusive). + progress advance_to(position_in_partition_view pos, tombstone new_tombstone) { position_in_partition::less_compare less(_schema); + range_tombstones rts; + + auto prev_tombstone = _tombstone; + _tombstone = new_tombstone; do { if (!_in_current && less(pos, _current_start)) { break; } + + if (!_in_current && prev_tombstone) { + rts.push_back(range_tombstone_change(_current_start, prev_tombstone)); + } + // All subsequent clustering keys are larger than the start of this // range so there is no need to check that again. _in_current = true; if (less(pos, _current_end)) { - return true; + if (prev_tombstone != new_tombstone) { + rts.push_back(range_tombstone_change(pos, new_tombstone)); + } + return progress{.contained = true, .rts = std::move(rts)}; + } else { + if (!_past_current && prev_tombstone) { + rts.push_back(range_tombstone_change(_current_end, {})); + } + _past_current = true; } } while (advance_to_next_range()); - return false; + return progress{.contained = false, .rts = std::move(rts)}; } // Returns true if the range expressed by start and end (as in position_range) overlaps @@ -140,6 +201,8 @@ public: // Must be called with monotonic start position. That position must also be greater than // the last position passed to the other advance_to() overload. // Idempotent. + // Breaks the tracking of current range tombstone, so don't use if you also use the advance_to() + // overload which tracks tombstones. bool advance_to(position_in_partition_view start, position_in_partition_view end) { position_in_partition::less_compare less(_schema); @@ -181,6 +244,48 @@ public: return false; } + // Intersects rt with query ranges. The first overlap is returned and the rest is applied to dst. 
+ // If returns a disengaged optional, there is no overlap and nothing was applied to dst. + // No monotonicity restrictions on argument values across calls. + // Does not affect lower_bound(). + std::optional split_tombstone(range_tombstone rt, range_tombstone_stream& dst) const { + position_in_partition::less_compare less(_schema); + + if (_trim && !rt.trim_front(_schema, *_trim)) { + return std::nullopt; + } + + std::optional first; + + for (const auto& rng : _current_range) { + auto range_start = position_in_partition_view::for_range_start(rng); + auto range_end = position_in_partition_view::for_range_end(rng); + if (!less(rt.position(), range_start) && !less(range_end, rt.end_position())) { + // Fully enclosed by this range. + assert(!first); + return std::move(rt); + } + auto this_range_rt = rt; + if (this_range_rt.trim(_schema, range_start, range_end)) { + if (first) { + dst.apply(std::move(this_range_rt)); + } else { + first = std::move(this_range_rt); + } + } + } + + return first; + } + + tombstone current_tombstone() const { + return _tombstone; + } + + void set_tombstone(tombstone t) { + _tombstone = t; + } + // Returns true if advanced past all contained positions. Any later advance_to() until reset() will return false. 
bool out_of_range() const { return !_in_current && !_current_range; @@ -191,6 +296,7 @@ public: void reset() { _current_range = _ranges; _in_current = _with_static_row; + _past_current = false; _current_start = position_in_partition_view::for_static_row(); _current_end = position_in_partition_view::before_all_clustered_rows(); set_current_positions(); diff --git a/flat_mutation_reader.cc b/flat_mutation_reader.cc index f0e9baa7e0..18abc6350c 100644 --- a/flat_mutation_reader.cc +++ b/flat_mutation_reader.cc @@ -20,6 +20,9 @@ */ #include "flat_mutation_reader.hh" +#include "flat_mutation_reader_v2.hh" +#include "range_tombstone_assembler.hh" +#include "range_tombstone_change_generator.hh" #include "mutation_fragment_stream_validator.hh" #include "mutation_reader.hh" #include "seastar/util/reference_wrapper.hh" @@ -31,6 +34,8 @@ #include #include #include "utils/exceptions.hh" +#include "mutation_rebuilder.hh" +#include "range_tombstone_splitter.hh" #include #include "clustering_key_filter.hh" @@ -930,31 +935,40 @@ flat_mutation_reader make_flat_mutation_reader_from_fragments(schema_ptr schema, reader_permit permit, std::deque fragments, const dht::partition_range& pr, const query::partition_slice& slice) { std::optional ranges_walker; - for (auto it = fragments.begin(); it != fragments.end();) { - switch (it->mutation_fragment_kind()) { + std::optional splitter; + std::deque filtered; + for (auto&& mf : fragments) { + switch (mf.mutation_fragment_kind()) { case mutation_fragment::kind::partition_start: - ranges_walker.emplace(*schema, slice.row_ranges(*schema, it->as_partition_start().key().key()), false); - case mutation_fragment::kind::static_row: // fall-through - case mutation_fragment::kind::partition_end: // fall-through - ++it; + ranges_walker.emplace(*schema, slice.row_ranges(*schema, mf.as_partition_start().key().key()), false); + splitter.emplace(*schema, permit, *ranges_walker); + filtered.emplace_back(std::move(mf)); + break; + case 
mutation_fragment::kind::static_row: + filtered.push_back(std::move(mf)); + break; + case mutation_fragment::kind::partition_end: + splitter->flush(position_in_partition::after_all_clustered_rows(), [&] (mutation_fragment mf) { + filtered.emplace_back(std::move(mf)); + }); + filtered.push_back(std::move(mf)); break; case mutation_fragment::kind::clustering_row: - if (ranges_walker->advance_to(it->position())) { - ++it; - } else { - it = fragments.erase(it); + splitter->flush(mf.position(), [&] (mutation_fragment mf) { + filtered.emplace_back(std::move(mf)); + }); + if (ranges_walker->advance_to(mf.position())) { + filtered.push_back(std::move(mf)); } break; case mutation_fragment::kind::range_tombstone: - if (ranges_walker->advance_to(it->as_range_tombstone().position(), it->as_range_tombstone().end_position())) { - ++it; - } else { - it = fragments.erase(it); - } + splitter->consume(std::move(mf).as_range_tombstone(), [&] (mutation_fragment mf) { + filtered.emplace_back(std::move(mf)); + }); break; } } - return make_flat_mutation_reader_from_fragments(std::move(schema), std::move(permit), std::move(fragments), pr); + return make_flat_mutation_reader_from_fragments(std::move(schema), std::move(permit), std::move(filtered), pr); } /* @@ -1195,3 +1209,270 @@ void mutation_fragment_stream_validating_filter::on_end_of_stream() { _validator.previous_mutation_fragment_kind())); } } + +flat_mutation_reader_v2& flat_mutation_reader_v2::operator=(flat_mutation_reader_v2&& o) noexcept { + if (_impl) { + impl* ip = _impl.get(); + // Abort to enforce calling close() before readers are closed + // to prevent leaks and potential use-after-free due to background + // tasks left behind. 
+ on_internal_error_noexcept(fmr_logger, format("{} [{}]: permit {}: was not closed before overwritten by move-assign", typeid(*ip).name(), fmt::ptr(ip), ip->_permit.description())); + abort(); + } + _impl = std::move(o._impl); + return *this; +} + +flat_mutation_reader_v2::~flat_mutation_reader_v2() { + if (_impl) { + impl* ip = _impl.get(); + // Abort to enforce calling close() before readers are closed + // to prevent leaks and potential use-after-free due to background + // tasks left behind. + on_internal_error_noexcept(fmr_logger, format("{} [{}]: permit {}: was not closed before destruction", typeid(*ip).name(), fmt::ptr(ip), ip->_permit.description())); + abort(); + } +} + +static size_t compute_buffer_size(const schema& s, const flat_mutation_reader_v2::tracked_buffer& buffer) +{ + return boost::accumulate( + buffer + | boost::adaptors::transformed([&s] (const mutation_fragment_v2& mf) { + return mf.memory_usage(); + }), size_t(0) + ); +} + +void flat_mutation_reader_v2::impl::forward_buffer_to(const position_in_partition& pos) { + clear_buffer(); + _buffer_size = compute_buffer_size(*_schema, _buffer); +} + +void flat_mutation_reader_v2::impl::clear_buffer_to_next_partition() { + auto next_partition_start = std::find_if(_buffer.begin(), _buffer.end(), [] (const mutation_fragment_v2& mf) { + return mf.is_partition_start(); + }); + _buffer.erase(_buffer.begin(), next_partition_start); + + _buffer_size = compute_buffer_size(*_schema, _buffer); +} + +template +future flat_mutation_reader_v2::impl::fill_buffer_from(Source& source, db::timeout_clock::time_point timeout) { + if (source.is_buffer_empty()) { + if (source.is_end_of_stream()) { + return make_ready_future(true); + } + return source.fill_buffer(timeout).then([this, &source, timeout] { + return fill_buffer_from(source, timeout); + }); + } else { + while (!source.is_buffer_empty() && !is_buffer_full()) { + push_mutation_fragment(source.pop_mutation_fragment()); + } + return 
make_ready_future(source.is_end_of_stream() && source.is_buffer_empty()); + } +} + +template future flat_mutation_reader_v2::impl::fill_buffer_from(flat_mutation_reader_v2&, db::timeout_clock::time_point); + +void flat_mutation_reader_v2::do_upgrade_schema(const schema_ptr& s) { + *this = transform(std::move(*this), schema_upgrader_v2(s)); +} + +future read_mutation_from_flat_mutation_reader(flat_mutation_reader_v2& r, db::timeout_clock::time_point timeout) { + return r.consume(mutation_rebuilder_v2(r.schema()), timeout); +} + +void flat_mutation_reader_v2::on_close_error(std::unique_ptr i, std::exception_ptr ep) noexcept { + impl* ip = i.get(); + on_internal_error_noexcept(fmr_logger, + format("Failed to close {} [{}]: permit {}: {}", typeid(*ip).name(), fmt::ptr(ip), ip->_permit.description(), ep)); +} + +flat_mutation_reader downgrade_to_v1(flat_mutation_reader_v2 r) { + class transforming_reader : public flat_mutation_reader::impl { + flat_mutation_reader_v2 _reader; + struct consumer { + transforming_reader* _owner; + stop_iteration operator()(mutation_fragment_v2&& mf) { + std::move(mf).consume(*_owner); + return stop_iteration(_owner->is_buffer_full()); + } + }; + range_tombstone_assembler _rt_assembler; + public: + void consume(static_row mf) { + push_mutation_fragment(*_schema, _permit, std::move(mf)); + } + void consume(clustering_row mf) { + if (_rt_assembler.needs_flush()) { + if (auto rt_opt = _rt_assembler.flush(*_schema, position_in_partition::after_key(mf.position()))) { + push_mutation_fragment(*_schema, _permit, std::move(*rt_opt)); + } + } + push_mutation_fragment(*_schema, _permit, std::move(mf)); + } + void consume(range_tombstone_change mf) { + if (auto rt_opt = _rt_assembler.consume(*_schema, std::move(mf))) { + push_mutation_fragment(*_schema, _permit, std::move(*rt_opt)); + } + } + void consume(partition_start mf) { + _rt_assembler.reset(); + push_mutation_fragment(*_schema, _permit, std::move(mf)); + } + void consume(partition_end mf) { + 
_rt_assembler.on_end_of_stream(); + push_mutation_fragment(*_schema, _permit, std::move(mf)); + } + transforming_reader(flat_mutation_reader_v2&& r) + : impl(r.schema(), r.permit()) + , _reader(std::move(r)) + {} + virtual future<> fill_buffer(db::timeout_clock::time_point timeout) override { + if (_end_of_stream) { + return make_ready_future<>(); + } + return _reader.consume_pausable(consumer{this}, timeout).then([this] { + if (_reader.is_end_of_stream() && _reader.is_buffer_empty()) { + _rt_assembler.on_end_of_stream(); + _end_of_stream = true; + } + }); + } + virtual future<> next_partition() override { + clear_buffer_to_next_partition(); + if (is_buffer_empty()) { + _end_of_stream = false; + return _reader.next_partition(); + } + return make_ready_future<>(); + } + virtual future<> fast_forward_to(const dht::partition_range& pr, db::timeout_clock::time_point timeout) override { + clear_buffer(); + _end_of_stream = false; + return _reader.fast_forward_to(pr, timeout); + } + virtual future<> fast_forward_to(position_range pr, db::timeout_clock::time_point timeout) override { + clear_buffer(); + _end_of_stream = false; + return _reader.fast_forward_to(std::move(pr), timeout); + } + virtual future<> close() noexcept override { + return _reader.close(); + } + }; + return make_flat_mutation_reader(std::move(r)); +} + +flat_mutation_reader_v2 upgrade_to_v2(flat_mutation_reader r) { + class transforming_reader : public flat_mutation_reader_v2::impl { + flat_mutation_reader _reader; + struct consumer { + transforming_reader* _owner; + stop_iteration operator()(mutation_fragment&& mf) { + std::move(mf).consume(*_owner); + return stop_iteration(_owner->is_buffer_full()); + } + }; + range_tombstone_change_generator _rt_gen; + tombstone _current_rt; + std::optional _pr; + public: + void flush_tombstones(position_in_partition_view pos) { + _rt_gen.flush(pos, [&] (range_tombstone_change rt) { + _current_rt = rt.tombstone(); + push_mutation_fragment(*_schema, _permit, 
std::move(rt)); + }); + } + void consume(static_row mf) { + push_mutation_fragment(*_schema, _permit, std::move(mf)); + } + void consume(clustering_row mf) { + flush_tombstones(mf.position()); + push_mutation_fragment(*_schema, _permit, std::move(mf)); + } + void consume(range_tombstone rt) { + if (_pr) { + if (!rt.trim_front(*_schema, _pr->start())) { + return; + } + } + flush_tombstones(rt.position()); + _rt_gen.consume(std::move(rt)); + } + void consume(partition_start mf) { + _rt_gen.reset(); + _current_rt = {}; + _pr = std::nullopt; + push_mutation_fragment(*_schema, _permit, std::move(mf)); + } + void consume(partition_end mf) { + flush_tombstones(position_in_partition::after_all_clustered_rows()); + if (_current_rt) { + assert(!_pr); + push_mutation_fragment(*_schema, _permit, range_tombstone_change( + position_in_partition::after_all_clustered_rows(), {})); + } + push_mutation_fragment(*_schema, _permit, std::move(mf)); + } + transforming_reader(flat_mutation_reader&& r) + : impl(r.schema(), r.permit()) + , _reader(std::move(r)) + , _rt_gen(*_schema) + {} + virtual future<> fill_buffer(db::timeout_clock::time_point timeout) override { + if (_end_of_stream) { + return make_ready_future<>(); + } + return _reader.consume_pausable(consumer{this}, timeout).then([this] { + if (_reader.is_end_of_stream() && _reader.is_buffer_empty()) { + if (_pr) { + // If !_pr we should flush on partition_end + flush_tombstones(_pr->end()); + if (_current_rt) { + push_mutation_fragment(*_schema, _permit, range_tombstone_change(_pr->end(), {})); + } + } + _end_of_stream = true; + } + }); + } + virtual future<> next_partition() override { + clear_buffer_to_next_partition(); + if (is_buffer_empty()) { + _end_of_stream = false; + return _reader.next_partition(); + } + return make_ready_future<>(); + } + virtual future<> fast_forward_to(const dht::partition_range& pr, db::timeout_clock::time_point timeout) override { + clear_buffer(); + _end_of_stream = false; + return 
_reader.fast_forward_to(pr, timeout); + } + virtual future<> fast_forward_to(position_range pr, db::timeout_clock::time_point timeout) override { + clear_buffer(); + // r is used to trim range tombstones and range_tombstone:s can be trimmed only to positions + // which are !is_clustering_row(). Replace with equivalent ranges. + // Long-term we should guarantee this on position_range. + if (pr.start().is_clustering_row()) { + pr.set_start(position_in_partition::before_key(pr.start().key())); + } + if (pr.end().is_clustering_row()) { + pr.set_end(position_in_partition::before_key(pr.end().key())); + } + _rt_gen.trim(pr.start()); + _current_rt = {}; + _pr = pr; + _end_of_stream = false; + return _reader.fast_forward_to(std::move(pr), timeout); + } + virtual future<> close() noexcept override { + return _reader.close(); + } + }; + return make_flat_mutation_reader_v2(std::move(r)); +} diff --git a/flat_mutation_reader.hh b/flat_mutation_reader.hh index 718ddc12a2..5b6c5bc10b 100644 --- a/flat_mutation_reader.hh +++ b/flat_mutation_reader.hh @@ -42,18 +42,99 @@ using seastar::future; class mutation_source; class position_in_partition; -/* - * Allows iteration on mutations using mutation_fragments. - * It iterates over mutations one by one and for each mutation - * it returns: - * 1. partition_start mutation_fragment - * 2. static_row mutation_fragment if one exists - * 3. mutation_fragments for all clustering rows and range tombstones - * in clustering key order - * 4. partition_end mutation_fragment - * The best way to consume those mutation_fragments is to call - * flat_mutation_reader::consume with a consumer that receives the fragments. - */ +/// \brief Represents a stream of mutation fragments. +/// +/// Mutation fragments represent writes to the database. +/// +/// Each fragment has an implicit position in the database, +/// which also determines its position in the stream relative to other fragments. 
+/// The global position of a fragment is a tuple ordered lexicographically: +/// +/// (ring_position of a partition key, position_in_partition) +/// +/// The stream has a hierarchical form. All fragments which occur +/// between partition_start and partition_end represent writes to the partition +/// identified by the partition_start::key(). The partition key is not repeated +/// with inner fragments. +/// +/// The stream of mutation fragments conforms to the following form: +/// +/// stream ::= partition* +/// partition ::= partition_start static_row? clustered* partition_end +/// clustered ::= clustering_row | range_tombstone +/// +/// The range_tombstone fragments can have ranges which overlap with other +/// clustered fragments. +/// +/// Consecutive range_tombstone fragments can have the same position(), so they +/// are weakly ordered. This makes merging two streams easier, and is +/// relied upon by combined_mutation_reader. +/// +/// \section Clustering restrictions +/// +/// A stream may produce writes relevant to only some clustering ranges, for +/// example by specifying clustering ranges in a partition_slice passed to +/// mutation_source::make_reader(). This will make the stream return information +/// for a subset of writes that it would normally return should the stream be +/// unrestricted. +/// +/// The restricted stream obeys the following rules: +/// +/// 0) The stream must contain fragments corresponding to all writes +/// which are relevant to the requested ranges. +/// +/// 1) The stream _may_ contain fragments with information +/// about _some_ of the writes which are relevant to clustering ranges +/// outside of the requested ranges. +/// +/// 2) The stream will not contain writes which are absent in the unrestricted stream, +/// both for the requested clustering ranges and not requested ranges. +/// This means that it's safe to populate cache with all the returned information. 
+/// Even though it may be incomplete for non-requested ranges, it won't contain +/// incorrect information. +/// +/// 3) All clustered fragments have position() which is within the requested +/// ranges. +/// +/// 4) range_tombstone ranges are trimmed to the boundaries of requested ranges. +/// +/// \section Intra-partition fast-forwarding mode +/// +/// The stream can operate in an alternative mode when streamed_mutation::forwarding::yes +/// is passed to the stream constructor (see mutation_source). +/// +/// In this mode, the original stream is not produced at once, but divided into sub-streams, where +/// each is produced at a time, ending with the end-of-stream condition (is_end_of_stream()). +/// The user needs to explicitly advance the stream to the next sub-stream by calling +/// fast_forward_to() or next_partition(). +/// +/// The original stream is divided like this: +/// +/// 1) For every partition, the first sub-stream will contain +/// partition_start and the static_row +/// +/// 2) Calling fast_forward_to() moves to the next sub-stream within the +/// current partition. The stream will contain all fragments relevant to +/// the position_range passed to fast_forward_to(). +/// +/// 3) The position_range passed to fast_forward_to() is a clustering key restriction. +/// Same rules apply as with clustering restrictions described above except for point (4) above: +/// range tombstones can extend the range passed to fast_forward_to(). +/// +/// 4) range_tombstones produced in earlier sub-stream which are also relevant +/// for next sub-streams do not have to be repeated. They _may_ be repeated +/// with a starting position trimmed. +/// +/// 5) partition_end is never emitted, the user needs to call next_partition() +/// to move to the next partition in the original stream, which will open +/// the initial sub-stream of the next partition. +/// An empty sub-stream after next_partition() indicates global end-of-stream (no next partition). 
+/// +/// \section Consuming +/// +/// The best way to consume those mutation_fragments is to call +/// flat_mutation_reader::consume with a consumer that receives the fragments. +/// class flat_mutation_reader final { public: using tracked_buffer = circular_buffer>; diff --git a/flat_mutation_reader_v2.hh b/flat_mutation_reader_v2.hh new file mode 100644 index 0000000000..dc6a57d4d4 --- /dev/null +++ b/flat_mutation_reader_v2.hh @@ -0,0 +1,760 @@ +/* + * Copyright (C) 2021-present ScyllaDB + */ + +/* + * This file is part of Scylla. + * + * Scylla is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Scylla is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Scylla. If not, see . 
+ */ + +#pragma once + +#include +#include + +#include "dht/i_partitioner.hh" +#include "position_in_partition.hh" +#include "flat_mutation_reader.hh" +#include "mutation_fragment_v2.hh" +#include "tracing/trace_state.hh" +#include "mutation.hh" +#include "query_class_config.hh" +#include "mutation_consumer_concepts.hh" + +#include +#include +#include "db/timeout_clock.hh" +#include "reader_permit.hh" + +#include + +using seastar::future; + +template +concept StreamedMutationConsumerV2 = +FragmentConsumerV2 && requires(T t, tombstone tomb) { + t.consume(tomb); + t.consume_end_of_stream(); +}; + +template +concept FlatMutationReaderConsumerV2 = + requires(Consumer c, mutation_fragment_v2 mf) { + { c(std::move(mf)) } -> std::same_as; + } || requires(Consumer c, mutation_fragment_v2 mf) { + { c(std::move(mf)) } -> std::same_as>; + }; + +template +concept FlattenedConsumerV2 = + StreamedMutationConsumerV2 && requires(T obj, const dht::decorated_key& dk) { + { obj.consume_new_partition(dk) }; + { obj.consume_end_of_partition() }; + }; + +template +concept FlattenedConsumerFilterV2 = + requires(T filter, const dht::decorated_key& dk, const mutation_fragment_v2& mf) { + { filter(dk) } -> std::same_as; + { filter(mf) } -> std::same_as; + { filter.on_end_of_stream() } -> std::same_as; + }; + +/// \brief Represents a stream of mutation fragments. +/// +/// Mutation fragments represent writes to the database. +/// +/// Each fragment has an implicit position in the database, +/// which also determines its position in the stream relative to other fragments. +/// The global position of a fragment is a tuple ordered lexicographically: +/// +/// (ring_position of a partition key, position_in_partition) +/// +/// The stream has a hierarchical form. All fragments which occur +/// between partition_start and partition_end represent writes to the partition +/// identified by the partition_start::key(). The partition key is not repeated +/// with inner fragments. 
+/// +/// The stream of mutation fragments conforms to the following form: +/// +/// stream ::= partition* +/// partition ::= partition_start static_row? clustered* partition_end +/// clustered ::= clustering_row | range_tombstone_change +/// +/// Deletions of ranges of rows within a given partition are represented with range_tombstone_change fragments. +/// At any point in the stream there is a single active clustered tombstone. +/// It is initially equal to the neutral tombstone when the stream of each partition starts. +/// range_tombstone_change fragments signify changes of the active clustered tombstone. +/// All fragments emitted while a given clustered tombstone is active are affected by that tombstone. +/// The clustered tombstone is independent from the partition tombstone carried in partition_start. +/// The partition tombstone takes effect for all fragments within the partition. +/// +/// The stream guarantees that each partition ends with a neutral active clustered tombstone +/// by closing active tombstones with a range_tombstone_change. +/// In fast-forwarding mode, each sub-stream ends with a neutral active clustered tombstone. +/// +/// All fragments within a partition have weakly monotonically increasing position(). +/// Consecutive range_tombstone_change fragments may share the position. +/// All clustering row fragments within a partition have strictly monotonically increasing position(). +/// +/// \section Clustering restrictions +/// +/// A stream may produce writes relevant to only some clustering ranges, for +/// example by specifying clustering ranges in a partition_slice passed to +/// mutation_source::make_reader(). This will make the stream return information +/// for a subset of writes that it would normally return should the stream be +/// unrestricted. +/// +/// The restricted stream obeys the following rules: +/// +/// 0) The stream must contain fragments corresponding to all writes +/// which are relevant to the requested ranges. 
+/// +/// 1) The ranges of non-neutral clustered tombstones must be enclosed in requested +/// ranges. In other words, range tombstones don't extend beyond boundaries of requested ranges. +/// +/// 2) The stream will not return writes which are absent in the unrestricted stream, +/// both for the requested clustering ranges and not requested ranges. +/// This means that it's safe to populate cache with all the returned information. +/// Even though it may be incomplete for non-requested ranges, it won't contain +/// incorrect information. +/// +/// 3) All clustered fragments have position() which is within the requested +/// ranges or, in case of range_tombstone_change fragments, equal to the end bound. +/// +/// 4) Streams may produce redundant range_tombstone_change fragments +/// which do not change the current clustered tombstone, or have the same position. +/// +/// \section Intra-partition fast-forwarding mode +/// +/// The stream can operate in an alternative mode when streamed_mutation::forwarding::yes +/// is passed to the stream constructor (see mutation_source). +/// +/// In this mode, the original stream is not produced at once, but divided into sub-streams, where +/// each is produced at a time, ending with the end-of-stream condition (is_end_of_stream()). +/// The user needs to explicitly advance the stream to the next sub-stream by calling +/// fast_forward_to() or next_partition(). +/// +/// The original stream is divided like this: +/// +/// 1) For every partition, the first sub-stream will contain +/// partition_start and the static_row +/// +/// 2) Calling fast_forward_to() moves to the next sub-stream within the +/// current partition. The stream will contain all fragments relevant to +/// the position_range passed to fast_forward_to(). +/// +/// 3) The position_range passed to fast_forward_to() is a clustering key restriction. +/// Same rules apply as with clustering restrictions described above. 
+/// +/// 4) The sub-stream will not end with a non-neutral active clustered tombstone. All range tombstones are closed. +/// +/// 5) partition_end is never emitted, the user needs to call next_partition() +/// to move to the next partition in the original stream, which will open +/// the initial sub-stream of the next partition. +/// An empty sub-stream after next_partition() indicates global end-of-stream (no next partition). +/// +/// \section Consuming +/// +/// The best way to consume those mutation_fragments is to call +/// flat_mutation_reader::consume with a consumer that receives the fragments. +class flat_mutation_reader_v2 final { +public: + using tracked_buffer = circular_buffer>; + + class impl { + private: + tracked_buffer _buffer; + size_t _buffer_size = 0; + protected: + size_t max_buffer_size_in_bytes = 8 * 1024; + bool _end_of_stream = false; + schema_ptr _schema; + reader_permit _permit; + friend class flat_mutation_reader_v2; + protected: + template + void push_mutation_fragment(Args&&... args) { + seastar::memory::on_alloc_point(); // for exception safety tests + _buffer.emplace_back(std::forward(args)...); + _buffer_size += _buffer.back().memory_usage(); + } + void clear_buffer() { + _buffer.erase(_buffer.begin(), _buffer.end()); + _buffer_size = 0; + } + void forward_buffer_to(const position_in_partition& pos); + void clear_buffer_to_next_partition(); + template + future fill_buffer_from(Source&, db::timeout_clock::time_point); + // When succeeds, makes sure that the next push_mutation_fragment() will not fail. 
+ void reserve_one() { + if (_buffer.capacity() == _buffer.size()) { + _buffer.reserve(_buffer.size() * 2 + 1); + } + } + const tracked_buffer& buffer() const { + return _buffer; + } + public: + impl(schema_ptr s, reader_permit permit) : _buffer(permit), _schema(std::move(s)), _permit(std::move(permit)) { } + virtual ~impl() {} + virtual future<> fill_buffer(db::timeout_clock::time_point) = 0; + virtual future<> next_partition() = 0; + + bool is_end_of_stream() const { return _end_of_stream; } + bool is_buffer_empty() const { return _buffer.empty(); } + bool is_buffer_full() const { return _buffer_size >= max_buffer_size_in_bytes; } + + mutation_fragment_v2 pop_mutation_fragment() { + auto mf = std::move(_buffer.front()); + _buffer.pop_front(); + _buffer_size -= mf.memory_usage(); + return mf; + } + + void unpop_mutation_fragment(mutation_fragment_v2 mf) { + const auto memory_usage = mf.memory_usage(); + _buffer.emplace_front(std::move(mf)); + _buffer_size += memory_usage; + } + + future operator()(db::timeout_clock::time_point timeout) { + if (is_buffer_empty()) { + if (is_end_of_stream()) { + return make_ready_future(); + } + return fill_buffer(timeout).then([this, timeout] { return operator()(timeout); }); + } + return make_ready_future(pop_mutation_fragment()); + } + + template + requires FlatMutationReaderConsumerV2 + // Stops when consumer returns stop_iteration::yes or end of stream is reached. + // Next call will start from the next mutation_fragment_v2 in the stream. 
+ future<> consume_pausable(Consumer consumer, db::timeout_clock::time_point timeout) { + return repeat([this, consumer = std::move(consumer), timeout] () mutable { + if (is_buffer_empty()) { + if (is_end_of_stream()) { + return make_ready_future(stop_iteration::yes); + } + return fill_buffer(timeout).then([] { + return make_ready_future(stop_iteration::no); + }); + } + + if constexpr (std::is_same_v, decltype(consumer(pop_mutation_fragment()))>) { + return consumer(pop_mutation_fragment()); + } else { + auto result = stop_iteration::no; + while ((result = consumer(pop_mutation_fragment())) != stop_iteration::yes && !is_buffer_empty() && !need_preempt()) {} + return make_ready_future(result); + } + }); + } + + template + requires FlatMutationReaderConsumerV2 && FlattenedConsumerFilterV2 + // A variant of consume_pausable() that expects to be run in + // a seastar::thread. + // Partitions for which filter(decorated_key) returns false are skipped + // entirely and never reach the consumer. 
+ void consume_pausable_in_thread(Consumer consumer, Filter filter, db::timeout_clock::time_point timeout) { + while (true) { + if (need_preempt()) { + seastar::thread::yield(); + } + if (is_buffer_empty()) { + if (is_end_of_stream()) { + return; + } + fill_buffer(timeout).get(); + continue; + } + auto mf = pop_mutation_fragment(); + if (mf.is_partition_start() && !filter(mf.as_partition_start().key())) { + next_partition().get(); + continue; + } + if (!filter(mf)) { + continue; + } + auto do_stop = futurize_invoke([&consumer, mf = std::move(mf)] () mutable { + return consumer(std::move(mf)); + }); + if (do_stop.get0()) { + return; + } + } + }; + + private: + template + struct consumer_adapter { + flat_mutation_reader_v2::impl& _reader; + std::optional _decorated_key; + Consumer _consumer; + consumer_adapter(flat_mutation_reader_v2::impl& reader, Consumer c) + : _reader(reader) + , _consumer(std::move(c)) + { } + future operator()(mutation_fragment_v2&& mf) { + return std::move(mf).consume(*this); + } + future consume(static_row&& sr) { + return handle_result(_consumer.consume(std::move(sr))); + } + future consume(clustering_row&& cr) { + return handle_result(_consumer.consume(std::move(cr))); + } + future consume(range_tombstone_change&& rt) { + return handle_result(_consumer.consume(std::move(rt))); + } + future consume(partition_start&& ps) { + _decorated_key.emplace(std::move(ps.key())); + _consumer.consume_new_partition(*_decorated_key); + if (ps.partition_tombstone()) { + _consumer.consume(ps.partition_tombstone()); + } + return make_ready_future(stop_iteration::no); + } + future consume(partition_end&& pe) { + return futurize_invoke([this] { + return _consumer.consume_end_of_partition(); + }); + } + private: + future handle_result(stop_iteration si) { + if (si) { + if (_consumer.consume_end_of_partition()) { + return make_ready_future(stop_iteration::yes); + } + return _reader.next_partition().then([] { + return make_ready_future(stop_iteration::no); + }); + 
} + return make_ready_future(stop_iteration::no); + } + }; + public: + template + requires FlattenedConsumerV2 + // Stops when consumer returns stop_iteration::yes from consume_end_of_partition or end of stream is reached. + // Next call will receive fragments from the next partition. + // When consumer returns stop_iteration::yes from methods other than consume_end_of_partition then the read + // of the current partition is ended, consume_end_of_partition is called and if it returns stop_iteration::no + // then the read moves to the next partition. + // Reference to the decorated key that is passed to consume_new_partition() remains valid until after + // the call to consume_end_of_partition(). + // + // This method is useful because most of current consumers use this semantic. + // + // + // This method returns whatever is returned from Consumer::consume_end_of_stream().S + auto consume(Consumer consumer, db::timeout_clock::time_point timeout) { + return do_with(consumer_adapter(*this, std::move(consumer)), [this, timeout] (consumer_adapter& adapter) { + return consume_pausable(std::ref(adapter), timeout).then([this, &adapter] { + return adapter._consumer.consume_end_of_stream(); + }); + }); + } + + template + requires FlattenedConsumerV2 && FlattenedConsumerFilterV2 + // A variant of consumee() that expects to be run in a seastar::thread. + // Partitions for which filter(decorated_key) returns false are skipped + // entirely and never reach the consumer. + auto consume_in_thread(Consumer consumer, Filter filter, db::timeout_clock::time_point timeout) { + auto adapter = consumer_adapter(*this, std::move(consumer)); + consume_pausable_in_thread(std::ref(adapter), std::move(filter), timeout); + filter.on_end_of_stream(); + return adapter._consumer.consume_end_of_stream(); + }; + + /* + * fast_forward_to is forbidden on flat_mutation_reader_v2 created for a single partition. 
+ */ + virtual future<> fast_forward_to(const dht::partition_range&, db::timeout_clock::time_point timeout) = 0; + virtual future<> fast_forward_to(position_range, db::timeout_clock::time_point timeout) = 0; + + // close should cancel any outstanding background operations, + // if possible, and wait on them to complete. + // It should also transitively close underlying resources + // and wait on them too. + // + // Once closed, the reader should be unusable. + // + // Similar to destructors, close must never fail. + virtual future<> close() noexcept = 0; + + size_t buffer_size() const { + return _buffer_size; + } + + tracked_buffer detach_buffer() { + _buffer_size = 0; + return std::exchange(_buffer, tracked_buffer(_permit)); + } + + void move_buffer_content_to(impl& other) { + if (other._buffer.empty()) { + std::swap(_buffer, other._buffer); + other._buffer_size = std::exchange(_buffer_size, 0); + } else { + seastar::memory::on_alloc_point(); // for exception safety tests + other._buffer.reserve(other._buffer.size() + _buffer.size()); + std::move(_buffer.begin(), _buffer.end(), std::back_inserter(other._buffer)); + _buffer.clear(); + other._buffer_size += std::exchange(_buffer_size, 0); + } + } + }; +private: + std::unique_ptr _impl; + + flat_mutation_reader_v2() = default; + explicit operator bool() const noexcept { return bool(_impl); } + friend class optimized_optional; + void do_upgrade_schema(const schema_ptr&); + static void on_close_error(std::unique_ptr, std::exception_ptr ep) noexcept; +public: + // Documented in mutation_reader::forwarding. 
+ class partition_range_forwarding_tag; + using partition_range_forwarding = bool_class; + + flat_mutation_reader_v2(std::unique_ptr impl) noexcept : _impl(std::move(impl)) {} + flat_mutation_reader_v2(const flat_mutation_reader_v2&) = delete; + flat_mutation_reader_v2(flat_mutation_reader_v2&&) = default; + + flat_mutation_reader_v2& operator=(const flat_mutation_reader_v2&) = delete; + flat_mutation_reader_v2& operator=(flat_mutation_reader_v2&& o) noexcept; + + ~flat_mutation_reader_v2(); + + future operator()(db::timeout_clock::time_point timeout) { + return _impl->operator()(timeout); + } + + template + requires FlatMutationReaderConsumerV2 + auto consume_pausable(Consumer consumer, db::timeout_clock::time_point timeout) { + return _impl->consume_pausable(std::move(consumer), timeout); + } + + template + requires FlattenedConsumerV2 + auto consume(Consumer consumer, db::timeout_clock::time_point timeout) { + return _impl->consume(std::move(consumer), timeout); + } + + class filter { + private: + std::function _partition_filter = [] (const dht::decorated_key&) { return true; }; + std::function _mutation_fragment_filter = [] (const mutation_fragment_v2&) { return true; }; + public: + filter() = default; + + filter(std::function&& pf) + : _partition_filter(std::move(pf)) + { } + + filter(std::function&& pf, + std::function&& mf) + : _partition_filter(std::move(pf)) + , _mutation_fragment_filter(std::move(mf)) + { } + + template + filter(Functor&& f) + : _partition_filter(std::forward(f)) + { } + + bool operator()(const dht::decorated_key& dk) const { + return _partition_filter(dk); + } + + bool operator()(const mutation_fragment_v2& mf) const { + return _mutation_fragment_filter(mf); + } + + void on_end_of_stream() const { } + }; + + struct no_filter { + bool operator()(const dht::decorated_key& dk) const { + return true; + } + + bool operator()(const mutation_fragment_v2& mf) const { + return true; + } + + void on_end_of_stream() const { } + }; + + template + 
requires FlattenedConsumerV2 && FlattenedConsumerFilterV2 + auto consume_in_thread(Consumer consumer, Filter filter, db::timeout_clock::time_point timeout) { + return _impl->consume_in_thread(std::move(consumer), std::move(filter), timeout); + } + + template + requires FlattenedConsumerV2 + auto consume_in_thread(Consumer consumer, db::timeout_clock::time_point timeout) { + return consume_in_thread(std::move(consumer), no_filter{}, timeout); + } + + // Skips to the next partition. + // + // Skips over the remaining fragments of the current partitions. If the + // reader is currently positioned at a partition start nothing is done. + // + // If the last produced fragment comes from partition `P`, then the reader + // is considered to still be in partition `P`, which means that `next_partition` + // will move the reader to the partition immediately following `P`. + // This case happens in particular when the last produced fragment was + // `partition_end` for `P`. + // + // Only skips within the current partition range, i.e. if the current + // partition is the last in the range the reader will be at EOS. + // + // Can be used to skip over entire partitions if interleaved with + // `operator()()` calls. + future<> next_partition() { return _impl->next_partition(); } + + future<> fill_buffer(db::timeout_clock::time_point timeout) { return _impl->fill_buffer(timeout); } + + // Changes the range of partitions to pr. The range can only be moved + // forwards. pr.begin() needs to be larger than pr.end() of the previousl + // used range (i.e. either the initial one passed to the constructor or a + // previous fast forward target). + // pr needs to be valid until the reader is destroyed or fast_forward_to() + // is called again. + future<> fast_forward_to(const dht::partition_range& pr, db::timeout_clock::time_point timeout) { + return _impl->fast_forward_to(pr, timeout); + } + // Skips to a later range of rows. + // The new range must not overlap with the current range. 
+ // + // In forwarding mode the stream does not return all fragments right away, + // but only those belonging to the current clustering range. Initially + // current range only covers the static row. The stream can be forwarded + // (even before end-of- stream) to a later range with fast_forward_to(). + // Forwarding doesn't change initial restrictions of the stream, it can + // only be used to skip over data. + // + // Monotonicity of positions is preserved by forwarding. That is fragments + // emitted after forwarding will have greater positions than any fragments + // emitted before forwarding. + // + // For any range, all range tombstones relevant for that range which are + // present in the original stream will be emitted. Range tombstones + // emitted before forwarding which overlap with the new range are not + // necessarily re-emitted. + // + // When forwarding mode is not enabled, fast_forward_to() + // cannot be used. + // + // `fast_forward_to` can be called only when the reader is within a partition + // and it affects the set of fragments returned from that partition. + // In particular one must first enter a partition by fetching a `partition_start` + // fragment before calling `fast_forward_to`. + future<> fast_forward_to(position_range cr, db::timeout_clock::time_point timeout) { + return _impl->fast_forward_to(std::move(cr), timeout); + } + // Closes the reader. + // + // Note: The reader object can can be safely destroyed after close returns. + // since close makes sure to keep the underlying impl object alive until + // the latter's close call is resolved. + future<> close() noexcept { + if (auto i = std::move(_impl)) { + auto f = i->close(); + // most close implementations are expexcted to return a ready future + // so expedite prcessing it. 
+ if (f.available() && !f.failed()) { + return std::move(f); + } + // close must not fail + return f.handle_exception([i = std::move(i)] (std::exception_ptr ep) mutable { + on_close_error(std::move(i), std::move(ep)); + }); + } + return make_ready_future<>(); + } + bool is_end_of_stream() const { return _impl->is_end_of_stream(); } + bool is_buffer_empty() const { return _impl->is_buffer_empty(); } + bool is_buffer_full() const { return _impl->is_buffer_full(); } + mutation_fragment_v2 pop_mutation_fragment() { return _impl->pop_mutation_fragment(); } + void unpop_mutation_fragment(mutation_fragment_v2 mf) { _impl->unpop_mutation_fragment(std::move(mf)); } + const schema_ptr& schema() const { return _impl->_schema; } + const reader_permit& permit() const { return _impl->_permit; } + void set_max_buffer_size(size_t size) { + _impl->max_buffer_size_in_bytes = size; + } + // Resolves with a pointer to the next fragment in the stream without consuming it from the stream, + // or nullptr if there are no more fragments. + // The returned pointer is invalidated by any other non-const call to this object. + future peek(db::timeout_clock::time_point timeout) { + if (!is_buffer_empty()) { + return make_ready_future(&_impl->_buffer.front()); + } + if (is_end_of_stream()) { + return make_ready_future(nullptr); + } + return fill_buffer(timeout).then([this, timeout] { + return peek(timeout); + }); + } + // A peek at the next fragment in the buffer. + // Cannot be called if is_buffer_empty() returns true. + const mutation_fragment_v2& peek_buffer() const { return _impl->_buffer.front(); } + // The actual buffer size of the reader. + // Altough we consistently refer to this as buffer size throught the code + // we really use "buffer size" as the size of the collective memory + // used by all the mutation fragments stored in the buffer of the reader. 
+ size_t buffer_size() const { + return _impl->buffer_size(); + } + const tracked_buffer& buffer() const { + return _impl->buffer(); + } + // Detach the internal buffer of the reader. + // Roughly equivalent to depleting it by calling pop_mutation_fragment() + // until is_buffer_empty() returns true. + // The reader will need to allocate a new buffer on the next fill_buffer() + // call. + tracked_buffer detach_buffer() { + return _impl->detach_buffer(); + } + // Moves the buffer content to `other`. + // + // If the buffer of `other` is empty this is very efficient as the buffers + // are simply swapped. Otherwise the content of the buffer is moved + // fragmuent-by-fragment. + // Allows efficient implementation of wrapping readers that do no + // transformation to the fragment stream. + void move_buffer_content_to(impl& other) { + _impl->move_buffer_content_to(other); + } + + // Causes this reader to conform to s. + // Multiple calls of upgrade_schema() compose, effects of prior calls on the stream are preserved. + void upgrade_schema(const schema_ptr& s) { + if (__builtin_expect(s != schema(), false)) { + do_upgrade_schema(s); + } + } +}; + +using flat_mutation_reader_v2_opt = optimized_optional; + +template +flat_mutation_reader_v2 make_flat_mutation_reader_v2(Args &&... args) { + return flat_mutation_reader_v2(std::make_unique(std::forward(args)...)); +} + +// Consumes mutation fragments until StopCondition is true. +// The consumer will stop iff StopCondition returns true, in particular +// reaching the end of stream alone won't stop the reader. 
+template +requires requires(StopCondition stop, ConsumeMutationFragment consume_mf, ConsumeEndOfStream consume_eos, mutation_fragment_v2 mf) { + { stop() } -> std::same_as; + { consume_mf(std::move(mf)) } -> std::same_as; + { consume_eos() } -> std::same_as>; +} +future<> consume_mutation_fragments_until( + flat_mutation_reader_v2& r, + StopCondition&& stop, + ConsumeMutationFragment&& consume_mf, + ConsumeEndOfStream&& consume_eos, + db::timeout_clock::time_point timeout) { + return do_until([stop] { return stop(); }, [&r, stop, consume_mf, consume_eos, timeout] { + while (!r.is_buffer_empty()) { + consume_mf(r.pop_mutation_fragment()); + if (stop() || need_preempt()) { + return make_ready_future<>(); + } + } + if (r.is_end_of_stream()) { + return consume_eos(); + } + return r.fill_buffer(timeout); + }); +} + +// Creates a stream which is like r but with transformation applied to the elements. +template +requires StreamedMutationTranformerV2 +flat_mutation_reader_v2 transform(flat_mutation_reader_v2 r, T t) { + class transforming_reader : public flat_mutation_reader_v2::impl { + flat_mutation_reader_v2 _reader; + T _t; + struct consumer { + transforming_reader* _owner; + stop_iteration operator()(mutation_fragment_v2&& mf) { + _owner->push_mutation_fragment(_owner->_t(std::move(mf))); + return stop_iteration(_owner->is_buffer_full()); + } + }; + public: + transforming_reader(flat_mutation_reader_v2&& r, T&& t) + : impl(t(r.schema()), r.permit()) + , _reader(std::move(r)) + , _t(std::move(t)) + {} + virtual future<> fill_buffer(db::timeout_clock::time_point timeout) override { + if (_end_of_stream) { + return make_ready_future<>(); + } + return _reader.consume_pausable(consumer{this}, timeout).then([this] { + if (_reader.is_end_of_stream() && _reader.is_buffer_empty()) { + _end_of_stream = true; + } + }); + } + virtual future<> next_partition() override { + clear_buffer_to_next_partition(); + if (is_buffer_empty()) { + return _reader.next_partition(); + } + return 
make_ready_future<>(); + } + virtual future<> fast_forward_to(const dht::partition_range& pr, db::timeout_clock::time_point timeout) override { + clear_buffer(); + _end_of_stream = false; + return _reader.fast_forward_to(pr, timeout); + } + virtual future<> fast_forward_to(position_range pr, db::timeout_clock::time_point timeout) override { + forward_buffer_to(pr.start()); + _end_of_stream = false; + return _reader.fast_forward_to(std::move(pr), timeout); + } + virtual future<> close() noexcept override { + return _reader.close(); + } + }; + return make_flat_mutation_reader_v2(std::move(r), std::move(t)); +} + +// Adapts a v2 reader to v1 reader +flat_mutation_reader downgrade_to_v1(flat_mutation_reader_v2); + +// Adapts a v1 reader to v2 reader +flat_mutation_reader_v2 upgrade_to_v2(flat_mutation_reader); + +// Reads a single partition from a reader. Returns empty optional if there are no more partitions to be read. +future read_mutation_from_flat_mutation_reader(flat_mutation_reader_v2&, db::timeout_clock::time_point timeout); diff --git a/mutation_fragment.cc b/mutation_fragment.cc index 32d52409bd..a15cc9031c 100644 --- a/mutation_fragment.cc +++ b/mutation_fragment.cc @@ -25,6 +25,7 @@ #include "mutation.hh" #include "mutation_fragment.hh" +#include "mutation_fragment_v2.hh" #include "clustering_interval_set.hh" std::ostream& @@ -145,11 +146,68 @@ void mutation_fragment::destroy_data() noexcept } } +mutation_fragment_v2::mutation_fragment_v2(const schema& s, reader_permit permit, static_row&& r) + : _kind(kind::static_row), _data(std::make_unique(std::move(permit))) +{ + new (&_data->_static_row) static_row(std::move(r)); + _data->_memory.reset(reader_resources::with_memory(calculate_memory_usage(s))); +} + +mutation_fragment_v2::mutation_fragment_v2(const schema& s, reader_permit permit, clustering_row&& r) + : _kind(kind::clustering_row), _data(std::make_unique(std::move(permit))) +{ + new (&_data->_clustering_row) clustering_row(std::move(r)); + 
_data->_memory.reset(reader_resources::with_memory(calculate_memory_usage(s))); +} + +mutation_fragment_v2::mutation_fragment_v2(const schema& s, reader_permit permit, range_tombstone_change&& r) + : _kind(kind::range_tombstone_change), _data(std::make_unique(std::move(permit))) +{ + new (&_data->_range_tombstone_chg) range_tombstone_change(std::move(r)); + _data->_memory.reset(reader_resources::with_memory(calculate_memory_usage(s))); +} + +mutation_fragment_v2::mutation_fragment_v2(const schema& s, reader_permit permit, partition_start&& r) + : _kind(kind::partition_start), _data(std::make_unique(std::move(permit))) +{ + new (&_data->_partition_start) partition_start(std::move(r)); + _data->_memory.reset(reader_resources::with_memory(calculate_memory_usage(s))); +} + +mutation_fragment_v2::mutation_fragment_v2(const schema& s, reader_permit permit, partition_end&& r) + : _kind(kind::partition_end), _data(std::make_unique(std::move(permit))) +{ + new (&_data->_partition_end) partition_end(std::move(r)); + _data->_memory.reset(reader_resources::with_memory(calculate_memory_usage(s))); +} + +void mutation_fragment_v2::destroy_data() noexcept +{ + switch (_kind) { + case kind::static_row: + _data->_static_row.~static_row(); + break; + case kind::clustering_row: + _data->_clustering_row.~clustering_row(); + break; + case kind::range_tombstone_change: + _data->_range_tombstone_chg.~range_tombstone_change(); + break; + case kind::partition_start: + _data->_partition_start.~partition_start(); + break; + case kind::partition_end: + _data->_partition_end.~partition_end(); + break; + } +} + namespace { struct get_key_visitor { const clustering_key_prefix& operator()(const clustering_row& cr) { return cr.key(); } const clustering_key_prefix& operator()(const range_tombstone& rt) { return rt.start; } + const clustering_key_prefix& operator()(const range_tombstone_change& rt) { return rt.position().key(); } template const clustering_key_prefix& operator()(const T&) { abort(); 
} }; @@ -235,6 +293,68 @@ std::ostream& operator<<(std::ostream& os, const mutation_fragment::printer& p) return os; } +const clustering_key_prefix& mutation_fragment_v2::key() const +{ + assert(has_key()); + return visit(get_key_visitor()); +} + +void mutation_fragment_v2::apply(const schema& s, mutation_fragment_v2&& mf) +{ + assert(mergeable_with(mf)); + switch (_kind) { + case mutation_fragment_v2::kind::partition_start: + _data->_partition_start.partition_tombstone().apply(mf._data->_partition_start.partition_tombstone()); + mf._data->_partition_start.~partition_start(); + break; + case kind::static_row: + _data->_static_row.apply(s, std::move(mf._data->_static_row)); + _data->_memory.reset(reader_resources::with_memory(calculate_memory_usage(s))); + mf._data->_static_row.~static_row(); + break; + case kind::clustering_row: + _data->_clustering_row.apply(s, std::move(mf._data->_clustering_row)); + _data->_memory.reset(reader_resources::with_memory(calculate_memory_usage(s))); + mf._data->_clustering_row.~clustering_row(); + break; + case mutation_fragment_v2::kind::partition_end: + // Nothing to do for this guy. 
+ mf._data->_partition_end.~partition_end(); + break; + default: abort(); + } + mf._data.reset(); +} + +position_in_partition_view mutation_fragment_v2::position() const +{ + return visit([] (auto& mf) -> position_in_partition_view { return mf.position(); }); +} + +std::ostream& operator<<(std::ostream& os, mutation_fragment_v2::kind k) +{ + switch (k) { + case mutation_fragment_v2::kind::static_row: return os << "static row"; + case mutation_fragment_v2::kind::clustering_row: return os << "clustering row"; + case mutation_fragment_v2::kind::range_tombstone_change: return os << "range tombstone change"; + case mutation_fragment_v2::kind::partition_start: return os << "partition start"; + case mutation_fragment_v2::kind::partition_end: return os << "partition end"; + } + abort(); +} + +std::ostream& operator<<(std::ostream& os, const mutation_fragment_v2::printer& p) { + auto& mf = p._mutation_fragment; + os << "{mutation_fragment: " << mf._kind << " " << mf.position() << " "; + mf.visit(make_visitor( + [&] (const clustering_row& cr) { os << clustering_row::printer(p._schema, cr); }, + [&] (const static_row& sr) { os << static_row::printer(p._schema, sr); }, + [&] (const auto& what) -> void { os << what; } + )); + os << "}"; + return os; +} + mutation_fragment_opt range_tombstone_stream::do_get_next() { return mutation_fragment(_schema, _permit, _list.pop_as(_list.begin())); @@ -316,6 +436,14 @@ bool mutation_fragment::relevant_for_range_assuming_after(const schema& s, posit return is_range_tombstone() && cmp(pos, as_range_tombstone().end_position()); } +bool mutation_fragment_v2::relevant_for_range(const schema& s, position_in_partition_view pos) const { + position_in_partition::less_compare less(s); + if (!less(position(), pos)) { + return true; + } + return false; +} + std::ostream& operator<<(std::ostream& out, const range_tombstone_stream& rtl) { return out << rtl._list; } diff --git a/mutation_fragment_v2.hh b/mutation_fragment_v2.hh new file mode 100644 index 
0000000000..2112df3ab0 --- /dev/null +++ b/mutation_fragment_v2.hh @@ -0,0 +1,364 @@ +/* + * Copyright (C) 2021-present ScyllaDB + */ + +/* + * This file is part of Scylla. + * + * Scylla is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Scylla is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Scylla. If not, see . + */ + +#pragma once + +#include "mutation_partition.hh" +#include "mutation_fragment.hh" +#include "position_in_partition.hh" + +#include +#include + +#include "seastar/core/future-util.hh" + +#include "db/timeout_clock.hh" +#include "reader_permit.hh" + +// Mutation fragment which represents a range tombstone boundary. +// +// The range_tombstone_change::tombstone() method returns the tombstone which takes effect +// for positions >= range_tombstone_change::position() in the stream, until the next +// range_tombstone_change is encountered. +// +// Note, a range_tombstone_change with an empty tombstone() ends the range tombstone. +// An empty tombstone naturally does not cover any timestamp. 
+class range_tombstone_change { + position_in_partition _pos; + ::tombstone _tomb; +public: + range_tombstone_change(position_in_partition pos, tombstone tomb) + : _pos(std::move(pos)) + , _tomb(tomb) + { } + range_tombstone_change(position_in_partition_view pos, tombstone tomb) + : _pos(pos) + , _tomb(tomb) + { } + const position_in_partition& position() const { + return _pos; + } + void set_position(position_in_partition pos) { + _pos = std::move(pos); + } + ::tombstone tombstone() const { + return _tomb; + } + size_t external_memory_usage(const schema& s) const { + return _pos.external_memory_usage(); + } + bool equal(const schema& s, const range_tombstone_change& other) const { + position_in_partition::equal_compare eq(s); + return _tomb == other._tomb && eq(_pos, other._pos); + } + friend std::ostream& operator<<(std::ostream& out, const range_tombstone_change&); +}; + +template +concept MutationFragmentConsumerV2 = + requires(T t, + static_row sr, + clustering_row cr, + range_tombstone_change rt_chg, + partition_start ph, + partition_end pe) { + { t.consume(std::move(sr)) } -> std::same_as; + { t.consume(std::move(cr)) } -> std::same_as; + { t.consume(std::move(rt_chg)) } -> std::same_as; + { t.consume(std::move(ph)) } -> std::same_as; + { t.consume(std::move(pe)) } -> std::same_as; + }; + +template +concept MutationFragmentVisitorV2 = + requires(T t, + const static_row& sr, + const clustering_row& cr, + const range_tombstone_change& rt, + const partition_start& ph, + const partition_end& eop) { + { t(sr) } -> std::same_as; + { t(cr) } -> std::same_as; + { t(rt) } -> std::same_as; + { t(ph) } -> std::same_as; + { t(eop) } -> std::same_as; + }; + +template +concept FragmentConsumerReturningV2 = +requires(T t, static_row sr, clustering_row cr, range_tombstone_change rt, tombstone tomb) { + { t.consume(std::move(sr)) } -> std::same_as; + { t.consume(std::move(cr)) } -> std::same_as; + { t.consume(std::move(rt)) } -> std::same_as; +}; + +template +concept 
FragmentConsumerV2 = +FragmentConsumerReturningV2 || FragmentConsumerReturningV2>; + +class mutation_fragment_v2 { +public: + enum class kind { + static_row, + clustering_row, + range_tombstone_change, + partition_start, + partition_end, + }; +private: + struct data { + data(reader_permit permit) : _memory(permit.consume_memory()) { } + ~data() { } + + reader_permit::resource_units _memory; + union { + static_row _static_row; + clustering_row _clustering_row; + range_tombstone_change _range_tombstone_chg; + partition_start _partition_start; + partition_end _partition_end; + }; + }; +private: + kind _kind; + std::unique_ptr _data; + + mutation_fragment_v2() = default; + explicit operator bool() const noexcept { return bool(_data); } + void destroy_data() noexcept; + friend class optimized_optional; + + friend class position_in_partition; +public: + struct clustering_row_tag_t { }; + + template + mutation_fragment_v2(clustering_row_tag_t, const schema& s, reader_permit permit, Args&&... args) + : _kind(kind::clustering_row) + , _data(std::make_unique(std::move(permit))) + { + new (&_data->_clustering_row) clustering_row(std::forward(args)...); + _data->_memory.reset(reader_resources::with_memory(calculate_memory_usage(s))); + } + + mutation_fragment_v2(const schema& s, reader_permit permit, static_row&& r); + mutation_fragment_v2(const schema& s, reader_permit permit, clustering_row&& r); + mutation_fragment_v2(const schema& s, reader_permit permit, range_tombstone_change&& r); + mutation_fragment_v2(const schema& s, reader_permit permit, partition_start&& r); + mutation_fragment_v2(const schema& s, reader_permit permit, partition_end&& r); + + mutation_fragment_v2(const schema& s, reader_permit permit, const mutation_fragment_v2& o) + : _kind(o._kind), _data(std::make_unique(std::move(permit))) { + switch (_kind) { + case kind::static_row: + new (&_data->_static_row) static_row(s, o._data->_static_row); + break; + case kind::clustering_row: + new 
(&_data->_clustering_row) clustering_row(s, o._data->_clustering_row); + break; + case kind::range_tombstone_change: + new (&_data->_range_tombstone_chg) range_tombstone_change(o._data->_range_tombstone_chg); + break; + case kind::partition_start: + new (&_data->_partition_start) partition_start(o._data->_partition_start); + break; + case kind::partition_end: + new (&_data->_partition_end) partition_end(o._data->_partition_end); + break; + } + _data->_memory.reset(o._data->_memory.resources()); + } + mutation_fragment_v2(mutation_fragment_v2&& other) = default; + mutation_fragment_v2& operator=(mutation_fragment_v2&& other) noexcept { + if (this != &other) { + this->~mutation_fragment_v2(); + new (this) mutation_fragment_v2(std::move(other)); + } + return *this; + } + [[gnu::always_inline]] + ~mutation_fragment_v2() { + if (_data) { + destroy_data(); + } + } + + position_in_partition_view position() const; + + // Checks if this fragment may be relevant for any range starting at given position. 
+ bool relevant_for_range(const schema& s, position_in_partition_view pos) const; + + bool has_key() const { return is_clustering_row() || is_range_tombstone_change(); } + + // Requirements: has_key() == true + const clustering_key_prefix& key() const; + + kind mutation_fragment_kind() const { return _kind; } + + bool is_static_row() const { return _kind == kind::static_row; } + bool is_clustering_row() const { return _kind == kind::clustering_row; } + bool is_range_tombstone_change() const { return _kind == kind::range_tombstone_change; } + bool is_partition_start() const { return _kind == kind::partition_start; } + bool is_end_of_partition() const { return _kind == kind::partition_end; } + + void mutate_as_static_row(const schema& s, std::invocable auto&& fn) { + fn(_data->_static_row); + _data->_memory.reset(reader_resources::with_memory(calculate_memory_usage(s))); + } + void mutate_as_clustering_row(const schema& s, std::invocable auto&& fn) { + fn(_data->_clustering_row); + _data->_memory.reset(reader_resources::with_memory(calculate_memory_usage(s))); + } + void mutate_as_range_tombstone_change(const schema& s, std::invocable auto&& fn) { + fn(_data->_range_tombstone_chg); + _data->_memory.reset(reader_resources::with_memory(calculate_memory_usage(s))); + } + void mutate_as_partition_start(const schema& s, std::invocable auto&& fn) { + fn(_data->_partition_start); + _data->_memory.reset(reader_resources::with_memory(calculate_memory_usage(s))); + } + + static_row&& as_static_row() && { return std::move(_data->_static_row); } + clustering_row&& as_clustering_row() && { return std::move(_data->_clustering_row); } + range_tombstone_change&& as_range_tombstone_change() && { return std::move(_data->_range_tombstone_chg); } + partition_start&& as_partition_start() && { return std::move(_data->_partition_start); } + partition_end&& as_end_of_partition() && { return std::move(_data->_partition_end); } + + const static_row& as_static_row() const & { return 
_data->_static_row; } + const clustering_row& as_clustering_row() const & { return _data->_clustering_row; } + const range_tombstone_change& as_range_tombstone_change() const & { return _data->_range_tombstone_chg; } + const partition_start& as_partition_start() const & { return _data->_partition_start; } + const partition_end& as_end_of_partition() const & { return _data->_partition_end; } + + // Requirements: mergeable_with(mf) + void apply(const schema& s, mutation_fragment_v2&& mf); + + template + requires MutationFragmentConsumerV2().consume(std::declval()))> + decltype(auto) consume(Consumer& consumer) && { + _data->_memory.reset(); + switch (_kind) { + case kind::static_row: + return consumer.consume(std::move(_data->_static_row)); + case kind::clustering_row: + return consumer.consume(std::move(_data->_clustering_row)); + case kind::range_tombstone_change: + return consumer.consume(std::move(_data->_range_tombstone_chg)); + case kind::partition_start: + return consumer.consume(std::move(_data->_partition_start)); + case kind::partition_end: + return consumer.consume(std::move(_data->_partition_end)); + } + abort(); + } + + template + requires MutationFragmentVisitorV2()(std::declval()))> + decltype(auto) visit(Visitor&& visitor) const { + switch (_kind) { + case kind::static_row: + return visitor(as_static_row()); + case kind::clustering_row: + return visitor(as_clustering_row()); + case kind::range_tombstone_change: + return visitor(as_range_tombstone_change()); + case kind::partition_start: + return visitor(as_partition_start()); + case kind::partition_end: + return visitor(as_end_of_partition()); + } + abort(); + } + + size_t memory_usage() const { + return _data->_memory.resources().memory; + } + + reader_permit permit() const { + return _data->_memory.permit(); + } + + bool equal(const schema& s, const mutation_fragment_v2& other) const { + if (other._kind != _kind) { + return false; + } + switch (_kind) { + case kind::static_row: + return 
as_static_row().equal(s, other.as_static_row()); + case kind::clustering_row: + return as_clustering_row().equal(s, other.as_clustering_row()); + case kind::range_tombstone_change: + return as_range_tombstone_change().equal(s, other.as_range_tombstone_change()); + case kind::partition_start: + return as_partition_start().equal(s, other.as_partition_start()); + case kind::partition_end: + return as_end_of_partition().equal(s, other.as_end_of_partition()); + } + abort(); + } + + // Fragments which have the same position() and are mergeable can be + // merged into one fragment with apply() which represents the sum of + // writes represented by each of the fragments. + // Fragments which have the same position() but are not mergeable + // and at least one of them is not a range_tombstone_change can be emitted one after the other in the stream. + // + // Undefined for range_tombstone_change. + // Merging range tombstones requires a more complicated handling + // because range_tombstone_change doesn't represent a write on its own, only + // with a matching change for the end bound. It's not enough to choose one fragment over another, + // the upper bound of the winning tombstone needs to be taken into account when merging + // later range_tombstone_change fragments in the stream. 
+ bool mergeable_with(const mutation_fragment_v2& mf) const { + return _kind == mf._kind && _kind != kind::range_tombstone_change; + } + + class printer { + const schema& _schema; + const mutation_fragment_v2& _mutation_fragment; + public: + printer(const schema& s, const mutation_fragment_v2& mf) : _schema(s), _mutation_fragment(mf) { } + printer(const printer&) = delete; + printer(printer&&) = delete; + + friend std::ostream& operator<<(std::ostream& os, const printer& p); + }; + friend std::ostream& operator<<(std::ostream& os, const printer& p); + +private: + size_t calculate_memory_usage(const schema& s) const { + return sizeof(data) + visit([&s] (auto& mf) -> size_t { return mf.external_memory_usage(s); }); + } +}; + +std::ostream& operator<<(std::ostream&, mutation_fragment_v2::kind); + +using mutation_fragment_v2_opt = optimized_optional; + + +// F gets a stream element as an argument and returns the new value which replaces that element +// in the transformed stream. +template +concept StreamedMutationTranformerV2 = +requires(F f, mutation_fragment_v2 mf, schema_ptr s) { + { f(std::move(mf)) } -> std::same_as; + { f(s) } -> std::same_as; +}; diff --git a/mutation_partition.cc b/mutation_partition.cc index df8824e72e..6dc2664ef7 100644 --- a/mutation_partition.cc +++ b/mutation_partition.cc @@ -200,7 +200,7 @@ mutation_partition::mutation_partition(mutation_partition&& x, const schema& sch , _static_row(std::move(x._static_row)) , _static_row_continuous(x._static_row_continuous) , _rows(std::move(x._rows)) - , _row_tombstones(std::move(x._row_tombstones)) + , _row_tombstones(schema) #ifdef SEASTAR_DEBUG , _schema_version(schema.version()) #endif @@ -220,16 +220,14 @@ mutation_partition::mutation_partition(mutation_partition&& x, const schema& sch { range_tombstone_list::const_iterator it = _row_tombstones.begin(); for (auto&& range : ck_ranges.ranges()) { - auto rt_range = _row_tombstones.slice(schema, range); - // upper bound for previous range may be 
after lower bound for the next range - // if both ranges are connected through a range tombstone. In this case the - // erase range would be invalid. - if (rt_range.begin() == _row_tombstones.end() || std::next(rt_range.begin()) != it) { - _row_tombstones.erase(it, rt_range.begin()); + for (auto&& x_rt : x._row_tombstones.slice(schema, range)) { + auto rt = x_rt; + rt.trim(schema, + position_in_partition_view::for_range_start(range), + position_in_partition_view::for_range_end(range)); + _row_tombstones.apply(schema, std::move(rt)); } - it = rt_range.end(); } - _row_tombstones.erase(it, _row_tombstones.end()); } } diff --git a/mutation_reader.hh b/mutation_reader.hh index b64605544f..b17a6b841c 100644 --- a/mutation_reader.hh +++ b/mutation_reader.hh @@ -28,6 +28,7 @@ #include #include "tracing/trace_state.hh" #include "flat_mutation_reader.hh" +#include "flat_mutation_reader_v2.hh" #include "reader_concurrency_semaphore.hh" class reader_selector { @@ -164,10 +165,20 @@ class mutation_source { tracing::trace_state_ptr, streamed_mutation::forwarding, mutation_reader::forwarding)>; + using flat_reader_v2_factory_type = std::function; // We could have our own version of std::function<> that is nothrow // move constructible and save some indirection and allocation. // Probably not worth the effort though. + // Either _fn or _fn_v2 is engaged. 
lw_shared_ptr _fn; + lw_shared_ptr _fn_v2; lw_shared_ptr> _presence_checker_factory; private: mutation_source() = default; @@ -179,6 +190,11 @@ public: , _presence_checker_factory(make_lw_shared>(std::move(pcf))) { } + mutation_source(flat_reader_v2_factory_type fn, std::function pcf = [] { return make_default_partition_presence_checker(); }) + : _fn_v2(make_lw_shared(std::move(fn))) + , _presence_checker_factory(make_lw_shared>(std::move(pcf))) + { } + // For sources which don't care about the mutation_reader::forwarding flag (always fast forwardable) mutation_source(std::function fn) @@ -249,6 +265,10 @@ public: streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no, mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::yes) const { + if (_fn_v2) { + return downgrade_to_v1( + (*_fn_v2)(std::move(s), std::move(permit), range, slice, pc, std::move(trace_state), fwd, fwd_mr)); + } return (*_fn)(std::move(s), std::move(permit), range, slice, pc, std::move(trace_state), fwd, fwd_mr); } @@ -262,6 +282,38 @@ public: return this->make_reader(std::move(s), std::move(permit), range, full_slice); } + // Creates a new reader. + // + // All parameters captured by reference must remain live as long as returned + // mutation_reader or streamed_mutation obtained through it are alive. 
+ flat_mutation_reader_v2 + make_reader_v2( + schema_ptr s, + reader_permit permit, + partition_range range, + const query::partition_slice& slice, + io_priority pc = default_priority_class(), + tracing::trace_state_ptr trace_state = nullptr, + streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no, + mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::yes) const + { + if (_fn_v2) { + return (*_fn_v2)(std::move(s), std::move(permit), range, slice, pc, std::move(trace_state), fwd, fwd_mr); + } + return upgrade_to_v2( + (*_fn)(std::move(s), std::move(permit), range, slice, pc, std::move(trace_state), fwd, fwd_mr)); + } + + flat_mutation_reader_v2 + make_reader_v2( + schema_ptr s, + reader_permit permit, + partition_range range = query::full_partition_range) const + { + auto& full_slice = s->full_slice(); + return make_reader_v2(std::move(s), std::move(permit), range, full_slice); + } + partition_presence_checker make_partition_presence_checker() { return (*_presence_checker_factory)(); } diff --git a/mutation_rebuilder.hh b/mutation_rebuilder.hh index 28f0a4a929..c893cc26e8 100644 --- a/mutation_rebuilder.hh +++ b/mutation_rebuilder.hh @@ -22,6 +22,7 @@ #pragma once #include "mutation.hh" +#include "range_tombstone_assembler.hh" class mutation_rebuilder { mutation _m; @@ -58,3 +59,70 @@ public: return mutation_opt(std::move(_m)); } }; + +// Builds the mutation corresponding to the next partition in the mutation fragment stream. +// Implements FlattenedConsumerV2, MutationFragmentConsumerV2 and FlatMutationReaderConsumerV2. +// Does not work with streams in streamed_mutation::forwarding::yes mode. 
+class mutation_rebuilder_v2 { + schema_ptr _s; + std::optional _builder; + range_tombstone_assembler _rt_assembler; +public: + mutation_rebuilder_v2(schema_ptr s) : _s(std::move(s)) { } +public: + stop_iteration consume(partition_start mf) { + consume_new_partition(mf.key()); + return consume(mf.partition_tombstone()); + } + stop_iteration consume(partition_end) { + return consume_end_of_partition(); + } + stop_iteration consume(mutation_fragment_v2&& mf) { + return std::move(mf).consume(*this); + } +public: + void consume_new_partition(const dht::decorated_key& dk) { + assert(!_builder); + _builder = mutation_rebuilder(dk, _s); + } + + stop_iteration consume(tombstone t) { + assert(_builder); + _builder->consume(t); + return stop_iteration::no; + } + + stop_iteration consume(range_tombstone_change&& rt) { + assert(_builder); + if (auto rt_opt = _rt_assembler.consume(*_s, std::move(rt))) { + _builder->consume(std::move(*rt_opt)); + } + return stop_iteration::no; + } + + stop_iteration consume(static_row&& sr) { + assert(_builder); + _builder->consume(std::move(sr)); + return stop_iteration::no; + } + + stop_iteration consume(clustering_row&& cr) { + assert(_builder); + _builder->consume(std::move(cr)); + return stop_iteration::no; + } + + stop_iteration consume_end_of_partition() { + assert(_builder); + _rt_assembler.on_end_of_stream(); + return stop_iteration::yes; + } + + mutation_opt consume_end_of_stream() { + if (!_builder) { + return mutation_opt(); + } + _rt_assembler.on_end_of_stream(); + return _builder->consume_end_of_stream(); + } +}; diff --git a/partition_snapshot_reader.hh b/partition_snapshot_reader.hh index 9d31777c35..6ab956a9c4 100644 --- a/partition_snapshot_reader.hh +++ b/partition_snapshot_reader.hh @@ -251,8 +251,11 @@ class partition_snapshot_flat_reader : public flat_mutation_reader::impl, public position_in_partition::less_compare rt_less(_schema); while (has_more_range_tombstones() && !rt_less(pos, peek_range_tombstone().position())) { 
range_tombstone rt = pop_range_tombstone(); - rt.trim_front(_schema, position_in_partition_view::for_range_start(ck_range)); - _rt_stream.apply(std::move(rt)); + if (rt.trim(_schema, + position_in_partition_view::for_range_start(ck_range), + position_in_partition_view::for_range_end(ck_range))) { + _rt_stream.apply(std::move(rt)); + } } return _rt_stream.get_next(std::move(pos)); diff --git a/position_in_partition.hh b/position_in_partition.hh index 766f8b5d5c..ee597d48b0 100644 --- a/position_in_partition.hh +++ b/position_in_partition.hh @@ -587,12 +587,16 @@ public: , _end(std::move(end)) { } + void set_start(position_in_partition pos) { _start = std::move(pos); } + void set_end(position_in_partition pos) { _end = std::move(pos); } const position_in_partition& start() const& { return _start; } position_in_partition&& start() && { return std::move(_start); } const position_in_partition& end() const& { return _end; } position_in_partition&& end() && { return std::move(_end); } bool contains(const schema& s, position_in_partition_view pos) const; bool overlaps(const schema& s, position_in_partition_view start, position_in_partition_view end) const; + // Returns true iff this range contains all keys contained by position_range(start, end). 
+ bool contains(const schema& s, position_in_partition_view start, position_in_partition_view end) const; bool is_all_clustered_rows(const schema&) const; friend std::ostream& operator<<(std::ostream&, const position_range&); @@ -606,6 +610,12 @@ bool position_range::contains(const schema& s, position_in_partition_view pos) c return !less(pos, _start) && less(pos, _end); } +inline +bool position_range::contains(const schema& s, position_in_partition_view start, position_in_partition_view end) const { + position_in_partition::less_compare less(s); + return !less(start, _start) && !less(_end, end); +} + inline bool position_range::overlaps(const schema& s, position_in_partition_view start, position_in_partition_view end) const { position_in_partition::less_compare less(s); diff --git a/range_tombstone.cc b/range_tombstone.cc index cc780b11ac..ce6cb83761 100644 --- a/range_tombstone.cc +++ b/range_tombstone.cc @@ -21,6 +21,7 @@ #include "range_tombstone.hh" #include "mutation_fragment.hh" +#include "mutation_fragment_v2.hh" #include @@ -32,6 +33,10 @@ std::ostream& operator<<(std::ostream& out, const range_tombstone& rt) { } } +std::ostream& operator<<(std::ostream& out, const range_tombstone_change& rt) { + return out << "{range_tombstone_change: pos=" << rt.position() << ", " << rt.tombstone() << "}"; +} + std::optional range_tombstone::apply(const schema& s, range_tombstone&& src) { bound_view::compare cmp(s); diff --git a/range_tombstone.hh b/range_tombstone.hh index c4a3b74209..210a5c9c40 100644 --- a/range_tombstone.hh +++ b/range_tombstone.hh @@ -158,6 +158,33 @@ public: return true; } + // Intersects the range of this tombstone with [start, end) and replaces + // the range of the tombstone if there is an overlap. + // Returns true if there is an overlap and false otherwise. When returns false, the tombstone + // is not modified. 
+ // + // start and end must satisfy: + // 1) has_clustering_key() == true + // 2) is_clustering_row() == false + // + // Also: start <= end + bool trim(const schema& s, position_in_partition_view start, position_in_partition_view end) { + position_in_partition::less_compare less(s); + if (!less(start, end_position())) { + return false; + } + if (!less(position(), end)) { + return false; + } + if (less(position(), start)) { + set_start(start); + } + if (less(end, end_position())) { + set_end(s, end); + } + return true; + } + // Assumes !pos.is_clustering_row(), because range_tombstone bounds can't represent such positions void set_start(position_in_partition_view pos) { bound_view new_start = pos.as_start_bound_view(); diff --git a/range_tombstone_assembler.hh b/range_tombstone_assembler.hh new file mode 100644 index 0000000000..7f153d7a52 --- /dev/null +++ b/range_tombstone_assembler.hh @@ -0,0 +1,91 @@ +/* + * Copyright (C) 2021 ScyllaDB + */ + +/* + * This file is part of Scylla. + * + * Scylla is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Scylla is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Scylla. If not, see . + */ + +#pragma once + +#include +#include + +#include "mutation_fragment_v2.hh" + +/// Converts a stream of range_tombstone_change fragments to an equivalent stream of range_tombstone objects. +/// The input fragments must be ordered by their position(). +/// The produced range_tombstone objects are non-overlapping and ordered by their position(). 
+/// + /// on_end_of_stream() must be called after consuming all fragments to produce the final fragment. + /// + /// Example usage: + /// + /// range_tombstone_assembler rta; + /// if (auto rt_opt = rta.consume(s, range_tombstone_change(...))) { + /// produce(*rt_opt); + /// } + /// if (auto rt_opt = rta.consume(s, range_tombstone_change(...))) { + /// produce(*rt_opt); + /// } + /// if (auto rt_opt = rta.flush(s, position_in_partition(...))) { + /// produce(*rt_opt); + /// } + /// rta.on_end_of_stream(); + /// + class range_tombstone_assembler { + std::optional _prev_rt; + private: + bool has_active_tombstone() const { + return _prev_rt && _prev_rt->tombstone(); + } + public: + void reset() { + _prev_rt = std::nullopt; + } + + std::optional consume(const schema& s, range_tombstone_change&& rt) { + std::optional rt_opt; + auto less = position_in_partition::less_compare(s); + if (has_active_tombstone() && less(_prev_rt->position(), rt.position())) { + rt_opt = range_tombstone(_prev_rt->position(), rt.position(), _prev_rt->tombstone()); + } + _prev_rt = std::move(rt); + return rt_opt; + } + + void on_end_of_stream() { + if (has_active_tombstone()) { + throw std::logic_error(format("Stream ends with an active range tombstone: {}", *_prev_rt)); + } + } + + // Returns true if and only if flush() may return something. + // Returns false if flush() won't return anything for sure. 
+ bool needs_flush() const { + return has_active_tombstone(); + } + + std::optional flush(const schema& s, position_in_partition_view pos) { + auto less = position_in_partition::less_compare(s); + if (has_active_tombstone() && less(_prev_rt->position(), pos)) { + position_in_partition start = _prev_rt->position(); + _prev_rt->set_position(position_in_partition(pos)); + return range_tombstone(std::move(start), pos, _prev_rt->tombstone()); + } + return std::nullopt; + } +}; diff --git a/range_tombstone_change_generator.hh b/range_tombstone_change_generator.hh new file mode 100644 index 0000000000..792297e986 --- /dev/null +++ b/range_tombstone_change_generator.hh @@ -0,0 +1,135 @@ +/* + * Copyright (C) 2021 ScyllaDB + */ + +/* + * This file is part of Scylla. + * + * Scylla is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Scylla is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Scylla. If not, see . + */ + +#pragma once + +#include "mutation_fragment_v2.hh" +#include "range_tombstone_list.hh" + +template +concept RangeTombstoneChangeConsumer = std::invocable; + +/// Generates range_tombstone_change fragments for a stream of range_tombstone fragments. +/// +/// The input range_tombstones passed to consume() may be overlapping, but must be weakly ordered by position(). +/// It's ok to pass consecutive range_tombstone objects with the same position. +/// +/// Generated range_tombstone_change fragments will have strictly monotonic positions. 
+/// +/// Example usage: +/// +/// consume(range_tombstone(1, +inf, t)); +/// flush(2, consumer); +/// consume(range_tombstone(2, +inf, t)); +/// flush(3, consumer); +/// consume(range_tombstone(4, +inf, t)); +/// consume(range_tombstone(4, 7, t)); +/// flush(5, consumer); +/// flush(6, consumer); +/// +class range_tombstone_change_generator { + range_tombstone_list _range_tombstones; + // All range_tombstone_change fragments with positions < than this have been emitted. + position_in_partition _lower_bound = position_in_partition::before_all_clustered_rows(); + const schema& _schema; +public: + range_tombstone_change_generator(const schema& s) + : _range_tombstones(s) + , _schema(s) + { } + + // Discards deletion information for positions < lower_bound. + // After this, the lowest position of emitted range_tombstone_change will be before_key(lower_bound). + void trim(const position_in_partition& lower_bound) { + position_in_partition::less_compare less(_schema); + + if (lower_bound.is_clustering_row()) { + _lower_bound = position_in_partition::before_key(lower_bound.key()); + } else { + _lower_bound = lower_bound; + } + + while (!_range_tombstones.empty() && !less(lower_bound, _range_tombstones.begin()->end_position())) { + _range_tombstones.pop_as(_range_tombstones.begin()); + } + + if (!_range_tombstones.empty() && less(_range_tombstones.begin()->position(), _lower_bound)) { + // _range_tombstones.begin()->end_position() < lower_bound is guaranteed by previous loop. + _range_tombstones.begin()->set_start(_lower_bound); + } + } + + // Emits range_tombstone_change fragments with positions smaller than upper_bound + // for accumulated range tombstones. + // After this, only range_tombstones with positions >= upper_bound may be added, + // which guarantees that they won't affect the output of this flush. 
+ // FIXME: respect preemption + template + void flush(position_in_partition_view upper_bound, C consumer) { + position_in_partition::less_compare less(_schema); + std::optional prev; + + while (!_range_tombstones.empty() && less(_range_tombstones.begin()->end_position(), upper_bound)) { + auto rt = _range_tombstones.pop_as(_range_tombstones.begin()); + + if (prev && less(prev->end_position(), rt.position())) { // [1] + // previous range tombstone not adjacent, emit gap. + consumer(range_tombstone_change(prev->end_position(), tombstone())); + } + + // Check if start of rt was already emitted, emit if not. + if (!less(rt.position(), _lower_bound)) { + consumer(range_tombstone_change(rt.position(), rt.tomb)); + } + + // Delay emitting end bound in case it's adjacent with the next tombstone. See [1] and [2] + prev = std::move(rt); + } + + // If previous range tombstone not adjacent with current, emit gap. + // It cannot get adjacent later because prev->end_position() < upper_bound, + // so nothing == prev->end_position() can be added after this invocation. + if (prev && (_range_tombstones.empty() + || less(prev->end_position(), _range_tombstones.begin()->position()))) { + consumer(range_tombstone_change(prev->end_position(), tombstone())); // [2] + } + + // Emit the fragment for start bound of a range_tombstone which is overlapping with upper_bound, + // unless no such fragment or already emitted. 
+ if (!_range_tombstones.empty() + && less(_range_tombstones.begin()->position(), upper_bound) + && (!less(_range_tombstones.begin()->position(), _lower_bound))) { + consumer(range_tombstone_change( + _range_tombstones.begin()->position(), _range_tombstones.begin()->tomb)); + } + + _lower_bound = upper_bound; + } + + void consume(range_tombstone rt) { + _range_tombstones.apply(_schema, std::move(rt)); + } + + void reset() { + _range_tombstones.clear(); + _lower_bound = position_in_partition::before_all_clustered_rows(); + } +}; diff --git a/range_tombstone_splitter.hh b/range_tombstone_splitter.hh new file mode 100644 index 0000000000..3ee1606898 --- /dev/null +++ b/range_tombstone_splitter.hh @@ -0,0 +1,54 @@ +/* + * Copyright (C) 2021 ScyllaDB + */ + +/* + * This file is part of Scylla. + * + * Scylla is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Scylla is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Scylla. If not, see . + */ + +#pragma once + +#include "clustering_ranges_walker.hh" +#include "mutation_fragment.hh" + +template +concept SplitterFragmentConsumer = std::invocable; + +/// Takes a stream of range tombstone fragments and trims them to the boundaries of clustering key restrictions. 
+class range_tombstone_splitter { + clustering_ranges_walker& _walker; + range_tombstone_stream _rts; +public: + range_tombstone_splitter(const schema& s, reader_permit permit, clustering_ranges_walker& w) + : _walker(w) + , _rts(s, std::move(permit)) + { } + + template + void flush(position_in_partition_view pos, C consumer) { + while (auto rto = _rts.get_next(pos)) { + consumer(std::move(*rto)); + } + } + + template + void consume(range_tombstone rt, C consumer) { + if (auto rto = _walker.split_tombstone(std::move(rt), _rts)) { + _rts.apply(std::move(*rto)); + } + flush(rt.position(), std::move(consumer)); + } +}; diff --git a/schema_upgrader.hh b/schema_upgrader.hh index e551db8ebf..b4d89b61b9 100644 --- a/schema_upgrader.hh +++ b/schema_upgrader.hh @@ -71,4 +71,52 @@ public: } }; +// A StreamedMutationTransformer which transforms the stream to a different schema +class schema_upgrader_v2 { + schema_ptr _prev; + schema_ptr _new; + std::optional _permit; +private: + row transform(row&& r, column_kind kind) { + row new_row; + r.for_each_cell([&] (column_id id, atomic_cell_or_collection& cell) { + const column_definition& col = _prev->column_at(kind, id); + const column_definition* new_col = _new->get_column_definition(col.name()); + if (new_col) { + converting_mutation_partition_applier::append_cell(new_row, kind, *new_col, col, std::move(cell)); + } + }); + return new_row; + } +public: + schema_upgrader_v2(schema_ptr s) + : _new(std::move(s)) + { } + schema_ptr operator()(schema_ptr old) { + _prev = std::move(old); + return _new; + } + mutation_fragment_v2 consume(static_row&& row) { + return mutation_fragment_v2(*_new, std::move(*_permit), static_row(transform(std::move(row.cells()), column_kind::static_column))); + } + mutation_fragment_v2 consume(clustering_row&& row) { + return mutation_fragment_v2(*_new, std::move(*_permit), clustering_row(row.key(), row.tomb(), row.marker(), + transform(std::move(row.cells()), column_kind::regular_column))); + } + 
mutation_fragment_v2 consume(range_tombstone_change&& rt) { + return mutation_fragment_v2(*_new, std::move(*_permit), std::move(rt)); + } + mutation_fragment_v2 consume(partition_start&& ph) { + return mutation_fragment_v2(*_new, std::move(*_permit), std::move(ph)); + } + mutation_fragment_v2 consume(partition_end&& eop) { + return mutation_fragment_v2(*_new, std::move(*_permit), std::move(eop)); + } + mutation_fragment_v2 operator()(mutation_fragment_v2&& mf) { + _permit = mf.permit(); + return std::move(mf).consume(*this); + } +}; + static_assert(StreamedMutationTranformer); +static_assert(StreamedMutationTranformerV2); diff --git a/scylla-gdb.py b/scylla-gdb.py index c3690aec53..c172eff69e 100755 --- a/scylla-gdb.py +++ b/scylla-gdb.py @@ -1311,10 +1311,16 @@ def find_single_sstable_readers(): # FIXME: this only finds range readers types = [_lookup_type(['sstable_range_wrapping_reader'])] except gdb.error: - types = [_lookup_type(['sstables::sstable_mutation_reader', - 'sstables::sstable_mutation_reader']), - _lookup_type(['sstables::sstable_mutation_reader', - 'sstables::sstable_mutation_reader'])] + try: + # for Scylla <= 4.5 + types = [_lookup_type(['sstables::sstable_mutation_reader', + 'sstables::sstable_mutation_reader']), + _lookup_type(['sstables::sstable_mutation_reader', + 'sstables::sstable_mutation_reader'])] + except gdb.error: + types = [_lookup_type(['sstables::mx::mx_sstable_mutation_reader']), + _lookup_type(['sstables::kl::sstable_mutation_reader'])] + def _lookup_obj(obj_addr, vtable_addr): vtable_pfx = 'vtable for ' diff --git a/sstables/kl/reader.cc b/sstables/kl/reader.cc index 6795709182..e6dd356544 100644 --- a/sstables/kl/reader.cc +++ b/sstables/kl/reader.cc @@ -35,7 +35,7 @@ static inline bytes_view pop_back(std::vector& vec) { class mp_row_consumer_k_l : public row_consumer { private: - mp_row_consumer_reader* _reader; + mp_row_consumer_reader_k_l* _reader; schema_ptr _schema; const query::partition_slice& _slice; bool _out_of_range = 
false; @@ -312,7 +312,7 @@ private: public: mutation_opt mut; - mp_row_consumer_k_l(mp_row_consumer_reader* reader, + mp_row_consumer_k_l(mp_row_consumer_reader_k_l* reader, const schema_ptr schema, reader_permit permit, const query::partition_slice& slice, @@ -329,7 +329,7 @@ public: , _treat_non_compound_rt_as_compound(!sst->has_correct_non_compound_range_tombstones()) { } - mp_row_consumer_k_l(mp_row_consumer_reader* reader, + mp_row_consumer_k_l(mp_row_consumer_reader_k_l* reader, const schema_ptr schema, reader_permit permit, const io_priority_class& pc, @@ -385,7 +385,10 @@ public: ret = flush(); } advance_to(rt); - _in_progress = mutation_fragment(*_schema, permit(), std::move(rt)); + auto rt_opt = _ck_ranges_walker->split_tombstone(rt, _range_tombstones); + if (rt_opt) { + _in_progress = mutation_fragment(*_schema, permit(), std::move(*rt_opt)); + } if (_out_of_range) { ret = push_ready_fragments_out_of_range(); } @@ -667,8 +670,8 @@ public: } // Workaround for #1203 if (!_first_row_encountered) { - if (_ck_ranges_walker->contains_tombstone(rt_pos, rt.end_position())) { - _range_tombstones.apply(std::move(rt)); + if (auto rt_opt = _ck_ranges_walker->split_tombstone(std::move(rt), _range_tombstones)) { + _range_tombstones.apply(std::move(*rt_opt)); } return proceed::yes; } @@ -752,6 +755,14 @@ public: _out_of_range = _is_mutation_end; _fwd_end = std::move(r).end(); + // range_tombstone::trim() requires !is_clustering_row(). 
+ if (r.start().is_clustering_row()) { + r.set_start(position_in_partition::before_key(r.start().key())); + } + if (r.end().is_clustering_row()) { + r.set_end(position_in_partition::before_key(r.end().key())); + } + _range_tombstones.forward_to(r.start()); _ck_ranges_walker->trim_front(std::move(r).start()); @@ -810,6 +821,354 @@ public: } }; +class sstable_mutation_reader : public mp_row_consumer_reader_k_l { + using DataConsumeRowsContext = kl::data_consume_rows_context; + using Consumer = mp_row_consumer_k_l; + static_assert(RowConsumer); + Consumer _consumer; + bool _will_likely_slice = false; + bool _read_enabled = true; + std::unique_ptr _context; + std::unique_ptr _index_reader; + // We avoid unnecessary lookup for single partition reads thanks to this flag + bool _single_partition_read = false; + const dht::partition_range& _pr; + const query::partition_slice& _slice; + streamed_mutation::forwarding _fwd; + mutation_reader::forwarding _fwd_mr; + read_monitor& _monitor; +public: + sstable_mutation_reader(shared_sstable sst, + schema_ptr schema, + reader_permit permit, + const dht::partition_range& pr, + const query::partition_slice& slice, + const io_priority_class& pc, + tracing::trace_state_ptr trace_state, + streamed_mutation::forwarding fwd, + mutation_reader::forwarding fwd_mr, + read_monitor& mon) + : mp_row_consumer_reader_k_l(std::move(schema), permit, std::move(sst)) + , _consumer(this, _schema, std::move(permit), slice, pc, std::move(trace_state), fwd, _sst) + // FIXME: I want to add `&& fwd_mr == mutation_reader::forwarding::no` below + // but can't because many call sites use the default value for + // `mutation_reader::forwarding` which is `yes`. 
+ , _single_partition_read(pr.is_singular()) + , _pr(pr) + , _slice(slice) + , _fwd(fwd) + , _fwd_mr(fwd_mr) + , _monitor(mon) { } + + // Reference to _consumer is passed to data_consume_rows() in the constructor so we must not allow move/copy + sstable_mutation_reader(sstable_mutation_reader&&) = delete; + sstable_mutation_reader(const sstable_mutation_reader&) = delete; + ~sstable_mutation_reader() { + if (_context || _index_reader) { + sstlog.warn("sstable_mutation_reader was not closed. Closing in the background. Backtrace: {}", current_backtrace()); + // FIXME: discarded future. + (void)close(); + } + } +private: + static bool will_likely_slice(const query::partition_slice& slice) { + return (!slice.default_row_ranges().empty() && !slice.default_row_ranges()[0].is_full()) + || slice.get_specific_ranges(); + } + index_reader& get_index_reader() { + if (!_index_reader) { + _index_reader = std::make_unique(_sst, _consumer.permit(), _consumer.io_priority(), _consumer.trace_state()); + } + return *_index_reader; + } + future<> advance_to_next_partition() { + sstlog.trace("reader {}: advance_to_next_partition()", fmt::ptr(this)); + _before_partition = true; + auto& consumer = _consumer; + if (consumer.is_mutation_end()) { + sstlog.trace("reader {}: already at partition boundary", fmt::ptr(this)); + _index_in_current_partition = false; + return make_ready_future<>(); + } + return (_index_in_current_partition + ? 
_index_reader->advance_to_next_partition() + : get_index_reader().advance_to(dht::ring_position_view::for_after_key(*_current_partition_key))).then([this] { + _index_in_current_partition = true; + auto [start, end] = _index_reader->data_file_positions(); + if (end && start > *end) { + _read_enabled = false; + return make_ready_future<>(); + } + assert(_index_reader->element_kind() == indexable_element::partition); + return skip_to(_index_reader->element_kind(), start).then([this] { + _sst->get_stats().on_partition_seek(); + }); + }); + } + future<> read_from_index() { + sstlog.trace("reader {}: read from index", fmt::ptr(this)); + auto tomb = _index_reader->partition_tombstone(); + if (!tomb) { + sstlog.trace("reader {}: no tombstone", fmt::ptr(this)); + return read_from_datafile(); + } + auto pk = _index_reader->partition_key().to_partition_key(*_schema); + auto key = dht::decorate_key(*_schema, std::move(pk)); + _consumer.setup_for_partition(key.key()); + on_next_partition(std::move(key), tombstone(*tomb)); + return make_ready_future<>(); + } + future<> read_from_datafile() { + sstlog.trace("reader {}: read from data file", fmt::ptr(this)); + return _context->consume_input(); + } + // Assumes that we're currently positioned at partition boundary. + future<> read_partition() { + sstlog.trace("reader {}: reading partition", fmt::ptr(this)); + + _end_of_stream = true; // on_next_partition() will set it to true + if (!_read_enabled) { + sstlog.trace("reader {}: eof", fmt::ptr(this)); + return make_ready_future<>(); + } + + if (!_consumer.is_mutation_end()) { + throw malformed_sstable_exception(format("consumer not at partition boundary, position: {}", + position_in_partition_view::printer(*_schema, _consumer.position())), _sst->get_filename()); + } + + // It's better to obtain partition information from the index if we already have it. + // We can save on IO if the user will skip past the front of partition immediately. 
+ // + // It is also better to pay the cost of reading the index if we know that we will + // need to use the index anyway soon. + // + if (_index_in_current_partition) { + if (_context->eof()) { + sstlog.trace("reader {}: eof", fmt::ptr(this)); + return make_ready_future<>(); + } + if (_index_reader->partition_data_ready()) { + return read_from_index(); + } + if (_will_likely_slice) { + return _index_reader->read_partition_data().then([this] { + return read_from_index(); + }); + } + } + + // FIXME: advance index to current partition if _will_likely_slice + return read_from_datafile(); + } + // Can be called from any position. + future<> read_next_partition() { + sstlog.trace("reader {}: read next partition", fmt::ptr(this)); + // If next partition exists then on_next_partition will be called + // and _end_of_stream will be set to false again. + _end_of_stream = true; + if (!_read_enabled || _single_partition_read) { + sstlog.trace("reader {}: eof", fmt::ptr(this)); + return make_ready_future<>(); + } + return advance_to_next_partition().then([this] { + return read_partition(); + }); + } + future<> advance_context(std::optional pos) { + if (!pos || pos->is_before_all_fragments(*_schema)) { + return make_ready_future<>(); + } + assert (_current_partition_key); + return [this] { + if (!_index_in_current_partition) { + _index_in_current_partition = true; + return get_index_reader().advance_to(*_current_partition_key); + } + return make_ready_future(); + }().then([this, pos] { + return get_index_reader().advance_to(*pos).then([this] { + index_reader& idx = *_index_reader; + auto index_position = idx.data_file_positions(); + if (index_position.start <= _context->position()) { + return make_ready_future<>(); + } + return skip_to(idx.element_kind(), index_position.start).then([this, &idx] { + _sst->get_stats().on_partition_seek(); + }); + }); + }); + } + bool is_initialized() const { + return bool(_context); + } + future<> initialize() { + if (_single_partition_read) { + 
_sst->get_stats().on_single_partition_read(); + const auto& key = dht::ring_position_view(_pr.start()->value()); + position_in_partition_view pos = get_slice_upper_bound(*_schema, _slice, key); + const auto present = co_await get_index_reader().advance_lower_and_check_if_present(key, pos); + + if (!present) { + _sst->get_filter_tracker().add_false_positive(); + co_return; + } + + _sst->get_filter_tracker().add_true_positive(); + } else { + _sst->get_stats().on_range_partition_read(); + co_await get_index_reader().advance_to(_pr); + } + + auto [begin, end] = _index_reader->data_file_positions(); + assert(end); + + if (_single_partition_read) { + _read_enabled = (begin != *end); + _context = data_consume_single_partition(*_schema, _sst, _consumer, { begin, *end }); + } else { + sstable::disk_read_range drr{begin, *end}; + auto last_end = _fwd_mr ? _sst->data_size() : drr.end; + _read_enabled = bool(drr); + _context = data_consume_rows(*_schema, _sst, _consumer, std::move(drr), last_end); + } + + _monitor.on_read_started(_context->reader_position()); + _index_in_current_partition = true; + _will_likely_slice = will_likely_slice(_slice); + } + future<> ensure_initialized() { + if (is_initialized()) { + return make_ready_future<>(); + } + return initialize(); + } + future<> skip_to(indexable_element el, uint64_t begin) { + sstlog.trace("sstable_reader: {}: skip_to({} -> {}, el={})", fmt::ptr(_context.get()), _context->position(), begin, static_cast(el)); + if (begin <= _context->position()) { + return make_ready_future<>(); + } + _context->reset(el); + return _context->skip_to(begin); + } +public: + void on_out_of_clustering_range() override { + if (_fwd == streamed_mutation::forwarding::yes) { + _end_of_stream = true; + } else { + this->push_mutation_fragment(mutation_fragment(*_schema, _permit, partition_end())); + _partition_finished = true; + } + } + virtual future<> fast_forward_to(const dht::partition_range& pr, db::timeout_clock::time_point timeout) override { + 
return ensure_initialized().then([this, &pr] { + if (!is_initialized()) { + _end_of_stream = true; + return make_ready_future<>(); + } else { + clear_buffer(); + _partition_finished = true; + _before_partition = true; + _end_of_stream = false; + assert(_index_reader); + auto f1 = _index_reader->advance_to(pr); + return f1.then([this] { + auto [start, end] = _index_reader->data_file_positions(); + assert(end); + if (start != *end) { + _read_enabled = true; + _index_in_current_partition = true; + _context->reset(indexable_element::partition); + return _context->fast_forward_to(start, *end); + } + _index_in_current_partition = false; + _read_enabled = false; + return make_ready_future<>(); + }); + } + }); + } + virtual future<> fill_buffer(db::timeout_clock::time_point timeout) override { + if (_end_of_stream) { + return make_ready_future<>(); + } + if (!is_initialized()) { + return initialize().then([this, timeout] { + if (!is_initialized()) { + _end_of_stream = true; + return make_ready_future<>(); + } else { + return fill_buffer(timeout); + } + }); + } + return do_until([this] { return is_end_of_stream() || is_buffer_full(); }, [this] { + if (_partition_finished) { + if (_before_partition) { + return read_partition(); + } else { + return read_next_partition(); + } + } else { + return do_until([this] { return is_buffer_full() || _partition_finished || _end_of_stream; }, [this] { + _consumer.push_ready_fragments(); + if (is_buffer_full() || _partition_finished || _end_of_stream) { + return make_ready_future<>(); + } + return advance_context(_consumer.maybe_skip()).then([this] { + return _context->consume_input(); + }); + }); + } + }); + } + virtual future<> next_partition() override { + if (is_initialized()) { + if (_fwd == streamed_mutation::forwarding::yes) { + clear_buffer(); + _partition_finished = true; + _end_of_stream = false; + } else { + clear_buffer_to_next_partition(); + if (!_partition_finished && is_buffer_empty()) { + _partition_finished = true; + } + } 
+ } + return make_ready_future<>(); + // If _ds is not created then next_partition() has no effect because there was no partition_start emitted yet. + } + virtual future<> fast_forward_to(position_range cr, db::timeout_clock::time_point timeout) override { + forward_buffer_to(cr.start()); + if (!_partition_finished) { + _end_of_stream = false; + return advance_context(_consumer.fast_forward_to(std::move(cr), timeout)); + } else { + _end_of_stream = true; + return make_ready_future<>(); + } + } + virtual future<> close() noexcept override { + auto close_context = make_ready_future<>(); + if (_context) { + _monitor.on_read_completed(); + // move _context to prevent double-close from destructor. + close_context = _context->close().finally([_ = std::move(_context)] {}); + } + + auto close_index_reader = make_ready_future<>(); + if (_index_reader) { + // move _index_reader to prevent double-close from destructor. + close_index_reader = _index_reader->close().finally([_ = std::move(_index_reader)] {}); + } + + return when_all_succeed(std::move(close_context), std::move(close_index_reader)).discard_result().handle_exception([] (std::exception_ptr ep) { + // close can not fail as it is called either from the destructor or from flat_mutation_reader::close + sstlog.warn("Failed closing of sstable_mutation_reader: {}. 
Ignored since the reader is already done.", ep); + }); + } +}; + flat_mutation_reader make_reader( shared_sstable sstable, schema_ptr schema, @@ -821,7 +1180,7 @@ flat_mutation_reader make_reader( streamed_mutation::forwarding fwd, mutation_reader::forwarding fwd_mr, read_monitor& monitor) { - return make_flat_mutation_reader>( + return make_flat_mutation_reader( std::move(sstable), std::move(schema), std::move(permit), range, slice, pc, std::move(trace_state), fwd, fwd_mr, monitor); } diff --git a/sstables/mutation_fragment_filter.hh b/sstables/mutation_fragment_filter.hh index 97f95bf09b..cdc9802689 100644 --- a/sstables/mutation_fragment_filter.hh +++ b/sstables/mutation_fragment_filter.hh @@ -71,6 +71,11 @@ public: store_and_finish }; + struct clustering_result { + result action; + clustering_ranges_walker::range_tombstones rts; + }; + result apply(const static_row& sr) { bool inside_requested_ranges = _walker.advance_to(sr.position()); _out_of_range |= _walker.out_of_range(); @@ -81,33 +86,40 @@ public: } } - result apply(position_in_partition_view pos) { + clustering_result apply(position_in_partition_view pos, tombstone t) { if (is_after_fwd_window(pos)) { // This happens only when fwd is set _out_of_range = true; - return result::store_and_finish; + clustering_ranges_walker::progress pr = _walker.advance_to(_fwd_end, _walker.current_tombstone()); + if (_walker.current_tombstone()) { + // Close range tombstone before EOS + pr.rts.push_back(range_tombstone_change(_fwd_end, {})); + } + return clustering_result{ + .action = result::store_and_finish, + .rts = std::move(pr.rts) + }; } - bool inside_requested_ranges = _walker.advance_to(pos); - if (!inside_requested_ranges) { + clustering_ranges_walker::progress pr = _walker.advance_to(pos, t); + if (!pr.contained) { _out_of_range |= _walker.out_of_range(); - return result::ignore; + return clustering_result{ + .action = result::ignore, + .rts = std::move(pr.rts) + }; } - return result::emit; + return 
clustering_result{ + .action = result::emit, + .rts = std::move(pr.rts) + }; } - result apply(const range_tombstone& rt) { - bool inside_requested_ranges = _walker.advance_to(rt.position(), rt.end_position()); - if (!inside_requested_ranges) { - _out_of_range |= _walker.out_of_range(); - return result::ignore; - } - if (is_after_fwd_window(rt.position())) { - // This happens only when fwd is set - _out_of_range = true; - return result::store_and_finish; - } else { - return result::emit; - } + clustering_result apply(position_in_partition_view pos) { + return apply(pos, _walker.current_tombstone()); + } + + void set_tombstone(tombstone t) { + _walker.set_tombstone(t); } bool out_of_range() const { @@ -130,10 +142,13 @@ public: */ std::optional fast_forward_to(position_range r) { assert(_fwd); - _walker.trim_front(r.start()); _fwd_end = std::move(r).end(); _out_of_range = !_walker.advance_to(r.start(), _fwd_end); + // Must be after advance_to() so that advance_to() doesn't enter the range. + // Doing so would cause us to not emit a range_tombstone_change for the beginning of the range. 
+ _walker.trim_front(r.start()); + if (_out_of_range) { return {}; } @@ -149,6 +164,9 @@ public: return (_last_lower_bound_counter != _walker.lower_bound_change_counter()); } + tombstone current_tombstone() const { + return _walker.current_tombstone(); + } position_in_partition_view lower_bound() const { return _walker.lower_bound(); diff --git a/sstables/mx/reader.cc b/sstables/mx/reader.cc index 9ee074e808..77f17e0d01 100644 --- a/sstables/mx/reader.cc +++ b/sstables/mx/reader.cc @@ -959,8 +959,19 @@ public: } }; +class mp_row_consumer_reader_mx : public mp_row_consumer_reader_base, public flat_mutation_reader_v2::impl { + friend class sstables::mx::mp_row_consumer_m; +public: + mp_row_consumer_reader_mx(schema_ptr s, reader_permit permit, shared_sstable sst) + : mp_row_consumer_reader_base(std::move(sst)) + , impl(std::move(s), std::move(permit)) + { } + + void on_next_partition(dht::decorated_key, tombstone); +}; + class mp_row_consumer_m : public consumer_m { - mp_row_consumer_reader* _reader; + mp_row_consumer_reader_mx* _reader; schema_ptr _schema; const query::partition_slice& _slice; std::optional _mf_filter; @@ -971,7 +982,7 @@ class mp_row_consumer_m : public consumer_m { const bool _treat_static_row_as_regular; std::optional _in_progress_row; - std::optional _stored_tombstone; + std::optional _stored_tombstone; static_row _in_progress_static_row; bool _inside_static_row = false; @@ -999,39 +1010,45 @@ class mp_row_consumer_m : public consumer_m { return o; } - std::optional _opened_range_tombstone; - - void consume_range_tombstone_start(clustering_key_prefix ck, bound_kind k, tombstone t) { + proceed consume_range_tombstone_start(clustering_key_prefix ck, bound_kind k, tombstone t) { sstlog.trace("mp_row_consumer_m {}: consume_range_tombstone_start(ck={}, k={}, t={})", fmt::ptr(this), ck, k, t); - if (_opened_range_tombstone) { + if (_mf_filter->current_tombstone()) { throw sstables::malformed_sstable_exception( format("Range tombstones have to be 
disjoint: current opened range tombstone {}, new tombstone {}", - *_opened_range_tombstone, t)); + _mf_filter->current_tombstone(), t)); } - _opened_range_tombstone = {std::move(ck), k, std::move(t)}; + auto pos = position_in_partition(position_in_partition::range_tag_t(), k, std::move(ck)); + return on_range_tombstone_change(std::move(pos), t); } proceed consume_range_tombstone_end(clustering_key_prefix ck, bound_kind k, tombstone t) { sstlog.trace("mp_row_consumer_m {}: consume_range_tombstone_end(ck={}, k={}, t={})", fmt::ptr(this), ck, k, t); - if (!_opened_range_tombstone) { + if (!_mf_filter->current_tombstone()) { throw sstables::malformed_sstable_exception( format("Closing range tombstone that wasn't opened: clustering {}, kind {}, tombstone {}", ck, k, t)); } - if (_opened_range_tombstone->tomb != t) { + if (_mf_filter->current_tombstone() != t) { throw sstables::malformed_sstable_exception( format("Range tombstone with ck {} and two different tombstones at ends: {}, {}", - ck, _opened_range_tombstone->tomb, t)); + ck, _mf_filter->current_tombstone(), t)); } + auto pos = position_in_partition(position_in_partition::range_tag_t(), k, std::move(ck)); + return on_range_tombstone_change(std::move(pos), {}); + } - - auto rt = range_tombstone {std::move(_opened_range_tombstone->ck), - _opened_range_tombstone->kind, - std::move(ck), - k, - std::move(t)}; - _opened_range_tombstone.reset(); - return maybe_push_range_tombstone(std::move(rt)); + proceed consume_range_tombstone_boundary(position_in_partition pos, tombstone left, tombstone right) { + sstlog.trace("mp_row_consumer_m {}: consume_range_tombstone_boundary(pos={}, left={}, right={})", fmt::ptr(this), pos, left, right); + if (!_mf_filter->current_tombstone()) { + throw sstables::malformed_sstable_exception( + format("Closing range tombstone that wasn't opened: pos {}, tombstone {}", pos, left)); + } + if (_mf_filter->current_tombstone() != left) { + throw sstables::malformed_sstable_exception( + 
format("Range tombstone at {} and two different tombstones at ends: {}, {}", + pos, _mf_filter->current_tombstone(), left)); + } + return on_range_tombstone_change(std::move(pos), right); } const column_definition& get_column_definition(std::optional column_id) const { @@ -1039,13 +1056,23 @@ class mp_row_consumer_m : public consumer_m { return _schema->column_at(column_type, *column_id); } - inline proceed maybe_push_range_tombstone(range_tombstone&& rt) { - const auto action = _mf_filter->apply(rt); - switch (action) { + inline proceed on_range_tombstone_change(position_in_partition pos, tombstone t) { + sstlog.trace("mp_row_consumer_m {}: on_range_tombstone_change({}, {}->{})", fmt::ptr(this), pos, + _mf_filter->current_tombstone(), t); + + mutation_fragment_filter::clustering_result result = _mf_filter->apply(pos, t); + + for (auto&& rt : result.rts) { + sstlog.trace("mp_row_consumer_m {}: push({})", fmt::ptr(this), rt); + _reader->push_mutation_fragment(mutation_fragment_v2(*_schema, permit(), std::move(rt))); + } + + switch (result.action) { case mutation_fragment_filter::result::emit: - _reader->push_mutation_fragment(mutation_fragment(*_schema, permit(), std::move(rt))); + sstlog.trace("mp_row_consumer_m {}: emit", fmt::ptr(this)); break; case mutation_fragment_filter::result::ignore: + sstlog.trace("mp_row_consumer_m {}: ignore", fmt::ptr(this)); if (_mf_filter->out_of_range()) { _reader->on_out_of_clustering_range(); return proceed::no; @@ -1055,7 +1082,8 @@ class mp_row_consumer_m : public consumer_m { } break; case mutation_fragment_filter::result::store_and_finish: - _stored_tombstone = std::move(rt); + sstlog.trace("mp_row_consumer_m {}: store", fmt::ptr(this)); + _stored_tombstone = range_tombstone_change(pos, t); _reader->on_out_of_clustering_range(); return proceed::no; } @@ -1068,7 +1096,6 @@ class mp_row_consumer_m : public consumer_m { _in_progress_row.reset(); _stored_tombstone.reset(); _mf_filter.reset(); - _opened_range_tombstone.reset(); } 
void check_schema_mismatch(const column_translation::column_info& column_info, const column_definition& column_def) const { @@ -1093,18 +1120,7 @@ class mp_row_consumer_m : public consumer_m { } public: - - /* - * In m format, RTs are represented as separate start and end bounds, - * so setting/resetting RT start is needed so that we could skip using index. - * For this, the following methods need to be defined: - * - * void set_range_tombstone_start(clustering_key_prefix, bound_kind, tombstone); - * void reset_range_tombstone_start(); - */ - constexpr static bool is_setting_range_tombstone_start_supported = true; - - mp_row_consumer_m(mp_row_consumer_reader* reader, + mp_row_consumer_m(mp_row_consumer_reader_mx* reader, const schema_ptr schema, reader_permit permit, const query::partition_slice& slice, @@ -1123,7 +1139,7 @@ public: _cells.reserve(std::max(_schema->static_columns_count(), _schema->regular_columns_count())); } - mp_row_consumer_m(mp_row_consumer_reader* reader, + mp_row_consumer_m(mp_row_consumer_reader_mx* reader, const schema_ptr schema, reader_permit permit, const io_priority_class& pc, @@ -1137,24 +1153,10 @@ public: // See the RowConsumer concept void push_ready_fragments() { - auto maybe_push = [this] (auto&& mfopt) { - if (mfopt) { - assert(_mf_filter); - switch (_mf_filter->apply(*mfopt)) { - case mutation_fragment_filter::result::emit: - _reader->push_mutation_fragment(mutation_fragment(*_schema, permit(), *std::exchange(mfopt, {}))); - break; - case mutation_fragment_filter::result::ignore: - mfopt.reset(); - break; - case mutation_fragment_filter::result::store_and_finish: - _reader->on_out_of_clustering_range(); - break; - } - } - }; - - maybe_push(_stored_tombstone); + if (auto rto = std::move(_stored_tombstone)) { + _stored_tombstone = std::nullopt; + on_range_tombstone_change(rto->position(), rto->tombstone()); + } } std::optional maybe_skip() { @@ -1179,6 +1181,15 @@ public: _reader->on_out_of_clustering_range(); return {}; } + // r 
is used to trim range tombstones and range_tombstone:s can be trimmed only to positions + // which are !is_clustering_row(). Replace with equivalent ranges. + // Long-term we should guarantee this on position_range. + if (r.start().is_clustering_row()) { + r.set_start(position_in_partition::before_key(r.start().key())); + } + if (r.end().is_clustering_row()) { + r.set_end(position_in_partition::before_key(r.end().key())); + } auto skip = _mf_filter->fast_forward_to(std::move(r)); if (skip) { position_in_partition::less_compare less(*_schema); @@ -1201,15 +1212,9 @@ public: * Used for skipping through wide partitions using index when the data block * skipped to starts in the middle of an opened range tombstone. */ - void set_range_tombstone_start(clustering_key_prefix ck, bound_kind k, tombstone t) { - _opened_range_tombstone = {std::move(ck), k, std::move(t)}; - } - - /* - * Resets the previously set range tombstone start if any. - */ - void reset_range_tombstone_start() { - _opened_range_tombstone.reset(); + void set_range_tombstone(tombstone t) { + sstlog.trace("mp_row_consumer_m {}: set_range_tombstone({})", fmt::ptr(this), t); + _mf_filter->set_tombstone(t); } virtual proceed consume_partition_start(sstables::key_view key, sstables::deletion_time deltime) override { @@ -1231,31 +1236,16 @@ public: sstlog.trace("mp_row_consumer_m {}: consume_row_start({})", fmt::ptr(this), key); - // enagaged _in_progress_row means we have already split around this key. - if (_opened_range_tombstone && !_in_progress_row) { - // We have an opened range tombstone which means that the current row is spanned by that RT. - auto ck = key; - bool was_non_full_key = clustering_key::make_full(*_schema, ck); - auto end_kind = was_non_full_key ? 
bound_kind::excl_end : bound_kind::incl_end; - assert(!_stored_tombstone); - auto rt = range_tombstone(std::move(_opened_range_tombstone->ck), - _opened_range_tombstone->kind, - ck, - end_kind, - _opened_range_tombstone->tomb); - sstlog.trace("mp_row_consumer_m {}: push({})", fmt::ptr(this), rt); - _opened_range_tombstone->ck = std::move(ck); - _opened_range_tombstone->kind = was_non_full_key ? bound_kind::incl_start : bound_kind::excl_start; - - if (maybe_push_range_tombstone(std::move(rt)) == proceed::no) { - _in_progress_row.emplace(std::move(key)); - return consumer_m::row_processing_result::retry_later; - } - } - _in_progress_row.emplace(std::move(key)); - switch (_mf_filter->apply(_in_progress_row->position())) { + mutation_fragment_filter::clustering_result res = _mf_filter->apply(_in_progress_row->position()); + + for (auto&& rt : res.rts) { + sstlog.trace("mp_row_consumer_m {}: push({})", fmt::ptr(this), rt); + _reader->push_mutation_fragment(mutation_fragment_v2(*_schema, permit(), std::move(rt))); + } + + switch (res.action) { case mutation_fragment_filter::result::emit: sstlog.trace("mp_row_consumer_m {}: emit", fmt::ptr(this)); return consumer_m::row_processing_result::do_proceed; @@ -1413,8 +1403,7 @@ public: auto ck = clustering_key_prefix::from_range(ecp | boost::adaptors::transformed( [] (const fragmented_temporary_buffer& b) { return fragmented_temporary_buffer::view(b); })); if (kind == bound_kind::incl_start || kind == bound_kind::excl_start) { - consume_range_tombstone_start(std::move(ck), kind, std::move(tomb)); - return proceed(!_reader->is_buffer_full() && !need_preempt()); + return consume_range_tombstone_start(std::move(ck), kind, std::move(tomb)); } else { // *_end kind return consume_range_tombstone_end(std::move(ck), kind, std::move(tomb)); } @@ -1424,23 +1413,20 @@ public: sstables::bound_kind_m kind, tombstone end_tombstone, tombstone start_tombstone) override { - auto result = proceed::yes; auto ck = 
clustering_key_prefix::from_range(ecp | boost::adaptors::transformed( [] (const fragmented_temporary_buffer& b) { return fragmented_temporary_buffer::view(b); })); switch (kind) { - case bound_kind_m::incl_end_excl_start: - result = consume_range_tombstone_end(ck, bound_kind::incl_end, std::move(end_tombstone)); - consume_range_tombstone_start(std::move(ck), bound_kind::excl_start, std::move(start_tombstone)); - break; - case bound_kind_m::excl_end_incl_start: - result = consume_range_tombstone_end(ck, bound_kind::excl_end, std::move(end_tombstone)); - consume_range_tombstone_start(std::move(ck), bound_kind::incl_start, std::move(start_tombstone)); - break; + case bound_kind_m::incl_end_excl_start: { + auto pos = position_in_partition(position_in_partition::range_tag_t(), bound_kind::incl_end, std::move(ck)); + return consume_range_tombstone_boundary(std::move(pos), end_tombstone, start_tombstone); + } + case bound_kind_m::excl_end_incl_start: { + auto pos = position_in_partition(position_in_partition::range_tag_t(), bound_kind::excl_end, std::move(ck)); + return consume_range_tombstone_boundary(std::move(pos), end_tombstone, start_tombstone); + } default: assert(false && "Invalid boundary type"); } - - return result; } virtual proceed consume_row_end() override { @@ -1459,7 +1445,7 @@ public: auto action = _mf_filter->apply(_in_progress_static_row); switch (action) { case mutation_fragment_filter::result::emit: - _reader->push_mutation_fragment(mutation_fragment(*_schema, permit(), std::move(_in_progress_static_row))); + _reader->push_mutation_fragment(mutation_fragment_v2(*_schema, permit(), std::move(_in_progress_static_row))); break; case mutation_fragment_filter::result::ignore: break; @@ -1472,7 +1458,8 @@ public: if (!_cells.empty()) { fill_cells(column_kind::regular_column, _in_progress_row->cells()); } - _reader->push_mutation_fragment(mutation_fragment(*_schema, permit(), *std::exchange(_in_progress_row, {}))); + 
_reader->push_mutation_fragment(mutation_fragment_v2( + *_schema, permit(), *std::exchange(_in_progress_row, {}))); } return proceed(!_reader->is_buffer_full() && !need_preempt()); @@ -1480,26 +1467,14 @@ public: virtual void on_end_of_stream() override { sstlog.trace("mp_row_consumer_m {}: on_end_of_stream()", fmt::ptr(this)); - if (_opened_range_tombstone) { - if (!_mf_filter || _mf_filter->out_of_range()) { + if (_mf_filter && _mf_filter->current_tombstone()) { + if (_mf_filter->out_of_range()) { throw sstables::malformed_sstable_exception("Unclosed range tombstone."); } - auto range_end = _mf_filter->uppermost_bound(); - position_in_partition::less_compare less(*_schema); - auto start_pos = position_in_partition_view(position_in_partition_view::range_tag_t{}, - bound_view(_opened_range_tombstone->ck, _opened_range_tombstone->kind)); - if (less(start_pos, range_end)) { - auto end_bound = range_end.is_clustering_row() - ? position_in_partition_view::after_key(range_end.key()).as_end_bound_view() - : range_end.as_end_bound_view(); - auto rt = range_tombstone {std::move(_opened_range_tombstone->ck), - _opened_range_tombstone->kind, - end_bound.prefix(), - end_bound.kind(), - _opened_range_tombstone->tomb}; + auto result = _mf_filter->apply(position_in_partition_view::after_all_clustered_rows(), {}); + for (auto&& rt : result.rts) { sstlog.trace("mp_row_consumer_m {}: on_end_of_stream(), emitting last tombstone: {}", fmt::ptr(this), rt); - _opened_range_tombstone.reset(); - _reader->push_mutation_fragment(mutation_fragment(*_schema, permit(), std::move(rt))); + _reader->push_mutation_fragment(mutation_fragment_v2(*_schema, permit(), std::move(rt))); } } if (!_reader->_partition_finished) { @@ -1520,7 +1495,7 @@ public: _reader->_index_in_current_partition = false; _reader->_partition_finished = true; _reader->_before_partition = true; - _reader->push_mutation_fragment(mutation_fragment(*_schema, permit(), partition_end())); + 
_reader->push_mutation_fragment(mutation_fragment_v2(*_schema, permit(), partition_end())); return proceed(!_reader->is_buffer_full() && !need_preempt()); } @@ -1530,6 +1505,7 @@ public: reset_for_new_partition(); } else { _in_progress_row.reset(); + _stored_tombstone.reset(); _is_mutation_end = false; } } @@ -1551,6 +1527,361 @@ public: } }; +class mx_sstable_mutation_reader : public mp_row_consumer_reader_mx { + using DataConsumeRowsContext = data_consume_rows_context_m; + using Consumer = mp_row_consumer_m; + static_assert(RowConsumer); + Consumer _consumer; + bool _will_likely_slice = false; + bool _read_enabled = true; + std::unique_ptr _context; + std::unique_ptr _index_reader; + // We avoid unnecessary lookup for single partition reads thanks to this flag + bool _single_partition_read = false; + const dht::partition_range& _pr; + const query::partition_slice& _slice; + streamed_mutation::forwarding _fwd; + mutation_reader::forwarding _fwd_mr; + read_monitor& _monitor; +public: + mx_sstable_mutation_reader(shared_sstable sst, + schema_ptr schema, + reader_permit permit, + const dht::partition_range& pr, + const query::partition_slice& slice, + const io_priority_class& pc, + tracing::trace_state_ptr trace_state, + streamed_mutation::forwarding fwd, + mutation_reader::forwarding fwd_mr, + read_monitor& mon) + : mp_row_consumer_reader_mx(std::move(schema), permit, std::move(sst)) + , _consumer(this, _schema, std::move(permit), slice, pc, std::move(trace_state), fwd, _sst) + // FIXME: I want to add `&& fwd_mr == mutation_reader::forwarding::no` below + // but can't because many call sites use the default value for + // `mutation_reader::forwarding` which is `yes`. 
+ , _single_partition_read(pr.is_singular()) + , _pr(pr) + , _slice(slice) + , _fwd(fwd) + , _fwd_mr(fwd_mr) + , _monitor(mon) { } + + // Reference to _consumer is passed to data_consume_rows() in the constructor so we must not allow move/copy + mx_sstable_mutation_reader(mx_sstable_mutation_reader&&) = delete; + mx_sstable_mutation_reader(const mx_sstable_mutation_reader&) = delete; + ~mx_sstable_mutation_reader() { + if (_context || _index_reader) { + sstlog.warn("sstable_mutation_reader was not closed. Closing in the background. Backtrace: {}", current_backtrace()); + // FIXME: discarded future. + (void)close(); + } + } +private: + static bool will_likely_slice(const query::partition_slice& slice) { + return (!slice.default_row_ranges().empty() && !slice.default_row_ranges()[0].is_full()) + || slice.get_specific_ranges(); + } + index_reader& get_index_reader() { + if (!_index_reader) { + _index_reader = std::make_unique(_sst, _consumer.permit(), _consumer.io_priority(), _consumer.trace_state()); + } + return *_index_reader; + } + future<> advance_to_next_partition() { + sstlog.trace("reader {}: advance_to_next_partition()", fmt::ptr(this)); + _before_partition = true; + auto& consumer = _consumer; + if (consumer.is_mutation_end()) { + sstlog.trace("reader {}: already at partition boundary", fmt::ptr(this)); + _index_in_current_partition = false; + return make_ready_future<>(); + } + return (_index_in_current_partition + ? 
_index_reader->advance_to_next_partition() + : get_index_reader().advance_to(dht::ring_position_view::for_after_key(*_current_partition_key))).then([this] { + _index_in_current_partition = true; + auto [start, end] = _index_reader->data_file_positions(); + if (end && start > *end) { + _read_enabled = false; + return make_ready_future<>(); + } + assert(_index_reader->element_kind() == indexable_element::partition); + return skip_to(_index_reader->element_kind(), start).then([this] { + _sst->get_stats().on_partition_seek(); + }); + }); + } + future<> read_from_index() { + sstlog.trace("reader {}: read from index", fmt::ptr(this)); + auto tomb = _index_reader->partition_tombstone(); + if (!tomb) { + sstlog.trace("reader {}: no tombstone", fmt::ptr(this)); + return read_from_datafile(); + } + auto pk = _index_reader->partition_key().to_partition_key(*_schema); + auto key = dht::decorate_key(*_schema, std::move(pk)); + _consumer.setup_for_partition(key.key()); + on_next_partition(std::move(key), tombstone(*tomb)); + return make_ready_future<>(); + } + future<> read_from_datafile() { + sstlog.trace("reader {}: read from data file", fmt::ptr(this)); + return _context->consume_input(); + } + // Assumes that we're currently positioned at partition boundary. + future<> read_partition() { + sstlog.trace("reader {}: reading partition", fmt::ptr(this)); + + _end_of_stream = true; // on_next_partition() will set it to true + if (!_read_enabled) { + sstlog.trace("reader {}: eof", fmt::ptr(this)); + return make_ready_future<>(); + } + + if (!_consumer.is_mutation_end()) { + throw malformed_sstable_exception(format("consumer not at partition boundary, position: {}", + position_in_partition_view::printer(*_schema, _consumer.position())), _sst->get_filename()); + } + + // It's better to obtain partition information from the index if we already have it. + // We can save on IO if the user will skip past the front of partition immediately. 
+ // + // It is also better to pay the cost of reading the index if we know that we will + // need to use the index anyway soon. + // + if (_index_in_current_partition) { + if (_context->eof()) { + sstlog.trace("reader {}: eof", fmt::ptr(this)); + return make_ready_future<>(); + } + if (_index_reader->partition_data_ready()) { + return read_from_index(); + } + if (_will_likely_slice) { + return _index_reader->read_partition_data().then([this] { + return read_from_index(); + }); + } + } + + // FIXME: advance index to current partition if _will_likely_slice + return read_from_datafile(); + } + // Can be called from any position. + future<> read_next_partition() { + sstlog.trace("reader {}: read next partition", fmt::ptr(this)); + // If next partition exists then on_next_partition will be called + // and _end_of_stream will be set to false again. + _end_of_stream = true; + if (!_read_enabled || _single_partition_read) { + sstlog.trace("reader {}: eof", fmt::ptr(this)); + return make_ready_future<>(); + } + return advance_to_next_partition().then([this] { + return read_partition(); + }); + } + future<> advance_context(std::optional pos) { + if (!pos || pos->is_before_all_fragments(*_schema)) { + return make_ready_future<>(); + } + assert (_current_partition_key); + return [this] { + if (!_index_in_current_partition) { + _index_in_current_partition = true; + return get_index_reader().advance_to(*_current_partition_key); + } + return make_ready_future(); + }().then([this, pos] { + return get_index_reader().advance_to(*pos).then([this] { + index_reader& idx = *_index_reader; + auto index_position = idx.data_file_positions(); + if (index_position.start <= _context->position()) { + return make_ready_future<>(); + } + return skip_to(idx.element_kind(), index_position.start).then([this, &idx] { + _sst->get_stats().on_partition_seek(); + auto open_end_marker = idx.end_open_marker(); + if (open_end_marker) { + _consumer.set_range_tombstone(open_end_marker->tomb); + } else { + 
_consumer.set_range_tombstone({}); + } + }); + }); + }); + } + bool is_initialized() const { + return bool(_context); + } + future<> initialize() { + if (_single_partition_read) { + _sst->get_stats().on_single_partition_read(); + const auto& key = dht::ring_position_view(_pr.start()->value()); + position_in_partition_view pos = get_slice_upper_bound(*_schema, _slice, key); + const auto present = co_await get_index_reader().advance_lower_and_check_if_present(key, pos); + + if (!present) { + _sst->get_filter_tracker().add_false_positive(); + co_return; + } + + _sst->get_filter_tracker().add_true_positive(); + } else { + _sst->get_stats().on_range_partition_read(); + co_await get_index_reader().advance_to(_pr); + } + + auto [begin, end] = _index_reader->data_file_positions(); + assert(end); + + if (_single_partition_read) { + _read_enabled = (begin != *end); + _context = data_consume_single_partition(*_schema, _sst, _consumer, { begin, *end }); + } else { + sstable::disk_read_range drr{begin, *end}; + auto last_end = _fwd_mr ? 
_sst->data_size() : drr.end; + _read_enabled = bool(drr); + _context = data_consume_rows(*_schema, _sst, _consumer, std::move(drr), last_end); + } + + _monitor.on_read_started(_context->reader_position()); + _index_in_current_partition = true; + _will_likely_slice = will_likely_slice(_slice); + } + future<> ensure_initialized() { + if (is_initialized()) { + return make_ready_future<>(); + } + return initialize(); + } + future<> skip_to(indexable_element el, uint64_t begin) { + sstlog.trace("sstable_reader: {}: skip_to({} -> {}, el={})", fmt::ptr(_context.get()), _context->position(), begin, static_cast(el)); + if (begin <= _context->position()) { + return make_ready_future<>(); + } + _context->reset(el); + return _context->skip_to(begin); + } +public: + void on_out_of_clustering_range() override { + if (_fwd == streamed_mutation::forwarding::yes) { + _end_of_stream = true; + } else { + this->push_mutation_fragment(mutation_fragment_v2(*_schema, _permit, partition_end())); + _partition_finished = true; + } + } + virtual future<> fast_forward_to(const dht::partition_range& pr, db::timeout_clock::time_point timeout) override { + return ensure_initialized().then([this, &pr] { + if (!is_initialized()) { + _end_of_stream = true; + return make_ready_future<>(); + } else { + clear_buffer(); + _partition_finished = true; + _before_partition = true; + _end_of_stream = false; + assert(_index_reader); + auto f1 = _index_reader->advance_to(pr); + return f1.then([this] { + auto [start, end] = _index_reader->data_file_positions(); + assert(end); + if (start != *end) { + _read_enabled = true; + _index_in_current_partition = true; + _context->reset(indexable_element::partition); + return _context->fast_forward_to(start, *end); + } + _index_in_current_partition = false; + _read_enabled = false; + return make_ready_future<>(); + }); + } + }); + } + virtual future<> fill_buffer(db::timeout_clock::time_point timeout) override { + if (_end_of_stream) { + return make_ready_future<>(); + 
} + if (!is_initialized()) { + return initialize().then([this, timeout] { + if (!is_initialized()) { + _end_of_stream = true; + return make_ready_future<>(); + } else { + return fill_buffer(timeout); + } + }); + } + return do_until([this] { return is_end_of_stream() || is_buffer_full(); }, [this] { + if (_partition_finished) { + if (_before_partition) { + return read_partition(); + } else { + return read_next_partition(); + } + } else { + return do_until([this] { return is_buffer_full() || _partition_finished || _end_of_stream; }, [this] { + _consumer.push_ready_fragments(); + if (is_buffer_full() || _partition_finished || _end_of_stream) { + return make_ready_future<>(); + } + return advance_context(_consumer.maybe_skip()).then([this] { + return _context->consume_input(); + }); + }); + } + }); + } + virtual future<> next_partition() override { + if (is_initialized()) { + if (_fwd == streamed_mutation::forwarding::yes) { + clear_buffer(); + _partition_finished = true; + _end_of_stream = false; + } else { + clear_buffer_to_next_partition(); + if (!_partition_finished && is_buffer_empty()) { + _partition_finished = true; + } + } + } + return make_ready_future<>(); + // If _ds is not created then next_partition() has no effect because there was no partition_start emitted yet. + } + virtual future<> fast_forward_to(position_range cr, db::timeout_clock::time_point timeout) override { + forward_buffer_to(cr.start()); + if (!_partition_finished) { + _end_of_stream = false; + return advance_context(_consumer.fast_forward_to(std::move(cr), timeout)); + } else { + _end_of_stream = true; + return make_ready_future<>(); + } + } + virtual future<> close() noexcept override { + auto close_context = make_ready_future<>(); + if (_context) { + _monitor.on_read_completed(); + // move _context to prevent double-close from destructor. 
+ close_context = _context->close().finally([_ = std::move(_context)] {}); + } + + auto close_index_reader = make_ready_future<>(); + if (_index_reader) { + // move _index_reader to prevent double-close from destructor. + close_index_reader = _index_reader->close().finally([_ = std::move(_index_reader)] {}); + } + + return when_all_succeed(std::move(close_context), std::move(close_index_reader)).discard_result().handle_exception([] (std::exception_ptr ep) { + // close can not fail as it is called either from the destructor or from flat_mutation_reader::close + sstlog.warn("Failed closing of sstable_mutation_reader: {}. Ignored since the reader is already done.", ep); + }); + } +}; + + flat_mutation_reader make_reader( shared_sstable sstable, schema_ptr schema, @@ -1562,9 +1893,21 @@ flat_mutation_reader make_reader( streamed_mutation::forwarding fwd, mutation_reader::forwarding fwd_mr, read_monitor& monitor) { - return make_flat_mutation_reader>( - std::move(sstable), std::move(schema), std::move(permit), range, slice, pc, std::move(trace_state), fwd, fwd_mr, monitor); + return downgrade_to_v1( + make_flat_mutation_reader_v2( + std::move(sstable), std::move(schema), std::move(permit), range, slice, pc, std::move(trace_state), fwd, fwd_mr, monitor)); } } // namespace mx + +void mx::mp_row_consumer_reader_mx::on_next_partition(dht::decorated_key key, tombstone tomb) { + _partition_finished = false; + _before_partition = false; + _end_of_stream = false; + _current_partition_key = std::move(key); + push_mutation_fragment( + mutation_fragment_v2(*_schema, _permit, partition_start(*_current_partition_key, tomb))); + _sst->get_stats().on_partition_read(); +} + } // namespace sstables diff --git a/sstables/sstable_mutation_reader.cc b/sstables/sstable_mutation_reader.cc index b5233c2508..075c62080d 100644 --- a/sstables/sstable_mutation_reader.cc +++ b/sstables/sstable_mutation_reader.cc @@ -162,7 +162,7 @@ position_in_partition_view get_slice_upper_bound(const schema& s, 
const query::p return position_in_partition_view::for_range_end(ranges.back()); } -void mp_row_consumer_reader::on_next_partition(dht::decorated_key key, tombstone tomb) { +void mp_row_consumer_reader_k_l::on_next_partition(dht::decorated_key key, tombstone tomb) { _partition_finished = false; _before_partition = false; _end_of_stream = false; diff --git a/sstables/sstable_mutation_reader.hh b/sstables/sstable_mutation_reader.hh index 5e7f8cdd4b..f49719795f 100644 --- a/sstables/sstable_mutation_reader.hh +++ b/sstables/sstable_mutation_reader.hh @@ -35,6 +35,7 @@ #include "clustering_ranges_walker.hh" #include "binary_search.hh" #include "../dht/i_partitioner.hh" +#include "flat_mutation_reader_v2.hh" namespace sstables { @@ -46,9 +47,7 @@ namespace mx { class mp_row_consumer_m; } -class mp_row_consumer_reader : public flat_mutation_reader::impl { - friend class sstables::kl::mp_row_consumer_k_l; - friend class sstables::mx::mp_row_consumer_m; +class mp_row_consumer_reader_base { protected: shared_sstable _sst; @@ -67,9 +66,8 @@ protected: std::optional _current_partition_key; public: - mp_row_consumer_reader(schema_ptr s, reader_permit permit, shared_sstable sst) - : impl(std::move(s), std::move(permit)) - , _sst(std::move(sst)) + mp_row_consumer_reader_base(shared_sstable sst) + : _sst(std::move(sst)) { } // Called when all fragments relevant to the query range or fast forwarding window @@ -77,8 +75,17 @@ public: // If no skipping is required, this method may not be called before transitioning // to the next partition. 
virtual void on_out_of_clustering_range() = 0; +}; - void on_next_partition(dht::decorated_key key, tombstone tomb); +class mp_row_consumer_reader_k_l : public mp_row_consumer_reader_base, public flat_mutation_reader::impl { + friend class sstables::kl::mp_row_consumer_k_l; +public: + mp_row_consumer_reader_k_l(schema_ptr s, reader_permit permit, shared_sstable sst) + : mp_row_consumer_reader_base(std::move(sst)) + , impl(std::move(s), std::move(permit)) + {} + + void on_next_partition(dht::decorated_key, tombstone); }; inline atomic_cell make_atomic_cell(const abstract_type& type, @@ -185,371 +192,9 @@ void set_range_tombstone_start_from_end_open_marker(Consumer& c, const schema& s auto open_end_marker = idx.end_open_marker(); if (open_end_marker) { auto[pos, tomb] = *open_end_marker; - if (pos.is_clustering_row()) { - auto ck = pos.key(); - bool was_non_full = clustering_key::make_full(s, ck); - c.set_range_tombstone_start( - std::move(ck), - was_non_full ? bound_kind::incl_start : bound_kind::excl_start, - tomb); - } else { - auto view = position_in_partition_view(pos).as_start_bound_view(); - c.set_range_tombstone_start(view.prefix(), view.kind(), tomb); - } - } else { - c.reset_range_tombstone_start(); + c.set_range_tombstone_start(tomb); } } } -template -requires RowConsumer -class sstable_mutation_reader : public mp_row_consumer_reader { - Consumer _consumer; - bool _will_likely_slice = false; - bool _read_enabled = true; - std::unique_ptr _context; - std::unique_ptr _index_reader; - // We avoid unnecessary lookup for single partition reads thanks to this flag - bool _single_partition_read = false; - const dht::partition_range& _pr; - const query::partition_slice& _slice; - const io_priority_class& _pc; - streamed_mutation::forwarding _fwd; - mutation_reader::forwarding _fwd_mr; - read_monitor& _monitor; -public: - sstable_mutation_reader(shared_sstable sst, - schema_ptr schema, - reader_permit permit, - const dht::partition_range& pr, - const 
query::partition_slice& slice, - const io_priority_class& pc, - tracing::trace_state_ptr trace_state, - streamed_mutation::forwarding fwd, - mutation_reader::forwarding fwd_mr, - read_monitor& mon) - : mp_row_consumer_reader(std::move(schema), permit, std::move(sst)) - , _consumer(this, _schema, std::move(permit), slice, pc, std::move(trace_state), fwd, _sst) - // FIXME: I want to add `&& fwd_mr == mutation_reader::forwarding::no` below - // but can't because many call sites use the default value for - // `mutation_reader::forwarding` which is `yes`. - , _single_partition_read(pr.is_singular()) - , _pr(pr) - , _slice(slice) - , _pc(pc) - , _fwd(fwd) - , _fwd_mr(fwd_mr) - , _monitor(mon) { } - - // Reference to _consumer is passed to data_consume_rows() in the constructor so we must not allow move/copy - sstable_mutation_reader(sstable_mutation_reader&&) = delete; - sstable_mutation_reader(const sstable_mutation_reader&) = delete; - ~sstable_mutation_reader() { - if (_context || _index_reader) { - sstlog.warn("sstable_mutation_reader was not closed. Closing in the background. Backtrace: {}", current_backtrace()); - // FIXME: discarded future. 
- (void)close(); - } - } -private: - static bool will_likely_slice(const query::partition_slice& slice) { - return (!slice.default_row_ranges().empty() && !slice.default_row_ranges()[0].is_full()) - || slice.get_specific_ranges(); - } - index_reader& get_index_reader() { - if (!_index_reader) { - _index_reader = std::make_unique(_sst, _consumer.permit(), _consumer.io_priority(), _consumer.trace_state()); - } - return *_index_reader; - } - future<> advance_to_next_partition() { - sstlog.trace("reader {}: advance_to_next_partition()", fmt::ptr(this)); - _before_partition = true; - auto& consumer = _consumer; - if (consumer.is_mutation_end()) { - sstlog.trace("reader {}: already at partition boundary", fmt::ptr(this)); - _index_in_current_partition = false; - return make_ready_future<>(); - } - return (_index_in_current_partition - ? _index_reader->advance_to_next_partition() - : get_index_reader().advance_to(dht::ring_position_view::for_after_key(*_current_partition_key))).then([this] { - _index_in_current_partition = true; - auto [start, end] = _index_reader->data_file_positions(); - if (end && start > *end) { - _read_enabled = false; - return make_ready_future<>(); - } - assert(_index_reader->element_kind() == indexable_element::partition); - return skip_to(_index_reader->element_kind(), start).then([this] { - _sst->get_stats().on_partition_seek(); - }); - }); - } - future<> read_from_index() { - sstlog.trace("reader {}: read from index", fmt::ptr(this)); - auto tomb = _index_reader->partition_tombstone(); - if (!tomb) { - sstlog.trace("reader {}: no tombstone", fmt::ptr(this)); - return read_from_datafile(); - } - auto pk = _index_reader->partition_key().to_partition_key(*_schema); - auto key = dht::decorate_key(*_schema, std::move(pk)); - _consumer.setup_for_partition(key.key()); - on_next_partition(std::move(key), tombstone(*tomb)); - return make_ready_future<>(); - } - future<> read_from_datafile() { - sstlog.trace("reader {}: read from data file", 
fmt::ptr(this)); - return _context->consume_input(); - } - // Assumes that we're currently positioned at partition boundary. - future<> read_partition() { - sstlog.trace("reader {}: reading partition", fmt::ptr(this)); - - _end_of_stream = true; // on_next_partition() will set it to true - if (!_read_enabled) { - sstlog.trace("reader {}: eof", fmt::ptr(this)); - return make_ready_future<>(); - } - - if (!_consumer.is_mutation_end()) { - throw malformed_sstable_exception(format("consumer not at partition boundary, position: {}", - position_in_partition_view::printer(*_schema, _consumer.position())), _sst->get_filename()); - } - - // It's better to obtain partition information from the index if we already have it. - // We can save on IO if the user will skip past the front of partition immediately. - // - // It is also better to pay the cost of reading the index if we know that we will - // need to use the index anyway soon. - // - if (_index_in_current_partition) { - if (_context->eof()) { - sstlog.trace("reader {}: eof", fmt::ptr(this)); - return make_ready_future<>(); - } - if (_index_reader->partition_data_ready()) { - return read_from_index(); - } - if (_will_likely_slice) { - return _index_reader->read_partition_data().then([this] { - return read_from_index(); - }); - } - } - - // FIXME: advance index to current partition if _will_likely_slice - return read_from_datafile(); - } - // Can be called from any position. - future<> read_next_partition() { - sstlog.trace("reader {}: read next partition", fmt::ptr(this)); - // If next partition exists then on_next_partition will be called - // and _end_of_stream will be set to false again. 
- _end_of_stream = true; - if (!_read_enabled || _single_partition_read) { - sstlog.trace("reader {}: eof", fmt::ptr(this)); - return make_ready_future<>(); - } - return advance_to_next_partition().then([this] { - return read_partition(); - }); - } - future<> advance_context(std::optional pos) { - if (!pos || pos->is_before_all_fragments(*_schema)) { - return make_ready_future<>(); - } - assert (_current_partition_key); - return [this] { - if (!_index_in_current_partition) { - _index_in_current_partition = true; - return get_index_reader().advance_to(*_current_partition_key); - } - return make_ready_future(); - }().then([this, pos] { - return get_index_reader().advance_to(*pos).then([this] { - index_reader& idx = *_index_reader; - auto index_position = idx.data_file_positions(); - if (index_position.start <= _context->position()) { - return make_ready_future<>(); - } - return skip_to(idx.element_kind(), index_position.start).then([this, &idx] { - _sst->get_stats().on_partition_seek(); - set_range_tombstone_start_from_end_open_marker(_consumer, *_schema, idx); - }); - }); - }); - } - bool is_initialized() const { - return bool(_context); - } - future<> initialize() { - if (_single_partition_read) { - _sst->get_stats().on_single_partition_read(); - const auto& key = dht::ring_position_view(_pr.start()->value()); - position_in_partition_view pos = get_slice_upper_bound(*_schema, _slice, key); - const auto present = co_await get_index_reader().advance_lower_and_check_if_present(key, pos); - - if (!present) { - _sst->get_filter_tracker().add_false_positive(); - co_return; - } - - _sst->get_filter_tracker().add_true_positive(); - } else { - _sst->get_stats().on_range_partition_read(); - co_await get_index_reader().advance_to(_pr); - } - - auto [begin, end] = _index_reader->data_file_positions(); - assert(end); - - if (_single_partition_read) { - _read_enabled = (begin != *end); - _context = data_consume_single_partition(*_schema, _sst, _consumer, { begin, *end }); - } 
else { - sstable::disk_read_range drr{begin, *end}; - auto last_end = _fwd_mr ? _sst->data_size() : drr.end; - _read_enabled = bool(drr); - _context = data_consume_rows(*_schema, _sst, _consumer, std::move(drr), last_end); - } - - _monitor.on_read_started(_context->reader_position()); - _index_in_current_partition = true; - _will_likely_slice = will_likely_slice(_slice); - } - future<> ensure_initialized() { - if (is_initialized()) { - return make_ready_future<>(); - } - return initialize(); - } - future<> skip_to(indexable_element el, uint64_t begin) { - sstlog.trace("sstable_reader: {}: skip_to({} -> {}, el={})", fmt::ptr(_context.get()), _context->position(), begin, static_cast(el)); - if (begin <= _context->position()) { - return make_ready_future<>(); - } - _context->reset(el); - return _context->skip_to(begin); - } -public: - void on_out_of_clustering_range() override { - if (_fwd == streamed_mutation::forwarding::yes) { - _end_of_stream = true; - } else { - this->push_mutation_fragment(mutation_fragment(*_schema, _permit, partition_end())); - _partition_finished = true; - } - } - virtual future<> fast_forward_to(const dht::partition_range& pr, db::timeout_clock::time_point timeout) override { - return ensure_initialized().then([this, &pr] { - if (!is_initialized()) { - _end_of_stream = true; - return make_ready_future<>(); - } else { - clear_buffer(); - _partition_finished = true; - _before_partition = true; - _end_of_stream = false; - assert(_index_reader); - auto f1 = _index_reader->advance_to(pr); - return f1.then([this] { - auto [start, end] = _index_reader->data_file_positions(); - assert(end); - if (start != *end) { - _read_enabled = true; - _index_in_current_partition = true; - _context->reset(indexable_element::partition); - return _context->fast_forward_to(start, *end); - } - _index_in_current_partition = false; - _read_enabled = false; - return make_ready_future<>(); - }); - } - }); - } - virtual future<> fill_buffer(db::timeout_clock::time_point 
timeout) override { - if (_end_of_stream) { - return make_ready_future<>(); - } - if (!is_initialized()) { - return initialize().then([this, timeout] { - if (!is_initialized()) { - _end_of_stream = true; - return make_ready_future<>(); - } else { - return fill_buffer(timeout); - } - }); - } - return do_until([this] { return is_end_of_stream() || is_buffer_full(); }, [this] { - if (_partition_finished) { - if (_before_partition) { - return read_partition(); - } else { - return read_next_partition(); - } - } else { - return do_until([this] { return is_buffer_full() || _partition_finished || _end_of_stream; }, [this] { - _consumer.push_ready_fragments(); - if (is_buffer_full() || _partition_finished || _end_of_stream) { - return make_ready_future<>(); - } - return advance_context(_consumer.maybe_skip()).then([this] { - return _context->consume_input(); - }); - }); - } - }); - } - virtual future<> next_partition() override { - if (is_initialized()) { - if (_fwd == streamed_mutation::forwarding::yes) { - clear_buffer(); - _partition_finished = true; - _end_of_stream = false; - } else { - clear_buffer_to_next_partition(); - if (!_partition_finished && is_buffer_empty()) { - _partition_finished = true; - } - } - } - return make_ready_future<>(); - // If _ds is not created then next_partition() has no effect because there was no partition_start emitted yet. - } - virtual future<> fast_forward_to(position_range cr, db::timeout_clock::time_point timeout) override { - forward_buffer_to(cr.start()); - if (!_partition_finished) { - _end_of_stream = false; - return advance_context(_consumer.fast_forward_to(std::move(cr), timeout)); - } else { - _end_of_stream = true; - return make_ready_future<>(); - } - } - virtual future<> close() noexcept override { - auto close_context = make_ready_future<>(); - if (_context) { - _monitor.on_read_completed(); - // move _context to prevent double-close from destructor. 
- close_context = _context->close().finally([_ = std::move(_context)] {}); - } - - auto close_index_reader = make_ready_future<>(); - if (_index_reader) { - // move _index_reader to prevent double-close from destructor. - close_index_reader = _index_reader->close().finally([_ = std::move(_index_reader)] {}); - } - - return when_all_succeed(std::move(close_context), std::move(close_index_reader)).discard_result().handle_exception([] (std::exception_ptr ep) { - // close can not fail as it is called either from the destructor or from flat_mutation_reader::close - sstlog.warn("Failed closing of sstable_mutation_reader: {}. Ignored since the reader is already done.", ep); - }); - } -}; - } diff --git a/test/boost/broken_sstable_test.cc b/test/boost/broken_sstable_test.cc index 1436bcc8df..04678416ad 100644 --- a/test/boost/broken_sstable_test.cc +++ b/test/boost/broken_sstable_test.cc @@ -142,9 +142,9 @@ SEASTAR_THREAD_TEST_CASE(broken_open_tombstone) { .with_column("val", utf8_type, column_kind::regular_column) .build(schema_builder::compact_storage::no); broken_sst("test/resource/sstables/broken_open_tombstone", 122, s, - "Range tombstones have to be disjoint: current opened range tombstone { clustering: " - "ckp{00056b65793262}, kind: incl start, tombstone: {tombstone: timestamp=1544745393692803, " - "deletion_time=1544745393} }, new tombstone {tombstone: timestamp=1544745393692803, " + "Range tombstones have to be disjoint: current opened range tombstone " + "{tombstone: timestamp=1544745393692803, deletion_time=1544745393}, " + "new tombstone {tombstone: timestamp=1544745393692803, " "deletion_time=1544745393} in sstable " "test/resource/sstables/broken_open_tombstone/mc-122-big-Data.db", sstable::version_types::mc); diff --git a/test/boost/sstable_3_x_test.cc b/test/boost/sstable_3_x_test.cc index 6fe3e0b76e..040db3c1c0 100644 --- a/test/boost/sstable_3_x_test.cc +++ b/test/boost/sstable_3_x_test.cc @@ -792,7 +792,7 @@ 
SEASTAR_THREAD_TEST_CASE(test_uncompressed_filtering_and_forwarding_range_tombst // when we fast-forward to a block that doesn't have an end open marker. { r.fast_forward_to(to_full_ck(13413, 13413), to_non_full_ck(13417)); - auto slice = make_clustering_range(to_non_full_ck(13412), to_non_full_ck(13417)); + auto slice = make_clustering_range(to_full_ck(13413, 13413), to_non_full_ck(13417)); r.produces_range_tombstone( make_range_tombstone(to_non_full_ck(13412), to_non_full_ck(13413), tomb), {slice}) @@ -805,7 +805,7 @@ SEASTAR_THREAD_TEST_CASE(test_uncompressed_filtering_and_forwarding_range_tombst { r.fast_forward_to(to_non_full_ck(13417), to_non_full_ck(13420)); - auto slice = make_clustering_range(to_non_full_ck(13419), to_non_full_ck(13420)); + auto slice = make_clustering_range(to_non_full_ck(13417), to_non_full_ck(13420)); r.produces_row(to_full_ck(13417, 13417), to_expected(13417)) .produces_range_tombstone( make_range_tombstone(to_non_full_ck(13418), to_non_full_ck(13419), tomb), @@ -852,6 +852,7 @@ SEASTAR_THREAD_TEST_CASE(test_uncompressed_filtering_and_forwarding_range_tombst auto r = make_assertions(sst.make_reader(query::full_partition_range, slice)); std::array rt_deletion_times {1534898600, 1534899416}; for (auto pkey : boost::irange(1, 3)) { + auto slices = slice.get_all_ranges(); const tombstone tomb = make_tombstone(1525385507816568, rt_deletion_times[pkey - 1]); r.produces_partition_start(to_pkey(pkey)) .produces_static_row({{st_cdef, int32_type->decompose(static_row_values[pkey - 1])}}); @@ -862,31 +863,28 @@ SEASTAR_THREAD_TEST_CASE(test_uncompressed_filtering_and_forwarding_range_tombst } { - auto slice = make_clustering_range(to_non_full_ck(13412), to_non_full_ck(13417)); r.produces_range_tombstone( make_range_tombstone(to_non_full_ck(13412), to_non_full_ck(13413), tomb), - {slice}) + slices) .produces_row(to_full_ck(13414, 13414), to_expected(13414)) .produces_range_tombstone( make_range_tombstone(to_non_full_ck(13415), 
to_non_full_ck(13416), tomb), - {slice}); + slices); } { - auto slice = make_clustering_range(to_non_full_ck(13419), to_non_full_ck(13420)); r.produces_row(to_full_ck(13417, 13417), to_expected(13417)) .produces_range_tombstone( make_range_tombstone(to_non_full_ck(13418), to_non_full_ck(13419), tomb), - {slice}) + slices) .produces_row(to_full_ck(13420, 13420), to_expected(13420)) .produces_row(to_full_ck(13423, 13423), to_expected(13423)); } { - auto slice = make_clustering_range(to_non_full_ck(13425), to_non_full_ck(13426)); r.produces_range_tombstone( make_range_tombstone(to_non_full_ck(13424), to_non_full_ck(13425), tomb), - {slice}); + slices); } r.next_partition(); @@ -914,6 +912,7 @@ SEASTAR_THREAD_TEST_CASE(test_uncompressed_filtering_and_forwarding_range_tombst std::array rt_deletion_times {1534898600, 1534899416}; for (auto pkey : boost::irange(1, 3)) { + auto slices = slice.get_all_ranges(); const tombstone tomb = make_tombstone(1525385507816568, rt_deletion_times[pkey - 1]); r.produces_partition_start(to_pkey(pkey)) .produces_static_row({{st_cdef, int32_type->decompose(static_row_values[pkey - 1])}}) @@ -929,31 +928,28 @@ SEASTAR_THREAD_TEST_CASE(test_uncompressed_filtering_and_forwarding_range_tombst r.fast_forward_to(to_non_full_ck(13000), to_non_full_ck(15000)); { - auto slice = make_clustering_range(to_non_full_ck(13412), to_non_full_ck(13417)); r.produces_range_tombstone( make_range_tombstone(to_non_full_ck(13412), to_non_full_ck(13413), tomb), - {slice}) + slices) .produces_row(to_full_ck(13414, 13414), to_expected(13414)) .produces_range_tombstone( make_range_tombstone(to_non_full_ck(13415), to_non_full_ck(13416), tomb), - {slice}); + slices); } { - auto slice = make_clustering_range(to_non_full_ck(13419), to_non_full_ck(13420)); r.produces_row(to_full_ck(13417, 13417), to_expected(13417)) .produces_range_tombstone( make_range_tombstone(to_non_full_ck(13418), to_non_full_ck(13419), tomb), - {slice}) + slices) .produces_row(to_full_ck(13420, 
13420), to_expected(13420)) .produces_row(to_full_ck(13423, 13423), to_expected(13423)); } { - auto slice = make_clustering_range(to_non_full_ck(13425), to_non_full_ck(13426)); r.produces_range_tombstone( make_range_tombstone(to_non_full_ck(13424), to_non_full_ck(13425), tomb), - {slice}); + slices); } r.produces_end_of_stream(); @@ -1118,7 +1114,7 @@ SEASTAR_THREAD_TEST_CASE(test_uncompressed_slicing_interleaved_rows_and_rts_read r.produces_partition_start(to_pkey(1)); r.fast_forward_to(to_full_ck(7460, 7461), to_full_ck(7500, 7501)); { - auto clustering_range = make_clustering_range(to_non_full_ck(7000), to_non_full_ck(8000)); + auto clustering_range = make_clustering_range(to_full_ck(7460, 7461), to_full_ck(7500, 7501)); range_tombstone rt = make_rt_excl_start(to_full_ck(7459, 7459), to_non_full_ck(7460), tomb); @@ -1150,11 +1146,10 @@ SEASTAR_THREAD_TEST_CASE(test_uncompressed_slicing_interleaved_rows_and_rts_read r.produces_partition_start(to_pkey(1)) .produces_static_row({{st_cdef, int32_type->decompose(int32_t(555))}}); - auto clustering_range = make_clustering_range(to_non_full_ck(7000), to_non_full_ck(8000)); range_tombstone rt = make_rt_excl_start(to_full_ck(7459, 7459), to_non_full_ck(7460), tomb); - r.produces_range_tombstone(rt, {clustering_range}); + r.produces_range_tombstone(rt, slice.get_all_ranges()); for (auto idx : boost::irange(7461, 7501, 5)) { range_tombstone rt1 = @@ -1162,9 +1157,9 @@ SEASTAR_THREAD_TEST_CASE(test_uncompressed_slicing_interleaved_rows_and_rts_read range_tombstone rt2 = make_rt_excl_start(to_full_ck(idx + 3, idx + 3), to_non_full_ck(idx + 4), tomb); - r.produces_range_tombstone(rt1, {clustering_range}) + r.produces_range_tombstone(rt1, slice.get_all_ranges()) .produces_row(to_full_ck(idx + 3, idx + 3), to_expected(idx + 3)) - .produces_range_tombstone(rt2, {clustering_range}); + .produces_range_tombstone(rt2, slice.get_all_ranges()); } r.produces_partition_end() .produces_end_of_stream(); @@ -1185,7 +1180,7 @@ 
SEASTAR_THREAD_TEST_CASE(test_uncompressed_slicing_interleaved_rows_and_rts_read r.produces_partition_start(to_pkey(1)); r.fast_forward_to(to_full_ck(7460, 7461), to_full_ck(7600, 7601)); - auto clustering_range = make_clustering_range(to_non_full_ck(7000), to_non_full_ck(8000)); + auto clustering_range = make_clustering_range(to_full_ck(7470, 7471), to_full_ck(7500, 7501)); range_tombstone rt = make_rt_excl_start(to_full_ck(7469, 7469), to_non_full_ck(7470), tomb); diff --git a/test/boost/sstable_conforms_to_mutation_source_test.cc b/test/boost/sstable_conforms_to_mutation_source_test.cc index 69466fc5e2..ee12822b7a 100644 --- a/test/boost/sstable_conforms_to_mutation_source_test.cc +++ b/test/boost/sstable_conforms_to_mutation_source_test.cc @@ -28,6 +28,10 @@ #include "sstables/sstables.hh" #include "test/lib/mutation_source_test.hh" #include "test/lib/sstable_utils.hh" +#include "row_cache.hh" +#include "test/lib/simple_schema.hh" +#include "partition_slice_builder.hh" +#include "test/lib/flat_mutation_reader_assertions.hh" using namespace sstables; using namespace std::chrono_literals; @@ -38,17 +42,57 @@ mutation_source make_sstable_mutation_source(sstables::test_env& env, schema_ptr return as_mutation_source(make_sstable(env, s, dir, std::move(mutations), cfg, version, query_time)); } -// Must be run in a seastar thread -static -void test_mutation_source(sstables::test_env& env, sstable_writer_config cfg, sstables::sstable::version_types version) { - std::vector dirs; - run_mutation_source_tests([&env, &dirs, &cfg, version] (schema_ptr s, const std::vector& partitions, - gc_clock::time_point query_time) -> mutation_source { - dirs.emplace_back(); - return make_sstable_mutation_source(env, s, dirs.back().path().string(), partitions, cfg, version, query_time); +static void consume_all(flat_mutation_reader& rd) { + while (auto mfopt = rd(db::no_timeout).get0()) {} +} + +// It is assumed that src won't change. 
+static snapshot_source snapshot_source_from_snapshot(mutation_source src) { + return snapshot_source([src = std::move(src)] { + return src; }); } +static +void test_cache_population_with_range_tombstone_adjacent_to_population_range(populate_fn_ex populate) { + simple_schema s; + auto cache_mt = make_lw_shared(s.schema()); + + auto pkey = s.make_pkey(); + + // underlying should not be empty, otherwise cache will make the whole range continuous + mutation m1(s.schema(), pkey); + s.add_row(m1, s.make_ckey(0), "v1"); + s.add_row(m1, s.make_ckey(1), "v2"); + s.add_row(m1, s.make_ckey(2), "v3"); + s.delete_range(m1, s.make_ckey_range(2, 100)); + cache_mt->apply(m1); + + cache_tracker tracker; + auto ms = populate(s.schema(), std::vector({m1}), gc_clock::now()); + row_cache cache(s.schema(), snapshot_source_from_snapshot(std::move(ms)), tracker); + + auto pr = dht::partition_range::make_singular(pkey); + + auto populate_range = [&] (int start) { + auto slice = partition_slice_builder(*s.schema()) + .with_range(query::clustering_range::make_singular(s.make_ckey(start))) + .build(); + auto rd = cache.make_reader(s.schema(), tests::make_permit(), pr, slice); + auto close_rd = deferred_close(rd); + consume_all(rd); + }; + + populate_range(2); + + // The cache now has only row with ckey 2 populated and the rest is discontinuous. + // Populating reader which stops populating at entry with ckey 2 should not forget + // to emit range_tombstone which starts at before(2). 
+ + assert_that(cache.make_reader(s.schema(), tests::make_permit())) + .produces(m1) + .produces_end_of_stream(); +} SEASTAR_TEST_CASE(test_sstable_conforms_to_mutation_source) { return sstables::test_env::do_with_async([] (sstables::test_env& env) { @@ -56,7 +100,20 @@ SEASTAR_TEST_CASE(test_sstable_conforms_to_mutation_source) { for (auto index_block_size : {1, 128, 64*1024}) { sstable_writer_config cfg = env.manager().configure_writer(); cfg.promoted_index_block_size = index_block_size; - test_mutation_source(env, cfg, version); + + std::vector dirs; + auto populate = [&env, &dirs, &cfg, version] (schema_ptr s, const std::vector& partitions, + gc_clock::time_point query_time) -> mutation_source { + dirs.emplace_back(); + return make_sstable_mutation_source(env, s, dirs.back().path().string(), partitions, cfg, version, query_time); + }; + + run_mutation_source_tests(populate); + + if (index_block_size == 1) { + // The tests below are not sensitive to index bock size so run once. + test_cache_population_with_range_tombstone_adjacent_to_population_range(populate); + } } } }); diff --git a/test/boost/sstable_datafile_test.cc b/test/boost/sstable_datafile_test.cc index fe490b77af..df698e444a 100644 --- a/test/boost/sstable_datafile_test.cc +++ b/test/boost/sstable_datafile_test.cc @@ -56,7 +56,6 @@ #include "test/lib/index_reader_assertions.hh" #include "test/lib/flat_mutation_reader_assertions.hh" #include "test/lib/make_random_string.hh" -#include "test/lib/normalizing_reader.hh" #include "test/lib/sstable_run_based_compaction_strategy_for_tests.hh" #include "compatible_ring_position.hh" #include "mutation_compactor.hh" @@ -1032,22 +1031,6 @@ static flat_mutation_reader sstable_reader(shared_sstable sst, schema_ptr s, con return sst->as_mutation_source().make_reader(s, tests::make_permit(), pr, s->full_slice()); } -// We don't need to normalize the sstable reader for 'mc' format -// because it is naturally normalized now. 
-static flat_mutation_reader make_normalizing_sstable_reader( - shared_sstable sst, schema_ptr s, const dht::partition_range& pr) { - auto sstable_reader = sst->as_mutation_source().make_reader(s, tests::make_permit(), pr, s->full_slice()); - if (sst->get_version() == sstables::sstable::version_types::mc) { - return sstable_reader; - } - - return make_normalizing_reader(std::move(sstable_reader)); -} - -static flat_mutation_reader make_normalizing_sstable_reader(shared_sstable sst, schema_ptr s) { - return make_normalizing_sstable_reader(sst, s, query::full_partition_range); -} - SEASTAR_TEST_CASE(compaction_manager_test) { return test_env::do_with_async([] (test_env& env) { BOOST_REQUIRE(smp::count == 1); @@ -2711,7 +2694,7 @@ SEASTAR_TEST_CASE(test_wrong_range_tombstone_order) { auto sst = env.make_sstable(s, get_test_dir("wrong_range_tombstone_order", s), 1, version, big); sst->load().get0(); - auto reader = make_normalizing_sstable_reader(sst, s); + auto reader = sstable_reader(sst, s); using kind = mutation_fragment::kind; assert_that(std::move(reader)) @@ -2723,9 +2706,7 @@ SEASTAR_TEST_CASE(test_wrong_range_tombstone_order) { .produces(kind::clustering_row, { 1, 2, 3 }) .produces(kind::range_tombstone, { 1, 3 }) .produces(kind::clustering_row, { 1, 3 }) - .produces(kind::range_tombstone, { 1, 3 }, true) .produces(kind::clustering_row, { 1, 3, 4 }) - .produces(kind::range_tombstone, { 1, 3, 4 }) .produces(kind::clustering_row, { 1, 4 }) .produces(kind::clustering_row, { 1, 4, 0 }) .produces(kind::range_tombstone, { 2 }) @@ -3384,15 +3365,21 @@ SEASTAR_TEST_CASE(test_promoted_index_read) { auto pkey = partition_key::from_exploded(*s, { int32_type->decompose(0) }); auto dkey = dht::decorate_key(*s, std::move(pkey)); - auto rd = make_normalizing_sstable_reader(sst, s); + auto ck1 = clustering_key::from_exploded(*s, {int32_type->decompose(0)}); + auto ck2 = clustering_key::from_exploded(*s, {int32_type->decompose(0), int32_type->decompose(0)}); + auto ck3 = 
clustering_key::from_exploded(*s, {int32_type->decompose(0), int32_type->decompose(1)}); + + auto rd = sstable_reader(sst, s); using kind = mutation_fragment::kind; assert_that(std::move(rd)) .produces_partition_start(dkey) .produces(kind::range_tombstone, { 0 }) .produces(kind::clustering_row, { 0, 0 }) - .produces(kind::range_tombstone, { 0, 0 }) + .may_produce_tombstones({position_in_partition::after_key(ck2), + position_in_partition::before_key(ck3)}) .produces(kind::clustering_row, { 0, 1 }) - .produces(kind::range_tombstone, { 0, 1 }) + .may_produce_tombstones({position_in_partition::after_key(ck2), + position_in_partition(position_in_partition::range_tag_t(), bound_kind::incl_end, std::move(ck1))}) .produces_partition_end() .produces_end_of_stream(); } diff --git a/test/boost/sstable_mutation_test.cc b/test/boost/sstable_mutation_test.cc index 430769d51d..fd0ba14d97 100644 --- a/test/boost/sstable_mutation_test.cc +++ b/test/boost/sstable_mutation_test.cc @@ -1040,7 +1040,7 @@ SEASTAR_TEST_CASE(test_promoted_index_blocks_are_monotonic_compound_dense) { { auto slice = partition_slice_builder(*s).with_range(query::clustering_range::make_starting_with({ck1})).build(); assert_that(sst->as_mutation_source().make_reader(s, tests::make_permit(), dht::partition_range::make_singular(dk), slice)) - .produces(m) + .produces(m, slice.get_all_ranges()) .produces_end_of_stream(); } } @@ -1151,7 +1151,7 @@ SEASTAR_TEST_CASE(test_promoted_index_repeats_open_tombstones) { { auto slice = partition_slice_builder(*s).with_range(query::clustering_range::make_starting_with({ck})).build(); assert_that(sst->as_mutation_source().make_reader(s, tests::make_permit(), dht::partition_range::make_singular(dk), slice)) - .produces(m) + .produces(m, slice.get_all_ranges()) .produces_end_of_stream(); } } diff --git a/test/lib/flat_mutation_reader_assertions.hh b/test/lib/flat_mutation_reader_assertions.hh index 4b769514b5..ad1d6906dd 100644 --- a/test/lib/flat_mutation_reader_assertions.hh 
+++ b/test/lib/flat_mutation_reader_assertions.hh @@ -23,7 +23,7 @@ #include #include -#include "flat_mutation_reader.hh" +#include "flat_mutation_reader_v2.hh" #include "mutation_assertions.hh" #include "schema.hh" #include "test/lib/log.hh" @@ -494,3 +494,400 @@ inline flat_reader_assertions assert_that(flat_mutation_reader r) { return { std::move(r) }; } + +// Intended to be called in a seastar thread +class flat_reader_assertions_v2 { + flat_mutation_reader_v2 _reader; + dht::partition_range _pr; +private: + mutation_fragment_v2_opt read_next() { + return _reader(db::no_timeout).get0(); + } +public: + flat_reader_assertions_v2(flat_mutation_reader_v2 reader) + : _reader(std::move(reader)) + { } + + ~flat_reader_assertions_v2() { + _reader.close().get(); + } + + flat_reader_assertions_v2(const flat_reader_assertions_v2&) = delete; + flat_reader_assertions_v2(flat_reader_assertions_v2&&) = default; + + flat_reader_assertions_v2& operator=(flat_reader_assertions_v2&& o) { + if (this != &o) { + _reader.close().get(); + _reader = std::move(o._reader); + _pr = std::move(o._pr); + } + return *this; + } + + flat_reader_assertions_v2& produces_partition_start(const dht::decorated_key& dk, + std::optional tomb = std::nullopt) { + testlog.trace("Expecting partition start with key {}", dk); + auto mfopt = read_next(); + if (!mfopt) { + BOOST_FAIL(format("Expected: partition start with key {}, got end of stream", dk)); + } + if (!mfopt->is_partition_start()) { + BOOST_FAIL(format("Expected: partition start with key {}, got: {}", dk, mutation_fragment_v2::printer(*_reader.schema(), *mfopt))); + } + if (!mfopt->as_partition_start().key().equal(*_reader.schema(), dk)) { + BOOST_FAIL(format("Expected: partition start with key {}, got: {}", dk, mutation_fragment_v2::printer(*_reader.schema(), *mfopt))); + } + if (tomb && mfopt->as_partition_start().partition_tombstone() != *tomb) { + BOOST_FAIL(format("Expected: partition start with tombstone {}, got: {}", *tomb, 
mutation_fragment_v2::printer(*_reader.schema(), *mfopt))); + } + return *this; + } + + flat_reader_assertions_v2& produces_static_row() { + testlog.trace("Expecting static row"); + auto mfopt = read_next(); + if (!mfopt) { + BOOST_FAIL("Expected static row, got end of stream"); + } + if (!mfopt->is_static_row()) { + BOOST_FAIL(format("Expected static row, got: {}", mutation_fragment_v2::printer(*_reader.schema(), *mfopt))); + } + return *this; + } + + flat_reader_assertions_v2& produces_row_with_key(const clustering_key& ck) { + testlog.trace("Expect {}", ck); + auto mfopt = read_next(); + if (!mfopt) { + BOOST_FAIL(format("Expected row with key {}, but got end of stream", ck)); + } + if (!mfopt->is_clustering_row()) { + BOOST_FAIL(format("Expected row with key {}, but got {}", ck, mutation_fragment_v2::printer(*_reader.schema(), *mfopt))); + } + auto& actual = mfopt->as_clustering_row().key(); + if (!actual.equal(*_reader.schema(), ck)) { + BOOST_FAIL(format("Expected row with key {}, but key is {}", ck, actual)); + } + return *this; + } + + struct expected_column { + column_id id; + const sstring& name; + bytes value; + expected_column(const column_definition* cdef, bytes value) + : id(cdef->id) + , name(cdef->name_as_text()) + , value(std::move(value)) + { } + }; + + flat_reader_assertions_v2& produces_static_row(const std::vector& columns) { + testlog.trace("Expecting static row"); + auto mfopt = read_next(); + if (!mfopt) { + BOOST_FAIL("Expected static row, got end of stream"); + } + if (!mfopt->is_static_row()) { + BOOST_FAIL(format("Expected static row, got: {}", mutation_fragment_v2::printer(*_reader.schema(), *mfopt))); + } + auto& cells = mfopt->as_static_row().cells(); + if (cells.size() != columns.size()) { + BOOST_FAIL(format("Expected static row with {} columns, but has {}", columns.size(), cells.size())); + } + for (size_t i = 0; i < columns.size(); ++i) { + const atomic_cell_or_collection* cell = cells.find_cell(columns[i].id); + if (!cell) { + 
BOOST_FAIL(format("Expected static row with column {}, but it is not present", columns[i].name)); + } + auto& cdef = _reader.schema()->static_column_at(columns[i].id); + auto cmp = compare_unsigned(columns[i].value, cell->as_atomic_cell(cdef).value().linearize()); + if (cmp != 0) { + BOOST_FAIL(format("Expected static row with column {} having value {}, but it has value {}", + columns[i].name, + columns[i].value, + cell->as_atomic_cell(cdef).value())); + } + } + return *this; + } + + flat_reader_assertions_v2& produces_row(const clustering_key& ck, const std::vector& columns) { + testlog.trace("Expect {}", ck); + auto mfopt = read_next(); + if (!mfopt) { + BOOST_FAIL(format("Expected row with key {}, but got end of stream", ck)); + } + if (!mfopt->is_clustering_row()) { + BOOST_FAIL(format("Expected row with key {}, but got {}", ck, mutation_fragment_v2::printer(*_reader.schema(), *mfopt))); + } + auto& actual = mfopt->as_clustering_row().key(); + if (!actual.equal(*_reader.schema(), ck)) { + BOOST_FAIL(format("Expected row with key {}, but key is {}", ck, actual)); + } + auto& cells = mfopt->as_clustering_row().cells(); + if (cells.size() != columns.size()) { + BOOST_FAIL(format("Expected row with {} columns, but has {}", columns.size(), cells.size())); + } + for (size_t i = 0; i < columns.size(); ++i) { + const atomic_cell_or_collection* cell = cells.find_cell(columns[i].id); + if (!cell) { + BOOST_FAIL(format("Expected row with column {}, but it is not present", columns[i].name)); + } + auto& cdef = _reader.schema()->regular_column_at(columns[i].id); + assert (!cdef.is_multi_cell()); + auto cmp = compare_unsigned(columns[i].value, cell->as_atomic_cell(cdef).value().linearize()); + if (cmp != 0) { + BOOST_FAIL(format("Expected row with column {} having value {}, but it has value {}", + columns[i].name, + columns[i].value, + cell->as_atomic_cell(cdef).value().linearize())); + } + } + return *this; + } + + using assert_function = noncopyable_function; + + 
flat_reader_assertions_v2& produces_row(const clustering_key& ck, + const std::vector& column_ids, + const std::vector& column_assert) { + testlog.trace("Expect {}", ck); + auto mfopt = read_next(); + if (!mfopt) { + BOOST_FAIL(format("Expected row with key {}, but got end of stream", ck)); + } + if (!mfopt->is_clustering_row()) { + BOOST_FAIL(format("Expected row with key {}, but got {}", ck, mutation_fragment_v2::printer(*_reader.schema(), *mfopt))); + } + auto& actual = mfopt->as_clustering_row().key(); + if (!actual.equal(*_reader.schema(), ck)) { + BOOST_FAIL(format("Expected row with key {}, but key is {}", ck, actual)); + } + auto& cells = mfopt->as_clustering_row().cells(); + if (cells.size() != column_ids.size()) { + BOOST_FAIL(format("Expected row with {} columns, but has {}", column_ids.size(), cells.size())); + } + for (size_t i = 0; i < column_ids.size(); ++i) { + const atomic_cell_or_collection* cell = cells.find_cell(column_ids[i]); + if (!cell) { + BOOST_FAIL(format("Expected row with column {:d}, but it is not present", column_ids[i])); + } + auto& cdef = _reader.schema()->regular_column_at(column_ids[i]); + column_assert[i](cdef, cell); + } + return *this; + } + + flat_reader_assertions_v2& produces_range_tombstone_change(const range_tombstone_change& rt) { + testlog.trace("Expect {}", rt); + auto mfo = read_next(); + if (!mfo) { + BOOST_FAIL(format("Expected range tombstone {}, but got end of stream", rt)); + } + if (!mfo->is_range_tombstone_change()) { + BOOST_FAIL(format("Expected range tombstone change {}, but got {}", rt, mutation_fragment_v2::printer(*_reader.schema(), *mfo))); + } + if (!mfo->as_range_tombstone_change().equal(*_reader.schema(), rt)) { + BOOST_FAIL(format("Expected {}, but got {}", rt, mutation_fragment_v2::printer(*_reader.schema(), *mfo))); + } + return *this; + } + + flat_reader_assertions_v2& produces_partition_end() { + testlog.trace("Expecting partition end"); + auto mfopt = read_next(); + if (!mfopt) { + 
BOOST_FAIL(format("Expected partition end but got end of stream")); + } + if (!mfopt->is_end_of_partition()) { + BOOST_FAIL(format("Expected partition end but got {}", mutation_fragment_v2::printer(*_reader.schema(), *mfopt))); + } + return *this; + } + + flat_reader_assertions_v2& produces(const schema& s, const mutation_fragment_v2& mf) { + auto mfopt = read_next(); + if (!mfopt) { + BOOST_FAIL(format("Expected {}, but got end of stream", mutation_fragment_v2::printer(*_reader.schema(), mf))); + } + if (!mfopt->equal(s, mf)) { + BOOST_FAIL(format("Expected {}, but got {}", mutation_fragment_v2::printer(*_reader.schema(), mf), mutation_fragment_v2::printer(*_reader.schema(), *mfopt))); + } + return *this; + } + + flat_reader_assertions_v2& produces_end_of_stream() { + testlog.trace("Expecting end of stream"); + auto mfopt = read_next(); + if (bool(mfopt)) { + BOOST_FAIL(format("Expected end of stream, got {}", mutation_fragment_v2::printer(*_reader.schema(), *mfopt))); + } + return *this; + } + + flat_reader_assertions_v2& produces(mutation_fragment_v2::kind k, std::vector ck_elements, bool make_full_key = false) { + std::vector ck_bytes; + for (auto&& e : ck_elements) { + ck_bytes.emplace_back(int32_type->decompose(e)); + } + auto ck = clustering_key_prefix::from_exploded(*_reader.schema(), std::move(ck_bytes)); + if (make_full_key) { + clustering_key::make_full(*_reader.schema(), ck); + } + + auto mfopt = read_next(); + if (!mfopt) { + BOOST_FAIL(format("Expected mutation fragment {}, got end of stream", ck)); + } + if (mfopt->mutation_fragment_kind() != k) { + BOOST_FAIL(format("Expected mutation fragment kind {}, got: {}", k, mfopt->mutation_fragment_kind())); + } + clustering_key::equality ck_eq(*_reader.schema()); + if (!ck_eq(mfopt->key(), ck)) { + BOOST_FAIL(format("Expected key {}, got: {}", ck, mfopt->key())); + } + return *this; + } + + flat_reader_assertions_v2& produces_partition(const mutation& m) { + return produces(m); + } + + 
flat_reader_assertions_v2& produces(const mutation& m, const std::optional& ck_ranges = {}) { + auto mo = read_mutation_from_flat_mutation_reader(_reader, db::no_timeout).get0(); + if (!mo) { + BOOST_FAIL(format("Expected {}, but got end of stream, at: {}", m, seastar::current_backtrace())); + } + memory::scoped_critical_alloc_section dfg; + assert_that(*mo).is_equal_to(m, ck_ranges); + return *this; + } + + flat_reader_assertions_v2& produces(const dht::decorated_key& dk) { + produces_partition_start(dk); + next_partition(); + return *this; + } + + template + flat_reader_assertions_v2& produces(const Range& range) { + for (auto&& m : range) { + produces(m); + } + return *this; + } + + flat_reader_assertions_v2& produces_eos_or_empty_mutation() { + testlog.trace("Expecting eos or empty mutation"); + auto mo = read_mutation_from_flat_mutation_reader(_reader, db::no_timeout).get0(); + if (mo) { + if (!mo->partition().empty()) { + BOOST_FAIL(format("Mutation is not empty: {}", *mo)); + } + } + return *this; + } + + void has_monotonic_positions() { + position_in_partition::less_compare less(*_reader.schema()); + mutation_fragment_v2_opt previous_fragment; + mutation_fragment_v2_opt previous_partition; + bool inside_partition = false; + for (;;) { + auto mfo = read_next(); + if (!mfo) { + break; + } + if (mfo->is_partition_start()) { + BOOST_REQUIRE(!inside_partition); + auto& dk = mfo->as_partition_start().key(); + if (previous_partition && !previous_partition->as_partition_start().key().less_compare(*_reader.schema(), dk)) { + BOOST_FAIL(format("previous partition had greater or equal key: prev={}, current={}", + mutation_fragment_v2::printer(*_reader.schema(), *previous_partition), mutation_fragment_v2::printer(*_reader.schema(), *mfo))); + } + previous_partition = std::move(mfo); + previous_fragment = std::nullopt; + inside_partition = true; + } else if (mfo->is_end_of_partition()) { + BOOST_REQUIRE(inside_partition); + inside_partition = false; + } else { + 
BOOST_REQUIRE(inside_partition); + if (previous_fragment) { + if (!less(previous_fragment->position(), mfo->position())) { + BOOST_FAIL(format("previous fragment is not strictly before: prev={}, current={}", + mutation_fragment_v2::printer(*_reader.schema(), *previous_fragment), mutation_fragment_v2::printer(*_reader.schema(), *mfo))); + } + } + previous_fragment = std::move(mfo); + } + } + BOOST_REQUIRE(!inside_partition); + } + + flat_reader_assertions_v2& fast_forward_to(const dht::partition_range& pr) { + testlog.trace("Fast forward to partition range: {}", pr); + _pr = pr; + _reader.fast_forward_to(_pr, db::no_timeout).get(); + return *this; + } + + flat_reader_assertions_v2& next_partition() { + testlog.trace("Skip to next partition"); + _reader.next_partition().get(); + return *this; + } + + flat_reader_assertions_v2& fast_forward_to(position_range pr) { + testlog.trace("Fast forward to clustering range: {}", pr); + _reader.fast_forward_to(std::move(pr), db::no_timeout).get(); + return *this; + } + + flat_reader_assertions_v2& fast_forward_to(const clustering_key& ck1, const clustering_key& ck2) { + testlog.trace("Fast forward to clustering range: [{}, {})", ck1, ck2); + return fast_forward_to(position_range{ + position_in_partition(position_in_partition::clustering_row_tag_t(), ck1), + position_in_partition(position_in_partition::clustering_row_tag_t(), ck2) + }); + } + + flat_reader_assertions_v2& produces_compacted(const mutation& m, gc_clock::time_point query_time, + const std::optional& ck_ranges = {}) { + auto mo = read_mutation_from_flat_mutation_reader(_reader, db::no_timeout).get0(); + // If the passed in mutation is empty, allow for the reader to produce an empty or no partition. 
+ if (m.partition().empty() && !mo) { + return *this; + } + BOOST_REQUIRE(bool(mo)); + memory::scoped_critical_alloc_section dfg; + mutation got = *mo; + got.partition().compact_for_compaction(*m.schema(), always_gc, query_time); + assert_that(got).is_equal_to(m, ck_ranges); + return *this; + } + + mutation_assertion next_mutation() { + auto mo = read_mutation_from_flat_mutation_reader(_reader, db::no_timeout).get0(); + BOOST_REQUIRE(bool(mo)); + return mutation_assertion(std::move(*mo)); + } + + future<> fill_buffer() { + return _reader.fill_buffer(db::no_timeout); + } + + bool is_buffer_full() const { + return _reader.is_buffer_full(); + } + + void set_max_buffer_size(size_t size) { + _reader.set_max_buffer_size(size); + } +}; + +inline +flat_reader_assertions_v2 assert_that(flat_mutation_reader_v2 r) { + return { std::move(r) }; +} diff --git a/test/lib/mutation_source_test.cc b/test/lib/mutation_source_test.cc index c0f79d0af5..2aecbec3ba 100644 --- a/test/lib/mutation_source_test.cc +++ b/test/lib/mutation_source_test.cc @@ -25,6 +25,7 @@ #include "schema_builder.hh" #include "test/lib/mutation_source_test.hh" #include "counters.hh" +#include "mutation_rebuilder.hh" #include "test/lib/simple_schema.hh" #include "flat_mutation_reader.hh" #include "test/lib/flat_mutation_reader_assertions.hh" @@ -990,8 +991,12 @@ void test_mutation_reader_fragments_have_monotonic_positions(populate_fn_ex popu for_each_mutation([&populate] (const mutation& m) { auto ms = populate(m.schema(), {m}, gc_clock::now()); + auto rd = ms.make_reader(m.schema(), tests::make_permit()); assert_that(std::move(rd)).has_monotonic_positions(); + + auto rd2 = ms.make_reader_v2(m.schema(), tests::make_permit()); + assert_that(std::move(rd2)).has_monotonic_positions(); }); } @@ -1355,6 +1360,28 @@ void test_slicing_with_overlapping_range_tombstones(populate_fn_ex populate) { assert_that(result).is_equal_to(m1 + m2, query::clustering_row_ranges({range})); } + { + auto slice = 
partition_slice_builder(*s).with_range(range).build(); + auto rd = ds.make_reader_v2(s, tests::make_permit(), query::full_partition_range, slice); + auto close_rd = deferred_close(rd); + + auto prange = position_range(range); + + mutation_rebuilder_v2 rebuilder(s); + rd.consume_pausable([&] (mutation_fragment_v2&& mf) { + testlog.trace("mf: {}", mutation_fragment_v2::printer(*s, mf)); + if (mf.position().is_clustering_row() && !prange.contains(*s, mf.position())) { + testlog.trace("m1: {}", m1); + testlog.trace("m2: {}", m2); + BOOST_FAIL(format("Received row which is not relevant for the slice: {}, slice: {}", + mutation_fragment_v2::printer(*s, mf), prange)); + } + return rebuilder.consume(std::move(mf)); + }, db::no_timeout).get(); + auto result = *rebuilder.consume_end_of_stream(); + + assert_that(result).is_equal_to(m1 + m2, query::clustering_row_ranges({range})); + } // Check fast_forward_to() { @@ -1392,7 +1419,226 @@ void test_slicing_with_overlapping_range_tombstones(populate_fn_ex populate) { } } +void test_range_tombstones_v2(populate_fn_ex populate) { + simple_schema s; + auto pkey = s.make_pkey(); + + std::vector mutations; + + mutation m(s.schema(), pkey); + s.add_row(m, s.make_ckey(0), "v1"); + auto t1 = s.new_tombstone(); + s.delete_range(m, s.make_ckey_range(1, 10), t1); + s.add_row(m, s.make_ckey(5), "v2"); + auto t2 = s.new_tombstone(); + s.delete_range(m, s.make_ckey_range(7, 12), t2); + s.add_row(m, s.make_ckey(15), "v2"); + auto t3 = s.new_tombstone(); + s.delete_range(m, s.make_ckey_range(17, 19), t3); + + mutations.push_back(std::move(m)); + + auto ms = populate(s.schema(), mutations, gc_clock::now()); + auto pr = dht::partition_range::make_singular(pkey); + + assert_that(ms.make_reader_v2(s.schema(), tests::make_permit())) + .next_partition() // Does nothing before first partition + .produces_partition_start(pkey) + .produces_row_with_key(s.make_ckey(0)) + 
.produces_range_tombstone_change(range_tombstone_change(position_in_partition_view::before_key(s.make_ckey(1)), t1)) + .produces_row_with_key(s.make_ckey(5)) + .produces_range_tombstone_change(range_tombstone_change(position_in_partition_view::before_key(s.make_ckey(7)), t2)) + .produces_range_tombstone_change(range_tombstone_change(position_in_partition_view::after_key(s.make_ckey(12)), tombstone())) + .produces_row_with_key(s.make_ckey(15)) + .produces_range_tombstone_change(range_tombstone_change(position_in_partition_view::before_key(s.make_ckey(17)), t3)) + .produces_range_tombstone_change(range_tombstone_change(position_in_partition_view::after_key(s.make_ckey(19)), tombstone())) + .produces_partition_end() + .produces_end_of_stream(); + + assert_that(ms.make_reader_v2(s.schema(), tests::make_permit(), pr, + s.schema()->full_slice(), + default_priority_class(), + nullptr, + streamed_mutation::forwarding::yes, + mutation_reader::forwarding::no)) + .produces_partition_start(pkey) + .produces_end_of_stream() + + .fast_forward_to(position_range( + position_in_partition::after_key(s.make_ckey(0)), + position_in_partition::before_key(s.make_ckey(2)))) + .produces_range_tombstone_change(range_tombstone_change(position_in_partition_view::before_key(s.make_ckey(1)), t1)) + .produces_range_tombstone_change(range_tombstone_change(position_in_partition_view::before_key(s.make_ckey(2)), {})) + .produces_end_of_stream() + + .fast_forward_to(position_range( + position_in_partition::before_key(s.make_ckey(5)), + position_in_partition::after_key(s.make_ckey(5)))) + .produces_range_tombstone_change(range_tombstone_change(position_in_partition_view::before_key(s.make_ckey(5)), t1)) + .produces_row_with_key(s.make_ckey(5)) + .produces_range_tombstone_change(range_tombstone_change(position_in_partition_view::after_key(s.make_ckey(5)), {})) + .produces_end_of_stream(); + + assert_that(ms.make_reader_v2(s.schema(), tests::make_permit(), pr, + s.schema()->full_slice(), + 
default_priority_class(), + nullptr, + streamed_mutation::forwarding::yes, + mutation_reader::forwarding::no)) + .produces_partition_start(pkey) + .produces_end_of_stream() + + .fast_forward_to(position_range( + position_in_partition::before_key(s.make_ckey(0)), + position_in_partition::before_key(s.make_ckey(1)))) + .produces_row_with_key(s.make_ckey(0)) + .produces_end_of_stream() + + .fast_forward_to(position_range( + position_in_partition::before_key(s.make_ckey(1)), + position_in_partition::before_key(s.make_ckey(2)))) + .produces_range_tombstone_change({position_in_partition_view::before_key(s.make_ckey(1)), t1}) + .produces_range_tombstone_change({position_in_partition_view::before_key(s.make_ckey(2)), {}}) + .produces_end_of_stream(); + + + assert_that(ms.make_reader_v2(s.schema(), tests::make_permit(), pr, + s.schema()->full_slice(), + default_priority_class(), + nullptr, + streamed_mutation::forwarding::yes, + mutation_reader::forwarding::no)) + .produces_partition_start(pkey) + .produces_end_of_stream() + + .fast_forward_to(position_range( + position_in_partition::before_key(s.make_ckey(1)), + position_in_partition::before_key(s.make_ckey(6)))) + .produces_range_tombstone_change({position_in_partition_view::before_key(s.make_ckey(1)), t1}) + .produces_row_with_key(s.make_ckey(5)) + .produces_range_tombstone_change({position_in_partition_view::before_key(s.make_ckey(6)), {}}) + .produces_end_of_stream(); + + assert_that(ms.make_reader_v2(s.schema(), tests::make_permit(), pr, + s.schema()->full_slice(), + default_priority_class(), + nullptr, + streamed_mutation::forwarding::yes, + mutation_reader::forwarding::no)) + .produces_partition_start(pkey) + .produces_end_of_stream() + + .fast_forward_to(position_range( + position_in_partition::before_key(s.make_ckey(6)), + position_in_partition::before_key(s.make_ckey(7)))) + .produces_range_tombstone_change({position_in_partition_view::before_key(s.make_ckey(6)), t1}) + 
.produces_range_tombstone_change({position_in_partition_view::before_key(s.make_ckey(7)), {}}) + .produces_end_of_stream(); + + assert_that(ms.make_reader_v2(s.schema(), tests::make_permit(), pr, + s.schema()->full_slice(), + default_priority_class(), + nullptr, + streamed_mutation::forwarding::yes, + mutation_reader::forwarding::no)) + .produces_partition_start(pkey) + .produces_end_of_stream() + + .fast_forward_to(position_range( + position_in_partition::before_key(s.make_ckey(6)), + position_in_partition::before_key(s.make_ckey(8)))) + .produces_range_tombstone_change({position_in_partition_view::before_key(s.make_ckey(6)), t1}) + .produces_range_tombstone_change({position_in_partition_view::before_key(s.make_ckey(7)), t2}) + .produces_range_tombstone_change({position_in_partition_view::before_key(s.make_ckey(8)), {}}) + .produces_end_of_stream() + + .fast_forward_to(position_range( + position_in_partition::before_key(s.make_ckey(9)), + position_in_partition::before_key(s.make_ckey(10)))) + .produces_range_tombstone_change({position_in_partition_view::before_key(s.make_ckey(9)), t2}) + .produces_range_tombstone_change({position_in_partition_view::before_key(s.make_ckey(10)), {}}) + .produces_end_of_stream() + + .fast_forward_to(position_range( + position_in_partition::before_key(s.make_ckey(10)), + position_in_partition::before_key(s.make_ckey(13)))) + .produces_range_tombstone_change({position_in_partition_view::before_key(s.make_ckey(10)), t2}) + .produces_range_tombstone_change({position_in_partition_view::after_key(s.make_ckey(12)), {}}) + .produces_end_of_stream() + + .fast_forward_to(position_range( + position_in_partition::before_key(s.make_ckey(16)), + position_in_partition::after_key(s.make_ckey(16)))) + .produces_end_of_stream() + + .fast_forward_to(position_range( + position_in_partition::before_key(s.make_ckey(17)), + position_in_partition::after_key(s.make_ckey(18)))) + 
.produces_range_tombstone_change({position_in_partition_view::before_key(s.make_ckey(17)), t3}) + .produces_range_tombstone_change({position_in_partition_view::after_key(s.make_ckey(18)), {}}) + .produces_end_of_stream(); + + // Slicing using query restrictions + + { + auto slice = partition_slice_builder(*s.schema()) + .with_range(s.make_ckey_range(16, 18)) + .build(); + assert_that(ms.make_reader_v2(s.schema(), tests::make_permit(), pr, slice)) + .produces_partition_start(pkey) + .produces_range_tombstone_change({position_in_partition_view::before_key(s.make_ckey(17)), t3}) + .produces_range_tombstone_change({position_in_partition_view::after_key(s.make_ckey(18)), {}}) + .produces_partition_end() + .produces_end_of_stream(); + } + + { + auto slice = partition_slice_builder(*s.schema()) + .with_range(s.make_ckey_range(0, 3)) + .with_range(s.make_ckey_range(8, 11)) + .build(); + assert_that(ms.make_reader_v2(s.schema(), tests::make_permit(), pr, slice)) + .produces_partition_start(pkey) + .produces_row_with_key(s.make_ckey(0)) + .produces_range_tombstone_change(range_tombstone_change(position_in_partition_view::before_key(s.make_ckey(1)), t1)) + .produces_range_tombstone_change(range_tombstone_change(position_in_partition_view::after_key(s.make_ckey(3)), {})) + .produces_range_tombstone_change(range_tombstone_change(position_in_partition_view::before_key(s.make_ckey(8)), t2)) + .produces_range_tombstone_change(range_tombstone_change(position_in_partition_view::after_key(s.make_ckey(11)), {})) + .produces_partition_end() + .produces_end_of_stream(); + } +} + +void test_reader_conversions(populate_fn_ex populate) { + BOOST_TEST_MESSAGE(__PRETTY_FUNCTION__); + for_each_mutation([&] (const mutation& m) mutable { + const auto query_time = gc_clock::now(); + + std::vector mutations = { m }; + auto ms = populate(m.schema(), mutations, gc_clock::now()); + + mutation m_compacted(m); + m_compacted.partition().compact_for_compaction(*m_compacted.schema(), always_gc, 
query_time); + + { + auto rd = ms.make_reader_v2(m.schema(), tests::make_permit()); + assert_that(downgrade_to_v1(std::move(rd))) + .produces_compacted(m_compacted, query_time); + } + + { + auto rd = ms.make_reader(m.schema(), tests::make_permit()); + assert_that(upgrade_to_v2(std::move(rd))) + .produces_compacted(m_compacted, query_time); + } + }); +} + +void test_next_partition(populate_fn_ex); + void run_mutation_reader_tests(populate_fn_ex populate) { + test_range_tombstones_v2(populate); + test_reader_conversions(populate); test_slicing_and_fast_forwarding(populate); test_date_tiered_clustering_slicing(populate); test_fast_forwarding_across_partitions_to_empty_range(populate); @@ -1406,6 +1652,9 @@ void run_mutation_reader_tests(populate_fn_ex populate) { test_range_queries(populate); test_query_only_static_row(populate); test_query_no_clustering_ranges_no_static_columns(populate); + test_next_partition(populate); + test_streamed_mutation_forwarding_succeeds_with_no_data(populate); + test_slicing_with_overlapping_range_tombstones(populate); } void test_next_partition(populate_fn_ex populate) { @@ -1441,12 +1690,6 @@ void test_next_partition(populate_fn_ex populate) { .produces_end_of_stream(); } -void run_flat_mutation_reader_tests(populate_fn_ex populate) { - test_next_partition(populate); - test_streamed_mutation_forwarding_succeeds_with_no_data(populate); - test_slicing_with_overlapping_range_tombstones(populate); -} - void run_mutation_source_tests(populate_fn populate) { auto populate_ex = [populate = std::move(populate)] (schema_ptr s, const std::vector& muts, gc_clock::time_point) { return populate(std::move(s), muts); @@ -1456,7 +1699,36 @@ void run_mutation_source_tests(populate_fn populate) { void run_mutation_source_tests(populate_fn_ex populate) { run_mutation_reader_tests(populate); - run_flat_mutation_reader_tests(populate); + + // ? 
-> v2 -> v1 -> * + run_mutation_reader_tests([populate] (schema_ptr s, const std::vector& m, gc_clock::time_point t) -> mutation_source { + return mutation_source([ms = populate(s, m, t)] (schema_ptr s, + reader_permit permit, + const dht::partition_range& pr, + const query::partition_slice& slice, + const io_priority_class& pc, + tracing::trace_state_ptr tr, + streamed_mutation::forwarding fwd, + mutation_reader::forwarding mr_fwd) { + return downgrade_to_v1( + ms.make_reader_v2(s, std::move(permit), pr, slice, pc, std::move(tr), fwd, mr_fwd)); + }); + }); + + // ? -> v1 -> v2 -> * + run_mutation_reader_tests([populate] (schema_ptr s, const std::vector& m, gc_clock::time_point t) -> mutation_source { + return mutation_source([ms = populate(s, m, t)] (schema_ptr s, + reader_permit permit, + const dht::partition_range& pr, + const query::partition_slice& slice, + const io_priority_class& pc, + tracing::trace_state_ptr tr, + streamed_mutation::forwarding fwd, + mutation_reader::forwarding mr_fwd) { + return upgrade_to_v2( + ms.make_reader(s, std::move(permit), pr, slice, pc, std::move(tr), fwd, mr_fwd)); + }); + }); } struct mutation_sets { diff --git a/test/lib/simple_schema.hh b/test/lib/simple_schema.hh index 0acdd60e59..b9741583c0 100644 --- a/test/lib/simple_schema.hh +++ b/test/lib/simple_schema.hh @@ -162,8 +162,8 @@ public: return t; } - range_tombstone delete_range(mutation& m, const query::clustering_range& range) { - auto rt = make_range_tombstone(range); + range_tombstone delete_range(mutation& m, const query::clustering_range& range, tombstone t = {}) { + auto rt = make_range_tombstone(range, t); m.partition().apply_delete(*_s, rt); return rt; } @@ -171,7 +171,7 @@ public: range_tombstone make_range_tombstone(const query::clustering_range& range, tombstone t = {}) { auto bv_range = bound_view::from_range(range); if (!t) { - t = tombstone(new_timestamp(), gc_clock::now()); + t = new_tombstone(); } range_tombstone rt(bv_range.first, bv_range.second, t); 
return rt;