Merge 'readers: evictable_reader: don't accidentally consume the entire partition' from Kamil Braun

The evictable reader must ensure that each buffer fill makes forward progress, i.e. the last fragment in the buffer has a position larger than the last fragment from the previous buffer-fill. Otherwise, the reader could get stuck in an infinite loop between buffer fills, if the reader is evicted in-between.

The code guaranteeing this forward progress had a bug: the comparison between the position after the last buffer-fill and the current last fragment position was done in the wrong direction.

So if the condition that we wanted to achieve was already true, we would continue filling the buffer until partition end which may lead to OOMs such as in #13491.

There was already a fix in this area to handle `partition_start` fragments correctly - #13563 - but it missed that the position comparison was done in the wrong order.

Fix the comparison and adjust one of the tests (added in #13563) to detect this case.

After the fix, the evictable reader starts generating some redundant (but expected) range tombstone change fragments since it's now being paused and resumed. For this we need to adjust mutation source tests which were a bit too specific. We modify `flat_mutation_reader_assertions` to squash the redundant `r_t_c`s.

Fixes #13491

Closes #14375

* github.com:scylladb/scylladb:
  readers: evictable_reader: don't accidentally consume the entire partition
  test: flat_mutation_reader_assertions: squash `r_t_c`s with the same position
This commit is contained in:
Botond Dénes
2023-06-28 07:58:45 +03:00
6 changed files with 31 additions and 10 deletions

View File

@@ -607,7 +607,7 @@ future<> evictable_reader_v2::fill_buffer() {
// First make sure we've made progress w.r.t. _next_position_in_partition.
// This loop becomes inifinite when next pos is a partition start.
// In that case progress is guranteed anyway, so skip this loop entirely.
while (!_next_position_in_partition.is_partition_start() && next_mf && _tri_cmp(_next_position_in_partition, buffer().back().position()) <= 0) {
while (!_next_position_in_partition.is_partition_start() && next_mf && _tri_cmp(buffer().back().position(), _next_position_in_partition) <= 0) {
push_mutation_fragment(_reader->pop_mutation_fragment());
next_mf = co_await _reader->peek();
}

View File

@@ -3648,9 +3648,13 @@ SEASTAR_THREAD_TEST_CASE(test_evictable_reader_next_pos_is_partition_start) {
auto stop_rd = deferred_close(rd);
rd.set_max_buffer_size(max_buf_size);
// #13491 - the reader must not consume the entire partition but a small batch of fragments based on the buffer size.
rd.fill_buffer().get();
rd.fill_buffer().get();
auto buf1 = rd.detach_buffer();
BOOST_REQUIRE_EQUAL(buf1.size(), 3);
// There should be 6-7 fragments, but to avoid computing the exact number of fragments that should fit in `max_buf_size`,
// just ensure that there are <= 10 (consuming the whole partition would give ~1000 fragments).
BOOST_REQUIRE_LE(buf1.size(), 10);
}
struct mutation_bounds {

View File

@@ -269,7 +269,7 @@ SEASTAR_THREAD_TEST_CASE(test_sstable_reversing_reader_random_schema) {
streamed_mutation::forwarding::no, mutation_reader::forwarding::no);
close_r1.cancel();
compare_readers(*query_schema, std::move(r1), std::move(r2));
compare_readers(*query_schema, std::move(r1), std::move(r2), true);
}
auto r1 = source.make_reader_v2(query_schema, semaphore.make_permit(), prange,

View File

@@ -59,10 +59,27 @@ private:
continue;
}
// silently ignore rtcs that don't change anything
if (next->is_range_tombstone_change() && next->as_range_tombstone_change().tombstone() == _rt) {
testlog.trace("Received spurious closing rtc: {}", mutation_fragment_v2::printer(*_reader.schema(), *next));
_reader().get();
continue;
if (next->is_range_tombstone_change()) {
auto rtc_mf = std::move(*_reader().get());
auto tomb = rtc_mf.as_range_tombstone_change().tombstone();
auto cmp = position_in_partition::tri_compare(*_reader.schema());
// squash rtcs with the same pos
while (auto next_maybe_rtc = _reader.peek().get0()) {
if (next_maybe_rtc->is_range_tombstone_change() && cmp(next_maybe_rtc->position(), rtc_mf.position()) == 0) {
testlog.trace("Squashing {} with {}", next_maybe_rtc->as_range_tombstone_change().tombstone(), tomb);
tomb = next_maybe_rtc->as_range_tombstone_change().tombstone();
_reader().get0();
} else {
break;
}
}
rtc_mf.mutate_as_range_tombstone_change(*_reader.schema(), [tomb] (range_tombstone_change& rtc) { rtc.set_tombstone(tomb); });
if (tomb == _rt) {
testlog.trace("Received spurious rtcs, equivalent to: {}", mutation_fragment_v2::printer(*_reader.schema(), rtc_mf));
continue;
}
_reader.unpop_mutation_fragment(std::move(rtc_mf));
next = _reader.peek().get0();
}
return next;
}

View File

@@ -2669,9 +2669,9 @@ static bool compare_readers(const schema& s, flat_mutation_reader_v2& authority,
return !empty;
}
void compare_readers(const schema& s, flat_mutation_reader_v2 authority, flat_mutation_reader_v2 tested) {
void compare_readers(const schema& s, flat_mutation_reader_v2 authority, flat_mutation_reader_v2 tested, bool exact) {
auto close_authority = deferred_close(authority);
auto assertions = assert_that(std::move(tested));
auto assertions = assert_that(std::move(tested)).exact(exact);
compare_readers(s, authority, assertions);
}

View File

@@ -74,7 +74,7 @@ bytes make_blob(size_t blob_size);
void for_each_schema_change(std::function<void(schema_ptr, const std::vector<mutation>&,
schema_ptr, const std::vector<mutation>&)>);
void compare_readers(const schema&, flat_mutation_reader_v2 authority, flat_mutation_reader_v2 tested);
void compare_readers(const schema&, flat_mutation_reader_v2 authority, flat_mutation_reader_v2 tested, bool exact = false);
void compare_readers(const schema&, flat_mutation_reader_v2 authority, flat_mutation_reader_v2 tested, const std::vector<position_range>& fwd_ranges);
// Forward `r` to each range in `fwd_ranges` and consume all fragments produced by `r` in these ranges.