From 24d966f806128efd859b06ca73021972f91fbb1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Chojnowski?= Date: Mon, 15 May 2023 17:31:22 +0200 Subject: [PATCH 1/2] range_tombstone_change_generator: fix an edge case in flush() range_tombstone_change_generator::flush() mishandles the case when two range tombstones are adjacent and flush(pos, end_of_range=true) is called with pos equal to the end bound of the lesser-position range tombstone. In such case, the start change of the greater-position rtc will be accidentally emitted, and there won't be an end change, which breaks reader assumptions by ending the stream with an unclosed range tombstone, triggering an assertion. This is due to a non-strict inequality used in a place where strict inequality should be used. The modified line was intended to close range tombstones which end exactly on the flush position, but this is unnecessary because such range tombstones are handled by the last `if` in the function anyway. Instead, this line caused range tombstones beginning right after the flush position to be emitted sometimes. Fixes #12462 --- range_tombstone_change_generator.hh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/range_tombstone_change_generator.hh b/range_tombstone_change_generator.hh index 6f98be5dce..48d3fc0043 100644 --- a/range_tombstone_change_generator.hh +++ b/range_tombstone_change_generator.hh @@ -83,7 +83,7 @@ public: position_in_partition::tri_compare cmp(_schema); std::optional prev; - const bool allow_eq = end_of_range || upper_bound.is_after_all_clustered_rows(_schema); + const bool allow_eq = upper_bound.is_after_all_clustered_rows(_schema); const auto should_flush = [&] (position_in_partition_view pos) { const auto res = cmp(pos, upper_bound); if (allow_eq) { From 7c1bdc6553f573d27b0b7817814486541f924579 Mon Sep 17 00:00:00 2001 From: Tomasz Grabiec Date: Sat, 22 Apr 2023 14:56:46 +0200 Subject: [PATCH 2/2] tests: row_cache: Add reproducer for reader producing missing closing range tombstone Adds a reproducer for #12462. The bug manifests by reader throwing: std::logic_error: Stream ends with an active range tombstone: {range_tombstone_change: pos={position: clustered,ckp{},-1}, {tombstone: timestamp=-9223372036854775805, deletion_time=2}} The reason is that prior to the fix range_tombstone_change_generator::flush() was used with end_of_range=true to produce the closing range_tombstone_change and it did not handle correctly the case when there are two adjacent range tombstones and flush(pos, end_of_range=true) is called such that pos is the boundary between the two. Cherry-picked from a717c803c78898f03c29bb657d0f6cd06b9f6540. --- test/boost/row_cache_test.cc | 60 ++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/test/boost/row_cache_test.cc b/test/boost/row_cache_test.cc index e8ab5d12ea..c343a37e6a 100644 --- a/test/boost/row_cache_test.cc +++ b/test/boost/row_cache_test.cc @@ -37,6 +37,7 @@ #include "readers/from_mutations_v2.hh" #include "readers/delegating_v2.hh" #include "readers/empty_v2.hh" +#include "seastar/testing/thread_test_case.hh" using namespace std::chrono_literals; @@ -2369,6 +2370,65 @@ SEASTAR_TEST_CASE(test_tombstone_and_row_with_small_buffer) { }); } +// Reproducer for https://github.com/scylladb/scylladb/issues/12462 +SEASTAR_THREAD_TEST_CASE(test_range_tombstone_adjacent_to_slice_is_closed) { + simple_schema s; + tests::reader_concurrency_semaphore_wrapper semaphore; + cache_tracker tracker; + memtable_snapshot_source underlying(s.schema()); + + auto pk = s.make_pkey(0); + auto pr = dht::partition_range::make_singular(pk); + + mutation m1(s.schema(), pk); + auto rt0 = s.make_range_tombstone(*position_range_to_clustering_range(position_range( + position_in_partition::before_key(s.make_ckey(1)), + position_in_partition::before_key(s.make_ckey(3))), *s.schema())); + m1.partition().apply_delete(*s.schema(), rt0); + s.add_row(m1, s.make_ckey(0), "v1"); + + underlying.apply(m1); + + row_cache cache(s.schema(), snapshot_source([&] { return underlying(); }), tracker); + populate_range(cache); + + // Create a reader to pin the MVCC version + auto rd0 = cache.make_reader(s.schema(), semaphore.make_permit(), pr); + auto close_rd0 = deferred_close(rd0); + rd0.set_max_buffer_size(1); + rd0.fill_buffer().get(); + + mutation m2(s.schema(), pk); + auto rt1 = s.make_range_tombstone(*position_range_to_clustering_range(position_range( + position_in_partition::before_key(s.make_ckey(1)), + position_in_partition::before_key(s.make_ckey(2))), *s.schema())); + m2.partition().apply_delete(*s.schema(), rt1); + apply(cache, underlying, m2); + + // State of cache: + // v2: ROW(0), RT(before(1), before(2))@t1 + // v1: RT(before(1), before(3))@t0 + + // range_tombstone_change_generator will work with the stream: RT(1, before(2))@t1, RT(before(2), before(3))@t0 + // It's important that there is an RT which starts exactly at the slice upper bound to trigger + // the problem, and the RT will be in the stream only because it is a residual from RT(before(1), before(3)), + // which overlaps with the slice in the older version. That's why we need two MVCC versions. + + auto slice = partition_slice_builder(*s.schema()) + .with_range(*position_range_to_clustering_range(position_range( + position_in_partition::before_key(s.make_ckey(0)), + position_in_partition::before_key(s.make_ckey(2))), *s.schema())) + .build(); + + assert_that(cache.make_reader(s.schema(), semaphore.make_permit(), pr, slice)) + .produces_partition_start(pk) + .produces_row_with_key(s.make_ckey(0)) + .produces_range_tombstone_change(start_change(rt1)) + .produces_range_tombstone_change(end_change(rt1)) + .produces_partition_end() + .produces_end_of_stream(); +} + // // Tests the following case of eviction and re-population: //