From fb157599343cc98607ffd75309fe5fcc77a04447 Mon Sep 17 00:00:00 2001 From: Tomasz Grabiec Date: Thu, 13 Dec 2018 14:49:43 +0100 Subject: [PATCH] sstables: reader: Do not read the head of the partition when index can be used read_partition() was always called through read_next_partition(), even if we're at the beginning of the read. read_next_partition() is supposed to skip to the next partition. It still works when we're positioned before a partition, it doesn't advance the consumer, but it clears _index_in_current_partition, because it (correctly) assumes it corresponds to the partition we're about to leave, not the one we're about to enter. This means that index lookups we did in the read initializer will be disregarded when reading starts, and we'll always start by reading partition data from the data file. This is suboptimal for reads which are slicing a large partition and don't need to read the front of the partition. Regression introduced in 4b9a34a85425d1279b471b2ff0b0f2462328929c. The fix is to call read_partition() directly when we're positioned at the beginning of the partition. For that purpose a new flag was introduced. test_no_index_reads_when_rows_fall_into_range_boundaries has to be relaxed, because it assumed that slicing reads will read the head of the partition. Refs #3984 Fixes #3992 Tested using: ./build/release/tests/perf/perf_fast_forward_g \ --sstable-format=mc \ --datasets large-part-ds1 \ --run-tests=large-partition-slicing-clustering-keys Before (focus on aio): offset read time (s) frags frag/s mad f/s max f/s min f/s aio (KiB) blocked dropped idx hit idx miss idx blk c hit c miss c blk cpu 4000000 1 0.001378 1 726 5 736 102 6 200 4 2 0 1 1 0 0 0 65.8% After: offset read time (s) frags frag/s mad f/s max f/s min f/s aio (KiB) blocked dropped idx hit idx miss idx blk c hit c miss c blk cpu 4000000 1 0.001290 1 775 6 788 716 2 136 2 0 0 1 1 0 0 0 69.1% --- sstables/partition.cc | 14 +++++++++++++- tests/sstable_mutation_test.cc | 14 -------------- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/sstables/partition.cc b/sstables/partition.cc index d59920fe2f..86ecbc34a7 100644 --- a/sstables/partition.cc +++ b/sstables/partition.cc @@ -142,6 +142,9 @@ class sstable_mutation_reader : public mp_row_consumer_reader { read_monitor& _monitor; std::optional _current_partition_key; bool _partition_finished = true; + // when set, the consumer is positioned right before a partition. + // _index_in_current_partition applies to the partition which is about to be read. + bool _before_partition = true; public: sstable_mutation_reader(shared_sstable sst, schema_ptr schema, const io_priority_class &pc, @@ -250,6 +253,7 @@ private: } future<> advance_to_next_partition() { sstlog.trace("reader {}: advance_to_next_partition()", this); + _before_partition = true; auto& consumer = _consumer; if (consumer.is_mutation_end()) { sstlog.trace("reader {}: already at partition boundary", this); @@ -311,7 +315,9 @@ private: future<> read_partition() { sstlog.trace("reader {}: reading partition", this); + _end_of_stream = true; // on_next_partition() will set it to true if (!_read_enabled) { + sstlog.trace("reader {}: eof", this); return make_ready_future<>(); } @@ -391,6 +397,7 @@ private: } void on_next_partition(dht::decorated_key key, tombstone tomb) { _partition_finished = false; + _before_partition = false; _end_of_stream = false; _current_partition_key = std::move(key); push_mutation_fragment( @@ -422,6 +429,7 @@ public: } else { clear_buffer(); _partition_finished = true; + _before_partition = true; _end_of_stream = false; assert(_index_reader); auto f1 = _index_reader->advance_to(pr); @@ -456,7 +464,11 @@ public: } return do_until([this] { return is_end_of_stream() || is_buffer_full(); }, [this] { if (_partition_finished) { - return read_next_partition(); + if (_before_partition) { + return read_partition(); + } else { + return read_next_partition(); + } } else { return do_until([this] { return is_buffer_full() || _partition_finished || _end_of_stream; }, [this] { _consumer.push_ready_fragments(); diff --git a/tests/sstable_mutation_test.cc b/tests/sstable_mutation_test.cc index 160682881c..b7fbd214fd 100644 --- a/tests/sstable_mutation_test.cc +++ b/tests/sstable_mutation_test.cc @@ -1325,20 +1325,6 @@ SEASTAR_TEST_CASE(test_no_index_reads_when_rows_fall_into_range_boundaries) { auto before = index_accesses(); - { - auto slice = partition_slice_builder(*s).with_ranges({ - ss.make_ckey_range(0, 3), - ss.make_ckey_range(5, 8) - }).build(); - - assert_that(ms.make_reader(s, query::full_partition_range, slice)) - .produces(m1) - .produces(m2) - .produces_end_of_stream(); - - BOOST_REQUIRE_EQUAL(index_accesses(), before); - } - { assert_that(ms.make_reader(s)) .produces(m1)