mirror of
https://github.com/scylladb/scylladb.git
synced 2026-05-29 11:10:40 +00:00
sstables: reader: Do not read the head of the partition when index can be used
read_partition() was always called through read_next_partition(), even
if we're at the beginning of the read. read_next_partition() is
supposed to skip to the next partition. It still works when we're
positioned before a partition, it doesn't advance the consumer, but it
clears _index_in_current_partition, because it (correctly) assumes it
corresponds to the partition we're about to leave, not the one we're
about to enter.
This means that index lookups we did in the read initializer will be
disregarded when reading starts, and we'll always start by reading
partition data from the data file. This is suboptimal for reads which
are slicing a large partition and don't need to read the front of the
partition.
Regression introduced in 4b9a34a854.
The fix is to call read_partition() directly when we're positioned at
the beginning of the partition. For that purpose a new flag was
introduced.
test_no_index_reads_when_rows_fall_into_range_boundaries has to be
relaxed, because it assumed that slicing reads will read the head of
the partition.
Refs #3984
Fixes #3992
Tested using:
./build/release/tests/perf/perf_fast_forward_g \
--sstable-format=mc \
--datasets large-part-ds1 \
--run-tests=large-partition-slicing-clustering-keys
Before (focus on aio):
offset read time (s) frags frag/s mad f/s max f/s min f/s aio (KiB) blocked dropped idx hit idx miss idx blk c hit c miss c blk cpu
4000000 1 0.001378 1 726 5 736 102 6 200 4 2 0 1 1 0 0 0 65.8%
After:
offset read time (s) frags frag/s mad f/s max f/s min f/s aio (KiB) blocked dropped idx hit idx miss idx blk c hit c miss c blk cpu
4000000 1 0.001290 1 775 6 788 716 2 136 2 0 0 1 1 0 0 0 69.1%
This commit is contained in:
@@ -142,6 +142,9 @@ class sstable_mutation_reader : public mp_row_consumer_reader {
|
||||
read_monitor& _monitor;
|
||||
std::optional<dht::decorated_key> _current_partition_key;
|
||||
bool _partition_finished = true;
|
||||
// when set, the consumer is positioned right before a partition.
|
||||
// _index_in_current_partition applies to the partition which is about to be read.
|
||||
bool _before_partition = true;
|
||||
public:
|
||||
sstable_mutation_reader(shared_sstable sst, schema_ptr schema,
|
||||
const io_priority_class &pc,
|
||||
@@ -250,6 +253,7 @@ private:
|
||||
}
|
||||
future<> advance_to_next_partition() {
|
||||
sstlog.trace("reader {}: advance_to_next_partition()", this);
|
||||
_before_partition = true;
|
||||
auto& consumer = _consumer;
|
||||
if (consumer.is_mutation_end()) {
|
||||
sstlog.trace("reader {}: already at partition boundary", this);
|
||||
@@ -311,7 +315,9 @@ private:
|
||||
future<> read_partition() {
|
||||
sstlog.trace("reader {}: reading partition", this);
|
||||
|
||||
_end_of_stream = true; // on_next_partition() will set it to true
|
||||
if (!_read_enabled) {
|
||||
sstlog.trace("reader {}: eof", this);
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
@@ -391,6 +397,7 @@ private:
|
||||
}
|
||||
void on_next_partition(dht::decorated_key key, tombstone tomb) {
|
||||
_partition_finished = false;
|
||||
_before_partition = false;
|
||||
_end_of_stream = false;
|
||||
_current_partition_key = std::move(key);
|
||||
push_mutation_fragment(
|
||||
@@ -422,6 +429,7 @@ public:
|
||||
} else {
|
||||
clear_buffer();
|
||||
_partition_finished = true;
|
||||
_before_partition = true;
|
||||
_end_of_stream = false;
|
||||
assert(_index_reader);
|
||||
auto f1 = _index_reader->advance_to(pr);
|
||||
@@ -456,7 +464,11 @@ public:
|
||||
}
|
||||
return do_until([this] { return is_end_of_stream() || is_buffer_full(); }, [this] {
|
||||
if (_partition_finished) {
|
||||
return read_next_partition();
|
||||
if (_before_partition) {
|
||||
return read_partition();
|
||||
} else {
|
||||
return read_next_partition();
|
||||
}
|
||||
} else {
|
||||
return do_until([this] { return is_buffer_full() || _partition_finished || _end_of_stream; }, [this] {
|
||||
_consumer.push_ready_fragments();
|
||||
|
||||
@@ -1325,20 +1325,6 @@ SEASTAR_TEST_CASE(test_no_index_reads_when_rows_fall_into_range_boundaries) {
|
||||
|
||||
auto before = index_accesses();
|
||||
|
||||
{
|
||||
auto slice = partition_slice_builder(*s).with_ranges({
|
||||
ss.make_ckey_range(0, 3),
|
||||
ss.make_ckey_range(5, 8)
|
||||
}).build();
|
||||
|
||||
assert_that(ms.make_reader(s, query::full_partition_range, slice))
|
||||
.produces(m1)
|
||||
.produces(m2)
|
||||
.produces_end_of_stream();
|
||||
|
||||
BOOST_REQUIRE_EQUAL(index_accesses(), before);
|
||||
}
|
||||
|
||||
{
|
||||
assert_that(ms.make_reader(s))
|
||||
.produces(m1)
|
||||
|
||||
Reference in New Issue
Block a user