Files
scylladb/sstables/index_reader.hh

479 lines
18 KiB
C++

/*
* Copyright (C) 2015 ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include "sstables.hh"
#include "consumer.hh"
#include "downsampling.hh"
#include "sstables/shared_index_lists.hh"
namespace sstables {
// Collects parsed index entries into `indexes`, up to a fixed quantity.
//
// Provides the should_continue()/consume_entry() pair expected from an
// IndexConsumer, plus reset() to drop everything collected so far.
class index_consumer {
    // Upper bound on the number of entries to collect.
    uint64_t max_quantity;
public:
    // Entries collected so far, in the order they were consumed.
    index_list indexes;

    // Reserves room for `q` entries up front.
    index_consumer(uint64_t q)
        : max_quantity(q)
    {
        indexes.reserve(q);
    }

    // True while the limit has not been reached yet.
    bool should_continue() {
        return max_quantity > indexes.size();
    }

    // Stores the entry. The offset argument is not used by this consumer.
    void consume_entry(index_entry&& ie, uint64_t offset) {
        indexes.push_back(std::move(ie));
    }

    // Discards all collected entries.
    void reset() {
        indexes.clear();
    }
};
// IndexConsumer is a concept that implements:
//
// bool should_continue();
// void consume_entry(index_entry&& ie, uint64_t offset);
// State machine that parses consecutive index entries from an input stream and
// hands each complete entry to an IndexConsumer.
//
// Per entry, the fields are read in this order: a 16-bit key length, the key
// bytes, a 64-bit position, a 32-bit promoted-index length and the promoted-index
// bytes. The actual buffering and reading is done by the
// data_consumer::continuous_data_consumer base class (not visible in this file),
// which calls back into process_state().
template <class IndexConsumer>
class index_consume_entry_context: public data_consumer::continuous_data_consumer<index_consume_entry_context<IndexConsumer>> {
using proceed = data_consumer::proceed;
using continuous_data_consumer = data_consumer::continuous_data_consumer<index_consume_entry_context<IndexConsumer>>;
private:
// Receiver of the parsed entries; held by reference, so it must outlive this context.
IndexConsumer& _consumer;
// Offset reported to the consumer for the entry currently being parsed.
// Starts at the `start` passed to the constructor and is advanced by the
// computed entry length after each entry is consumed.
uint64_t _entry_offset;
// Each value names the step that process_state() will execute next.
enum class state {
START,
KEY_SIZE,
KEY_BYTES,
POSITION,
PROMOTED_SIZE,
PROMOTED_BYTES,
CONSUME_ENTRY,
} _state = state::START;
// Buffers filled by read_bytes() for the current entry; moved into the
// index_entry in the CONSUME_ENTRY step.
temporary_buffer<char> _key;
temporary_buffer<char> _promoted;
public:
// Intentionally empty: this context performs no check of its final state.
void verify_end_state() {
}
// Returns true when the next step is START or CONSUME_ENTRY, or when it is
// PROMOTED_BYTES while the base class's _prestate is NONE.
// NOTE(review): presumably the base class uses this to decide whether
// process_state() may be invoked without more input data - confirm in consumer.hh.
bool non_consuming() const {
return ((_state == state::CONSUME_ENTRY) || (_state == state::START) ||
((_state == state::PROMOTED_BYTES) && (continuous_data_consumer::_prestate == continuous_data_consumer::prestate::NONE)));
}
// Executes parsing steps starting from _state.
//
// Every read step follows the same pattern: if the read does not report
// read_status::ready, the *next* state is recorded and the function returns
// proceed::yes; otherwise control falls through into the next case
// immediately. The missing `break` between cases is therefore deliberate.
// NOTE(review): presumably a non-ready read is completed by the base class
// before process_state() is called again - confirm in consumer.hh.
//
// Returns proceed::no only from START, when the consumer does not want more
// entries. Throws malformed_sstable_exception on an unknown state value.
proceed process_state(temporary_buffer<char>& data) {
switch (_state) {
// START comes first, to make the handling of the 0-quantity case simpler
case state::START:
if (!_consumer.should_continue()) {
return proceed::no;
}
_state = state::KEY_SIZE;
break;
case state::KEY_SIZE:
if (this->read_16(data) != continuous_data_consumer::read_status::ready) {
_state = state::KEY_BYTES;
break;
}
// fall through: key length is available in _u16
case state::KEY_BYTES:
if (this->read_bytes(data, this->_u16, _key) != continuous_data_consumer::read_status::ready) {
_state = state::POSITION;
break;
}
// fall through: _key is filled
case state::POSITION:
if (this->read_64(data) != continuous_data_consumer::read_status::ready) {
_state = state::PROMOTED_SIZE;
break;
}
// fall through: position is available in _u64
case state::PROMOTED_SIZE:
if (this->read_32(data) != continuous_data_consumer::read_status::ready) {
_state = state::PROMOTED_BYTES;
break;
}
// fall through: promoted-index length is available in _u32
case state::PROMOTED_BYTES:
if (this->read_bytes(data, this->_u32, _promoted) != continuous_data_consumer::read_status::ready) {
_state = state::CONSUME_ENTRY;
break;
}
// fall through: _promoted is filled
case state::CONSUME_ENTRY: {
// Length of the entry just parsed: variable-size parts plus 14 bytes,
// which matches the fixed-width fields read above (2 + 8 + 4).
// Computed before the buffers are moved from.
auto len = (_key.size() + _promoted.size() + 14);
_consumer.consume_entry(index_entry(std::move(_key), this->_u64, std::move(_promoted)), _entry_offset);
_entry_offset += len;
_state = state::START;
}
break;
default:
throw malformed_sstable_exception("unknown state");
}
return proceed::yes;
}
// consumer: receiver of parsed entries (stored by reference).
// input:    stream to parse; ownership is passed to the base class.
// start:    forwarded to the base class and used as the offset of the first entry.
// maxlen:   forwarded to the base class.
index_consume_entry_context(IndexConsumer& consumer,
input_stream<char>&& input, uint64_t start, uint64_t maxlen)
: continuous_data_consumer(std::move(input), start, maxlen)
, _consumer(consumer), _entry_offset(start)
{}
// Restarts parsing bookkeeping: next step becomes START, the entry offset is
// set to `offset`, and the consumer is reset as well.
void reset(uint64_t offset) {
_state = state::START;
_entry_offset = offset;
_consumer.reset();
}
};
// Less-comparator for lookups in the partition index.
// Heterogeneous "less than" predicate for binary searches in the partition index.
//
// Orders summary entries and index entries against a ring position, in both
// argument orders, so it can be passed to the standard binary-search algorithms.
class index_comparator {
    dht::ring_position_comparator _tri_cmp;

    // Three-way comparison of the entry's key against the ring position.
    template <typename Entry>
    auto compare_key(const Entry& entry, dht::ring_position_view rp) const {
        return _tri_cmp(entry.get_key(), rp);
    }
public:
    index_comparator(const schema& s)
        : _tri_cmp(s)
    {}

    // entry < position
    bool operator()(const summary_entry& e, dht::ring_position_view rp) const {
        return compare_key(e, rp) < 0;
    }
    bool operator()(const index_entry& e, dht::ring_position_view rp) const {
        return compare_key(e, rp) < 0;
    }

    // position < entry
    bool operator()(dht::ring_position_view rp, const summary_entry& e) const {
        return compare_key(e, rp) > 0;
    }
    bool operator()(dht::ring_position_view rp, const index_entry& e) const {
        return compare_key(e, rp) > 0;
    }
};
// Provides access to sstable indexes.
//
// Maintains logical cursor to sstable elements (partitions, cells).
// Initially the cursor is positioned on the first partition in the sstable.
// The cursor can be advanced forward using advance_to().
//
// If eof() then the cursor is positioned past all partitions in the sstable.
// Cursor over the partition index of a single sstable.
//
// Lookups go in two levels: the sstable summary selects an index page, and the
// page (a list of index entries obtained via _sstable->_index_lists.get_or_load())
// is then searched for the partition. Within a partition the cursor can be
// moved further using the entry's promoted index, if it has one.
//
// All advancing operations return futures; the object must stay alive until
// they resolve because the continuations capture `this`.
class index_reader {
shared_sstable _sstable;
// Index page the cursor currently points into. Empty until a page has been
// loaded and after advance_to_end().
shared_index_lists::list_ptr _current_list;
const io_priority_class& _pc;
// Couples an index_consumer with the parsing context that feeds it from a
// file input stream over a byte range of the index file.
struct reader {
index_consumer _consumer;
index_consume_entry_context<index_consumer> _context;
// Builds an input stream over bytes [begin, end) of the sstable's index file,
// using the sstable's buffer size, a read-ahead of 2 and the given I/O priority class.
static auto create_file_input_stream(shared_sstable sst, const io_priority_class& pc, uint64_t begin, uint64_t end) {
file_input_stream_options options;
options.buffer_size = sst->sstable_buffer_size;
options.read_ahead = 2;
options.io_priority_class = pc;
return make_file_input_stream(sst->_index_file, begin, end - begin, std::move(options));
}
// begin/end: byte range of the index file to parse.
// quantity:  maximum number of entries the consumer will collect.
// _context keeps a reference to _consumer, so member declaration order matters.
reader(shared_sstable sst, const io_priority_class& pc, uint64_t begin, uint64_t end, uint64_t quantity)
: _consumer(quantity)
, _context(_consumer, create_file_input_stream(sst, pc, begin, end), begin, end - begin)
{ }
};
// Reader used by the most recent page load, if any; it is closed and replaced
// on the next load and dropped by advance_to_end().
stdx::optional<reader> _reader;
// Result of the last summary lower_bound in advance_to(ring_position_view);
// also the starting point of the next such search.
uint64_t _previous_summary_idx = 0;
// Summary index of the page held in _current_list.
uint64_t _current_summary_idx = 0;
// Index of the current entry within _current_list.
uint64_t _current_index_idx = 0;
// Index of the current block within the current entry's promoted index.
uint64_t _current_pi_idx = 0;
// Data file position of the cursor.
uint64_t _data_file_position = 0;
// Kind of element found at _data_file_position.
indexable_element _element = indexable_element::partition;
private:
// Moves the cursor past all partitions: the position becomes data_file_end(),
// the current page is dropped, and the reader is closed and then discarded
// (the discard is attached with finally()).
future<> advance_to_end() {
_data_file_position = data_file_end();
_element = indexable_element::partition;
_current_list = {};
return close_reader().finally([this] {
_reader = stdx::nullopt;
});
}
// Positions the cursor on the first entry of the index page selected by summary_idx.
//
// No-op if that page is already current. If summary_idx is past the last
// summary entry, the cursor is moved to the end instead.
//
// Must be called for non-decreasing summary_idx.
future<> advance_to_page(uint64_t summary_idx) {
sstlog.trace("index {}: advance_to_page({})", this, summary_idx);
assert(!_current_list || _current_summary_idx <= summary_idx);
if (_current_list && _current_summary_idx == summary_idx) {
sstlog.trace("index {}: same page", this);
return make_ready_future<>();
}
auto& summary = _sstable->get_summary();
if (summary_idx >= summary.header.size) {
sstlog.trace("index {}: eof", this);
return advance_to_end();
}
// Reads and parses one page from the index file. Passed to get_or_load() below.
// NOTE(review): presumably get_or_load() invokes this only when the page is
// not already available - confirm in shared_index_lists.hh.
auto loader = [this] (uint64_t summary_idx) -> future<index_list> {
auto& summary = _sstable->get_summary();
// The page spans from this summary entry's position up to the next
// summary entry's position, or to the end of the index file for the last page.
uint64_t position = summary.entries[summary_idx].position;
uint64_t quantity = downsampling::get_effective_index_interval_after_index(summary_idx, summary.header.sampling_level,
summary.header.min_index_interval);
uint64_t end;
if (summary_idx + 1 >= summary.header.size) {
end = _sstable->index_size();
} else {
end = summary.entries[summary_idx + 1].position;
}
// Close the previous reader first. f.get() rethrows a failure of that close;
// on any exception here (from the close or from constructing the new reader)
// _reader is cleared before the exception propagates.
return close_reader().then_wrapped([this, position, end, quantity, summary_idx] (auto&& f) {
try {
f.get();
_reader.emplace(_sstable, _pc, position, end, quantity);
} catch (...) {
_reader = stdx::nullopt;
throw;
}
// Parse the page and hand the collected entries over to the caller.
return _reader->_context.consume_input(_reader->_context).then([this] {
return std::move(_reader->_consumer.indexes);
});
});
};
return _sstable->_index_lists.get_or_load(summary_idx, loader).then([this, summary_idx] (shared_index_lists::list_ptr ref) {
// Page is available: reset the cursor to its first entry.
_current_list = std::move(ref);
_current_summary_idx = summary_idx;
_current_index_idx = 0;
_current_pi_idx = 0;
assert(!_current_list->empty());
_data_file_position = (*_current_list)[0].position();
_element = indexable_element::partition;
// Dumping the page is costly, so it is done only when trace logging is enabled.
if (sstlog.is_enabled(seastar::log_level::trace)) {
sstlog.trace("index {}: page:", this);
for (const index_entry& e : *_current_list) {
auto dk = dht::global_partitioner().decorate_key(*_sstable->_schema,
e.get_key().to_partition_key(*_sstable->_schema));
sstlog.trace(" {} -> {}", dk, e.position());
}
}
});
}
public:
// Advances the cursor to the lower bound of `range`. An exclusive start bound
// is looked up as "after key". Does nothing if the range has no start bound.
future<> advance_to_start(const dht::partition_range& range) {
if (range.start()) {
return advance_to(dht::ring_position_view(range.start()->value(),
dht::ring_position_view::after_key(!range.start()->is_inclusive())));
}
return make_ready_future<>();
}
// Advances the cursor to the upper bound of `range`. An inclusive end bound
// is looked up as "after key". If the range has no end bound, the cursor is
// moved past all partitions.
future<> advance_to_end(const dht::partition_range& range) {
if (range.end()) {
return advance_to(dht::ring_position_view(range.end()->value(),
dht::ring_position_view::after_key(range.end()->is_inclusive())));
}
return advance_to_end();
}
public:
// sst: sstable whose index is read.
// pc:  I/O priority class for index file reads; stored by reference, so it
//      must outlive this object.
index_reader(shared_sstable sst, const io_priority_class& pc)
: _sstable(std::move(sst))
, _pc(pc)
{
sstlog.trace("index {}: index_reader for {}", this, _sstable->get_filename());
}
// Loads the page for summary_idx and returns its entries, taking them out of
// _current_list via release(); returns an empty list if no page is current
// afterwards (e.g. summary_idx was past the end).
//
// Cannot be used twice on the same summary_idx and together with advance_to().
[[deprecated]]
future<index_list> get_index_entries(uint64_t summary_idx) {
return advance_to_page(summary_idx).then([this] {
return _current_list ? _current_list.release() : index_list();
});
}
public:
// Forwards the cursor to given position in current partition.
//
// Note that the index within partition, unlike the partition index, doesn't cover all keys.
// So this may forward the cursor to some position pos' which precedes pos, even though
// there exist rows with positions in the range [pos', pos].
//
// If the promoted index cannot be obtained (absent, or fetching it throws, which
// is logged as an error), the cursor is left unchanged.
//
// Must be called for non-decreasing positions.
// Must be called only after advanced to some partition and !eof().
future<> advance_to(position_in_partition_view pos) {
sstlog.trace("index {}: advance_to({}), current data_file_pos={}", this, pos, _data_file_position);
if (!_current_list) {
// Page is not read after advancing to the first partition.
// Load it, then retry this call.
return advance_to_page(_current_summary_idx).then([this, pos] {
sstlog.trace("index {}: page done", this);
assert(_current_list);
return advance_to(pos);
});
}
const schema& s = *_sstable->_schema;
index_entry& e = (*_current_list)[_current_index_idx];
promoted_index* pi = nullptr;
// A failure to obtain the promoted index is not fatal: it is logged and
// treated the same as having no promoted index.
try {
pi = e.get_promoted_index(s);
} catch (...) {
sstlog.error("Failed to get promoted index for sstable {}, page {}, index {}: {}", _sstable->get_filename(),
_current_summary_idx, _current_index_idx, std::current_exception());
}
if (!pi) {
sstlog.trace("index {}: no promoted index", this);
return make_ready_future<>();
}
if (sstlog.is_enabled(seastar::log_level::trace)) {
sstlog.trace("index {}: promoted index:", this);
for (auto&& e : pi->entries) {
sstlog.trace(" {}-{}: +{} len={}", e.start, e.end, e.offset, e.width);
}
}
// True when the block ends before pos.
auto cmp_with_end = [pos_cmp = position_in_partition::composite_less_compare(s)]
(const promoted_index::entry& e, position_in_partition_view pos) -> bool {
return pos_cmp(e.end, pos);
};
// Optimize short skips which typically land in the same block
// (also taken when the cursor is already past the last block).
if (_current_pi_idx >= pi->entries.size() || !cmp_with_end(pi->entries[_current_pi_idx], pos)) {
sstlog.trace("index {}: position in current block", this);
return make_ready_future<>();
}
// Find the first block, at or after the current one, which does not end before pos.
auto i = std::lower_bound(pi->entries.begin() + _current_pi_idx, pi->entries.end(), pos, cmp_with_end);
_current_pi_idx = std::distance(pi->entries.begin(), i);
if (i == pi->entries.end()) {
if (!pi->entries.empty()) {
// Skip to last block. Even though we know there are no rows in this block for the range
// we must skip to it in case it contains tombstones relevant for the requested range.
auto& last = pi->entries.back();
_data_file_position = e.position() + last.offset;
_element = indexable_element::cell;
sstlog.trace("index {}: skipping to last block", this);
}
} else {
// Block offsets are relative to the partition's position in the data file.
_data_file_position = e.position() + i->offset;
_element = indexable_element::cell;
sstlog.trace("index {}: skipped to cell", this);
}
sstlog.trace("index {}: data_file_pos={}", this, _data_file_position);
return make_ready_future<>();
}
// Positions the cursor on the first partition which is not smaller than pos (like std::lower_bound).
// Must be called for non-decreasing positions.
future<> advance_to(dht::ring_position_view pos) {
sstlog.trace("index {}: advance_to({}), _previous_summary_idx={}, _current_summary_idx={}", this, pos, _previous_summary_idx, _current_summary_idx);
auto& summary = _sstable->get_summary();
// Search the summary starting from the result of the previous search.
_previous_summary_idx = std::distance(std::begin(summary.entries),
std::lower_bound(summary.entries.begin() + _previous_summary_idx, summary.entries.end(), pos, index_comparator(*_sstable->_schema)));
// The very first summary entry is already not smaller than pos, so the
// cursor is left where it is.
if (_previous_summary_idx == 0) {
sstlog.trace("index {}: first entry", this);
return make_ready_future<>();
}
// The page to search is the one belonging to the last summary entry smaller than pos.
auto summary_idx = _previous_summary_idx - 1;
sstlog.trace("index {}: summary_idx={}", this, summary_idx);
// Despite the requirement that the values of 'pos' in subsequent calls
// are increasing we still may encounter a situation when we try to read
// the previous bucket.
// For example, let's say we have index like this:
// summary: A K ...
// index: A C D F K M N O ...
// Now, we want to get positions for range [G, J]. We start with [G,
// summary look up will tell us to check the first bucket. However, there
// is no G in that bucket so we read the following one to get the
// position (see the advance_to_page() call below). After we've got it, it's time to
// get J] position. Again, summary points us to the first bucket and we
// would hit an assert since the reader is already at the second bucket and we
// cannot go backward.
// The solution is the condition below. If our lookup requires reading
// the previous bucket we assume that the entry doesn't exist and return
// the position of the first one in the current index bucket.
if (summary_idx + 1 == _current_summary_idx) {
return make_ready_future<>();
}
return advance_to_page(summary_idx).then([this, pos, summary_idx] {
index_list& il = *_current_list;
sstlog.trace("index {}: old page index = {}", this, _current_index_idx);
// Search within the page, starting from the current entry.
auto i = std::lower_bound(il.begin() + _current_index_idx, il.end(), pos, index_comparator(*_sstable->_schema));
if (i == il.end()) {
// Every entry in this page is smaller than pos, so the answer is the
// first entry of the next page (or end of sstable if there is none).
sstlog.trace("index {}: not found", this);
return advance_to_page(summary_idx + 1);
}
_current_index_idx = std::distance(il.begin(), i);
_current_pi_idx = 0;
_data_file_position = i->position();
_element = indexable_element::partition;
sstlog.trace("index {}: new page index = {}, pos={}", this, _current_index_idx, _data_file_position);
return make_ready_future<>();
});
}
// Returns position in the data file of the cursor.
// Returns non-decreasing positions.
// When eof(), returns data_file_end().
uint64_t data_file_position() const {
return _data_file_position;
}
// Returns the kind of sstable element the cursor is pointing at.
indexable_element element_kind() const {
return _element;
}
// Returns position right after all partitions in the sstable
uint64_t data_file_end() const {
return _sstable->data_size();
}
// True when the cursor is positioned past all partitions in the sstable.
bool eof() const {
return _data_file_position == data_file_end();
}
private:
// Closes the parsing context of the current reader, if there is one.
// Does not clear _reader; callers do that where needed.
future<> close_reader() {
if (_reader) {
return _reader->_context.close();
}
return make_ready_future<>();
}
public:
// Resolves the data file byte range covering `range`: advances the cursor to
// the range's start, records the position, advances to the range's end and
// records that position as well.
//
// `range` is captured by reference, so the caller must keep it alive until
// the returned future resolves.
future<sstable::disk_read_range> get_disk_read_range(const dht::partition_range& range) {
return advance_to_start(range).then([this, &range] () {
uint64_t start = data_file_position();
return advance_to_end(range).then([this, &range, start] () {
uint64_t end = data_file_position();
return sstable::disk_read_range(start, end);
});
});
}
// Releases the underlying reader, if any.
// NOTE(review): presumably this must be called, and its future awaited,
// before the object is destroyed - confirm against callers.
future<> close() {
return close_reader();
}
};
}