Files
scylladb/sstables/index_reader.hh
Rafael Ávila de Espíndola 684fb607c4 sstable: handle missing index entry
This patch fixes a crash when the index file is corrupted and we get
an empty index entry list.

Tests: unit (release)

Fixes: 2532

Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20190110202833.29333-1-espindola@scylladb.com>
2019-01-14 10:47:21 +01:00

824 lines
34 KiB
C++

/*
* Copyright (C) 2015 ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include "sstables.hh"
#include "consumer.hh"
#include "downsampling.hh"
#include "sstables/shared_index_lists.hh"
#include <seastar/util/bool_class.hh>
#include "utils/buffer_input_stream.hh"
#include "sstables/prepended_input_stream.hh"
namespace sstables {
// Collects parsed index entries into an index_list.
class index_consumer {
    uint64_t max_quantity;
public:
    // Entries gathered so far, in the order they were consumed.
    index_list indexes;

    // q is the expected number of entries; capacity for that many is reserved
    // up front.
    index_consumer(uint64_t q)
        : max_quantity(q)
    {
        indexes.reserve(q);
    }

    // Takes ownership of the entry and appends it to `indexes`.
    // The offset argument is accepted but not used by this consumer.
    void consume_entry(index_entry&& ie, uint64_t offset) {
        indexes.push_back(std::move(ie));
    }

    // Discards every entry collected so far.
    void reset() {
        indexes.clear();
    }
};
// Typed yes/no flag (a seastar bool_class) passed to index_consume_entry_context.
// When it is not `yes`, the parser does not build a promoted_index object for
// an entry and reports zero promoted index blocks for it.
// See #2993
class trust_promoted_index_tag;
using trust_promoted_index = bool_class<trust_promoted_index_tag>;
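// IndexConsumer is a concept that implements:
//
// void consume_entry(index_entry&& ie, uint64_t offset);
// void reset();
//
// (A `bool should_continue();` member used to be listed here; the parsing
// context below does not call it.)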
//
// TODO: make it templated on SSTables version since the exact format can be passed in at compile time
//
// Resumable parser for a contiguous run of partition index entries.
//
// The input stream covers the index file range [start, start + maxlen).
// Every completed entry is handed to IndexConsumer::consume_entry() together
// with the offset at which the entry started.
template <class IndexConsumer>
class index_consume_entry_context : public data_consumer::continuous_data_consumer<index_consume_entry_context<IndexConsumer>> {
    using proceed = data_consumer::proceed;
    using processing_result = data_consumer::processing_result;
    using continuous_data_consumer = data_consumer::continuous_data_consumer<index_consume_entry_context<IndexConsumer>>;
    using read_status = typename continuous_data_consumer::read_status;
private:
    IndexConsumer& _consumer;
    file _index_file;
    file_input_stream_options _options;
    // Offset at which the entry currently being parsed begins.
    uint64_t _entry_offset;

    // Parser steps. When a primitive read cannot complete from the current
    // buffer, _state is set to the step that follows it and process_state()
    // returns; parsing resumes from that step on the next call.
    // NOTE(review): this relies on the base class completing the pending read
    // before calling process_state() again - confirm against consumer.hh.
    enum class state {
        START,
        KEY_SIZE,
        KEY_BYTES,
        POSITION,
        PROMOTED_SIZE,
        PARTITION_HEADER_LENGTH_1,
        PARTITION_HEADER_LENGTH_2,
        LOCAL_DELETION_TIME,
        MARKED_FOR_DELETE_AT,
        NUM_PROMOTED_INDEX_BLOCKS,
        CONSUME_ENTRY,
    } _state = state::START;

    temporary_buffer<char> _key;
    // Stream position one past the promoted index of the current entry.
    // It is computed from current_pos(), which is an offset into the index
    // file (position() is also used as a file offset when a follow-up stream
    // is opened below), so it has to be 64 bits wide. A 32-bit field truncates
    // the value for index files larger than 4 GiB and makes the promoted
    // index size computed in CONSUME_ENTRY wrap around.
    uint64_t _promoted_index_end;
    uint64_t _position;
    uint64_t _partition_header_length = 0;
    // Engaged only when the entry carries a promoted index header.
    std::optional<deletion_time> _deletion_time;
    uint32_t _num_pi_blocks = 0;
    trust_promoted_index _trust_pi;
    const schema& _s;
    // Engaged for the 'mc' format only; doubles as the format discriminator.
    std::optional<column_values_fixed_lengths> _ck_values_fixed_lengths;

    inline bool is_mc_format() const { return static_cast<bool>(_ck_values_fixed_lengths); }

public:
    // Throws std::runtime_error if input ended while this->_remain is still
    // greater than zero.
    void verify_end_state() {
        if (this->_remain > 0) {
            throw std::runtime_error("index_consume_entry_context - no more data but parsing is incomplete");
        }
    }

    // True in the two states that do not start by reading from the buffer.
    bool non_consuming() const {
        return ((_state == state::CONSUME_ENTRY) || (_state == state::START));
    }

    // Advances the state machine using the bytes in `data`.
    //
    // Returns proceed::yes in the normal case, or skip_bytes{n} when the
    // remainder of a promoted index lies beyond the current buffer and has to
    // be skipped in the main stream.
    //
    // The case labels intentionally fall through: each step runs the next one
    // immediately when its read completed from the buffer.
    processing_result process_state(temporary_buffer<char>& data) {
        // Offset of the first unconsumed byte of `data`.
        auto current_pos = [&] { return this->position() - data.size(); };
        // 'mc' stores these fields as unsigned vints, older formats as fixed-width integers.
        auto read_vint_or_uint64 = [this] (temporary_buffer<char>& data) {
            return is_mc_format() ? this->read_unsigned_vint(data) : this->read_64(data);
        };
        auto read_vint_or_uint32 = [this] (temporary_buffer<char>& data) {
            return is_mc_format() ? this->read_unsigned_vint(data) : this->read_32(data);
        };
        // Fetches the value produced by read_vint_or_uint32().
        auto get_uint32 = [this] {
            return is_mc_format() ? static_cast<uint32_t>(this->_u64) : this->_u32;
        };

        switch (_state) {
        // START comes first, to make the handling of the 0-quantity case simpler
        case state::START:
            _state = state::KEY_SIZE;
            break;
        case state::KEY_SIZE:
            _entry_offset = current_pos();
            if (this->read_16(data) != continuous_data_consumer::read_status::ready) {
                _state = state::KEY_BYTES;
                break;
            }
            // fall through
        case state::KEY_BYTES:
            if (this->read_bytes(data, this->_u16, _key) != continuous_data_consumer::read_status::ready) {
                _state = state::POSITION;
                break;
            }
            // fall through
        case state::POSITION:
            if (read_vint_or_uint64(data) != continuous_data_consumer::read_status::ready) {
                _state = state::PROMOTED_SIZE;
                break;
            }
            // fall through
        case state::PROMOTED_SIZE:
            _position = this->_u64;
            if (read_vint_or_uint32(data) != continuous_data_consumer::read_status::ready) {
                _state = state::PARTITION_HEADER_LENGTH_1;
                break;
            }
            // fall through
        case state::PARTITION_HEADER_LENGTH_1: {
            auto promoted_index_size_with_header = get_uint32();
            _promoted_index_end = current_pos() + promoted_index_size_with_header;
            if (promoted_index_size_with_header == 0) {
                // No promoted index at all: the entry is complete.
                _state = state::CONSUME_ENTRY;
                goto state_CONSUME_ENTRY;
            }
            if (!is_mc_format()) {
                // SSTables ka/la don't have a partition_header_length field
                _state = state::LOCAL_DELETION_TIME;
                goto state_LOCAL_DELETION_TIME;
            }
            if (this->read_unsigned_vint(data) != continuous_data_consumer::read_status::ready) {
                _state = state::PARTITION_HEADER_LENGTH_2;
                break;
            }
        }
            // fall through
        case state::PARTITION_HEADER_LENGTH_2:
            _partition_header_length = this->_u64;
            // fall through
        state_LOCAL_DELETION_TIME:
        case state::LOCAL_DELETION_TIME:
            _deletion_time.emplace();
            if (this->read_32(data) != continuous_data_consumer::read_status::ready) {
                _state = state::MARKED_FOR_DELETE_AT;
                break;
            }
            // fall through
        case state::MARKED_FOR_DELETE_AT:
            _deletion_time->local_deletion_time = this->_u32;
            if (this->read_64(data) != continuous_data_consumer::read_status::ready) {
                _state = state::NUM_PROMOTED_INDEX_BLOCKS;
                break;
            }
            // fall through
        case state::NUM_PROMOTED_INDEX_BLOCKS:
            _deletion_time->marked_for_delete_at = this->_u64;
            if (read_vint_or_uint32(data) != continuous_data_consumer::read_status::ready) {
                _state = state::CONSUME_ENTRY;
                break;
            }
            // fall through
        state_CONSUME_ENTRY:
        case state::CONSUME_ENTRY: {
            // Bytes of promoted index that remain after the header fields read so far.
            auto promoted_index_size = _promoted_index_end - current_pos();
            if (_deletion_time) {
                _num_pi_blocks = get_uint32();
            }
            auto data_size = data.size();
            std::optional<input_stream<char>> promoted_index_stream;
            if ((_trust_pi == trust_promoted_index::yes) && (promoted_index_size > 0)) {
                if (promoted_index_size <= data_size) {
                    // The whole promoted index is already buffered.
                    auto buf = data.share();
                    buf.trim(promoted_index_size);
                    promoted_index_stream = make_buffer_input_stream(std::move(buf));
                } else {
                    // Buffered prefix followed by a fresh stream over the rest of it.
                    promoted_index_stream = make_prepended_input_stream(
                            std::move(data),
                            make_file_input_stream(_index_file, this->position(), promoted_index_size - data_size, _options).detach());
                }
            } else {
                _num_pi_blocks = 0;
            }

            std::unique_ptr<promoted_index> index;
            if (promoted_index_stream) {
                if (is_mc_format()) {
                    index = std::make_unique<promoted_index>(_s, *_deletion_time, std::move(*promoted_index_stream),
                                                             promoted_index_size,
                                                             _num_pi_blocks, *_ck_values_fixed_lengths);
                } else {
                    index = std::make_unique<promoted_index>(_s, *_deletion_time, std::move(*promoted_index_stream),
                                                             promoted_index_size, _num_pi_blocks);
                }
            }
            _consumer.consume_entry(index_entry{std::move(_key), _position, std::move(index)}, _entry_offset);

            // Reset the per-entry state before parsing the next entry.
            _deletion_time = std::nullopt;
            _num_pi_blocks = 0;
            _state = state::START;

            // Step over the promoted index bytes in the main stream.
            if (promoted_index_size <= data_size) {
                data.trim_front(promoted_index_size);
            } else {
                data.trim(0);
                return skip_bytes{promoted_index_size - data_size};
            }
        }
            break;
        }
        return proceed::yes;
    }

    // Parses the range [start, start + maxlen) of index_file.
    // ck_values_fixed_lengths must be engaged for 'mc' sstables and disengaged otherwise.
    index_consume_entry_context(IndexConsumer& consumer, trust_promoted_index trust_pi, const schema& s,
            file index_file, file_input_stream_options options, uint64_t start,
            uint64_t maxlen, std::optional<column_values_fixed_lengths> ck_values_fixed_lengths)
        : continuous_data_consumer(make_file_input_stream(index_file, start, maxlen, options), start, maxlen)
        , _consumer(consumer), _index_file(index_file), _options(options)
        , _entry_offset(start), _trust_pi(trust_pi), _s(s), _ck_values_fixed_lengths(std::move(ck_values_fixed_lengths))
    {}

    // Returns the parser to its initial state and resets the consumer.
    void reset(uint64_t offset) {
        _state = state::START;
        _entry_offset = offset;
        _consumer.reset();
    }
};
// Less-comparator for lookups in the partition index.
//
// Orders summary/index entries against a ring position in either argument
// order, so it can be used with both std::lower_bound and std::upper_bound.
class index_comparator {
    dht::ring_position_comparator _tri_cmp;

    // Three-way comparison of the entry's decorated key with rp.
    template <typename Entry>
    auto compare(const Entry& entry, dht::ring_position_view rp) const {
        return _tri_cmp(entry.get_decorated_key(), rp);
    }
public:
    index_comparator(const schema& s) : _tri_cmp(s) {}

    // entry < position
    bool operator()(const summary_entry& e, dht::ring_position_view rp) const {
        return compare(e, rp) < 0;
    }
    bool operator()(const index_entry& e, dht::ring_position_view rp) const {
        return compare(e, rp) < 0;
    }

    // position < entry
    bool operator()(dht::ring_position_view rp, const summary_entry& e) const {
        return compare(e, rp) > 0;
    }
    bool operator()(dht::ring_position_view rp, const index_entry& e) const {
        return compare(e, rp) > 0;
    }
};
// Closes the promoted index stream of every entry in `list` and then resets
// `list` to empty (also when closing failed). A null `list` is a no-op.
// `list` is captured by reference, so it must outlive the returned future.
inline static
future<> close_index_list(shared_index_lists::list_ptr& list) {
    if (!list) {
        return make_ready_future<>();
    }
    return parallel_for_each(*list, [] (index_entry& entry) {
        return entry.close_pi_stream();
    }).finally([&list] {
        list = {};
    });
}
// Provides access to sstable indexes.
//
// Maintains logical cursors to sstable elements (partitions, cells).
// Holds two cursors delimiting a range within the sstable (the upper cursor may be unset).
// Initially the lower cursor is positioned on the first partition in the sstable.
// Lower cursor can be accessed and advanced from outside.
// Upper cursor can only be advanced along with the lower cursor and not accessed from outside.
//
// If eof() then the lower bound cursor is positioned past all partitions in the sstable.
class index_reader {
shared_sstable _sstable;
const io_priority_class& _pc;
shared_index_lists _index_lists;
struct reader {
index_consumer _consumer;
index_consume_entry_context<index_consumer> _context;
inline static file_input_stream_options get_file_input_stream_options(shared_sstable sst, const io_priority_class& pc) {
file_input_stream_options options;
options.buffer_size = sst->sstable_buffer_size;
options.read_ahead = 2;
options.io_priority_class = pc;
return options;
}
reader(shared_sstable sst, const io_priority_class& pc, uint64_t begin, uint64_t end, uint64_t quantity)
: _consumer(quantity)
, _context(_consumer,
trust_promoted_index(sst->has_correct_promoted_index_entries()), *sst->_schema, sst->_index_file,
get_file_input_stream_options(sst, pc), begin, end - begin,
(sst->get_version() == sstable_version_types::mc
? std::make_optional(get_clustering_values_fixed_lengths(sst->get_serialization_header()))
: std::optional<column_values_fixed_lengths>{}))
{ }
};
// Stores information about the open end RT (range tombstone) marker
// of the lower index bound.
struct open_rt_marker {
    // Position taken from the end of the promoted index block preceding the cursor.
    position_in_partition pos;
    // Tombstone built from that block's end open marker.
    tombstone tomb;
};
// Contains information about index_reader position in the index file
struct index_bound {
    // Index page the cursor is on; null until a page has been loaded and
    // after the cursor was advanced to the end.
    shared_index_lists::list_ptr current_list;
    // Result of the last summary lookup done by advance_to(); later lookups
    // start searching from here.
    uint64_t previous_summary_idx = 0;
    // Summary index of the page held in current_list.
    uint64_t current_summary_idx = 0;
    // Index of the current entry within current_list.
    uint64_t current_index_idx = 0;
    uint64_t current_pi_idx = 0; // Points to upper bound of the cursor.
    // Position in the data file the cursor corresponds to.
    uint64_t data_file_position = 0;
    // Kind of element found at data_file_position.
    indexable_element element = indexable_element::partition;
    // Set only for the lower bound, see get_info_from_promoted_block().
    std::optional<open_rt_marker> end_open_marker;
};
index_bound _lower_bound;
// Upper bound may remain uninitialized
std::optional<index_bound> _upper_bound;
private:
// Puts `bound` past all partitions: drops its page, clears the open marker
// and points it at the end of the data file.
void advance_to_end(index_bound& bound) {
    sstlog.trace("index {}: advance_to_end() bound {}", this, &bound);
    bound.current_list = {};
    bound.end_open_marker.reset();
    bound.element = indexable_element::partition;
    bound.data_file_position = data_file_end();
}
// Positions `bound` on the first entry of the index page belonging to the
// given summary entry, loading the page through _index_lists if needed.
// A summary_idx past the last summary entry moves the bound to the end.
//
// Must be called for non-decreasing summary_idx.
// Fails with malformed_sstable_exception if the loaded page has no entries.
future<> advance_to_page(index_bound& bound, uint64_t summary_idx) {
    sstlog.trace("index {}: advance_to_page({}), bound {}", this, summary_idx, &bound);
    assert(!bound.current_list || bound.current_summary_idx <= summary_idx);

    const bool already_there = bound.current_list && bound.current_summary_idx == summary_idx;
    if (already_there) {
        sstlog.trace("index {}: same page", this);
        return make_ready_future<>();
    }

    auto& summary = _sstable->get_summary();
    if (summary_idx >= summary.header.size) {
        sstlog.trace("index {}: eof", this);
        advance_to_end(bound);
        return make_ready_future<>();
    }

    // Reads and parses the index file range covered by one summary entry.
    auto loader = [this] (uint64_t page_idx) -> future<index_list> {
        auto& summary = _sstable->get_summary();
        const uint64_t position = summary.entries[page_idx].position;
        const uint64_t quantity = downsampling::get_effective_index_interval_after_index(page_idx, summary.header.sampling_level,
            summary.header.min_index_interval);

        // The page ends where the next one starts, or at the end of the index file.
        const bool is_last_page = page_idx + 1 >= summary.header.size;
        const uint64_t end = is_last_page ? _sstable->index_size() : summary.entries[page_idx + 1].position;

        return do_with(std::make_unique<reader>(_sstable, _pc, position, end, quantity), [] (auto& entries_reader) {
            return entries_reader->_context.consume_input().then([&entries_reader] {
                auto parsed = std::move(entries_reader->_consumer.indexes);
                return entries_reader->_context.close().then([parsed = std::move(parsed)] () mutable {
                    return std::move(parsed);
                });
            });
        });
    };

    return _index_lists.get_or_load(summary_idx, loader).then([this, &bound, summary_idx] (shared_index_lists::list_ptr ref) {
        bound.current_list = std::move(ref);
        bound.current_summary_idx = summary_idx;
        bound.current_index_idx = 0;
        bound.current_pi_idx = 0;

        auto& page = *bound.current_list;
        if (page.empty()) {
            throw malformed_sstable_exception("missing index entry", _sstable->filename(component_type::Index));
        }
        bound.data_file_position = page[0].position();
        bound.element = indexable_element::partition;
        bound.end_open_marker.reset();

        if (!sstlog.is_enabled(seastar::log_level::trace)) {
            return;
        }
        sstlog.trace("index {} bound {}: page:", this, &bound);
        for (const index_entry& e : page) {
            auto dk = dht::global_partitioner().decorate_key(*_sstable->_schema,
                e.get_key().to_partition_key(*_sstable->_schema));
            sstlog.trace(" {} -> {}", dk, e.position());
        }
    });
}
// Moves the lower bound to the start of `range`. A range without a start
// bound leaves the lower bound where it is.
future<> advance_lower_to_start(const dht::partition_range &range) {
    const auto& start_bound = range.start();
    if (!start_bound) {
        return make_ready_future<>();
    }
    // An exclusive start means the cursor has to land after the key.
    const auto after = dht::ring_position_view::after_key(!start_bound->is_inclusive());
    return advance_to(_lower_bound, dht::ring_position_view(start_bound->value(), after));
}
// Moves the upper bound (creating it if necessary) to the end of `range`.
// A range without an end bound moves it past all partitions.
future<> advance_upper_to_end(const dht::partition_range &range) {
    if (!_upper_bound) {
        _upper_bound.emplace();
    }
    const auto& end_bound = range.end();
    if (!end_bound) {
        advance_to_end(*_upper_bound);
        return make_ready_future<>();
    }
    // An inclusive end means the cursor has to land after the key.
    const auto after = dht::ring_position_view::after_key(end_bound->is_inclusive());
    return advance_to(*_upper_bound, dht::ring_position_view(end_bound->value(), after));
}
// Tells whether details about the current partition of `bound` can be
// accessed, i.e. whether an index page is loaded for it.
// If this returns false, you have to call read_partition_data().
//
// read_partition_data() may do I/O. Control over it is left to the caller
// because in some cases using partition details from the index only makes
// sense when they are readily available; otherwise it is better to obtain
// them by continuing to read from the sstable.
bool partition_data_ready(const index_bound& bound) const {
    return bound.current_list ? true : false;
}
// Returns the index entry `bound` currently points at.
// Valid if partition_data_ready(bound)
index_entry& current_partition_entry(index_bound& bound) {
    assert(bound.current_list);
    auto& page = *bound.current_list;
    return page[bound.current_index_idx];
}
// Moves `bound` to the partition following the current one.
//
// If no page is loaded yet, the first page is loaded first. When the current
// page is exhausted the next page is loaded, and after the last page the
// bound is moved to the end.
future<> advance_to_next_partition(index_bound& bound) {
    // Arguments follow the format string: reader first, then the bound.
    sstlog.trace("index {} bound {}: advance_to_next_partition()", this, &bound);
    if (!partition_data_ready(bound)) {
        return advance_to_page(bound, 0).then([this, &bound] {
            return advance_to_next_partition(bound);
        });
    }
    if (bound.current_index_idx + 1 < bound.current_list->size()) {
        // Next entry within the same page.
        ++bound.current_index_idx;
        bound.current_pi_idx = 0;
        bound.data_file_position = (*bound.current_list)[bound.current_index_idx].position();
        bound.element = indexable_element::partition;
        bound.end_open_marker.reset();
        return make_ready_future<>();
    }
    auto& summary = _sstable->get_summary();
    if (bound.current_summary_idx + 1 < summary.header.size) {
        return advance_to_page(bound, bound.current_summary_idx + 1);
    }
    advance_to_end(bound);
    return make_ready_future<>();
}
// Advances `bound` towards the given ring position: looks up the summary to
// pick the index page, then searches that page for the first entry which is
// not smaller than pos. A minimum position leaves the bound untouched, a
// maximum position moves it to the end.
future<> advance_to(index_bound& bound, dht::ring_position_view pos) {
    sstlog.trace("index {} bound {}: advance_to({}), _previous_summary_idx={}, _current_summary_idx={}",
        this, &bound, pos, bound.previous_summary_idx, bound.current_summary_idx);
    if (pos.is_min()) {
        sstlog.trace("index {}: first entry", this);
        return make_ready_future<>();
    }
    if (pos.is_max()) {
        advance_to_end(bound);
        return make_ready_future<>();
    }

    // Summary lookups resume from where the previous one stopped.
    auto& summary = _sstable->get_summary();
    const auto search_from = summary.entries.begin() + bound.previous_summary_idx;
    const auto summary_it = std::lower_bound(search_from, summary.entries.end(), pos, index_comparator(*_sstable->_schema));
    bound.previous_summary_idx = std::distance(std::begin(summary.entries), summary_it);

    if (bound.previous_summary_idx == 0) {
        sstlog.trace("index {}: first entry", this);
        return make_ready_future<>();
    }

    const auto summary_idx = bound.previous_summary_idx - 1;
    sstlog.trace("index {}: summary_idx={}", this, summary_idx);

    // Despite the requirement that the values of 'pos' in subsequent calls
    // are increasing we still may encounter a situation when we try to read
    // the previous bucket.
    // For example, let's say we have index like this:
    // summary: A K ...
    // index: A C D F K M N O ...
    // Now, we want to get positions for range [G, J]. We start with [G,
    // summary look up will tell us to check the first bucket. However, there
    // is no G in that bucket so we read the following one to get the
    // position (see the advance_to_page() call below). After we've got it, it's time to
    // get J] position. Again, summary points us to the first bucket and we
    // would hit an assert since the reader is already at the second bucket and we
    // cannot go backward.
    // The solution is the condition below. If our lookup requires reading
    // the previous bucket we assume that the entry doesn't exist and return
    // the position of the first one in the current index bucket.
    if (summary_idx + 1 == bound.current_summary_idx) {
        return make_ready_future<>();
    }

    return advance_to_page(bound, summary_idx).then([this, &bound, pos, summary_idx] {
        sstlog.trace("index {}: old page index = {}", this, bound.current_index_idx);
        auto& page = *bound.current_list;
        auto match = std::lower_bound(std::begin(page) + bound.current_index_idx, std::end(page), pos, index_comparator(*_sstable->_schema));
        if (match == std::end(page)) {
            // Nothing suitable in this page; the answer is the first entry of the next one.
            sstlog.trace("index {}: not found", this);
            return advance_to_page(bound, summary_idx + 1);
        }
        bound.current_index_idx = std::distance(std::begin(page), match);
        bound.current_pi_idx = 0;
        bound.data_file_position = match->position();
        bound.element = indexable_element::partition;
        bound.end_open_marker.reset();
        sstlog.trace("index {}: new page index = {}, pos={}", this, bound.current_index_idx, bound.data_file_position);
        return make_ready_future<>();
    });
}
// Forwards the upper bound cursor to a position which is greater than given position in current partition.
//
// Note that the index within partition, unlike the partition index, doesn't cover all keys.
// So this may not forward to the smallest position which is greater than pos.
//
// May advance to the next partition if it's not possible to find a suitable position inside
// current partition.
//
// Must be called only when !eof().
future<> advance_upper_past(position_in_partition_view pos) {
    sstlog.trace("index {}: advance_upper_past({})", this, pos);

    // We advance cursor within the current lower bound partition
    // So need to make sure first that it is read
    if (!partition_data_ready(_lower_bound)) {
        return read_partition_data().then([this, pos] {
            assert(partition_data_ready());
            return advance_upper_past(pos);
        });
    }

    // An unset upper bound starts out as a copy of the lower bound.
    if (!_upper_bound) {
        _upper_bound = _lower_bound;
    }

    index_entry& e = current_partition_entry(*_upper_bound);
    if (e.get_total_pi_blocks_count() == 0) {
        sstlog.trace("index {}: no promoted index", this);
        return advance_to_next_partition(*_upper_bound);
    }

    // No promoted index block has been read yet: fetch some and retry.
    if (e.get_read_pi_blocks_count() == 0) {
        return e.get_next_pi_blocks().then([this, pos] {
            return advance_upper_past(pos);
        });
    }

    const schema& s = *_sstable->_schema;
    // The schema is captured by reference: the comparator is only used by the
    // std::upper_bound call below, so there is no need to copy the schema
    // into the closure.
    auto cmp_with_start = [pos_cmp = promoted_index_block_compare(s), &s]
            (position_in_partition_view pos, const promoted_index_block& info) -> bool {
        return pos_cmp(pos, info.start(s));
    };

    promoted_index_blocks* pi_blocks = e.get_pi_blocks();
    assert(pi_blocks);
    // First block, at or after the current one, that starts after pos.
    auto i = std::upper_bound(pi_blocks->begin() + _upper_bound->current_pi_idx, pi_blocks->end(), pos, cmp_with_start);
    _upper_bound->current_pi_idx = std::distance(pi_blocks->begin(), i);
    if (i == pi_blocks->end()) {
        return advance_to_next_partition(*_upper_bound);
    }

    _upper_bound->data_file_position = e.position() + i->offset();
    _upper_bound->element = indexable_element::cell;
    sstlog.trace("index {} upper bound: skipped to cell, _current_pi_idx={}, _data_file_position={}",
        this, _upper_bound->current_pi_idx, _upper_bound->data_file_position);
    return make_ready_future<>();
}
// Returns position right after all partitions in the sstable,
// which is the size of its data file.
uint64_t data_file_end() const {
    const uint64_t end_of_data = _sstable->data_size();
    return end_of_data;
}
// Points the lower bound at the promoted index block `iter` of the current
// partition and records the range tombstone left open by the preceding
// block, if there is one.
void get_info_from_promoted_block(const promoted_index_blocks::const_iterator iter,
        const promoted_index_blocks& pi_blocks) {
    const index_entry& entry = current_partition_entry();
    _lower_bound.data_file_position = entry.position() + iter->offset();
    _lower_bound.element = indexable_element::cell;

    const bool preceded_by_open_marker = iter != pi_blocks.cbegin() && std::prev(iter)->end_open_marker();
    if (!preceded_by_open_marker) {
        _lower_bound.end_open_marker.reset();
        return;
    }

    const auto prev = std::prev(iter);
    // End open marker can be only engaged in SSTables 3.x ('mc' format) and never in ka/la
    auto end_pos = prev->end(*_sstable->get_schema());
    position_in_partition_view* open_rt_pos = std::get_if<position_in_partition_view>(&end_pos);
    assert(open_rt_pos);
    _lower_bound.end_open_marker = open_rt_marker{
        position_in_partition{*open_rt_pos},
        tombstone(*prev->end_open_marker())};
}
public:
// Creates a reader over the index of `sst`. No I/O is done here; the lower
// bound starts out with no page loaded.
// `pc` is stored by reference, so it must outlive the reader.
index_reader(shared_sstable sst, const io_priority_class& pc)
    : _sstable(std::move(sst))
    , _pc(pc)
{
    sstlog.trace("index {}: index_reader for {}", this, _sstable->get_filename());
}
// Ensures that partition_data_ready() returns true, loading the first index
// page if no page is loaded yet.
// Can be called only when !eof()
future<> read_partition_data() {
    assert(!eof());
    if (!partition_data_ready(_lower_bound)) {
        // The only case when _current_list may be missing is when the cursor is at the beginning
        assert(_lower_bound.current_summary_idx == 0);
        return advance_to_page(_lower_bound, 0);
    }
    return make_ready_future<>();
}
// Advance index_reader bounds to the bounds of the supplied range.
// The lower bound is moved to the range start and the upper bound to the
// range end; the returned future combines both operations via
// seastar::when_all_succeed().
future<> advance_to(const dht::partition_range& range) {
    return seastar::when_all_succeed(
        advance_lower_to_start(range),
        advance_upper_to_end(range));
}
// Get the index entry the lower bound points at.
// Asserts that a page is loaded for the lower bound.
index_entry& current_partition_entry() {
    index_entry& entry = current_partition_entry(_lower_bound);
    return entry;
}
// Returns tombstone for the current partition if it was recorded in the sstable.
// It may be unavailable for old sstables for which this information was not generated.
// Can be called only when partition_data_ready().
std::optional<sstables::deletion_time> partition_tombstone() {
return current_partition_entry(_lower_bound).get_deletion_time();
}
// Returns the key for current partition.
// Can be called only when partition_data_ready().
// The result is valid as long as index_reader is valid.
key_view partition_key() {
    return current_partition_entry(_lower_bound).get_key();
}
// Tells whether an index page is loaded for the lower bound, i.e. whether
// details of the current partition can be accessed without I/O.
bool partition_data_ready() const {
    return partition_data_ready(_lower_bound);
}
// Forwards the cursor to the given position in the current partition.
//
// Note that the index within partition, unlike the partition index, doesn't cover all keys.
// So this may forward the cursor to some position pos' which precedes pos, even though
// there exist rows with positions in the range [pos', pos].
//
// Must be called for non-decreasing positions.
// Must be called only after advanced to some partition and !eof().
future<> advance_to(position_in_partition_view pos) {
    sstlog.trace("index {}: advance_to({}), current data_file_pos={}",
        this, pos, _lower_bound.data_file_position);

    const schema& s = *_sstable->_schema;
    if (pos.is_before_all_fragments(s)) {
        return make_ready_future<>();
    }

    // The promoted index lives in the partition's index entry; load it first.
    if (!partition_data_ready()) {
        return read_partition_data().then([this, pos] {
            sstlog.trace("index {}: page done", this);
            assert(partition_data_ready(_lower_bound));
            return advance_to(pos);
        });
    }

    index_entry& entry = current_partition_entry();
    const auto total_blocks = entry.get_total_pi_blocks_count();
    if (total_blocks == 0) {
        sstlog.trace("index {}: no promoted index", this);
        return make_ready_future<>();
    }

    const promoted_index_blocks* pi_blocks = entry.get_pi_blocks();
    assert(pi_blocks);

    // Already on the last block and nothing more to read: cannot move further.
    if ((total_blocks == entry.get_read_pi_blocks_count())
            && _lower_bound.current_pi_idx >= pi_blocks->size() - 1) {
        sstlog.trace("index {}: position in current block (all blocks are read)", this);
        return make_ready_future<>();
    }

    auto cmp_with_start = [pos_cmp = promoted_index_block_compare(s), &s]
            (position_in_partition_view pos, const promoted_index_block& info) -> bool {
        return pos_cmp(pos, info.start(s));
    };

    // pos precedes the start of the current block: stay where we are.
    if (!pi_blocks->empty() && cmp_with_start(pos, (*pi_blocks)[_lower_bound.current_pi_idx])) {
        sstlog.trace("index {}: position in current block (exact match)", this);
        return make_ready_future<>();
    }

    // First block, at or after the current one, that starts after pos.
    auto block_it = std::upper_bound(pi_blocks->cbegin() + _lower_bound.current_pi_idx, pi_blocks->cend(), pos, cmp_with_start);
    _lower_bound.current_pi_idx = std::distance(pi_blocks->cbegin(), block_it);

    const bool found_in_read_blocks = block_it != pi_blocks->cend();
    if (found_in_read_blocks || (entry.get_read_pi_blocks_count() == entry.get_total_pi_blocks_count())) {
        // The block to read from is the one before the first block starting after pos.
        if (block_it != pi_blocks->begin()) {
            --block_it;
        }
        get_info_from_promoted_block(block_it, *pi_blocks);
        sstlog.trace("index {}: lower bound skipped to cell, _current_pi_idx={}, _data_file_position={}",
            this, _lower_bound.current_pi_idx, _lower_bound.data_file_position);
        return make_ready_future<>();
    }

    // pos is beyond every block read so far and more blocks exist: read on.
    return entry.get_pi_blocks_until(pos).then([this, pi_blocks] (size_t current_pi_idx) {
        _lower_bound.current_pi_idx = current_pi_idx;
        auto block_it = std::cbegin(*pi_blocks);
        if (_lower_bound.current_pi_idx > 0) {
            std::advance(block_it, _lower_bound.current_pi_idx - 1);
        }
        get_info_from_promoted_block(block_it, *pi_blocks);
        sstlog.trace("index {}: skipped to cell, _current_pi_idx={}, _data_file_position={}",
            this, _lower_bound.current_pi_idx, _lower_bound.data_file_position);
    });
}
// Like advance_to(dht::ring_position_view), but returns information whether the key was found.
// If pos is provided and the key was found, the upper bound is additionally
// advanced past pos within the partition.
future<bool> advance_lower_and_check_if_present(
        dht::ring_position_view key, std::optional<position_in_partition_view> pos = {}) {
    return advance_to(_lower_bound, key).then([this, key, pos] {
        if (eof()) {
            return make_ready_future<bool>(false);
        }
        return read_partition_data().then([this, key, pos] {
            // The key counts as present when it does not order before the
            // entry the lower bound stopped at.
            index_comparator key_less(*_sstable->_schema);
            const bool found = !key_less(key, current_partition_entry(_lower_bound));
            if (found && pos) {
                return advance_upper_past(*pos).then([] {
                    return make_ready_future<bool>(true);
                });
            }
            return make_ready_future<bool>(found);
        });
    });
}
// Moves the cursor (the lower bound) to the beginning of next partition.
// Can be called only when !eof().
future<> advance_to_next_partition() {
    return advance_to_next_partition(_lower_bound);
}
// Positions the cursor (the lower bound) on the first partition which is not smaller than pos (like std::lower_bound).
// Must be called for non-decreasing positions.
future<> advance_to(dht::ring_position_view pos) {
    return advance_to(_lower_bound, pos);
}
// Pair of data file positions reported by data_file_positions().
struct data_file_positions_range {
    // Data file position of the lower bound.
    uint64_t start;
    // Data file position of the upper bound; disengaged when no upper bound is set.
    std::optional<uint64_t> end;
};
// Returns positions in the data file of the cursor.
// End position may be unset
data_file_positions_range data_file_positions() const {
    std::optional<uint64_t> upper_position;
    if (_upper_bound) {
        upper_position = _upper_bound->data_file_position;
    }
    return data_file_positions_range{_lower_bound.data_file_position, upper_position};
}
// Returns the kind of sstable element the cursor (the lower bound) is pointing at.
indexable_element element_kind() const {
    return _lower_bound.element;
}
// Returns a copy of the open range tombstone marker recorded for the lower
// bound, or a disengaged optional if there is none.
std::optional<open_rt_marker> end_open_marker() const {
    return _lower_bound.end_open_marker;
}
// True when the lower bound is positioned past all partitions, i.e. at the
// end of the data file.
bool eof() const {
    const auto end_of_data = data_file_end();
    return _lower_bound.data_file_position == end_of_data;
}
// Closes the index pages held by both bounds.
future<> close() {
    // The two lists are closed one after the other, never in parallel.
    return close_index_list(_lower_bound.current_list).then([this] {
        if (!_upper_bound) {
            return make_ready_future<>();
        }
        return close_index_list(_upper_bound->current_list);
    });
}
};
}