mirror of
https://github.com/scylladb/scylladb.git
synced 2026-04-27 11:55:15 +00:00
195 lines
9.2 KiB
C++
195 lines
9.2 KiB
C++
/*
|
|
* Copyright (C) 2025-present ScyllaDB
|
|
*/
|
|
|
|
/*
|
|
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
|
|
*/
|
|
|
|
#pragma once
|
|
|
|
#include "mutation/tombstone.hh"
|
|
#include "mutation/position_in_partition.hh"
|
|
#include "sstables/types.hh"
|
|
|
|
// Forward declaration only: this header refers to utils::hashed_key
// exclusively by const reference, so the full definition is not needed here.
namespace utils {
struct hashed_key;
}
|
|
|
|
namespace sstables {
|
|
|
|
// Data file positions corresponding to the bounds of an index reader
// (this is the type returned by abstract_index_reader::data_file_positions()).
struct data_file_positions_range {
    // Data file position of the start of the range.
    // Zero-initialized so that a default-constructed range never carries
    // an indeterminate value.
    uint64_t start = 0;
    // Data file position of the end of the range.
    // Disengaged when the end position is unset.
    std::optional<uint64_t> end;
};
|
|
|
|
// Stores information about open end RT marker
|
|
// of the lower index bound
|
|
struct open_rt_marker {
|
|
position_in_partition pos;
|
|
tombstone tomb;
|
|
};
|
|
|
|
// An abstract interface for a reader of sstable indexes,
// which can be used by queries to locate the needed file offsets in the Data file.
//
// Conceptually, an index reader is a pair of index cursors -- "lower bound" and "upper bound" --
// which can be set to "point" before or after chosen positions in the dataset,
// and can be queried for Data file offsets which match the pointed-to positions.
//
// As of this writing, there is only one index format used in Scylla
// (the one used by SSTables in the "BIG" format, which is the only format
// supported by Scylla). And so there is only one implementation of this interface.
// But we want to add another implementation soon (BTI-format indexes),
// and this interface has been extracted in preparation for that.
//
// Note: in the comments below, "PK" means a position that corresponds either
// to a start of some partition or to EOF,
// while "position" means a position that corresponds either to the start of some partition,
// to the start of some clustering entry, or to EOF.
//
// Note: even though some methods of the index are inexact (i.e. they advance the index to *some*
// Data position close to the queried ring position), they are monotonic.
// I.e. if B >= A, then advance(B) >= advance(A).
|
|
class abstract_index_reader {
|
|
public:
|
|
virtual ~abstract_index_reader() = default;
|
|
// Must be called before the reader is destroyed.
|
|
virtual future<> close() noexcept = 0;
|
|
// True iff lower bound is at EOF.
|
|
virtual bool eof() const = 0;
|
|
|
|
// If `key` is a partition key present in the sstable, advances lower bound to `key`.
|
|
// Otherwise advances lower bound to the some PK no greater than `key`.
|
|
//
|
|
// Returns `true` iff it's possible that `key` is a partition key present in the sstable.
|
|
// (In other words, if it returns `false`, then the key is definitely not present.
|
|
// Otherwise it's unknown if it's present).
|
|
//
|
|
// If `hash` is provided, it must be the murmur hash of the partition key in `key`.
|
|
// (Some index use the hash to filter out false positives. They could compute the hash
|
|
// themselves, but since the caller often has to compute the hash anyway,
|
|
// it can be passed down to avoid recomputation).
|
|
//
|
|
// If the return value is `false`, the reader becomes broken and cannot be used again.
|
|
// (This method is only used for single-partition reads, so no reason to keep the reader
|
|
// usable after we know that the entire sstable read is already doomed).
|
|
//
|
|
// Precondition: pos >= lower bound
|
|
//
|
|
// Note: this is the most important and performance-sensitive method of the reader.
|
|
// This is what's used by sstable readers to find positions for single-partition reads.
|
|
virtual future<bool> advance_lower_and_check_if_present(dht::ring_position_view key) = 0;
|
|
virtual future<bool> advance_lower_and_check_if_present(dht::ring_position_view key, const utils::hashed_key& hash) = 0;
|
|
// Advances lower bound to the first PK greater than dk.
|
|
//
|
|
// Preconditions: dk >= lower bound, dk is present in the sstable
|
|
virtual future<> advance_past_definitely_present_partition(const dht::decorated_key& dk) = 0;
|
|
// Advances lower bound to dk.
|
|
//
|
|
// Preconditions: dk >= lower bound, dk is present in the sstable
|
|
virtual future<> advance_to_definitely_present_partition(const dht::decorated_key& dk) = 0;
|
|
// Advances lower bound to the first PK which lies inside or after the range,
|
|
// or to some close predecessor of that optimal PK.
|
|
// Advances upper bound to the first PK which lies after the range.
|
|
// or to some close successor of that optimal PK.
|
|
// Preconditions:
|
|
// 1. next lower bound >= lower bound
|
|
// 2. next upper bound >= upper bound
|
|
virtual future<> advance_to(const dht::partition_range& range) = 0;
|
|
// Advances lower bound to the first PK greater than lower bound.
|
|
// Precondition: !eof()
|
|
virtual future<> advance_to_next_partition() = 0;
|
|
// Advances upper bound to the first PK greater than lower bound.
|
|
// (Or to EOF if lower bound is EOF).
|
|
virtual future<> advance_reverse_to_next_partition() = 0;
|
|
|
|
// Partially advances some internals in order to warm up some caches.
|
|
//
|
|
// Does not move the bounds, but does "advance" the lower bound to `pos`
|
|
// for the purposes of "pos >= lower bound" preconditions.
|
|
//
|
|
// Preconditions:
|
|
// 1. Lower bound has been advanced.
|
|
// 2. !eof().
|
|
// 3. Must be called for non-decreasing positions.
|
|
virtual future<> prefetch_lower_bound(position_in_partition_view pos) = 0;
|
|
// Advances lower bound to some position (in the current partition) no greater than pos.
|
|
// Preconditions:
|
|
// 1. !eof()
|
|
// 2. `pos` >= lower bound
|
|
virtual future<> advance_to(position_in_partition_view pos) = 0;
|
|
// Advances upper bound to some position strictly after `pos`.
|
|
// Preconditions:
|
|
// 1. !eof()
|
|
// 2. upper bound is unset
|
|
virtual future<> advance_upper_past(position_in_partition_view pos) = 0;
|
|
// Advances the upper bound to the start of the first promoted index block after `pos`,
|
|
// or to the next PK if there are no blocks after `pos`.
|
|
//
|
|
// Supports advancing backwards (i.e. `pos` can be smaller than the previous upper bound position).
|
|
virtual future<> advance_reverse(position_in_partition_view pos) = 0;
|
|
|
|
// Tells whether details about current partition can be accessed.
|
|
// If this returns false, you have to call read_partition_data(),
|
|
// before calling the relevant accessors below.
|
|
//
|
|
// Calling read_partition_data() may involve doing I/O. The reason
|
|
// why control over this is exposed and not done under the hood is that
|
|
// in some cases it only makes sense to access partition details from index
|
|
// if it is readily available, and if it is not, we're better off obtaining
|
|
// them by continuing reading from sstable.
|
|
virtual bool partition_data_ready() const = 0;
|
|
// Ensures that partition_data_ready() returns true.
|
|
// Precondition: !eof()
|
|
virtual future<> read_partition_data() = 0;
|
|
// Returns tombstone for the current partition,
|
|
// if such information is available in the index.
|
|
//
|
|
// Note: it's an arbitrary decision of the writer of the index whether
|
|
// the the partition tombstone has been attached to a given index entry,
|
|
// and the user of the index reader should not assume that it has.
|
|
//
|
|
// The main use case for this information is reads which start in the
|
|
// middle of a large partition. The Data reader needs to know the partition
|
|
// header (full partition key and partition tombstone) to emit a partition,
|
|
// but the header is at the beginning of the partition, potentially far
|
|
// from the queried rows.
|
|
// Embedding the partition header in the index lets the Data reader skip
|
|
// avoid doing a separate disk read to get the header from the Data file.
|
|
//
|
|
// Thus, `partition_tombstone()` and `get_partition_key()` usually
|
|
// return an engaged optional at least for those partitions which have an
|
|
// intra-partition index (because that's when they can be used to skip
|
|
// a disk seek) but the reader shouldn't assume that. It should check
|
|
// if they are available, and if not, it should fall back to reading
|
|
// the partition header from the Data file.
|
|
//
|
|
// Precondition: partition_data_ready()
|
|
virtual std::optional<sstables::deletion_time> partition_tombstone() = 0;
|
|
// Returns the key for current partition of the lower bound,
|
|
// if available (se the comment for partition_tombstone) in the index.
|
|
//
|
|
// Precondition: partition_data_ready()
|
|
virtual std::optional<partition_key> get_partition_key() = 0;
|
|
// Returns data file positions corresponding to the bounds.
|
|
// End position may be unset
|
|
virtual data_file_positions_range data_file_positions() const = 0;
|
|
// Returns the offset (from partition start) of the first row in the last promoted index block
|
|
// in the current partition or nullopt if there are no blocks in the current partition.
|
|
//
|
|
// Preconditions: partition_data_ready()
|
|
virtual future<std::optional<uint64_t>> last_block_offset() = 0;
|
|
// Returns the kind of sstable element the cursor is pointing at.
|
|
// No preconditions.
|
|
virtual indexable_element element_kind() const = 0;
|
|
// Returns info about the range tombstone (if any) which covers lower bound.
|
|
// Precondition: !eof()
|
|
virtual std::optional<open_rt_marker> end_open_marker() const = 0;
|
|
// Returns info about the range tombstone (if any) which covers upper bound.
|
|
// Precondition: !eof()
|
|
virtual std::optional<open_rt_marker> reverse_end_open_marker() const = 0;
|
|
};
|
|
|
|
} // namespace sstables
|