Files
scylladb/sstables/trie/node_reader.hh
Michał Chojnowski 1f85069389 sstables/trie: support reader_permit and trace_state properly
Before this patch, `reader_permit` taken by `bti_index_reader`.
wasn't actually being passed down to disk reads. In this patch,
we fix this FIXME by propagating the permit down to the I/O
operations on the `cached_file`.

Also, it didn't take `trace_state_ptr` at all.
In this patch, we add a `trace_state_ptr` argument and propagate
it down to disk reads.

(We combine the two changes because the permit and the trace state
are passed together everywhere anyway).
2025-09-17 12:22:40 +02:00

174 lines
6.6 KiB
C++

/*
* Copyright (C) 2024-present ScyllaDB
*/
/*
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
*/
#pragma once
#include <seastar/core/future.hh>
// This file defines an interface between the format-agnostic part of a trie
// reader (the cursor which can be set to a specific key and then
// stepped forwards and backwards) and the format-specific part
// (which actually understand the bytes and bits of the nodes on disk).
class reader_permit;
namespace tracing {
class trace_state_ptr;
}
namespace sstables::trie {
using const_bytes = std::span<const std::byte>;
// Contains information about the result of the (partial) walk down.
//
// For example, if the trie contains strings ("123", "124", "23"),
// then the trie looks like this (where * is the root node):
//
// *
// 1-----2
// 2 3
// 3-4
//
// and if node "124" is on a different page than node "12", then
// `walk_down_along_key(pos_of_root, "123")` might
// return node "12" as the result, with
//
// payload_bits = 0
// n_children = 2
// found_idx = idx_of_child_4_in_node_12
// found_byte = '4'
// traversed_key_bytes = 2
// body_pos = pos_of_node_12
// child_offset = pos_of_node_12 - pos_of_node_123
struct node_traverse_result {
// Payload bits for this node.
uint8_t payload_bits;
// Number of child indexes (slots) for this node.
// ("Slots" because not all indexes have to be occupied).
int n_children;
// Index of first child whose transition is greater or equal to the next key byte.
// (Or, if such children don't exist, this is equal to n_children).
int found_idx;
// The transition byte of the child at found_idx.
// (Or, if there is no such child, this is equal to -1).
int found_byte;
// The number of edges traversed while walking down.
// I.e. the number of edges between this node and the ancestor
// at which the walk started.
//
// E.g. if we started at node X, and the
// at which the walk started.
int traversed_key_bytes;
// File position of this node.
int64_t body_pos;
// Position of this node minus position of the child at found_idx.
// Only valid if found_idx is valid in range [0, n_children).
int64_t child_offset;
};
// Fields have same meaning as in node_traverse_result.
struct node_traverse_sidemost_result {
uint8_t payload_bits;
int n_children;
int64_t body_pos;
int64_t child_offset;
};
// Contains information about the result of the get_child call.
struct get_child_result {
// The index of the child selected by the call.
int idx;
// The offset of the selected child from the parent.
// (A children always has position smaller than its parent).
uint64_t offset;
};
// Fields have same meaning as in node_traverse_result.
struct load_final_node_result {
uint16_t n_children;
uint8_t payload_bits;
};
// This concept contains all operations which abstract trie nodes
// needs to provide in order so that a trie cursor can be implemented
// over them.
template <typename T>
concept node_reader = requires(T& o, int64_t pos, const_bytes key, int child_idx, bool forwards, const reader_permit& permit, const tracing::trace_state_ptr& trace_ptr) {
// Checks if the page containing the given file position has been loaded.
//
// Note: forcing the caller to explicitly call `load` before use
// is error-prone, but it lets us keep the performance-sensitive methods
// synchronous.
{ o.cached(pos) } -> std::same_as<bool>;
// Loads the page containing the given file position, and potentially
// unloads previously loaded pages.
//
// Ensures `cached(pos)` until the next `load(pos)` call.
//
// Precondition: pos lies within the file.
{ o.load(pos, permit, trace_ptr) } -> std::same_as<seastar::future<>>;
// Reads some basic information (payload and number of children)
// about the node.
//
// Precondition: cached(pos)
{ o.read_node(pos) } -> std::same_as<load_final_node_result>;
// Walks some distance down the trie
// from node at `pos` along `key`.
// Might walk over any number of *unimportant* nodes,
// but does not walk past the first *important* node.
//
// An *important* node is one which fulfills at least one of the following conditions:
// 1. Has a payload.
// 2. Has more than one child.
//
// (A trie iterator has to remember the important nodes on the path from the root
// to the current position, because stepping forwards or backwards might involve
// going up to the closest important node. Unimportant nodes don't have to be revisited).
//
// Note: we allow this method to traverse many *unimportant* nodes
// at once for optimization reasons. As of this writing, the method always
// walks over one node, but we might want to add an optimization which
// walks over long chains of SINGLE_NOPAYLOAD_4 nodes in batches.
//
// Precondition: cached(pos), key.size() > 0
{ o.walk_down_along_key(pos, key) } -> std::same_as<node_traverse_result>;
// Like walk_down_along_key, but always chooses the leftmost child instead
// of following a key.
//
// Precondition: cached(pos)
{ o.walk_down_leftmost_path(pos) } -> std::same_as<node_traverse_sidemost_result>;
// Like walk_down_along_key, but always chooses the rightmost child instead
// of following a key.
//
// Precondition: cached(pos)
{ o.walk_down_rightmost_path(pos) } -> std::same_as<node_traverse_sidemost_result>;
// Returns some information about a child of node at `pos`.
//
// The child is specified by an index, but not all child indexes must be occupied by a child.
// If an unoccupied index is passed, the method will return information for the nearest
// occupied child index which is greater (if `forwards`) or smaller (if `!forwards`)
//
// (This is used for stepping forwards/backwards over the trie).
//
// Precondition: cached(pos), child_idx lies between the smallest and the largest
// occupied child indexes (inclusive) for this node.
{ o.get_child(pos, child_idx, forwards) } -> std::same_as<get_child_result>;
// Returns the payload for the payload-carrying node at `pos`.
//
// The child is specified by an index, but not all child indexes must be occupied by a child.
// If an unoccupied index is passed, the method will return information for the nearest
// occupied child index which is greater (if `forwards`) or smaller (if `!forwards`)
//
// (This is used for stepping forwards/backwards over the trie).
//
// Precondition: cached(pos), the node carries a payload
{ o.get_payload(pos) } -> std::same_as<const_bytes>;
};
} // namespace sstables::trie