Files
scylladb/sstables/trie/bti_node_reader.cc
Michał Chojnowski 1f85069389 sstables/trie: support reader_permit and trace_state properly
Before this patch, `reader_permit` taken by `bti_index_reader`.
wasn't actually being passed down to disk reads. In this patch,
we fix this FIXME by propagating the permit down to the I/O
operations on the `cached_file`.

Also, it didn't take `trace_state_ptr` at all.
In this patch, we add a `trace_state_ptr` argument and propagate
it down to disk reads.

(We combine the two changes because the permit and the trace state
are passed together everywhere anyway).
2025-09-17 12:22:40 +02:00

487 lines
15 KiB
C++

/*
* Copyright (C) 2024-present ScyllaDB
*/
/*
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
*/
#include "bti_node_reader.hh"
#include "bti_node_type.hh"
namespace sstables::trie {
get_child_result bti_get_child(uint64_t pos, const_bytes sp, int child_idx, bool forward) {
auto type = uint8_t(sp[0]) >> 4;
trie::get_child_result result;
auto single = [&](uint64_t offset) {
result.offset = offset;
result.idx = 0;
return result;
};
auto sparse = [&] [[gnu::always_inline]] (int type) {
auto bpp = bits_per_pointer_arr[type];
result.idx = child_idx;
result.offset = read_offset(sp.subspan(2 + int(sp[1])), child_idx, bpp);
return result;
};
auto dense = [&] [[gnu::always_inline]] (int type) {
auto bpp = bits_per_pointer_arr[type];
auto dense_span = uint64_t(sp[2]) + 1;
int idx = child_idx;
int increment = forward ? 1 : -1;
while (idx < int(dense_span) && idx >= 0) {
if (auto off = read_offset(sp.subspan(3), idx, bpp)) {
result.idx = idx;
result.offset = off;
return result;
} else {
idx += increment;
}
}
[[unlikely]] sstables::on_bti_parse_error(pos);
};
switch (type) {
case PAYLOAD_ONLY:
[[unlikely]] sstables::on_bti_parse_error(pos);
case SINGLE_NOPAYLOAD_4:
return single(uint64_t(sp[0]) & 0xf);
case SINGLE_NOPAYLOAD_12:
return single((uint64_t(sp[0]) & 0xf) << 8 | uint64_t(sp[1]));
case SINGLE_8:
return single(uint64_t(sp[2]));
case SINGLE_16:
return single(uint64_t(sp[2]) << 8 | uint64_t(sp[3]));
case SPARSE_8:
return sparse(type);
case SPARSE_12:
return sparse(type);
case SPARSE_16:
return sparse(type);
case SPARSE_24:
return sparse(type);
case SPARSE_40:
return sparse(type);
case DENSE_12:
return dense(type);
case DENSE_16:
return dense(type);
case DENSE_24:
return dense(type);
case DENSE_32:
return dense(type);
case DENSE_40:
return dense(type);
case LONG_DENSE:
return dense(type);
}
[[unlikely]] sstables::on_bti_parse_error(pos);
}
std::byte bti_get_child_transition(uint64_t pos, const_bytes raw, int idx) {
auto type = uint8_t(raw[0]) >> 4;
switch (type) {
case PAYLOAD_ONLY:
abort();
case SINGLE_NOPAYLOAD_4:
return raw[1];
case SINGLE_8:
return raw[1];
case SINGLE_NOPAYLOAD_12:
return raw[2];
case SINGLE_16:
return raw[1];
case SPARSE_8:
case SPARSE_12:
case SPARSE_16:
case SPARSE_24:
case SPARSE_40:
return raw[2 + idx];
case DENSE_12:
case DENSE_16:
case DENSE_24:
case DENSE_32:
case DENSE_40:
case LONG_DENSE:
return std::byte(uint8_t(raw[1]) + idx);
}
[[unlikely]] sstables::on_bti_parse_error(pos);
}
load_final_node_result bti_read_node(int64_t pos, const_bytes sp) {
load_final_node_result result;
auto type = uint8_t(sp[0]) >> 4;
auto single = [&](uint8_t payload_bits) {
result.n_children = 1;
result.payload_bits = payload_bits;
return result;
};
auto sparse = [&] [[gnu::always_inline]] (int type) {
int n_children = int(sp[1]);
result.n_children = n_children;
result.payload_bits = uint8_t(sp[0]) & 0xf;
return result;
};
auto dense = [&] [[gnu::always_inline]] (int type) {
auto dense_span = uint64_t(sp[2]) + 1;
result.n_children = dense_span;
result.payload_bits = uint8_t(sp[0]) & 0xf;
return result;
};
switch (type) {
case PAYLOAD_ONLY:
result.payload_bits = uint8_t(sp[0]) & 0xf;
result.n_children = 0;
return result;
case SINGLE_NOPAYLOAD_4:
return single(0);
case SINGLE_NOPAYLOAD_12:
return single(0);
case SINGLE_8:
return single(uint8_t(sp[0]) & 0xf);
case SINGLE_16:
return single(uint8_t(sp[0]) & 0xf);
case SPARSE_8:
return sparse(type);
case SPARSE_12:
return sparse(type);
case SPARSE_16:
return sparse(type);
case SPARSE_24:
return sparse(type);
case SPARSE_40:
return sparse(type);
case DENSE_12:
return dense(type);
case DENSE_16:
return dense(type);
case DENSE_24:
return dense(type);
case DENSE_32:
return dense(type);
case DENSE_40:
return dense(type);
case LONG_DENSE:
return dense(type);
}
[[unlikely]] sstables::on_bti_parse_error(pos);
}
const_bytes bti_get_payload(int64_t pos, const_bytes sp) {
auto type = uint8_t(sp[0]) >> 4;
switch (type) {
case PAYLOAD_ONLY:
return sp.subspan(1);
case SINGLE_NOPAYLOAD_4:
case SINGLE_NOPAYLOAD_12:
return sp.subspan(1 + div_ceil(bits_per_pointer_arr[type], 8));
case SINGLE_8:
case SINGLE_16:
return sp.subspan(2 + div_ceil(bits_per_pointer_arr[type], 8));
case SPARSE_8:
case SPARSE_12:
case SPARSE_16:
case SPARSE_24:
case SPARSE_40: {
auto n_children = uint8_t(sp[1]);
return sp.subspan(2 + div_ceil(n_children * (8 + bits_per_pointer_arr[type]), 8));
}
case DENSE_12:
case DENSE_16:
case DENSE_24:
case DENSE_32:
case DENSE_40:
case LONG_DENSE: {
auto dense_span = uint8_t(sp[2]) + 1;
return sp.subspan(3 + div_ceil(dense_span * bits_per_pointer_arr[type], 8));
}
}
[[unlikely]] sstables::on_bti_parse_error(pos);
}
node_traverse_result bti_walk_down_along_key(int64_t pos, const_bytes sp, const_bytes key) {
auto type = uint8_t(sp[0]) >> 4;
trie::node_traverse_result result;
result.body_pos = pos;
result.traversed_key_bytes = 0;
auto single = [&](std::byte edge, uint64_t offset, uint8_t payload_bits) {
result.n_children = 1;
result.payload_bits = payload_bits;
if (key[0] <= edge) {
result.found_idx = 0;
result.found_byte = int(edge);
result.child_offset = offset;
} else {
result.found_idx = 1;
result.found_byte = -1;
result.child_offset = -1;
}
return result;
};
auto sparse = [&] [[gnu::always_inline]] (int type) {
int n_children = int(sp[1]);
auto idx = std::lower_bound(&sp[2], &sp[2 + n_children], key[0]) - &sp[2];
result.n_children = n_children;
result.payload_bits = uint8_t(sp[0]) & 0xf;
result.found_idx = idx;
if (idx < n_children) {
auto bpp = bits_per_pointer_arr[type];
result.child_offset = read_offset(sp.subspan(2 + n_children), idx, bpp);
result.found_byte = int(sp[2 + idx]);
} else {
result.child_offset = -1;
result.found_byte = -1;
}
return result;
};
auto dense = [&] [[gnu::always_inline]] (int type) {
auto start = int(sp[1]);
auto idx = std::max<int>(0, int(key[0]) - start);
auto dense_span = uint64_t(sp[2]) + 1;
auto bpp = bits_per_pointer_arr[type];
result.n_children = dense_span;
result.payload_bits = uint8_t(sp[0]) & 0xf;
while (idx < int(dense_span)) {
if (auto off = read_offset(sp.subspan(3), idx, bpp)) {
result.child_offset = off;
result.found_idx = idx;
result.found_byte = start + idx;
return result;
} else {
++idx;
}
}
result.found_idx = dense_span;
result.child_offset = -1;
result.found_byte = -1;
return result;
};
switch (type) {
case PAYLOAD_ONLY:
result.payload_bits = uint8_t(sp[0]) & 0xf;
result.n_children = 0;
result.found_idx = 0;
result.found_byte = -1;
result.child_offset = -1;
return result;
case SINGLE_NOPAYLOAD_4:
return single(sp[1], uint64_t(sp[0]) & 0xf, 0);
case SINGLE_NOPAYLOAD_12:
return single(sp[2], (uint64_t(sp[0]) & 0xf) << 8 | uint64_t(sp[1]), 0);
case SINGLE_8:
return single(sp[1], uint64_t(sp[2]), uint8_t(sp[0]) & 0xf);
case SINGLE_16:
return single(sp[1], uint64_t(sp[2]) << 8 | uint64_t(sp[3]), uint8_t(sp[0]) & 0xf);
case SPARSE_8:
return sparse(type);
case SPARSE_12:
return sparse(type);
case SPARSE_16:
return sparse(type);
case SPARSE_24:
return sparse(type);
case SPARSE_40:
return sparse(type);
case DENSE_12:
return dense(type);
case DENSE_16:
return dense(type);
case DENSE_24:
return dense(type);
case DENSE_32:
return dense(type);
case DENSE_40:
return dense(type);
case LONG_DENSE:
return dense(type);
}
[[unlikely]] sstables::on_bti_parse_error(pos);
}
node_traverse_sidemost_result bti_walk_down_leftmost_path(int64_t pos, const_bytes sp) {
auto type = uint8_t(sp[0]) >> 4;
trie::node_traverse_sidemost_result result;
result.body_pos = pos;
auto single = [&](uint64_t offset, uint8_t payload_bits) {
result.n_children = 1;
result.payload_bits = payload_bits;
result.child_offset = offset;
return result;
};
auto sparse = [&] [[gnu::always_inline]] (int type) {
int n_children = int(sp[1]);
auto bpp = bits_per_pointer_arr[type];
result.n_children = n_children;
result.payload_bits = uint8_t(sp[0]) & 0xf;
result.child_offset = read_offset(sp.subspan(2 + n_children), 0, bpp);
return result;
};
auto dense = [&] [[gnu::always_inline]] (int type) {
auto dense_span = uint64_t(sp[2]) + 1;
auto bpp = bits_per_pointer_arr[type];
result.n_children = dense_span;
result.payload_bits = uint8_t(sp[0]) & 0xf;
result.child_offset = read_offset(sp.subspan(3), 0, bpp);
return result;
};
switch (type) {
case PAYLOAD_ONLY:
result.payload_bits = uint8_t(sp[0]) & 0xf;
result.n_children = 0;
result.child_offset = -1;
return result;
case SINGLE_NOPAYLOAD_4:
return single(uint64_t(sp[0]) & 0xf, 0);
case SINGLE_NOPAYLOAD_12:
return single((uint64_t(sp[0]) & 0xf) << 8 | uint64_t(sp[1]), 0);
case SINGLE_8:
return single(uint64_t(sp[2]), uint8_t(sp[0]) & 0xf);
case SINGLE_16:
return single(uint64_t(sp[2]) << 8 | uint64_t(sp[3]), uint8_t(sp[0]) & 0xf);
case SPARSE_8:
return sparse(type);
case SPARSE_12:
return sparse(type);
case SPARSE_16:
return sparse(type);
case SPARSE_24:
return sparse(type);
case SPARSE_40:
return sparse(type);
case DENSE_12:
return dense(type);
case DENSE_16:
return dense(type);
case DENSE_24:
return dense(type);
case DENSE_32:
return dense(type);
case DENSE_40:
return dense(type);
case LONG_DENSE:
return dense(type);
}
[[unlikely]] sstables::on_bti_parse_error(pos);
}
node_traverse_sidemost_result bti_walk_down_rightmost_path(int64_t pos, const_bytes sp) {
auto type = uint8_t(sp[0]) >> 4;
trie::node_traverse_sidemost_result result;
result.body_pos = pos;
auto single = [&](uint64_t offset, uint8_t payload_bits) {
result.n_children = 1;
result.payload_bits = payload_bits;
result.child_offset = offset;
return result;
};
auto sparse = [&] [[gnu::always_inline]] (int type) {
int n_children = int(sp[1]);
auto bpp = bits_per_pointer_arr[type];
result.n_children = n_children;
result.payload_bits = uint8_t(sp[0]) & 0xf;
result.child_offset = read_offset(sp.subspan(2 + n_children), n_children - 1, bpp);
return result;
};
auto dense = [&] [[gnu::always_inline]] (int type) {
auto dense_span = uint64_t(sp[2]) + 1;
auto bpp = bits_per_pointer_arr[type];
result.n_children = dense_span;
result.payload_bits = uint8_t(sp[0]) & 0xf;
result.child_offset = read_offset(sp.subspan(3), dense_span - 1, bpp);
return result;
};
switch (type) {
case PAYLOAD_ONLY:
result.payload_bits = uint8_t(sp[0]) & 0xf;
result.n_children = 0;
result.child_offset = -1;
return result;
case SINGLE_NOPAYLOAD_4:
return single(uint64_t(sp[0]) & 0xf, 0);
case SINGLE_NOPAYLOAD_12:
return single((uint64_t(sp[0]) & 0xf) << 8 | uint64_t(sp[1]), 0);
case SINGLE_8:
return single(uint64_t(sp[2]), uint8_t(sp[0]) & 0xf);
case SINGLE_16:
return single(uint64_t(sp[2]) << 8 | uint64_t(sp[3]), uint8_t(sp[0]) & 0xf);
case SPARSE_8:
return sparse(type);
case SPARSE_12:
return sparse(type);
case SPARSE_16:
return sparse(type);
case SPARSE_24:
return sparse(type);
case SPARSE_40:
return sparse(type);
case DENSE_12:
return dense(type);
case DENSE_16:
return dense(type);
case DENSE_24:
return dense(type);
case DENSE_32:
return dense(type);
case DENSE_40:
return dense(type);
case LONG_DENSE:
return dense(type);
}
[[unlikely]] sstables::on_bti_parse_error(pos);
}
bti_node_reader::bti_node_reader(cached_file& f)
: _file(f) {
}
bool bti_node_reader::cached(int64_t pos) const {
return _cached_page && _cached_page->pos() / cached_file::page_size == pos / cached_file::page_size;
}
seastar::future<> bti_node_reader::load(int64_t pos, const reader_permit& permit, const tracing::trace_state_ptr& trace_ptr) {
if (cached(pos)) {
return make_ready_future<>();
}
return _file.get().get_shared_page(pos, permit, trace_ptr).then([this](cached_file::page_read_result page) {
_cached_page = std::move(page.ptr);
});
}
trie::load_final_node_result bti_node_reader::read_node(int64_t pos) {
SCYLLA_ASSERT(cached(pos));
auto sp = _cached_page->get_view().subspan(pos % cached_file::page_size);
return bti_read_node(pos, sp);
}
trie::node_traverse_result bti_node_reader::walk_down_along_key(int64_t pos, const_bytes key) {
SCYLLA_ASSERT(cached(pos));
auto sp = _cached_page->get_view().subspan(pos % cached_file::page_size);
return bti_walk_down_along_key(pos, sp, key);
}
trie::node_traverse_sidemost_result bti_node_reader::walk_down_leftmost_path(int64_t pos) {
SCYLLA_ASSERT(cached(pos));
auto sp = _cached_page->get_view().subspan(pos % cached_file::page_size);
return bti_walk_down_leftmost_path(pos, sp);
}
trie::node_traverse_sidemost_result bti_node_reader::walk_down_rightmost_path(int64_t pos) {
SCYLLA_ASSERT(cached(pos));
auto sp = _cached_page->get_view().subspan(pos % cached_file::page_size);
return bti_walk_down_rightmost_path(pos, sp);
}
trie::get_child_result bti_node_reader::get_child(int64_t pos, int child_idx, bool forward) const {
SCYLLA_ASSERT(cached(pos));
auto sp = _cached_page->get_view().subspan(pos % cached_file::page_size);
return bti_get_child(pos, sp, child_idx, forward);
}
const_bytes bti_node_reader::get_payload(int64_t pos) const {
SCYLLA_ASSERT(cached(pos));
auto sp = _cached_page->get_view().subspan(pos % cached_file::page_size);
return bti_get_payload(pos, sp);
}
} // namespace sstables::trie