Before this patch, `reader_permit` taken by `bti_index_reader`. wasn't actually being passed down to disk reads. In this patch, we fix this FIXME by propagating the permit down to the I/O operations on the `cached_file`. Also, it didn't take `trace_state_ptr` at all. In this patch, we add a `trace_state_ptr` argument and propagate it down to disk reads. (We combine the two changes because the permit and the trace state are passed together everywhere anyway).
487 lines
15 KiB
C++
487 lines
15 KiB
C++
/*
|
|
* Copyright (C) 2024-present ScyllaDB
|
|
*/
|
|
|
|
/*
|
|
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
|
*/
|
|
|
|
#include "bti_node_reader.hh"
|
|
#include "bti_node_type.hh"
|
|
|
|
namespace sstables::trie {
|
|
|
|
get_child_result bti_get_child(uint64_t pos, const_bytes sp, int child_idx, bool forward) {
|
|
auto type = uint8_t(sp[0]) >> 4;
|
|
trie::get_child_result result;
|
|
auto single = [&](uint64_t offset) {
|
|
result.offset = offset;
|
|
result.idx = 0;
|
|
return result;
|
|
};
|
|
auto sparse = [&] [[gnu::always_inline]] (int type) {
|
|
auto bpp = bits_per_pointer_arr[type];
|
|
result.idx = child_idx;
|
|
result.offset = read_offset(sp.subspan(2 + int(sp[1])), child_idx, bpp);
|
|
return result;
|
|
};
|
|
auto dense = [&] [[gnu::always_inline]] (int type) {
|
|
auto bpp = bits_per_pointer_arr[type];
|
|
auto dense_span = uint64_t(sp[2]) + 1;
|
|
int idx = child_idx;
|
|
int increment = forward ? 1 : -1;
|
|
while (idx < int(dense_span) && idx >= 0) {
|
|
if (auto off = read_offset(sp.subspan(3), idx, bpp)) {
|
|
result.idx = idx;
|
|
result.offset = off;
|
|
return result;
|
|
} else {
|
|
idx += increment;
|
|
}
|
|
}
|
|
[[unlikely]] sstables::on_bti_parse_error(pos);
|
|
};
|
|
switch (type) {
|
|
case PAYLOAD_ONLY:
|
|
[[unlikely]] sstables::on_bti_parse_error(pos);
|
|
case SINGLE_NOPAYLOAD_4:
|
|
return single(uint64_t(sp[0]) & 0xf);
|
|
case SINGLE_NOPAYLOAD_12:
|
|
return single((uint64_t(sp[0]) & 0xf) << 8 | uint64_t(sp[1]));
|
|
case SINGLE_8:
|
|
return single(uint64_t(sp[2]));
|
|
case SINGLE_16:
|
|
return single(uint64_t(sp[2]) << 8 | uint64_t(sp[3]));
|
|
case SPARSE_8:
|
|
return sparse(type);
|
|
case SPARSE_12:
|
|
return sparse(type);
|
|
case SPARSE_16:
|
|
return sparse(type);
|
|
case SPARSE_24:
|
|
return sparse(type);
|
|
case SPARSE_40:
|
|
return sparse(type);
|
|
case DENSE_12:
|
|
return dense(type);
|
|
case DENSE_16:
|
|
return dense(type);
|
|
case DENSE_24:
|
|
return dense(type);
|
|
case DENSE_32:
|
|
return dense(type);
|
|
case DENSE_40:
|
|
return dense(type);
|
|
case LONG_DENSE:
|
|
return dense(type);
|
|
}
|
|
[[unlikely]] sstables::on_bti_parse_error(pos);
|
|
}
|
|
|
|
std::byte bti_get_child_transition(uint64_t pos, const_bytes raw, int idx) {
|
|
auto type = uint8_t(raw[0]) >> 4;
|
|
switch (type) {
|
|
case PAYLOAD_ONLY:
|
|
abort();
|
|
case SINGLE_NOPAYLOAD_4:
|
|
return raw[1];
|
|
case SINGLE_8:
|
|
return raw[1];
|
|
case SINGLE_NOPAYLOAD_12:
|
|
return raw[2];
|
|
case SINGLE_16:
|
|
return raw[1];
|
|
case SPARSE_8:
|
|
case SPARSE_12:
|
|
case SPARSE_16:
|
|
case SPARSE_24:
|
|
case SPARSE_40:
|
|
return raw[2 + idx];
|
|
case DENSE_12:
|
|
case DENSE_16:
|
|
case DENSE_24:
|
|
case DENSE_32:
|
|
case DENSE_40:
|
|
case LONG_DENSE:
|
|
return std::byte(uint8_t(raw[1]) + idx);
|
|
}
|
|
[[unlikely]] sstables::on_bti_parse_error(pos);
|
|
}
|
|
|
|
load_final_node_result bti_read_node(int64_t pos, const_bytes sp) {
|
|
load_final_node_result result;
|
|
auto type = uint8_t(sp[0]) >> 4;
|
|
auto single = [&](uint8_t payload_bits) {
|
|
result.n_children = 1;
|
|
result.payload_bits = payload_bits;
|
|
return result;
|
|
};
|
|
auto sparse = [&] [[gnu::always_inline]] (int type) {
|
|
int n_children = int(sp[1]);
|
|
result.n_children = n_children;
|
|
result.payload_bits = uint8_t(sp[0]) & 0xf;
|
|
return result;
|
|
};
|
|
auto dense = [&] [[gnu::always_inline]] (int type) {
|
|
auto dense_span = uint64_t(sp[2]) + 1;
|
|
result.n_children = dense_span;
|
|
result.payload_bits = uint8_t(sp[0]) & 0xf;
|
|
return result;
|
|
};
|
|
switch (type) {
|
|
case PAYLOAD_ONLY:
|
|
result.payload_bits = uint8_t(sp[0]) & 0xf;
|
|
result.n_children = 0;
|
|
return result;
|
|
case SINGLE_NOPAYLOAD_4:
|
|
return single(0);
|
|
case SINGLE_NOPAYLOAD_12:
|
|
return single(0);
|
|
case SINGLE_8:
|
|
return single(uint8_t(sp[0]) & 0xf);
|
|
case SINGLE_16:
|
|
return single(uint8_t(sp[0]) & 0xf);
|
|
case SPARSE_8:
|
|
return sparse(type);
|
|
case SPARSE_12:
|
|
return sparse(type);
|
|
case SPARSE_16:
|
|
return sparse(type);
|
|
case SPARSE_24:
|
|
return sparse(type);
|
|
case SPARSE_40:
|
|
return sparse(type);
|
|
case DENSE_12:
|
|
return dense(type);
|
|
case DENSE_16:
|
|
return dense(type);
|
|
case DENSE_24:
|
|
return dense(type);
|
|
case DENSE_32:
|
|
return dense(type);
|
|
case DENSE_40:
|
|
return dense(type);
|
|
case LONG_DENSE:
|
|
return dense(type);
|
|
}
|
|
[[unlikely]] sstables::on_bti_parse_error(pos);
|
|
}
|
|
|
|
const_bytes bti_get_payload(int64_t pos, const_bytes sp) {
|
|
auto type = uint8_t(sp[0]) >> 4;
|
|
switch (type) {
|
|
case PAYLOAD_ONLY:
|
|
return sp.subspan(1);
|
|
case SINGLE_NOPAYLOAD_4:
|
|
case SINGLE_NOPAYLOAD_12:
|
|
return sp.subspan(1 + div_ceil(bits_per_pointer_arr[type], 8));
|
|
case SINGLE_8:
|
|
case SINGLE_16:
|
|
return sp.subspan(2 + div_ceil(bits_per_pointer_arr[type], 8));
|
|
case SPARSE_8:
|
|
case SPARSE_12:
|
|
case SPARSE_16:
|
|
case SPARSE_24:
|
|
case SPARSE_40: {
|
|
auto n_children = uint8_t(sp[1]);
|
|
return sp.subspan(2 + div_ceil(n_children * (8 + bits_per_pointer_arr[type]), 8));
|
|
}
|
|
case DENSE_12:
|
|
case DENSE_16:
|
|
case DENSE_24:
|
|
case DENSE_32:
|
|
case DENSE_40:
|
|
case LONG_DENSE: {
|
|
auto dense_span = uint8_t(sp[2]) + 1;
|
|
return sp.subspan(3 + div_ceil(dense_span * bits_per_pointer_arr[type], 8));
|
|
}
|
|
}
|
|
[[unlikely]] sstables::on_bti_parse_error(pos);
|
|
}
|
|
|
|
node_traverse_result bti_walk_down_along_key(int64_t pos, const_bytes sp, const_bytes key) {
|
|
auto type = uint8_t(sp[0]) >> 4;
|
|
trie::node_traverse_result result;
|
|
result.body_pos = pos;
|
|
result.traversed_key_bytes = 0;
|
|
auto single = [&](std::byte edge, uint64_t offset, uint8_t payload_bits) {
|
|
result.n_children = 1;
|
|
result.payload_bits = payload_bits;
|
|
if (key[0] <= edge) {
|
|
result.found_idx = 0;
|
|
result.found_byte = int(edge);
|
|
result.child_offset = offset;
|
|
} else {
|
|
result.found_idx = 1;
|
|
result.found_byte = -1;
|
|
result.child_offset = -1;
|
|
}
|
|
return result;
|
|
};
|
|
auto sparse = [&] [[gnu::always_inline]] (int type) {
|
|
int n_children = int(sp[1]);
|
|
auto idx = std::lower_bound(&sp[2], &sp[2 + n_children], key[0]) - &sp[2];
|
|
result.n_children = n_children;
|
|
result.payload_bits = uint8_t(sp[0]) & 0xf;
|
|
result.found_idx = idx;
|
|
if (idx < n_children) {
|
|
auto bpp = bits_per_pointer_arr[type];
|
|
result.child_offset = read_offset(sp.subspan(2 + n_children), idx, bpp);
|
|
result.found_byte = int(sp[2 + idx]);
|
|
} else {
|
|
result.child_offset = -1;
|
|
result.found_byte = -1;
|
|
}
|
|
return result;
|
|
};
|
|
auto dense = [&] [[gnu::always_inline]] (int type) {
|
|
auto start = int(sp[1]);
|
|
auto idx = std::max<int>(0, int(key[0]) - start);
|
|
auto dense_span = uint64_t(sp[2]) + 1;
|
|
auto bpp = bits_per_pointer_arr[type];
|
|
result.n_children = dense_span;
|
|
result.payload_bits = uint8_t(sp[0]) & 0xf;
|
|
while (idx < int(dense_span)) {
|
|
if (auto off = read_offset(sp.subspan(3), idx, bpp)) {
|
|
result.child_offset = off;
|
|
result.found_idx = idx;
|
|
result.found_byte = start + idx;
|
|
return result;
|
|
} else {
|
|
++idx;
|
|
}
|
|
}
|
|
result.found_idx = dense_span;
|
|
result.child_offset = -1;
|
|
result.found_byte = -1;
|
|
return result;
|
|
};
|
|
switch (type) {
|
|
case PAYLOAD_ONLY:
|
|
result.payload_bits = uint8_t(sp[0]) & 0xf;
|
|
result.n_children = 0;
|
|
result.found_idx = 0;
|
|
result.found_byte = -1;
|
|
result.child_offset = -1;
|
|
return result;
|
|
case SINGLE_NOPAYLOAD_4:
|
|
return single(sp[1], uint64_t(sp[0]) & 0xf, 0);
|
|
case SINGLE_NOPAYLOAD_12:
|
|
return single(sp[2], (uint64_t(sp[0]) & 0xf) << 8 | uint64_t(sp[1]), 0);
|
|
case SINGLE_8:
|
|
return single(sp[1], uint64_t(sp[2]), uint8_t(sp[0]) & 0xf);
|
|
case SINGLE_16:
|
|
return single(sp[1], uint64_t(sp[2]) << 8 | uint64_t(sp[3]), uint8_t(sp[0]) & 0xf);
|
|
case SPARSE_8:
|
|
return sparse(type);
|
|
case SPARSE_12:
|
|
return sparse(type);
|
|
case SPARSE_16:
|
|
return sparse(type);
|
|
case SPARSE_24:
|
|
return sparse(type);
|
|
case SPARSE_40:
|
|
return sparse(type);
|
|
case DENSE_12:
|
|
return dense(type);
|
|
case DENSE_16:
|
|
return dense(type);
|
|
case DENSE_24:
|
|
return dense(type);
|
|
case DENSE_32:
|
|
return dense(type);
|
|
case DENSE_40:
|
|
return dense(type);
|
|
case LONG_DENSE:
|
|
return dense(type);
|
|
}
|
|
[[unlikely]] sstables::on_bti_parse_error(pos);
|
|
}
|
|
|
|
node_traverse_sidemost_result bti_walk_down_leftmost_path(int64_t pos, const_bytes sp) {
|
|
auto type = uint8_t(sp[0]) >> 4;
|
|
trie::node_traverse_sidemost_result result;
|
|
result.body_pos = pos;
|
|
auto single = [&](uint64_t offset, uint8_t payload_bits) {
|
|
result.n_children = 1;
|
|
result.payload_bits = payload_bits;
|
|
result.child_offset = offset;
|
|
return result;
|
|
};
|
|
auto sparse = [&] [[gnu::always_inline]] (int type) {
|
|
int n_children = int(sp[1]);
|
|
auto bpp = bits_per_pointer_arr[type];
|
|
result.n_children = n_children;
|
|
result.payload_bits = uint8_t(sp[0]) & 0xf;
|
|
result.child_offset = read_offset(sp.subspan(2 + n_children), 0, bpp);
|
|
return result;
|
|
};
|
|
auto dense = [&] [[gnu::always_inline]] (int type) {
|
|
auto dense_span = uint64_t(sp[2]) + 1;
|
|
auto bpp = bits_per_pointer_arr[type];
|
|
result.n_children = dense_span;
|
|
result.payload_bits = uint8_t(sp[0]) & 0xf;
|
|
result.child_offset = read_offset(sp.subspan(3), 0, bpp);
|
|
return result;
|
|
};
|
|
switch (type) {
|
|
case PAYLOAD_ONLY:
|
|
result.payload_bits = uint8_t(sp[0]) & 0xf;
|
|
result.n_children = 0;
|
|
result.child_offset = -1;
|
|
return result;
|
|
case SINGLE_NOPAYLOAD_4:
|
|
return single(uint64_t(sp[0]) & 0xf, 0);
|
|
case SINGLE_NOPAYLOAD_12:
|
|
return single((uint64_t(sp[0]) & 0xf) << 8 | uint64_t(sp[1]), 0);
|
|
case SINGLE_8:
|
|
return single(uint64_t(sp[2]), uint8_t(sp[0]) & 0xf);
|
|
case SINGLE_16:
|
|
return single(uint64_t(sp[2]) << 8 | uint64_t(sp[3]), uint8_t(sp[0]) & 0xf);
|
|
case SPARSE_8:
|
|
return sparse(type);
|
|
case SPARSE_12:
|
|
return sparse(type);
|
|
case SPARSE_16:
|
|
return sparse(type);
|
|
case SPARSE_24:
|
|
return sparse(type);
|
|
case SPARSE_40:
|
|
return sparse(type);
|
|
case DENSE_12:
|
|
return dense(type);
|
|
case DENSE_16:
|
|
return dense(type);
|
|
case DENSE_24:
|
|
return dense(type);
|
|
case DENSE_32:
|
|
return dense(type);
|
|
case DENSE_40:
|
|
return dense(type);
|
|
case LONG_DENSE:
|
|
return dense(type);
|
|
}
|
|
[[unlikely]] sstables::on_bti_parse_error(pos);
|
|
}
|
|
|
|
node_traverse_sidemost_result bti_walk_down_rightmost_path(int64_t pos, const_bytes sp) {
|
|
auto type = uint8_t(sp[0]) >> 4;
|
|
trie::node_traverse_sidemost_result result;
|
|
result.body_pos = pos;
|
|
auto single = [&](uint64_t offset, uint8_t payload_bits) {
|
|
result.n_children = 1;
|
|
result.payload_bits = payload_bits;
|
|
result.child_offset = offset;
|
|
return result;
|
|
};
|
|
auto sparse = [&] [[gnu::always_inline]] (int type) {
|
|
int n_children = int(sp[1]);
|
|
auto bpp = bits_per_pointer_arr[type];
|
|
result.n_children = n_children;
|
|
result.payload_bits = uint8_t(sp[0]) & 0xf;
|
|
result.child_offset = read_offset(sp.subspan(2 + n_children), n_children - 1, bpp);
|
|
return result;
|
|
};
|
|
auto dense = [&] [[gnu::always_inline]] (int type) {
|
|
auto dense_span = uint64_t(sp[2]) + 1;
|
|
auto bpp = bits_per_pointer_arr[type];
|
|
result.n_children = dense_span;
|
|
result.payload_bits = uint8_t(sp[0]) & 0xf;
|
|
result.child_offset = read_offset(sp.subspan(3), dense_span - 1, bpp);
|
|
return result;
|
|
};
|
|
switch (type) {
|
|
case PAYLOAD_ONLY:
|
|
result.payload_bits = uint8_t(sp[0]) & 0xf;
|
|
result.n_children = 0;
|
|
result.child_offset = -1;
|
|
return result;
|
|
case SINGLE_NOPAYLOAD_4:
|
|
return single(uint64_t(sp[0]) & 0xf, 0);
|
|
case SINGLE_NOPAYLOAD_12:
|
|
return single((uint64_t(sp[0]) & 0xf) << 8 | uint64_t(sp[1]), 0);
|
|
case SINGLE_8:
|
|
return single(uint64_t(sp[2]), uint8_t(sp[0]) & 0xf);
|
|
case SINGLE_16:
|
|
return single(uint64_t(sp[2]) << 8 | uint64_t(sp[3]), uint8_t(sp[0]) & 0xf);
|
|
case SPARSE_8:
|
|
return sparse(type);
|
|
case SPARSE_12:
|
|
return sparse(type);
|
|
case SPARSE_16:
|
|
return sparse(type);
|
|
case SPARSE_24:
|
|
return sparse(type);
|
|
case SPARSE_40:
|
|
return sparse(type);
|
|
case DENSE_12:
|
|
return dense(type);
|
|
case DENSE_16:
|
|
return dense(type);
|
|
case DENSE_24:
|
|
return dense(type);
|
|
case DENSE_32:
|
|
return dense(type);
|
|
case DENSE_40:
|
|
return dense(type);
|
|
case LONG_DENSE:
|
|
return dense(type);
|
|
}
|
|
[[unlikely]] sstables::on_bti_parse_error(pos);
|
|
}
|
|
|
|
bti_node_reader::bti_node_reader(cached_file& f)
|
|
: _file(f) {
|
|
}
|
|
|
|
bool bti_node_reader::cached(int64_t pos) const {
|
|
return _cached_page && _cached_page->pos() / cached_file::page_size == pos / cached_file::page_size;
|
|
}
|
|
|
|
seastar::future<> bti_node_reader::load(int64_t pos, const reader_permit& permit, const tracing::trace_state_ptr& trace_ptr) {
|
|
if (cached(pos)) {
|
|
return make_ready_future<>();
|
|
}
|
|
return _file.get().get_shared_page(pos, permit, trace_ptr).then([this](cached_file::page_read_result page) {
|
|
_cached_page = std::move(page.ptr);
|
|
});
|
|
}
|
|
|
|
trie::load_final_node_result bti_node_reader::read_node(int64_t pos) {
|
|
SCYLLA_ASSERT(cached(pos));
|
|
auto sp = _cached_page->get_view().subspan(pos % cached_file::page_size);
|
|
return bti_read_node(pos, sp);
|
|
}
|
|
|
|
trie::node_traverse_result bti_node_reader::walk_down_along_key(int64_t pos, const_bytes key) {
|
|
SCYLLA_ASSERT(cached(pos));
|
|
auto sp = _cached_page->get_view().subspan(pos % cached_file::page_size);
|
|
return bti_walk_down_along_key(pos, sp, key);
|
|
}
|
|
|
|
trie::node_traverse_sidemost_result bti_node_reader::walk_down_leftmost_path(int64_t pos) {
|
|
SCYLLA_ASSERT(cached(pos));
|
|
auto sp = _cached_page->get_view().subspan(pos % cached_file::page_size);
|
|
return bti_walk_down_leftmost_path(pos, sp);
|
|
}
|
|
|
|
trie::node_traverse_sidemost_result bti_node_reader::walk_down_rightmost_path(int64_t pos) {
|
|
SCYLLA_ASSERT(cached(pos));
|
|
auto sp = _cached_page->get_view().subspan(pos % cached_file::page_size);
|
|
return bti_walk_down_rightmost_path(pos, sp);
|
|
}
|
|
|
|
trie::get_child_result bti_node_reader::get_child(int64_t pos, int child_idx, bool forward) const {
|
|
SCYLLA_ASSERT(cached(pos));
|
|
auto sp = _cached_page->get_view().subspan(pos % cached_file::page_size);
|
|
return bti_get_child(pos, sp, child_idx, forward);
|
|
}
|
|
|
|
const_bytes bti_node_reader::get_payload(int64_t pos) const {
|
|
SCYLLA_ASSERT(cached(pos));
|
|
auto sp = _cached_page->get_view().subspan(pos % cached_file::page_size);
|
|
return bti_get_payload(pos, sp);
|
|
}
|
|
|
|
} // namespace sstables::trie
|