Files
scylladb/sstables/trie/writer_node.impl.hh
Avi Kivity 0ae22a09d4 LICENSE: Update to version 1.1
Updated terms of non-commercial use (must be a never-customer).
2026-04-12 19:46:33 +03:00

139 lines
6.2 KiB
C++

/*
* Copyright (C) 2024-present ScyllaDB
*/
/*
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
*/
#pragma once
#include "writer_node.hh"
#include "utils/small_vector.hh"
namespace sstables::trie {
template <trie_writer_sink Output>
sink_offset writer_node::recalc_sizes(ptr<writer_node> self, const Output& out, sink_pos global_pos) {
// Note: this routine is very similar to `write` in its overall structure.
// A trie can be arbitrarily deep, so we can't afford to use recursion.
// We have to maintain a walk stack manually.
struct local_state {
ptr<writer_node> _node;
// The index of the next child to be visited.
// When equal to the number of children, it's time to walk out of the node.
int _stage;
// The start position of the node's entire subtree.
sink_pos _startpos;
};
utils::small_vector<local_state, 8> stack;
// Depth-first walk.
// To calculate the sizes, we essentially simulate a write of the tree.
stack.push_back({self, 0, global_pos});
while (!stack.empty()) {
// Caution: note that `node`, `pos`, `stage` are references to contents of `stack`.
// Any pushing or popping will invalidate them, so be sure not to use them afterwards!
auto& [node, stage, startpos] = stack.back();
// If the node has out of page grandchildren, the sizes of children might have changed from the estimate,
// so we have to recurse into the children and update them.
if (stage < static_cast<int>(node->get_children().size()) && node->_has_out_of_page_descendants) {
stage += 1;
stack.push_back({node->get_children()[stage - 1], 0, global_pos});
continue;
}
// If we got here, then either we have recursed into children
// (and then global_pos was updated accordingly),
// or we skipped that because there was no need. In the latter case,
// we have to update global_pos manually here.
if (!node->_has_out_of_page_descendants) {
global_pos = global_pos + node->_branch_size;
}
node->_branch_size = global_pos - startpos;
if (node->_has_out_of_page_children || node->_has_out_of_page_descendants) {
// The size of children might have changed, which might have in turn changed
// our offsets to children, which might have changed our size.
// We have to recalculate.
node->_node_size = out.serialized_size(*node, sink_pos(global_pos));
expensive_assert(uint64_t(node->_node_size.value) <= out.page_size());
}
global_pos = global_pos + node->_node_size;
stack.pop_back();
}
return self->_branch_size + self->_node_size;
}
template <trie_writer_sink Output>
void writer_node::write(ptr<writer_node> self, Output& out, bool guaranteed_fit) {
expensive_log(
"writer_node::write: subtree={} pos={}, ps={} sz={} ",
fmt::ptr(self.get()), out.pos().value, out.page_size(), (self->_branch_size + self->_node_size).value);
expensive_assert(!self->_pos.valid());
expensive_assert(self->_node_size.valid());
if (guaranteed_fit) {
auto page_of_first_byte = round_down(out.pos().value, out.page_size());
auto page_of_last_byte = round_down((out.pos() + self->_branch_size + self->_node_size).value - 1, out.page_size());
expensive_assert(page_of_first_byte == page_of_last_byte);
}
auto starting_pos = out.pos();
// A trie can be arbitrarily deep, so we can't afford to use recursion.
// We have to maintain a walk stack manually.
struct local_state {
ptr<writer_node> _node;
// The index of the next child to be visited.
// When equal to the number of children, it's time to walk out of the node.
int _stage;
// The start position of the node's entire subtree.
sink_pos _startpos;
};
// Note: partition index should almost always have depth smaller than 6.
// Row index can have depth of up to several thousand.
utils::small_vector<local_state, 8> stack;
// Depth-first walk.
// We write out the subtree in postorder.
stack.push_back({self, 0, starting_pos});
while (!stack.empty()) {
// Caution: note that `node`, `pos`, `stage` are references to contents of `stack`.
// Any pushing or popping will invalidate them, so be sure not to use them afterwards!
auto& [node, stage, startpos] = stack.back();
if (stage < static_cast<int>(node->get_children().size())) {
stage += 1;
if (!node->get_children()[stage - 1]->_pos.valid()) {
stack.push_back({node->get_children()[stage - 1], 0, out.pos()});
}
continue;
}
// Leaf nodes must have a payload.
expensive_assert(node->get_children().size() || node->_payload._payload_bits);
expensive_log("writer_node::write: node={} pos={} n_children={} size={} transition_length={}",
fmt::ptr(node.get()), out.pos().value, node->get_children().size(), node->_node_size.value, node->_transition_length);
if (guaranteed_fit) {
SCYLLA_ASSERT(out.pos() - startpos == node->_branch_size);
node->_pos = sink_pos(out.write(*node, sink_pos(out.pos())));
SCYLLA_ASSERT(out.pos() - startpos == node->_branch_size + node->_node_size);
} else {
if (uint64_t(out.serialized_size(*node, sink_pos(out.pos())).value) > out.bytes_left_in_page()) {
out.pad_to_page_boundary();
}
node->_branch_size = out.pos() - startpos;
auto before = out.pos();
node->_pos = sink_pos(out.write(*node, sink_pos(out.pos())));
node->_node_size = node_size((out.pos() - before).value);
expensive_assert(uint64_t(node->_node_size.value) <= out.page_size());
}
node->_first_transition_byte = std::byte(node->_transition[0]);
stack.pop_back();
}
expensive_assert(out.pos() - starting_pos == self->_branch_size + self->_node_size);
self->_branch_size = sink_offset(0);
self->_node_size = node_size(0);
self->_has_out_of_page_children = false;
self->_has_out_of_page_descendants = false;
}
} // namespace sstables::trie