/* * Copyright (C) 2024-present ScyllaDB */ /* * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0 */ #pragma once #include "writer_node.hh" #include "utils/small_vector.hh" namespace sstables::trie { template sink_offset writer_node::recalc_sizes(ptr self, const Output& out, sink_pos global_pos) { // Note: this routine is very similar to `write` in its overall structure. // A trie can be arbitrarily deep, so we can't afford to use recursion. // We have to maintain a walk stack manually. struct local_state { ptr _node; // The index of the next child to be visited. // When equal to the number of children, it's time to walk out of the node. int _stage; // The start position of the node's entire subtree. sink_pos _startpos; }; utils::small_vector stack; // Depth-first walk. // To calculate the sizes, we essentially simulate a write of the tree. stack.push_back({self, 0, global_pos}); while (!stack.empty()) { // Caution: note that `node`, `pos`, `stage` are references to contents of `stack`. // Any pushing or popping will invalidate them, so be sure not to use them afterwards! auto& [node, stage, startpos] = stack.back(); // If the node has out of page grandchildren, the sizes of children might have changed from the estimate, // so we have to recurse into the children and update them. if (stage < static_cast(node->get_children().size()) && node->_has_out_of_page_descendants) { stage += 1; stack.push_back({node->get_children()[stage - 1], 0, global_pos}); continue; } // If we got here, then either we have recursed into children // (and then global_pos was updated accordingly), // or we skipped that because there was no need. In the latter case, // we have to update global_pos manually here. if (!node->_has_out_of_page_descendants) { global_pos = global_pos + node->_branch_size; } node->_branch_size = global_pos - startpos; if (node->_has_out_of_page_children || node->_has_out_of_page_descendants) { // The size of children might have changed, which might have in turn changed // our offsets to children, which might have changed our size. // We have to recalculate. node->_node_size = out.serialized_size(*node, sink_pos(global_pos)); expensive_assert(uint64_t(node->_node_size.value) <= out.page_size()); } global_pos = global_pos + node->_node_size; stack.pop_back(); } return self->_branch_size + self->_node_size; } template void writer_node::write(ptr self, Output& out, bool guaranteed_fit) { expensive_log( "writer_node::write: subtree={} pos={}, ps={} sz={} ", fmt::ptr(self.get()), out.pos().value, out.page_size(), (self->_branch_size + self->_node_size).value); expensive_assert(!self->_pos.valid()); expensive_assert(self->_node_size.valid()); if (guaranteed_fit) { auto page_of_first_byte = round_down(out.pos().value, out.page_size()); auto page_of_last_byte = round_down((out.pos() + self->_branch_size + self->_node_size).value - 1, out.page_size()); expensive_assert(page_of_first_byte == page_of_last_byte); } auto starting_pos = out.pos(); // A trie can be arbitrarily deep, so we can't afford to use recursion. // We have to maintain a walk stack manually. struct local_state { ptr _node; // The index of the next child to be visited. // When equal to the number of children, it's time to walk out of the node. int _stage; // The start position of the node's entire subtree. sink_pos _startpos; }; // Note: partition index should almost always have depth smaller than 6. // Row index can have depth of up to several thousand. utils::small_vector stack; // Depth-first walk. // We write out the subtree in postorder. stack.push_back({self, 0, starting_pos}); while (!stack.empty()) { // Caution: note that `node`, `pos`, `stage` are references to contents of `stack`. // Any pushing or popping will invalidate them, so be sure not to use them afterwards! auto& [node, stage, startpos] = stack.back(); if (stage < static_cast(node->get_children().size())) { stage += 1; if (!node->get_children()[stage - 1]->_pos.valid()) { stack.push_back({node->get_children()[stage - 1], 0, out.pos()}); } continue; } // Leaf nodes must have a payload. expensive_assert(node->get_children().size() || node->_payload._payload_bits); expensive_log("writer_node::write: node={} pos={} n_children={} size={} transition_length={}", fmt::ptr(node.get()), out.pos().value, node->get_children().size(), node->_node_size.value, node->_transition_length); if (guaranteed_fit) { SCYLLA_ASSERT(out.pos() - startpos == node->_branch_size); node->_pos = sink_pos(out.write(*node, sink_pos(out.pos()))); SCYLLA_ASSERT(out.pos() - startpos == node->_branch_size + node->_node_size); } else { if (uint64_t(out.serialized_size(*node, sink_pos(out.pos())).value) > out.bytes_left_in_page()) { out.pad_to_page_boundary(); } node->_branch_size = out.pos() - startpos; auto before = out.pos(); node->_pos = sink_pos(out.write(*node, sink_pos(out.pos()))); node->_node_size = node_size((out.pos() - before).value); expensive_assert(uint64_t(node->_node_size.value) <= out.page_size()); } node->_first_transition_byte = std::byte(node->_transition[0]); stack.pop_back(); } expensive_assert(out.pos() - starting_pos == self->_branch_size + self->_node_size); self->_branch_size = sink_offset(0); self->_node_size = node_size(0); self->_has_out_of_page_children = false; self->_has_out_of_page_descendants = false; } } // namespace sstables::trie