mirror of
https://github.com/scylladb/scylladb.git
synced 2026-05-22 15:52:13 +00:00
558 lines
23 KiB
C++
558 lines
23 KiB
C++
/*
|
|
* Copyright (C) 2024-present ScyllaDB
|
|
*/
|
|
|
|
/*
|
|
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
|
|
*/
|
|
|
|
#pragma once
|
|
|
|
#include <cstdint>
|
|
#include "common.hh"
|
|
#include "utils/assert.hh"
|
|
|
|
// Adds expensive extra checks against use-after-free on pointers obtained from the bump_allocator.
|
|
// Must be a macro so that it can affect whether some struct fields are defined or not.
|
|
#define TRIE_SANITIZE_BUMP_ALLOCATOR 0
|
|
|
|
#if TRIE_SANITIZE_BUMP_ALLOCATOR
|
|
static_assert(sstables::trie::developer_build, "TRIE_SANITIZE_BUMP_ALLOCATOR needs sstables::trie::developer_build == true");
|
|
#include <map>
|
|
#endif
|
|
|
|
namespace sstables::trie {
|
|
|
|
// Special-purpose stack-like allocator.
|
|
//
|
|
// It has two allowed operations:
|
|
// 1. Append a new buffer to the "stack".
|
|
// 2. Shrink the stack to the given address, invalidating everything allocated after it.
|
|
// (In particular, it's legal to shrink the stack to the middle of some allocation,
|
|
// invalidating the allocation's tail but preserving its head.)
|
|
//
|
|
// This exists because the trie writer's allocation pattern is quite stack-like,
|
|
// and it would be a shame not to try to take advantage of it.
|
|
//
|
|
// That said, this isn't really a critical optimization.
|
|
// Last time I checked, it only saves around ~25% of work.
|
|
// I'm not sure if it's worth the added complexity.
|
|
class bump_allocator {
|
|
public:
|
|
// This is just a sanitizing wrapper around a pointer.
|
|
// In release builds it should behave exactly like a regular pointer.
|
|
template <typename T>
|
|
struct ptr {
|
|
using value_type = std::conditional_t<std::is_array_v<T>, std::remove_extent_t<T>, T>;
|
|
friend bump_allocator;
|
|
value_type* _ptr = nullptr;
|
|
#if TRIE_SANITIZE_BUMP_ALLOCATOR
|
|
// Owner.
|
|
bump_allocator* _alctr = nullptr;
|
|
// The index of last discard at the time this allocation was created.
|
|
size_t _discard_counter = 0;
|
|
// The position of this allocation in the allocator's stack.
|
|
size_t _global_pos = 0;
|
|
// The size of this allocation, in bytes.
|
|
// Only a past-the-end pointer, obtained with x.offset(x._size), might have _size equal to zero.
|
|
// Other pointers must have non-zero _size.
|
|
size_t _size = 0;
|
|
#endif
|
|
public:
|
|
ptr() {}
|
|
[[nodiscard]] value_type* get() const noexcept {
|
|
#if TRIE_SANITIZE_BUMP_ALLOCATOR
|
|
if (_ptr) {
|
|
auto legal_timestamp_it = std::prev(this->_alctr->_max_legal_timestamp.lower_bound(this->_global_pos + this->_size - 1));
|
|
expensive_assert(this->_discard_counter >= legal_timestamp_it->second);
|
|
expensive_assert(this->_global_pos + (this->_size != 0) <= this->_alctr->global_pos());
|
|
}
|
|
#endif
|
|
return _ptr;
|
|
}
|
|
[[nodiscard]] value_type& operator[] (size_t i) const noexcept {
|
|
#if TRIE_SANITIZE_BUMP_ALLOCATOR
|
|
expensive_assert(_size);
|
|
#endif
|
|
return get()[i];
|
|
};
|
|
[[nodiscard]] value_type& operator*() const noexcept {
|
|
#if TRIE_SANITIZE_BUMP_ALLOCATOR
|
|
expensive_assert(_size);
|
|
#endif
|
|
return get()[0];
|
|
};
|
|
[[nodiscard]] value_type* operator->() const noexcept {
|
|
#if TRIE_SANITIZE_BUMP_ALLOCATOR
|
|
expensive_assert(_size);
|
|
#endif
|
|
return get();
|
|
};
|
|
// Returns a pointer which represents the tail of this allocation,
|
|
// but starting at `n`th element.
|
|
[[nodiscard]] ptr offset(size_t n) const noexcept {
|
|
auto ret = *this;
|
|
ret._ptr += n;
|
|
#if TRIE_SANITIZE_BUMP_ALLOCATOR
|
|
size_t sz = n * sizeof(value_type);
|
|
expensive_assert(ret._size >= sz);
|
|
ret._global_pos += sz;
|
|
ret._size -= sz;
|
|
#endif
|
|
return ret;
|
|
};
|
|
[[nodiscard]] ptr<value_type> element(size_t n) const noexcept
|
|
requires(std::is_array_v<T>) {
|
|
auto tmp = offset(n);
|
|
ptr<value_type> ret;
|
|
ret._ptr = tmp._ptr;
|
|
#if TRIE_SANITIZE_BUMP_ALLOCATOR
|
|
ret._alctr = tmp._alctr;
|
|
ret._discard_counter = tmp._discard_counter;
|
|
ret._global_pos = tmp._global_pos;
|
|
ret._size = tmp._size;
|
|
#endif
|
|
return ret;
|
|
};
|
|
// Splits the allocation in two.
|
|
[[nodiscard]] std::pair<ptr, ptr> split(size_t first_size) const noexcept
|
|
requires(std::is_array_v<T>)
|
|
{
|
|
auto ret1 = *this;
|
|
#if TRIE_SANITIZE_BUMP_ALLOCATOR
|
|
size_t sz = first_size * sizeof(value_type);
|
|
expensive_assert(_size > sz);
|
|
ret1._size = sz;
|
|
#endif
|
|
auto ret2 = this->offset(first_size);
|
|
return {ret1, ret2};
|
|
};
|
|
};
|
|
|
|
constexpr static size_t segment_alignment = alignof(max_align_t);
|
|
// As usual in Scylla, the allocation stack is fragmented to avoid large contiguous buffers.
|
|
size_t _segment_size;
|
|
|
|
private:
|
|
struct aligned_deleter {
|
|
void operator()(std::byte ptr[]) { free(ptr); }
|
|
};
|
|
using buf = std::unique_ptr<std::byte[], aligned_deleter>;
|
|
// We keep the most recently freed segment here.
|
|
// If we need a new buffer and this is present,
|
|
// we reuse it instead of allocating a new one.
|
|
// This ensures we won't repeatedly free and allocate a segment from the main allocator.
|
|
buf _spare = nullptr;
|
|
// The segment currently at the top of the stack.
|
|
buf _current = nullptr;
|
|
// Remaining free space in _current.
|
|
size_t _remaining = 0;
|
|
// Previous segments in the stack.
|
|
std::vector<buf> _old;
|
|
|
|
#if TRIE_SANITIZE_BUMP_ALLOCATOR
|
|
// The number of discard() calls until now.
|
|
uint64_t _discard_timestamp = 0;
|
|
// Maps given stack positions to the index of the last discard() covering that position.
|
|
// A valid pointer to some position will have a _discard_timestamp which matches this map.
|
|
// An invalidated pointer will have a _discard_timestamp which is smaller than that.
|
|
std::map<size_t, size_t> _max_legal_timestamp{{0, 0}};
|
|
#endif
|
|
|
|
private:
|
|
// Returns the current total size of the "stack".
|
|
size_t global_pos() const {
|
|
return allocated_memory() - _remaining;
|
|
}
|
|
|
|
// Invalidate all bytes in the stack since the byte at given address (inclusive).
|
|
void discard_impl(std::byte* addr) noexcept {
|
|
expensive_assert(bool(_current));
|
|
while (!(addr > _current.get() && addr <= _current.get() + _segment_size)) [[unlikely]] {
|
|
#if TRIE_SANITIZE_BUMP_ALLOCATOR
|
|
std::memset(_current.get(), 0xfe, _segment_size);
|
|
#endif
|
|
_spare = std::move(_current);
|
|
expensive_assert(!_old.empty());
|
|
_current = std::move(_old.back());
|
|
_remaining = 0;
|
|
_old.pop_back();
|
|
}
|
|
#if TRIE_SANITIZE_BUMP_ALLOCATOR
|
|
auto end = _current.get() + _segment_size - _remaining;
|
|
std::memset(addr, 0xfe, end - addr);
|
|
#endif
|
|
_remaining = _segment_size - (addr - _current.get());
|
|
}
|
|
|
|
void push_segment() {
|
|
if (_current) {
|
|
_old.push_back(std::move(_current));
|
|
}
|
|
if (_spare) {
|
|
_current = std::move(_spare);
|
|
} else {
|
|
_current = std::unique_ptr<std::byte[], aligned_deleter>(
|
|
static_cast<std::byte*>(aligned_alloc(_segment_size, _segment_size)));
|
|
}
|
|
}
|
|
|
|
template <typename T>
|
|
requires std::is_trivially_destructible_v<typename ptr<T>::value_type>
|
|
&& (alignof(typename ptr<T>::value_type) <= segment_alignment)
|
|
[[nodiscard]] ptr<T> alloc_impl(size_t n) {
|
|
using value_type = ptr<T>::value_type;
|
|
expensive_assert(n < _segment_size / sizeof(value_type));
|
|
SCYLLA_ASSERT(n > 0);
|
|
auto sz = n * sizeof(value_type);
|
|
_remaining -= _remaining % alignof(value_type);
|
|
if (sz > _remaining) [[unlikely]] {
|
|
push_segment();
|
|
// We don't use the first byte of a segment because that would make discard() ambiguous --
|
|
// a pointer to the first byte of a segment could point to an allocation starting at that byte,
|
|
// or point past the end of an allocation from a neighbouring segment.
|
|
_remaining = _segment_size - alignof(value_type);
|
|
}
|
|
auto result = &_current[_segment_size - _remaining];
|
|
ptr<T> ret;
|
|
#if TRIE_SANITIZE_BUMP_ALLOCATOR
|
|
ret._global_pos = global_pos();
|
|
ret._size = sz;
|
|
ret._discard_counter = _discard_timestamp;
|
|
ret._alctr = this;
|
|
#endif
|
|
ret._ptr = reinterpret_cast<value_type*>(result);
|
|
_remaining -= sz;
|
|
expensive_log("bump_allocator: alloc: this={} sz={} result={} _remaining={}",
|
|
fmt::ptr(this), sz, fmt::ptr(result), _remaining);
|
|
return ret;
|
|
}
|
|
|
|
public:
|
|
bump_allocator(size_t segment_size) : _segment_size(segment_size) {
|
|
SCYLLA_ASSERT(_segment_size % alignof(max_align_t) == 0);
|
|
}
|
|
|
|
// Total memory usage by this allocator.
|
|
size_t allocated_memory() const {
|
|
return (_old.size() + bool(_current)) * _segment_size;
|
|
}
|
|
|
|
// Allocate a suitably-aligned (uninitialized) buffer for an array of N elements of type T.
|
|
// The total size of this allocation must be smaller than segment_size.
|
|
template <typename T>
|
|
[[nodiscard]] ptr<T> alloc()
|
|
requires (!std::is_array_v<T>) {
|
|
return alloc_impl<T>(1);
|
|
}
|
|
|
|
template <typename T>
|
|
[[nodiscard]] ptr<T> alloc(size_t n)
|
|
requires (std::is_array_v<T>) {
|
|
return alloc_impl<T>(n);
|
|
}
|
|
|
|
// Invalidate all bytes in the stack since the byte at given address (inclusive).
|
|
template <typename T>
|
|
void discard(ptr<T> p) noexcept {
|
|
auto addr = reinterpret_cast<std::byte*>(p.get());
|
|
expensive_log("bump_allocator: discard: this={} addr={} _global_pos={} _remaining={}",
|
|
fmt::ptr(this), fmt::ptr(addr), global_pos(), _remaining);
|
|
discard_impl(addr);
|
|
#if TRIE_SANITIZE_BUMP_ALLOCATOR
|
|
_discard_timestamp++;
|
|
_max_legal_timestamp.erase(_max_legal_timestamp.upper_bound(p._global_pos), _max_legal_timestamp.end());
|
|
_max_legal_timestamp.insert({p._global_pos, _discard_timestamp});
|
|
#endif
|
|
}
|
|
};
|
|
|
|
// The below few types are just strongly-typed wrappers around integers,
|
|
// just so they can have some extra correctness checks and
|
|
// so that we can write .valid() instead of "!= -1".
|
|
// I'm not sure if this is better or worse than just using regular integers.
|
|
|
|
// Represents a difference between positions in the output stream.
|
|
// Just a strongly-typed wrapper around int64_t.
|
|
struct sink_offset {
|
|
int64_t value = -1;
|
|
sink_offset() = default;
|
|
explicit sink_offset(int64_t v) : value(v) {}
|
|
std::strong_ordering operator<=>(const sink_offset& o) const {
|
|
expensive_assert(valid());
|
|
expensive_assert(o.valid());
|
|
return value <=> o.value;
|
|
};
|
|
bool operator==(const sink_offset& o) const = default;
|
|
bool valid() const {
|
|
return value >= 0;
|
|
}
|
|
sink_offset operator+(sink_offset o) const {
|
|
expensive_assert(valid());
|
|
expensive_assert(o.valid());
|
|
return sink_offset{value + o.value};
|
|
}
|
|
};
|
|
|
|
// Represents a size of a serialized node.
|
|
// Just a strongly-typed wrapper around int16_t.
|
|
struct node_size {
|
|
int16_t value = -1;
|
|
node_size() = default;
|
|
explicit node_size(int16_t v) : value(v) {}
|
|
bool valid() const {
|
|
return value >= 0;
|
|
}
|
|
// node_size is a "subtype" of sink_offset
|
|
// in our typing exercise.
|
|
operator sink_offset() const {
|
|
return sink_offset(value);
|
|
}
|
|
};
|
|
|
|
// Represents a position in the output stream.
|
|
// Just a strongly-typed wrapper around int64_t,
|
|
// which forbids some arithmetic operations.
|
|
//
|
|
// E.g. a sink_pos can be subtracted from sink_pos
|
|
// to get an offset,
|
|
// but a sink_pos can't be added to sink_pos, which
|
|
// would be nonsense.
|
|
struct sink_pos {
|
|
int64_t value = -1;
|
|
sink_pos() = default;
|
|
explicit sink_pos(int64_t v) : value(v) {}
|
|
bool valid() const {
|
|
return value >= 0;
|
|
}
|
|
std::strong_ordering operator<=>(const sink_pos& o) const {
|
|
expensive_assert(valid());
|
|
expensive_assert(o.valid());
|
|
return value <=> o.value;
|
|
};
|
|
sink_pos operator+(sink_offset o) const {
|
|
expensive_assert(valid());
|
|
expensive_assert(o.valid());
|
|
return sink_pos{value + o.value};
|
|
}
|
|
sink_pos operator-(sink_offset o) const {
|
|
expensive_assert(valid());
|
|
expensive_assert(o.valid());
|
|
return sink_pos{value - o.value};
|
|
}
|
|
sink_offset operator-(sink_pos pos) const {
|
|
expensive_assert(valid());
|
|
expensive_assert(pos.valid());
|
|
return sink_offset{value - pos.value};
|
|
}
|
|
};
|
|
|
|
// The user of the writer inserts (key, payload) pairs. This is the payload.
|
|
// A payload consists of a 4-bit header and several bytes of content.
|
|
struct trie_payload {
|
|
// What the reader will see is a 4-bit "payload bits" header, and a pointer to the body ("bytes") of the payload.
|
|
// The reader will use the header to determine the length and the meaning of the "bytes".
|
|
//
|
|
// The meaning of "bits" and "bytes" is generally opaque to the trie-writing layer,
|
|
// except that the value `bits == 0` must represent "no payload", and "bytes" is assumed
|
|
// empty in this case.
|
|
uint8_t _payload_bits = 0;
|
|
// We store the payload in a std::array + size pair,
|
|
// since it makes managing allocations slightly easier.
|
|
//
|
|
// 20 bytes is the biggest maximum possible payload in the BTI index format,
|
|
// so we don't need more.
|
|
constexpr static int MAX_PAYLOAD_SIZE = 20;
|
|
std::array<std::byte, MAX_PAYLOAD_SIZE> _payload_buf = {};
|
|
// For the writer, the meaning of the "bits" is opaque,
|
|
// we just store an explicit size of the "bytes".
|
|
uint8_t _payload_size = 0;
|
|
|
|
trie_payload() noexcept = default;
|
|
trie_payload(uint8_t bits, const_bytes blob) noexcept {
|
|
expensive_assert(blob.size() <= _payload_buf.size());
|
|
expensive_assert(bits < 16);
|
|
expensive_assert(bool(blob.size()) == bool(bits));
|
|
_payload_bits = bits;
|
|
_payload_size = blob.size();
|
|
std::ranges::copy(blob, _payload_buf.data());
|
|
}
|
|
|
|
// For tests.
|
|
bool operator==(const trie_payload& other) const {
|
|
return other._payload_bits == _payload_bits
|
|
&& std::ranges::equal(other.blob(), blob());
|
|
}
|
|
|
|
const_bytes blob() const noexcept{
|
|
return {_payload_buf.data(), _payload_size};
|
|
}
|
|
};
|
|
|
|
class writer_node;
|
|
|
|
// trie_writer consumes a stream of keys and produces a stream of writer_node objects,
|
|
// which is fed into a trie_writer_sink.
|
|
//
|
|
// We pass the serialization logic to trie_writer via a template parameter for the sake
|
|
// of testability. This way the logic of trie_writer and the serialization format can be tested
|
|
// in isolation from each other.
|
|
template <typename T>
|
|
concept trie_writer_sink = requires(T& o, const T& co, const writer_node& x, sink_pos pos) {
|
|
// Returns the serialized size of node x, if it was written starting at position `pos`.
|
|
// This might only depend on the deltas between `pos` and the positions (known or expected)
|
|
// of children.
|
|
{ o.serialized_size(x, pos) } -> std::same_as<node_size>;
|
|
// Writes out the node and returns the position which the parent should point to.
|
|
// (Which might be different than the start position. This is because the BTI format will
|
|
// turns a node with a multi-character transition into many single-character nodes, and
|
|
// the parent must point to the last one, not the first one).
|
|
{ o.write(x, pos) } -> std::same_as<sink_pos>;
|
|
{ o.pad_to_page_boundary() };
|
|
{ co.pos() } -> std::same_as<sink_pos>;
|
|
{ co.page_size() } -> std::same_as<uint64_t>;
|
|
{ co.bytes_left_in_page() } -> std::same_as<uint64_t>;
|
|
};
|
|
|
|
// The trie writer holds a trie of these nodes while it's building the index pages.
|
|
//
|
|
// Whenever a new key is fed to the writer:
|
|
// 1. The "latest node" pointer climbs up the rightmost path to the point of mismatch between
|
|
// the previous key and the new key, and "completes" nodes along the way.
|
|
// (A node is "completed" when we climb above it, which means that it won't receive any new children).
|
|
// 2. If a newly-completed subtree can't fit into a page of the file writer,
|
|
// branches of the subtree are flushed and freed from memory,
|
|
// and only the root of the subtree remains in memory, with metadata updated accordingly.
|
|
// 3. A new node, containing the chain of new characters after the mismatch point, is added as the latest node.
|
|
struct writer_node {
|
|
template <typename T>
|
|
using ptr = bump_allocator::ptr<T>;
|
|
// The payload of this node. It might be absent -- this is represented by _payload._payload_bits == 0.
|
|
//
|
|
// A node has a payload iff it corresponds to some key.
|
|
// All leaf nodes must contain a payload.
|
|
trie_payload _payload;
|
|
// True iff any of the direct children of this node lies on a different page
|
|
// (i.e. has been written out beforehand, in a different lay_out_children() than this node).
|
|
// It means that the size of this node might change from the initial estimate at completion
|
|
// (because the offset to children might have grow, and the offset integers need more bits to fit),
|
|
// so it has to be recalculated before the actual write.
|
|
//
|
|
// After the node itself is written out, this is reset to `false`.
|
|
//
|
|
// Invalid after _pos is set.
|
|
bool _has_out_of_page_children = false;
|
|
// True iff some yet-unwritten descendant has out-of-page children.
|
|
// This means that the sizes of our children (and thus also our size) might change from the initial
|
|
// estimate at completion time (because some offsets in their subtree might have grown, and the offset
|
|
// integers now require more bits), so they have to be recalculated before the actual write.
|
|
//
|
|
// After the node's children are written out, this is always `false`.
|
|
//
|
|
// Invalid after _pos is set.
|
|
bool _has_out_of_page_descendants = false;
|
|
// This is a byte-based trie. A path from the root to a node corresponds
|
|
// to some key, with each successive node appending several characters.
|
|
// _transition points at the chain of characters (of length _transition_length)
|
|
// corresponding to the edge between this node and its parent.
|
|
//
|
|
// In the root node, _transition is meaningless, though it's still set to something non-empty.
|
|
//
|
|
// In uncompleted nodes, must be allocated after the node itself.
|
|
// (Our memory management assumes a certain order of things inside the bump_allocator).
|
|
//
|
|
// Invalid after _pos is set.
|
|
ptr<std::byte[]> _transition;
|
|
// Before _pos is set: _length represents the length of _transition.
|
|
// After the node is fully constructed, _length will be greater than 0.
|
|
//
|
|
// After _pos is set: _first_byte is the first byte of this node's transition chain.
|
|
union {
|
|
uint32_t _transition_length = 0;
|
|
std::byte _first_transition_byte;
|
|
};
|
|
// The expected serialized size of the node.
|
|
// Depending on context (especially the value of _has_out_of_page_children),
|
|
// this might contain an estimate (which will be re-checked later),
|
|
// or an exact value (which will be later required to be correct).
|
|
//
|
|
// Only valid for completed nodes.
|
|
// Invalidated after _pos is set.
|
|
node_size _node_size;
|
|
// A vector of children, ordered by the first byte of their `_transition`.
|
|
//
|
|
// In uncompleted nodes, must be allocated after the node itself.
|
|
// (Our memory management assumes a certain order of things inside the bump_allocator).
|
|
//
|
|
// Invalid after _pos is set.
|
|
uint16_t _children_capacity = 0;
|
|
uint16_t _children_size = 0;
|
|
ptr<ptr<writer_node>[]> _children;
|
|
// The expected total serialized size of the node's descendants.
|
|
// (In other words: if we wrote the node's entire subtree to a file, without any padding,
|
|
// we would expect the file to grow by _branch_size + _node_size).
|
|
//
|
|
// Depending on context (especially the values
|
|
// of _has_out_of_page_children and _has_out_of_page_descendants),
|
|
// this might be an estimate or an exact value. (Just like _node_size).
|
|
//
|
|
// Only valid for completed nodes.
|
|
// Invalidated after _pos is set.
|
|
sink_offset _branch_size;
|
|
// The identifier (in practice: position) of the node in the output file/stream.
|
|
//
|
|
// Only set after the node is flushed.
|
|
// After _pos is set, most fields stop being meaningful.
|
|
// Only _pos and _transition, remain meaningful, because they will be needed to
|
|
// will need them to writer.
|
|
sink_pos _pos;
|
|
|
|
static ptr<writer_node> create(const_bytes b, bump_allocator&);
|
|
// Emplaces a new child node into `_children`,
|
|
// and returns a pointer to it.
|
|
// Children must be added in order of rising transitions.
|
|
ptr<writer_node> add_child(const_bytes b, bump_allocator&);
|
|
// Accessors.
|
|
std::span<const ptr<writer_node>> get_children() const;
|
|
std::span<ptr<writer_node>> get_children();
|
|
// Resets the _children vector.
|
|
void reset_children();
|
|
// Ensures children has size at least n, growing it if needed.
|
|
void reserve_children(size_t n, bump_allocator& alctr);
|
|
// Appends the new child to _children. Expects sufficient _children_capacity reserved in advance.
|
|
void push_child(ptr<writer_node> x, bump_allocator& alctr);
|
|
// Returns the first character of this node's transition chain.
|
|
std::byte transition() const;
|
|
// Sets the payload.
|
|
//
|
|
// Must only be used on nodes immediately after they are created,
|
|
// i.e. before any other nodes are added.
|
|
//
|
|
// It's separate from `add_child` because the root might need a payload too,
|
|
// but it isn't created by `add_child`.
|
|
void set_payload(const trie_payload&) noexcept;
|
|
// Re-calculates _node_size and/or _branch_size of this node and its descendants
|
|
// (as needed by _has_out_of_page_children or _has_out_of_page_children) under
|
|
// the assumption that we will write out the entire subtree without any padding,
|
|
// with output initially at position start_pos. If the assumption is true,
|
|
// then the values of _node_size and _branch_size are exact after this call.
|
|
template <trie_writer_sink Output>
|
|
static sink_offset recalc_sizes(ptr<writer_node> self, const Output&, sink_pos start_pos);
|
|
// Writes out the entire subtree.
|
|
//
|
|
// When it's known in advance that the entire subtree will fit within the current page,
|
|
// guaranteed_fit can be set to true. This is the typical case.
|
|
//
|
|
// Otherwise, guaranteed_fit must be set to false. In this case it's legal for the subtree
|
|
// to not fit within current page (internal padding will be inserted accordingly so that
|
|
// each node doesn't cross node boundaries), but it will re-calculate sizes again,
|
|
// so it's more expensive.
|
|
//
|
|
// After this call, it's true that:
|
|
// get_children().empty() && _pos.valid() && !_has_out_of_page_children && !_has_out_of_page_descendants
|
|
template <trie_writer_sink Output>
|
|
static void write(ptr<writer_node> self, Output&, bool guaranteed_fit);
|
|
};
|
|
|
|
} // namespace sstables::trie
|