scylladb/sstables/index_entry.hh
Tomasz Grabiec f55bb154ec sstables: mx: index_reader: Amortize partition key storage
This change reduces the cost of partition index page construction and
LSA migration. This is achieved by several things working together:

 - index entries no longer store keys as separate small objects (managed_bytes).
   Keys are written into one fragmented managed_bytes storage, and each
   entry holds an offset into it (see the sketch after this list).

   Before, we paid 16 bytes for the managed_bytes object, plus 1 byte for the
   LSA descriptor of the external storage, plus 8 bytes for the back-reference
   in that storage: 25 bytes of overhead per key. Now we pay only 4 bytes for
   the key offset. If keys are 16 bytes, that's a reduction from 41 bytes to
   20 bytes per key.

 - index entries and the key storage are now trivially movable, so LSA
   migration can use memcpy(), which amortizes the migration cost per key.

   LSA eviction is now trivial and constant-time for the whole page,
   regardless of the number of entries. Page eviction dropped from
   14 us to 1 us.
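
   A minimal sketch of the two layouts (names are illustrative; only
   key_offset matches the actual field in the header below):

       // Before (sketch): each entry owned its key as a separate
       // LSA-managed object.
       struct entry_v1 {
           managed_bytes key;   // 16 B object, plus 1 B LSA descriptor and
                                // 8 B back-reference in the external storage
           // ... token, data file offset, etc.
       };

       // After (sketch): the page keeps one shared managed_bytes buffer
       // with all keys laid out back-to-back; an entry records only where
       // its key starts. A key ends where the next entry's key begins, or
       // at the end of the buffer for the last entry.
       struct entry_v2 {
           uint32_t key_offset; // 4 B into the page's shared key storage
           // ... token, data file offset, etc.
       };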

This improves throughput in a CPU-bound miss-heavy read workload where
the partition index doesn't fit in memory.

  scylla perf-simple-query -c1 -m200M --partitions=1000000

Before:

    15328.25 tps (150.0 allocs/op,  14.1 logallocs/op,  45.4 tasks/op,  286769 insns/op,  218134 cycles/op,        0 errors)
    15279.01 tps (149.9 allocs/op,  14.1 logallocs/op,  45.3 tasks/op,  287696 insns/op,  218637 cycles/op,        0 errors)
    15347.78 tps (149.7 allocs/op,  14.1 logallocs/op,  45.3 tasks/op,  285851 insns/op,  217795 cycles/op,        0 errors)
    15403.68 tps (149.6 allocs/op,  14.1 logallocs/op,  45.2 tasks/op,  285111 insns/op,  216984 cycles/op,        0 errors)
    15189.47 tps (150.0 allocs/op,  14.1 logallocs/op,  45.5 tasks/op,  289509 insns/op,  219602 cycles/op,        0 errors)
    15295.04 tps (149.8 allocs/op,  14.1 logallocs/op,  45.3 tasks/op,  288021 insns/op,  218545 cycles/op,        0 errors)
    15162.01 tps (149.8 allocs/op,  14.1 logallocs/op,  45.4 tasks/op,  291265 insns/op,  220451 cycles/op,        0 errors)

After:

    21620.18 tps (148.4 allocs/op,  13.4 logallocs/op,  43.7 tasks/op,  176817 insns/op,  153183 cycles/op,        0 errors)
    20644.03 tps (149.8 allocs/op,  13.5 logallocs/op,  44.3 tasks/op,  187941 insns/op,  160409 cycles/op,        0 errors)
    20588.06 tps (150.1 allocs/op,  13.5 logallocs/op,  44.5 tasks/op,  188090 insns/op,  160818 cycles/op,        0 errors)
    20789.29 tps (149.5 allocs/op,  13.5 logallocs/op,  44.2 tasks/op,  186495 insns/op,  159382 cycles/op,        0 errors)
    20977.89 tps (149.5 allocs/op,  13.4 logallocs/op,  44.2 tasks/op,  183969 insns/op,  158140 cycles/op,        0 errors)
    21125.34 tps (149.1 allocs/op,  13.4 logallocs/op,  44.1 tasks/op,  183204 insns/op,  156925 cycles/op,        0 errors)
    21244.42 tps (148.6 allocs/op,  13.4 logallocs/op,  43.8 tasks/op,  181276 insns/op,  155973 cycles/op,        0 errors)

Mostly because, with the smaller footprint, the index now fits in memory.

When it doesn't, the benefits are still visible due to the lower LSA overhead.
2026-03-18 16:25:21 +01:00


/*
 * Copyright (C) 2017-present ScyllaDB
 */

/*
 * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
 */
#pragma once

#include <functional>
#include <variant>

#include "mutation/position_in_partition.hh"
#include "utils/overloaded_functor.hh"
#include "utils/lsa/chunked_managed_vector.hh"
#include "reader_permit.hh"
#include "sstables/types.hh"
#include "sstables/shared_sstable.hh"
#include "sstables/mx/parsers.hh"
#include "utils/allocation_strategy.hh"
#include "utils/managed_ref.hh"
#include "utils/managed_bytes.hh"
#include "utils/managed_vector.hh"
#include "dht/i_partitioner.hh"

#include <seastar/core/fstream.hh>

namespace sstables {

using promoted_index_block_position_view = std::variant<composite_view, position_in_partition_view>;
using promoted_index_block_position = std::variant<composite, position_in_partition>;
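
// Return a non-owning view of the given owning position.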
inline
promoted_index_block_position_view to_view(const promoted_index_block_position& v) {
    return std::visit(overloaded_functor{
        [] (const composite& v) -> promoted_index_block_position_view {
            return composite_view(v);
        },
        [] (const position_in_partition& v) -> promoted_index_block_position_view {
            return position_in_partition_view(v);
        }
    }, v);
}

// Return the owning version of the position given a view.
inline
promoted_index_block_position materialize(const promoted_index_block_position_view& v) {
    return std::visit(overloaded_functor{
        [] (const composite_view& v) -> promoted_index_block_position {
            return composite(v);
        },
        [] (const position_in_partition_view& v) -> promoted_index_block_position {
            return position_in_partition(v);
        }
    }, v);
}
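
// Orders promoted index block bound positions (in either representation)
// against each other and against plain positions, using the schema's
// clustering order.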
class promoted_index_block_compare {
    const position_in_partition::composite_less_compare _cmp;
public:
    explicit promoted_index_block_compare(const schema& s) : _cmp{s} {}

    bool operator()(const promoted_index_block_position_view& lhs, position_in_partition_view rhs) const {
        return std::visit([this, rhs] (const auto& pos) { return _cmp(pos, rhs); }, lhs);
    }

    bool operator()(position_in_partition_view lhs, const promoted_index_block_position_view& rhs) const {
        return std::visit([this, lhs] (const auto& pos) { return _cmp(lhs, pos); }, rhs);
    }

    bool operator()(const promoted_index_block_position_view& lhs, composite_view rhs) const {
        return std::visit([this, rhs] (const auto& pos) { return _cmp(pos, rhs); }, lhs);
    }

    bool operator()(composite_view lhs, const promoted_index_block_position_view& rhs) const {
        return std::visit([this, lhs] (const auto& pos) { return _cmp(lhs, pos); }, rhs);
    }

    bool operator()(const promoted_index_block_position_view& lhs, const promoted_index_block_position_view& rhs) const {
        return std::visit([this, &lhs] (const auto& pos) { return (*this)(lhs, pos); }, rhs);
    }
};

class promoted_index_block {
    /*
     * Block bounds are read and stored differently for the ka/la and mc formats.
     * For the ka/la formats, we just read and store the whole sequence of bytes
     * representing a 'composite' key, but for 'mc' we need to parse the
     * clustering key prefix entirely, along with its bound_kind.
     * So we store them as a discriminated union, aka std::variant.
     * As those representations are used differently for comparing positions in
     * partition, we expose them through a discriminated union of views.
     */
    using bound_storage = std::variant<temporary_buffer<char>, position_in_partition>;

    // The block includes positions in the [_start, _end] range (both bounds inclusive)
    bound_storage _start;
    bound_storage _end;
    uint64_t _offset;
    uint64_t _width;
    std::optional<deletion_time> _end_open_marker;

    inline static
    promoted_index_block_position_view get_position(const schema& s, const bound_storage& storage) {
        return std::visit(overloaded_functor{
            [&s] (const temporary_buffer<char>& buf) -> promoted_index_block_position_view {
                return composite_view{to_bytes_view(buf), s.is_compound()};
            },
            [] (const position_in_partition& pos) -> promoted_index_block_position_view {
                return pos;
            }}, storage);
    }

public:
    // Constructor for ka/la format blocks
    promoted_index_block(temporary_buffer<char>&& start, temporary_buffer<char>&& end,
            uint64_t offset, uint64_t width)
        : _start(std::move(start)), _end(std::move(end))
        , _offset(offset), _width(width)
    {}

    // Constructor for mc format blocks
    promoted_index_block(position_in_partition&& start, position_in_partition&& end,
            uint64_t offset, uint64_t width, std::optional<deletion_time>&& end_open_marker)
        : _start{std::move(start)}, _end{std::move(end)}
        , _offset{offset}, _width{width}, _end_open_marker{end_open_marker}
    {}

    promoted_index_block(const promoted_index_block&) = delete;
    promoted_index_block(promoted_index_block&&) noexcept = default;
    promoted_index_block& operator=(const promoted_index_block&) = delete;
    promoted_index_block& operator=(promoted_index_block&&) noexcept = default;

    promoted_index_block_position_view start(const schema& s) const { return get_position(s, _start); }
    promoted_index_block_position_view end(const schema& s) const { return get_position(s, _end); }
    uint64_t offset() const { return _offset; }
    uint64_t width() const { return _width; }
    std::optional<deletion_time> end_open_marker() const { return _end_open_marker; }
};

// Cursor over the index for clustered elements of a single partition.
//
// The user is expected to call advance_to() for monotonically increasing positions
// in order to check if the index has information about a more precise location
// of the fragments relevant for the range starting at the given position.
//
// The user must serialize all async methods. The next call may start only when the future
// returned by the previous one has resolved.
//
// The user must call close() and wait for it to resolve before destroying the cursor.
//
class clustered_index_cursor {
public:
    // Position of indexed elements in the data file relative to the start of the partition.
    using offset_in_partition = uint64_t;
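
    // Describes a skip target within the partition: the data file offset to
    // jump to, and the range tombstone (if any) still open at that point,
    // together with the position at which it took effect.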
    struct skip_info {
        offset_in_partition offset;
        tombstone active_tombstone;
        position_in_partition active_tombstone_pos;
    };

    struct entry_info {
        promoted_index_block_position_view start;
        promoted_index_block_position_view end;
        offset_in_partition offset;
    };

    virtual ~clustered_index_cursor() {}

    // Note: close() must not fail.
    virtual future<> close() noexcept = 0;

    // Advances the cursor to the given position. When the cursor has more accurate information about
    // the location of the fragments from the range [pos, +inf) in the data file (since it was last advanced),
    // it resolves with an engaged optional containing skip_info.
    //
    // The index may not be precise, so fragments from the range [pos, +inf) may be located after the
    // position indicated by skip_info. It is guaranteed that no such fragments are located before the returned position.
    //
    // Offsets returned in skip_info are monotonically increasing.
    //
    // Must be called for non-decreasing positions.
    // The caller must ensure that pos remains valid until the future resolves.
    virtual future<std::optional<skip_info>> advance_to(position_in_partition_view pos) = 0;

    virtual skip_info current_block() = 0;

    // Determines the data file offset, relative to the start of the partition, such that fragments
    // from the range (-inf, pos] are located before that offset.
    //
    // If such an offset cannot be determined cheaply, returns a disengaged optional.
    //
    // Does not advance the cursor.
    //
    // The caller must ensure that pos remains valid until the future resolves.
    virtual future<std::optional<offset_in_partition>> probe_upper_bound(position_in_partition_view pos) = 0;

    // Returns skip information about the next position after the cursor,
    // or nullopt if there is no information about further positions.
    //
    // When entry_info is returned, the cursor was advanced to entry_info::start.
    //
    // The returned entry_info is only valid until the next invocation of any method on this instance.
    virtual future<std::optional<entry_info>> next_entry() = 0;
};
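
// A minimal usage sketch of the protocol above (hypothetical caller code,
// not part of this header; assumes `cursor` outlives the returned future):
//
//   future<> example(clustered_index_cursor& cursor, position_in_partition_view pos) {
//       return cursor.advance_to(pos).then([] (std::optional<clustered_index_cursor::skip_info> si) {
//           // An engaged si means fragments for [pos, +inf) start no earlier
//           // than si->offset (relative to the start of the partition).
//       }).finally([&cursor] {
//           return cursor.close(); // must resolve before the cursor is destroyed
//       });
//   }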

// Promoted index information produced by the parser.
struct parsed_promoted_index_entry {
    deletion_time del_time;
    uint64_t promoted_index_start;
    uint32_t promoted_index_size;
    uint32_t num_blocks;
};

using promoted_index = parsed_promoted_index_entry;

// A partition index element.
// Allocated inside LSA.
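// raw_token caches the partition's token; when unset, it is computed and
// filled in lazily by partition_index_page::get_decorated_key() through a
// const reference, hence the `mutable`.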
struct [[gnu::packed]] index_entry {
    mutable int64_t raw_token;
    uint64_t data_file_offset;
    uint32_t key_offset;

    uint64_t position() const { return data_file_offset; }
    dht::raw_token token() const { return dht::raw_token(raw_token); }
};

// Required for optimized LSA migration of managed_vector storage.
static_assert(std::is_trivially_move_assignable_v<index_entry>);
static_assert(std::is_trivially_move_assignable_v<parsed_promoted_index_entry>);

// A partition index page.
//
// Allocated in the standard allocator space, but with an LSA allocator as the current allocator.
// So the shallow part is in the standard allocator, but all indirect objects are inside LSA.
class partition_index_page {
public:
    lsa::chunked_managed_vector<index_entry> _entries;
    managed_bytes _key_storage;

    // Stores promoted index information of index entries.
    // The i-th element corresponds to the i-th entry in _entries.
    // Can be smaller than _entries. If _entries[i] doesn't have a matching element
    // in _promoted_indexes, then that entry doesn't have a promoted index.
    // It's not chunked, because a promoted index is present only when there are
    // large partitions in the page, which also means the page will typically have
    // only one entry, due to the summary:data-file size ratio.
    // Kept separately to avoid paying the storage cost in pages where no entry has
    // a promoted index, which is typical in workloads with small partitions.
    managed_vector<promoted_index> _promoted_indexes;
public:
    partition_index_page() = default;
    partition_index_page(partition_index_page&&) noexcept = default;
    partition_index_page& operator=(partition_index_page&&) noexcept = default;

    bool empty() const { return _entries.empty(); }
    size_t size() const { return _entries.size(); }

    stop_iteration clear_gently() {
        // Vectors have trivial storage, so they are fast to destroy.
        return stop_iteration::yes;
    }

    void clear_one_entry() {
        _entries.pop_back();
    }

    bool has_promoted_index(size_t i) const {
        return i < _promoted_indexes.size() && _promoted_indexes[i].promoted_index_size > 0;
    }

    /// Get the promoted index for the i-th entry.
    /// Call only when has_promoted_index(i) is true.
    const promoted_index& get_promoted_index(size_t i) const {
        return _promoted_indexes[i];
    }

    /// Get the promoted index for the i-th entry.
    /// Call only when has_promoted_index(i) is true.
    promoted_index& get_promoted_index(size_t i) {
        return _promoted_indexes[i];
    }

    /// Get the promoted index size for the i-th entry.
    uint32_t get_promoted_index_size(size_t i) const {
        return has_promoted_index(i) ? get_promoted_index(i).promoted_index_size : 0;
    }

    /// Get the deletion_time for the partition represented by the i-th entry.
    /// Returns a disengaged optional if the entry doesn't have a promoted index,
    /// in which case the deletion_time is unknown and has to be read from the data file.
    std::optional<deletion_time> get_deletion_time(size_t i) const {
        if (has_promoted_index(i)) {
            return get_promoted_index(i).del_time;
        }
        return {};
    }
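
    /// Get the key of the i-th entry.
    /// Keys are laid out back-to-back in _key_storage: entry i's key occupies
    /// the byte range [_entries[i].key_offset, _entries[i+1].key_offset), or
    /// extends to the end of the storage for the last entry.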
    key_view get_key(size_t i) const {
        auto start = _entries[i].key_offset;
        auto end = i + 1 < _entries.size() ? _entries[i + 1].key_offset : _key_storage.size();
        auto v = managed_bytes_view(_key_storage).prefix(end);
        v.remove_prefix(start);
        return key_view(v);
    }

    decorated_key_view get_decorated_key(const schema& s, size_t i) const {
        auto key = get_key(i);
        auto t = _entries[i].token();
        if (!t) {
            t = dht::raw_token(s.get_partitioner().get_token(key));
            _entries[i].raw_token = t.value;
        }
        return decorated_key_view(dht::token(t), key);
    }

    size_t external_memory_usage() const {
        size_t size = _entries.external_memory_usage();
        size += _promoted_indexes.external_memory_usage();
        size += _key_storage.external_memory_usage();
        return size;
    }
};

using index_list = partition_index_page;

} // namespace sstables

inline std::ostream& operator<<(std::ostream& out, const sstables::promoted_index_block_position_view& pos) {
    std::visit([&out] (const auto& pos) mutable { fmt::print(out, "{}", pos); }, pos);
    return out;
}

inline std::ostream& operator<<(std::ostream& out, const sstables::promoted_index_block_position& pos) {
    std::visit([&out] (const auto& pos) mutable { fmt::print(out, "{}", pos); }, pos);
    return out;
}

template <>
struct fmt::formatter<sstables::clustered_index_cursor::skip_info> : fmt::formatter<std::string_view> {
    auto format(const sstables::clustered_index_cursor::skip_info& info, fmt::format_context& ctx) const -> decltype(ctx.out()) {
        return fmt::format_to(ctx.out(), "skip_info{{offset: {}, tombstone: ({}, {})}}",
                info.offset, info.active_tombstone_pos, info.active_tombstone);
    }
};