scylladb/test/boost/bti_index_test.cc

/*
 * Copyright (C) 2025-present ScyllaDB
 */

/*
 * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
 */

// This file contains a test of BTI index writers and readers.
//
// It generates a random dataset (of sstable index entries),
// writes it to a BTI index file, and then runs a sequence of
// BTI index reader operations on it, checking that the results
// are consistent with a "reference" index on the same dataset.

#include <generator>
#include <seastar/testing/thread_test_case.hh>
#include <seastar/testing/test_case.hh>
#include <seastar/core/fstream.hh>
#include <seastar/core/seastar.hh>
#include <seastar/util/closeable.hh>
#include <seastar/util/defer.hh>
#include "sstables/mx/types.hh"
#include "sstables/trie/bti_index.hh"
#include "sstables/trie/bti_index_internal.hh"
#include "schema/schema_builder.hh"
#include "test/lib/log.hh"
#include "test/lib/nondeterministic_choice_stack.hh"
#include "test/lib/random_utils.hh"
#include "test/lib/tmpdir.hh"
#include "test/lib/reader_concurrency_semaphore.hh"
#include "utils/cached_file.hh"
#include "utils/i_filter.hh"
#include "utils/memory_data_sink.hh"
#include <fmt/std.h>

// Test-side description of one clustering block entry of a partition,
// as fed to the index writer and mirrored by the reference index.
struct clustering_index_entry {
    // First clustering position of the block.
    sstables::clustering_info first_ck;
    // Last clustering position of the block.
    sstables::clustering_info last_ck;
    // Data file offset associated with this entry.
    uint64_t data_file_offset;
    // Deletion time which the reference index converts to a tombstone
    // and reports through its open-marker getters when a bound points at this entry.
    sstables::deletion_time range_tombstone_before_first_ck;
};

// Test-side description of the entry which opens a partition in the entry sequence.
struct partition_index_entry {
    // Decorated key of the partition.
    dht::decorated_key dk;
    // Data file offset associated with this entry.
    uint64_t data_file_offset;
    // Deletion time recorded as the tombstone of the partition.
    sstables::deletion_time partition_tombstone;
};

// Terminal marker: the single, last element of the entry sequence.
struct eof_index_entry {
    // Data file offset associated with this entry.
    uint64_t data_file_offset;
};

// Marker which closes the run of entries belonging to one partition.
struct partition_end_entry {
    // Data file offset associated with this entry.
    uint64_t data_file_offset;
};

// Any one element of the entry sequence. Every alternative has a `data_file_offset` member.
using index_entry = std::variant<partition_index_entry, clustering_index_entry, partition_end_entry, eof_index_entry>;

// Represents a sequence of entries written to the sstable index file,
// and sets of possible arguments for index reader operations on this index.
struct index_entry_dataset {
    // Schema of the table the keys and positions below belong to.
    const schema& s;
    // The set of ring positions participating in the test.
    // Will be used as the set of potential arguments for index reader operations
    // that take a ring position, and to construct partition ranges for `advance_to`.
    //
    // Assumed to contain all decorated keys used in `entries`.
    std::vector<dht::ring_position> ring_position_universe;
    // The set of clustering positions participating in the test.
    // Will be used as the set of potential arguments for index reader operations
    // that take a position in partition.
    std::vector<position_in_partition> position_in_partition_universe;
    // The sequence of entries written to the sstable index file, in order.
    // It's assumed that the sequence contains at least one partition index entry,
    // and that it's of the form:
    // (partition_index_entry, clustering_index_entry*, partition_end_entry)+, eof_index_entry
    std::vector<index_entry> entries;
};

// Knobs controlling the size and shape of the dataset produced by generate_random_dataset().
struct random_dataset_config {
    // Number of possible distinct values for each "component" of the decorated partition key.
    // (In a loose meaning -- in particular, we consider some individual bytes of the token
    // "components").
    // Needs to be big enough to have enough "power" to create enough trie branches to make for
    // interesting trie shapes, but small enough to keep the values similar enough to
    // create interesting relationships between them.
    // The number of generated partition keys grows polynomially with this value,
    // but only a certain number of them will be picked to participate in the test.
    int partition_key_component_values = 3;
    // The number of ring positions participating in the test.
    // The number of partition keys inserted into the index
    // will be some subset of that.
    // The complexity of the test grows exponentially with this value.
    int partition_universe_size = 3;
    // Like partition_key_component_values, but for clustering positions.
    int clustering_key_component_values = 3;
    // Like partition_universe_size, but for clustering positions.
    int clustering_universe_size = 6;
    // The max number of clustering blocks inserted into the index
    // for each partition.
    // Should be at most as big as clustering_universe_size / 2,
    // otherwise there aren't enough clustering positions to fill the blocks.
    int max_clustering_blocks = clustering_universe_size / 2;
};

// Recursive worker behind generate_pips().
//
// Appends to `result` the positions derived from the clustering prefix currently
// held in `prefix`: first the position with weight -1, then either the
// weight-0 position (when `prefix` already has as many components as the schema's
// clustering key) or the positions of every one-component extension of `prefix`
// (built from `possible_values`, in the given order), and finally the position with weight 1.
//
// `prefix` is used as scratch space and is left as it was found on return.
void generate_pips_impl(
    const schema& s,
    std::vector<data_value>& prefix,
    std::vector<position_in_partition>& result,
    std::span<data_value> possible_values
) {
    const auto current_prefix = clustering_key_prefix::from_deeply_exploded(s, prefix);
    const auto emit = [&] (int weight) {
        result.push_back(position_in_partition(partition_region::clustered, bound_weight(weight), current_prefix));
    };

    emit(-1);
    if (prefix.size() != s.clustering_key_size()) {
        // Not a full key yet: descend into each possible next component.
        for (const auto& component : possible_values) {
            prefix.push_back(component);
            generate_pips_impl(s, prefix, result, possible_values);
            prefix.pop_back();
        }
    } else {
        emit(0);
    }
    emit(1);
}

// Generate all position_in_partition values
// which use the given possible_values as values for clustering key columns.
// The positions are produced by a depth-first walk over the clustering prefixes,
// starting from the empty prefix.
// Assumes a clustering key of type `(short, short)`.
std::vector<position_in_partition> generate_pips(const schema& the_schema, std::span<data_value> possible_values) {
    std::vector<position_in_partition> positions;
    std::vector<data_value> scratch_prefix;
    generate_pips_impl(the_schema, scratch_prefix, positions, possible_values);
    return positions;
}

// Produces a random deletion time.
// A random boolean decides between the value returned by
// `sstables::deletion_time::make_live()` and a deletion time whose two fields
// are drawn from the full ranges of int32_t and int64_t respectively.
static sstables::deletion_time make_random_tombstone() {
    const bool use_live = tests::random::get_bool();
    if (!use_live) {
        using limits32 = std::numeric_limits<int32_t>;
        using limits64 = std::numeric_limits<int64_t>;
        // The draws happen in field order, like in a braced initializer list.
        const auto first_field = tests::random::get_int<int32_t>(limits32::min(), limits32::max());
        const auto second_field = tests::random::get_int<int64_t>(limits64::min(), limits64::max());
        return sstables::deletion_time{first_field, second_field};
    }
    return sstables::deletion_time::make_live();
}

// Translates the bound weight of a position_in_partition to a bound kind.
//
// `equal` always maps to `clustering`.
// Each of the two other weights has three candidate bound kinds,
// and one of them is picked at random on every call.
// Aborts if given a weight which isn't one of the three named enumerators.
static sstables::bound_kind_m pip_bound_weight_to_m_bound_weight(const bound_weight bw) {
    using sstables::bound_kind_m;

    if (bw == bound_weight::equal) {
        return bound_kind_m::clustering;
    }
    if (bw == bound_weight::before_all_prefixed) {
        static constexpr bound_kind_m candidates[] = {
            bound_kind_m::incl_start,
            bound_kind_m::excl_end,
            bound_kind_m::excl_end_incl_start,
        };
        return candidates[tests::random::get_int<int>(0, 2)];
    }
    if (bw == bound_weight::after_all_prefixed) {
        static constexpr bound_kind_m candidates[] = {
            bound_kind_m::incl_end,
            bound_kind_m::excl_start,
            bound_kind_m::incl_end_excl_start,
        };
        return candidates[tests::random::get_int<int>(0, 2)];
    }
    abort();
}

// Generate some interesting dataset of sstable index entries and related key positions.
// Assumes a primary key of type `(short, (short, short))`.
//
// The result contains:
// - a "universe" of at most `cfg.partition_universe_size` decorated keys (as ring positions),
// - a "universe" of at most `cfg.clustering_universe_size` positions in partition,
// - a sequence of index entries of the form
//   (partition_index_entry, clustering_index_entry*, partition_end_entry)+, eof_index_entry
//   built from random subsets of the two universes, with data file offsets
//   which strictly grow from one entry to the next.
//
// Both universes, and the subsets inserted into the index, keep the relative order
// in which the candidates were generated. This relies on std::sample being stable
// when the population is given by forward iterators.
index_entry_dataset generate_random_dataset(const schema& the_schema, const random_dataset_config& cfg) {
    const auto& s = the_schema;
    std::vector<index_entry> result;

    // Generate a few partition keys.
    std::vector<partition_key> pks;
    {
        // The set both deduplicates the random values and iterates over them in ascending order.
        std::set<int16_t> pks_set;
        while (pks_set.size() < static_cast<size_t>(cfg.partition_key_component_values)) {
            pks_set.insert(tests::random::get_int<int16_t>());
        }
        for (auto& x : pks_set) {
            pks.push_back(partition_key::from_deeply_exploded(s, std::vector<data_value>{data_value(x)}));
        }
    }

    // Generate the set of decorated keys participating in the test.
    // Some subset of them will be inserted into the index,
    // and ring positions related to them will be used as arguments for index reader operations.
    std::vector<dht::decorated_key> dk_universe;
    {
        // Generate a big set of possible decorated keys,
        // by taking a cartesian product of several values for each "component".
        // (We do that instead of generating completely random keys
        // because having some common prefixes between keys should explore
        // more logic, because the length of common prefixes is important for tries).
        std::vector<dht::decorated_key> dk_universe_candidates;

        for (int i = 0; i < cfg.partition_key_component_values; ++i) {
            for (int j = 0; j < cfg.partition_key_component_values; ++j) {
                for (int k = 0; k < cfg.partition_key_component_values; ++k) {
                    auto token = dht::token::from_int64((i << 8) | j);
                    auto dk = dht::decorated_key(token, pks[k]);
                    dk_universe_candidates.push_back(dk);
                }
            }
        }
        // Select the configured number of decorated keys, at random, from the big set.
        // Sampling through a back_inserter means that, if there are fewer candidates
        // than requested, the universe is simply smaller (no filler elements are added).
        std::sample(dk_universe_candidates.begin(), dk_universe_candidates.end(),
                std::back_inserter(dk_universe), cfg.partition_universe_size, tests::random::gen());
    }

    // Generate some interesting position_in_partition values.
    // A subset of them will be inserted into the index,
    // and they will be used as arguments for index reader operations.
    std::vector<position_in_partition> pip_universe;
    {
        std::vector<data_value> clustering_key_value_set;
        for (int i = 0; i < cfg.clustering_key_component_values - 1; ++i) {
            clustering_key_value_set.push_back(data_value(int16_t(i)));
        }
        // Put some `0xff` bytes in the keys to stress the `nudge()` logic
        // for the last clustering key in the partition.
        clustering_key_value_set.push_back(data_value(int16_t(0xff)));
        auto pip_universe_candidates = generate_pips(s, clustering_key_value_set);
        // As above: never more elements than there are candidates.
        std::sample(pip_universe_candidates.begin(), pip_universe_candidates.end(),
                std::back_inserter(pip_universe), cfg.clustering_universe_size, tests::random::gen());
    }

    // The exact value of this is unimportant,
    // but it must strictly grow with each entry.
    // (Unsigned, to match the type of the `data_file_offset` fields it initializes.)
    uint64_t data_file_offset = 0;

    const int inserted_partitions = tests::random::get_int(1, cfg.partition_universe_size);
    std::vector<dht::decorated_key> inserted_dks;
    std::sample(dk_universe.begin(), dk_universe.end(), std::back_inserter(inserted_dks), inserted_partitions, tests::random::gen());
    // Iterate over what was actually sampled, which can be fewer than `inserted_partitions`
    // keys if the universe is smaller than configured.
    for (const auto& dk : inserted_dks) {
        const auto partition_tombstone = make_random_tombstone();
        result.push_back(partition_index_entry{
            .dk = dk,
            .data_file_offset = data_file_offset++,
            .partition_tombstone = partition_tombstone,
        });

        const int inserted_clustering_blocks = tests::random::get_int(0, cfg.max_clustering_blocks);
        std::vector<position_in_partition> inserted_pips;
        std::sample(pip_universe.begin(), pip_universe.end(), std::back_inserter(inserted_pips), inserted_clustering_blocks * 2, tests::random::gen());
        // Each block consumes a (first, last) pair of consecutive sampled positions.
        // The bound is expressed in terms of the sampled positions, so that a config
        // asking for more blocks than the universe can fill doesn't read out of bounds.
        for (size_t c = 0; 2 * c + 1 < inserted_pips.size(); ++c) {
            auto range_tombstone = make_random_tombstone();
            auto first_pip = inserted_pips[c * 2];
            auto first_pip_weight = pip_bound_weight_to_m_bound_weight(first_pip.get_bound_weight());
            auto last_pip = inserted_pips[c * 2 + 1];
            auto last_pip_weight = pip_bound_weight_to_m_bound_weight(last_pip.get_bound_weight());
            result.push_back(clustering_index_entry{
                .first_ck = sstables::clustering_info(first_pip.key(), first_pip_weight),
                .last_ck = sstables::clustering_info(last_pip.key(), last_pip_weight),
                .data_file_offset = data_file_offset++,
                .range_tombstone_before_first_ck = range_tombstone,
            });
        }
        result.push_back(partition_end_entry{
            .data_file_offset = data_file_offset++,
        });
    }
    result.push_back(eof_index_entry{
        .data_file_offset = data_file_offset,
    });
    std::vector<dht::ring_position> rp_universe;
    rp_universe.reserve(dk_universe.size());
    // We only use the decorated keys as ring positions,
    // we don't bother with token positions or min/max positions.
    // They shouldn't be distinguishable from a non-inserted key
    // at the same position relative to inserted keys.
    for (const auto& dk : dk_universe) {
        rp_universe.push_back(dk);
    }
    return index_entry_dataset {
        .s = s,
        .ring_position_universe = std::move(rp_universe),
        .position_in_partition_universe = std::move(pip_universe),
        .entries = std::move(result),
    };
}

// A reference index that can be used to check the results of actual index reader operations.
// Tries to implement the contract of abstract index_reader in a relatively simple way.
//
// The reader state is modelled as a pair of indices (`_lower`, optional `_upper`)
// into the flat `_entries` sequence. All operations are simple scans or binary
// searches over the vectors copied from the dataset.
struct reference_index {
    // Index into `_entries` (and `_data_file_offsets`).
    enum entry_idx : uint64_t {};
    // Index into `_ring_position_universe`.
    enum rp_idx : uint64_t {};

    // These are directly from the generated index_entry_dataset.
    const schema& _s;
    std::vector<dht::ring_position> _ring_position_universe;
    std::vector<position_in_partition> _position_in_partition_universe;
    std::vector<index_entry> _entries;

    // These are some helpers for navigating the dataset above.

    // The set of decorated keys present in _entries.
    std::vector<dht::decorated_key> _present_dks;
    // `_present_dk_indices_in_rp_universe[i]`
    // is the index of `_present_dks[i]` within `_ring_position_universe`.
    std::vector<rp_idx> _present_dk_indices_in_rp_universe;
    // `_present_dk_indices[i]`
    // is the index of `_present_dks[i]` within `_entries`.
    // `_present_dk_indices[_present_dks.size()]` equals `_entries.size() - 1`.
    std::vector<entry_idx> _present_dk_indices;
    // `_data_file_offsets[i]` is the `_entries[i].data_file_offset`.
    // (Extracted for convenience, because `_entries[i]` is a variant).
    std::vector<uint64_t> _data_file_offsets;

    // The mutable state of this index reader.
    // An index reader is expected to behave like a (lower bound, upper bound) pair,
    // this is a "model" of that.
    entry_idx _lower = entry_idx(0);
    std::optional<entry_idx> _upper;
    bool _initialized = false;

    // Copies the dataset and precomputes the navigation helpers.
    // Asserts that every decorated key present in the entries is found in the
    // ring position universe, in the same relative order.
    reference_index(const index_entry_dataset& raw)
        : _s(raw.s)
        , _ring_position_universe(raw.ring_position_universe)
        , _position_in_partition_universe(raw.position_in_partition_universe)
        , _entries(raw.entries)
    {
        for (const auto& e : _entries) {
            std::visit([this](const auto& entry) {
                _data_file_offsets.push_back(entry.data_file_offset);
            }, e);
        }
        {
            uint64_t idx = 0;
            for (const auto& e : _entries) {
                if (auto p = std::get_if<partition_index_entry>(&e)) {
                    _present_dks.push_back(p->dk);
                    _present_dk_indices.push_back(entry_idx(idx));
                } else if (std::get_if<eof_index_entry>(&e)) {
                    // The EOF entry gets a slot too, so that "one past the last key" maps to EOF.
                    _present_dk_indices.push_back(entry_idx(idx));
                }
                ++idx;
            }
        }
        {
            // A single merge-like pass over the universe and the present keys.
            auto cmp = dht::ring_position_comparator(_s);
            auto it = _present_dks.begin();
            uint64_t idx = 0;
            while (it != _present_dks.end() && idx < _ring_position_universe.size()) {
                if (cmp(_ring_position_universe[idx], *it) == std::strong_ordering::equal) {
                    _present_dk_indices_in_rp_universe.push_back(rp_idx(idx));
                    ++it;
                }
                ++idx;
            }
            SCYLLA_ASSERT(_present_dks.size() == _present_dk_indices_in_rp_universe.size());
        }
    }

    // Returns the index of the first entry whose data file offset equals `data_position`.
    // Aborts if there is no such entry.
    entry_idx entry_idx_from_data_position(uint64_t data_position) const {
        auto it = std::find(_data_file_offsets.begin(), _data_file_offsets.end(), data_position);
        if (it != _data_file_offsets.end()) {
            return static_cast<entry_idx>(std::distance(_data_file_offsets.begin(), it));
        }
        abort();
    }

    // Walks backwards from `idx` (inclusive) to the closest partition_index_entry
    // or eof_index_entry, and returns its index. Aborts if there is none.
    entry_idx get_partition_of_entry(entry_idx idx) const {
        for (int64_t i = std::to_underlying(idx); i >= 0; --i) {
            if (std::holds_alternative<partition_index_entry>(_entries[i])
                || std::holds_alternative<eof_index_entry>(_entries[i])
            ) {
                return entry_idx(i);
            }
        }
        abort();
    }

    // The partition (or EOF) entry which the lower bound belongs to.
    entry_idx get_current_partition() const {
        return get_partition_of_entry(_lower);
    }

    // Forces the bounds to the entries with the given data file offsets.
    // An absent `r.end` clears the upper bound.
    // Aborts if an offset doesn't match any entry.
    void recalibrate(sstables::data_file_positions_range r) {
        _lower = entry_idx_from_data_position(r.start);
        if (r.end) {
            _upper = entry_idx_from_data_position(*r.end);
        } else {
            _upper.reset();
        }
    }

    std::span<const dht::ring_position> valid_targets_for_advance_lower_and_check_if_present() const {
        return _ring_position_universe;
    }
    // Moves the lower bound to the first present partition which isn't before `key`
    // (or to the EOF entry if there is none).
    // Returns true iff that partition's key is equal to `key`.
    bool advance_lower_and_check_if_present(dht::ring_position_view key) {
        _initialized = true;
        auto cmp = dht::ring_position_less_comparator(_s);
        auto tricmp = dht::ring_position_comparator(_s);
        auto it = std::ranges::lower_bound(_present_dks, key, cmp);
        _lower = entry_idx(_present_dk_indices[it - _present_dks.begin()]);
        if (it == _present_dks.end() || tricmp(*it, key) != std::strong_ordering::equal) {
            return false;
        }
        return true;
    }

    std::span<const dht::decorated_key> valid_targets_for_advance_past_definitely_present_partition() const {
        return _present_dks;
    }
    // Moves the lower bound to the partition (or EOF) entry following the partition of `dk`.
    // `dk` is expected to be one of `_present_dks`.
    void advance_past_definitely_present_partition(const dht::decorated_key& dk) {
        _initialized = true;
        auto cmp = dht::ring_position_less_comparator(_s);
        auto it = std::ranges::lower_bound(_present_dks, dk, cmp);
        _lower = entry_idx(_present_dk_indices[it - _present_dks.begin() + 1]);
    }
    // Moves the lower bound to the partition entry of `dk`.
    // `dk` is expected to be one of `_present_dks`.
    void advance_to_definitely_present_partition(const dht::decorated_key& dk) {
        _initialized = true;
        auto cmp = dht::ring_position_less_comparator(_s);
        auto it = std::ranges::lower_bound(_present_dks, dk, cmp);
        _lower = entry_idx(_present_dk_indices[it - _present_dks.begin()]);
    }
    // The ring positions of the universe which aren't before the ring position of
    // the upper bound (if set) or the lower bound (otherwise).
    std::span<const dht::ring_position> valid_lb_targets_for_advance_to() const {
        auto cmp = dht::ring_position_less_comparator(_s);
        auto boundary_idx = _upper ? *_upper : _lower;
        auto it = std::ranges::lower_bound(_ring_position_universe, ring_position_of_entry(boundary_idx), cmp);
        return std::span<const dht::ring_position>(it, _ring_position_universe.end());
    }
    // The ring positions of the universe which aren't before `lb`.
    std::span<const dht::ring_position> valid_ub_targets_for_advance_to(const dht::ring_position& lb) const {
        auto cmp = dht::ring_position_less_comparator(_s);
        auto it = std::ranges::lower_bound(_ring_position_universe, lb, cmp);
        return std::span<const dht::ring_position>(it, _ring_position_universe.end());
    }
    // Sets the lower and upper bound to the first present partition (or EOF entry)
    // which isn't before the start and the end of `range`, respectively.
    void advance_to(const dht::partition_range& range) {
        _initialized = true;
        auto cmp = dht::ring_position_less_comparator(_s);
        {
            auto rpv = dht::ring_position_view::for_range_start(range);
            auto it = std::ranges::lower_bound(_present_dks, rpv, cmp);
            _lower = entry_idx(_present_dk_indices[it - _present_dks.begin()]);
        }
        {
            auto rpv = dht::ring_position_view::for_range_end(range);
            auto it = std::ranges::lower_bound(_present_dks, rpv, cmp);
            _upper = entry_idx(_present_dk_indices[it - _present_dks.begin()]);
        }
    }
    // Moves the lower bound to the first partition (or EOF) entry after its current position.
    // If the lower bound is already at the EOF entry, it stays there.
    void advance_to_next_partition() {
        _initialized = true;
        auto idx = std::to_underlying(_lower);
        if (std::holds_alternative<partition_index_entry>(_entries[idx])) {
            ++idx;
        }
        while (std::holds_alternative<clustering_index_entry>(_entries[idx])
            || std::holds_alternative<partition_end_entry>(_entries[idx])
        ) {
            ++idx;
        }
        _lower = entry_idx(idx);
    }
    // Like advance_to_next_partition(), but the result of the scan
    // (which still starts at the lower bound) is stored in the upper bound.
    void advance_reverse_to_next_partition() {
        _initialized = true;
        auto idx = std::to_underlying(_lower);
        if (std::holds_alternative<partition_index_entry>(_entries[idx])) {
            ++idx;
        }
        while (std::holds_alternative<clustering_index_entry>(_entries[idx])
            || std::holds_alternative<partition_end_entry>(_entries[idx])
        ) {
            ++idx;
        }
        _upper = entry_idx(idx);
    }
    // Computes a bound for `pos` within the partition which `eidx` belongs to.
    // Returns:
    // - the EOF entry, if `eidx` belongs to it,
    // - the partition entry itself, if the partition has no clustering blocks,
    // - otherwise the first clustering block whose `last_ck` isn't before `pos`,
    //   or the entry following the last block if every block ends before `pos`.
    entry_idx advance_bound_before(entry_idx eidx, position_in_partition_view pos) {
        auto less = position_in_partition::less_compare(_s);
        auto idx = std::to_underlying(get_partition_of_entry(eidx));
        if (std::holds_alternative<eof_index_entry>(_entries[idx])) {
            return entry_idx(idx);
        }
        if (std::holds_alternative<clustering_index_entry>(_entries[idx + 1])) {
            ++idx;
            while (auto e = std::get_if<clustering_index_entry>(&_entries[idx])) {
                if (less(pip_of_clustering_info(e->last_ck), pos)) {
                    ++idx;
                } else {
                    break;
                }
            }
        }
        return entry_idx(idx);
    }
    // Maps a bound kind to the numeric bound weight (-1, 0 or 1) used for positions in partition.
    static int weight_of_bound_kind_m(sstables::bound_kind_m b){
        using sstables::bound_kind_m;
        switch (b) {
            case bound_kind_m::incl_start:
            case bound_kind_m::excl_end_incl_start:
            case bound_kind_m::excl_end:
                return -1;
            case bound_kind_m::static_clustering:
            case bound_kind_m::clustering:
                return 0;
            case bound_kind_m::incl_end_excl_start:
            case bound_kind_m::incl_end:
            case bound_kind_m::excl_start:
                return 1;
        }
        abort();
    }
    // Converts a clustering_info (key + bound kind) to a clustered position in partition.
    static position_in_partition pip_of_clustering_info(const sstables::clustering_info& e) {
        int weight = weight_of_bound_kind_m(e.kind);
        return position_in_partition(partition_region::clustered, bound_weight(weight), e.clustering);
    }
    // The position which targets of advance_to(position_in_partition_view) have to be greater than.
    // Note that the order of the checks below matters: the check of the previous
    // entry deliberately takes precedence over the check for a partition end.
    position_in_partition past_previous_pip() const {
        if (std::holds_alternative<eof_index_entry>(_entries[_lower])) {
            return position_in_partition::for_partition_end();
        }
        if (std::holds_alternative<partition_index_entry>(_entries[_lower])) {
            return position_in_partition::after_static_row_tag_t();
        }
        // _lower can't be 0 here, because the first entry is a partition entry.
        if (auto e = std::get_if<clustering_index_entry>(&_entries[_lower - 1])) {
            return pip_of_clustering_info(e->last_ck);
        }
        if (std::holds_alternative<partition_end_entry>(_entries[_lower])) {
            return position_in_partition::for_partition_end();
        }
        return position_in_partition::after_static_row_tag_t();
    }
    // The positions of the universe which are greater than past_previous_pip().
    std::span<const position_in_partition> valid_targets_for_advance_to_pip() const {
        auto cmp = position_in_partition::less_compare(_s);
        auto cp = past_previous_pip();
        testlog.debug("valid_targets_for_advance_to_pip: current_pip={}", cp);
        auto it = std::ranges::upper_bound(_position_in_partition_universe, cp, cmp);
        return std::span<const position_in_partition>(it, _position_in_partition_universe.end());
    }
    std::span<const position_in_partition> valid_targets_for_advance_reverse() const {
        return _position_in_partition_universe;
    }
    // Sets the lower bound to advance_bound_before() of `pos` in the current partition.
    void advance_to(position_in_partition_view pos) {
        _initialized = true;
        _lower = advance_bound_before(_lower, pos);
    }
    // Sets the upper bound to advance_bound_before() of `pos` in the current partition.
    void advance_upper_past(position_in_partition_view pos) {
        _initialized = true;
        _upper = advance_bound_before(_lower, pos);
    }
    // Sets the upper bound to the first clustering block of the current partition
    // whose `first_ck` is after `pos`, or to the entry following the last block
    // if there is no such block. At EOF, the upper bound is set to the EOF entry.
    void advance_reverse(position_in_partition_view pos) {
        _initialized = true;
        auto less = position_in_partition::less_compare(_s);
        auto idx = std::to_underlying(get_partition_of_entry(_lower));
        if (std::holds_alternative<partition_index_entry>(_entries[idx])) {
            ++idx;
        }
        while (auto e = std::get_if<clustering_index_entry>(&_entries[idx])) {
            if (less(pos, pip_of_clustering_info(e->first_ck))) {
                break;
            } else {
                ++idx;
            }
        }
        _upper = entry_idx(idx);
    }

    // True iff the current partition has at least one clustering block.
    bool has_row_index() const {
        auto curpar = get_current_partition();
        if (std::holds_alternative<eof_index_entry>(_entries[curpar])) {
            return false;
        }
        if (std::holds_alternative<clustering_index_entry>(_entries[curpar + 1])) {
            return true;
        }
        return false;
    }
    // The tombstone of the current partition, or nullopt at EOF.
    std::optional<sstables::deletion_time> partition_tombstone() const {
        if (const auto& hdr = std::get_if<partition_index_entry>(&_entries[get_current_partition()])) {
            return hdr->partition_tombstone;
        }
        return std::nullopt;
    }
    // The key of the current partition, or nullopt at EOF.
    std::optional<partition_key> get_partition_key() const {
        if (const auto& hdr = std::get_if<partition_index_entry>(&_entries[get_current_partition()])) {
            return hdr->dk.key();
        }
        return std::nullopt;
    }
    // The data file offsets of the entries pointed to by the lower and (if set) upper bound.
    sstables::data_file_positions_range data_file_positions() const {
        auto lo = _data_file_offsets[_lower];
        std::optional<uint64_t> hi;
        if (_upper) {
            hi = _data_file_offsets[*_upper];
        }
        return {lo, hi};
    }
    // The data file offset of the last clustering block of the current partition,
    // relative to the offset of the partition entry.
    // Returns nullopt at EOF or if the partition has no clustering blocks.
    std::optional<uint64_t> last_block_offset() const {
        auto curpar = get_current_partition();
        if (std::holds_alternative<eof_index_entry>(_entries[curpar])) {
            return std::nullopt;
        }
        if (!std::holds_alternative<clustering_index_entry>(_entries[curpar + 1])) {
            return std::nullopt;
        }
        // `idx + 1` is inspected in the loop body, so it's what has to stay in bounds.
        for (uint64_t idx = curpar + 1; idx + 1 < _entries.size(); ++idx) {
            if (std::holds_alternative<partition_end_entry>(_entries[idx + 1])) {
                return _data_file_offsets[idx] - _data_file_offsets[curpar];
            }
        }
        // Unreachable for a well-formed entry sequence: every partition has a partition_end_entry.
        abort();
    }
    // `cell` for entries inside of a partition (clustering blocks and partition ends),
    // `partition` for partition and EOF entries.
    sstables::indexable_element element_kind_for_entry(entry_idx idx) const {
        if (std::holds_alternative<clustering_index_entry>(_entries[idx])
            || std::holds_alternative<partition_end_entry>(_entries[idx])) {
            return sstables::indexable_element::cell;
        }
        return sstables::indexable_element::partition;
    }
    sstables::indexable_element element_kind() const {
        return element_kind_for_entry(_lower);
    }
    // Like element_kind_for_entry(), for the entry with the given data file offset.
    // An absent position is treated as `partition`.
    sstables::indexable_element element_kind_for_position(std::optional<uint64_t> pos) const {
        if (!pos) {
            return sstables::indexable_element::partition;
        }
        auto entry = entry_idx_from_data_position(*pos);
        return element_kind_for_entry(entry);
    }
    // If the lower bound points at a clustering block with a non-empty
    // `range_tombstone_before_first_ck`, returns a marker carrying that tombstone.
    // The position of the marker is a fixed placeholder.
    std::optional<sstables::open_rt_marker> end_open_marker() const {
        if (auto e = std::get_if<clustering_index_entry>(&_entries[_lower])) {
            auto tomb = tombstone(e->range_tombstone_before_first_ck);
            if (!tomb) {
                return std::nullopt;
            }
            return sstables::open_rt_marker{
                .pos = {position_in_partition::after_static_row_tag_t()},
                .tomb = tomb,
            };
        }
        return std::nullopt;
    }
    // Like end_open_marker(), but for the upper bound. Returns nullopt if the upper bound isn't set.
    std::optional<sstables::open_rt_marker> reverse_end_open_marker() const {
        if (!_upper) {
            return std::nullopt;
        }
        if (auto e = std::get_if<clustering_index_entry>(&_entries[*_upper])) {
            auto tomb = tombstone(e->range_tombstone_before_first_ck);
            if (!tomb) {
                return std::nullopt;
            }
            return sstables::open_rt_marker{
                .pos = {position_in_partition::after_static_row_tag_t()},
                .tomb = tomb,
            };
        }
        return std::nullopt;
    }
    // True iff the lower bound points at the last entry (the EOF entry).
    bool eof() const {
        return _lower == _entries.size() - 1;
    }
    bool partition_data_ready() const {
        return _initialized;
    }
    void read_partition_data() {
        _initialized = true;
    }
    // Restores the state of a freshly constructed reader.
    void reset() {
        _initialized = false;
        _lower = entry_idx(0);
        _upper.reset();
    }
    // The ring position corresponding to the given entry:
    // - the max position, if the entry is the EOF entry,
    // - the partition's key, if the entry is a partition entry,
    // - the position just after the partition's key, if the entry is inside of a partition.
    // The returned view refers to a key stored in `_entries`.
    dht::ring_position_view ring_position_of_entry(entry_idx eidx) const {
        auto curpar = get_partition_of_entry(eidx);
        if (std::holds_alternative<eof_index_entry>(_entries[curpar])) {
            return dht::ring_position_view::max();
        }
        auto pe = std::get_if<partition_index_entry>(&_entries[curpar]);
        if (curpar == eidx) {
            return dht::ring_position_view(pe->dk);
        } else {
            return dht::ring_position_view::for_after_key(pe->dk);
        }
    }
};

// Test all possible legal sequences of `abstract_index_reader` method calls,
// up to a certain sequence length (max_ops),
// on the given index dataset.
//
// The general structure of the test is:
// 1. We create a real index reader and a reference index reader.
// 2. For max_ops iterations:
// 2.1 We nondeterministically choose an operation to perform and its argument.
// 2.2 We call the operation on both readers.
// 2.3 We check that the real reader's positions after the method are
//     consistent (possibly less accurate, but within the contract of the method)
//     with the reference reader's positions.
// 2.4 We adjust the reference reader to point to exactly the same positions as the real reader,
//     so that their states match for the next method calls.
// 2.5 We check that all the const getters return the same thing for both readers.
//
// And nondeterministic_choice_stack is used to explore all possible outcomes of the nondeterministic choices.
// Parameters:
//   dataset        - the index entries the tested index was built from;
//                    used to construct the reference index.
//   reader_factory - called once per explored sequence to obtain a fresh reader
//                    over the index under test.
//   max_ops        - number of reader operations performed in each explored sequence.
//
// Failures are reported via SCYLLA_ASSERT.
void test_index(const index_entry_dataset& dataset, std::function<std::unique_ptr<sstables::abstract_index_reader>(void)> reader_factory, const int max_ops) {
    auto ri = reference_index(dataset);

    uint64_t n_cases = 0;
    nondeterministic_choice_stack ndcs;
    do {
        ++n_cases;
        // Each explored sequence starts with a fresh real reader and a rewound reference.
        auto reader = reader_factory();
        ri.reset();
        auto check_integrity = [&] {
            auto positions = reader->data_file_positions();
            testlog.debug("check_integrity: reader->data_file_positions()={},{}", positions.start, positions.end);
            // Adjust the reference index to match the reader's positions exactly.
            // (Before this call, the positions may be different, because the real
            // reader is allowed some degree of inexactness/suboptimality after some method calls).
            ri.recalibrate(positions);
            if (ri.partition_data_ready()) {
                SCYLLA_ASSERT(reader->partition_data_ready());
            }
            SCYLLA_ASSERT(ri.data_file_positions().start == positions.start);
            SCYLLA_ASSERT(ri.data_file_positions().end == positions.end);
            SCYLLA_ASSERT(ri.element_kind() == reader->element_kind());
            SCYLLA_ASSERT(ri.eof() == reader->eof());
            // Only the tombstones of the markers are compared, not their positions.
            auto get_tombstone = [] (const sstables::open_rt_marker& marker) {
                return marker.tomb;
            };
            SCYLLA_ASSERT(ri.end_open_marker().transform(get_tombstone) == reader->end_open_marker().transform(get_tombstone));
            SCYLLA_ASSERT(ri.reverse_end_open_marker().transform(get_tombstone) == reader->reverse_end_open_marker().transform(get_tombstone));
            if (reader->partition_data_ready()) {
                SCYLLA_ASSERT(ri.last_block_offset() == reader->last_block_offset().get());
                if (ri.has_row_index()) {
                    SCYLLA_ASSERT(reader->partition_tombstone());
                    SCYLLA_ASSERT(reader->get_partition_key());
                    SCYLLA_ASSERT(ri.partition_tombstone() == reader->partition_tombstone());
                    SCYLLA_ASSERT(ri.get_partition_key() == reader->get_partition_key());
                } else {
                    SCYLLA_ASSERT(!reader->partition_tombstone());
                    SCYLLA_ASSERT(!reader->get_partition_key());
                }
            }
        };
        testlog.debug("Initial check_integrity");
        check_integrity();
        for (int op = 0; op < max_ops; ++op) {
            testlog.debug("op={}, start={}, end={}",
                op, reader->data_file_positions().start, reader->data_file_positions().end);
            // Each branch below is one candidate operation. A branch is taken when
            // it is currently legal and the choice stack picks it (choose_bool() == false).
            if (auto vt = ri.valid_targets_for_advance_lower_and_check_if_present(); !vt.empty() && !ndcs.choose_bool()) {
                auto target = ndcs.choose_up_to(vt.size() - 1);
                auto rp = vt[target];
                testlog.debug("advance_lower_and_check_if_present(rp={})", rp);

                auto upper_before = reader->data_file_positions().end;
                auto possible_match = reader->advance_lower_and_check_if_present(rp).get();
                auto reference_match = ri.advance_lower_and_check_if_present(rp);
                if (!possible_match) {
                    testlog.debug("No match");
                    SCYLLA_ASSERT(!reference_match);
                    // After a mismatch in the advance_lower_and_check_if_present,
                    // the reader is in a broken state, and can't be used anymore.
                    break;
                }
                SCYLLA_ASSERT(reader->element_kind() == sstables::indexable_element::partition);
                testlog.debug("reader->data_file_positions()={},{}, ri.data_file_positions()={},{}, upper_before={}",
                    reader->data_file_positions().start,
                    reader->data_file_positions().end,
                    ri.data_file_positions().start,
                    ri.data_file_positions().end,
                    upper_before);
                SCYLLA_ASSERT(reader->data_file_positions().start <= ri.data_file_positions().start);
                if (reference_match) {
                    SCYLLA_ASSERT(possible_match);
                    SCYLLA_ASSERT(reader->data_file_positions().start == ri.data_file_positions().start);
                }
                // The operation must not move the upper bound.
                SCYLLA_ASSERT(reader->data_file_positions().end == upper_before);
            // NOTE(review): the target list below is the one for advance_past_definitely_present_partition,
            // although the operation is advance_to_definitely_present_partition.
            // Confirm that the two operations really share the same set of valid targets.
            } else if (auto vt = ri.valid_targets_for_advance_past_definitely_present_partition(); !vt.empty() && !ndcs.choose_bool()) {
                auto target = ndcs.choose_up_to(vt.size() - 1);
                auto dk = vt[target];
                auto upper_before = reader->data_file_positions().end;
                testlog.debug("advance_to_definitely_present_partition(dk={})", dk);
                reader->advance_to_definitely_present_partition(dk).get();
                ri.advance_to_definitely_present_partition(dk);
                SCYLLA_ASSERT(reader->data_file_positions().start == ri.data_file_positions().start);
                SCYLLA_ASSERT(reader->data_file_positions().end == upper_before);
            } else if (auto vt = ri.valid_targets_for_advance_past_definitely_present_partition(); !vt.empty() && !ndcs.choose_bool()) {
                auto target = ndcs.choose_up_to(vt.size() - 1);
                auto dk = vt[target];
                auto upper_before = reader->data_file_positions().end;
                testlog.debug("advance_past_definitely_present_partition(dk={})", dk);
                reader->advance_past_definitely_present_partition(dk).get();
                ri.advance_past_definitely_present_partition(dk);
                SCYLLA_ASSERT(reader->data_file_positions().start == ri.data_file_positions().start);
                SCYLLA_ASSERT(reader->data_file_positions().end == upper_before);
            } else if (!ndcs.choose_bool()) {
                // advance_to(partition_range): each bound is independently either absent
                // or one of the valid targets, with a chosen inclusiveness.
                std::optional<dht::partition_range::bound> lb;
                if (auto vt = ri.valid_lb_targets_for_advance_to(); !vt.empty() && !ndcs.choose_bool()) {
                    auto target = ndcs.choose_up_to(vt.size() - 1);
                    const auto& rp = vt[target];
                    auto inclusive = ndcs.choose_bool();
                    lb = dht::partition_range::bound(rp, inclusive);
                }
                std::optional<dht::partition_range::bound> ub;
                const auto& lb_rp = lb ? lb.value().value() : dht::ring_position::min();
                if (auto vt = ri.valid_ub_targets_for_advance_to(lb_rp); !vt.empty() && !ndcs.choose_bool()) {
                    auto target = ndcs.choose_up_to(vt.size() - 1);
                    const auto& rp = vt[target];
                    bool inclusive;
                    auto tricmp = dht::ring_position_comparator(ri._s);
                    if (lb.has_value() && lb.value().is_inclusive() && tricmp(lb_rp, rp) == std::strong_ordering::equal) {
                        inclusive = false;
                    } else {
                        inclusive = ndcs.choose_bool();
                    }
                    // The chosen target is the *upper* bound of the range.
                    ub = dht::partition_range::bound(rp, inclusive);
                }
                auto pr = dht::partition_range(lb, ub);
                testlog.debug("advance_to(pr={})", pr);
                reader->advance_to(pr).get();
                ri.advance_to(pr);
                auto positions = reader->data_file_positions();
                SCYLLA_ASSERT(reader->element_kind() == sstables::indexable_element::partition);
                // The real reader may cover a wider data range than the reference, never a narrower one.
                SCYLLA_ASSERT(positions.start <= ri.data_file_positions().start);
                if (ri.data_file_positions().end) {
                    SCYLLA_ASSERT(positions.end);
                    SCYLLA_ASSERT(*positions.end >= *ri.data_file_positions().end);
                } else {
                    SCYLLA_ASSERT(!positions.end);
                }
            } else if (!reader->eof() && !ndcs.choose_bool()) {
                testlog.debug("advance_to_next_partition()");
                reader->advance_to_next_partition().get();
                ri.advance_to_next_partition();
                SCYLLA_ASSERT(reader->data_file_positions().start == ri.data_file_positions().start);
                SCYLLA_ASSERT(reader->data_file_positions().end == ri.data_file_positions().end);
            } else if (!ndcs.choose_bool()) {
                testlog.debug("advance_reverse_to_next_partition()");
                reader->advance_reverse_to_next_partition().get();
                ri.advance_reverse_to_next_partition();
                SCYLLA_ASSERT(reader->data_file_positions().start == ri.data_file_positions().start);
                SCYLLA_ASSERT(reader->data_file_positions().end == ri.data_file_positions().end);
            } else if (auto vt = ri.valid_targets_for_advance_to_pip(); !reader->eof() && !vt.empty() && !ndcs.choose_bool()) {
                auto target = ndcs.choose_up_to(vt.size() - 1);
                const auto& pos = vt[target];
                testlog.debug("advance_to({})", pos);
                reader->advance_to(pos).get();
                ri.advance_to(pos);
                SCYLLA_ASSERT(reader->data_file_positions().start <= ri.data_file_positions().start);
                SCYLLA_ASSERT(reader->data_file_positions().end == ri.data_file_positions().end);
            } else if (auto vt = ri.valid_targets_for_advance_to_pip(); !reader->eof() && !vt.empty() && !ndcs.choose_bool()) {
                auto target = ndcs.choose_up_to(vt.size() - 1);
                const auto& pos = vt[target];
                testlog.debug("advance_upper_past({})", pos);
                reader->advance_upper_past(pos).get();
                ri.advance_upper_past(pos);
                SCYLLA_ASSERT(reader->data_file_positions().end.value() >= ri.data_file_positions().end.value());
                SCYLLA_ASSERT(reader->data_file_positions().start == ri.data_file_positions().start);
            } else if (auto vt = ri.valid_targets_for_advance_reverse(); !reader->eof() && !vt.empty() && !ndcs.choose_bool()) {
                auto target = ndcs.choose_up_to(vt.size() - 1);
                const auto& pos = vt[target];
                testlog.debug("advance_reverse({})", pos);
                reader->advance_reverse(pos).get();
                ri.advance_reverse(pos);
                SCYLLA_ASSERT(reader->data_file_positions().end.value() >= ri.data_file_positions().end.value());
                SCYLLA_ASSERT(reader->data_file_positions().start == ri.data_file_positions().start);
            } else {
                // Fallback operation: taken when no other branch was chosen.
                testlog.debug("read_partition_data()");
                auto positions_before = reader->data_file_positions();
                reader->read_partition_data().get();
                ri.read_partition_data();
                SCYLLA_ASSERT(reader->partition_data_ready());
                // Reading the partition data must not move the cursors.
                auto positions_after = reader->data_file_positions();
                SCYLLA_ASSERT(positions_before.start == positions_after.start);
                SCYLLA_ASSERT(positions_before.end == positions_after.end);
            }
            check_integrity();
        }
    } while (ndcs.rewind());
    testlog.info("Number of run method sequences: {}", n_cases);
}

// End-to-end test of the BTI index:
// generates a random dataset, writes it out as Partitions.db and Rows.db
// in a temporary directory, reopens the files through cached_file,
// and runs test_index() on readers created over them.
SEASTAR_THREAD_TEST_CASE(test_exhaustive) {
    auto the_schema = schema_builder("ks", "t")
        .with_column("pk", short_type, column_kind::partition_key)
        .with_column("ck1", short_type, column_kind::clustering_key)
        .with_column("ck2", short_type, column_kind::clustering_key)
        .build();
    auto sst_ver = sstables::sstable_version_types::me;

    random_dataset_config cfg;
    const int max_ops = 3;
#ifdef SEASTAR_DEBUG
    // The test generates millions of futures,
    // so it's extremely slow in debug mode which induces a preemption
    // after every future.
    // So we downsize the test.
    // FIXME: that's not big enough for full coverage.
    // E.g. there are branches in partition index writer which are only taken
    // from the third partition key onward.
    cfg.partition_universe_size = 2;
    cfg.clustering_universe_size = 4;
    cfg.max_clustering_blocks = 2;
#endif
    // Step 1: generate the test dataset.
    testlog.debug("Generating a random dataset.");
    auto dataset = generate_random_dataset(*the_schema, cfg);

    // Log the contents of the dataset for debugging purposes.
    for (const auto& entry : dataset.entries) {
        std::visit(overloaded_functor {
            [](const partition_index_entry& e) {
                testlog.debug("Partition index entry: dk={}, data_file_offset={}, partition_tombstone={}",
                    e.dk, e.data_file_offset, e.partition_tombstone);
            },
            [](const clustering_index_entry& e) {
                sstables::clustering_info first = e.first_ck;
                sstables::clustering_info last = e.last_ck;
                testlog.debug("Clustering index entry: first={}@{}, last={}@{}, data_file_offset={}, tombstone_before_first_ck={}",
                    first.kind, first.clustering, last.kind, last.clustering, e.data_file_offset, e.range_tombstone_before_first_ck);
            },
            [](const partition_end_entry& e) {
                testlog.debug("Partition end entry: data_file_offset={}", e.data_file_offset);
            },
            [](const eof_index_entry& e) {
                testlog.debug("Eof index entry: data_file_offset={}", e.data_file_offset);
            },
        }, entry);
    }

    // Step 2: write the index to BTI files.
    tmpdir dir;
    auto partitions_path = dir.path() / "Partitions.db";
    auto rows_path = dir.path() / "Rows.db";
    testlog.debug("Writing index to {} and {}", partitions_path.c_str(), rows_path.c_str());
    {
        file partitions_db = open_file_dma(partitions_path.c_str(), open_flags::create | open_flags::wo).get();
        file rows_db = open_file_dma(rows_path.c_str(), open_flags::create | open_flags::wo).get();

        sstables::file_writer partitions_db_writer(make_file_output_stream(partitions_db).get());
        sstables::file_writer rows_db_writer(make_file_output_stream(rows_db).get());

        // Declared before the index writers, so the file writers are closed
        // only after the index writers are destroyed.
        auto close_partitions_db = defer([&] { partitions_db_writer.close(); });
        auto close_rows_db = defer([&] { rows_db_writer.close(); });

        auto partition_index_writer = sstables::trie::bti_partition_index_writer(partitions_db_writer);
        auto row_index_writer = sstables::trie::bti_row_index_writer(rows_db_writer);

        // A partition can only be written out once its end offset is known,
        // so the most recent partition entry is buffered until the next
        // partition (or EOF) entry is seen.
        std::optional<partition_index_entry> last_partition_entry;
        std::optional<partition_end_entry> last_partition_end_entry;
        auto push_partition = [&] () {
            if (last_partition_entry) {
                auto& last = *last_partition_entry;
                auto pk = sstables::key::from_partition_key(*the_schema, last.dk.key());
                auto hash = utils::make_hashed_key(bytes_view(pk));
                auto payload = row_index_writer.finish(
                    sst_ver,
                    *the_schema,
                    last.data_file_offset,
                    last_partition_end_entry.value().data_file_offset,
                    pk,
                    last.partition_tombstone);
                partition_index_writer.add(*the_schema, last.dk, hash, payload);
            }
            last_partition_entry.reset();
        };

        for (const auto& entry : dataset.entries) {
            std::visit(overloaded_functor {
                [&](const partition_index_entry& e) {
                    push_partition();
                    last_partition_entry = e;
                },
                [&](const clustering_index_entry& e) {
                    // The row index writer is given offsets relative to the start of the partition.
                    row_index_writer.add(
                        *the_schema,
                        e.first_ck,
                        e.last_ck,
                        e.data_file_offset - last_partition_entry.value().data_file_offset,
                        e.range_tombstone_before_first_ck);
                },
                [&](const partition_end_entry& e) {
                    last_partition_end_entry = e;
                },
                [&](const eof_index_entry&) {
                    push_partition();
                },
            }, entry);
        }
        std::move(partition_index_writer).finish(sst_ver, sstables::key::from_bytes({}), sstables::key::from_bytes({}));
    }

    // Step 3: create the reader (or, more precisely, a factory of readers) over the index files.
    testlog.debug("Opening index from {} and {}", partitions_path.c_str(), rows_path.c_str());
    {
        // The files were written in step 2 and must already exist. They are opened
        // without open_flags::create so that a missing file fails here instead of
        // being silently replaced by an empty one.
        file partitions_db = open_file_dma(partitions_path.c_str(), open_flags::ro).get();
        file rows_db = open_file_dma(rows_path.c_str(), open_flags::ro).get();
        auto close_partitions_db = deferred_close(partitions_db);
        auto close_rows_db = deferred_close(rows_db);

        auto stats = cached_file_stats();
        auto cached_file_lru = lru();
        auto region = logalloc::region();
        auto partitions_db_size = partitions_db.size().get();
        auto rows_db_size = rows_db.size().get();
        auto partitions_db_cached = seastar::make_shared<cached_file>(partitions_db, stats, cached_file_lru, region, partitions_db_size, "Partitions.db");
        auto rows_db_cached = seastar::make_shared<cached_file>(rows_db, stats, cached_file_lru, region, rows_db_size, "Rows.db");

        auto partitions_db_footer = sstables::trie::read_bti_partitions_db_footer(*the_schema, sst_ver, partitions_db, partitions_db_size).get();
        auto partitions_db_root_pos = partitions_db_footer.trie_root_position;

        auto semaphore = tests::reader_concurrency_semaphore_wrapper();
        auto trace_state = tracing::trace_state_ptr();

        // Step 4: run the reader test on the opened readers.
        test_index(dataset, [&] {
            return sstables::trie::make_bti_index_reader(
                partitions_db_cached,
                rows_db_cached,
                partitions_db_root_pos,
                // The last dataset entry is the EOF entry; its offset is passed as the last reader-size argument.
                std::get<eof_index_entry>(dataset.entries.back()).data_file_offset,
                the_schema,
                semaphore.make_permit(),
                trace_state
            );
        }, max_ops);
    }
}

static std::vector<std::byte> linearize(const memory_data_sink_buffers& bufs) {
    std::vector<std::byte> retval;
    for (const auto& frag : bufs.buffers()) {
        auto v = std::as_bytes(std::span(frag));
        retval.insert(retval.end(), v.begin(), v.end());
    }
    return retval;
}

// Builds a temporary_buffer<char> from a span of bytes, using the
// (pointer, size) constructor of temporary_buffer.
static temporary_buffer<char> make_temporary_buffer(const std::span<const std::byte> data) {
    const auto* first_char = reinterpret_cast<const char*>(data.data());
    const auto length = data.size();
    return temporary_buffer<char>(first_char, length);
}

// Returns a non-owning view over the contents of `buf`.
// The view is only valid for as long as `buf` is alive.
static std::span<const char> char_span(const temporary_buffer<char>& buf) {
    return {buf.get(), buf.size()};
}

// Produces the contents of `raw` as an endless sequence of buffer futures,
// letting `ndcs` decide how the data is fragmented and which results are delayed.
//
// For every produced element, in this order:
// 1. `ndcs` chooses whether the future is preceded by a seastar::yield()
//    or is immediately ready.
// 2. If there is data left and fewer than `max_cuts` cuts were made so far,
//    `ndcs` chooses the fragment length (at least 1 byte); otherwise the
//    fragment is all the remaining data.
//
// Once the data is exhausted, every further element is an empty buffer.
//
// `raw` is copied when the generator body first runs, and `ndcs` is used
// on every step, so both must stay alive accordingly.
static std::generator<future<temporary_buffer<char>>> make_fragmented_generator(
    nondeterministic_choice_stack& ndcs,
    std::span<const std::byte> raw,
    size_t max_cuts
) {
    auto remaining = make_temporary_buffer(raw);
    size_t cuts_made = 0;
    for (;;) {
        // The order of the `ndcs` choices (yield first, then cut position) is part of the behavior.
        future<> maybe_yield = ndcs.choose_bool() ? seastar::yield() : make_ready_future<>();
        size_t fragment_len = remaining.size();
        const bool may_cut = remaining.size() >= 1 && cuts_made < max_cuts;
        if (may_cut) {
            ++cuts_made;
            fragment_len = 1 + ndcs.choose_up_to(remaining.size() - 1);
        }
        auto fragment = temporary_buffer<char>(remaining.get(), fragment_len);
        testlog.trace("Next fragment: {}", fmt_hex(std::as_bytes(char_span(fragment))));
        remaining.trim_front(fragment.size());
        co_yield maybe_yield.then([frag = std::move(fragment)] () mutable {
            return make_ready_future<temporary_buffer<char>>(std::move(frag));
        });
    }
}

// Adapts `make_fragmented_generator` to the `data_source` interface,
// so that the fragmented data can be consumed through an `input_stream<char>`.
static data_source make_fragmented(nondeterministic_choice_stack& ndcs, std::span<const std::byte> raw, size_t max_cuts) {
    struct source : data_source_impl {
        // `_gen` must be declared before `_gen_it`: the iterator is obtained from it in the constructor.
        std::generator<future<temporary_buffer<char>>> _gen;
        decltype(_gen.begin()) _gen_it;
        source(nondeterministic_choice_stack& ndcs, std::span<const std::byte> raw, size_t max_cuts)
            : _gen(make_fragmented_generator(ndcs, raw, max_cuts))
            , _gen_it(_gen.begin())
        {}
        // Hands out the element the generator is currently positioned on,
        // then advances the generator to the next one.
        future<temporary_buffer<char>> get() override {
            auto current = std::move(*_gen_it);
            ++_gen_it;
            return current;
        }
    };
    auto impl = std::make_unique<source>(ndcs, raw, max_cuts);
    return data_source(std::move(impl));
}

// The per-partition headers in Rows.db can cross page boundaries,
// so parsing those headers involves handling fragmented buffers.
//
// This test writes a header and parses it with all possible
// fragmentations (up to a given number of cuts) and yield points.
SEASTAR_THREAD_TEST_CASE(test_read_row_index_header) {
    // Random header contents. These are the values the parser is expected to return.
    auto pk = sstables::key(tests::random::get_bytes(4));
    uint64_t partition_data_start = tests::random::get_int<int64_t>(0, std::numeric_limits<int64_t>::max());
    uint64_t number_of_blocks = tests::random::get_int<uint64_t>();
    uint64_t root_pos = tests::random::get_int<uint64_t>();
    auto tomb = make_random_tombstone();

    // Serialize the header into in-memory buffers.
    memory_data_sink_buffers header_bufs;
    {
        sstables::file_writer writer(data_sink(std::make_unique<memory_data_sink>(header_bufs)));
        auto close_writer = defer([&] { writer.close(); });
        sstables::trie::write_row_index_header(
            sstables::sstable_version_types::me,
            writer,
            pk,
            partition_data_start,
            number_of_blocks,
            root_pos,
            tomb
        );
    }

    constexpr size_t max_cuts = 2;
    nondeterministic_choice_stack ndcs;
    size_t n_cases = 0;
    do {
        // The stream contains the header followed by unrelated trailing bytes.
        auto stream_bytes = linearize(header_bufs);
        stream_bytes.append_range(std::as_bytes(std::span(std::string_view("some_suffix"))));
        // The size reported to the parser is either just the header or header plus suffix.
        uint64_t stream_size = ndcs.choose_bool() ? header_bufs.size() : stream_bytes.size();
        auto in = seastar::input_stream<char>(make_fragmented(ndcs, stream_bytes, max_cuts));
        auto semaphore = tests::reader_concurrency_semaphore_wrapper();
        auto parsed = sstables::trie::read_row_index_header(std::move(in), 0, stream_size, semaphore.make_permit()).get();
        // Whatever the fragmentation, the parsed header must match what was written.
        SCYLLA_ASSERT(bytes_view(parsed.partition_key) == bytes_view(pk));
        SCYLLA_ASSERT(parsed.data_file_offset == partition_data_start);
        SCYLLA_ASSERT(parsed.number_of_blocks == number_of_blocks);
        SCYLLA_ASSERT(parsed.trie_root == root_pos);
        SCYLLA_ASSERT(parsed.partition_tombstone == tomb);
        ++n_cases;
    } while (ndcs.rewind());
    testlog.debug("Executed test cases: {}", n_cases);
}