Files
scylladb/dht/token.hh
Tomasz Grabiec 01fb97ee78 locator: tablets: Support arbitrary tablet boundaries
There are several reasons we want to do that.

One is that it will give us more flexibility in distributing the
load. We can subdivide tablets at any points, and achieve more
evenly-sized tablets. In particular, we can isolate large partitions
into separate tablets.

Another reason is vnode-to-tablet migration. We could construct a
tablet map which matches exactly the vnode boundaries, so migration
can happen transparently from the CQL-coordinator's point of view.

Implementation details:

We store a vector of tokens which represent tablet boundaries in the
tablet_id_map. tablet_id keeps its meaning, it's an index into vector
of tablets. To avoid logarithmic lookup of tablet_id from the token,
we introduce a lookup structure with power-of-two aligned buckets, and
store the tablet_id of the tablet which owns the first token in the
bucket. This way, lookup needs to consider tablet id range which
overlaps with one bucket. If boundaries are more or less aligned,
there are around 1-2 tablets overlapping with a bucket, and the lookup
is still O(1).

The amount of memory used increased, but not significantly relative to the old
size (because tablet_info is currently fat):

For 131'072 tablets:

Before:

  Size of tablet_metadata in memory: 57456 KiB

After:

  Size of tablet_metadata in memory: 59504 KiB
2026-04-15 01:25:14 +02:00

413 lines
12 KiB
C++

/*
* Copyright (C) 2020-present ScyllaDB
*/
/*
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
*/
#pragma once
#include "bytes_fwd.hh"
#include "types/types.hh"
#include "utils/chunked_vector.hh"
#include <limits>
#include <seastar/net/byteorder.hh>
#include <fmt/format.h>
#include <functional>
#include <utility>
#include <compare>
#include <byteswap.h>
namespace dht {
class token;
// Classifies a dht::token value; see dht::token below.
// The enumerator order matters: before_all_keys sorts before key tokens,
// which sort before after_all_keys (token::operator<=> compares kinds first).
enum class token_kind {
    before_all_keys,
    key,
    after_all_keys,
};
// Represents a token for partition keys.
// Has a disengaged state, which sorts before all engaged states.
// The disengaged state is encoded as INT64_MIN, which is never a legal
// key token value (see token::normalize() below).
struct raw_token {
    int64_t value;

    /// Constructs a disengaged token.
    raw_token() : value(std::numeric_limits<int64_t>::min()) {}

    /// Constructs an engaged token.
    /// The token must be of token_kind::key kind.
    explicit raw_token(const token&);

    /// Constructs an engaged token holding the given raw value.
    explicit raw_token(int64_t v) : value(v) {}

    std::strong_ordering operator<=>(const raw_token& o) const noexcept = default;
    std::strong_ordering operator<=>(const token& o) const noexcept;

    /// Returns true iff engaged.
    explicit operator bool() const noexcept {
        return value != std::numeric_limits<int64_t>::min();
    }
};
using raw_token_opt = seastar::optimized_optional<raw_token>;
class token {
// INT64_MIN is not a legal token, but a special value used to represent
// infinity in token intervals.
// If a token with value INT64_MIN is generated by the hashing algorithm,
// the result is coerced into INT64_MAX.
// (So INT64_MAX is twice as likely as every other token.)
static constexpr int64_t normalize(int64_t t) {
return t == std::numeric_limits<int64_t>::min() ? std::numeric_limits<int64_t>::max() : t;
}
public:
using kind = token_kind;
kind _kind;
int64_t _data;
private:
constexpr token(kind k, int64_t d) noexcept : _kind(k), _data(d) {}
public:
constexpr token() noexcept : token(kind::before_all_keys, 0) {}
constexpr explicit token(int64_t d) noexcept : token(kind::key, normalize(d)) {}
token(raw_token raw) noexcept
: token(raw ? kind::key : kind::before_all_keys, raw.value)
{ }
// This constructor seems redundant with the bytes_view constructor, but
// it's necessary for IDL, which passes a deserialized_bytes_proxy here.
// (deserialized_bytes_proxy is convertible to bytes&&, but not bytes_view.)
token(kind k, const bytes& b) : _kind(std::move(k)) {
if (_kind != kind::key) {
_data = std::numeric_limits<int64_t>::min();
} else {
if (b.size() != sizeof(_data)) {
throw std::runtime_error(fmt::format("Wrong token bytes size: expected {} but got {}", sizeof(_data), b.size()));
}
_data = net::ntoh(read_unaligned<int64_t>(b.begin()));
}
}
token(kind k, bytes_view b) : _kind(std::move(k)) {
if (_kind != kind::key) {
_data = std::numeric_limits<int64_t>::min();
} else {
if (b.size() != sizeof(_data)) {
throw std::runtime_error(fmt::format("Wrong token bytes size: expected {} but got {}", sizeof(_data), b.size()));
}
_data = net::ntoh(read_unaligned<int64_t>(b.begin()));
}
}
constexpr std::strong_ordering operator<=>(const token& o) const noexcept {
if (_kind < o._kind) {
return std::strong_ordering::less;
} else if (_kind > o._kind) {
return std::strong_ordering::greater;
} else if (_kind == token_kind::key) [[likely]] {
return _data <=> o._data;
}
return std::strong_ordering::equal;
}
constexpr bool operator==(const token& o) const noexcept = default;
static constexpr token minimum() noexcept {
return token{ kind::before_all_keys, std::numeric_limits<int64_t>::min() };
}
static constexpr token maximum() noexcept {
return token{ kind::after_all_keys, std::numeric_limits<int64_t>::min() };
}
// Returns the smallest token in the ring which can be associated with a partition key.
static constexpr token first() noexcept {
// dht::token::normalize() does not allow std::numeric_limits<int64_t>::min()
return token{ kind::key, std::numeric_limits<int64_t>::min() + 1 };
}
// Returns the largestest token in the ring which can be associated with a partition key.
static constexpr token last() noexcept {
return token{ kind::key, std::numeric_limits<int64_t>::max() };
}
// Returns a successor for token t.
// The caller must ensure there is a next token, otherwise
// the result is unspecified.
//
// Precondition: t.kind() == dht::token::kind::key
constexpr token next() const noexcept {
return token{ dht::token::kind::key, _data + 1 };
}
constexpr bool is_minimum() const noexcept {
return _kind == kind::before_all_keys;
}
constexpr bool is_maximum() const noexcept {
return _kind == kind::after_all_keys;
}
constexpr bool is_first() const noexcept {
return _kind == dht::token::kind::key && _data == std::numeric_limits<int64_t>::min() + 1;
}
// Returns true iff this is the largest token which can be associated with a partition key.
// Note that this is different that is_maximum().
constexpr bool is_last() const noexcept {
return _kind == dht::token::kind::key && _data == std::numeric_limits<int64_t>::max();
}
constexpr size_t external_memory_usage() const noexcept {
return 0;
}
constexpr size_t memory_usage() const noexcept {
return sizeof(token);
}
bytes data() const {
bytes b(bytes::initialized_later(), sizeof(_data));
write_unaligned<int64_t>(b.begin(), net::hton(_data));
return b;
}
/**
* @return a string representation of this token
*/
sstring to_sstring() const;
/**
* Calculate a token representing the approximate "middle" of the given
* range.
*
* @return The approximate midpoint between left and right.
*/
static token midpoint(const token& left, const token& right);
/**
* @return a randomly generated token
*/
static token get_random_token();
/**
* @return a token from string representation
*/
static dht::token from_sstring(const sstring& t);
/**
* @return a token from its byte representation
*/
static dht::token from_bytes(bytes_view bytes);
/**
* Returns int64_t representation of the token
*/
static int64_t to_int64(token);
/**
* Creates token from its int64_t representation
*/
static dht::token from_int64(int64_t);
/**
* Calculate the deltas between tokens in the ring in order to compare
* relative sizes.
*
* @param sortedtokens a sorted List of tokens
* @return the mapping from 'token' to 'percentage of the ring owned by that token'.
*/
static std::map<token, float> describe_ownership(const utils::chunked_vector<token>& sorted_tokens);
static data_type get_token_validator();
/**
* Gets the first shard of the minimum token.
*/
static constexpr unsigned shard_of_minimum_token() noexcept {
return 0; // hardcoded for now; unlikely to change
}
constexpr int64_t raw() const noexcept {
if (is_maximum()) {
return std::numeric_limits<int64_t>::max();
}
return _data;
}
constexpr uint64_t unbias() const noexcept {
return uint64_t(_data) + uint64_t(std::numeric_limits<int64_t>::min());
}
static constexpr token bias(uint64_t n) noexcept {
return token(token::kind::key, n - uint64_t(std::numeric_limits<int64_t>::min()));
}
};
// Builds an engaged raw_token from a key token's raw value.
// Callers are expected to pass only tokens of kind::key; this is checked
// in debug builds.
inline
raw_token::raw_token(const token& tok)
        : value(tok.raw()) {
#ifdef DEBUG
    assert(tok._kind == token::kind::key);
#endif
}
// Compares a raw value against a full token.
inline
std::strong_ordering raw_token::operator<=>(const token& o) const noexcept {
    // after_all_keys sorts after every raw value.
    if (o._kind == token::kind::after_all_keys) {
        return std::strong_ordering::less;
    }
    // before_all_keys carries the same raw value as a disengaged raw_token
    // and sorts before all keys, so for the remaining kinds (before_all_keys
    // and key) comparing raw values directly gives the right order.
    return value <=> o._data;
}
// Trichotomic comparison of two raw (int64) token values.
// The built-in integer three-way comparison already yields exactly the
// less/equal/greater result the hand-rolled version computed.
inline constexpr std::strong_ordering tri_compare_raw(const int64_t l1, const int64_t l2) noexcept {
    return l1 <=> l2;
}
// A type which exposes the dht::token it is keyed by, through a noexcept
// token() accessor returning const token&. Satisfied by keys usable with
// raw_token_less_comparator below.
template <typename T>
concept TokenCarrier = requires (const T& v) {
    { v.token() } noexcept -> std::same_as<const token&>;
};
// Less-than comparator over raw (int64) token values, accepting either
// plain int64_t keys or any TokenCarrier key in either argument position.
// simplify_key() projects both forms down to the underlying int64_t.
struct raw_token_less_comparator {
    bool operator()(const int64_t lhs, const int64_t rhs) const noexcept {
        return dht::tri_compare_raw(lhs, rhs) < 0;
    }

    template <typename K>
    requires TokenCarrier<K>
    bool operator()(const K& lhs, const int64_t rhs) const noexcept {
        return dht::tri_compare_raw(lhs.token().raw(), rhs) < 0;
    }

    template <typename K>
    requires TokenCarrier<K>
    bool operator()(const int64_t lhs, const K& rhs) const noexcept {
        return dht::tri_compare_raw(lhs, rhs.token().raw()) < 0;
    }

    // Extracts the raw int64 value from a token-carrying key.
    template <typename K>
    requires TokenCarrier<K>
    int64_t simplify_key(const K& k) const noexcept {
        return k.token().raw();
    }

    // A raw key is already in simplified form.
    int64_t simplify_key(int64_t k) const noexcept {
        return k;
    }
};
// Returns the token which sorts before all key tokens (kind::before_all_keys).
inline constexpr token minimum_token() noexcept {
    return token::minimum();
}
// Returns the token which sorts after all key tokens (kind::after_all_keys).
inline constexpr token maximum_token() noexcept {
    return token::maximum();
}
std::ostream& operator<<(std::ostream& out, const token& t);
// Returns a successor for token t (free-function form of token::next()).
// The caller must ensure there is a next token, otherwise
// the result is unspecified.
//
// Precondition: t.kind() == dht::token::kind::key
inline constexpr
token next_token(const token& t) noexcept {
    return t.next();
}
// Returns the smallest token in the ring which can be associated with a partition key.
inline constexpr
token first_token() noexcept {
    // dht::token::normalize() does not allow std::numeric_limits<int64_t>::min()
    return token::first();
}
// Returns the largest token in the ring which can be associated with a partition key.
inline constexpr
token last_token() noexcept {
    return token::last();
}
// Maps t onto [0, 2^64) preserving order; see token::unbias().
inline constexpr uint64_t unbias(const token& t) {
    return t.unbias();
}
// Inverse of unbias(); see token::bias().
inline constexpr token bias(uint64_t n) {
    return token::bias(n);
}
size_t compaction_group_of(unsigned most_significant_bits, const token& t);
size_t compaction_group_of(unsigned most_significant_bits, dht::raw_token);
token last_token_of_compaction_group(unsigned most_significant_bits, size_t group);
// Generates 'count' tokens uniformly distributed in the token ring. Sorted.
// All values are in the range [first_token(), last_token()]
utils::chunked_vector<dht::raw_token> get_uniform_tokens(size_t count);
// Three-way comparator over dht::token, for use with ordered containers
// and comparison-based algorithms.
struct token_comparator {
    // Return values are those of a trichotomic comparison.
    constexpr std::strong_ordering operator()(const token& t1, const token& t2) const noexcept {
        return t1 <=> t2;
    }
};
} // namespace dht
// Formats a token as "minimum token" / "maximum token" for the sentinels,
// or as its int64 value for key tokens. The two sentinel kinds are mutually
// exclusive, so the order of the checks does not matter.
template <>
struct fmt::formatter<dht::token> : fmt::formatter<string_view> {
    template <typename FormatContext>
    auto format(const dht::token& t, FormatContext& ctx) const {
        if (t.is_minimum()) {
            return fmt::format_to(ctx.out(), "minimum token");
        }
        if (t.is_maximum()) {
            return fmt::format_to(ctx.out(), "maximum token");
        }
        return fmt::format_to(ctx.out(), "{}", dht::token::to_int64(t));
    }
};
// Formats a raw_token as its int64 value, or "null" when disengaged.
template <>
struct fmt::formatter<dht::raw_token> : fmt::formatter<string_view> {
    template <typename FormatContext>
    auto format(const dht::raw_token& t, FormatContext& ctx) const {
        if (t) {
            return fmt::format_to(ctx.out(), "{}", t.value);
        }
        return fmt::format_to(ctx.out(), "null");
    }
};
namespace std {
// Hashes a token by its raw value. Only _data participates, so the two
// sentinel kinds (which share the INT64_MIN payload) hash identically.
template<>
struct hash<dht::token> {
    size_t operator()(const dht::token& t) const {
        // We have to reverse the bytes here to keep compatibility with
        // the behaviour that was here when tokens were represented as
        // sequence of bytes.
        return bswap_64(t._data);
    }
};
} // namespace std