locator: tablets: Support arbitrary tablet boundaries

There are several reasons we want to do that.

One is that it will give us more flexibility in distributing the
load. We can subdivide tablets at any points, and achieve more
evenly-sized tablets. In particular, we can isolate large partitions
into separate tablets.

Another reason is vnode-to-tablet migration. We could construct a
tablet map which matches exactly the vnode boundaries, so migration
can happen transparently from the CQL-coordinator's point of view.

Implementation details:

We store a vector of tokens which represent tablet boundaries in the
tablet_id_map. tablet_id keeps its meaning, it's an index into vector
of tablets. To avoid logarithmic lookup of tablet_id from the token,
we introduce a lookup structure with power-of-two aligned buckets, and
store the tablet_id of the tablet which owns the first token in the
bucket. This way, lookup needs to consider tablet id range which
overlaps with one bucket. If boundaries are more or less aligned,
there are around 1-2 tablets overlapping with a bucket, and the lookup
is still O(1).

The amount of memory used has increased, but not significantly relative to the
old size (because tablet_info is currently fat):

For 131'072 tablets:

Before:

  Size of tablet_metadata in memory: 57456 KiB

After:

  Size of tablet_metadata in memory: 59504 KiB
This commit is contained in:
Tomasz Grabiec
2026-01-29 02:33:18 +01:00
parent 82acdae74b
commit 01fb97ee78
8 changed files with 436 additions and 41 deletions

View File

@@ -230,6 +230,15 @@ dht::token find_first_token_for_shard(
}
}
// Maps a raw token to its compaction group index by taking the top
// `most_significant_bits` bits of the adjusted 64-bit token value.
size_t
compaction_group_of(unsigned most_significant_bits, dht::raw_token t) {
    if (!most_significant_bits) {
        // Zero bits means a single group covering the whole token space.
        return 0;
    }
    // unbias() presumably maps the signed token value onto an unsigned
    // ordinate so the top bits form a uniform group index — confirm
    // against the definition of unbias().
    uint64_t adjusted = unbias(t);
    return adjusted >> (64 - most_significant_bits);
}
size_t
compaction_group_of(unsigned most_significant_bits, const token& t) {
if (!most_significant_bits) {

View File

@@ -356,6 +356,7 @@ inline constexpr token bias(uint64_t n) {
return token::bias(n);
}
size_t compaction_group_of(unsigned most_significant_bits, const token& t);
size_t compaction_group_of(unsigned most_significant_bits, dht::raw_token);
token last_token_of_compaction_group(unsigned most_significant_bits, size_t group);
// Generates 'count' tokens uniformly distributed in the token ring. Sorted.

View File

@@ -180,6 +180,7 @@ public:
gms::feature batchlog_v2 { *this, "BATCHLOG_V2"sv };
gms::feature vnodes_to_tablets_migrations { *this, "VNODES_TO_TABLETS_MIGRATIONS"sv };
gms::feature writetime_ttl_individual_element { *this, "WRITETIME_TTL_INDIVIDUAL_ELEMENT"sv };
gms::feature arbitrary_tablet_boundaries { *this, "ARBITRARY_TABLET_BOUNDARIES"sv };
public:
const std::unordered_map<sstring, std::reference_wrapper<feature>>& registered_features() const;

View File

@@ -475,8 +475,26 @@ bool tablet_metadata::operator==(const tablet_metadata& o) const {
}
tablet_map::tablet_map(size_t tablet_count, bool with_raft_info)
: _log2_tablets(log2ceil(tablet_count)) {
if (tablet_count != 1ul << _log2_tablets) {
: tablet_map(dht::get_uniform_tokens(tablet_count), with_raft_info)
{ }
tablet_map::tablet_map(utils::chunked_vector<dht::raw_token> last_tokens, bool with_raft_info)
: _tablet_ids(std::move(last_tokens))
{
if (_tablet_ids.tablet_count() != 1ul << _tablet_ids.log2_count()) {
on_internal_error(tablet_logger, format("Tablet count not a power of 2: {}", _tablet_ids.tablet_count()));
}
_tablets.resize(_tablet_ids.tablet_count());
if (with_raft_info) {
_raft_info.resize(_tablet_ids.tablet_count());
}
}
tablet_map::tablet_map(size_t tablet_count, bool with_raft_info, tablet_map::initialized_later)
: _tablet_ids(tablet_count)
{
if (tablet_count != 1ul << _tablet_ids.log2_count()) {
on_internal_error(tablet_logger, format("Tablet count not a power of 2: {}", tablet_count));
}
_tablets.resize(tablet_count);
@@ -486,11 +504,13 @@ tablet_map::tablet_map(size_t tablet_count, bool with_raft_info)
}
tablet_map tablet_map::clone() const {
return tablet_map(_tablets, _log2_tablets, _transitions, _resize_decision, _resize_task_info,
_repair_scheduler_config, _raft_info);
return tablet_map(_tablet_ids, _tablets, _transitions, _resize_decision, _resize_task_info,
_repair_scheduler_config, _raft_info);
}
future<tablet_map> tablet_map::clone_gently() const {
auto ids = co_await _tablet_ids.clone_gently();
tablet_container tablets;
tablets.reserve(_tablets.size());
for (const auto& t : _tablets) {
@@ -512,8 +532,8 @@ future<tablet_map> tablet_map::clone_gently() const {
co_await coroutine::maybe_yield();
}
co_return tablet_map(std::move(tablets), _log2_tablets, std::move(transitions), _resize_decision,
_resize_task_info, _repair_scheduler_config, std::move(raft_info));
co_return tablet_map(std::move(ids), std::move(tablets), std::move(transitions),
_resize_decision, _resize_task_info, _repair_scheduler_config, std::move(raft_info));
}
void tablet_map::check_tablet_id(tablet_id id) const {
@@ -528,7 +548,7 @@ const tablet_info& tablet_map::get_tablet_info(tablet_id id) const {
}
tablet_id tablet_map::get_tablet_id(token t) const {
return tablet_id(dht::compaction_group_of(_log2_tablets, t));
return _tablet_ids.get_tablet_id(t);
}
dht::token tablet_map::get_split_token(tablet_id id) const {
@@ -549,7 +569,7 @@ std::pair<tablet_id, tablet_range_side> tablet_map::get_tablet_id_and_range_side
dht::token tablet_map::get_last_token(tablet_id id) const {
check_tablet_id(id);
return dht::last_token_of_compaction_group(_log2_tablets, size_t(id));
return dht::token(_tablet_ids.get_last_token(id));
}
dht::token tablet_map::get_first_token(tablet_id id) const {
@@ -652,11 +672,21 @@ future<utils::chunked_vector<token>> tablet_map::get_sorted_tokens() const {
co_return tokens;
}
// Returns the sorted last-token boundaries of all tablets, as raw tokens.
// Returns a reference into _tablet_ids; no copy is made, so the reference
// is only valid as long as this tablet_map is alive and unmodified.
const utils::chunked_vector<dht::raw_token>& tablet_map::get_sorted_raw_tokens() const {
    return _tablet_ids.last_tokens();
}
// Replaces the tablet_info of an existing tablet. The id must be within
// tablet_count(); check_tablet_id() enforces this.
void tablet_map::set_tablet(tablet_id id, tablet_info info) {
    check_tablet_id(id);
    _tablets[size_t(id)] = std::move(info);
}
// Populates the next tablet of a map constructed with initialized_later:
// records the tablet's last-token boundary in _tablet_ids and stores its
// info. Must be called with consecutive ids and strictly increasing
// last_token — tablet_id_map::push_back() enforces the ordering.
void tablet_map::emplace_tablet(tablet_id id, dht::token last_token, tablet_info info) {
    check_tablet_id(id);
    _tablet_ids.push_back(last_token, id);
    _tablets[size_t(id)] = std::move(info);
}
void tablet_map::set_tablet_transition_info(tablet_id id, tablet_transition_info info) {
check_tablet_id(id);
_transitions.insert_or_assign(id, std::move(info));
@@ -722,7 +752,8 @@ bool tablet_map::has_replica(tablet_id tid, tablet_replica r) const {
}
future<> tablet_map::clear_gently() {
return utils::clear_gently(_tablets);
co_await utils::clear_gently(_tablets);
co_await utils::clear_gently(_tablet_ids);
}
const tablet_transition_info* tablet_map::get_tablet_transition_info(tablet_id id) const {
@@ -879,8 +910,13 @@ tablet_repair_incremental_mode tablet_repair_incremental_mode_from_string(const
return tablet_repair_incremental_mode_from_name.at(name);
}
// Returns the number of bytes of heap memory held by this map's
// internal storage (the bucket table plus the last-token vector).
size_t tablet_id_map::external_memory_usage() const {
    const auto bucket_bytes = _buckets.external_memory_usage();
    const auto token_bytes = _last_tokens.external_memory_usage();
    return bucket_bytes + token_bytes;
}
size_t tablet_map::external_memory_usage() const {
size_t result = _tablets.external_memory_usage();
result += _tablet_ids.external_memory_usage();
for (auto&& tablet : _tablets) {
result += tablet.replicas.external_memory_usage();
}
@@ -1712,6 +1748,103 @@ rack_list get_allowed_racks(const locator::token_metadata& tm, const sstring& dc
return {};
}
// Prepares an empty map sized for `tablet_count` tablets. The bucket
// table will cover the token space with 2^ceil(log2(tablet_count))
// uniform buckets; the caller must then invoke push_back() once per
// tablet to populate the map.
tablet_id_map::tablet_id_map(size_t tablet_count)
    : _log2_tablets(log2ceil(tablet_count))
{
    // Reserve up front: the bucket count is tablet_count rounded up to a
    // power of two, while last-token storage is exactly one per tablet.
    _buckets.reserve(size_t(1) << _log2_tablets);
    _last_tokens.reserve(tablet_count);
}
// Builds a fully-populated map from the given sorted tablet boundaries.
// last_tokens[i] is the (inclusive) last token of tablet i.
tablet_id_map::tablet_id_map(const utils::chunked_vector<dht::raw_token>& last_tokens)
    : tablet_id_map(last_tokens.size())
{
    // Tablet ids are assigned consecutively, in boundary order.
    size_t next_id = 0;
    for (const auto& last_token : last_tokens) {
        push_back(last_token, tablet_id(next_id));
        ++next_id;
    }
}
// Appends the mapping for the next tablet, whose token range ends at
// last_token (inclusive). Entries must be added in tablet-id order,
// starting from id 0, with strictly increasing last_token; violations
// trigger on_internal_error.
void tablet_id_map::push_back(dht::token last_token, tablet_id id) {
    // Index of the uniform power-of-two bucket which contains last_token.
    auto i = dht::compaction_group_of(_log2_tablets, last_token);
    if (_buckets.empty() && size_t(id) != 0) {
        on_internal_error(tablet_logger, fmt::format("tablet_id_map::push_back: First tablet must have id 0, not {}", id));
    }
    // _buckets.back() holds the lowest id seen in the last filled bucket,
    // so `id <= _buckets.back()` catches any non-increasing id sequence.
    if (!_buckets.empty() && (id <= _buckets.back() || last_token <= _last_tokens.back())) {
        on_internal_error(tablet_logger, fmt::format("tablet_id_map::push_back: Order violated: last_token={} (prev={}), id={} (prev={}), bucket={}",
                last_token, _last_tokens.back(), id, tablet_id(_buckets.back()), i));
    }
    _last_tokens.emplace_back(last_token);
    // Fill every not-yet-covered bucket up to and including bucket i with
    // this id: each bucket records the id of the tablet which owns the
    // first token falling into that bucket.
    while (_buckets.size() < i + 1) {
        _buckets.push_back(id);
    }
}
// Resolves the owning tablet for a dht::token, handling the special
// minimum/maximum sentinel tokens which have no raw representation:
// the minimum token always belongs to the first tablet, the maximum
// token to the last one. All other tokens go through the raw lookup.
tablet_id tablet_id_map::get_tablet_id(dht::token t) const {
    if (t.is_minimum()) {
        return tablet_id(0);
    }
    if (t.is_maximum()) {
        return tablet_id(_last_tokens.size() - 1);
    }
    return get_tablet_id(dht::raw_token(t));
}
// Looks up the tablet which owns a raw token. Throws std::out_of_range
// if the map does not cover the token, which should only happen if the
// map is not fully populated (or its invariants are violated).
tablet_id tablet_id_map::get_tablet_id(dht::raw_token t) const {
    auto bucket = dht::compaction_group_of(_log2_tablets, t);
    if (bucket >= _buckets.size()) {
        throw std::out_of_range(fmt::format("tablet_id_map: No mapping for {}, bucket: {} bucket count: {}", t, bucket, _buckets.size()));
    }
    // The range [low_id, high_id] tracks range of ids which may own token t.
    auto low_id = size_t(_buckets[bucket]);
    // Common case: the token falls into the tablet which owns the last token of a bucket.
    // This is always the case if tablets are uniformly distributed in token space and the
    // tablet count is a power of 2.
    if (t <= _last_tokens[low_id]) {
        return tablet_id(low_id);
    }
    ++low_id;
    // The id stored for the next bucket owns that bucket's first token, so
    // no id beyond it can own t. For the final bucket, the last tablet is
    // the upper bound.
    auto high_id = (bucket + 1 >= _buckets.size()) ? tablet_count() - 1 : size_t(_buckets[bucket + 1]);
    // NOTE: if low_id == high_id + 1 (only possible with a corrupt map),
    // high_id - low_id wraps as size_t, selecting the binary-search branch
    // over an empty range, which falls through to the throw below.
    if (high_id - low_id <= 7) { // linear search
        while (low_id <= high_id) {
            if (t <= _last_tokens[low_id]) {
                return tablet_id(low_id);
            }
            ++low_id;
        }
    } else { // binary search
        auto end = _last_tokens.begin() + high_id + 1;
        auto it = std::lower_bound(_last_tokens.begin() + low_id, end, t);
        if (it != end) {
            return tablet_id(std::distance(_last_tokens.begin(), it));
        }
    }
    throw std::out_of_range(fmt::format("tablet_id_map: No mapping for {}, bucket {}, last token of tablet {} is {}",
            t, bucket, high_id, _last_tokens[size_t(high_id)]));
}
// Returns a copy of this map. Both containers hold trivially
// copy-assignable elements (asserted below), so plain container copy
// assignment is used; the coroutine form matches the *_gently convention
// of the surrounding code.
future<tablet_id_map> tablet_id_map::clone_gently() const {
    // Start from an empty map reserved for the same tablet count.
    tablet_id_map result(_last_tokens.size());
    static_assert(std::is_trivially_copy_assignable_v<decltype(tablet_id_map::_buckets)::value_type>);
    static_assert(std::is_trivially_copy_assignable_v<decltype(tablet_id_map::_last_tokens)::value_type>);
    result._buckets = _buckets;
    result._last_tokens = _last_tokens;
    co_return std::move(result);
}
// Releases the map's contents. No incremental work is needed because the
// element types are trivially destructible (asserted below), so the
// containers' destructors free storage without per-element destruction.
future<> tablet_id_map::clear_gently() {
    // Storage is trivially destructible, so just let the destructor do the job.
    static_assert(std::is_trivially_destructible_v<decltype(tablet_id_map::_buckets)::value_type>);
    static_assert(std::is_trivially_destructible_v<decltype(tablet_id_map::_last_tokens)::value_type>);
    return make_ready_future<>();
}
}
auto fmt::formatter<locator::resize_decision_way>::format(const locator::resize_decision_way& way, fmt::format_context& ctx) const

View File

@@ -540,6 +540,67 @@ public:
no_such_tablet_map(const table_id& id);
};
/// Allows looking up tablet_id of the tablet which owns a given token.
///
/// Conceptually like std::map<dht::token, tablet_id>, where the token
/// is the split point (last token) for a given tablet,
/// and get_tablet_id(token) is like lower_bound(token)->second.
///
/// Optimized for the typical case where tokens are more or less evenly
/// spaced in the token space, in which case lookup will be constant time.
class tablet_id_map {
    // log2 of intended number of _buckets post-population.
    uint8_t _log2_tablets;
    // Covers the whole token space with assignment of tablet_id to token ranges.
    // The elements describe a token range corresponding to the i-th range in the
    // uniformly split token space, split into 2^_log2_tablets ranges.
    //
    // The tablet_id assigned to the range means that this tablet owns the first
    // token in that range. There could be other tablets overlapping with the
    // bucket's range, but only the lowest tablet_id is kept.
    //
    // If tablet count is a power-of-two and tablet boundaries align with the uniform token
    // distribution then _buckets[i] == tablet_id(i).
    utils::chunked_vector<tablet_id> _buckets;
    // Determines tablet boundaries in token space.
    // One entry per tablet.
    // Tablet i owns the range (a, b], where:
    //   a = i == 0 ? minimum_token() : _last_tokens[i-1]
    //   b = _last_tokens[i]
    utils::chunked_vector<dht::raw_token> _last_tokens;
public:
    /// Prepares an empty tablet_id_map for population with a given tablet count.
    /// Must call push_back() tablet_count times to populate the map.
    explicit tablet_id_map(size_t tablet_count);

    /// Builds a map according to given tablet boundaries.
    tablet_id_map(const utils::chunked_vector<dht::raw_token>& last_tokens);

    /// Returns log2 of the bucket count (the intended post-population size of _buckets).
    [[nodiscard]] size_t log2_count() const { return _log2_tablets; }
    /// Returns the number of tablets added so far (tablet count once populated).
    [[nodiscard]] size_t tablet_count() const { return _last_tokens.size(); }
    /// Returns the last (inclusive) token of a given tablet. No bounds checking.
    [[nodiscard]] dht::raw_token get_last_token(tablet_id id) const { return _last_tokens[id.value()]; }
    /// Returns the sorted last-token boundaries, one per tablet.
    [[nodiscard]] const utils::chunked_vector<dht::raw_token>& last_tokens() const { return _last_tokens; }

    /// Adds a new mapping for the next tablet whose range will be recorded to start
    /// after the last token of the previously added entry and end at last_token (inclusive).
    /// last_token in subsequent calls must increase.
    void push_back(dht::token last_token, tablet_id id);

    /// Returns tablet_id of a tablet which owns a given token.
    [[nodiscard]] tablet_id get_tablet_id(dht::token t) const;
    [[nodiscard]] tablet_id get_tablet_id(dht::raw_token t) const;

    bool operator==(const tablet_id_map&) const = default;
    bool operator!=(const tablet_id_map&) const = default;

    /// Heap bytes held by the internal containers.
    [[nodiscard]] size_t external_memory_usage() const;
    /// Returns a copy of the map.
    [[nodiscard]] future<tablet_id_map> clone_gently() const;
    /// Releases contents; cheap since element types are trivially destructible.
    future<> clear_gently();
};
/// Stores information about tablets of a single table.
///
/// The map contains a constant number of tablets, tablet_count().
@@ -559,14 +620,11 @@ class tablet_map {
public:
using tablet_container = utils::chunked_vector<tablet_info>;
using raft_info_container = utils::chunked_vector<tablet_raft_info>;
struct initialized_later {};
private:
using transitions_map = std::unordered_map<tablet_id, tablet_transition_info>;
// The implementation assumes that _tablets.size() is a power of 2:
//
// _tablets.size() == 1 << _log2_tablets
//
tablet_id_map _tablet_ids;
tablet_container _tablets;
size_t _log2_tablets; // log_2(_tablets.size())
transitions_map _transitions;
resize_decision _resize_decision;
tablet_task_info _resize_task_info;
@@ -574,12 +632,15 @@ private:
raft_info_container _raft_info;
// Internal constructor, used by clone() and clone_gently().
tablet_map(tablet_container tablets, size_t log2_tablets, transitions_map transitions,
resize_decision resize_decision, tablet_task_info resize_task_info,
std::optional<repair_scheduler_config> repair_scheduler_config,
raft_info_container raft_info)
: _tablets(std::move(tablets))
, _log2_tablets(log2_tablets)
tablet_map(tablet_id_map ids,
tablet_container tablets,
transitions_map transitions,
resize_decision resize_decision,
tablet_task_info resize_task_info,
std::optional<repair_scheduler_config> repair_scheduler_config,
raft_info_container raft_info)
: _tablet_ids(std::move(ids))
, _tablets(std::move(tablets))
, _transitions(std::move(transitions))
, _resize_decision(resize_decision)
, _resize_task_info(std::move(resize_task_info))
@@ -588,10 +649,30 @@ private:
{}
public:
/// Constructs a tablet map.
/// Tablet boundaries will be uniformly distributed in token space.
///
/// \param tablet_count The desired tablets to allocate. Must be a power of two.
explicit tablet_map(size_t tablet_count, bool with_raft_info = false);
/// Constructs a tablet map.
/// Tablet boundaries are determined by the last_tokens parameter.
/// last_tokens[i] is the last token (inclusive) of tablet_id(i), where i == 0 refers to the first tablet.
///
/// \param last_tokens The token boundaries of tablets. Size must be a power of two.
explicit tablet_map(utils::chunked_vector<dht::raw_token> last_tokens, bool with_raft_info = false);
/// Constructs a tablet map without initializing its contents, for incremental population.
/// Prepared to hold tablet_count tablets.
///
/// It must be populated by calling emplace_tablet() tablet_count times, for every tablet.
/// The tablet_map is considered populated after the whole token space is covered by tablets,
/// no extra call is needed to seal it.
///
/// Methods which are valid until populated:
/// first_tablet(), last_tablet(), next_tablet(), emplace_tablet(), tablet_count(), transitions()
/// Other methods should not be used, as they may not work correctly with unpopulated state.
tablet_map(size_t tablet_count, bool with_raft_info, initialized_later);
tablet_map(tablet_map&&) = default;
tablet_map(const tablet_map&) = delete;
@@ -655,6 +736,7 @@ public:
/// Returns a vector of sorted last tokens for tablets.
future<utils::chunked_vector<token>> get_sorted_tokens() const;
const utils::chunked_vector<dht::raw_token>& get_sorted_raw_tokens() const;
/// Returns the id of the first tablet.
tablet_id first_tablet() const {
@@ -737,6 +819,9 @@ public:
const tablet_task_info& resize_task_info() const;
const std::optional<locator::repair_scheduler_config> get_repair_scheduler_config() const;
public:
/// Use only on tablet_map constructed with initialized_later tag to populate its contents.
/// Must be called for consecutive tablet ids and with increasing last_token.
void emplace_tablet(tablet_id, dht::token last_token, tablet_info);
void set_tablet(tablet_id, tablet_info);
void set_tablet_transition_info(tablet_id, tablet_transition_info);
void set_resize_decision(locator::resize_decision);

View File

@@ -696,7 +696,12 @@ void update_tablet_metadata_change_hint(locator::tablet_metadata_change_hint& hi
namespace {
tablet_id process_one_row(replica::database* db, table_id table, tablet_map& map, tablet_id tid, const cql3::untyped_result_set_row& row) {
using updating = bool_class<struct updating_tag>;
// is_updating == updating::yes means we're making random updates of an already populated tablet_map.
// Otherwise, we're populating a tablet_map constructed with tablet_map::initialized_later.
tablet_id process_one_row(replica::database* db, table_id table, tablet_map& map, tablet_id tid,
const cql3::untyped_result_set_row& row, updating is_updating) {
tablet_replica_set tablet_replicas;
if (row.has("replicas")) {
tablet_replicas = deserialize_replica_set(row.get_view("replicas"));
@@ -755,7 +760,20 @@ tablet_id process_one_row(replica::database* db, table_id table, tablet_map& map
}
tablet_logger.debug("Set sstables_repaired_at={} table={} tablet={}", sstables_repaired_at, table, tid);
map.set_tablet(tid, tablet_info{std::move(tablet_replicas), repair_time, repair_task_info, migration_task_info, sstables_repaired_at});
auto last_token = dht::token::from_int64(row.get_as<int64_t>("last_token"));
auto info = tablet_info{std::move(tablet_replicas), repair_time, repair_task_info, migration_task_info, sstables_repaired_at};
if (is_updating) {
auto old_last_token = map.get_last_token(tid);
if (last_token != old_last_token) {
// Boundary changes require a full tablet_map refresh.
on_internal_error(tablet_logger, format("Inconsistent last_token for table {} tablet {}: {} != {}",
table, tid, last_token, old_last_token));
}
map.set_tablet(tid, std::move(info));
} else {
map.emplace_tablet(tid, last_token, std::move(info));
}
if (row.has("raft_group_id")) {
if (!map.has_raft_info()) {
@@ -785,14 +803,6 @@ tablet_id process_one_row(replica::database* db, table_id table, tablet_map& map
}
}
auto persisted_last_token = dht::token::from_int64(row.get_as<int64_t>("last_token"));
auto current_last_token = map.get_last_token(tid);
if (current_last_token != persisted_last_token) {
tablet_logger.debug("current tablet_map: {}", map);
throw std::runtime_error(format("last_token mismatch between on-disk ({}) and in-memory ({}) tablet map for table {} tablet {}",
persisted_last_token, current_last_token, table, tid));
}
return *map.next_tablet(tid);
}
@@ -825,7 +835,7 @@ struct tablet_metadata_builder {
} else {
auto tablet_count = row.get_as<int>("tablet_count");
auto with_raft_info = db->features().strongly_consistent_tables && row.has("raft_group_id");
auto tmap = tablet_map(tablet_count, with_raft_info);
auto tmap = tablet_map(tablet_count, with_raft_info, tablet_map::initialized_later());
auto first_tablet = tmap.first_tablet();
current = active_tablet_map{table, std::move(tmap), first_tablet};
}
@@ -849,7 +859,7 @@ struct tablet_metadata_builder {
}
if (row.has("last_token")) {
current->tid = process_one_row(db, current->table, current->map, current->tid, row);
current->tid = process_one_row(db, current->table, current->map, current->tid, row, updating::no);
}
}
@@ -958,7 +968,7 @@ do_update_tablet_metadata_rows(replica::database& db, cql3::query_processor& qp,
throw std::runtime_error("Failed to update tablet metadata: updated row is empty");
} else {
tmap.clear_tablet_transition_info(tid);
process_one_row(&db, hint.table_id, tmap, tid, res->one());
process_one_row(&db, hint.table_id, tmap, tid, res->one(), updating::yes);
}
}
}
@@ -1022,7 +1032,7 @@ public:
tablet_sstable_set(schema_ptr s, const storage_group_manager& sgm, const locator::tablet_map& tmap)
: _schema(std::move(s))
, _tablet_map(tmap.tablet_count())
, _tablet_map(tmap.get_sorted_raw_tokens(), false)
{
sgm.for_each_storage_group([this] (size_t id, storage_group& sg) {
auto set = sg.make_sstable_set();

View File

@@ -4100,7 +4100,7 @@ private:
future<tablet_map> split_tablets(token_metadata_ptr tm, table_id table) {
auto& tablets = tm->tablets().get_tablet_map(table);
tablet_map new_tablets(tablets.tablet_count() * 2);
tablet_map new_tablets(tablets.tablet_count() * 2, tablets.has_raft_info(), tablet_map::initialized_later());
for (tablet_id tid : tablets.tablet_ids()) {
co_await coroutine::maybe_yield();
@@ -4110,8 +4110,8 @@ private:
auto& tablet_info = tablets.get_tablet_info(tid);
new_tablets.set_tablet(new_left_tid, tablet_info);
new_tablets.set_tablet(new_right_tid, tablet_info);
new_tablets.emplace_tablet(new_left_tid, tablets.get_split_token(tid), tablet_info);
new_tablets.emplace_tablet(new_right_tid, tablets.get_last_token(tid), tablet_info);
}
lblogger.info("Split tablets for table {}, increasing tablet count from {} to {}",
@@ -4124,7 +4124,7 @@ private:
future<tablet_map> merge_tablets(token_metadata_ptr tm, table_id table) {
auto& tablets = tm->tablets().get_tablet_map(table);
tablet_map new_tablets(tablets.tablet_count() / 2);
tablet_map new_tablets(tablets.tablet_count() / 2, tablets.has_raft_info(), tablet_map::initialized_later());
for (tablet_id tid : new_tablets.tablet_ids()) {
co_await coroutine::maybe_yield();
@@ -4152,7 +4152,7 @@ private:
}
lblogger.debug("Got merged_tablet_info with sstables_repaired_at={}", merged_tablet_info->sstables_repaired_at);
new_tablets.set_tablet(tid, *merged_tablet_info);
new_tablets.emplace_tablet(tid, tablets.get_last_token(old_right_tid), *merged_tablet_info);
}
lblogger.info("Merge tablets for table {}, decreasing tablet count from {} to {}",

View File

@@ -191,6 +191,162 @@ void mutate_tablets(cql_test_env& e, seastar::noncopyable_function<future<>(tabl
mutate_tablets(e, guard, std::move(mutator));
}
// Verifies tablet_id_map lookups when tablet boundaries fall into buckets of
// varying density: sparse (fewer tokens than buckets), aligned, and dense.
SEASTAR_TEST_CASE(test_tablet_id_map_different_density_test) {
    // Exercise different density of buckets by scaling token space, out of which we pick only first few tokens.
    for (int num_tokens : {7, 8, 11, 16}) {
        testlog.info("L{}: {} tokens", __LINE__, num_tokens);
        auto tokens = dht::get_uniform_tokens(num_tokens);
        // Three tablets with boundaries at tokens[1], tokens[3], and the ring end.
        auto map = tablet_id_map(3);
        map.push_back(tokens[1], tablet_id(0));
        map.push_back(tokens[3], tablet_id(1));
        map.push_back(dht::last_token(), tablet_id(2));
        BOOST_REQUIRE_EQUAL(map.get_tablet_id(dht::minimum_token()), tablet_id(0));
        BOOST_REQUIRE_EQUAL(map.get_tablet_id(tokens[0]), tablet_id(0));
        BOOST_REQUIRE_EQUAL(map.get_tablet_id(tokens[1]), tablet_id(0));
        BOOST_REQUIRE_EQUAL(map.get_tablet_id(tokens[2]), tablet_id(1));
        BOOST_REQUIRE_EQUAL(map.get_tablet_id(tokens[3]), tablet_id(1));
        BOOST_REQUIRE_EQUAL(map.get_tablet_id(tokens[4]), tablet_id(2));
        BOOST_REQUIRE_EQUAL(map.get_tablet_id(tokens[5]), tablet_id(2));
        BOOST_REQUIRE_EQUAL(map.get_tablet_id(dht::maximum_token()), tablet_id(2));
    }
    for (int num_tokens : {3, 4, 7, 11}) {
        testlog.info("L{}: {} tokens", __LINE__, num_tokens);
        auto tokens = dht::get_uniform_tokens(num_tokens);
        // Verify tokens concentrated in the front
        {
            auto map = tablet_id_map(3);
            auto last0 = tokens[0];
            auto last1 = tokens[1];
            auto last2 = dht::last_token();
            map.push_back(last0, tablet_id(0));
            map.push_back(last1, tablet_id(1));
            map.push_back(last2, tablet_id(2));
            // Check boundaries themselves and midpoints between them.
            BOOST_REQUIRE_EQUAL(map.get_tablet_id(dht::minimum_token()), tablet_id(0));
            BOOST_REQUIRE_EQUAL(map.get_tablet_id(dht::token::midpoint(dht::first_token(), last0)), tablet_id(0));
            BOOST_REQUIRE_EQUAL(map.get_tablet_id(last0), tablet_id(0));
            BOOST_REQUIRE_EQUAL(map.get_tablet_id(dht::token::midpoint(last0, last1)), tablet_id(1));
            BOOST_REQUIRE_EQUAL(map.get_tablet_id(last1), tablet_id(1));
            BOOST_REQUIRE_EQUAL(map.get_tablet_id(dht::token::midpoint(last1, last2)), tablet_id(2));
            BOOST_REQUIRE_EQUAL(map.get_tablet_id(dht::maximum_token()), tablet_id(2));
        }
        // Verify tokens concentrated in the back
        {
            auto map = tablet_id_map(3);
            auto last0 = tokens[tokens.size() - 3];
            auto last1 = tokens[tokens.size() - 2];
            auto last2 = tokens[tokens.size() - 1];
            map.push_back(last0, tablet_id(0));
            map.push_back(last1, tablet_id(1));
            map.push_back(last2, tablet_id(2));
            BOOST_REQUIRE_EQUAL(map.get_tablet_id(dht::minimum_token()), tablet_id(0));
            BOOST_REQUIRE_EQUAL(map.get_tablet_id(dht::token::midpoint(dht::first_token(), last0)), tablet_id(0));
            BOOST_REQUIRE_EQUAL(map.get_tablet_id(last0), tablet_id(0));
            BOOST_REQUIRE_EQUAL(map.get_tablet_id(dht::token::midpoint(last0, last1)), tablet_id(1));
            BOOST_REQUIRE_EQUAL(map.get_tablet_id(last1), tablet_id(1));
            BOOST_REQUIRE_EQUAL(map.get_tablet_id(dht::token::midpoint(last1, last2)), tablet_id(2));
            BOOST_REQUIRE_EQUAL(map.get_tablet_id(dht::maximum_token()), tablet_id(2));
        }
    }
    return make_ready_future<>();
}
// Builds a 7-tablet map with deliberately uneven boundaries (including a
// tablet owning a single token) and verifies lookups at the uniform grid
// points, at each boundary, and just past each boundary.
SEASTAR_TEST_CASE(test_tablet_id_map_building) {
    auto uniform_tokens = dht::get_uniform_tokens(8);
    auto map = tablet_id_map(7);
    auto tablet_0_last = dht::token(uniform_tokens[0]).next().next();
    auto tablet_1_last = dht::token(uniform_tokens[2]).next();
    auto tablet_2_last = tablet_1_last.next(); // tablet 2 owns only 1 token.
    auto tablet_3_last = tablet_2_last.next().next();
    auto tablet_4_last = dht::token(uniform_tokens[3]);
    auto tablet_5_last = dht::token::midpoint(dht::token(uniform_tokens[3]), dht::token(uniform_tokens[4]));
    auto tablet_6_last = dht::last_token();
    map.push_back(dht::raw_token(tablet_0_last), tablet_id(0));
    map.push_back(dht::raw_token(tablet_1_last), tablet_id(1));
    map.push_back(dht::raw_token(tablet_2_last), tablet_id(2));
    map.push_back(dht::raw_token(tablet_3_last), tablet_id(3));
    map.push_back(dht::raw_token(tablet_4_last), tablet_id(4));
    map.push_back(dht::raw_token(tablet_5_last), tablet_id(5));
    map.push_back(dht::raw_token(tablet_6_last), tablet_id(6));
    // Lookups at the uniform grid points.
    BOOST_REQUIRE_EQUAL(map.get_tablet_id(uniform_tokens[0]), tablet_id(0));
    BOOST_REQUIRE_EQUAL(map.get_tablet_id(uniform_tokens[1]), tablet_id(1));
    BOOST_REQUIRE_EQUAL(map.get_tablet_id(uniform_tokens[2]), tablet_id(1));
    BOOST_REQUIRE_EQUAL(map.get_tablet_id(uniform_tokens[3]), tablet_id(4));
    BOOST_REQUIRE_EQUAL(map.get_tablet_id(uniform_tokens[4]), tablet_id(6));
    BOOST_REQUIRE_EQUAL(map.get_tablet_id(uniform_tokens[5]), tablet_id(6));
    BOOST_REQUIRE_EQUAL(map.get_tablet_id(uniform_tokens[6]), tablet_id(6));
    BOOST_REQUIRE_EQUAL(map.get_tablet_id(uniform_tokens[7]), tablet_id(6));
    // Each boundary (last token) belongs to its own tablet (ranges are (a, b]).
    BOOST_REQUIRE_EQUAL(map.get_tablet_id(tablet_0_last), tablet_id(0));
    BOOST_REQUIRE_EQUAL(map.get_tablet_id(tablet_1_last), tablet_id(1));
    BOOST_REQUIRE_EQUAL(map.get_tablet_id(tablet_2_last), tablet_id(2));
    BOOST_REQUIRE_EQUAL(map.get_tablet_id(tablet_3_last), tablet_id(3));
    BOOST_REQUIRE_EQUAL(map.get_tablet_id(tablet_4_last), tablet_id(4));
    BOOST_REQUIRE_EQUAL(map.get_tablet_id(tablet_5_last), tablet_id(5));
    BOOST_REQUIRE_EQUAL(map.get_tablet_id(tablet_6_last), tablet_id(6));
    // The token just after each boundary belongs to the next tablet.
    BOOST_REQUIRE_EQUAL(map.get_tablet_id(tablet_0_last.next()), tablet_id(1));
    BOOST_REQUIRE_EQUAL(map.get_tablet_id(tablet_1_last.next()), tablet_id(2));
    BOOST_REQUIRE_EQUAL(map.get_tablet_id(tablet_2_last.next()), tablet_id(3));
    BOOST_REQUIRE_EQUAL(map.get_tablet_id(tablet_3_last.next()), tablet_id(4));
    BOOST_REQUIRE_EQUAL(map.get_tablet_id(tablet_4_last.next()), tablet_id(5));
    BOOST_REQUIRE_EQUAL(map.get_tablet_id(tablet_5_last.next()), tablet_id(6));
    // Sentinel tokens map to the first and last tablet respectively.
    BOOST_REQUIRE_EQUAL(map.get_tablet_id(dht::minimum_token()), tablet_id(0));
    BOOST_REQUIRE_EQUAL(map.get_tablet_id(dht::maximum_token()), tablet_id(6));
    return make_ready_future<>();
}
// Generates n random raw tokens, sorted ascending (duplicates possible).
static
utils::chunked_vector<dht::raw_token> get_random_tokens(size_t n) {
    utils::chunked_vector<dht::raw_token> result;
    result.reserve(n);
    while (result.size() < n) {
        result.emplace_back(dht::token::get_random_token());
    }
    std::sort(result.begin(), result.end());
    return result;
}
// Verifies that clone_gently() produces a map equivalent to the original:
// for random boundary sets of various sizes, both maps agree on each
// boundary token, its successor, and its predecessor.
SEASTAR_THREAD_TEST_CASE(test_tablet_id_map_cloning) {
    for (int n_tokens : {1, 2, 7, 13, 16}) {
        auto last_tokens = get_random_tokens(n_tokens);
        // The last tablet must extend to the end of the ring so the whole
        // token space is covered.
        last_tokens.back() = dht::raw_token(dht::last_token());
        auto map = tablet_id_map(last_tokens);
        auto map_cloned = map.clone_gently().get();
        int t_idx = 0;
        for (auto&& t: last_tokens) {
            testlog.trace("last token {} of tablet {} bucket {}", t, t_idx, dht::compaction_group_of(map.log2_count(), t));
            // Each boundary token belongs to its own tablet, in both maps.
            BOOST_REQUIRE_EQUAL(map.get_tablet_id(t), tablet_id(t_idx));
            BOOST_REQUIRE_EQUAL(map.get_tablet_id(t), map_cloned.get_tablet_id(t));
            if (t < last_tokens.back()) {
                // The successor of a boundary belongs to the next tablet.
                auto next_t = dht::token(t).next();
                BOOST_REQUIRE_EQUAL(map.get_tablet_id(next_t), map_cloned.get_tablet_id(next_t));
                BOOST_REQUIRE_EQUAL(map.get_tablet_id(next_t), tablet_id(t_idx + 1));
            }
            if (t > dht::first_token()) {
                // Original and clone agree on the predecessor as well.
                auto prev_t = dht::token(dht::raw_token(t.value - 1));
                BOOST_REQUIRE_EQUAL(map.get_tablet_id(prev_t), map_cloned.get_tablet_id(prev_t));
            }
            ++t_idx;
        }
    }
}
SEASTAR_TEST_CASE(test_tablet_metadata_persistence) {
return do_with_cql_env_thread([] (cql_test_env& e) {
auto h1 = host_id(utils::UUID_gen::get_time_UUID());
@@ -3904,7 +4060,7 @@ SEASTAR_THREAD_TEST_CASE(test_load_balancing_with_random_load) {
keyspaces.push_back(add_keyspace(e, {{topo.dc(), rf}}, initial_tablets));
auto table = add_table(e, keyspaces.back()).get();
mutate_tablets(e, [&] (tablet_metadata& tmeta) -> future<> {
tablet_map tmap(initial_tablets);
tablet_map tmap = tmeta.get_tablet_map(table).clone();
for (auto tid : tmap.tablet_ids()) {
// Choose replicas randomly while loading racks evenly.
std::vector<host_id> replica_hosts = allocate_replicas_in_racks(racks, rf, hosts_by_rack);
@@ -5012,7 +5168,7 @@ static void do_test_load_balancing_merge_colocation(cql_test_env& e, const int n
auto guard = e.get_raft_group0_client().start_operation(as).get();
stm.mutate_token_metadata([&](token_metadata& tm) -> future<> {
tablet_metadata& tmeta = tm.tablets();
tablet_map tmap(initial_tablets);
tablet_map tmap = tmeta.get_tablet_map(table1).clone();
locator::resize_decision decision;
// leaves growing mode, allowing for merge decision.
decision.sequence_number = decision.next_sequence_number();