diff --git a/dht/token.cc b/dht/token.cc index 45910dd339..9b04847b69 100644 --- a/dht/token.cc +++ b/dht/token.cc @@ -230,6 +230,15 @@ dht::token find_first_token_for_shard( } } +size_t +compaction_group_of(unsigned most_significant_bits, dht::raw_token t) { + if (!most_significant_bits) { + return 0; + } + uint64_t adjusted = unbias(t); + return adjusted >> (64 - most_significant_bits); +} + size_t compaction_group_of(unsigned most_significant_bits, const token& t) { if (!most_significant_bits) { diff --git a/dht/token.hh b/dht/token.hh index 36b9d60a21..fcf89c9ede 100644 --- a/dht/token.hh +++ b/dht/token.hh @@ -356,6 +356,7 @@ inline constexpr token bias(uint64_t n) { return token::bias(n); } size_t compaction_group_of(unsigned most_significant_bits, const token& t); +size_t compaction_group_of(unsigned most_significant_bits, dht::raw_token); token last_token_of_compaction_group(unsigned most_significant_bits, size_t group); // Generates 'count' tokens uniformly distributed in the token ring. Sorted. 
diff --git a/gms/feature_service.hh b/gms/feature_service.hh index 4e44b04471..e7431ce548 100644 --- a/gms/feature_service.hh +++ b/gms/feature_service.hh @@ -180,6 +180,7 @@ public: gms::feature batchlog_v2 { *this, "BATCHLOG_V2"sv }; gms::feature vnodes_to_tablets_migrations { *this, "VNODES_TO_TABLETS_MIGRATIONS"sv }; gms::feature writetime_ttl_individual_element { *this, "WRITETIME_TTL_INDIVIDUAL_ELEMENT"sv }; + gms::feature arbitrary_tablet_boundaries { *this, "ARBITRARY_TABLET_BOUNDARIES"sv }; public: const std::unordered_map>& registered_features() const; diff --git a/locator/tablets.cc b/locator/tablets.cc index e0f6b4e7b6..4e982c26b0 100644 --- a/locator/tablets.cc +++ b/locator/tablets.cc @@ -475,8 +475,26 @@ bool tablet_metadata::operator==(const tablet_metadata& o) const { } tablet_map::tablet_map(size_t tablet_count, bool with_raft_info) - : _log2_tablets(log2ceil(tablet_count)) { - if (tablet_count != 1ul << _log2_tablets) { + : tablet_map(dht::get_uniform_tokens(tablet_count), with_raft_info) +{ } + +tablet_map::tablet_map(utils::chunked_vector last_tokens, bool with_raft_info) + : _tablet_ids(std::move(last_tokens)) +{ + if (_tablet_ids.tablet_count() != 1ul << _tablet_ids.log2_count()) { + on_internal_error(tablet_logger, format("Tablet count not a power of 2: {}", _tablet_ids.tablet_count())); + } + + _tablets.resize(_tablet_ids.tablet_count()); + if (with_raft_info) { + _raft_info.resize(_tablet_ids.tablet_count()); + } +} + +tablet_map::tablet_map(size_t tablet_count, bool with_raft_info, tablet_map::initialized_later) + : _tablet_ids(tablet_count) +{ + if (tablet_count != 1ul << _tablet_ids.log2_count()) { on_internal_error(tablet_logger, format("Tablet count not a power of 2: {}", tablet_count)); } _tablets.resize(tablet_count); @@ -486,11 +504,13 @@ tablet_map::tablet_map(size_t tablet_count, bool with_raft_info) } tablet_map tablet_map::clone() const { - return tablet_map(_tablets, _log2_tablets, _transitions, _resize_decision, 
_resize_task_info, - _repair_scheduler_config, _raft_info); + return tablet_map(_tablet_ids, _tablets, _transitions, _resize_decision, _resize_task_info, + _repair_scheduler_config, _raft_info); } future tablet_map::clone_gently() const { + auto ids = co_await _tablet_ids.clone_gently(); + tablet_container tablets; tablets.reserve(_tablets.size()); for (const auto& t : _tablets) { @@ -512,8 +532,8 @@ future tablet_map::clone_gently() const { co_await coroutine::maybe_yield(); } - co_return tablet_map(std::move(tablets), _log2_tablets, std::move(transitions), _resize_decision, - _resize_task_info, _repair_scheduler_config, std::move(raft_info)); + co_return tablet_map(std::move(ids), std::move(tablets), std::move(transitions), + _resize_decision, _resize_task_info, _repair_scheduler_config, std::move(raft_info)); } void tablet_map::check_tablet_id(tablet_id id) const { @@ -528,7 +548,7 @@ const tablet_info& tablet_map::get_tablet_info(tablet_id id) const { } tablet_id tablet_map::get_tablet_id(token t) const { - return tablet_id(dht::compaction_group_of(_log2_tablets, t)); + return _tablet_ids.get_tablet_id(t); } dht::token tablet_map::get_split_token(tablet_id id) const { @@ -549,7 +569,7 @@ std::pair tablet_map::get_tablet_id_and_range_side dht::token tablet_map::get_last_token(tablet_id id) const { check_tablet_id(id); - return dht::last_token_of_compaction_group(_log2_tablets, size_t(id)); + return dht::token(_tablet_ids.get_last_token(id)); } dht::token tablet_map::get_first_token(tablet_id id) const { @@ -652,11 +672,21 @@ future> tablet_map::get_sorted_tokens() const { co_return tokens; } +const utils::chunked_vector& tablet_map::get_sorted_raw_tokens() const { + return _tablet_ids.last_tokens(); +} + void tablet_map::set_tablet(tablet_id id, tablet_info info) { check_tablet_id(id); _tablets[size_t(id)] = std::move(info); } +void tablet_map::emplace_tablet(tablet_id id, dht::token last_token, tablet_info info) { + check_tablet_id(id); + 
_tablet_ids.push_back(last_token, id); + _tablets[size_t(id)] = std::move(info); +} + void tablet_map::set_tablet_transition_info(tablet_id id, tablet_transition_info info) { check_tablet_id(id); _transitions.insert_or_assign(id, std::move(info)); @@ -722,7 +752,8 @@ bool tablet_map::has_replica(tablet_id tid, tablet_replica r) const { } future<> tablet_map::clear_gently() { - return utils::clear_gently(_tablets); + co_await utils::clear_gently(_tablets); + co_await utils::clear_gently(_tablet_ids); } const tablet_transition_info* tablet_map::get_tablet_transition_info(tablet_id id) const { @@ -879,8 +910,13 @@ tablet_repair_incremental_mode tablet_repair_incremental_mode_from_string(const return tablet_repair_incremental_mode_from_name.at(name); } +size_t tablet_id_map::external_memory_usage() const { + return _buckets.external_memory_usage() +_last_tokens.external_memory_usage(); +} + size_t tablet_map::external_memory_usage() const { size_t result = _tablets.external_memory_usage(); + result += _tablet_ids.external_memory_usage(); for (auto&& tablet : _tablets) { result += tablet.replicas.external_memory_usage(); } @@ -1712,6 +1748,103 @@ rack_list get_allowed_racks(const locator::token_metadata& tm, const sstring& dc return {}; } +tablet_id_map::tablet_id_map(size_t tablet_count) + : _log2_tablets(log2ceil(tablet_count)) +{ + _buckets.reserve(size_t(1) << _log2_tablets); + _last_tokens.reserve(tablet_count); +} + +tablet_id_map::tablet_id_map(const utils::chunked_vector& last_tokens) + : tablet_id_map(last_tokens.size()) +{ + for (size_t i = 0; i < last_tokens.size(); i++) { + push_back(last_tokens[i], tablet_id(i)); + } +} + +void tablet_id_map::push_back(dht::token last_token, tablet_id id) { + auto i = dht::compaction_group_of(_log2_tablets, last_token); + + if (_buckets.empty() && size_t(id) != 0) { + on_internal_error(tablet_logger, fmt::format("tablet_id_map::push_back: First tablet must have id 0, not {}", id)); + } + + if (!_buckets.empty() && (id <= 
_buckets.back() || last_token <= _last_tokens.back())) { + on_internal_error(tablet_logger, fmt::format("tablet_id_map::push_back: Order violated: last_token={} (prev={}), id={} (prev={}), bucket={}", + last_token, _last_tokens.back(), id, tablet_id(_buckets.back()), i)); + } + + _last_tokens.emplace_back(last_token); + + while (_buckets.size() < i + 1) { + _buckets.push_back(id); + } +} + +tablet_id tablet_id_map::get_tablet_id(dht::token t) const { + if (t.is_maximum()) { + return tablet_id(_last_tokens.size() - 1); + } else if (t.is_minimum()) { + return tablet_id(0); + } else { + return get_tablet_id(dht::raw_token(t)); + } +} + +tablet_id tablet_id_map::get_tablet_id(dht::raw_token t) const { + auto bucket = dht::compaction_group_of(_log2_tablets, t); + if (bucket >= _buckets.size()) { + throw std::out_of_range(fmt::format("tablet_id_map: No mapping for {}, bucket: {} bucket count: {}", t, bucket, _buckets.size())); + } + + // The range [low_id, high_id] tracks range of ids which may own token t. + auto low_id = size_t(_buckets[bucket]); + + // Common case: the token falls into the tablet which owns the last token of a bucket. + // This is always the case if tablets are uniformly distributed in token space and the + // tablet count is a power of 2. + if (t <= _last_tokens[low_id]) { + return tablet_id(low_id); + } + + ++low_id; + auto high_id = (bucket + 1 >= _buckets.size()) ? 
tablet_count() - 1 : size_t(_buckets[bucket + 1]); + + if (high_id - low_id <= 7) { // linear search + while (low_id <= high_id) { + if (t <= _last_tokens[low_id]) { + return tablet_id(low_id); + } + ++low_id; + } + } else { // binary search + auto end = _last_tokens.begin() + high_id + 1; + auto it = std::lower_bound(_last_tokens.begin() + low_id, end, t); + if (it != end) { + return tablet_id(std::distance(_last_tokens.begin(), it)); + } + } + throw std::out_of_range(fmt::format("tablet_id_map: No mapping for {}, bucket {}, last token of tablet {} is {}", + t, bucket, high_id, _last_tokens[size_t(high_id)])); +} + +future tablet_id_map::clone_gently() const { + tablet_id_map result(_last_tokens.size()); + static_assert(std::is_trivially_copy_assignable_v); + static_assert(std::is_trivially_copy_assignable_v); + result._buckets = _buckets; + result._last_tokens = _last_tokens; + co_return std::move(result); +} + +future<> tablet_id_map::clear_gently() { + // Storage is trivially destructible, so just let the destructor do the job. + static_assert(std::is_trivially_destructible_v); + static_assert(std::is_trivially_destructible_v); + return make_ready_future<>(); +} + } auto fmt::formatter::format(const locator::resize_decision_way& way, fmt::format_context& ctx) const diff --git a/locator/tablets.hh b/locator/tablets.hh index 0d0893dc24..2cb8198d91 100644 --- a/locator/tablets.hh +++ b/locator/tablets.hh @@ -540,6 +540,67 @@ public: no_such_tablet_map(const table_id& id); }; +/// Allows looking up tablet_id of the tablet which owns a given token. +/// +/// Conceptually like std::map, where the token +/// is the split point (last token) for a given tablet, +/// and get_tablet_id(token) is like lower_bound(token)->second. +/// +/// Optimized for the typical case where tokens are more or less evenly +/// spaced in the token space, in which case lookup will be constant time. +class tablet_id_map { + // log2 of intended number of _buckets post-population. 
+ uint8_t _log2_tablets; + + // Covers the whole token space with assignment of tablet_id to token ranges. + // The elements describe a token range corresponding to the i-th range in the + // uniformly split token space, split into 2^_log2_tablets ranges. + // + // The tablet_id assigned to the range means that this tablet owns the first + // token in that range. There could be other tablets overlapping with the + // bucket's range, but only the lowest tablet_id is kept. + // + // If tablet count is a power-of-two and tablet boundaries align with the uniform token + // distribution then _buckets[i] == tablet_id(i). + utils::chunked_vector _buckets; + + // Determines tablet boundaries in token space. + // One entry per tablet. + // Tablet i owns the range (a, b], where: + // a = i == 0 ? minimum_token() : _last_tokens[i-1] + // b = _last_tokens[i] + utils::chunked_vector _last_tokens; +public: + /// Prepares an empty tablet_id_map for population with a given tablet count. + /// Must call push_back() tablet_count times to populate the map. + explicit tablet_id_map(size_t tablet_count); + + /// Builds a map according to given tablet boundaries. + tablet_id_map(const utils::chunked_vector& last_tokens); + + [[nodiscard]] size_t log2_count() const { return _log2_tablets; } + [[nodiscard]] size_t tablet_count() const { return _last_tokens.size(); } + [[nodiscard]] dht::raw_token get_last_token(tablet_id id) const { return _last_tokens[id.value()]; } + [[nodiscard]] const utils::chunked_vector& last_tokens() const { return _last_tokens; } + + /// Adds a new mapping for the next tablet whose range will be recorded to start + /// after the last token of the previously added entry and end at last_token (inclusive). + /// last_token in subsequent calls must increase. + void push_back(dht::token last_token, tablet_id id); + + /// Returns tablet_id of a tablet which owns a given token. 
+ [[nodiscard]] tablet_id get_tablet_id(dht::token t) const; + [[nodiscard]] tablet_id get_tablet_id(dht::raw_token t) const; + + bool operator==(const tablet_id_map&) const = default; + bool operator!=(const tablet_id_map&) const = default; + + [[nodiscard]] size_t external_memory_usage() const; + + [[nodiscard]] future clone_gently() const; + future<> clear_gently(); +}; + /// Stores information about tablets of a single table. /// /// The map contains a constant number of tablets, tablet_count(). @@ -559,14 +620,11 @@ class tablet_map { public: using tablet_container = utils::chunked_vector; using raft_info_container = utils::chunked_vector; + struct initialized_later {}; private: using transitions_map = std::unordered_map; - // The implementation assumes that _tablets.size() is a power of 2: - // - // _tablets.size() == 1 << _log2_tablets - // + tablet_id_map _tablet_ids; tablet_container _tablets; - size_t _log2_tablets; // log_2(_tablets.size()) transitions_map _transitions; resize_decision _resize_decision; tablet_task_info _resize_task_info; @@ -574,12 +632,15 @@ private: raft_info_container _raft_info; // Internal constructor, used by clone() and clone_gently(). - tablet_map(tablet_container tablets, size_t log2_tablets, transitions_map transitions, - resize_decision resize_decision, tablet_task_info resize_task_info, - std::optional repair_scheduler_config, - raft_info_container raft_info) - : _tablets(std::move(tablets)) - , _log2_tablets(log2_tablets) + tablet_map(tablet_id_map ids, + tablet_container tablets, + transitions_map transitions, + resize_decision resize_decision, + tablet_task_info resize_task_info, + std::optional repair_scheduler_config, + raft_info_container raft_info) + : _tablet_ids(std::move(ids)) + , _tablets(std::move(tablets)) , _transitions(std::move(transitions)) , _resize_decision(resize_decision) , _resize_task_info(std::move(resize_task_info)) @@ -588,10 +649,30 @@ private: {} public: /// Constructs a tablet map. 
+ /// Tablet boundaries will be uniformly distributed in token space. /// /// \param tablet_count The desired tablets to allocate. Must be a power of two. explicit tablet_map(size_t tablet_count, bool with_raft_info = false); + /// Constructs a tablet map. + /// Tablet boundaries are determined by the last_tokens parameter. + /// last_tokens[i] is the last token (inclusive) of tablet_id(i), where i == 0 refers to the first tablet. + /// + /// \param last_tokens The token boundaries of tablets. Size must be a power of two. + explicit tablet_map(utils::chunked_vector last_tokens, bool with_raft_info = false); + + /// Constructs a tablet map without initializing its contents, for incremental population. + /// Prepared to hold tablet_count tablets. + /// + /// It must be populated by calling emplace_tablet() tablet_count times, for every tablet. + /// The tablet_map is considered populated after the whole token space is covered by tablets, + /// no extra call is needed to seal it. + /// + /// Methods which are valid until populated: + /// first_tablet(), last_tablet(), next_tablet(), emplace_tablet(), tablet_count(), transitions() + /// Other methods should not be used, as they may not work correctly with unpopulated state. + tablet_map(size_t tablet_count, bool with_raft_info, initialized_later); + tablet_map(tablet_map&&) = default; tablet_map(const tablet_map&) = delete; @@ -655,6 +736,7 @@ public: /// Returns a vector of sorted last tokens for tablets. future> get_sorted_tokens() const; + const utils::chunked_vector& get_sorted_raw_tokens() const; /// Returns the id of the first tablet. tablet_id first_tablet() const { @@ -737,6 +819,9 @@ public: const tablet_task_info& resize_task_info() const; const std::optional get_repair_scheduler_config() const; public: + /// Use only on tablet_map constructed with initialized_later tag to populate its contents. + /// Must be called for consecutive tablet ids and with increasing last_token. 
+ void emplace_tablet(tablet_id, dht::token last_token, tablet_info); void set_tablet(tablet_id, tablet_info); void set_tablet_transition_info(tablet_id, tablet_transition_info); void set_resize_decision(locator::resize_decision); diff --git a/replica/tablets.cc b/replica/tablets.cc index 0737ff21d4..0c4bb8ad8a 100644 --- a/replica/tablets.cc +++ b/replica/tablets.cc @@ -696,7 +696,12 @@ void update_tablet_metadata_change_hint(locator::tablet_metadata_change_hint& hi namespace { -tablet_id process_one_row(replica::database* db, table_id table, tablet_map& map, tablet_id tid, const cql3::untyped_result_set_row& row) { +using updating = bool_class; + +// is_updating == updating::yes means we're making random updates of an already populated tablet_map. +// Otherwise, we're populating a tablet_map constructed with tablet_map::initialized_later. +tablet_id process_one_row(replica::database* db, table_id table, tablet_map& map, tablet_id tid, + const cql3::untyped_result_set_row& row, updating is_updating) { tablet_replica_set tablet_replicas; if (row.has("replicas")) { tablet_replicas = deserialize_replica_set(row.get_view("replicas")); @@ -755,7 +760,20 @@ tablet_id process_one_row(replica::database* db, table_id table, tablet_map& map } tablet_logger.debug("Set sstables_repaired_at={} table={} tablet={}", sstables_repaired_at, table, tid); - map.set_tablet(tid, tablet_info{std::move(tablet_replicas), repair_time, repair_task_info, migration_task_info, sstables_repaired_at}); + + auto last_token = dht::token::from_int64(row.get_as("last_token")); + auto info = tablet_info{std::move(tablet_replicas), repair_time, repair_task_info, migration_task_info, sstables_repaired_at}; + if (is_updating) { + auto old_last_token = map.get_last_token(tid); + if (last_token != old_last_token) { + // Boundary changes require a full tablet_map refresh. 
+ on_internal_error(tablet_logger, format("Inconsistent last_token for table {} tablet {}: {} != {}", + table, tid, last_token, old_last_token)); + } + map.set_tablet(tid, std::move(info)); + } else { + map.emplace_tablet(tid, last_token, std::move(info)); + } if (row.has("raft_group_id")) { if (!map.has_raft_info()) { @@ -785,14 +803,6 @@ tablet_id process_one_row(replica::database* db, table_id table, tablet_map& map } } - auto persisted_last_token = dht::token::from_int64(row.get_as("last_token")); - auto current_last_token = map.get_last_token(tid); - if (current_last_token != persisted_last_token) { - tablet_logger.debug("current tablet_map: {}", map); - throw std::runtime_error(format("last_token mismatch between on-disk ({}) and in-memory ({}) tablet map for table {} tablet {}", - persisted_last_token, current_last_token, table, tid)); - } - return *map.next_tablet(tid); } @@ -825,7 +835,7 @@ struct tablet_metadata_builder { } else { auto tablet_count = row.get_as("tablet_count"); auto with_raft_info = db->features().strongly_consistent_tables && row.has("raft_group_id"); - auto tmap = tablet_map(tablet_count, with_raft_info); + auto tmap = tablet_map(tablet_count, with_raft_info, tablet_map::initialized_later()); auto first_tablet = tmap.first_tablet(); current = active_tablet_map{table, std::move(tmap), first_tablet}; } @@ -849,7 +859,7 @@ struct tablet_metadata_builder { } if (row.has("last_token")) { - current->tid = process_one_row(db, current->table, current->map, current->tid, row); + current->tid = process_one_row(db, current->table, current->map, current->tid, row, updating::no); } } @@ -958,7 +968,7 @@ do_update_tablet_metadata_rows(replica::database& db, cql3::query_processor& qp, throw std::runtime_error("Failed to update tablet metadata: updated row is empty"); } else { tmap.clear_tablet_transition_info(tid); - process_one_row(&db, hint.table_id, tmap, tid, res->one()); + process_one_row(&db, hint.table_id, tmap, tid, res->one(), updating::yes); 
} } } @@ -1022,7 +1032,7 @@ public: tablet_sstable_set(schema_ptr s, const storage_group_manager& sgm, const locator::tablet_map& tmap) : _schema(std::move(s)) - , _tablet_map(tmap.tablet_count()) + , _tablet_map(tmap.get_sorted_raw_tokens(), false) { sgm.for_each_storage_group([this] (size_t id, storage_group& sg) { auto set = sg.make_sstable_set(); diff --git a/service/tablet_allocator.cc b/service/tablet_allocator.cc index e3a99ce3e1..c99e2048ec 100644 --- a/service/tablet_allocator.cc +++ b/service/tablet_allocator.cc @@ -4100,7 +4100,7 @@ private: future split_tablets(token_metadata_ptr tm, table_id table) { auto& tablets = tm->tablets().get_tablet_map(table); - tablet_map new_tablets(tablets.tablet_count() * 2); + tablet_map new_tablets(tablets.tablet_count() * 2, tablets.has_raft_info(), tablet_map::initialized_later()); for (tablet_id tid : tablets.tablet_ids()) { co_await coroutine::maybe_yield(); @@ -4110,8 +4110,8 @@ private: auto& tablet_info = tablets.get_tablet_info(tid); - new_tablets.set_tablet(new_left_tid, tablet_info); - new_tablets.set_tablet(new_right_tid, tablet_info); + new_tablets.emplace_tablet(new_left_tid, tablets.get_split_token(tid), tablet_info); + new_tablets.emplace_tablet(new_right_tid, tablets.get_last_token(tid), tablet_info); } lblogger.info("Split tablets for table {}, increasing tablet count from {} to {}", @@ -4124,7 +4124,7 @@ private: future merge_tablets(token_metadata_ptr tm, table_id table) { auto& tablets = tm->tablets().get_tablet_map(table); - tablet_map new_tablets(tablets.tablet_count() / 2); + tablet_map new_tablets(tablets.tablet_count() / 2, tablets.has_raft_info(), tablet_map::initialized_later()); for (tablet_id tid : new_tablets.tablet_ids()) { co_await coroutine::maybe_yield(); @@ -4152,7 +4152,7 @@ private: } lblogger.debug("Got merged_tablet_info with sstables_repaired_at={}", merged_tablet_info->sstables_repaired_at); - new_tablets.set_tablet(tid, *merged_tablet_info); + new_tablets.emplace_tablet(tid, 
tablets.get_last_token(old_right_tid), *merged_tablet_info); } lblogger.info("Merge tablets for table {}, decreasing tablet count from {} to {}", diff --git a/test/boost/tablets_test.cc b/test/boost/tablets_test.cc index 8f8a115f6b..81031cf1bf 100644 --- a/test/boost/tablets_test.cc +++ b/test/boost/tablets_test.cc @@ -191,6 +191,162 @@ void mutate_tablets(cql_test_env& e, seastar::noncopyable_function(tabl mutate_tablets(e, guard, std::move(mutator)); } +SEASTAR_TEST_CASE(test_tablet_id_map_different_density_test) { + // Exercise different density of buckets by scaling token space, out of which we pick only first few tokens. + for (int num_tokens : {7, 8, 11, 16}) { + testlog.info("L{}: {} tokens", __LINE__, num_tokens); + auto tokens = dht::get_uniform_tokens(num_tokens); + + auto map = tablet_id_map(3); + map.push_back(tokens[1], tablet_id(0)); + map.push_back(tokens[3], tablet_id(1)); + map.push_back(dht::last_token(), tablet_id(2)); + + BOOST_REQUIRE_EQUAL(map.get_tablet_id(dht::minimum_token()), tablet_id(0)); + BOOST_REQUIRE_EQUAL(map.get_tablet_id(tokens[0]), tablet_id(0)); + BOOST_REQUIRE_EQUAL(map.get_tablet_id(tokens[1]), tablet_id(0)); + BOOST_REQUIRE_EQUAL(map.get_tablet_id(tokens[2]), tablet_id(1)); + BOOST_REQUIRE_EQUAL(map.get_tablet_id(tokens[3]), tablet_id(1)); + BOOST_REQUIRE_EQUAL(map.get_tablet_id(tokens[4]), tablet_id(2)); + BOOST_REQUIRE_EQUAL(map.get_tablet_id(tokens[5]), tablet_id(2)); + BOOST_REQUIRE_EQUAL(map.get_tablet_id(dht::maximum_token()), tablet_id(2)); + } + + for (int num_tokens : {3, 4, 7, 11}) { + testlog.info("L{}: {} tokens", __LINE__, num_tokens); + auto tokens = dht::get_uniform_tokens(num_tokens); + + // Verify tokens concentrated in the front + { + auto map = tablet_id_map(3); + auto last0 = tokens[0]; + auto last1 = tokens[1]; + auto last2 = dht::last_token(); + map.push_back(last0, tablet_id(0)); + map.push_back(last1, tablet_id(1)); + map.push_back(last2, tablet_id(2)); + + 
BOOST_REQUIRE_EQUAL(map.get_tablet_id(dht::minimum_token()), tablet_id(0)); + BOOST_REQUIRE_EQUAL(map.get_tablet_id(dht::token::midpoint(dht::first_token(), last0)), tablet_id(0)); + BOOST_REQUIRE_EQUAL(map.get_tablet_id(last0), tablet_id(0)); + BOOST_REQUIRE_EQUAL(map.get_tablet_id(dht::token::midpoint(last0, last1)), tablet_id(1)); + BOOST_REQUIRE_EQUAL(map.get_tablet_id(last1), tablet_id(1)); + BOOST_REQUIRE_EQUAL(map.get_tablet_id(dht::token::midpoint(last1, last2)), tablet_id(2)); + BOOST_REQUIRE_EQUAL(map.get_tablet_id(dht::maximum_token()), tablet_id(2)); + } + + // Verify tokens concentrated in the back + { + auto map = tablet_id_map(3); + auto last0 = tokens[tokens.size() - 3]; + auto last1 = tokens[tokens.size() - 2]; + auto last2 = tokens[tokens.size() - 1]; + map.push_back(last0, tablet_id(0)); + map.push_back(last1, tablet_id(1)); + map.push_back(last2, tablet_id(2)); + + BOOST_REQUIRE_EQUAL(map.get_tablet_id(dht::minimum_token()), tablet_id(0)); + BOOST_REQUIRE_EQUAL(map.get_tablet_id(dht::token::midpoint(dht::first_token(), last0)), tablet_id(0)); + BOOST_REQUIRE_EQUAL(map.get_tablet_id(last0), tablet_id(0)); + BOOST_REQUIRE_EQUAL(map.get_tablet_id(dht::token::midpoint(last0, last1)), tablet_id(1)); + BOOST_REQUIRE_EQUAL(map.get_tablet_id(last1), tablet_id(1)); + BOOST_REQUIRE_EQUAL(map.get_tablet_id(dht::token::midpoint(last1, last2)), tablet_id(2)); + BOOST_REQUIRE_EQUAL(map.get_tablet_id(dht::maximum_token()), tablet_id(2)); + } + } + + return make_ready_future<>(); +} + +SEASTAR_TEST_CASE(test_tablet_id_map_building) { + auto uniform_tokens = dht::get_uniform_tokens(8); + + auto map = tablet_id_map(7); + auto tablet_0_last = dht::token(uniform_tokens[0]).next().next(); + auto tablet_1_last = dht::token(uniform_tokens[2]).next(); + auto tablet_2_last = tablet_1_last.next(); // tablet 2 owns only 1 token. 
+ auto tablet_3_last = tablet_2_last.next().next(); + auto tablet_4_last = dht::token(uniform_tokens[3]); + auto tablet_5_last = dht::token::midpoint(dht::token(uniform_tokens[3]), dht::token(uniform_tokens[4])); + auto tablet_6_last = dht::last_token(); + + map.push_back(dht::raw_token(tablet_0_last), tablet_id(0)); + map.push_back(dht::raw_token(tablet_1_last), tablet_id(1)); + map.push_back(dht::raw_token(tablet_2_last), tablet_id(2)); + map.push_back(dht::raw_token(tablet_3_last), tablet_id(3)); + map.push_back(dht::raw_token(tablet_4_last), tablet_id(4)); + map.push_back(dht::raw_token(tablet_5_last), tablet_id(5)); + map.push_back(dht::raw_token(tablet_6_last), tablet_id(6)); + + BOOST_REQUIRE_EQUAL(map.get_tablet_id(uniform_tokens[0]), tablet_id(0)); + BOOST_REQUIRE_EQUAL(map.get_tablet_id(uniform_tokens[1]), tablet_id(1)); + BOOST_REQUIRE_EQUAL(map.get_tablet_id(uniform_tokens[2]), tablet_id(1)); + BOOST_REQUIRE_EQUAL(map.get_tablet_id(uniform_tokens[3]), tablet_id(4)); + BOOST_REQUIRE_EQUAL(map.get_tablet_id(uniform_tokens[4]), tablet_id(6)); + BOOST_REQUIRE_EQUAL(map.get_tablet_id(uniform_tokens[5]), tablet_id(6)); + BOOST_REQUIRE_EQUAL(map.get_tablet_id(uniform_tokens[6]), tablet_id(6)); + BOOST_REQUIRE_EQUAL(map.get_tablet_id(uniform_tokens[7]), tablet_id(6)); + + BOOST_REQUIRE_EQUAL(map.get_tablet_id(tablet_0_last), tablet_id(0)); + BOOST_REQUIRE_EQUAL(map.get_tablet_id(tablet_1_last), tablet_id(1)); + BOOST_REQUIRE_EQUAL(map.get_tablet_id(tablet_2_last), tablet_id(2)); + BOOST_REQUIRE_EQUAL(map.get_tablet_id(tablet_3_last), tablet_id(3)); + BOOST_REQUIRE_EQUAL(map.get_tablet_id(tablet_4_last), tablet_id(4)); + BOOST_REQUIRE_EQUAL(map.get_tablet_id(tablet_5_last), tablet_id(5)); + BOOST_REQUIRE_EQUAL(map.get_tablet_id(tablet_6_last), tablet_id(6)); + + BOOST_REQUIRE_EQUAL(map.get_tablet_id(tablet_0_last.next()), tablet_id(1)); + BOOST_REQUIRE_EQUAL(map.get_tablet_id(tablet_1_last.next()), tablet_id(2)); + 
BOOST_REQUIRE_EQUAL(map.get_tablet_id(tablet_2_last.next()), tablet_id(3)); + BOOST_REQUIRE_EQUAL(map.get_tablet_id(tablet_3_last.next()), tablet_id(4)); + BOOST_REQUIRE_EQUAL(map.get_tablet_id(tablet_4_last.next()), tablet_id(5)); + BOOST_REQUIRE_EQUAL(map.get_tablet_id(tablet_5_last.next()), tablet_id(6)); + + BOOST_REQUIRE_EQUAL(map.get_tablet_id(dht::minimum_token()), tablet_id(0)); + BOOST_REQUIRE_EQUAL(map.get_tablet_id(dht::maximum_token()), tablet_id(6)); + + return make_ready_future<>(); +} + +static +utils::chunked_vector get_random_tokens(size_t n) { + utils::chunked_vector last_tokens; + for (size_t i = 0; i < n; ++i) { + last_tokens.push_back(dht::raw_token(dht::token::get_random_token())); + } + std::sort(last_tokens.begin(), last_tokens.end()); + return last_tokens; +} + +SEASTAR_THREAD_TEST_CASE(test_tablet_id_map_cloning) { + for (int n_tokens : {1, 2, 7, 13, 16}) { + auto last_tokens = get_random_tokens(n_tokens); + last_tokens.back() = dht::raw_token(dht::last_token()); + auto map = tablet_id_map(last_tokens); + auto map_cloned = map.clone_gently().get(); + + int t_idx = 0; + for (auto&& t: last_tokens) { + testlog.trace("last token {} of tablet {} bucket {}", t, t_idx, dht::compaction_group_of(map.log2_count(), t)); + BOOST_REQUIRE_EQUAL(map.get_tablet_id(t), tablet_id(t_idx)); + BOOST_REQUIRE_EQUAL(map.get_tablet_id(t), map_cloned.get_tablet_id(t)); + + if (t < last_tokens.back()) { + auto next_t = dht::token(t).next(); + BOOST_REQUIRE_EQUAL(map.get_tablet_id(next_t), map_cloned.get_tablet_id(next_t)); + BOOST_REQUIRE_EQUAL(map.get_tablet_id(next_t), tablet_id(t_idx + 1)); + } + + if (t > dht::first_token()) { + auto prev_t = dht::token(dht::raw_token(t.value - 1)); + BOOST_REQUIRE_EQUAL(map.get_tablet_id(prev_t), map_cloned.get_tablet_id(prev_t)); + } + + ++t_idx; + } + } +} + SEASTAR_TEST_CASE(test_tablet_metadata_persistence) { return do_with_cql_env_thread([] (cql_test_env& e) { auto h1 = host_id(utils::UUID_gen::get_time_UUID()); @@ 
-3904,7 +4060,7 @@ SEASTAR_THREAD_TEST_CASE(test_load_balancing_with_random_load) { keyspaces.push_back(add_keyspace(e, {{topo.dc(), rf}}, initial_tablets)); auto table = add_table(e, keyspaces.back()).get(); mutate_tablets(e, [&] (tablet_metadata& tmeta) -> future<> { - tablet_map tmap(initial_tablets); + tablet_map tmap = tmeta.get_tablet_map(table).clone(); for (auto tid : tmap.tablet_ids()) { // Choose replicas randomly while loading racks evenly. std::vector replica_hosts = allocate_replicas_in_racks(racks, rf, hosts_by_rack); @@ -5012,7 +5168,7 @@ static void do_test_load_balancing_merge_colocation(cql_test_env& e, const int n auto guard = e.get_raft_group0_client().start_operation(as).get(); stm.mutate_token_metadata([&](token_metadata& tm) -> future<> { tablet_metadata& tmeta = tm.tablets(); - tablet_map tmap(initial_tablets); + tablet_map tmap = tmeta.get_tablet_map(table1).clone(); locator::resize_decision decision; // leaves growing mode, allowing for merge decision. decision.sequence_number = decision.next_sequence_number();