/* * Copyright (C) 2023-present ScyllaDB */ /* * SPDX-License-Identifier: AGPL-3.0-or-later */ #include "locator/tablet_replication_strategy.hh" #include "locator/tablets.hh" #include "locator/tablet_metadata_guard.hh" #include "locator/tablet_sharder.hh" #include "locator/token_range_splitter.hh" #include "db/system_keyspace.hh" #include "replica/database.hh" #include "utils/stall_free.hh" #include "gms/feature_service.hh" #include #include #include #include #include namespace locator { seastar::logger tablet_logger("tablets"); static write_replica_set_selector get_selector_for_writes(tablet_transition_stage stage) { switch (stage) { case tablet_transition_stage::allow_write_both_read_old: return write_replica_set_selector::previous; case tablet_transition_stage::write_both_read_old: return write_replica_set_selector::both; case tablet_transition_stage::streaming: return write_replica_set_selector::both; case tablet_transition_stage::write_both_read_new: return write_replica_set_selector::both; case tablet_transition_stage::use_new: return write_replica_set_selector::next; case tablet_transition_stage::cleanup: return write_replica_set_selector::next; case tablet_transition_stage::cleanup_target: return write_replica_set_selector::previous; case tablet_transition_stage::revert_migration: return write_replica_set_selector::previous; case tablet_transition_stage::end_migration: return write_replica_set_selector::next; } on_internal_error(tablet_logger, format("Invalid tablet transition stage: {}", static_cast(stage))); } static read_replica_set_selector get_selector_for_reads(tablet_transition_stage stage) { switch (stage) { case tablet_transition_stage::allow_write_both_read_old: return read_replica_set_selector::previous; case tablet_transition_stage::write_both_read_old: return read_replica_set_selector::previous; case tablet_transition_stage::streaming: return read_replica_set_selector::previous; case tablet_transition_stage::write_both_read_new: return read_replica_set_selector::next; case tablet_transition_stage::use_new: return read_replica_set_selector::next; case tablet_transition_stage::cleanup: return read_replica_set_selector::next; case tablet_transition_stage::cleanup_target: return read_replica_set_selector::previous; case tablet_transition_stage::revert_migration: return read_replica_set_selector::previous; case tablet_transition_stage::end_migration: return read_replica_set_selector::next; } on_internal_error(tablet_logger, format("Invalid tablet transition stage: {}", static_cast(stage))); } tablet_transition_info::tablet_transition_info(tablet_transition_stage stage, tablet_transition_kind transition, tablet_replica_set next, std::optional pending_replica, service::session_id session_id) : stage(stage) , transition(transition) , next(std::move(next)) , pending_replica(std::move(pending_replica)) , session_id(session_id) , writes(get_selector_for_writes(stage)) , reads(get_selector_for_reads(stage)) { } tablet_migration_streaming_info get_migration_streaming_info(const locator::topology& topo, const tablet_info& tinfo, const tablet_migration_info& trinfo) { return get_migration_streaming_info(topo, tinfo, migration_to_transition_info(tinfo, trinfo)); } tablet_migration_streaming_info get_migration_streaming_info(const locator::topology& topo, const tablet_info& tinfo, const tablet_transition_info& trinfo) { tablet_migration_streaming_info result; switch (trinfo.transition) { case tablet_transition_kind::intranode_migration: [[fallthrough]]; case tablet_transition_kind::migration: result.read_from = substract_sets(tinfo.replicas, trinfo.next); result.written_to = substract_sets(trinfo.next, tinfo.replicas); return result; case tablet_transition_kind::rebuild: if (!trinfo.pending_replica.has_value()) { return result; // No nodes to stream to -> no nodes to stream from } result.written_to.insert(*trinfo.pending_replica); result.read_from = std::unordered_set(trinfo.next.begin(), trinfo.next.end()); result.read_from.erase(*trinfo.pending_replica); erase_if(result.read_from, [&] (const tablet_replica& r) { auto* n = topo.find_node(r.host); return !n || n->is_excluded(); }); return result; } on_internal_error(tablet_logger, format("Invalid tablet transition kind: {}", static_cast(trinfo.transition))); } std::optional get_leaving_replica(const tablet_info& tinfo, const tablet_transition_info& trinfo) { auto leaving = substract_sets(tinfo.replicas, trinfo.next); if (leaving.empty()) { return {}; } if (leaving.size() > 1) { throw std::runtime_error(format("More than one leaving replica")); } return *leaving.begin(); } tablet_replica_set get_new_replicas(const tablet_info& tinfo, const tablet_migration_info& mig) { return replace_replica(tinfo.replicas, mig.src, mig.dst); } tablet_replica_set get_primary_replicas(const tablet_info& info, const tablet_transition_info* transition) { auto write_selector = [&] { if (!transition) { return write_replica_set_selector::previous; } return transition->writes; }; auto primary = [] (tablet_replica_set set) -> tablet_replica { return set.front(); }; auto add = [] (tablet_replica r1, tablet_replica r2) -> tablet_replica_set { // if primary replica is not the one leaving, then only primary will be streamed to. return (r1 == r2) ? tablet_replica_set{r1} : tablet_replica_set{r1, r2}; }; switch (write_selector()) { case write_replica_set_selector::previous: return {primary(info.replicas)}; case write_replica_set_selector::both: return add(primary(info.replicas), primary(transition->next)); case write_replica_set_selector::next: return {primary(transition->next)}; } } tablet_transition_info migration_to_transition_info(const tablet_info& ti, const tablet_migration_info& mig) { return tablet_transition_info { tablet_transition_stage::allow_write_both_read_old, mig.kind, get_new_replicas(ti, mig), mig.dst }; } const tablet_map& tablet_metadata::get_tablet_map(table_id id) const { try { return *_tablets.at(id); } catch (const std::out_of_range&) { throw_with_backtrace(format("Tablet map not found for table {}", id)); } } void tablet_metadata::mutate_tablet_map(table_id id, noncopyable_function func) { auto it = _tablets.find(id); if (it == _tablets.end()) { throw std::runtime_error(format("Tablet map not found for table {}", id)); } auto tablet_map_copy = make_lw_shared(*it->second); func(*tablet_map_copy); it->second = make_foreign(lw_shared_ptr(std::move(tablet_map_copy))); } future<> tablet_metadata::mutate_tablet_map_async(table_id id, noncopyable_function(tablet_map&)> func) { auto it = _tablets.find(id); if (it == _tablets.end()) { throw std::runtime_error(format("Tablet map not found for table {}", id)); } auto tablet_map_copy = make_lw_shared(*it->second); co_await func(*tablet_map_copy); it->second = make_foreign(lw_shared_ptr(std::move(tablet_map_copy))); } future tablet_metadata::copy() const { tablet_metadata copy; for (const auto& e : _tablets) { copy._tablets.emplace(e.first, co_await e.second.copy()); } copy._balancing_enabled = _balancing_enabled; co_return copy; } void tablet_metadata::set_tablet_map(table_id id, tablet_map map) { auto map_ptr = make_lw_shared(std::move(map)); auto it = _tablets.find(id); if (it == _tablets.end()) { _tablets.emplace(id, std::move(map_ptr)); } else { it->second = std::move(map_ptr); } } void tablet_metadata::drop_tablet_map(table_id id) { auto it = _tablets.find(id); if (it == _tablets.end()) { return; } _tablets.erase(it); } future<> tablet_metadata::clear_gently() { for (auto&& [id, map] : _tablets) { const auto shard = map.get_owner_shard(); co_await smp::submit_to(shard, [map = std::move(map)] () mutable { auto map_ptr = map.release(); // Others copies exist, we simply drop ours, no need to clear anything. if (map_ptr.use_count() > 1) { return make_ready_future<>(); } return const_cast(*map_ptr).clear_gently().finally([map_ptr = std::move(map_ptr)] { }); }); } _tablets.clear(); co_return; } bool tablet_metadata::operator==(const tablet_metadata& o) const { if (_tablets.size() != o._tablets.size()) { return false; } for (const auto& [k, v] : _tablets) { const auto it = o._tablets.find(k); if (it == o._tablets.end() || *v != *it->second) { return false; } } return true; } tablet_map::tablet_map(size_t tablet_count) : _log2_tablets(log2ceil(tablet_count)) { if (tablet_count != 1ul << _log2_tablets) { on_internal_error(tablet_logger, format("Tablet count not a power of 2: {}", tablet_count)); } _tablets.resize(tablet_count); } void tablet_map::check_tablet_id(tablet_id id) const { if (size_t(id) >= tablet_count()) { throw std::logic_error(format("Invalid tablet id: {} >= {}", id, tablet_count())); } } const tablet_info& tablet_map::get_tablet_info(tablet_id id) const { check_tablet_id(id); return _tablets[size_t(id)]; } tablet_id tablet_map::get_tablet_id(token t) const { return tablet_id(dht::compaction_group_of(_log2_tablets, t)); } std::pair tablet_map::get_tablet_id_and_range_side(token t) const { auto id_after_split = dht::compaction_group_of(_log2_tablets + 1, t); auto current_id = id_after_split >> 1; return {tablet_id(current_id), tablet_range_side(id_after_split & 0x1)}; } dht::token tablet_map::get_last_token(tablet_id id) const { check_tablet_id(id); return dht::last_token_of_compaction_group(_log2_tablets, size_t(id)); } dht::token tablet_map::get_first_token(tablet_id id) const { if (id == first_tablet()) { return dht::first_token(); } else { return dht::next_token(get_last_token(tablet_id(size_t(id) - 1))); } } dht::token_range tablet_map::get_token_range(tablet_id id) const { if (id == first_tablet()) { return dht::token_range::make({dht::minimum_token(), false}, {get_last_token(id), true}); } else { return dht::token_range::make({get_last_token(tablet_id(size_t(id) - 1)), false}, {get_last_token(id), true}); } } tablet_replica tablet_map::get_primary_replica(tablet_id id) const { const auto& replicas = get_tablet_info(id).replicas; return replicas.at(size_t(id) % replicas.size()); } tablet_replica tablet_map::get_primary_replica_within_dc(tablet_id id, const topology& topo, sstring dc) const { const auto replicas = boost::copy_range(get_tablet_info(id).replicas | boost::adaptors::filtered([&] (const auto& tr) { const auto& node = topo.get_node(tr.host); return node.dc_rack().dc == dc; })); return replicas.at(size_t(id) % replicas.size()); } future> tablet_map::get_sorted_tokens() const { std::vector tokens; tokens.reserve(tablet_count()); for (auto id : tablet_ids()) { tokens.push_back(get_last_token(id)); co_await coroutine::maybe_yield(); } co_return tokens; } void tablet_map::set_tablet(tablet_id id, tablet_info info) { check_tablet_id(id); _tablets[size_t(id)] = std::move(info); } void tablet_map::set_tablet_transition_info(tablet_id id, tablet_transition_info info) { check_tablet_id(id); _transitions.insert_or_assign(id, std::move(info)); } void tablet_map::set_resize_decision(locator::resize_decision decision) { _resize_decision = std::move(decision); } void tablet_map::clear_tablet_transition_info(tablet_id id) { check_tablet_id(id); _transitions.erase(id); } future<> tablet_map::for_each_tablet(seastar::noncopyable_function(tablet_id, const tablet_info&)> func) const { std::optional tid = first_tablet(); for (const tablet_info& ti : tablets()) { co_await func(*tid, ti); tid = next_tablet(*tid); } } void tablet_map::clear_transitions() { _transitions.clear(); } bool tablet_map::has_replica(tablet_id tid, tablet_replica r) const { auto& tinfo = get_tablet_info(tid); if (contains(tinfo.replicas, r)) { return true; } auto* trinfo = get_tablet_transition_info(tid); if (trinfo && contains(trinfo->next, r)) { return true; } return false; } future<> tablet_map::clear_gently() { return utils::clear_gently(_tablets); } const tablet_transition_info* tablet_map::get_tablet_transition_info(tablet_id id) const { auto i = _transitions.find(id); if (i == _transitions.end()) { return nullptr; } return &i->second; } // The names are persisted in system tables so should not be changed. static const std::unordered_map tablet_transition_stage_to_name = { {tablet_transition_stage::allow_write_both_read_old, "allow_write_both_read_old"}, {tablet_transition_stage::write_both_read_old, "write_both_read_old"}, {tablet_transition_stage::write_both_read_new, "write_both_read_new"}, {tablet_transition_stage::streaming, "streaming"}, {tablet_transition_stage::use_new, "use_new"}, {tablet_transition_stage::cleanup, "cleanup"}, {tablet_transition_stage::cleanup_target, "cleanup_target"}, {tablet_transition_stage::revert_migration, "revert_migration"}, {tablet_transition_stage::end_migration, "end_migration"}, }; static const std::unordered_map tablet_transition_stage_from_name = std::invoke([] { std::unordered_map result; for (auto&& [v, s] : tablet_transition_stage_to_name) { result.emplace(s, v); } return result; }); sstring tablet_transition_stage_to_string(tablet_transition_stage stage) { auto i = tablet_transition_stage_to_name.find(stage); if (i == tablet_transition_stage_to_name.end()) { on_internal_error(tablet_logger, format("Invalid tablet transition stage: {}", static_cast(stage))); } return i->second; } tablet_transition_stage tablet_transition_stage_from_string(const sstring& name) { return tablet_transition_stage_from_name.at(name); } // The names are persisted in system tables so should not be changed. static const std::unordered_map tablet_transition_kind_to_name = { {tablet_transition_kind::migration, "migration"}, {tablet_transition_kind::intranode_migration, "intranode_migration"}, {tablet_transition_kind::rebuild, "rebuild"}, }; static const std::unordered_map tablet_transition_kind_from_name = std::invoke([] { std::unordered_map result; for (auto&& [v, s] : tablet_transition_kind_to_name) { result.emplace(s, v); } return result; }); sstring tablet_transition_kind_to_string(tablet_transition_kind kind) { auto i = tablet_transition_kind_to_name.find(kind); if (i == tablet_transition_kind_to_name.end()) { on_internal_error(tablet_logger, format("Invalid tablet transition kind: {}", static_cast(kind))); } return i->second; } tablet_transition_kind tablet_transition_kind_from_string(const sstring& name) { return tablet_transition_kind_from_name.at(name); } size_t tablet_map::external_memory_usage() const { size_t result = _tablets.external_memory_usage(); for (auto&& tablet : _tablets) { result += tablet.replicas.external_memory_usage(); } return result; } bool resize_decision::operator==(const resize_decision& o) const { return way.index() == o.way.index() && sequence_number == o.sequence_number; } bool tablet_map::needs_split() const { return std::holds_alternative(_resize_decision.way); } const locator::resize_decision& tablet_map::resize_decision() const { return _resize_decision; } static auto to_resize_type(sstring decision) { static const std::unordered_map string_to_type = { {"none", resize_decision::none{}}, {"split", resize_decision::split{}}, {"merge", resize_decision::merge{}}, }; return string_to_type.at(decision); } resize_decision::resize_decision(sstring decision, uint64_t seq_number) : way(to_resize_type(decision)) , sequence_number(seq_number) { } sstring resize_decision::type_name() const { static const std::array index_to_string = { "none", "split", "merge", }; static_assert(std::variant_size_v == index_to_string.size()); return index_to_string[way.index()]; } resize_decision::seq_number_t resize_decision::next_sequence_number() const { // Doubt we'll ever wrap around, but just in case. // Even if sequence number is bumped every second, it would take 292471208677 years // for it to happen, about 21x the age of the universe, or ~11x according to the new // prediction after james webb. return (sequence_number == std::numeric_limits::max()) ? 0 : sequence_number + 1; } table_load_stats& table_load_stats::operator+=(const table_load_stats& s) noexcept { size_in_bytes = size_in_bytes + s.size_in_bytes; split_ready_seq_number = std::min(split_ready_seq_number, s.split_ready_seq_number); return *this; } load_stats& load_stats::operator+=(const load_stats& s) { for (auto& [id, stats] : s.tables) { tables[id] += stats; } return *this; } tablet_range_splitter::tablet_range_splitter(schema_ptr schema, const tablet_map& tablets, host_id host, const dht::partition_range_vector& ranges) : _schema(std::move(schema)) , _ranges(ranges) , _ranges_it(_ranges.begin()) { // Filter all tablets and save only those that have a replica on the specified host. for (auto tid = std::optional(tablets.first_tablet()); tid; tid = tablets.next_tablet(*tid)) { const auto& tablet_info = tablets.get_tablet_info(*tid); auto replica_it = std::ranges::find_if(tablet_info.replicas, [&] (auto&& r) { return r.host == host; }); if (replica_it == tablet_info.replicas.end()) { continue; } _tablet_ranges.emplace_back(range_split_result{replica_it->shard, dht::to_partition_range(tablets.get_token_range(*tid))}); } _tablet_ranges_it = _tablet_ranges.begin(); } std::optional tablet_range_splitter::operator()() { if (_ranges_it == _ranges.end() || _tablet_ranges_it == _tablet_ranges.end()) { return {}; } dht::ring_position_comparator cmp(*_schema); while (_ranges_it != _ranges.end()) { // First, skip all tablet-ranges that are completely before the current range. while (_ranges_it->other_is_before(_tablet_ranges_it->range, cmp)) { ++_tablet_ranges_it; } // Generate intersections with all tablet-ranges that overlap with the current range. if (auto intersection = _ranges_it->intersection(_tablet_ranges_it->range, cmp)) { const auto shard = _tablet_ranges_it->shard; if (_ranges_it->end() && cmp(_ranges_it->end()->value(), _tablet_ranges_it->range.end()->value()) < 0) { // The current tablet range extends beyond the current range, // move to the next range. ++_ranges_it; } else { // The current range extends beyond the current tablet range, // move to the next tablet range. ++_tablet_ranges_it; } return range_split_result{shard, std::move(*intersection)}; } // Current tablet-range is completely after the current range, move to the next range. ++_ranges_it; } return {}; } // Estimates the external memory usage of std::unordered_map<>. // Does not include external memory usage of elements. template static size_t estimate_external_memory_usage(const std::unordered_map& map) { return map.bucket_count() * sizeof(void*) + map.size() * (sizeof(std::pair) + 8); } size_t tablet_metadata::external_memory_usage() const { size_t result = estimate_external_memory_usage(_tablets); for (auto&& [id, map] : _tablets) { result += map->external_memory_usage(); } return result; } bool tablet_metadata::has_replica_on(host_id host) const { for (auto&& [id, map] : _tablets) { for (auto&& tablet : map->tablet_ids()) { auto& tinfo = map->get_tablet_info(tablet); for (auto&& r : tinfo.replicas) { if (r.host == host) { return true; } } auto* trinfo = map->get_tablet_transition_info(tablet); if (trinfo && trinfo->pending_replica && trinfo->pending_replica->host == host) { return true; } } } return false; } future check_tablet_replica_shards(const tablet_metadata& tm, host_id this_host) { bool valid = true; for (const auto& [table_id, tmap] : tm.all_tables()) { co_await tmap->for_each_tablet([this_host, &valid] (locator::tablet_id tid, const tablet_info& tinfo) -> future<> { for (const auto& replica : tinfo.replicas) { if (replica.host == this_host) { valid &= replica.shard < smp::count; } } return make_ready_future<>(); }); if (!valid) { break; } } co_return valid; } class tablet_effective_replication_map : public effective_replication_map { table_id _table; tablet_sharder _sharder; mutable const tablet_map* _tmap = nullptr; private: inet_address_vector_replica_set to_replica_set(const tablet_replica_set& replicas) const { inet_address_vector_replica_set result; result.reserve(replicas.size()); auto& topo = _tmptr->get_topology(); for (auto&& replica : replicas) { auto* node = topo.find_node(replica.host); if (node && !node->left()) { result.emplace_back(node->endpoint()); } } return result; } host_id_vector_replica_set to_host_set(const tablet_replica_set& replicas) const { host_id_vector_replica_set result; result.reserve(replicas.size()); for (auto&& replica : replicas) { result.emplace_back(replica.host); } return result; } const tablet_map& get_tablet_map() const { if (!_tmap) { _tmap = &_tmptr->tablets().get_tablet_map(_table); } return *_tmap; } const tablet_replica_set& get_replicas_for_write(dht::token search_token) const { auto&& tablets = get_tablet_map(); auto tablet = tablets.get_tablet_id(search_token); auto* info = tablets.get_tablet_transition_info(tablet); auto&& replicas = std::invoke([&] () -> const tablet_replica_set& { if (!info) { return tablets.get_tablet_info(tablet).replicas; } switch (info->writes) { case write_replica_set_selector::previous: [[fallthrough]]; case write_replica_set_selector::both: return tablets.get_tablet_info(tablet).replicas; case write_replica_set_selector::next: { return info->next; } } on_internal_error(tablet_logger, format("Invalid replica selector", static_cast(info->writes))); }); tablet_logger.trace("get_replicas_for_write({}): table={}, tablet={}, replicas={}", search_token, _table, tablet, replicas); return replicas; } public: tablet_effective_replication_map(table_id table, replication_strategy_ptr rs, token_metadata_ptr tmptr, size_t replication_factor) : effective_replication_map(std::move(rs), std::move(tmptr), replication_factor) , _table(table) , _sharder(*_tmptr, table) { } virtual ~tablet_effective_replication_map() = default; virtual host_id_vector_replica_set get_replicas(const token& search_token) const override { return to_host_set(get_replicas_for_write(search_token)); } virtual inet_address_vector_replica_set get_natural_endpoints(const token& search_token) const override { return to_replica_set(get_replicas_for_write(search_token)); } virtual inet_address_vector_replica_set get_natural_endpoints_without_node_being_replaced(const token& search_token) const override { auto result = get_natural_endpoints(search_token); maybe_remove_node_being_replaced(*_tmptr, *_rs, result); return result; } virtual future get_ranges(inet_address ep) const override { dht::token_range_vector ret; auto& tablet_map = get_tablet_map(); for (auto tablet_id : tablet_map.tablet_ids()) { auto endpoints = get_natural_endpoints(tablet_map.get_last_token(tablet_id)); auto should_add_range = std::find(std::begin(endpoints), std::end(endpoints), ep) != std::end(endpoints); if (should_add_range) { ret.push_back(tablet_map.get_token_range(tablet_id)); } co_await coroutine::maybe_yield(); } co_return ret; } virtual inet_address_vector_topology_change get_pending_endpoints(const token& search_token) const override { auto&& tablets = get_tablet_map(); auto tablet = tablets.get_tablet_id(search_token); auto&& info = tablets.get_tablet_transition_info(tablet); if (!info || info->transition == tablet_transition_kind::intranode_migration) { return {}; } switch (info->writes) { case write_replica_set_selector::previous: return {}; case write_replica_set_selector::both: if (!info->pending_replica) { return {}; } tablet_logger.trace("get_pending_endpoints({}): table={}, tablet={}, replica={}", search_token, _table, tablet, *info->pending_replica); return {_tmptr->get_endpoint_for_host_id(info->pending_replica->host)}; case write_replica_set_selector::next: return {}; } on_internal_error(tablet_logger, format("Invalid replica selector", static_cast(info->writes))); } virtual inet_address_vector_replica_set get_endpoints_for_reading(const token& search_token) const override { auto&& tablets = get_tablet_map(); auto tablet = tablets.get_tablet_id(search_token); auto&& info = tablets.get_tablet_transition_info(tablet); auto&& replicas = std::invoke([&] () -> const tablet_replica_set& { if (!info) { return tablets.get_tablet_info(tablet).replicas; } switch (info->reads) { case read_replica_set_selector::previous: return tablets.get_tablet_info(tablet).replicas; case read_replica_set_selector::next: { return info->next; } } on_internal_error(tablet_logger, format("Invalid replica selector", static_cast(info->reads))); }); tablet_logger.trace("get_endpoints_for_reading({}): table={}, tablet={}, replicas={}", search_token, _table, tablet, replicas); auto result = to_replica_set(replicas); maybe_remove_node_being_replaced(*_tmptr, *_rs, result); return result; } std::optional check_locality(const token& search_token) const override { auto&& tablets = get_tablet_map(); auto tid = tablets.get_tablet_id(search_token); auto&& info = tablets.get_tablet_info(tid); auto host = get_token_metadata().get_my_id(); auto shard = this_shard_id(); auto make_tablet_routing_info = [&] { dht::token first_token; if (tid == tablets.first_tablet()) { first_token = dht::minimum_token(); } else { first_token = tablets.get_last_token(tablet_id(size_t(tid) - 1)); } auto token_range = std::make_pair(first_token, tablets.get_last_token(tid)); return tablet_routing_info{info.replicas, token_range}; }; for (auto&& r : info.replicas) { if (r.host == host) { if (r.shard == shard) { return std::nullopt; // routed correctly } else { return make_tablet_routing_info(); } } } auto tinfo = tablets.get_tablet_transition_info(tid); if (tinfo && tinfo->pending_replica && tinfo->pending_replica->host == host && tinfo->pending_replica->shard == shard) { return std::nullopt; // routed correctly } return make_tablet_routing_info(); } virtual bool has_pending_ranges(locator::host_id host_id) const override { for (const auto& [id, transition_info]: get_tablet_map().transitions()) { if (transition_info.pending_replica && transition_info.pending_replica->host == host_id) { return true; } } return false; } virtual std::unique_ptr make_splitter() const override { class splitter : public token_range_splitter { token_metadata_ptr _tmptr; // To keep the tablet map alive. const tablet_map& _tmap; std::optional _next; public: splitter(token_metadata_ptr tmptr, const tablet_map& tmap) : _tmptr(std::move(tmptr)) , _tmap(tmap) { } void reset(dht::ring_position_view pos) override { _next = _tmap.get_tablet_id(pos.token()); } std::optional next_token() override { if (!_next) { return std::nullopt; } auto t = _tmap.get_last_token(*_next); _next = _tmap.next_tablet(*_next); return t; } }; return std::make_unique(_tmptr, get_tablet_map()); } const dht::sharder& get_sharder(const schema& s) const override { return _sharder; } }; void tablet_aware_replication_strategy::validate_tablet_options(const abstract_replication_strategy& ars, const gms::feature_service& fs, const replication_strategy_config_options& opts) const { if (ars._uses_tablets && !fs.tablets) { throw exceptions::configuration_exception("Tablet replication is not enabled"); } } void tablet_aware_replication_strategy::process_tablet_options(abstract_replication_strategy& ars, replication_strategy_config_options& opts, replication_strategy_params params) { if (ars._uses_tablets) { _initial_tablets = params.initial_tablets.value_or(0); mark_as_per_table(ars); } } std::unordered_set tablet_aware_replication_strategy::recognized_tablet_options() const { std::unordered_set opts; return opts; } effective_replication_map_ptr tablet_aware_replication_strategy::do_make_replication_map( table_id table, replication_strategy_ptr rs, token_metadata_ptr tm, size_t replication_factor) const { return seastar::make_shared(table, std::move(rs), std::move(tm), replication_factor); } void tablet_metadata_guard::check() noexcept { auto erm = _table->get_effective_replication_map(); auto& tmap = erm->get_token_metadata_ptr()->tablets().get_tablet_map(_tablet.table); auto* trinfo = tmap.get_tablet_transition_info(_tablet.tablet); if (bool(_stage) != bool(trinfo) || (_stage && _stage != trinfo->stage)) { _abort_source.request_abort(); } else { _erm = std::move(erm); subscribe(); } } tablet_metadata_guard::tablet_metadata_guard(replica::table& table, global_tablet_id tablet) : _table(table.shared_from_this()) , _tablet(tablet) , _erm(table.get_effective_replication_map()) { subscribe(); if (auto* trinfo = get_tablet_map().get_tablet_transition_info(tablet.tablet)) { _stage = trinfo->stage; } } void tablet_metadata_guard::subscribe() { _callback = _erm->get_validity_abort_source().subscribe([this] () noexcept { check(); }); } } auto fmt::formatter::format(const locator::global_tablet_id& id, fmt::format_context& ctx) const -> decltype(ctx.out()) { return fmt::format_to(ctx.out(), "{}:{}", id.table, id.tablet); } auto fmt::formatter::format(const locator::tablet_transition_stage& stage, fmt::format_context& ctx) const -> decltype(ctx.out()) { return fmt::format_to(ctx.out(), "{}", locator::tablet_transition_stage_to_string(stage)); } auto fmt::formatter::format(const locator::tablet_transition_kind& kind, fmt::format_context& ctx) const -> decltype(ctx.out()) { return fmt::format_to(ctx.out(), "{}", locator::tablet_transition_kind_to_string(kind)); } auto fmt::formatter::format(const locator::tablet_map& r, fmt::format_context& ctx) const -> decltype(ctx.out()) { auto out = ctx.out(); if (r.tablet_count() == 0) { return fmt::format_to(out, "{{}}"); } out = fmt::format_to(out, "{{"); bool first = true; locator::tablet_id tid = r.first_tablet(); for (auto&& tablet : r._tablets) { if (!first) { out = fmt::format_to(out, ","); } out = fmt::format_to(out, "\n [{}]: last_token={}, replicas={}", tid, r.get_last_token(tid), tablet.replicas); if (auto tr = r.get_tablet_transition_info(tid)) { out = fmt::format_to(out, ", stage={}, new_replicas={}, pending={}", tr->stage, tr->next, tr->pending_replica); if (tr->session_id) { out = fmt::format_to(out, ", session={}", tr->session_id); } } first = false; tid = *r.next_tablet(tid); } return fmt::format_to(out, "}}"); } auto fmt::formatter::format(const locator::tablet_metadata& tm, fmt::format_context& ctx) const -> decltype(ctx.out()) { auto out = ctx.out(); out = fmt::format_to(out, "{{"); bool first = true; for (auto&& [id, map] : tm._tablets) { if (!first) { out = fmt::format_to(out, ","); } out = fmt::format_to(out, "\n {}: {}", id, *map); first = false; } return fmt::format_to(out, "\n}}"); } auto fmt::formatter::format(const locator::tablet_metadata_change_hint& hint, fmt::format_context& ctx) const -> decltype(ctx.out()) { auto out = ctx.out(); out = fmt::format_to(out, "{{"); bool first = true; for (auto&& [table_id, table_hint] : hint.tables) { if (!first) { out = fmt::format_to(out, ","); } out = fmt::format_to(out, "\n [{}]: {}", table_id, table_hint.tokens); first = false; } return fmt::format_to(out, "\n}}"); }