diff --git a/api/api_init.hh b/api/api_init.hh index 4f713bcc50..54b15b3ed0 100644 --- a/api/api_init.hh +++ b/api/api_init.hh @@ -10,6 +10,7 @@ #include #include +#include "locator/host_id.hh" #include "replica/database_fwd.hh" #include "tasks/task_manager.hh" #include "seastarx.hh" @@ -32,6 +33,10 @@ namespace streaming { class stream_manager; } +namespace gms { + class inet_address; +} + namespace locator { class token_metadata; diff --git a/api/token_metadata.cc b/api/token_metadata.cc index 1eb2d021eb..58442def02 100644 --- a/api/token_metadata.cc +++ b/api/token_metadata.cc @@ -32,13 +32,22 @@ void set_token_metadata(http_context& ctx, routes& r, sharded req) { gms::inet_address addr(req->param["endpoint"]); - return make_ready_future(stream_range_as_array(tm.local().get()->get_tokens(addr), [](const dht::token& i) { - return fmt::to_string(i); - })); + auto& local_tm = *tm.local().get(); + const auto host_id = local_tm.get_host_id_if_known(addr); + return make_ready_future(stream_range_as_array(host_id ? 
local_tm.get_tokens(*host_id): std::vector{}, [](const dht::token& i) { + return fmt::to_string(i); + })); }); ss::get_leaving_nodes.set(r, [&tm](const_req req) { - return container_to_vec(tm.local().get()->get_leaving_endpoints()); + const auto& local_tm = *tm.local().get(); + const auto& leaving_host_ids = local_tm.get_leaving_endpoints(); + std::unordered_set eps; + eps.reserve(leaving_host_ids.size()); + for (const auto host_id: leaving_host_ids) { + eps.insert(local_tm.get_endpoint_for_host_id(host_id)); + } + return container_to_vec(eps); }); ss::get_moving_nodes.set(r, [](const_req req) { @@ -47,12 +56,14 @@ void set_token_metadata(http_context& ctx, routes& r, shardedget_bootstrap_tokens(); - std::unordered_set addr; - for (auto i: points) { - addr.insert(fmt::to_string(i.second)); + const auto& local_tm = *tm.local().get(); + const auto& points = local_tm.get_bootstrap_tokens(); + std::unordered_set eps; + eps.reserve(points.size()); + for (const auto& [token, host_id]: points) { + eps.insert(local_tm.get_endpoint_for_host_id(host_id)); } - return container_to_vec(addr); + return container_to_vec(eps); }); ss::get_host_id_map.set(r, [&tm](const_req req) { diff --git a/cdc/generation.cc b/cdc/generation.cc index 1705d2e8f1..6a06431283 100644 --- a/cdc/generation.cc +++ b/cdc/generation.cc @@ -391,8 +391,9 @@ future generation_service::legacy_make_new_generation(const throw std::runtime_error( format("Can't find endpoint for token {}", end)); } - auto sc = get_shard_count(*endpoint, _gossiper); - return {sc > 0 ? sc : 1, get_sharding_ignore_msb(*endpoint, _gossiper)}; + const auto ep = tmptr->get_endpoint_for_host_id(*endpoint); + auto sc = get_shard_count(ep, _gossiper); + return {sc > 0 ? 
sc : 1, get_sharding_ignore_msb(ep, _gossiper)}; } }; diff --git a/cdc/log.hh b/cdc/log.hh index 0af27cd5e6..a796729dcd 100644 --- a/cdc/log.hh +++ b/cdc/log.hh @@ -29,6 +29,7 @@ #include "timestamp.hh" #include "tracing/trace_state.hh" #include "utils/UUID.hh" +#include "locator/host_id.hh" class schema; using schema_ptr = seastar::lw_shared_ptr; diff --git a/cql3/statements/create_keyspace_statement.hh b/cql3/statements/create_keyspace_statement.hh index 02946325a2..509c9cb6e6 100644 --- a/cql3/statements/create_keyspace_statement.hh +++ b/cql3/statements/create_keyspace_statement.hh @@ -18,7 +18,6 @@ namespace locator { class token_metadata; - }; namespace data_dictionary { diff --git a/cql3/statements/ks_prop_defs.hh b/cql3/statements/ks_prop_defs.hh index 32bc4bec63..72b9f86dcd 100644 --- a/cql3/statements/ks_prop_defs.hh +++ b/cql3/statements/ks_prop_defs.hh @@ -12,6 +12,7 @@ #include "cql3/statements/property_definitions.hh" #include "data_dictionary/storage_options.hh" +#include "locator/host_id.hh" #include #include @@ -20,6 +21,9 @@ namespace data_dictionary { class keyspace_metadata; } +namespace gms { + class inet_address; +} namespace locator { class token_metadata; diff --git a/db/hints/internal/hint_sender.cc b/db/hints/internal/hint_sender.cc index abd2b9c690..561a3a5677 100644 --- a/db/hints/internal/hint_sender.cc +++ b/db/hints/internal/hint_sender.cc @@ -101,7 +101,9 @@ bool hint_sender::can_send() noexcept { return true; } else { if (!_state.contains(state::ep_state_left_the_ring)) { - _state.set_if(!_shard_manager.local_db().get_token_metadata().is_normal_token_owner(end_point_key())); + const auto& tm = _shard_manager.local_db().get_token_metadata(); + const auto host_id = tm.get_host_id_if_known(end_point_key()); + _state.set_if(!host_id || !tm.is_normal_token_owner(*host_id)); } // send the hints out if the destination Node is part of the ring - we will send to all new replicas in this case return 
_state.contains(state::ep_state_left_the_ring); diff --git a/db/view/view.cc b/db/view/view.cc index 34000c47a7..726974fadd 100644 --- a/db/view/view.cc +++ b/db/view/view.cc @@ -2579,7 +2579,7 @@ future check_view_build_ongoing(db::system_distributed_keyspace& sys_dist_ return sys_dist_ks.view_status(ks_name, cf_name).then([&tm] (view_statuses_type&& view_statuses) { return boost::algorithm::any_of(view_statuses, [&tm] (const view_statuses_type::value_type& view_status) { // Only consider status of known hosts. - return view_status.second == "STARTED" && tm.get_endpoint_for_host_id(view_status.first); + return view_status.second == "STARTED" && tm.get_endpoint_for_host_id_if_known(view_status.first); }); }); } diff --git a/db/view/view_update_checks.hh b/db/view/view_update_checks.hh index 77b7113c0d..45c9f03f5f 100644 --- a/db/view/view_update_checks.hh +++ b/db/view/view_update_checks.hh @@ -10,6 +10,7 @@ #include #include "streaming/stream_reason.hh" +#include "locator/host_id.hh" #include "seastarx.hh" namespace replica { diff --git a/db/virtual_tables.cc b/db/virtual_tables.cc index 915ed07f78..a1635148fd 100644 --- a/db/virtual_tables.cc +++ b/db/virtual_tables.cc @@ -80,7 +80,7 @@ public: set_cell(cr, "host_id", hostid->uuid()); } - if (tm.is_normal_token_owner(endpoint)) { + if (hostid && tm.is_normal_token_owner(*hostid)) { sstring dc = tm.get_topology().get_location(endpoint).dc; set_cell(cr, "dc", dc); } @@ -89,7 +89,7 @@ public: set_cell(cr, "owns", ownership[endpoint]); } - set_cell(cr, "tokens", int32_t(tm.get_tokens(endpoint).size())); + set_cell(cr, "tokens", int32_t(hostid ? 
tm.get_tokens(*hostid).size() : 0)); mutation_sink(std::move(m)); }); diff --git a/dht/boot_strapper.hh b/dht/boot_strapper.hh index 0c50c043e7..61ec72f136 100644 --- a/dht/boot_strapper.hh +++ b/dht/boot_strapper.hh @@ -35,15 +35,15 @@ class boot_strapper { sharded& _stream_manager; abort_source& _abort_source; /* endpoint that needs to be bootstrapped */ - inet_address _address; + locator::host_id _address; /* its DC/RACK info */ locator::endpoint_dc_rack _dr; /* token of the node being bootstrapped. */ std::unordered_set _tokens; - const token_metadata_ptr _token_metadata_ptr; + const locator::token_metadata_ptr _token_metadata_ptr; public: boot_strapper(distributed& db, sharded& sm, abort_source& abort_source, - inet_address addr, locator::endpoint_dc_rack dr, std::unordered_set tokens, const token_metadata_ptr tmptr) + locator::host_id addr, locator::endpoint_dc_rack dr, std::unordered_set tokens, const token_metadata_ptr tmptr) : _db(db) , _stream_manager(sm) , _abort_source(abort_source) diff --git a/dht/range_streamer.cc b/dht/range_streamer.cc index 8d2d4ba9e7..37c2e6135b 100644 --- a/dht/range_streamer.cc +++ b/dht/range_streamer.cc @@ -88,6 +88,7 @@ range_streamer::get_all_ranges_with_sources_for(const sstring& keyspace_name, lo logger.debug("keyspace={}, desired_ranges.size={}, range_addresses.size={}", keyspace_name, desired_ranges.size(), range_addresses.size()); std::unordered_map> range_sources; + const auto address_ep = get_token_metadata().get_endpoint_for_host_id(_address); for (auto& desired_range : desired_ranges) { auto found = false; for (auto& x : range_addresses) { @@ -97,7 +98,7 @@ range_streamer::get_all_ranges_with_sources_for(const sstring& keyspace_name, lo const range& src_range = x.first; if (src_range.contains(desired_range, dht::operator<=>)) { inet_address_vector_replica_set preferred(x.second.begin(), x.second.end()); - get_token_metadata().get_topology().sort_by_proximity(_address, preferred); + 
get_token_metadata().get_topology().sort_by_proximity(address_ep, preferred); for (inet_address& p : preferred) { range_sources[desired_range].push_back(p); } diff --git a/dht/range_streamer.hh b/dht/range_streamer.hh index 31a14d813d..00c6019f5e 100644 --- a/dht/range_streamer.hh +++ b/dht/range_streamer.hh @@ -78,7 +78,7 @@ public: }; range_streamer(distributed& db, sharded& sm, const token_metadata_ptr tmptr, abort_source& abort_source, std::unordered_set tokens, - inet_address address, locator::endpoint_dc_rack dr, sstring description, streaming::stream_reason reason, + locator::host_id address, locator::endpoint_dc_rack dr, sstring description, streaming::stream_reason reason, service::frozen_topology_guard topo_guard, std::vector tables = {}) : _db(db) @@ -97,7 +97,7 @@ public: } range_streamer(distributed& db, sharded& sm, const token_metadata_ptr tmptr, abort_source& abort_source, - inet_address address, locator::endpoint_dc_rack dr, sstring description, streaming::stream_reason reason, service::frozen_topology_guard topo_guard, std::vector tables = {}) + locator::host_id address, locator::endpoint_dc_rack dr, sstring description, streaming::stream_reason reason, service::frozen_topology_guard topo_guard, std::vector tables = {}) : range_streamer(db, sm, std::move(tmptr), abort_source, std::unordered_set(), address, std::move(dr), description, reason, std::move(topo_guard), std::move(tables)) { } @@ -157,7 +157,7 @@ private: token_metadata_ptr _token_metadata_ptr; abort_source& _abort_source; std::unordered_set _tokens; - inet_address _address; + locator::host_id _address; locator::endpoint_dc_rack _dr; sstring _description; streaming::stream_reason _reason; diff --git a/gms/gossiper.cc b/gms/gossiper.cc index 14482a8b4f..db346e4a26 100644 --- a/gms/gossiper.cc +++ b/gms/gossiper.cc @@ -755,8 +755,9 @@ future<> gossiper::do_status_check() { // check for dead state removal auto expire_time = get_expire_time_for_endpoint(endpoint); + const auto host_id = 
get_host_id(endpoint); if (!is_alive && (now > expire_time) - && (!get_token_metadata_ptr()->is_normal_token_owner(endpoint))) { + && (!get_token_metadata_ptr()->is_normal_token_owner(host_id))) { logger.debug("time is expiring for endpoint : {} ({})", endpoint, expire_time.time_since_epoch().count()); co_await evict_from_membership(endpoint, pid); } @@ -1138,7 +1139,7 @@ std::set gossiper::get_live_members() const { std::set gossiper::get_live_token_owners() const { std::set token_owners; - auto normal_token_owners = get_token_metadata_ptr()->get_all_endpoints(); + auto normal_token_owners = get_token_metadata_ptr()->get_all_ips(); for (auto& node: normal_token_owners) { if (is_alive(node)) { token_owners.insert(node); @@ -1149,7 +1150,7 @@ std::set gossiper::get_live_token_owners() const { std::set gossiper::get_unreachable_token_owners() const { std::set token_owners; - auto normal_token_owners = get_token_metadata_ptr()->get_all_endpoints(); + auto normal_token_owners = get_token_metadata_ptr()->get_all_ips(); for (auto& node: normal_token_owners) { if (!is_alive(node)) { token_owners.insert(node); @@ -1306,7 +1307,8 @@ future<> gossiper::assassinate_endpoint(sstring address) { std::vector tokens; logger.warn("Assassinating {} via gossip", endpoint); if (es) { - tokens = gossiper.get_token_metadata_ptr()->get_tokens(endpoint); + const auto host_id = gossiper.get_host_id(endpoint); + tokens = gossiper.get_token_metadata_ptr()->get_tokens(host_id); if (tokens.empty()) { logger.warn("Unable to calculate tokens for {}. 
Will use a random one", address); throw std::runtime_error(format("Unable to calculate tokens for {}", endpoint)); @@ -1391,7 +1393,8 @@ bool gossiper::is_gossip_only_member(inet_address endpoint) const { if (!es) { return false; } - return !is_dead_state(*es) && !get_token_metadata_ptr()->is_normal_token_owner(endpoint); + const auto host_id = get_host_id(endpoint); + return !is_dead_state(*es) && !get_token_metadata_ptr()->is_normal_token_owner(host_id); } clk::time_point gossiper::get_expire_time_for_endpoint(inet_address endpoint) const noexcept { @@ -2088,14 +2091,14 @@ future<> gossiper::add_saved_endpoint(inet_address ep) { ep_state.set_heart_beat_state_and_update_timestamp(heart_beat_state()); } const auto tmptr = get_token_metadata_ptr(); - auto tokens = tmptr->get_tokens(ep); - if (!tokens.empty()) { - std::unordered_set tokens_set(tokens.begin(), tokens.end()); - ep_state.add_application_state(gms::application_state::TOKENS, versioned_value::tokens(tokens_set)); - } auto host_id = tmptr->get_host_id_if_known(ep); if (host_id) { ep_state.add_application_state(gms::application_state::HOST_ID, versioned_value::host_id(host_id.value())); + auto tokens = tmptr->get_tokens(*host_id); + if (!tokens.empty()) { + std::unordered_set tokens_set(tokens.begin(), tokens.end()); + ep_state.add_application_state(gms::application_state::TOKENS, versioned_value::tokens(tokens_set)); + } } auto generation = ep_state.get_heart_beat_state().get_generation(); co_await replicate(ep, std::move(ep_state), permit.id()); diff --git a/inet_address_vectors.hh b/inet_address_vectors.hh index 859b448adf..5408a51be4 100644 --- a/inet_address_vectors.hh +++ b/inet_address_vectors.hh @@ -9,8 +9,13 @@ #pragma once #include "gms/inet_address.hh" +#include "locator/host_id.hh" #include "utils/small_vector.hh" using inet_address_vector_replica_set = utils::small_vector; using inet_address_vector_topology_change = utils::small_vector; + +using host_id_vector_replica_set = utils::small_vector; 
+ +using host_id_vector_topology_change = utils::small_vector; diff --git a/locator/abstract_replication_strategy.cc b/locator/abstract_replication_strategy.cc index 39621b48cb..aabb7fe128 100644 --- a/locator/abstract_replication_strategy.cc +++ b/locator/abstract_replication_strategy.cc @@ -19,6 +19,18 @@ namespace locator { +static endpoint_set resolve_endpoints(const host_id_set& host_ids, const token_metadata& tm) { + endpoint_set result{}; + result.reserve(host_ids.size()); + for (const auto& host_id: host_ids) { + // Empty host_id is used as a marker for local address. + // The reason for this hack is that we need local_strategy to + // work before the local host_id is loaded from the system.local table. + result.push_back(host_id ? tm.get_endpoint_for_host_id(host_id) : tm.get_topology().my_address()); + } + return result; +} + logging::logger rslogger("replication_strategy"); abstract_replication_strategy::abstract_replication_strategy( @@ -56,6 +68,11 @@ void abstract_replication_strategy::validate_replication_strategy(const sstring& } } +future abstract_replication_strategy::calculate_natural_ips(const token& search_token, const token_metadata& tm) const { + const auto host_ids = co_await calculate_natural_endpoints(search_token, tm); + co_return resolve_endpoints(host_ids, tm); +} + using strategy_class_registry = class_registry< locator::abstract_replication_strategy, const locator::replication_strategy_config_options&>; @@ -87,7 +104,8 @@ void maybe_remove_node_being_replaced(const token_metadata& tm, // as the natural_endpoints and the node will not appear in the // pending_endpoints. 
auto it = boost::range::remove_if(natural_endpoints, [&] (gms::inet_address& p) { - return tm.is_being_replaced(p); + const auto host_id = tm.get_host_id(p); + return tm.is_being_replaced(host_id); }); natural_endpoints.erase(it, natural_endpoints.end()); } @@ -238,13 +256,13 @@ vnode_effective_replication_map::get_ranges(inet_address ep) const { // Caller must ensure that token_metadata will not change throughout the call. future -abstract_replication_strategy::get_ranges(inet_address ep, token_metadata_ptr tmptr) const { +abstract_replication_strategy::get_ranges(locator::host_id ep, token_metadata_ptr tmptr) const { co_return co_await get_ranges(ep, *tmptr); } // Caller must ensure that token_metadata will not change throughout the call. future -abstract_replication_strategy::get_ranges(inet_address ep, const token_metadata& tm) const { +abstract_replication_strategy::get_ranges(locator::host_id ep, const token_metadata& tm) const { dht::token_range_vector ret; if (!tm.is_normal_token_owner(ep)) { co_return ret; @@ -326,7 +344,7 @@ abstract_replication_strategy::get_range_addresses(const token_metadata& tm) con std::unordered_map ret; for (auto& t : tm.sorted_tokens()) { dht::token_range_vector ranges = tm.get_primary_ranges_for(t); - auto eps = co_await calculate_natural_endpoints(t, tm); + auto eps = co_await calculate_natural_ips(t, tm); for (auto& r : ranges) { ret.emplace(r, eps.get_vector()); } @@ -335,9 +353,9 @@ abstract_replication_strategy::get_range_addresses(const token_metadata& tm) con } future -abstract_replication_strategy::get_pending_address_ranges(const token_metadata_ptr tmptr, std::unordered_set pending_tokens, inet_address pending_address, locator::endpoint_dc_rack dr) const { +abstract_replication_strategy::get_pending_address_ranges(const token_metadata_ptr tmptr, std::unordered_set pending_tokens, locator::host_id pending_address, locator::endpoint_dc_rack dr) const { dht::token_range_vector ret; - token_metadata temp = co_await 
tmptr->clone_only_token_map(); + auto temp = co_await tmptr->clone_only_token_map(); temp.update_topology(pending_address, std::move(dr)); co_await temp.update_normal_tokens(pending_tokens, pending_address); for (const auto& t : temp.sorted_tokens()) { @@ -363,17 +381,14 @@ future calculate_effective_replicat replication_map.reserve(depend_on_token ? sorted_tokens.size() : 1); if (const auto& topology_changes = tmptr->get_topology_change_info(); topology_changes) { const auto& all_tokens = topology_changes->all_tokens; - const auto& base_token_metadata = topology_changes->base_token_metadata - ? *topology_changes->base_token_metadata - : *tmptr; const auto& current_tokens = tmptr->get_token_to_endpoint(); for (size_t i = 0, size = all_tokens.size(); i < size; ++i) { co_await coroutine::maybe_yield(); const auto token = all_tokens[i]; - auto current_endpoints = co_await rs->calculate_natural_endpoints(token, base_token_metadata); - auto target_endpoints = co_await rs->calculate_natural_endpoints(token, topology_changes->target_token_metadata); + auto current_endpoints = co_await rs->calculate_natural_endpoints(token, *tmptr); + auto target_endpoints = co_await rs->calculate_natural_endpoints(token, *topology_changes->target_token_metadata); auto add_mapping = [&](ring_mapping& target, std::unordered_set&& endpoints) { using interval = ring_mapping::interval_type; @@ -396,37 +411,37 @@ future calculate_effective_replicat }; { - std::unordered_set endpoints_diff; + host_id_set endpoints_diff; for (const auto& e: target_endpoints) { if (!current_endpoints.contains(e)) { endpoints_diff.insert(e); } } if (!endpoints_diff.empty()) { - add_mapping(pending_endpoints, std::move(endpoints_diff)); + add_mapping(pending_endpoints, resolve_endpoints(endpoints_diff, *tmptr).extract_set()); } } // in order not to waste memory, we update read_endpoints only if the // new endpoints differs from the old one if (topology_changes->read_new && target_endpoints.get_vector() != 
current_endpoints.get_vector()) { - add_mapping(read_endpoints, std::move(target_endpoints).extract_set()); + add_mapping(read_endpoints, resolve_endpoints(target_endpoints, *tmptr).extract_set()); } if (!depend_on_token) { - replication_map.emplace(default_replication_map_key, std::move(current_endpoints).extract_vector()); + replication_map.emplace(default_replication_map_key, resolve_endpoints(current_endpoints, *tmptr).extract_vector()); break; } else if (current_tokens.contains(token)) { - replication_map.emplace(token, std::move(current_endpoints).extract_vector()); + replication_map.emplace(token, resolve_endpoints(current_endpoints, *tmptr).extract_vector()); } } } else if (depend_on_token) { for (const auto &t : sorted_tokens) { - auto eps = co_await rs->calculate_natural_endpoints(t, *tmptr); + auto eps = co_await rs->calculate_natural_ips(t, *tmptr); replication_map.emplace(t, std::move(eps).extract_vector()); } } else { - auto eps = co_await rs->calculate_natural_endpoints(default_replication_map_key, *tmptr); + auto eps = co_await rs->calculate_natural_ips(default_replication_map_key, *tmptr); replication_map.emplace(default_replication_map_key, std::move(eps).extract_vector()); } diff --git a/locator/abstract_replication_strategy.hh b/locator/abstract_replication_strategy.hh index 5af70446a6..dbd35cfe67 100644 --- a/locator/abstract_replication_strategy.hh +++ b/locator/abstract_replication_strategy.hh @@ -53,12 +53,14 @@ using replication_strategy_config_options = std::map; using replication_map = std::unordered_map; using endpoint_set = utils::basic_sequenced_set; +using host_id_set = utils::basic_sequenced_set; class vnode_effective_replication_map; class effective_replication_map_factory; class per_table_replication_strategy; class tablet_aware_replication_strategy; + class abstract_replication_strategy : public seastar::enable_shared_from_this { friend class vnode_effective_replication_map; friend class per_table_replication_strategy; @@ -101,7 
+103,8 @@ public: // is small, that implementation may not yield since by itself it won't cause a reactor stall (assuming practical // cluster sizes and number of tokens per node). The caller is responsible for yielding if they call this function // in a loop. - virtual future calculate_natural_endpoints(const token& search_token, const token_metadata& tm) const = 0; + virtual future calculate_natural_endpoints(const token& search_token, const token_metadata& tm) const = 0; + future calculate_natural_ips(const token& search_token, const token_metadata& tm) const; virtual ~abstract_replication_strategy() {} static ptr_type create_replication_strategy(const sstring& strategy_name, const replication_strategy_config_options& config_options); @@ -146,13 +149,13 @@ public: // Use the token_metadata provided by the caller instead of _token_metadata // Note: must be called with initialized, non-empty token_metadata. - future get_ranges(inet_address ep, token_metadata_ptr tmptr) const; - future get_ranges(inet_address ep, const token_metadata& tm) const; + future get_ranges(locator::host_id ep, token_metadata_ptr tmptr) const; + future get_ranges(locator::host_id ep, const token_metadata& tm) const; // Caller must ensure that token_metadata will not change throughout the call. 
future> get_range_addresses(const token_metadata& tm) const; - future get_pending_address_ranges(const token_metadata_ptr tmptr, std::unordered_set pending_tokens, inet_address pending_address, locator::endpoint_dc_rack dr) const; + future get_pending_address_ranges(const token_metadata_ptr tmptr, std::unordered_set pending_tokens, locator::host_id pending_address, locator::endpoint_dc_rack dr) const; }; using ring_mapping = boost::icl::interval_map>; diff --git a/locator/everywhere_replication_strategy.cc b/locator/everywhere_replication_strategy.cc index 3bf75f2021..45cb60c27f 100644 --- a/locator/everywhere_replication_strategy.cc +++ b/locator/everywhere_replication_strategy.cc @@ -20,13 +20,13 @@ everywhere_replication_strategy::everywhere_replication_strategy(const replicati _natural_endpoints_depend_on_token = false; } -future everywhere_replication_strategy::calculate_natural_endpoints(const token& search_token, const token_metadata& tm) const { +future everywhere_replication_strategy::calculate_natural_endpoints(const token& search_token, const token_metadata& tm) const { if (tm.sorted_tokens().empty()) { - endpoint_set result{inet_address_vector_replica_set({tm.get_topology().my_address()})}; - return make_ready_future(std::move(result)); + host_id_set result{host_id_vector_replica_set({host_id{}})}; + return make_ready_future(std::move(result)); } const auto& all_endpoints = tm.get_all_endpoints(); - return make_ready_future(endpoint_set(all_endpoints.begin(), all_endpoints.end())); + return make_ready_future(host_id_set(all_endpoints.begin(), all_endpoints.end())); } size_t everywhere_replication_strategy::get_replication_factor(const token_metadata& tm) const { diff --git a/locator/everywhere_replication_strategy.hh b/locator/everywhere_replication_strategy.hh index a3cd8ab134..a67c584d2c 100644 --- a/locator/everywhere_replication_strategy.hh +++ b/locator/everywhere_replication_strategy.hh @@ -18,7 +18,7 @@ class everywhere_replication_strategy : 
public abstract_replication_strategy { public: everywhere_replication_strategy(const replication_strategy_config_options& config_options); - virtual future calculate_natural_endpoints(const token& search_token, const token_metadata& tm) const override; + virtual future calculate_natural_endpoints(const token& search_token, const token_metadata& tm) const override; virtual void validate_options(const gms::feature_service&) const override { /* noop */ } diff --git a/locator/local_strategy.cc b/locator/local_strategy.cc index 8f213489c4..60409235af 100644 --- a/locator/local_strategy.cc +++ b/locator/local_strategy.cc @@ -18,8 +18,8 @@ local_strategy::local_strategy(const replication_strategy_config_options& config _natural_endpoints_depend_on_token = false; } -future local_strategy::calculate_natural_endpoints(const token& t, const token_metadata& tm) const { - return make_ready_future(endpoint_set({tm.get_topology().my_address()})); +future local_strategy::calculate_natural_endpoints(const token& t, const token_metadata& tm) const { + return make_ready_future(host_id_set{host_id{}}); } void local_strategy::validate_options(const gms::feature_service&) const { diff --git a/locator/local_strategy.hh b/locator/local_strategy.hh index 60e58e1d4b..b4b1660e0e 100644 --- a/locator/local_strategy.hh +++ b/locator/local_strategy.hh @@ -27,7 +27,7 @@ public: virtual ~local_strategy() {}; virtual size_t get_replication_factor(const token_metadata&) const override; - virtual future calculate_natural_endpoints(const token& search_token, const token_metadata& tm) const override; + virtual future calculate_natural_endpoints(const token& search_token, const token_metadata& tm) const override; virtual void validate_options(const gms::feature_service&) const override; diff --git a/locator/network_topology_strategy.cc b/locator/network_topology_strategy.cc index e320fe96ca..fc9cd90572 100644 --- a/locator/network_topology_strategy.cc +++ b/locator/network_topology_strategy.cc @@ -82,7 
+82,7 @@ class natural_endpoints_tracker { */ struct data_center_endpoints { /** List accepted endpoints get pushed into. */ - endpoint_set& _endpoints; + host_id_set& _endpoints; /** * Racks encountered so far. Replicas are put into separate racks while possible. @@ -95,7 +95,7 @@ class natural_endpoints_tracker { size_t _rf_left; ssize_t _acceptable_rack_repeats; - data_center_endpoints(size_t rf, size_t rack_count, size_t node_count, endpoint_set& endpoints, endpoint_dc_rack_set& racks) + data_center_endpoints(size_t rf, size_t rack_count, size_t node_count, host_id_set& endpoints, endpoint_dc_rack_set& racks) : _endpoints(endpoints) , _racks(racks) // If there aren't enough nodes in this DC to fill the RF, the number of nodes is the effective RF. @@ -109,7 +109,7 @@ class natural_endpoints_tracker { * Attempts to add an endpoint to the replicas for this datacenter, adding to the endpoints set if successful. * Returns true if the endpoint was added, and this datacenter does not require further replicas. */ - bool add_endpoint_and_check_if_done(const inet_address& ep, const endpoint_dc_rack& location) { + bool add_endpoint_and_check_if_done(const host_id& ep, const endpoint_dc_rack& location) { if (done()) { return false; } @@ -168,7 +168,7 @@ class natural_endpoints_tracker { // We want to preserve insertion order so that the first added endpoint // becomes primary. 
// - endpoint_set _replicas; + host_id_set _replicas; // tracks the racks we have already placed replicas in endpoint_dc_rack_set _seen_racks; @@ -219,7 +219,7 @@ public: } } - bool add_endpoint_and_check_if_done(inet_address ep) { + bool add_endpoint_and_check_if_done(host_id ep) { auto& loc = _tp.get_location(ep); auto i = _dcs.find(loc.dc); if (i != _dcs.end() && i->second.add_endpoint_and_check_if_done(ep, loc)) { @@ -232,12 +232,12 @@ public: return _dcs_to_fill == 0; } - endpoint_set& replicas() noexcept { + host_id_set& replicas() noexcept { return _replicas; } }; -future +future network_topology_strategy::calculate_natural_endpoints( const token& search_token, const token_metadata& tm) const { @@ -246,7 +246,7 @@ network_topology_strategy::calculate_natural_endpoints( for (auto& next : tm.ring_range(search_token)) { co_await coroutine::maybe_yield(); - inet_address ep = *tm.get_endpoint(next); + host_id ep = *tm.get_endpoint(next); if (tracker.add_endpoint_and_check_if_done(ep)) { break; } @@ -313,7 +313,7 @@ future network_topology_strategy::allocate_tablets_for_new_table(sch if (token_range.begin() == token_range.end()) { token_range = tm->ring_range(dht::minimum_token()); } - inet_address ep = *tm->get_endpoint(*token_range.begin()); + locator::host_id ep = *tm->get_endpoint(*token_range.begin()); token_range.drop_front(); if (tracker.add_endpoint_and_check_if_done(ep)) { break; @@ -322,8 +322,7 @@ future network_topology_strategy::allocate_tablets_for_new_table(sch tablet_replica_set replicas; for (auto&& ep : tracker.replicas()) { - auto host = tm->get_host_id(ep); - replicas.emplace_back(tablet_replica{host, load.next_shard(host)}); + replicas.emplace_back(tablet_replica{ep, load.next_shard(ep)}); } tablets.set_tablet(tb, tablet_info{std::move(replicas)}); diff --git a/locator/network_topology_strategy.hh b/locator/network_topology_strategy.hh index 3f704c98b8..b5f1d118ff 100644 --- a/locator/network_topology_strategy.hh +++ 
b/locator/network_topology_strategy.hh @@ -50,7 +50,7 @@ protected: * calculate endpoints in one pass through the tokens by tracking our * progress in each DC, rack etc. */ - virtual future calculate_natural_endpoints( + virtual future calculate_natural_endpoints( const token& search_token, const token_metadata& tm) const override; virtual void validate_options(const gms::feature_service&) const override; diff --git a/locator/simple_strategy.cc b/locator/simple_strategy.cc index 81006dc800..17730ed2ef 100644 --- a/locator/simple_strategy.cc +++ b/locator/simple_strategy.cc @@ -33,15 +33,15 @@ simple_strategy::simple_strategy(const replication_strategy_config_options& conf } } -future simple_strategy::calculate_natural_endpoints(const token& t, const token_metadata& tm) const { +future simple_strategy::calculate_natural_endpoints(const token& t, const token_metadata& tm) const { const std::vector& tokens = tm.sorted_tokens(); if (tokens.empty()) { - co_return endpoint_set(); + co_return host_id_set{}; } size_t replicas = _replication_factor; - endpoint_set endpoints; + host_id_set endpoints; endpoints.reserve(replicas); for (auto& token : tm.ring_range(t)) { diff --git a/locator/simple_strategy.hh b/locator/simple_strategy.hh index a04e3e2ccd..a4cacccebf 100644 --- a/locator/simple_strategy.hh +++ b/locator/simple_strategy.hh @@ -26,7 +26,7 @@ public: return true; } - virtual future calculate_natural_endpoints(const token& search_token, const token_metadata& tm) const override; + virtual future calculate_natural_endpoints(const token& search_token, const token_metadata& tm) const override; private: size_t _replication_factor = 1; }; diff --git a/locator/tablets.cc b/locator/tablets.cc index 5e7e81679a..ef69433402 100644 --- a/locator/tablets.cc +++ b/locator/tablets.cc @@ -115,7 +115,7 @@ const tablet_map& tablet_metadata::get_tablet_map(table_id id) const { try { return _tablets.at(id); } catch (const std::out_of_range&) { - throw std::runtime_error(format("Tablet 
map not found for table {}", id)); + throw_with_backtrace(format("Tablet map not found for table {}", id)); } } @@ -334,18 +334,11 @@ class tablet_effective_replication_map : public effective_replication_map { table_id _table; tablet_sharder _sharder; private: - gms::inet_address get_endpoint_for_host_id(host_id host) const { - auto endpoint_opt = _tmptr->get_endpoint_for_host_id(host); - if (!endpoint_opt) { - on_internal_error(tablet_logger, format("Host ID {} not found in the cluster", host)); - } - return *endpoint_opt; - } inet_address_vector_replica_set to_replica_set(const tablet_replica_set& replicas) const { inet_address_vector_replica_set result; result.reserve(replicas.size()); for (auto&& replica : replicas) { - result.emplace_back(get_endpoint_for_host_id(replica.host)); + result.emplace_back(_tmptr->get_endpoint_for_host_id(replica.host)); } return result; } @@ -406,7 +399,7 @@ public: case write_replica_set_selector::both: tablet_logger.trace("get_pending_endpoints({}): table={}, tablet={}, replica={}", search_token, _table, tablet, info->pending_replica); - return {get_endpoint_for_host_id(info->pending_replica.host)}; + return {_tmptr->get_endpoint_for_host_id(info->pending_replica.host)}; case write_replica_set_selector::next: return {}; } diff --git a/locator/token_metadata.cc b/locator/token_metadata.cc index 5d883e6f1f..67e1302e0a 100644 --- a/locator/token_metadata.cc +++ b/locator/token_metadata.cc @@ -39,8 +39,6 @@ static void remove_by_value(C& container, V value) { } class token_metadata_impl final { -public: - using inet_address = gms::inet_address; private: /** * Maintains token to endpoint map of every node in the cluster. @@ -48,15 +46,15 @@ private: * multiple tokens. Hence, the BiMultiValMap collection. 
*/ // FIXME: have to be BiMultiValMap - std::unordered_map _token_to_endpoint_map; + std::unordered_map _token_to_endpoint_map; // Track the unique set of nodes in _token_to_endpoint_map - std::unordered_set _normal_token_owners; + std::unordered_set _normal_token_owners; - std::unordered_map _bootstrap_tokens; - std::unordered_set _leaving_endpoints; + std::unordered_map _bootstrap_tokens; + std::unordered_set _leaving_endpoints; // The map between the existing node to be replaced and the replacing node - std::unordered_map _replacing_endpoints; + std::unordered_map _replacing_endpoints; std::optional _topology_change_info; @@ -100,25 +98,25 @@ public: token_metadata_impl(const token_metadata_impl&) = delete; // it's too huge for direct copy, use clone_async() token_metadata_impl(token_metadata_impl&&) noexcept = default; const std::vector& sorted_tokens() const; - future<> update_normal_tokens(std::unordered_set tokens, inet_address endpoint); + future<> update_normal_tokens(std::unordered_set tokens, host_id endpoint); const token& first_token(const token& start) const; size_t first_token_index(const token& start) const; - std::optional get_endpoint(const token& token) const; - std::vector get_tokens(const inet_address& addr) const; - const std::unordered_map& get_token_to_endpoint() const { + std::optional get_endpoint(const token& token) const; + std::vector get_tokens(const host_id& addr) const; + const std::unordered_map& get_token_to_endpoint() const { return _token_to_endpoint_map; } - const std::unordered_set& get_leaving_endpoints() const { + const std::unordered_set& get_leaving_endpoints() const { return _leaving_endpoints; } - const std::unordered_map& get_bootstrap_tokens() const { + const std::unordered_map& get_bootstrap_tokens() const { return _bootstrap_tokens; } - void update_topology(inet_address ep, std::optional opt_dr, std::optional opt_st, std::optional shard_count = std::nullopt) { - _topology.add_or_update_endpoint(ep, std::nullopt, 
std::move(opt_dr), std::move(opt_st), std::move(shard_count)); + void update_topology(host_id id, std::optional opt_dr, std::optional opt_st, std::optional shard_count = std::nullopt) { + _topology.add_or_update_endpoint(id, std::nullopt, std::move(opt_dr), std::move(opt_st), std::move(shard_count)); } /** @@ -158,36 +156,39 @@ public: /// Return the unique host ID for an end-point or nullopt if not found. std::optional get_host_id_if_known(inet_address endpoint) const; - /** Return the end-point for a unique host ID */ - std::optional get_endpoint_for_host_id(host_id) const; + /** Return the end-point for a unique host ID or nullopt if not found.*/ + std::optional get_endpoint_for_host_id_if_known(host_id) const; + + /** Return the end-point for a unique host ID.*/ + inet_address get_endpoint_for_host_id(host_id) const; /** @return a copy of the endpoint-to-id map for read-only operations */ std::unordered_map get_endpoint_to_host_id_map_for_reading() const; - void add_bootstrap_token(token t, inet_address endpoint); + void add_bootstrap_token(token t, host_id endpoint); - void add_bootstrap_tokens(std::unordered_set tokens, inet_address endpoint); + void add_bootstrap_tokens(std::unordered_set tokens, host_id endpoint); void remove_bootstrap_tokens(std::unordered_set tokens); - void add_leaving_endpoint(inet_address endpoint); - void del_leaving_endpoint(inet_address endpoint); + void add_leaving_endpoint(host_id endpoint); + void del_leaving_endpoint(host_id endpoint); public: - void remove_endpoint(inet_address endpoint); + void remove_endpoint(host_id endpoint); - bool is_normal_token_owner(inet_address endpoint) const; + bool is_normal_token_owner(host_id endpoint) const; - bool is_leaving(inet_address endpoint) const; + bool is_leaving(host_id endpoint) const; // Is this node being replaced by another node - bool is_being_replaced(inet_address endpoint) const; + bool is_being_replaced(host_id endpoint) const; // Is any node being replaced by another node 
bool is_any_node_being_replaced() const; - void add_replacing_endpoint(inet_address existing_node, inet_address replacing_node); + void add_replacing_endpoint(host_id existing_node, host_id replacing_node); - void del_replacing_endpoint(inet_address existing_node); + void del_replacing_endpoint(host_id existing_node); public: /** @@ -248,7 +249,7 @@ public: // node that is still joining the cluster, e.g., a node that is still // streaming data before it finishes the bootstrap process and turns into // NORMAL status. - const std::unordered_set& get_all_endpoints() const noexcept { + const std::unordered_set& get_all_endpoints() const noexcept { return _normal_token_owners; } @@ -258,24 +259,11 @@ public: private: future<> update_normal_token_owners(); public: - // returns empty vector if keyspace_name not found. - inet_address_vector_topology_change pending_endpoints_for(const token& token, const sstring& keyspace_name) const; - - std::optional endpoints_for_reading(const token& token, const sstring& keyspace_name) const; - void set_read_new(token_metadata::read_new_t read_new) { _read_new = read_new; } public: - /** @return an endpoint to token multimap representation of tokenToEndpointMap (a copy) */ - std::multimap get_endpoint_to_token_map_for_reading() const; - /** - * @return a (stable copy, won't be modified) Token to Endpoint map for all the normal and bootstrapping nodes - * in the cluster. 
- */ - std::map get_normal_and_bootstrapping_token_to_endpoint_map() const; - long get_ring_version() const { return _ring_version; } @@ -417,7 +405,7 @@ const std::vector& token_metadata_impl::sorted_tokens() const { return _sorted_tokens; } -std::vector token_metadata_impl::get_tokens(const inet_address& addr) const { +std::vector token_metadata_impl::get_tokens(const host_id& addr) const { std::vector res; for (auto&& i : _token_to_endpoint_map) { if (i.second == addr) { @@ -428,12 +416,12 @@ std::vector token_metadata_impl::get_tokens(const inet_address& addr) con return res; } -future<> token_metadata_impl::update_normal_tokens(std::unordered_set tokens, inet_address endpoint) { +future<> token_metadata_impl::update_normal_tokens(std::unordered_set tokens, host_id endpoint) { if (tokens.empty()) { co_return; } - if (!_topology.has_endpoint(endpoint)) { + if (!_topology.has_node(endpoint)) { on_internal_error(tlogger, format("token_metadata_impl: {} must be a member of topology to update normal tokens", endpoint)); } @@ -467,7 +455,7 @@ future<> token_metadata_impl::update_normal_tokens(std::unordered_set tok for (const token& t : tokens) { co_await coroutine::maybe_yield(); - auto prev = _token_to_endpoint_map.insert(std::pair(t, endpoint)); + auto prev = _token_to_endpoint_map.insert(std::pair(t, endpoint)); should_sort_tokens |= prev.second; // new token inserted -> sort if (prev.first->second != endpoint) { tlogger.debug("Token {} changing ownership from {} to {}", t, prev.first->second, endpoint); @@ -503,7 +491,7 @@ const token& token_metadata_impl::first_token(const token& start) const { return _sorted_tokens[first_token_index(start)]; } -std::optional token_metadata_impl::get_endpoint(const token& token) const { +std::optional token_metadata_impl::get_endpoint(const token& token) const { auto it = _token_to_endpoint_map.find(token); if (it == _token_to_endpoint_map.end()) { return std::nullopt; @@ -528,14 +516,14 @@ void 
token_metadata_impl::debug_show() const { } void token_metadata_impl::update_host_id(const host_id& host_id, inet_address endpoint) { - _topology.add_or_update_endpoint(endpoint, host_id); + _topology.add_or_update_endpoint(host_id, endpoint); } host_id token_metadata_impl::get_host_id(inet_address endpoint) const { if (const auto* node = _topology.find_node(endpoint)) [[likely]] { return node->host_id(); } else { - throw std::runtime_error(format("host_id for endpoint {} is not found", endpoint)); + on_internal_error(tlogger, format("host_id for endpoint {} is not found", endpoint)); } } @@ -547,7 +535,7 @@ std::optional token_metadata_impl::get_host_id_if_known(inet_address en } } -std::optional token_metadata_impl::get_endpoint_for_host_id(host_id host_id) const { +std::optional token_metadata_impl::get_endpoint_for_host_id_if_known(host_id host_id) const { if (const auto* node = _topology.find_node(host_id)) [[likely]] { return node->endpoint(); } else { @@ -555,6 +543,14 @@ std::optional token_metadata_impl::get_endpoint_for_host_id(host_i } } +inet_address token_metadata_impl::get_endpoint_for_host_id(host_id host_id) const { + if (const auto* node = _topology.find_node(host_id)) [[likely]] { + return node->endpoint(); + } else { + on_internal_error(tlogger, format("endpoint for host_id {} is not found", host_id)); + } +} + std::unordered_map token_metadata_impl::get_endpoint_to_host_id_map_for_reading() const { const auto& nodes = _topology.get_nodes_by_endpoint(); std::unordered_map map; @@ -573,11 +569,11 @@ std::unordered_map token_metadata_impl::get_endpoint_to_h return map; } -bool token_metadata_impl::is_normal_token_owner(inet_address endpoint) const { +bool token_metadata_impl::is_normal_token_owner(host_id endpoint) const { return _normal_token_owners.contains(endpoint); } -void token_metadata_impl::add_bootstrap_token(token t, inet_address endpoint) { +void token_metadata_impl::add_bootstrap_token(token t, host_id endpoint) { std::unordered_set 
tokens{t}; add_bootstrap_tokens(tokens, endpoint); } @@ -587,7 +583,7 @@ token_metadata_impl::ring_range(const dht::ring_position_view start) const { return ring_range(start.token()); } -void token_metadata_impl::add_bootstrap_tokens(std::unordered_set tokens, inet_address endpoint) { +void token_metadata_impl::add_bootstrap_tokens(std::unordered_set tokens, host_id endpoint) { for (auto t : tokens) { auto old_endpoint = _bootstrap_tokens.find(t); if (old_endpoint != _bootstrap_tokens.end() && (*old_endpoint).second != endpoint) { @@ -602,7 +598,7 @@ void token_metadata_impl::add_bootstrap_tokens(std::unordered_set tokens, } } - std::erase_if(_bootstrap_tokens, [endpoint] (const std::pair& n) { return n.second == endpoint; }); + std::erase_if(_bootstrap_tokens, [endpoint] (const std::pair& n) { return n.second == endpoint; }); for (auto t : tokens) { _bootstrap_tokens[t] = endpoint; @@ -619,11 +615,11 @@ void token_metadata_impl::remove_bootstrap_tokens(std::unordered_set toke } } -bool token_metadata_impl::is_leaving(inet_address endpoint) const { +bool token_metadata_impl::is_leaving(host_id endpoint) const { return _leaving_endpoints.contains(endpoint); } -bool token_metadata_impl::is_being_replaced(inet_address endpoint) const { +bool token_metadata_impl::is_being_replaced(host_id endpoint) const { return _replacing_endpoints.contains(endpoint); } @@ -631,7 +627,7 @@ bool token_metadata_impl::is_any_node_being_replaced() const { return !_replacing_endpoints.empty(); } -void token_metadata_impl::remove_endpoint(inet_address endpoint) { +void token_metadata_impl::remove_endpoint(host_id endpoint) { remove_by_value(_bootstrap_tokens, endpoint); remove_by_value(_token_to_endpoint_map, endpoint); _normal_token_owners.erase(endpoint); @@ -732,13 +728,11 @@ future<> token_metadata_impl::update_topology_change_info(dc_rack_fn& get_dc_rac co_return; } - // true if there is a node replaced with the same IP - bool replace_with_same_endpoint = false; // 
target_token_metadata incorporates all the changes from leaving, bootstrapping and replacing auto target_token_metadata = co_await clone_only_token_map(false); { // construct new_normal_tokens based on _bootstrap_tokens and _replacing_endpoints - std::unordered_map> new_normal_tokens; + std::unordered_map> new_normal_tokens; if (!_replacing_endpoints.empty()) { for (const auto& [token, inet_address]: _token_to_endpoint_map) { const auto it = _replacing_endpoints.find(inet_address); @@ -748,11 +742,7 @@ future<> token_metadata_impl::update_topology_change_info(dc_rack_fn& get_dc_rac new_normal_tokens[it->second].insert(token); } for (const auto& [replace_from, replace_to]: _replacing_endpoints) { - if (replace_from == replace_to) { - replace_with_same_endpoint = true; - } else { - target_token_metadata->remove_endpoint(replace_from); - } + target_token_metadata->remove_endpoint(replace_from); } } for (const auto& [token, inet_address]: _bootstrap_tokens) { @@ -770,22 +760,6 @@ future<> token_metadata_impl::update_topology_change_info(dc_rack_fn& get_dc_rac target_token_metadata->sort_tokens(); } - // We require a distinct token_metadata instance when replace_from equals replace_to, - // as it ensures the node is included in pending_ranges. - // Otherwise, the node would be excluded from both pending_ranges and - // get_natural_endpoints_without_node_being_replaced, - // causing the coordinator to overlook it entirely. 
- std::unique_ptr base_token_metadata; - if (replace_with_same_endpoint) { - base_token_metadata = co_await clone_only_token_map(false); - for (const auto& [replace_from, replace_to]: _replacing_endpoints) { - if (replace_from == replace_to) { - base_token_metadata->remove_endpoint(replace_from); - } - } - base_token_metadata->sort_tokens(); - } - // merge tokens from token_to_endpoint and bootstrap_tokens, // preserving tokens of leaving endpoints auto all_tokens = std::vector(); @@ -798,8 +772,7 @@ future<> token_metadata_impl::update_topology_change_info(dc_rack_fn& get_dc_rac std::sort(begin(all_tokens), end(all_tokens)); auto prev_value = std::move(_topology_change_info); - _topology_change_info.emplace(token_metadata(std::move(target_token_metadata)), - base_token_metadata ? std::optional(token_metadata(std::move(base_token_metadata))): std::nullopt, + _topology_change_info.emplace(make_lw_shared(std::move(target_token_metadata)), std::move(all_tokens), _read_new); co_await utils::clear_gently(prev_value); @@ -810,7 +783,7 @@ size_t token_metadata_impl::count_normal_token_owners() const { } future<> token_metadata_impl::update_normal_token_owners() { - std::unordered_set eps; + std::unordered_set eps; for (auto [t, ep]: _token_to_endpoint_map) { eps.insert(ep); co_await coroutine::maybe_yield(); @@ -818,21 +791,24 @@ future<> token_metadata_impl::update_normal_token_owners() { _normal_token_owners = std::move(eps); } -void token_metadata_impl::add_leaving_endpoint(inet_address endpoint) { +void token_metadata_impl::add_leaving_endpoint(host_id endpoint) { _leaving_endpoints.emplace(endpoint); } -void token_metadata_impl::del_leaving_endpoint(inet_address endpoint) { +void token_metadata_impl::del_leaving_endpoint(host_id endpoint) { _leaving_endpoints.erase(endpoint); } -void token_metadata_impl::add_replacing_endpoint(inet_address existing_node, inet_address replacing_node) { +void token_metadata_impl::add_replacing_endpoint(host_id existing_node, host_id 
replacing_node) { +    if (existing_node == replacing_node) { +        on_internal_error(tlogger, format("Can't replace node {} with itself", existing_node)); +    } tlogger.info("Added node {} as pending replacing endpoint which replaces existing node {}", replacing_node, existing_node); _replacing_endpoints[existing_node] = replacing_node; } -void token_metadata_impl::del_replacing_endpoint(inet_address existing_node) { +void token_metadata_impl::del_replacing_endpoint(host_id existing_node) { if (_replacing_endpoints.contains(existing_node)) { tlogger.info("Removed node {} as pending replacing endpoint which replaces existing node {}", _replacing_endpoints[existing_node], existing_node); @@ -840,26 +816,10 @@ void token_metadata_impl::del_replacing_endpoint(inet_address existing_node) { -std::map token_metadata_impl::get_normal_and_bootstrapping_token_to_endpoint_map() const { - std::map ret(_token_to_endpoint_map.begin(), _token_to_endpoint_map.end()); - ret.insert(_bootstrap_tokens.begin(), _bootstrap_tokens.end()); - return ret; -} - -std::multimap token_metadata_impl::get_endpoint_to_token_map_for_reading() const { - std::multimap cloned; - for (const auto& x : _token_to_endpoint_map) { - cloned.emplace(x.second, x.first); - } - return cloned; -} - -topology_change_info::topology_change_info(token_metadata target_token_metadata_, - std::optional base_token_metadata_, - std::vector all_tokens_, - token_metadata::read_new_t read_new_) +topology_change_info::topology_change_info(lw_shared_ptr target_token_metadata_, + std::vector all_tokens_, + token_metadata::read_new_t read_new_) : target_token_metadata(std::move(target_token_metadata_)) - , base_token_metadata(std::move(base_token_metadata_)) , all_tokens(std::move(all_tokens_)) , read_new(read_new_) { @@ -867,21 +827,21 @@ topology_change_info::topology_change_info(token_metadata target_token_metadata_ future<> topology_change_info::clear_gently() { co_await 
utils::clear_gently(target_token_metadata); - co_await utils::clear_gently(base_token_metadata); co_await utils::clear_gently(all_tokens); } token_metadata::token_metadata(std::unique_ptr impl) - : _impl(std::move(impl)) { + : _impl(std::move(impl)) +{ } token_metadata::token_metadata(config cfg) - : _impl(std::make_unique(std::move(cfg))) { + : _impl(std::make_unique(cfg)) +{ } token_metadata::~token_metadata() = default; - token_metadata::token_metadata(token_metadata&&) noexcept = default; token_metadata& token_metadata::token_metadata::operator=(token_metadata&&) noexcept = default; @@ -892,7 +852,7 @@ token_metadata::sorted_tokens() const { } future<> -token_metadata::update_normal_tokens(std::unordered_set tokens, inet_address endpoint) { +token_metadata::update_normal_tokens(std::unordered_set tokens, host_id endpoint) { return _impl->update_normal_tokens(std::move(tokens), endpoint); } @@ -906,33 +866,33 @@ token_metadata::first_token_index(const token& start) const { return _impl->first_token_index(start); } -std::optional +std::optional token_metadata::get_endpoint(const token& token) const { return _impl->get_endpoint(token); } std::vector -token_metadata::get_tokens(const inet_address& addr) const { +token_metadata::get_tokens(const host_id& addr) const { return _impl->get_tokens(addr); } -const std::unordered_map& +const std::unordered_map& token_metadata::get_token_to_endpoint() const { return _impl->get_token_to_endpoint(); } -const std::unordered_set& +const std::unordered_set& token_metadata::get_leaving_endpoints() const { return _impl->get_leaving_endpoints(); } -const std::unordered_map& +const std::unordered_map& token_metadata::get_bootstrap_tokens() const { return _impl->get_bootstrap_tokens(); } void -token_metadata::update_topology(inet_address ep, std::optional opt_dr, std::optional opt_st, std::optional shard_count) { +token_metadata::update_topology(host_id ep, std::optional opt_dr, std::optional opt_st, std::optional shard_count) { 
_impl->update_topology(ep, std::move(opt_dr), std::move(opt_st), std::move(shard_count)); } @@ -1006,6 +966,11 @@ token_metadata::get_host_id_if_known(inet_address endpoint) const { } std::optional +token_metadata::get_endpoint_for_host_id_if_known(host_id host_id) const { + return _impl->get_endpoint_for_host_id_if_known(host_id); +} + +token_metadata::inet_address token_metadata::get_endpoint_for_host_id(host_id host_id) const { return _impl->get_endpoint_for_host_id(host_id); } @@ -1022,12 +987,12 @@ token_metadata::get_endpoint_to_host_id_map_for_reading() const { } void -token_metadata::add_bootstrap_token(token t, inet_address endpoint) { +token_metadata::add_bootstrap_token(token t, host_id endpoint) { _impl->add_bootstrap_token(t, endpoint); } void -token_metadata::add_bootstrap_tokens(std::unordered_set tokens, inet_address endpoint) { +token_metadata::add_bootstrap_tokens(std::unordered_set tokens, host_id endpoint) { _impl->add_bootstrap_tokens(std::move(tokens), endpoint); } @@ -1037,33 +1002,33 @@ token_metadata::remove_bootstrap_tokens(std::unordered_set tokens) { } void -token_metadata::add_leaving_endpoint(inet_address endpoint) { +token_metadata::add_leaving_endpoint(host_id endpoint) { _impl->add_leaving_endpoint(endpoint); } void -token_metadata::del_leaving_endpoint(inet_address endpoint) { +token_metadata::del_leaving_endpoint(host_id endpoint) { _impl->del_leaving_endpoint(endpoint); } void -token_metadata::remove_endpoint(inet_address endpoint) { +token_metadata::remove_endpoint(host_id endpoint) { _impl->remove_endpoint(endpoint); _impl->sort_tokens(); } bool -token_metadata::is_normal_token_owner(inet_address endpoint) const { +token_metadata::is_normal_token_owner(host_id endpoint) const { return _impl->is_normal_token_owner(endpoint); } bool -token_metadata::is_leaving(inet_address endpoint) const { +token_metadata::is_leaving(host_id endpoint) const { return _impl->is_leaving(endpoint); } bool 
-token_metadata::is_being_replaced(inet_address endpoint) const { +token_metadata::is_being_replaced(host_id endpoint) const { return _impl->is_being_replaced(endpoint); } @@ -1072,32 +1037,26 @@ token_metadata::is_any_node_being_replaced() const { return _impl->is_any_node_being_replaced(); } -void token_metadata::add_replacing_endpoint(inet_address existing_node, inet_address replacing_node) { +void token_metadata::add_replacing_endpoint(host_id existing_node, host_id replacing_node) { _impl->add_replacing_endpoint(existing_node, replacing_node); } -void token_metadata::del_replacing_endpoint(inet_address existing_node) { +void token_metadata::del_replacing_endpoint(host_id existing_node) { _impl->del_replacing_endpoint(existing_node); } future token_metadata::clone_async() const noexcept { - return _impl->clone_async().then([] (std::unique_ptr impl) { - return make_ready_future(std::move(impl)); - }); + co_return token_metadata(co_await _impl->clone_async()); } future token_metadata::clone_only_token_map() const noexcept { - return _impl->clone_only_token_map().then([] (std::unique_ptr impl) { - return token_metadata(std::move(impl)); - }); + co_return token_metadata(co_await _impl->clone_only_token_map()); } future token_metadata::clone_after_all_left() const noexcept { - return _impl->clone_after_all_left().then([] (std::unique_ptr impl) { - return token_metadata(std::move(impl)); - }); + co_return token_metadata(co_await _impl->clone_after_all_left()); } future<> token_metadata::clear_gently() noexcept { @@ -1139,11 +1098,21 @@ token_metadata::get_predecessor(token t) const { return _impl->get_predecessor(t); } -const std::unordered_set& +const std::unordered_set& token_metadata::get_all_endpoints() const { return _impl->get_all_endpoints(); } +std::unordered_set token_metadata::get_all_ips() const { + const auto& host_ids = _impl->get_all_endpoints(); + std::unordered_set result; + result.reserve(host_ids.size()); + for (const auto& id: host_ids) { + 
result.insert(_impl->get_endpoint_for_host_id(id)); + } + return result; +} + size_t token_metadata::count_normal_token_owners() const { return _impl->count_normal_token_owners(); @@ -1154,16 +1123,6 @@ token_metadata::set_read_new(read_new_t read_new) { _impl->set_read_new(read_new); } -std::multimap -token_metadata::get_endpoint_to_token_map_for_reading() const { - return _impl->get_endpoint_to_token_map_for_reading(); -} - -std::map -token_metadata::get_normal_and_bootstrapping_token_to_endpoint_map() const { - return _impl->get_normal_and_bootstrapping_token_to_endpoint_map(); -} - long token_metadata::get_ring_version() const { return _impl->get_ring_version(); @@ -1294,7 +1253,7 @@ host_id_or_endpoint::host_id_or_endpoint(const sstring& s, param_type restrict) void host_id_or_endpoint::resolve(const token_metadata& tm) { if (id) { - auto endpoint_opt = tm.get_endpoint_for_host_id(id); + auto endpoint_opt = tm.get_endpoint_for_host_id_if_known(id); if (!endpoint_opt) { throw std::runtime_error(format("Host ID {} not found in the cluster", id)); } diff --git a/locator/token_metadata.hh b/locator/token_metadata.hh index 66122cc34a..b798b47ab0 100644 --- a/locator/token_metadata.hh +++ b/locator/token_metadata.hh @@ -76,13 +76,6 @@ struct topology_change_info; class token_metadata final { std::unique_ptr _impl; -public: - struct config { - topology::config topo_cfg; - }; - using inet_address = gms::inet_address; - using version_t = service::topology::version_t; - using version_tracker_t = utils::phased_barrier::operation; private: friend class token_metadata_ring_splitter; class tokens_iterator { @@ -107,6 +100,13 @@ private: }; public: + struct config { + topology::config topo_cfg; + }; + using inet_address = gms::inet_address; + using version_t = service::topology::version_t; + using version_tracker_t = utils::phased_barrier::operation; + token_metadata(config cfg); explicit token_metadata(std::unique_ptr impl); token_metadata(token_metadata&&) noexcept; // 
Can't use "= default;" - hits some static_assert in unique_ptr @@ -121,19 +121,21 @@ public: // // Note: the function is not exception safe! // It must be called only on a temporary copy of the token_metadata - future<> update_normal_tokens(std::unordered_set tokens, inet_address endpoint); + future<> update_normal_tokens(std::unordered_set tokens, host_id endpoint); const token& first_token(const token& start) const; size_t first_token_index(const token& start) const; - std::optional get_endpoint(const token& token) const; - std::vector get_tokens(const inet_address& addr) const; - const std::unordered_map& get_token_to_endpoint() const; - const std::unordered_set& get_leaving_endpoints() const; - const std::unordered_map& get_bootstrap_tokens() const; + std::optional get_endpoint(const token& token) const; + std::vector get_tokens(const host_id& addr) const; + const std::unordered_map& get_token_to_endpoint() const; + const std::unordered_set& get_leaving_endpoints() const; + const std::unordered_map& get_bootstrap_tokens() const; /** - * Update or add endpoint given its inet_address and endpoint_dc_rack. + * Update or add a node for a given host_id. + * The other arguments (dc, state, shard_count) are optional, i.e. the corresponding node + * fields won't be updated if std::nullopt is passed. */ - void update_topology(inet_address ep, std::optional opt_dr, std::optional opt_st = std::nullopt, + void update_topology(host_id ep, std::optional opt_dr, std::optional opt_st = std::nullopt, std::optional shard_count = std::nullopt); /** * Creates an iterable range of the sorted tokens starting at the token t @@ -169,8 +171,11 @@ public: /// Return the unique host ID for an end-point or nullopt if not found. std::optional get_host_id_if_known(inet_address endpoint) const; + /** Return the end-point for a unique host ID or nullopt if not found. 
*/ + std::optional get_endpoint_for_host_id_if_known(locator::host_id host_id) const; + /** Return the end-point for a unique host ID */ - std::optional get_endpoint_for_host_id(locator::host_id host_id) const; + inet_address get_endpoint_for_host_id(locator::host_id host_id) const; /// Parses the \c host_id_string either as a host uuid or as an ip address and returns the mapping. /// Throws std::invalid_argument on parse error or std::runtime_error if the host_id wasn't found. @@ -182,32 +187,32 @@ public: /// Returns host_id of the local node. host_id get_my_id() const; - void add_bootstrap_token(token t, inet_address endpoint); + void add_bootstrap_token(token t, host_id endpoint); - void add_bootstrap_tokens(std::unordered_set tokens, inet_address endpoint); + void add_bootstrap_tokens(std::unordered_set tokens, host_id endpoint); void remove_bootstrap_tokens(std::unordered_set tokens); - void add_leaving_endpoint(inet_address endpoint); - void del_leaving_endpoint(inet_address endpoint); + void add_leaving_endpoint(host_id endpoint); + void del_leaving_endpoint(host_id endpoint); - void remove_endpoint(inet_address endpoint); + void remove_endpoint(host_id endpoint); // Checks if the node is part of the token ring. If yes, the node is one of // the nodes that owns the tokens and inside the set _normal_token_owners. 
- bool is_normal_token_owner(inet_address endpoint) const; + bool is_normal_token_owner(host_id endpoint) const; - bool is_leaving(inet_address endpoint) const; + bool is_leaving(host_id endpoint) const; // Is this node being replaced by another node - bool is_being_replaced(inet_address endpoint) const; + bool is_being_replaced(host_id endpoint) const; // Is any node being replaced by another node bool is_any_node_being_replaced() const; - void add_replacing_endpoint(inet_address existing_node, inet_address replacing_node); + void add_replacing_endpoint(host_id existing_node, host_id replacing_node); - void del_replacing_endpoint(inet_address existing_node); + void del_replacing_endpoint(host_id existing_node); /** * Create a full copy of token_metadata using asynchronous continuations. @@ -257,7 +262,9 @@ public: token get_predecessor(token t) const; - const std::unordered_set& get_all_endpoints() const; + const std::unordered_set& get_all_endpoints() const; + + std::unordered_set get_all_ips() const; /* Returns the number of different endpoints that own tokens in the ring. * Bootstrapping tokens are not taken into account. */ @@ -271,14 +278,6 @@ public: using read_new_t = bool_class; void set_read_new(read_new_t value); - /** @return an endpoint to token multimap representation of tokenToEndpointMap (a copy) */ - std::multimap get_endpoint_to_token_map_for_reading() const; - /** - * @return a (stable copy, won't be modified) Token to Endpoint map for all the normal and bootstrapping nodes - * in the cluster. 
- */ - std::map get_normal_and_bootstrapping_token_to_endpoint_map() const; - long get_ring_version() const; void invalidate_cached_rings(); @@ -292,13 +291,11 @@ private: }; struct topology_change_info { - token_metadata target_token_metadata; - std::optional base_token_metadata; + lw_shared_ptr target_token_metadata; std::vector all_tokens; token_metadata::read_new_t read_new; - topology_change_info(token_metadata target_token_metadata_, - std::optional base_token_metadata_, + topology_change_info(lw_shared_ptr target_token_metadata_, std::vector all_tokens_, token_metadata::read_new_t read_new_); future<> clear_gently(); diff --git a/locator/topology.cc b/locator/topology.cc index 39be2acc96..4f8c75cb97 100644 --- a/locator/topology.cc +++ b/locator/topology.cc @@ -316,7 +316,12 @@ void topology::index_node(const node* node) { if (node->endpoint() != inet_address{}) { auto eit = _nodes_by_endpoint.find(node->endpoint()); if (eit != _nodes_by_endpoint.end()) { - if (eit->second->is_leaving() || eit->second->left()) { + if (eit->second->get_state() == node::state::replacing && node->get_state() == node::state::being_replaced) { + // replace-with-same-ip, map ip to the old node + _nodes_by_endpoint.erase(node->endpoint()); + } else if (eit->second->get_state() == node::state::being_replaced && node->get_state() == node::state::replacing) { + // replace-with-same-ip, map ip to the old node, do nothing if it's already the case + } else if (eit->second->is_leaving() || eit->second->left()) { _nodes_by_endpoint.erase(node->endpoint()); } else if (!node->is_leaving() && !node->left()) { if (node->host_id()) { @@ -437,30 +442,32 @@ const node* topology::find_node(node::idx_type idx) const noexcept { return _nodes.at(idx).get(); } -const node* topology::add_or_update_endpoint(inet_address ep, std::optional opt_id, std::optional opt_dr, std::optional opt_st, std::optional shard_count) +const node* topology::add_or_update_endpoint(host_id id, std::optional opt_ep, 
std::optional opt_dr, std::optional opt_st, std::optional shard_count) { if (tlogger.is_enabled(log_level::trace)) { - tlogger.trace("topology[{}]: add_or_update_endpoint: ep={} host_id={} dc={} rack={} state={} shards={}, at {}", fmt::ptr(this), - ep, opt_id.value_or(host_id::create_null_id()), opt_dr.value_or(endpoint_dc_rack{}).dc, opt_dr.value_or(endpoint_dc_rack{}).rack, opt_st.value_or(node::state::none), shard_count, + tlogger.trace("topology[{}]: add_or_update_endpoint: host_id={} ep={} dc={} rack={} state={} shards={}, at {}", fmt::ptr(this), + id, opt_ep, opt_dr.value_or(endpoint_dc_rack{}).dc, opt_dr.value_or(endpoint_dc_rack{}).rack, opt_st.value_or(node::state::none), shard_count, current_backtrace()); } - auto n = find_node(ep); + + const auto* n = find_node(id); if (n) { - return update_node(make_mutable(n), opt_id, std::nullopt, std::move(opt_dr), std::move(opt_st), std::move(shard_count)); - } else if (opt_id && (n = find_node(*opt_id))) { - return update_node(make_mutable(n), std::nullopt, ep, std::move(opt_dr), std::move(opt_st), std::move(shard_count)); - } else { - return add_node(opt_id.value_or(host_id::create_null_id()), ep, - opt_dr.value_or(endpoint_dc_rack::default_location), - opt_st.value_or(node::state::normal), - shard_count.value_or(0)); + return update_node(make_mutable(n), std::nullopt, opt_ep, std::move(opt_dr), std::move(opt_st), std::move(shard_count)); + } else if (opt_ep && (n = find_node(*opt_ep))) { + return update_node(make_mutable(n), id, std::nullopt, std::move(opt_dr), std::move(opt_st), std::move(shard_count)); } + + return add_node(id, + opt_ep.value_or(inet_address{}), + opt_dr.value_or(endpoint_dc_rack::default_location), + opt_st.value_or(node::state::normal), + shard_count.value_or(0)); } -bool topology::remove_endpoint(inet_address ep) +bool topology::remove_endpoint(locator::host_id host_id) { - auto node = find_node(ep); - tlogger.debug("topology[{}]: remove_endpoint: endpoint={}: {}", fmt::ptr(this), ep, 
debug_format(node)); + auto node = find_node(host_id); + tlogger.debug("topology[{}]: remove_endpoint: host_id={}: {}", fmt::ptr(this), host_id, debug_format(node)); if (node) { remove_node(node); return true; diff --git a/locator/topology.hh b/locator/topology.hh index 7b35f0969e..77bb367332 100644 --- a/locator/topology.hh +++ b/locator/topology.hh @@ -234,24 +234,12 @@ public: * * Adds or updates a node with given endpoint */ - const node* add_or_update_endpoint(inet_address ep, std::optional opt_id, - std::optional opt_dr, - std::optional opt_st, + const node* add_or_update_endpoint(host_id id, std::optional opt_ep, + std::optional opt_dr = std::nullopt, + std::optional opt_st = std::nullopt, std::optional shard_count = std::nullopt); - // Legacy entry point from token_metadata::update_topology - const node* add_or_update_endpoint(inet_address ep, endpoint_dc_rack dr, std::optional opt_st) { - return add_or_update_endpoint(ep, std::nullopt, std::move(dr), std::move(opt_st), std::nullopt); - } - const node* add_or_update_endpoint(inet_address ep, host_id id) { - return add_or_update_endpoint(ep, id, std::nullopt, std::nullopt, std::nullopt); - } - - /** - * Removes current DC/rack assignment for ep - * Returns true if the node was found and removed. - */ - bool remove_endpoint(inet_address ep); + bool remove_endpoint(locator::host_id ep); /** * Returns true iff contains given endpoint. 
@@ -319,7 +307,7 @@ public: } auto get_local_dc_filter() const noexcept { - return [ this, local_dc = get_datacenter() ] (inet_address ep) { + return [ this, local_dc = get_datacenter() ] (auto ep) { return get_datacenter(ep) == local_dc; }; }; diff --git a/locator/types.hh b/locator/types.hh index 4625062ead..3f2783f3fe 100644 --- a/locator/types.hh +++ b/locator/types.hh @@ -31,6 +31,6 @@ struct endpoint_dc_rack { bool operator==(const endpoint_dc_rack&) const = default; }; -using dc_rack_fn = seastar::noncopyable_function(inet_address)>; +using dc_rack_fn = seastar::noncopyable_function(host_id)>; } // namespace locator diff --git a/main.cc b/main.cc index 2dd5a833e6..7b3504ab50 100644 --- a/main.cc +++ b/main.cc @@ -1211,7 +1211,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl // Raft topology discard the endpoint-to-id map, so the local id can // still be found in the config. tm.get_topology().set_host_id_cfg(host_id); - tm.get_topology().add_or_update_endpoint(endpoint, host_id); + tm.get_topology().add_or_update_endpoint(host_id, endpoint); return make_ready_future<>(); }).get(); diff --git a/node_ops/node_ops_ctl.hh b/node_ops/node_ops_ctl.hh index cc2d9dfdab..6cba6b8cc0 100644 --- a/node_ops/node_ops_ctl.hh +++ b/node_ops/node_ops_ctl.hh @@ -13,6 +13,7 @@ #include "locator/host_id.hh" #include "node_ops/id.hh" #include "schema/schema_fwd.hh" +#include "locator/host_id.hh" #include diff --git a/repair/repair.cc b/repair/repair.cc index 0363322339..f64f0feb5f 100644 --- a/repair/repair.cc +++ b/repair/repair.cc @@ -221,7 +221,7 @@ static std::vector get_neighbors( dht::token tok = range.end() ? 
range.end()->value() : dht::maximum_token(); auto ret = erm.get_natural_endpoints(tok); if (small_table_optimization) { - auto normal_nodes = erm.get_token_metadata().get_all_endpoints(); + auto normal_nodes = erm.get_token_metadata().get_all_ips(); ret = inet_address_vector_replica_set(normal_nodes.begin(), normal_nodes.end()); } auto my_address = erm.get_topology().my_address(); @@ -1231,13 +1231,13 @@ future<> repair::user_requested_repair_task_impl::run() { bool hints_batchlog_flushed = false; std::list participants; if (_small_table_optimization) { - auto normal_nodes = germs->get().get_token_metadata().get_all_endpoints(); + auto normal_nodes = germs->get().get_token_metadata().get_all_ips(); participants = std::list(normal_nodes.begin(), normal_nodes.end()); } else { participants = get_hosts_participating_in_repair(germs->get(), keyspace, ranges, data_centers, hosts, ignore_nodes).get(); } if (needs_flush_before_repair) { - auto waiting_nodes = db.get_token_metadata().get_all_endpoints(); + auto waiting_nodes = db.get_token_metadata().get_all_ips(); std::erase_if(waiting_nodes, [&] (const auto& addr) { return ignore_nodes.contains(addr); }); @@ -1500,7 +1500,7 @@ future<> repair_service::bootstrap_with_repair(locator::token_metadata_ptr tmptr auto ks_erms = db.get_non_local_strategy_keyspaces_erms(); auto& topology = tmptr->get_topology(); auto myloc = topology.get_location(); - auto myip = topology.my_address(); + auto myid = tmptr->get_my_id(); auto reason = streaming::stream_reason::bootstrap; // Calculate number of ranges to sync data size_t nr_ranges_total = 0; @@ -1509,7 +1509,7 @@ future<> repair_service::bootstrap_with_repair(locator::token_metadata_ptr tmptr continue; } auto& strat = erm->get_replication_strategy(); - dht::token_range_vector desired_ranges = strat.get_pending_address_ranges(tmptr, tokens, myip, myloc).get0(); + dht::token_range_vector desired_ranges = strat.get_pending_address_ranges(tmptr, tokens, myid, myloc).get0(); 
seastar::thread::maybe_yield(); auto nr_tables = get_nr_tables(db, keyspace_name); nr_ranges_total += desired_ranges.size() * nr_tables; @@ -1525,7 +1525,7 @@ future<> repair_service::bootstrap_with_repair(locator::token_metadata_ptr tmptr continue; } auto& strat = erm->get_replication_strategy(); - dht::token_range_vector desired_ranges = strat.get_pending_address_ranges(tmptr, tokens, myip, myloc).get0(); + dht::token_range_vector desired_ranges = strat.get_pending_address_ranges(tmptr, tokens, myid, myloc).get0(); bool find_node_in_local_dc_only = strat.get_type() == locator::replication_strategy_type::network_topology; bool everywhere_topology = strat.get_type() == locator::replication_strategy_type::everywhere_topology; auto replication_factor = erm->get_replication_factor(); @@ -1535,8 +1535,8 @@ future<> repair_service::bootstrap_with_repair(locator::token_metadata_ptr tmptr auto range_addresses = strat.get_range_addresses(metadata_clone).get0(); //Pending ranges - metadata_clone.update_topology(myip, myloc, locator::node::state::bootstrapping); - metadata_clone.update_normal_tokens(tokens, myip).get(); + metadata_clone.update_topology(myid, myloc, locator::node::state::bootstrapping); + metadata_clone.update_normal_tokens(tokens, myid).get(); auto pending_range_addresses = strat.get_range_addresses(metadata_clone).get0(); metadata_clone.clear_gently().get(); @@ -1676,6 +1676,7 @@ future<> repair_service::do_decommission_removenode_with_repair(locator::token_m auto& db = get_db().local(); auto& topology = tmptr->get_topology(); auto myip = topology.my_address(); + const auto leaving_node_id = tmptr->get_host_id(leaving_node); auto ks_erms = db.get_non_local_strategy_keyspaces_erms(); auto local_dc = topology.get_datacenter(); bool is_removenode = myip != leaving_node; @@ -1719,15 +1720,15 @@ future<> repair_service::do_decommission_removenode_with_repair(locator::token_m // Find (for each range) all nodes that store replicas for these ranges as well for 
(auto& r : ranges) { auto end_token = r.end() ? r.end()->value() : dht::maximum_token(); - auto eps = strat.calculate_natural_endpoints(end_token, *tmptr).get0(); + auto eps = strat.calculate_natural_ips(end_token, *tmptr).get0(); current_replica_endpoints.emplace(r, std::move(eps)); seastar::thread::maybe_yield(); } auto temp = tmptr->clone_after_all_left().get0(); // leaving_node might or might not be 'leaving'. If it was not leaving (that is, removenode // command was used), it is still present in temp and must be removed. - if (temp.is_normal_token_owner(leaving_node)) { - temp.remove_endpoint(leaving_node); + if (temp.is_normal_token_owner(leaving_node_id)) { + temp.remove_endpoint(leaving_node_id); } std::unordered_map range_sources; dht::token_range_vector ranges_for_removenode; @@ -1738,7 +1739,7 @@ future<> repair_service::do_decommission_removenode_with_repair(locator::token_m ops->check_abort(); } auto end_token = r.end() ? r.end()->value() : dht::maximum_token(); - const auto new_eps = strat.calculate_natural_endpoints(end_token, temp).get0(); + const auto new_eps = strat.calculate_natural_ips(end_token, temp).get0(); const auto& current_eps = current_replica_endpoints[r]; std::unordered_set neighbors_set = new_eps.get_set(); bool skip_this_range = false; @@ -1889,6 +1890,7 @@ future<> repair_service::do_rebuild_replace_with_repair(locator::token_metadata_ auto& db = get_db().local(); auto ks_erms = db.get_non_local_strategy_keyspaces_erms(); auto myip = tmptr->get_topology().my_address(); + auto myid = tmptr->get_my_id(); size_t nr_ranges_total = 0; for (const auto& [keyspace_name, erm] : ks_erms) { if (!db.has_keyspace(keyspace_name)) { @@ -1896,7 +1898,7 @@ future<> repair_service::do_rebuild_replace_with_repair(locator::token_metadata_ } auto& strat = erm->get_replication_strategy(); // Okay to yield since tm is immutable - dht::token_range_vector ranges = strat.get_ranges(myip, tmptr).get0(); + dht::token_range_vector ranges = 
strat.get_ranges(myid, tmptr).get0(); auto nr_tables = get_nr_tables(db, keyspace_name); nr_ranges_total += ranges.size() * nr_tables; @@ -1920,7 +1922,7 @@ future<> repair_service::do_rebuild_replace_with_repair(locator::token_metadata_ continue; } auto& strat = erm->get_replication_strategy(); - dht::token_range_vector ranges = strat.get_ranges(myip, tmptr).get0(); + dht::token_range_vector ranges = strat.get_ranges(myid, *tmptr).get0(); auto& topology = erm->get_token_metadata().get_topology(); std::unordered_map range_sources; auto nr_tables = get_nr_tables(db, keyspace_name); @@ -1929,7 +1931,7 @@ future<> repair_service::do_rebuild_replace_with_repair(locator::token_metadata_ auto& r = *it; seastar::thread::maybe_yield(); auto end_token = r.end() ? r.end()->value() : dht::maximum_token(); - auto neighbors = boost::copy_range>(strat.calculate_natural_endpoints(end_token, *tmptr).get0() | + auto neighbors = boost::copy_range>(strat.calculate_natural_ips(end_token, *tmptr).get0() | boost::adaptors::filtered([myip, &source_dc, &topology, &ignore_nodes] (const gms::inet_address& node) { if (node == myip) { return false; @@ -1988,14 +1990,13 @@ future<> repair_service::replace_with_repair(locator::token_metadata_ptr tmptr, auto cloned_tm = co_await tmptr->clone_async(); auto op = sstring("replace_with_repair"); auto& topology = tmptr->get_topology(); - auto myip = topology.my_address(); auto myloc = topology.get_location(); auto reason = streaming::stream_reason::replace; // update a cloned version of tmptr // no need to set the original version auto cloned_tmptr = make_token_metadata_ptr(std::move(cloned_tm)); - cloned_tmptr->update_topology(myip, myloc, locator::node::state::replacing); - co_await cloned_tmptr->update_normal_tokens(replacing_tokens, myip); + cloned_tmptr->update_topology(tmptr->get_my_id(), myloc, locator::node::state::replacing); + co_await cloned_tmptr->update_normal_tokens(replacing_tokens, tmptr->get_my_id()); co_return co_await 
do_rebuild_replace_with_repair(std::move(cloned_tmptr), std::move(op), myloc.dc, reason, std::move(ignore_nodes)); } diff --git a/repair/row_level.cc b/repair/row_level.cc index 8ce475bde7..63615e6b40 100644 --- a/repair/row_level.cc +++ b/repair/row_level.cc @@ -679,7 +679,7 @@ void flush_rows(schema_ptr s, std::list& rows, lw_shared_ptrdk; if (do_small_table_optimization) { // Check if the token is owned by the node - auto eps = strat->calculate_natural_endpoints(dk.token(), *tm).get0(); + auto eps = strat->calculate_natural_ips(dk.token(), *tm).get0(); if (!eps.contains(myip)) { rlogger.trace("master: ignore row, token={}", dk.token()); continue; @@ -1900,12 +1900,12 @@ public: } if (small_table_optimization) { auto& strat = erm.get_replication_strategy(); - auto& tm = erm.get_token_metadata(); + const auto& tm = erm.get_token_metadata(); std::list tmp; for (auto& row : row_diff) { repair_row r = std::move(row); const auto& dk = r.get_dk_with_hash()->dk; - auto eps = co_await strat.calculate_natural_endpoints(dk.token(), tm); + auto eps = co_await strat.calculate_natural_ips(dk.token(), tm); if (eps.contains(remote_node)) { tmp.push_back(std::move(r)); } else { diff --git a/service/migration_manager.cc b/service/migration_manager.cc index 4690194ea8..f906c6bfaa 100644 --- a/service/migration_manager.cc +++ b/service/migration_manager.cc @@ -1272,7 +1272,8 @@ future<> migration_manager::on_change(gms::inet_address endpoint, gms::applicati mlogger.debug("Ignoring state change for dead or unknown endpoint: {}", endpoint); return make_ready_future(); } - if (_storage_proxy.get_token_metadata_ptr()->is_normal_token_owner(endpoint)) { + const auto host_id = _gossiper.get_host_id(endpoint); + if (_storage_proxy.get_token_metadata_ptr()->is_normal_token_owner(host_id)) { schedule_schema_pull(endpoint, *ep_state); } } diff --git a/service/storage_proxy.cc b/service/storage_proxy.cc index 749ed3a23d..ee038d3b3d 100644 --- a/service/storage_proxy.cc +++ 
b/service/storage_proxy.cc @@ -2291,7 +2291,7 @@ replica_ids_to_endpoints(const locator::token_metadata& tm, const std::vector set_gossip_tokens(gms::gossiper& g, }); } +static std::unordered_map get_token_to_endpoint(const locator::token_metadata& tm) { + const auto& map = tm.get_token_to_endpoint(); + std::unordered_map result; + result.reserve(map.size()); + for (const auto [t, id]: map) { + result.insert({t, tm.get_endpoint_for_host_id(id)}); + } + return result; +} + /* * The helper waits for two things * 1) for schema agreement @@ -401,7 +411,7 @@ future<> storage_service::topology_state_load() { tmptr->set_version(_topology_state_machine._topology.version); auto update_topology = [&] (locator::host_id id, inet_address ip, const replica_state& rs) { - tmptr->update_topology(ip, locator::endpoint_dc_rack{rs.datacenter, rs.rack}, + tmptr->update_topology(id, locator::endpoint_dc_rack{rs.datacenter, rs.rack}, to_topology_node_state(rs.state), rs.shard_count); tmptr->update_host_id(id, ip); }; @@ -431,14 +441,14 @@ future<> storage_service::topology_state_load() { co_await _gossiper.add_local_application_state({{ gms::application_state::STATUS, gms::versioned_value::normal(rs.ring.value().tokens) }}); } update_topology(host_id, ip, rs); - co_await tmptr->update_normal_tokens(rs.ring.value().tokens, ip); + co_await tmptr->update_normal_tokens(rs.ring.value().tokens, host_id); }; for (const auto& [id, rs]: _topology_state_machine._topology.normal_nodes) { co_await add_normal_node(id, rs); } - tmptr->set_read_new(std::invoke([](std::optional state) { + const auto read_new = std::invoke([](std::optional state) { using read_new_t = locator::token_metadata::read_new_t; if (!state.has_value()) { return read_new_t::no; @@ -457,7 +467,8 @@ future<> storage_service::topology_state_load() { case topology::transition_state::write_both_read_new: return read_new_t::yes; } - }, _topology_state_machine._topology.tstate)); + }, _topology_state_machine._topology.tstate); + 
tmptr->set_read_new(read_new); for (const auto& [id, rs]: _topology_state_machine._topology.transition_nodes) { locator::host_id host_id{id.uuid()}; @@ -483,9 +494,9 @@ future<> storage_service::topology_state_load() { // so we can perform writes to regular 'distributed' tables during the bootstrap procedure // (such as the CDC generation write). // It doesn't break anything to set the tokens to normal early in this single-node case. - co_await tmptr->update_normal_tokens(rs.ring.value().tokens, ip); + co_await tmptr->update_normal_tokens(rs.ring.value().tokens, host_id); } else { - tmptr->add_bootstrap_tokens(rs.ring.value().tokens, ip); + tmptr->add_bootstrap_tokens(rs.ring.value().tokens, host_id); co_await update_topology_change_info(tmptr, ::format("bootstrapping node {}/{}", id, ip)); } } @@ -493,8 +504,8 @@ future<> storage_service::topology_state_load() { case node_state::decommissioning: case node_state::removing: update_topology(host_id, ip, rs); - co_await tmptr->update_normal_tokens(rs.ring.value().tokens, ip); - tmptr->add_leaving_endpoint(ip); + co_await tmptr->update_normal_tokens(rs.ring.value().tokens, host_id); + tmptr->add_leaving_endpoint(host_id); co_await update_topology_change_info(tmptr, ::format("{} {}/{}", rs.state, id, ip)); break; case node_state::replacing: { @@ -507,11 +518,10 @@ future<> storage_service::topology_state_load() { on_fatal_internal_error(slogger, ::format("Cannot map id of a node being replaced {} to its ip", replaced_id)); } assert(existing_ip); - // FIXME: Topology cannot hold two IPs with different host ids yet so - // when replacing we must advertise the replaced_id for the ip, otherwise - // topology will complain about host id of a local node changing and fail. - update_topology(ip == existing_ip ? 
locator::host_id(replaced_id.uuid()) : host_id, ip, rs); - tmptr->add_replacing_endpoint(*existing_ip, ip); + const auto replaced_host_id = locator::host_id(replaced_id.uuid()); + tmptr->update_topology(replaced_host_id, std::nullopt, locator::node::state::being_replaced); + update_topology(host_id, ip, rs); + tmptr->add_replacing_endpoint(replaced_host_id, host_id); co_await update_topology_change_info(tmptr, ::format("replacing {}/{} by {}/{}", replaced_id, *existing_ip, id, ip)); } } @@ -545,9 +555,11 @@ future<> storage_service::topology_state_load() { // of the cluster state. To work correctly, the gossiper needs to know the current // endpoints. We cannot rely on seeds alone, since it is not guaranteed that seeds // will be up to date and reachable at the time of restart. - for (const auto& e: get_token_metadata_ptr()->get_all_endpoints()) { - if (!is_me(e) && !_gossiper.get_endpoint_state_ptr(e)) { - co_await _gossiper.add_saved_endpoint(e); + const auto tmptr = get_token_metadata_ptr(); + for (const auto& e: tmptr->get_all_endpoints()) { + const auto ep = tmptr->get_endpoint_for_host_id(e); + if (!is_me(e) && !_gossiper.get_endpoint_state_ptr(ep)) { + co_await _gossiper.add_saved_endpoint(ep); } } @@ -1210,18 +1222,11 @@ class topology_coordinator { " can't find endpoint for token {}", end)); } - auto id = tmptr->get_host_id_if_known(*ep); - if (!id) { - on_internal_error(slogger, ::format( - "raft topology: make_new_cdc_generation_data: get_sharding_info:" - " can't find host ID for endpoint {}, owner of token {}", *ep, end)); - } - - auto ptr = _topo_sm._topology.find(raft::server_id{id->uuid()}); + auto ptr = _topo_sm._topology.find(raft::server_id{ep->uuid()}); if (!ptr) { on_internal_error(slogger, ::format( "raft topology: make_new_cdc_generation_data: get_sharding_info:" - " couldn't find node {} in topology, owner of token {}", *id, end)); + " couldn't find node {} in topology, owner of token {}", *ep, end)); } auto& rs = ptr->second; @@ -3047,8 
+3052,11 @@ future<> storage_service::join_token_ring(shardedupdate_topology(*replace_address, std::move(ri->dc_rack), locator::node::state::being_replaced); - co_await tmptr->update_normal_tokens(bootstrap_tokens, *replace_address); + tmptr->update_topology(tmptr->get_my_id(), std::nullopt, locator::node::state::replacing); + tmptr->update_topology(ri->host_id, std::move(ri->dc_rack), locator::node::state::being_replaced); + co_await tmptr->update_normal_tokens(bootstrap_tokens, ri->host_id); + tmptr->update_host_id(ri->host_id, *replace_address); + replaced_host_id = ri->host_id; } } else if (should_bootstrap()) { @@ -3088,8 +3096,8 @@ future<> storage_service::join_token_ring(shardedupdate_topology(get_broadcast_address(), _snitch.local()->get_location(), locator::node::state::normal); - co_await tmptr->update_normal_tokens(my_tokens, get_broadcast_address()); + tmptr->update_topology(tmptr->get_my_id(), _snitch.local()->get_location(), locator::node::state::normal); + co_await tmptr->update_normal_tokens(my_tokens, tmptr->get_my_id()); cdc_gen_id = co_await _sys_ks.local().get_cdc_generation_id(); if (!cdc_gen_id) { @@ -3343,7 +3351,7 @@ future<> storage_service::join_token_ring(shardedis_normal_token_owner(get_broadcast_address())) { + if (tmptr->is_normal_token_owner(tmptr->get_my_id())) { throw std::runtime_error("This node is already a member of the token ring; bootstrap aborted. 
(If replacing a dead node, remove the old one from the ring first.)"); } slogger.info("getting bootstrap token"); @@ -3369,7 +3377,7 @@ future<> storage_service::join_token_ring(shardedget_endpoint(token); if (existing) { - auto eps = _gossiper.get_endpoint_state_ptr(*existing); + auto eps = _gossiper.get_endpoint_state_ptr(tmptr->get_endpoint_for_host_id(*existing)); if (eps && eps->get_update_timestamp() > gms::gossiper::clk::now() - delay) { throw std::runtime_error("Cannot replace a live node..."); } @@ -3406,12 +3414,12 @@ future<> storage_service::join_token_ring(sharded future<> { // This node must know about its chosen tokens before other nodes do // since they may start sending writes to this node after it gossips status = NORMAL. // Therefore, in case we haven't updated _token_metadata with our tokens yet, do it now. - tmptr->update_topology(get_broadcast_address(), _snitch.local()->get_location(), locator::node::state::normal); - return tmptr->update_normal_tokens(bootstrap_tokens, get_broadcast_address()); + tmptr->update_topology(tmptr->get_my_id(), _snitch.local()->get_location(), locator::node::state::normal); + co_await tmptr->update_normal_tokens(bootstrap_tokens, tmptr->get_my_id()); }); if (!_sys_ks.local().bootstrap_complete()) { @@ -3549,8 +3557,8 @@ future<> storage_service::bootstrap(std::unordered_set& bootstrap_tokens, slogger.debug("bootstrap: update pending ranges: endpoint={} bootstrap_tokens={}", get_broadcast_address(), bootstrap_tokens); mutate_token_metadata([this, &bootstrap_tokens] (mutable_token_metadata_ptr tmptr) { auto endpoint = get_broadcast_address(); - tmptr->update_topology(endpoint, _snitch.local()->get_location(), locator::node::state::bootstrapping); - tmptr->add_bootstrap_tokens(bootstrap_tokens, endpoint); + tmptr->update_topology(tmptr->get_my_id(), _snitch.local()->get_location(), locator::node::state::bootstrapping); + tmptr->add_bootstrap_tokens(bootstrap_tokens, tmptr->get_my_id()); return 
update_topology_change_info(std::move(tmptr), ::format("bootstrapping node {}", endpoint)); }).get(); } @@ -3572,7 +3580,7 @@ future<> storage_service::bootstrap(std::unordered_set& bootstrap_tokens, slogger.info("sleeping {} ms for pending range setup", get_ring_delay().count()); _gossiper.wait_for_range_setup().get(); - dht::boot_strapper bs(_db, _stream_manager, _abort_source, get_broadcast_address(), _snitch.local()->get_location(), bootstrap_tokens, get_token_metadata_ptr()); + dht::boot_strapper bs(_db, _stream_manager, _abort_source, get_token_metadata_ptr()->get_my_id(), _snitch.local()->get_location(), bootstrap_tokens, get_token_metadata_ptr()); slogger.info("Starting to bootstrap..."); bs.bootstrap(streaming::stream_reason::bootstrap, _gossiper, null_topology_guard).get(); } else { @@ -3642,21 +3650,22 @@ future<> storage_service::handle_state_bootstrap(inet_address endpoint, gms::per // continue. auto tmlock = co_await get_token_metadata_lock(); auto tmptr = co_await get_mutable_token_metadata_ptr(); - if (tmptr->is_normal_token_owner(endpoint)) { + const auto host_id = _gossiper.get_host_id(endpoint); + if (tmptr->is_normal_token_owner(host_id)) { // If isLeaving is false, we have missed both LEAVING and LEFT. However, if // isLeaving is true, we have only missed LEFT. Waiting time between completing // leave operation and rebootstrapping is relatively short, so the latter is quite // common (not enough time for gossip to spread). Therefore we report only the // former in the log. 
- if (!tmptr->is_leaving(endpoint)) { - slogger.info("Node {} state jump to bootstrap", endpoint); + if (!tmptr->is_leaving(host_id)) { + slogger.info("Node {} state jump to bootstrap", host_id); } - tmptr->remove_endpoint(endpoint); + tmptr->remove_endpoint(host_id); } + tmptr->update_topology(host_id, get_dc_rack_for(endpoint), locator::node::state::bootstrapping); + tmptr->add_bootstrap_tokens(tokens, host_id); + tmptr->update_host_id(host_id, endpoint); - tmptr->update_topology(endpoint, get_dc_rack_for(endpoint), locator::node::state::bootstrapping); - tmptr->add_bootstrap_tokens(tokens, endpoint); - tmptr->update_host_id(_gossiper.get_host_id(endpoint), endpoint); co_await update_topology_change_info(tmptr, ::format("handle_state_bootstrap {}", endpoint)); co_await replicate_to_all_cores(std::move(tmptr)); } @@ -3675,35 +3684,85 @@ future<> storage_service::handle_state_normal(inet_address endpoint, gms::permit auto tmlock = std::make_unique(co_await get_token_metadata_lock()); auto tmptr = co_await get_mutable_token_metadata_ptr(); - if (tmptr->is_normal_token_owner(endpoint)) { - slogger.info("Node {} state jump to normal", endpoint); - } std::unordered_set endpoints_to_remove; auto do_remove_node = [&] (gms::inet_address node) { - tmptr->remove_endpoint(node); + // this lambda is called in three cases: + // 1. old endpoint for the given host_id is ours, we remove the new endpoint; + // 2. new endpoint for the given host_id has bigger generation, we remove the old endpoint; + // 3. old endpoint for the given host_id has bigger generation, we remove the new endpoint. + // In all of these cases host_id is retained, only the IP addresses are changed. + // We don't need to call remove_endpoint on tmptr, since it will be called + // indirectly through the chain endpoints_to_remove->storage_service::remove_endpoint -> + // _gossiper.remove_endpoint -> storage_service::on_remove. 
+ endpoints_to_remove.insert(node); }; // Order Matters, TM.updateHostID() should be called before TM.updateNormalToken(), (see CASSANDRA-4300). auto host_id = _gossiper.get_host_id(endpoint); - auto existing = tmptr->get_endpoint_for_host_id(host_id); + if (tmptr->is_normal_token_owner(host_id)) { + slogger.info("Node {}/{} state jump to normal", endpoint, host_id); + } + auto existing = tmptr->get_endpoint_for_host_id_if_known(host_id); + + // Old node in replace-with-same-IP scenario. + std::optional replaced_id; + if (existing && *existing != endpoint) { + // This branch in taken when a node changes its IP address. + if (*existing == get_broadcast_address()) { slogger.warn("Not updating host ID {} for {} because it's mine", host_id, endpoint); do_remove_node(endpoint); } else if (_gossiper.compare_endpoint_startup(endpoint, *existing) > 0) { + // The new IP has greater generation than the existing one. + // Here we remap the host_id to the new IP. The 'owned_tokens' calculation logic below + // won't detect any changes - the branch 'endpoint == current_owner' will be taken. + // We still need to call 'remove_endpoint' for existing IP to remove it from system.peers. + slogger.warn("Host ID collision for {} between {} and {}; {} is the new owner", host_id, *existing, endpoint, endpoint); do_remove_node(*existing); slogger.info("Set host_id={} to be owned by node={}, existing={}", host_id, endpoint, *existing); tmptr->update_host_id(host_id, endpoint); } else { + // The new IP has smaller generation than the existing one, + // we are going to remove it, so we add it to the endpoints_to_remove. + // How does this relate to the tokens this endpoint may have? + // There is a condition below which checks that if endpoints_to_remove + // contains 'endpoint', then the owned_tokens must be empty, otherwise internal_error + // is triggered. This means the following is expected to be true: + // 1. 
each token from the tokens variable (which is read from gossiper) must have an owner node + // 2. this owner must be different from 'endpoint' + // 3. its generation must be greater than endpoint's + slogger.warn("Host ID collision for {} between {} and {}; ignored {}", host_id, *existing, endpoint, endpoint); do_remove_node(endpoint); } } else if (existing && *existing == endpoint) { - tmptr->del_replacing_endpoint(endpoint); + // This branch is taken for all gossiper-managed topology operations. + // For example, if this node is a member of the cluster and a new node is added, + // handle_state_normal is called on this node as the final step + // in the endpoint bootstrap process. + // This method is also called for both replace scenarios - with either the same or with a different IP. + // If the new node has a different IP, the old IP is removed by the block of + // logic below - we detach the old IP from token ring, + // it gets added to candidates_for_removal, then storage_service::remove_endpoint -> + // _gossiper.remove_endpoint -> storage_service::on_remove -> remove from token_metadata. + // If the new node has the same IP, we need to explicitly remove old host_id from + // token_metadata, since no IPs will be removed in this case. + // We do this after update_normal_tokens, allowing for tokens to be properly + // migrated to the new host_id. + + if (const auto old_host_id = tmptr->get_host_id_if_known(endpoint); old_host_id && *old_host_id != host_id) { + replaced_id = *old_host_id; + } } else { - tmptr->del_replacing_endpoint(endpoint); + // This branch is taken if this node wasn't involved in node_ops + // workflow (storage_service::node_ops_cmd_handler wasn't called on it) and it just + // receives the current state of the cluster from the gossiper. + // For example, a new node receives this notification for every + // existing node in the cluster. 
+ auto nodes = _gossiper.get_nodes_with_host_id(host_id); bool left = std::any_of(nodes.begin(), nodes.end(), [this] (const gms::inet_address& node) { return _gossiper.is_left(node); }); if (left) { @@ -3723,9 +3782,19 @@ future<> storage_service::handle_state_normal(inet_address endpoint, gms::permit // token_to_endpoint_map is used to track the current token owners for the purpose of removing replaced endpoints. // when any token is replaced by a new owner, we track the existing owner in `candidates_for_removal` // and eventually, if any candidate for removal ends up owning no tokens, it is removed from token_metadata. - std::unordered_map token_to_endpoint_map = get_token_metadata().get_token_to_endpoint(); + std::unordered_map token_to_endpoint_map = get_token_to_endpoint(get_token_metadata()); std::unordered_set candidates_for_removal; + // Here we convert endpoint tokens from gossiper to owned_tokens, which will be assigned as a new + // normal tokens to the token_metadata. + // This transformation accounts for situations where some tokens + // belong to outdated nodes - the ones with smaller generation. + // We use endpoints instead of host_ids here since gossiper operates + // with endpoints and generations are tied to endpoints, not host_ids. + // In replace-with-same-ip scenario we won't be able to distinguish + // between the old and new IP owners, so we assume the old replica + // is down and won't be resurrected. + for (auto t : tokens) { // we don't want to update if this node is responsible for the token and it has a later startup time than endpoint. 
auto current = token_to_endpoint_map.find(t); @@ -3777,7 +3846,7 @@ future<> storage_service::handle_state_normal(inet_address endpoint, gms::permit endpoints_to_remove.insert(ep); } - bool is_normal_token_owner = tmptr->is_normal_token_owner(endpoint); + bool is_normal_token_owner = tmptr->is_normal_token_owner(host_id); bool do_notify_joined = false; if (endpoints_to_remove.contains(endpoint)) [[unlikely]] { @@ -3793,8 +3862,19 @@ future<> storage_service::handle_state_normal(inet_address endpoint, gms::permit do_notify_joined = true; } - tmptr->update_topology(endpoint, get_dc_rack_for(endpoint), locator::node::state::normal); - co_await tmptr->update_normal_tokens(owned_tokens, endpoint); + const auto dc_rack = get_dc_rack_for(endpoint); + tmptr->update_topology(host_id, dc_rack, locator::node::state::normal); + co_await tmptr->update_normal_tokens(owned_tokens, host_id); + if (replaced_id) { + if (tmptr->is_normal_token_owner(*replaced_id)) { + on_internal_error(slogger, ::format("replaced endpoint={}/{} still owns tokens {}", + endpoint, *replaced_id, tmptr->get_tokens(*replaced_id))); + } else { + tmptr->remove_endpoint(*replaced_id); + slogger.info("node {}/{} is removed from token_metadata since it's replaced by {}/{} ", + endpoint, *replaced_id, endpoint, host_id); + } + } } co_await update_topology_change_info(tmptr, ::format("handle_state_normal {}", endpoint)); @@ -3823,7 +3903,7 @@ future<> storage_service::handle_state_normal(inet_address endpoint, gms::permit const auto& tm = get_token_metadata(); auto ver = tm.get_ring_version(); for (auto& x : tm.get_token_to_endpoint()) { - slogger.debug("handle_state_normal: token_metadata.ring_version={}, token={} -> endpoint={}", ver, x.first, x.second); + slogger.debug("handle_state_normal: token_metadata.ring_version={}, token={} -> endpoint={}/{}", ver, x.first, tm.get_endpoint_for_host_id(x.second), x.second); } } _normal_state_handled_on_boot.insert(endpoint); @@ -3841,8 +3921,9 @@ future<> 
storage_service::handle_state_left(inet_address endpoint, std::vector storage_service::handle_state_left(inet_address endpoint, std::vector(tokens_from_tm.begin(), tokens_from_tm.end()); } co_await excise(tokens, endpoint, extract_expire_time(pieces), pid); @@ -3870,9 +3951,10 @@ future<> storage_service::handle_state_removed(inet_address endpoint, std::vecto } co_return; } - if (get_token_metadata().is_normal_token_owner(endpoint)) { + const auto host_id = _gossiper.get_host_id(endpoint); + if (get_token_metadata().is_normal_token_owner(host_id)) { auto state = pieces[0]; - auto remove_tokens = get_token_metadata().get_tokens(endpoint); + auto remove_tokens = get_token_metadata().get_tokens(host_id); std::unordered_set tmp(remove_tokens.begin(), remove_tokens.end()); co_await excise(std::move(tmp), endpoint, extract_expire_time(pieces), pid); } else { // now that the gossiper has told us about this nonexistent member, notify the gossiper to remove it @@ -3889,14 +3971,19 @@ future<> storage_service::on_join(gms::inet_address endpoint, gms::endpoint_stat } future<> storage_service::on_alive(gms::inet_address endpoint, gms::endpoint_state_ptr state, gms::permit_id pid) { - slogger.debug("endpoint={} on_alive: permit_id={}", endpoint, pid); - bool is_normal_token_owner = get_token_metadata().is_normal_token_owner(endpoint); + const auto& tm = get_token_metadata(); + const auto tm_host_id_opt = tm.get_host_id_if_known(endpoint); + slogger.debug("endpoint={}/{} on_alive: permit_id={}", endpoint, tm_host_id_opt, pid); + bool is_normal_token_owner = tm_host_id_opt && tm.is_normal_token_owner(*tm_host_id_opt); if (is_normal_token_owner) { co_await notify_up(endpoint); } else { auto tmlock = co_await get_token_metadata_lock(); auto tmptr = co_await get_mutable_token_metadata_ptr(); - tmptr->update_topology(endpoint, get_dc_rack_for(endpoint)); + const auto dc_rack = get_dc_rack_for(endpoint); + const auto host_id = _gossiper.get_host_id(endpoint); + 
tmptr->update_host_id(host_id, endpoint); + tmptr->update_topology(host_id, dc_rack); co_await replicate_to_all_cores(std::move(tmptr)); } } @@ -3934,8 +4021,17 @@ future<> storage_service::on_change(inet_address endpoint, application_state sta slogger.debug("Ignoring state change for dead or unknown endpoint: {}", endpoint); co_return; } - if (get_token_metadata().is_normal_token_owner(endpoint)) { - slogger.debug("endpoint={} on_change: updating system.peers table", endpoint); + const auto host_id = _gossiper.get_host_id(endpoint); + const auto& tm = get_token_metadata(); + const auto ep = tm.get_endpoint_for_host_id_if_known(host_id); + // The check *ep == endpoint is needed when a node changes + // its IP - on_change can be called by the gossiper for old IP as part + // of its removal, after handle_state_normal has already been called for + // the new one. Without the check, the do_update_system_peers_table call + // overwrites the IP back to its old value. + // In essence, the code under the 'if' should fire if the given IP is a normal_token_owner. + if (ep && *ep == endpoint && tm.is_normal_token_owner(host_id)) { + slogger.debug("endpoint={}/{} on_change: updating system.peers table", endpoint, host_id); co_await do_update_system_peers_table(endpoint, state, value); if (state == application_state::RPC_READY) { slogger.debug("Got application_state::RPC_READY for node {}, is_cql_ready={}", endpoint, ep_state->is_cql_ready()); @@ -3966,7 +4062,13 @@ future<> storage_service::on_remove(gms::inet_address endpoint, gms::permit_id p slogger.debug("endpoint={} on_remove: permit_id={}", endpoint, pid); auto tmlock = co_await get_token_metadata_lock(); auto tmptr = co_await get_mutable_token_metadata_ptr(); - tmptr->remove_endpoint(endpoint); + // We should handle the case when we aren't able to find endpoint -> ip mapping in token_metadata. + // This could happen e.g. 
when the new endpoint has bigger generation in handle_state_normal - the code + // in handle_state_normal will remap host_id to the new IP and we won't find + // old IP here. We should just skip the remove in that case. + if (const auto host_id = tmptr->get_host_id_if_known(endpoint); host_id) { + tmptr->remove_endpoint(*host_id); + } co_await update_topology_change_info(tmptr, ::format("on_remove {}", endpoint)); co_await replicate_to_all_cores(std::move(tmptr)); } @@ -4163,11 +4265,14 @@ future<> storage_service::join_cluster(sharded& // entry has been mistakenly added, delete it co_await _sys_ks.local().remove_endpoint(ep); } else { - tmptr->update_topology(ep, get_dc_rack(ep), locator::node::state::normal); - co_await tmptr->update_normal_tokens(tokens, ep); - if (loaded_host_ids.contains(ep)) { - tmptr->update_host_id(loaded_host_ids.at(ep), ep); + const auto dc_rack = get_dc_rack(ep); + const auto hostIdIt = loaded_host_ids.find(ep); + if (hostIdIt == loaded_host_ids.end()) { + on_internal_error(slogger, format("can't find host_id for ep {}", ep)); } + tmptr->update_topology(hostIdIt->second, dc_rack, locator::node::state::normal); + co_await tmptr->update_normal_tokens(tokens, hostIdIt->second); + tmptr->update_host_id(hostIdIt->second, ep); loaded_endpoints.insert(ep); co_await _gossiper.add_saved_endpoint(ep); } @@ -4256,7 +4361,7 @@ future<> storage_service::replicate_to_all_cores(mutable_token_metadata_ptr tmpt continue; } auto tmptr = pending_token_metadata_ptr[this_shard_id()]; - auto erm = co_await ss.get_erm_factory().create_effective_replication_map(rs, std::move(tmptr)); + auto erm = co_await ss.get_erm_factory().create_effective_replication_map(rs, tmptr); pending_effective_replication_maps[this_shard_id()].emplace(ks_name, std::move(erm)); } }); @@ -4493,9 +4598,9 @@ future> storage_service::get_ownership() { // describeOwnership returns tokens in an unspecified order, let's re-order them std::map ownership; for (auto entry : token_map) { - 
gms::inet_address endpoint = tm.get_endpoint(entry.first).value(); + locator::host_id id = tm.get_endpoint(entry.first).value(); auto token_ownership = entry.second; - ownership[endpoint] += token_ownership; + ownership[tm.get_endpoint_for_host_id(id)] += token_ownership; } return ownership; }); @@ -4785,7 +4890,7 @@ future<> storage_service::decommission() { uuid = ctl.uuid(); auto endpoint = ctl.endpoint; const auto& tmptr = ctl.tmptr; - if (!tmptr->is_normal_token_owner(endpoint)) { + if (!tmptr->is_normal_token_owner(ctl.host_id)) { throw std::runtime_error("local node is not a member of the token ring yet"); } // We assume that we're a member of group 0 if we're in decommission()` and Raft is enabled. @@ -5024,7 +5129,7 @@ void storage_service::run_replace_ops(std::unordered_set& bootstrap_token _repair.local().replace_with_repair(get_token_metadata_ptr(), bootstrap_tokens, ctl.ignore_nodes).get(); } else { slogger.info("replace[{}]: Using streaming based node ops to sync data", uuid); - dht::boot_strapper bs(_db, _stream_manager, _abort_source, get_broadcast_address(), _snitch.local()->get_location(), bootstrap_tokens, get_token_metadata_ptr()); + dht::boot_strapper bs(_db, _stream_manager, _abort_source, get_token_metadata_ptr()->get_my_id(), _snitch.local()->get_location(), bootstrap_tokens, get_token_metadata_ptr()); bs.bootstrap(streaming::stream_reason::replace, _gossiper, null_topology_guard, replace_address).get(); } on_streaming_finished(); @@ -5136,7 +5241,7 @@ future<> storage_service::removenode(locator::host_id host_id, std::listget_endpoint_for_host_id(host_id); + auto endpoint_opt = tmptr->get_endpoint_for_host_id_if_known(host_id); assert(ss._group0); auto raft_id = raft::server_id{host_id.uuid()}; bool raft_available = ss._group0->wait_for_raft().get(); @@ -5195,7 +5300,7 @@ future<> storage_service::removenode(locator::host_id host_id, std::listget_tokens(endpoint); + auto tokens = tmptr->get_tokens(host_id); try { // Step 3: Start heartbeat 
updater @@ -5346,8 +5451,8 @@ void storage_service::node_ops_insert(node_ops_id ops_uuid, on_node_ops_registered(ops_uuid); } -future storage_service::node_ops_cmd_handler(gms::inet_address coordinator, node_ops_cmd_request req) { - return seastar::async([this, coordinator, req = std::move(req)] () mutable { +future storage_service::node_ops_cmd_handler(gms::inet_address coordinator, std::optional coordinator_host_id, node_ops_cmd_request req) { + return seastar::async([this, coordinator, coordinator_host_id, req = std::move(req)] () mutable { auto ops_uuid = req.ops_uuid; auto topo_guard = null_topology_guard; slogger.debug("node_ops_cmd_handler cmd={}, ops_uuid={}", req.cmd, ops_uuid); @@ -5389,7 +5494,7 @@ future storage_service::node_ops_cmd_handler(gms::inet_ad mutate_token_metadata([coordinator, &req, this] (mutable_token_metadata_ptr tmptr) mutable { for (auto& node : req.leaving_nodes) { slogger.info("removenode[{}]: Added node={} as leaving node, coordinator={}", req.ops_uuid, node, coordinator); - tmptr->add_leaving_endpoint(node); + tmptr->add_leaving_endpoint(tmptr->get_host_id(node)); } return update_topology_change_info(tmptr, ::format("removenode {}", req.leaving_nodes)); }).get(); @@ -5397,7 +5502,7 @@ future storage_service::node_ops_cmd_handler(gms::inet_ad return mutate_token_metadata([this, coordinator, req = std::move(req)] (mutable_token_metadata_ptr tmptr) mutable { for (auto& node : req.leaving_nodes) { slogger.info("removenode[{}]: Removed node={} as leaving node, coordinator={}", req.ops_uuid, node, coordinator); - tmptr->del_leaving_endpoint(node); + tmptr->del_leaving_endpoint(tmptr->get_host_id(node)); } return update_topology_change_info(tmptr, ::format("removenode {}", req.leaving_nodes)); }); @@ -5437,7 +5542,7 @@ future storage_service::node_ops_cmd_handler(gms::inet_ad mutate_token_metadata([coordinator, &req, this] (mutable_token_metadata_ptr tmptr) mutable { for (auto& node : req.leaving_nodes) { slogger.info("decommission[{}]: 
Added node={} as leaving node, coordinator={}", req.ops_uuid, node, coordinator); - tmptr->add_leaving_endpoint(node); + tmptr->add_leaving_endpoint(tmptr->get_host_id(node)); } return update_topology_change_info(tmptr, ::format("decommission {}", req.leaving_nodes)); }).get(); @@ -5445,7 +5550,7 @@ future storage_service::node_ops_cmd_handler(gms::inet_ad return mutate_token_metadata([this, coordinator, req = std::move(req)] (mutable_token_metadata_ptr tmptr) mutable { for (auto& node : req.leaving_nodes) { slogger.info("decommission[{}]: Removed node={} as leaving node, coordinator={}", req.ops_uuid, node, coordinator); - tmptr->del_leaving_endpoint(node); + tmptr->del_leaving_endpoint(tmptr->get_host_id(node)); } return update_topology_change_info(tmptr, ::format("decommission {}", req.leaving_nodes)); }); @@ -5461,13 +5566,14 @@ future storage_service::node_ops_cmd_handler(gms::inet_ad check_again = false; for (auto& node : req.leaving_nodes) { auto tmptr = get_token_metadata_ptr(); - if (tmptr->is_normal_token_owner(node)) { + const auto host_id = tmptr->get_host_id_if_known(node); + if (host_id && tmptr->is_normal_token_owner(*host_id)) { check_again = true; if (std::chrono::steady_clock::now() > start_time + std::chrono::seconds(60)) { - auto msg = ::format("decommission[{}]: Node {} is still in the cluster", req.ops_uuid, node); + auto msg = ::format("decommission[{}]: Node {}/{} is still in the cluster", req.ops_uuid, node, host_id); throw std::runtime_error(msg); } - slogger.warn("decommission[{}]: Node {} is still in the cluster, sleep and check again", req.ops_uuid, node); + slogger.warn("decommission[{}]: Node {}/{} is still in the cluster, sleep and check again", req.ops_uuid, node, host_id); sleep_abortable(std::chrono::milliseconds(500), _abort_source).get(); break; } @@ -5491,23 +5597,48 @@ future storage_service::node_ops_cmd_handler(gms::inet_ad slogger.warn("{}", msg); throw std::runtime_error(msg); } - mutate_token_metadata([coordinator, &req, 
this] (mutable_token_metadata_ptr tmptr) mutable { + if (!coordinator_host_id) { + throw std::runtime_error("Coordinator host_id not found"); + } + mutate_token_metadata([coordinator, coordinator_host_id, &req, this] (mutable_token_metadata_ptr tmptr) mutable { for (auto& x: req.replace_nodes) { auto existing_node = x.first; auto replacing_node = x.second; - slogger.info("replace[{}]: Added replacing_node={} to replace existing_node={}, coordinator={}", req.ops_uuid, replacing_node, existing_node, coordinator); - tmptr->update_topology(replacing_node, get_dc_rack_for(replacing_node), locator::node::state::replacing); - tmptr->add_replacing_endpoint(existing_node, replacing_node); + const auto existing_node_id = tmptr->get_host_id(existing_node); + const auto replacing_node_id = *coordinator_host_id; + slogger.info("replace[{}]: Added replacing_node={}/{} to replace existing_node={}/{}, coordinator={}/{}", + req.ops_uuid, replacing_node, replacing_node_id, existing_node, existing_node_id, coordinator, *coordinator_host_id); + + // In case of replace-with-same-ip we need to map both host_id-s + // to the same IP. The locator::topology allows this specifically in case + // where one node is being_replaced and another is replacing, + // so here we adjust the state of the original node accordingly. + // The host_id -> IP map works as usual, and IP -> host_id will map + // IP to the being_replaced node - this is what is implied by the + // current code. The IP will be placed in pending_endpoints and + // excluded from normal_endpoints (maybe_remove_node_being_replaced function). + // In handle_state_normal we'll remap the IP to the new host_id. 
+ tmptr->update_topology(existing_node_id, std::nullopt, locator::node::state::being_replaced); + tmptr->update_topology(replacing_node_id, get_dc_rack_for(replacing_node), locator::node::state::replacing); + tmptr->update_host_id(replacing_node_id, replacing_node); + tmptr->add_replacing_endpoint(existing_node_id, replacing_node_id); } return make_ready_future<>(); }).get(); - node_ops_insert(ops_uuid, coordinator, std::move(req.ignore_nodes), [this, coordinator, req = std::move(req)] () mutable { - return mutate_token_metadata([this, coordinator, req = std::move(req)] (mutable_token_metadata_ptr tmptr) mutable { + node_ops_insert(ops_uuid, coordinator, std::move(req.ignore_nodes), [this, coordinator, coordinator_host_id, req = std::move(req)] () mutable { + return mutate_token_metadata([this, coordinator, coordinator_host_id, req = std::move(req)] (mutable_token_metadata_ptr tmptr) mutable { for (auto& x: req.replace_nodes) { auto existing_node = x.first; auto replacing_node = x.second; - slogger.info("replace[{}]: Removed replacing_node={} to replace existing_node={}, coordinator={}", req.ops_uuid, replacing_node, existing_node, coordinator); - tmptr->del_replacing_endpoint(existing_node); + const auto existing_node_id = tmptr->get_host_id(existing_node); + const auto replacing_node_id = *coordinator_host_id; + slogger.info("replace[{}]: Removed replacing_node={}/{} to replace existing_node={}/{}, coordinator={}/{}", + req.ops_uuid, replacing_node, replacing_node_id, existing_node, existing_node_id, coordinator, *coordinator_host_id); + + tmptr->del_replacing_endpoint(existing_node_id); + const auto dc_rack = get_dc_rack_for(replacing_node); + tmptr->update_topology(existing_node_id, dc_rack, locator::node::state::normal); + tmptr->remove_endpoint(replacing_node_id); } return update_topology_change_info(tmptr, ::format("replace {}", req.replace_nodes)); }); @@ -5543,13 +5674,20 @@ future storage_service::node_ops_cmd_handler(gms::inet_ad slogger.warn("{}", msg); 
throw std::runtime_error(msg); } - mutate_token_metadata([coordinator, &req, this] (mutable_token_metadata_ptr tmptr) mutable { + if (!coordinator_host_id) { + throw std::runtime_error("Coordinator host_id not found"); + } + mutate_token_metadata([coordinator, coordinator_host_id, &req, this] (mutable_token_metadata_ptr tmptr) mutable { for (auto& x: req.bootstrap_nodes) { auto& endpoint = x.first; auto tokens = std::unordered_set(x.second.begin(), x.second.end()); - slogger.info("bootstrap[{}]: Added node={} as bootstrap, coordinator={}", req.ops_uuid, endpoint, coordinator); - tmptr->update_topology(endpoint, get_dc_rack_for(endpoint), locator::node::state::bootstrapping); - tmptr->add_bootstrap_tokens(tokens, endpoint); + const auto host_id = *coordinator_host_id; + const auto dc_rack = get_dc_rack_for(endpoint); + slogger.info("bootstrap[{}]: Added node={}/{} as bootstrap, coordinator={}/{}", + req.ops_uuid, endpoint, host_id, coordinator, *coordinator_host_id); + tmptr->update_host_id(host_id, endpoint); + tmptr->update_topology(host_id, dc_rack, locator::node::state::bootstrapping); + tmptr->add_bootstrap_tokens(tokens, host_id); } return update_topology_change_info(tmptr, ::format("bootstrap {}", req.bootstrap_nodes)); }).get(); @@ -5722,7 +5860,7 @@ future<> storage_service::rebuild(sstring source_dc) { co_await ss._repair.local().rebuild_with_repair(tmptr, std::move(source_dc)); } else { auto streamer = make_lw_shared(ss._db, ss._stream_manager, tmptr, ss._abort_source, - ss.get_broadcast_address(), ss._snitch.local()->get_location(), "Rebuild", streaming::stream_reason::rebuild, null_topology_guard); + tmptr->get_my_id(), ss._snitch.local()->get_location(), "Rebuild", streaming::stream_reason::rebuild, null_topology_guard); streamer->add_source_filter(std::make_unique(ss._gossiper.get_unreachable_members())); if (source_dc != "") { streamer->add_source_filter(std::make_unique(source_dc)); @@ -5775,8 +5913,8 @@ 
storage_service::get_changed_ranges_for_leaving(locator::vnode_effective_replica // endpoint might or might not be 'leaving'. If it was not leaving (that is, removenode // command was used), it is still present in temp and must be removed. - if (temp.is_normal_token_owner(endpoint)) { - temp.remove_endpoint(endpoint); + if (const auto host_id = temp.get_host_id_if_known(endpoint); host_id && temp.is_normal_token_owner(*host_id)) { + temp.remove_endpoint(*host_id); } std::unordered_multimap changed_ranges; @@ -5789,7 +5927,7 @@ storage_service::get_changed_ranges_for_leaving(locator::vnode_effective_replica const auto& rs = erm->get_replication_strategy(); for (auto& r : ranges) { auto end_token = r.end() ? r.end()->value() : dht::maximum_token(); - auto new_replica_endpoints = co_await rs.calculate_natural_endpoints(end_token, temp); + auto new_replica_endpoints = co_await rs.calculate_natural_ips(end_token, temp); auto rg = current_replica_endpoints.equal_range(r); for (auto it = rg.first; it != rg.second; it++) { @@ -5900,7 +6038,7 @@ future<> storage_service::removenode_with_stream(gms::inet_address leaving_node, as.request_abort(); } }); - auto streamer = make_lw_shared(_db, _stream_manager, tmptr, as, get_broadcast_address(), _snitch.local()->get_location(), "Removenode", streaming::stream_reason::removenode, topo_guard); + auto streamer = make_lw_shared(_db, _stream_manager, tmptr, as, tmptr->get_my_id(), _snitch.local()->get_location(), "Removenode", streaming::stream_reason::removenode, topo_guard); removenode_add_ranges(streamer, leaving_node).get(); try { streamer->stream_async().get(); @@ -5917,7 +6055,9 @@ future<> storage_service::excise(std::unordered_set tokens, inet_address co_await remove_endpoint(endpoint, pid); auto tmlock = std::make_optional(co_await get_token_metadata_lock()); auto tmptr = co_await get_mutable_token_metadata_ptr(); - tmptr->remove_endpoint(endpoint); + if (const auto host_id = tmptr->get_host_id_if_known(endpoint); host_id) { 
+ tmptr->remove_endpoint(*host_id); + } tmptr->remove_bootstrap_tokens(tokens); co_await update_topology_change_info(tmptr, ::format("excise {}", endpoint)); @@ -5937,8 +6077,9 @@ future<> storage_service::leave_ring() { co_await _sys_ks.local().set_bootstrap_state(db::system_keyspace::bootstrap_state::NEEDS_BOOTSTRAP); co_await mutate_token_metadata([this] (mutable_token_metadata_ptr tmptr) { auto endpoint = get_broadcast_address(); - tmptr->remove_endpoint(endpoint); - return update_topology_change_info(std::move(tmptr), ::format("leave_ring {}", endpoint)); + const auto my_id = tmptr->get_my_id(); + tmptr->remove_endpoint(my_id); + return update_topology_change_info(std::move(tmptr), ::format("leave_ring {}/{}", endpoint, my_id)); }); auto expire_time = _gossiper.compute_expire_time().time_since_epoch().count(); @@ -5951,12 +6092,7 @@ future<> storage_service::leave_ring() { future<> storage_service::stream_ranges(std::unordered_map> ranges_to_stream_by_keyspace) { - auto streamer = dht::range_streamer(_db, _stream_manager, get_token_metadata_ptr(), _abort_source, - get_broadcast_address(), - _snitch.local()->get_location(), - "Unbootstrap", - streaming::stream_reason::decommission, - null_topology_guard); + auto streamer = dht::range_streamer(_db, _stream_manager, get_token_metadata_ptr(), _abort_source, get_token_metadata_ptr()->get_my_id(), _snitch.local()->get_location(), "Unbootstrap", streaming::stream_reason::decommission, null_topology_guard); for (auto& entry : ranges_to_stream_by_keyspace) { const auto& keyspace = entry.first; auto& ranges_with_endpoints = entry.second; @@ -6069,7 +6205,15 @@ storage_service::construct_range_to_endpoint_map( std::map storage_service::get_token_to_endpoint_map() { - return get_token_metadata().get_normal_and_bootstrapping_token_to_endpoint_map(); + const auto& tm = get_token_metadata(); + std::map result; + for (const auto [t, id]: tm.get_token_to_endpoint()) { + result.insert({t, tm.get_endpoint_for_host_id(id)}); + } 
+ for (const auto [t, id]: tm.get_bootstrap_tokens()) { + result.insert({t, tm.get_endpoint_for_host_id(id)}); + } + return result; } std::chrono::milliseconds storage_service::get_ring_delay() { @@ -6109,8 +6253,22 @@ future<> storage_service::update_topology_change_info(mutable_token_metadata_ptr assert(this_shard_id() == 0); try { - locator::dc_rack_fn get_dc_rack_from_gossiper([this] (inet_address ep) { return get_dc_rack_for(ep); }); - co_await tmptr->update_topology_change_info(get_dc_rack_from_gossiper); + locator::dc_rack_fn get_dc_rack_by_host_id([this, &tm = *tmptr] (locator::host_id host_id) -> std::optional { + if (_raft_topology_change_enabled) { + const auto server_id = raft::server_id(host_id.uuid()); + const auto* node = _topology_state_machine._topology.find(server_id); + if (node) { + return locator::endpoint_dc_rack { + .dc = node->second.datacenter, + .rack = node->second.rack, + }; + } + return std::nullopt; + } + + return get_dc_rack_for(tm.get_endpoint_for_host_id(host_id)); + }); + co_await tmptr->update_topology_change_info(get_dc_rack_by_host_id); } catch (...) 
{ auto ep = std::current_exception(); slogger.error("Failed to update topology change info for {}: {}", reason, ep); @@ -6161,9 +6319,9 @@ future<> storage_service::load_tablet_metadata() { future<> storage_service::snitch_reconfigured() { assert(this_shard_id() == 0); auto& snitch = _snitch.local(); - co_await mutate_token_metadata([&] (mutable_token_metadata_ptr tmptr) -> future<> { + co_await mutate_token_metadata([&snitch] (mutable_token_metadata_ptr tmptr) -> future<> { // re-read local rack and DC info - tmptr->update_topology(get_broadcast_address(), snitch->get_location()); + tmptr->update_topology(tmptr->get_my_id(), snitch->get_location()); return make_ready_future<>(); }); @@ -6315,7 +6473,7 @@ future storage_service::raft_topology_cmd_handler(raft if (is_repair_based_node_ops_enabled(streaming::stream_reason::bootstrap)) { co_await _repair.local().bootstrap_with_repair(get_token_metadata_ptr(), rs.ring.value().tokens); } else { - dht::boot_strapper bs(_db, _stream_manager, _abort_source, get_broadcast_address(), + dht::boot_strapper bs(_db, _stream_manager, _abort_source, get_token_metadata_ptr()->get_my_id(), locator::endpoint_dc_rack{rs.datacenter, rs.rack}, rs.ring.value().tokens, get_token_metadata_ptr()); co_await bs.bootstrap(streaming::stream_reason::bootstrap, _gossiper, _topology_state_machine._topology.session); } @@ -6339,7 +6497,7 @@ future storage_service::raft_topology_cmd_handler(raft } co_await _repair.local().replace_with_repair(get_token_metadata_ptr(), rs.ring.value().tokens, std::move(ignored_ips)); } else { - dht::boot_strapper bs(_db, _stream_manager, _abort_source, get_broadcast_address(), + dht::boot_strapper bs(_db, _stream_manager, _abort_source, get_token_metadata_ptr()->get_my_id(), locator::endpoint_dc_rack{rs.datacenter, rs.rack}, rs.ring.value().tokens, get_token_metadata_ptr()); auto replaced_id = std::get(_topology_state_machine._topology.req_param[raft_server.id()]).replaced_id; auto existing_ip = 
_group0->address_map().find(replaced_id); @@ -6411,8 +6569,7 @@ future storage_service::raft_topology_cmd_handler(raft co_await _repair.local().rebuild_with_repair(tmptr, std::move(source_dc)); } else { auto streamer = make_lw_shared(_db, _stream_manager, tmptr, _abort_source, - get_broadcast_address(), _snitch.local()->get_location(), "Rebuild", streaming::stream_reason::rebuild, - _topology_state_machine._topology.session); + tmptr->get_my_id(), _snitch.local()->get_location(), "Rebuild", streaming::stream_reason::rebuild, _topology_state_machine._topology.session); streamer->add_source_filter(std::make_unique(_gossiper.get_unreachable_members())); if (source_dc != "") { streamer->add_source_filter(std::make_unique(source_dc)); @@ -6601,10 +6758,9 @@ future<> storage_service::stream_tablet(locator::global_tablet_id tablet) { auto& table = _db.local().find_column_family(tablet.table); std::vector tables = {table.schema()->cf_name()}; - auto streamer = make_lw_shared(_db, _stream_manager, std::move(tm), guard.get_abort_source(), - get_broadcast_address(), _snitch.local()->get_location(), + auto streamer = make_lw_shared(_db, _stream_manager, tm, guard.get_abort_source(), + tm->get_my_id(), _snitch.local()->get_location(), "Tablet migration", streaming::stream_reason::tablet_migration, topo_guard, std::move(tables)); - tm = nullptr; streamer->add_source_filter(std::make_unique( _gossiper.get_unreachable_members())); @@ -7035,8 +7191,12 @@ future storage_service::join_node_response_handler(jo void storage_service::init_messaging_service(bool raft_topology_change_enabled) { _messaging.local().register_node_ops_cmd([this] (const rpc::client_info& cinfo, node_ops_cmd_request req) { auto coordinator = cinfo.retrieve_auxiliary("baddr"); - return container().invoke_on(0, [coordinator, req = std::move(req)] (auto& ss) mutable { - return ss.node_ops_cmd_handler(coordinator, std::move(req)); + std::optional coordinator_host_id; + if (const auto* id = 
cinfo.retrieve_auxiliary_opt("host_id")) { + coordinator_host_id = *id; + } + return container().invoke_on(0, [coordinator, coordinator_host_id, req = std::move(req)] (auto& ss) mutable { + return ss.node_ops_cmd_handler(coordinator, coordinator_host_id, std::move(req)); }); }); if (raft_topology_change_enabled) { @@ -7179,22 +7339,20 @@ future<> storage_service::force_remove_completion() { if (!tm.get_leaving_endpoints().empty()) { auto leaving = tm.get_leaving_endpoints(); slogger.warn("Removal not confirmed, Leaving={}", leaving); - for (auto endpoint : leaving) { - locator::host_id host_id; - auto tokens = tm.get_tokens(endpoint); - try { - host_id = tm.get_host_id(endpoint); - } catch (...) { - slogger.warn("No host_id is found for endpoint {}", endpoint); + for (auto host_id : leaving) { + const auto endpoint = tm.get_endpoint_for_host_id_if_known(host_id); + if (!endpoint) { + slogger.warn("No endpoint is found for host_id {}", host_id); continue; } - auto permit = co_await ss._gossiper.lock_endpoint(endpoint, gms::null_permit_id); + auto tokens = tm.get_tokens(host_id); + auto permit = co_await ss._gossiper.lock_endpoint(*endpoint, gms::null_permit_id); const auto& pid = permit.id(); - co_await ss._gossiper.advertise_token_removed(endpoint, host_id, pid); + co_await ss._gossiper.advertise_token_removed(*endpoint, host_id, pid); std::unordered_set tokens_set(tokens.begin(), tokens.end()); - co_await ss.excise(tokens_set, endpoint, pid); + co_await ss.excise(tokens_set, *endpoint, pid); - slogger.info("force_remove_completion: removing endpoint {} from group 0", endpoint); + slogger.info("force_remove_completion: removing endpoint {} from group 0", *endpoint); assert(ss._group0); bool raft_available = co_await ss._group0->wait_for_raft(); if (raft_available) { diff --git a/service/storage_service.hh b/service/storage_service.hh index 99bf6f58c6..5d21f241e6 100644 --- a/service/storage_service.hh +++ b/service/storage_service.hh @@ -223,7 +223,7 @@ private: 
future<> snitch_reconfigured(); future get_mutable_token_metadata_ptr() noexcept { - return get_token_metadata_ptr()->clone_async().then([] (token_metadata tm) { + return _shared_token_metadata.get()->clone_async().then([] (token_metadata tm) { // bump the token_metadata ring_version // to invalidate cached token/replication mappings // when the modified token_metadata is committed. @@ -270,6 +270,9 @@ private: bool is_me(inet_address addr) const noexcept { return get_token_metadata_ptr()->get_topology().is_me(addr); } + bool is_me(locator::host_id id) const noexcept { + return get_token_metadata_ptr()->get_topology().is_me(id); + } /* This abstraction maintains the token/endpoint metadata information */ shared_token_metadata& _shared_token_metadata; @@ -653,7 +656,7 @@ public: * @param hostIdString token for the node */ future<> removenode(locator::host_id host_id, std::list ignore_nodes); - future node_ops_cmd_handler(gms::inet_address coordinator, node_ops_cmd_request req); + future node_ops_cmd_handler(gms::inet_address coordinator, std::optional coordinator_host_id, node_ops_cmd_request req); void node_ops_cmd_check(gms::inet_address coordinator, const node_ops_cmd_request& req); future<> node_ops_cmd_heartbeat_updater(node_ops_cmd cmd, node_ops_id uuid, std::list nodes, lw_shared_ptr heartbeat_updater_done); void on_node_ops_registered(node_ops_id); diff --git a/test/boost/locator_topology_test.cc b/test/boost/locator_topology_test.cc index 850bf545a3..427e6d26fd 100644 --- a/test/boost/locator_topology_test.cc +++ b/test/boost/locator_topology_test.cc @@ -99,6 +99,7 @@ SEASTAR_THREAD_TEST_CASE(test_update_node) { topology::config cfg = { .this_endpoint = ep1, + .this_host_id = id1, .local_dc_rack = endpoint_dc_rack::default_location, }; @@ -109,12 +110,12 @@ SEASTAR_THREAD_TEST_CASE(test_update_node) { set_abort_on_internal_error(true); }); - topo.add_or_update_endpoint(ep1, endpoint_dc_rack::default_location, node::state::normal); + 
topo.add_or_update_endpoint(id1, std::nullopt, endpoint_dc_rack::default_location, node::state::normal); auto node = topo.this_node(); auto mutable_node = const_cast(node); - node = topo.update_node(mutable_node, id1, std::nullopt, std::nullopt, std::nullopt); + node = topo.update_node(mutable_node, std::nullopt, ep1, std::nullopt, std::nullopt); BOOST_REQUIRE_EQUAL(topo.find_node(id1), node); mutable_node = const_cast(node); @@ -171,6 +172,38 @@ SEASTAR_THREAD_TEST_CASE(test_update_node) { BOOST_REQUIRE_EQUAL(node->get_state(), locator::node::state::left); } +SEASTAR_THREAD_TEST_CASE(test_add_or_update_by_host_id) { + auto id1 = host_id::create_random_id(); + auto id2 = host_id::create_random_id(); + auto ep1 = gms::inet_address("127.0.0.1"); + + // In this test we check that add_or_update_endpoint searches by host_id first. + // We create two nodes, one matches by id, another - by ip, + // and assert that add_or_update_endpoint updates the first. + // We need to make the second node 'being_decommissioned', so that + // it gets removed from ip index and we don't get the non-unique IP error. 
+ + auto topo = topology({}); + //auto topo = topology({}); + topo.add_node(id1, gms::inet_address{}, endpoint_dc_rack::default_location, node::state::normal); + topo.add_node(id2, ep1, endpoint_dc_rack::default_location, node::state::being_decommissioned); + + topo.add_or_update_endpoint(id1, ep1, std::nullopt, node::state::bootstrapping); + + auto* n = topo.find_node(id1); + BOOST_REQUIRE_EQUAL(n->get_state(), node::state::bootstrapping); + BOOST_REQUIRE_EQUAL(n->host_id(), id1); + BOOST_REQUIRE_EQUAL(n->endpoint(), ep1); + + auto* n2 = topo.find_node(ep1); + BOOST_REQUIRE_EQUAL(n, n2); + + auto* n3 = topo.find_node(id2); + BOOST_REQUIRE_EQUAL(n3->get_state(), node::state::being_decommissioned); + BOOST_REQUIRE_EQUAL(n3->host_id(), id2); + BOOST_REQUIRE_EQUAL(n3->endpoint(), ep1); +} + SEASTAR_THREAD_TEST_CASE(test_remove_endpoint) { using dc_endpoints_t = std::unordered_map>; using dc_racks_t = std::unordered_map>>; @@ -203,12 +236,12 @@ SEASTAR_THREAD_TEST_CASE(test_remove_endpoint) { BOOST_REQUIRE_EQUAL(topo.get_datacenter_racks(), (dc_racks_t{{"dc1", {{"rack1", {ep1}}, {"rack2", {ep2}}}}})); BOOST_REQUIRE_EQUAL(topo.get_datacenters(), (dcs_t{"dc1"})); - topo.remove_endpoint(ep2); + topo.remove_endpoint(id2); BOOST_REQUIRE_EQUAL(topo.get_datacenter_endpoints(), (dc_endpoints_t{{"dc1", {ep1}}})); BOOST_REQUIRE_EQUAL(topo.get_datacenter_racks(), (dc_racks_t{{"dc1", {{"rack1", {ep1}}}}})); BOOST_REQUIRE_EQUAL(topo.get_datacenters(), (dcs_t{"dc1"})); - topo.remove_endpoint(ep1); + topo.remove_endpoint(id1); BOOST_REQUIRE_EQUAL(topo.get_datacenter_endpoints(), (dc_endpoints_t{})); BOOST_REQUIRE_EQUAL(topo.get_datacenter_racks(), (dc_racks_t{})); BOOST_REQUIRE_EQUAL(topo.get_datacenters(), (dcs_t{})); @@ -231,6 +264,7 @@ SEASTAR_THREAD_TEST_CASE(test_load_sketch) { shared_token_metadata stm([&sem] () noexcept { return get_units(sem, 1); }, locator::token_metadata::config{ topology::config{ .this_endpoint = ip1, + .this_host_id = host1 } }); @@ -238,9 +272,9 @@ 
SEASTAR_THREAD_TEST_CASE(test_load_sketch) { tm.update_host_id(host1, ip1); tm.update_host_id(host2, ip2); tm.update_host_id(host3, ip3); - tm.update_topology(ip1, locator::endpoint_dc_rack::default_location, std::nullopt, node1_shard_count); - tm.update_topology(ip2, locator::endpoint_dc_rack::default_location, std::nullopt, node2_shard_count); - tm.update_topology(ip3, locator::endpoint_dc_rack::default_location, std::nullopt, node3_shard_count); + tm.update_topology(host1, locator::endpoint_dc_rack::default_location, std::nullopt, node1_shard_count); + tm.update_topology(host2, locator::endpoint_dc_rack::default_location, std::nullopt, node2_shard_count); + tm.update_topology(host3, locator::endpoint_dc_rack::default_location, std::nullopt, node3_shard_count); return make_ready_future<>(); }).get(); diff --git a/test/boost/network_topology_strategy_test.cc b/test/boost/network_topology_strategy_test.cc index 80fd48443e..159fc3bcb2 100644 --- a/test/boost/network_topology_strategy_test.cc +++ b/test/boost/network_topology_strategy_test.cc @@ -72,7 +72,7 @@ static void check_ranges_are_sorted(vnode_effective_replication_map_ptr erm, gms void strategy_sanity_check( replication_strategy_ptr ars_ptr, - const token_metadata& tm, + const token_metadata_ptr& tm, const std::map& options) { const network_topology_strategy* nts_ptr = @@ -90,16 +90,16 @@ void strategy_sanity_check( total_rf += rf; } - BOOST_CHECK(ars_ptr->get_replication_factor(tm) == total_rf); + BOOST_CHECK(ars_ptr->get_replication_factor(*tm) == total_rf); } void endpoints_check( replication_strategy_ptr ars_ptr, - const token_metadata& tm, + const token_metadata_ptr& tm, const inet_address_vector_replica_set& endpoints, const locator::topology& topo) { - auto&& nodes_per_dc = tm.get_topology().get_datacenter_endpoints(); + auto&& nodes_per_dc = tm->get_topology().get_datacenter_endpoints(); const network_topology_strategy* nts_ptr = dynamic_cast(ars_ptr.get()); @@ -111,7 +111,7 @@ void endpoints_check( 
// Check the total RF BOOST_CHECK(endpoints.size() == total_rf); - BOOST_CHECK(total_rf <= ars_ptr->get_replication_factor(tm)); + BOOST_CHECK(total_rf <= ars_ptr->get_replication_factor(*tm)); // Check the uniqueness std::unordered_set ep_set(endpoints.begin(), endpoints.end()); @@ -159,7 +159,7 @@ void full_ring_check(const std::vector& ring_points, locator::token_metadata_ptr tmptr) { auto& tm = *tmptr; const auto& topo = tm.get_topology(); - strategy_sanity_check(ars_ptr, tm, options); + strategy_sanity_check(ars_ptr, tmptr, options); auto erm = calculate_effective_replication_map(ars_ptr, tmptr).get0(); @@ -168,7 +168,7 @@ void full_ring_check(const std::vector& ring_points, token t1(dht::token::kind::key, d2t(cur_point1 / ring_points.size())); auto endpoints1 = erm->get_natural_endpoints(t1); - endpoints_check(ars_ptr, tm, endpoints1, topo); + endpoints_check(ars_ptr, tmptr, endpoints1, topo); print_natural_endpoints(cur_point1, endpoints1); @@ -181,7 +181,7 @@ void full_ring_check(const std::vector& ring_points, token t2(dht::token::kind::key, d2t(cur_point2 / ring_points.size())); auto endpoints2 = erm->get_natural_endpoints(t2); - endpoints_check(ars_ptr, tm, endpoints2, topo); + endpoints_check(ars_ptr, tmptr, endpoints2, topo); check_ranges_are_sorted(erm, rp.host); BOOST_CHECK(endpoints1 == endpoints2); } @@ -194,23 +194,17 @@ void full_ring_check(const tablet_map& tmap, auto& tm = *tmptr; const auto& topo = tm.get_topology(); - auto get_endpoint_for_host_id = [&] (host_id host) { - auto endpoint_opt = tm.get_endpoint_for_host_id(host); - assert(endpoint_opt); - return *endpoint_opt; - }; - auto to_endpoint_set = [&] (const tablet_replica_set& replicas) { inet_address_vector_replica_set result; result.reserve(replicas.size()); for (auto&& replica : replicas) { - result.emplace_back(get_endpoint_for_host_id(replica.host)); + result.emplace_back(tm.get_endpoint_for_host_id(replica.host)); } return result; }; for (tablet_id tb : tmap.tablet_ids()) { - 
endpoints_check(rs_ptr, tm, to_endpoint_set(tmap.get_tablet_info(tb).replicas), topo); + endpoints_check(rs_ptr, tmptr, to_endpoint_set(tmap.get_tablet_info(tb).replicas), topo); } } @@ -262,7 +256,7 @@ void simple_test() { std::unordered_set tokens; tokens.insert({dht::token::kind::key, d2t(ring_point / ring_points.size())}); topo.add_node(id, endpoint, make_endpoint_dc_rack(endpoint), locator::node::state::normal); - co_await tm.update_normal_tokens(std::move(tokens), endpoint); + co_await tm.update_normal_tokens(std::move(tokens), id); } }).get(); @@ -367,7 +361,7 @@ void heavy_origin_test() { auto& topo = tm.get_topology(); for (const auto& [ring_point, endpoint, id] : ring_points) { topo.add_node(id, endpoint, make_endpoint_dc_rack(endpoint), locator::node::state::normal); - co_await tm.update_normal_tokens(std::move(tokens[endpoint]), endpoint); + co_await tm.update_normal_tokens(tokens[endpoint], id); } }).get(); @@ -426,7 +420,7 @@ SEASTAR_THREAD_TEST_CASE(NetworkTopologyStrategy_tablets_test) { tokens.insert({dht::token::kind::key, d2t(ring_point / ring_points.size())}); topo.add_node(id, endpoint, make_endpoint_dc_rack(endpoint), locator::node::state::normal, 1); tm.update_host_id(id, endpoint); - co_await tm.update_normal_tokens(std::move(tokens), endpoint); + co_await tm.update_normal_tokens(std::move(tokens), id); } }).get(); @@ -497,7 +491,7 @@ static size_t get_replication_factor(const sstring& dc, } static bool has_sufficient_replicas(const sstring& dc, - const std::unordered_map>& dc_replicas, + const std::unordered_map>& dc_replicas, const std::unordered_map>& all_endpoints, const std::unordered_map& datacenters) noexcept { auto dc_replicas_it = dc_replicas.find(dc); @@ -515,7 +509,7 @@ static bool has_sufficient_replicas(const sstring& dc, } static bool has_sufficient_replicas( - const std::unordered_map>& dc_replicas, + const std::unordered_map>& dc_replicas, const std::unordered_map>& all_endpoints, const std::unordered_map& datacenters) 
noexcept { @@ -529,7 +523,7 @@ static bool has_sufficient_replicas( return true; } -static locator::endpoint_set calculate_natural_endpoints( +static locator::host_id_set calculate_natural_endpoints( const token& search_token, const token_metadata& tm, const locator::topology& topo, const std::unordered_map& datacenters) { @@ -537,10 +531,10 @@ static locator::endpoint_set calculate_natural_endpoints( // We want to preserve insertion order so that the first added endpoint // becomes primary. // - locator::endpoint_set replicas; + locator::host_id_set replicas; // replicas we have found in each DC - std::unordered_map> dc_replicas; + std::unordered_map> dc_replicas; // tracks the racks we have already placed replicas in std::unordered_map> seen_racks; // @@ -548,7 +542,7 @@ static locator::endpoint_set calculate_natural_endpoints( // when we relax the rack uniqueness we can append this to the current // result so we don't have to wind back the iterator // - std::unordered_map + std::unordered_map skipped_dc_endpoints; // @@ -589,7 +583,7 @@ static locator::endpoint_set calculate_natural_endpoints( break; } - inet_address ep = *tm.get_endpoint(next); + host_id ep = *tm.get_endpoint(next); sstring dc = topo.get_location(ep).dc; auto& seen_racks_dc_set = seen_racks[dc]; @@ -628,7 +622,7 @@ static locator::endpoint_set calculate_natural_endpoints( auto skipped_it = skipped_dc_endpoints_set.begin(); while (skipped_it != skipped_dc_endpoints_set.end() && !has_sufficient_replicas(dc, dc_replicas, all_endpoints, datacenters)) { - inet_address skipped = *skipped_it++; + host_id skipped = *skipped_it++; dc_replicas_dc_set.insert(skipped); replicas.push_back(skipped); } @@ -660,21 +654,21 @@ static void test_equivalence(const shared_token_metadata& stm, const locator::to for (size_t i = 0; i < 1000; ++i) { auto token = dht::token::get_random_token(); auto expected = calculate_natural_endpoints(token, tm, topo, datacenters); - auto actual = 
nts.calculate_natural_endpoints(token, tm).get0(); + auto actual = nts.calculate_natural_endpoints(token, *stm.get()).get0(); // Because the old algorithm does not put the nodes in the correct order in the case where more replicas // are required than there are racks in a dc, we accept different order as long as the primary // replica is the same. BOOST_REQUIRE_EQUAL(expected[0], actual[0]); - BOOST_REQUIRE_EQUAL(std::set(expected.begin(), expected.end()), - std::set(actual.begin(), actual.end())); + BOOST_REQUIRE_EQUAL(std::set(expected.begin(), expected.end()), + std::set(actual.begin(), actual.end())); } } -void generate_topology(topology& topo, const std::unordered_map datacenters, const std::vector& nodes) { +void generate_topology(topology& topo, const std::unordered_map datacenters, const std::vector& nodes) { auto& e1 = seastar::testing::local_random_engine; std::unordered_map racks_per_dc; @@ -694,11 +688,12 @@ void generate_topology(topology& topo, const std::unordered_map out = std::fill_n(out, rf, std::cref(dc)); } + unsigned i = 0; for (auto& node : nodes) { const sstring& dc = dcs[udist(0, dcs.size() - 1)(e1)]; auto rc = racks_per_dc.at(dc); auto r = udist(0, rc)(e1); - topo.add_node(host_id::create_random_id(), node, {dc, to_sstring(r)}, locator::node::state::normal); + topo.add_node(node, inet_address((127u << 24) | ++i), {dc, to_sstring(r)}, locator::node::state::normal); } } @@ -719,10 +714,10 @@ SEASTAR_THREAD_TEST_CASE(testCalculateEndpoints) { { "rf5_2", 5 }, { "rf5_3", 5 }, }; - std::vector nodes; + std::vector nodes; nodes.reserve(NODES); std::generate_n(std::back_inserter(nodes), NODES, [i = 0u]() mutable { - return inet_address((127u << 24) | ++i); + return host_id{utils::UUID(0, ++i)}; }); for (size_t run = 0; run < RUNS; ++run) { @@ -733,7 +728,7 @@ SEASTAR_THREAD_TEST_CASE(testCalculateEndpoints) { while (random_tokens.size() < nodes.size() * VNODES) { random_tokens.insert(dht::token::get_random_token()); } - std::unordered_map> 
endpoint_tokens; + std::unordered_map> endpoint_tokens; auto next_token_it = random_tokens.begin(); for (auto& node : nodes) { for (size_t i = 0; i < VNODES; ++i) { @@ -741,7 +736,7 @@ SEASTAR_THREAD_TEST_CASE(testCalculateEndpoints) { next_token_it++; } } - + stm.mutate_token_metadata([&] (token_metadata& tm) -> future<> { generate_topology(tm.get_topology(), datacenters, nodes); for (auto&& i : endpoint_tokens) { @@ -826,17 +821,17 @@ SEASTAR_THREAD_TEST_CASE(test_topology_compare_endpoints) { { "rf2", 2 }, { "rf3", 3 }, }; - std::vector nodes; + std::vector nodes; nodes.reserve(NODES); auto make_address = [] (unsigned i) { - return inet_address((127u << 24) | i); + return host_id{utils::UUID(0, i)}; }; std::generate_n(std::back_inserter(nodes), NODES, [&, i = 0u]() mutable { return make_address(++i); }); - auto bogus_address = make_address(NODES + 1); + auto bogus_address = inet_address((127u << 24) | static_cast(NODES + 1)); semaphore sem(1); shared_token_metadata stm([&sem] () noexcept { return get_units(sem, 1); }, tm_cfg); @@ -844,9 +839,9 @@ SEASTAR_THREAD_TEST_CASE(test_topology_compare_endpoints) { auto& topo = tm.get_topology(); generate_topology(topo, datacenters, nodes); - const auto& address = nodes[tests::random::get_int(0, NODES-1)]; - const auto& a1 = nodes[tests::random::get_int(0, NODES-1)]; - const auto& a2 = nodes[tests::random::get_int(0, NODES-1)]; + const auto& address = tm.get_endpoint_for_host_id(nodes[tests::random::get_int(0, NODES-1)]); + const auto& a1 = tm.get_endpoint_for_host_id(nodes[tests::random::get_int(0, NODES-1)]); + const auto& a2 = tm.get_endpoint_for_host_id(nodes[tests::random::get_int(0, NODES-1)]); topo.test_compare_endpoints(address, address, address); topo.test_compare_endpoints(address, address, a1); @@ -911,7 +906,7 @@ SEASTAR_THREAD_TEST_CASE(test_topology_tracks_local_node) { // Removing local node stm.mutate_token_metadata([&] (token_metadata& tm) { - tm.remove_endpoint(ip1); + tm.remove_endpoint(host1); 
tm.update_host_id(host3, ip3); return make_ready_future<>(); }).get(); @@ -924,7 +919,7 @@ SEASTAR_THREAD_TEST_CASE(test_topology_tracks_local_node) { // Removing node with no local node stm.mutate_token_metadata([&] (token_metadata& tm) { - tm.remove_endpoint(ip2); + tm.remove_endpoint(host2); return make_ready_future<>(); }).get(); @@ -960,7 +955,7 @@ SEASTAR_THREAD_TEST_CASE(test_topology_tracks_local_node) { stm.mutate_token_metadata([&] (token_metadata& tm) -> future<> { co_await tm.clear_gently(); - tm.get_topology().add_or_update_endpoint(ip1, host1, ip1_dc_rack_v2, node::state::being_decommissioned); + tm.get_topology().add_or_update_endpoint(host1, ip1, ip1_dc_rack_v2, node::state::being_decommissioned); }).get(); n1 = stm.get()->get_topology().find_node(host1); diff --git a/test/boost/storage_proxy_test.cc b/test/boost/storage_proxy_test.cc index 1a40d21938..0006d7604c 100644 --- a/test/boost/storage_proxy_test.cc +++ b/test/boost/storage_proxy_test.cc @@ -55,8 +55,9 @@ SEASTAR_TEST_CASE(test_get_restricted_ranges) { { // Ring with minimum token auto tmptr = locator::make_token_metadata_ptr(locator::token_metadata::config{}); - tmptr->update_topology(gms::inet_address("10.0.0.1"), locator::endpoint_dc_rack{"dc1", "rack1"}); - tmptr->update_normal_tokens(std::unordered_set({dht::minimum_token()}), gms::inet_address("10.0.0.1")).get(); + const auto host_id = locator::host_id{utils::UUID(0, 1)}; + tmptr->update_topology(host_id, locator::endpoint_dc_rack{"dc1", "rack1"}); + tmptr->update_normal_tokens(std::unordered_set({dht::minimum_token()}), host_id).get(); check(tmptr, dht::partition_range::make_singular(ring[0]), { dht::partition_range::make_singular(ring[0]) @@ -69,10 +70,12 @@ SEASTAR_TEST_CASE(test_get_restricted_ranges) { { auto tmptr = locator::make_token_metadata_ptr(locator::token_metadata::config{}); - tmptr->update_topology(gms::inet_address("10.0.0.1"), locator::endpoint_dc_rack{"dc1", "rack1"}); - 
tmptr->update_normal_tokens(std::unordered_set({ring[2].token()}), gms::inet_address("10.0.0.1")).get(); - tmptr->update_topology(gms::inet_address("10.0.0.2"), locator::endpoint_dc_rack{"dc1", "rack1"}); - tmptr->update_normal_tokens(std::unordered_set({ring[5].token()}), gms::inet_address("10.0.0.2")).get(); + const auto id1 = locator::host_id{utils::UUID(0, 1)}; + const auto id2 = locator::host_id{utils::UUID(0, 2)}; + tmptr->update_topology(id1, locator::endpoint_dc_rack{"dc1", "rack1"}); + tmptr->update_normal_tokens(std::unordered_set({ring[2].token()}), id1).get(); + tmptr->update_topology(id2, locator::endpoint_dc_rack{"dc1", "rack1"}); + tmptr->update_normal_tokens(std::unordered_set({ring[5].token()}), id2).get(); check(tmptr, dht::partition_range::make_singular(ring[0]), { dht::partition_range::make_singular(ring[0]) diff --git a/test/boost/tablets_test.cc b/test/boost/tablets_test.cc index 754b216849..8755f1647f 100644 --- a/test/boost/tablets_test.cc +++ b/test/boost/tablets_test.cc @@ -434,7 +434,7 @@ SEASTAR_TEST_CASE(test_sharder) { auto table1 = table_id(utils::UUID_gen::get_time_UUID()); token_metadata tokm(token_metadata::config{ .topo_cfg{ .this_host_id = h1 } }); - tokm.get_topology().add_or_update_endpoint(tokm.get_topology().my_address(), h1); + tokm.get_topology().add_or_update_endpoint(h1, tokm.get_topology().my_address()); std::vector tablet_ids; { @@ -689,13 +689,13 @@ SEASTAR_THREAD_TEST_CASE(test_load_balancing_with_empty_node) { } }); - stm.mutate_token_metadata([&] (auto& tm) { + stm.mutate_token_metadata([&] (token_metadata& tm) { tm.update_host_id(host1, ip1); tm.update_host_id(host2, ip2); tm.update_host_id(host3, ip3); - tm.update_topology(ip1, locator::endpoint_dc_rack::default_location, std::nullopt, shard_count); - tm.update_topology(ip2, locator::endpoint_dc_rack::default_location, std::nullopt, shard_count); - tm.update_topology(ip3, locator::endpoint_dc_rack::default_location, std::nullopt, shard_count); + 
tm.update_topology(host1, locator::endpoint_dc_rack::default_location, std::nullopt, shard_count); + tm.update_topology(host2, locator::endpoint_dc_rack::default_location, std::nullopt, shard_count); + tm.update_topology(host3, locator::endpoint_dc_rack::default_location, std::nullopt, shard_count); tablet_map tmap(4); auto tid = tmap.first_tablet(); @@ -783,15 +783,15 @@ SEASTAR_THREAD_TEST_CASE(test_decommission_rf_met) { } }); - stm.mutate_token_metadata([&](auto& tm) { + stm.mutate_token_metadata([&](token_metadata& tm) { const unsigned shard_count = 2; tm.update_host_id(host1, ip1); tm.update_host_id(host2, ip2); tm.update_host_id(host3, ip3); - tm.update_topology(ip1, locator::endpoint_dc_rack::default_location, std::nullopt, shard_count); - tm.update_topology(ip2, locator::endpoint_dc_rack::default_location, std::nullopt, shard_count); - tm.update_topology(ip3, locator::endpoint_dc_rack::default_location, node::state::being_decommissioned, + tm.update_topology(host1, locator::endpoint_dc_rack::default_location, std::nullopt, shard_count); + tm.update_topology(host2, locator::endpoint_dc_rack::default_location, std::nullopt, shard_count); + tm.update_topology(host3, locator::endpoint_dc_rack::default_location, node::state::being_decommissioned, shard_count); tablet_map tmap(4); @@ -839,8 +839,8 @@ SEASTAR_THREAD_TEST_CASE(test_decommission_rf_met) { BOOST_REQUIRE(load.get_avg_shard_load(host3) == 0); } - stm.mutate_token_metadata([&](auto& tm) { - tm.update_topology(ip3, locator::endpoint_dc_rack::default_location, node::state::left); + stm.mutate_token_metadata([&](token_metadata& tm) { + tm.update_topology(host3, locator::endpoint_dc_rack::default_location, node::state::left); return make_ready_future<>(); }).get(); @@ -885,17 +885,17 @@ SEASTAR_THREAD_TEST_CASE(test_decommission_two_racks) { } }); - stm.mutate_token_metadata([&](auto& tm) { + stm.mutate_token_metadata([&](token_metadata& tm) { const unsigned shard_count = 1; tm.update_host_id(host1, ip1); 
tm.update_host_id(host2, ip2); tm.update_host_id(host3, ip3); tm.update_host_id(host4, ip4); - tm.update_topology(ip1, racks[0], std::nullopt, shard_count); - tm.update_topology(ip2, racks[1], std::nullopt, shard_count); - tm.update_topology(ip3, racks[0], std::nullopt, shard_count); - tm.update_topology(ip4, racks[1], node::state::being_decommissioned, + tm.update_topology(host1, racks[0], std::nullopt, shard_count); + tm.update_topology(host2, racks[1], std::nullopt, shard_count); + tm.update_topology(host3, racks[0], std::nullopt, shard_count); + tm.update_topology(host4, racks[1], node::state::being_decommissioned, shard_count); tablet_map tmap(4); @@ -986,17 +986,17 @@ SEASTAR_THREAD_TEST_CASE(test_decommission_rack_load_failure) { } }); - stm.mutate_token_metadata([&](auto& tm) { + stm.mutate_token_metadata([&](token_metadata& tm) { const unsigned shard_count = 1; tm.update_host_id(host1, ip1); tm.update_host_id(host2, ip2); tm.update_host_id(host3, ip3); tm.update_host_id(host4, ip4); - tm.update_topology(ip1, racks[0], std::nullopt, shard_count); - tm.update_topology(ip2, racks[0], std::nullopt, shard_count); - tm.update_topology(ip3, racks[0], std::nullopt, shard_count); - tm.update_topology(ip4, racks[1], node::state::being_decommissioned, + tm.update_topology(host1, racks[0], std::nullopt, shard_count); + tm.update_topology(host2, racks[0], std::nullopt, shard_count); + tm.update_topology(host3, racks[0], std::nullopt, shard_count); + tm.update_topology(host4, racks[1], node::state::being_decommissioned, shard_count); tablet_map tmap(4); @@ -1060,15 +1060,15 @@ SEASTAR_THREAD_TEST_CASE(test_decommission_rf_not_met) { } }); - stm.mutate_token_metadata([&](auto& tm) { + stm.mutate_token_metadata([&](token_metadata& tm) { const unsigned shard_count = 2; tm.update_host_id(host1, ip1); tm.update_host_id(host2, ip2); tm.update_host_id(host3, ip3); - tm.update_topology(ip1, locator::endpoint_dc_rack::default_location, std::nullopt, shard_count); - 
tm.update_topology(ip2, locator::endpoint_dc_rack::default_location, std::nullopt, shard_count); - tm.update_topology(ip3, locator::endpoint_dc_rack::default_location, node::state::being_decommissioned, + tm.update_topology(host1, locator::endpoint_dc_rack::default_location, std::nullopt, shard_count); + tm.update_topology(host2, locator::endpoint_dc_rack::default_location, std::nullopt, shard_count); + tm.update_topology(host3, locator::endpoint_dc_rack::default_location, node::state::being_decommissioned, shard_count); tablet_map tmap(1); @@ -1117,13 +1117,13 @@ SEASTAR_THREAD_TEST_CASE(test_load_balancing_works_with_in_progress_transitions) } }); - stm.mutate_token_metadata([&] (auto& tm) { + stm.mutate_token_metadata([&] (token_metadata& tm) { tm.update_host_id(host1, ip1); tm.update_host_id(host2, ip2); tm.update_host_id(host3, ip3); - tm.update_topology(ip1, locator::endpoint_dc_rack::default_location, std::nullopt, 1); - tm.update_topology(ip2, locator::endpoint_dc_rack::default_location, std::nullopt, 1); - tm.update_topology(ip3, locator::endpoint_dc_rack::default_location, std::nullopt, 2); + tm.update_topology(host1, locator::endpoint_dc_rack::default_location, std::nullopt, 1); + tm.update_topology(host2, locator::endpoint_dc_rack::default_location, std::nullopt, 1); + tm.update_topology(host3, locator::endpoint_dc_rack::default_location, std::nullopt, 2); tablet_map tmap(4); std::optional tid = tmap.first_tablet(); @@ -1186,13 +1186,13 @@ SEASTAR_THREAD_TEST_CASE(test_load_balancer_shuffle_mode) { } }); - stm.mutate_token_metadata([&] (auto& tm) { + stm.mutate_token_metadata([&] (token_metadata& tm) { tm.update_host_id(host1, ip1); tm.update_host_id(host2, ip2); tm.update_host_id(host3, ip3); - tm.update_topology(ip1, locator::endpoint_dc_rack::default_location, std::nullopt, 1); - tm.update_topology(ip2, locator::endpoint_dc_rack::default_location, std::nullopt, 1); - tm.update_topology(ip3, locator::endpoint_dc_rack::default_location, std::nullopt, 
2); + tm.update_topology(host1, locator::endpoint_dc_rack::default_location, std::nullopt, 1); + tm.update_topology(host2, locator::endpoint_dc_rack::default_location, std::nullopt, 1); + tm.update_topology(host3, locator::endpoint_dc_rack::default_location, std::nullopt, 2); tablet_map tmap(4); std::optional tid = tmap.first_tablet(); @@ -1249,15 +1249,15 @@ SEASTAR_THREAD_TEST_CASE(test_load_balancing_with_two_empty_nodes) { } }); - stm.mutate_token_metadata([&] (auto& tm) { + stm.mutate_token_metadata([&] (token_metadata& tm) { tm.update_host_id(host1, ip1); tm.update_host_id(host2, ip2); tm.update_host_id(host3, ip3); tm.update_host_id(host4, ip4); - tm.update_topology(ip1, locator::endpoint_dc_rack::default_location, std::nullopt, shard_count); - tm.update_topology(ip2, locator::endpoint_dc_rack::default_location, std::nullopt, shard_count); - tm.update_topology(ip3, locator::endpoint_dc_rack::default_location, std::nullopt, shard_count); - tm.update_topology(ip4, locator::endpoint_dc_rack::default_location, std::nullopt, shard_count); + tm.update_topology(host1, locator::endpoint_dc_rack::default_location, std::nullopt, shard_count); + tm.update_topology(host2, locator::endpoint_dc_rack::default_location, std::nullopt, shard_count); + tm.update_topology(host3, locator::endpoint_dc_rack::default_location, std::nullopt, shard_count); + tm.update_topology(host4, locator::endpoint_dc_rack::default_location, std::nullopt, shard_count); tablet_map tmap(16); for (auto tid : tmap.tablet_ids()) { @@ -1312,8 +1312,8 @@ SEASTAR_THREAD_TEST_CASE(test_load_balancer_disabling) { stm.mutate_token_metadata([&] (auto& tm) { tm.update_host_id(host1, ip1); tm.update_host_id(host2, ip2); - tm.update_topology(ip1, locator::endpoint_dc_rack::default_location, std::nullopt, shard_count); - tm.update_topology(ip2, locator::endpoint_dc_rack::default_location, std::nullopt, shard_count); + tm.update_topology(host1, locator::endpoint_dc_rack::default_location, std::nullopt, 
shard_count); + tm.update_topology(host2, locator::endpoint_dc_rack::default_location, std::nullopt, shard_count); tablet_map tmap(16); for (auto tid : tmap.tablet_ids()) { @@ -1399,12 +1399,13 @@ SEASTAR_THREAD_TEST_CASE(test_load_balancing_with_random_load) { shared_token_metadata stm([&sem]() noexcept { return get_units(sem, 1); }, locator::token_metadata::config { locator::topology::config { .this_endpoint = inet_address("192.168.0.1"), + .this_host_id = hosts[0], .local_dc_rack = racks[1] } }); size_t total_tablet_count = 0; - stm.mutate_token_metadata([&](auto& tm) { + stm.mutate_token_metadata([&](token_metadata& tm) { tablet_metadata tmeta; int i = 0; @@ -1413,7 +1414,7 @@ SEASTAR_THREAD_TEST_CASE(test_load_balancing_with_random_load) { auto shard_count = 2; tm.update_host_id(h, ip); auto rack = racks[i % racks.size()]; - tm.update_topology(ip, rack, std::nullopt, shard_count); + tm.update_topology(h, rack, std::nullopt, shard_count); if (h != hosts[0]) { // Leave the first host empty by making it invisible to allocation algorithm. 
hosts_by_rack[rack.rack].push_back(h); diff --git a/test/boost/token_metadata_test.cc b/test/boost/token_metadata_test.cc index 3dfe63155d..94b8f5c1d4 100644 --- a/test/boost/token_metadata_test.cc +++ b/test/boost/token_metadata_test.cc @@ -17,19 +17,22 @@ using namespace locator; namespace { const auto ks_name = sstring("test-ks"); - endpoint_dc_rack get_dc_rack(inet_address) { + host_id gen_id(int id) { + return host_id{utils::UUID(0, id)}; + } + + endpoint_dc_rack get_dc_rack(host_id) { return { .dc = "unk-dc", .rack = "unk-rack" }; } - mutable_token_metadata_ptr create_token_metadata(inet_address this_endpoint) { + mutable_token_metadata_ptr create_token_metadata(host_id this_host_id) { return make_lw_shared(token_metadata::config { topology::config { - .this_endpoint = this_endpoint, - .this_cql_address = this_endpoint, - .local_dc_rack = get_dc_rack(this_endpoint) + .this_host_id = this_host_id, + .local_dc_rack = get_dc_rack(this_host_id) } }); } @@ -39,21 +42,25 @@ namespace { dc_rack_fn get_dc_rack_fn = get_dc_rack; tmptr->update_topology_change_info(get_dc_rack_fn).get(); auto strategy = seastar::make_shared(std::move(opts)); - return calculate_effective_replication_map(std::move(strategy), std::move(tmptr)).get0(); + return calculate_effective_replication_map(std::move(strategy), tmptr).get0(); } } SEASTAR_THREAD_TEST_CASE(test_pending_and_read_endpoints_for_everywhere_strategy) { const auto e1 = inet_address("192.168.0.1"); const auto e2 = inet_address("192.168.0.2"); + const auto e1_id = gen_id(1); + const auto e2_id = gen_id(2); const auto t1 = dht::token::from_int64(10); const auto t2 = dht::token::from_int64(20); - auto token_metadata = create_token_metadata(e1); - token_metadata->update_topology(e1, get_dc_rack(e1)); - token_metadata->update_topology(e2, get_dc_rack(e2)); - token_metadata->update_normal_tokens({t1}, e1).get(); - token_metadata->add_bootstrap_token(t2, e2); + auto token_metadata = create_token_metadata(e1_id); + 
token_metadata->update_host_id(e1_id, e1); + token_metadata->update_host_id(e2_id, e2); + token_metadata->update_topology(e1_id, get_dc_rack(e1_id)); + token_metadata->update_topology(e2_id, get_dc_rack(e2_id)); + token_metadata->update_normal_tokens({t1}, e1_id).get(); + token_metadata->add_bootstrap_token(t2, e2_id); token_metadata->set_read_new(token_metadata::read_new_t::yes); auto erm = create_erm(token_metadata); @@ -68,12 +75,16 @@ SEASTAR_THREAD_TEST_CASE(test_pending_endpoints_for_bootstrap_second_node) { const auto t1 = dht::token::from_int64(1); const auto e2 = inet_address("192.168.0.2"); const auto t2 = dht::token::from_int64(100); + const auto e1_id = gen_id(1); + const auto e2_id = gen_id(2); - auto token_metadata = create_token_metadata(e1); - token_metadata->update_topology(e1, get_dc_rack(e1)); - token_metadata->update_topology(e2, get_dc_rack(e2)); - token_metadata->update_normal_tokens({t1}, e1).get(); - token_metadata->add_bootstrap_token(t2, e2); + auto token_metadata = create_token_metadata(e1_id); + token_metadata->update_host_id(e1_id, e1); + token_metadata->update_host_id(e2_id, e2); + token_metadata->update_topology(e1_id, get_dc_rack(e1_id)); + token_metadata->update_topology(e2_id, get_dc_rack(e2_id)); + token_metadata->update_normal_tokens({t1}, e1_id).get(); + token_metadata->add_bootstrap_token(t2, e2_id); auto erm = create_erm(token_metadata, {{"replication_factor", "1"}}); BOOST_REQUIRE_EQUAL(erm->get_pending_endpoints(dht::token::from_int64(0)), @@ -96,14 +107,20 @@ SEASTAR_THREAD_TEST_CASE(test_pending_endpoints_for_bootstrap_with_replicas) { const auto e1 = inet_address("192.168.0.1"); const auto e2 = inet_address("192.168.0.2"); const auto e3 = inet_address("192.168.0.3"); + const auto e1_id = gen_id(1); + const auto e2_id = gen_id(2); + const auto e3_id = gen_id(3); - auto token_metadata = create_token_metadata(e1); - token_metadata->update_topology(e1, get_dc_rack(e1)); - token_metadata->update_topology(e2, get_dc_rack(e2)); 
- token_metadata->update_topology(e3, get_dc_rack(e3)); - token_metadata->update_normal_tokens({t1, t1000}, e2).get(); - token_metadata->update_normal_tokens({t10}, e3).get(); - token_metadata->add_bootstrap_token(t100, e1); + auto token_metadata = create_token_metadata(e1_id); + token_metadata->update_host_id(e1_id, e1); + token_metadata->update_host_id(e2_id, e2); + token_metadata->update_host_id(e3_id, e3); + token_metadata->update_topology(e1_id, get_dc_rack(e1_id)); + token_metadata->update_topology(e2_id, get_dc_rack(e2_id)); + token_metadata->update_topology(e3_id, get_dc_rack(e3_id)); + token_metadata->update_normal_tokens({t1, t1000}, e2_id).get(); + token_metadata->update_normal_tokens({t10}, e3_id).get(); + token_metadata->add_bootstrap_token(t100, e1_id); auto erm = create_erm(token_metadata, {{"replication_factor", "2"}}); BOOST_REQUIRE_EQUAL(erm->get_pending_endpoints(dht::token::from_int64(1)), @@ -126,15 +143,21 @@ SEASTAR_THREAD_TEST_CASE(test_pending_endpoints_for_leave_with_replicas) { const auto e1 = inet_address("192.168.0.1"); const auto e2 = inet_address("192.168.0.2"); const auto e3 = inet_address("192.168.0.3"); + const auto e1_id = gen_id(1); + const auto e2_id = gen_id(2); + const auto e3_id = gen_id(3); - auto token_metadata = create_token_metadata(e1); - token_metadata->update_topology(e1, get_dc_rack(e1)); - token_metadata->update_topology(e2, get_dc_rack(e2)); - token_metadata->update_topology(e3, get_dc_rack(e3)); - token_metadata->update_normal_tokens({t1, t1000}, e2).get(); - token_metadata->update_normal_tokens({t10}, e3).get(); - token_metadata->update_normal_tokens({t100}, e1).get(); - token_metadata->add_leaving_endpoint(e1); + auto token_metadata = create_token_metadata(e1_id); + token_metadata->update_host_id(e1_id, e1); + token_metadata->update_host_id(e2_id, e2); + token_metadata->update_host_id(e3_id, e3); + token_metadata->update_topology(e1_id, get_dc_rack(e1_id)); + token_metadata->update_topology(e2_id, 
get_dc_rack(e2_id)); + token_metadata->update_topology(e3_id, get_dc_rack(e3_id)); + token_metadata->update_normal_tokens({t1, t1000}, e2_id).get(); + token_metadata->update_normal_tokens({t10}, e3_id).get(); + token_metadata->update_normal_tokens({t100}, e1_id).get(); + token_metadata->add_leaving_endpoint(e1_id); auto erm = create_erm(token_metadata, {{"replication_factor", "2"}}); BOOST_REQUIRE_EQUAL(erm->get_pending_endpoints(dht::token::from_int64(1)), @@ -158,16 +181,24 @@ SEASTAR_THREAD_TEST_CASE(test_pending_endpoints_for_replace_with_replicas) { const auto e2 = inet_address("192.168.0.2"); const auto e3 = inet_address("192.168.0.3"); const auto e4 = inet_address("192.168.0.4"); + const auto e1_id = gen_id(1); + const auto e2_id = gen_id(2); + const auto e3_id = gen_id(3); + const auto e4_id = gen_id(4); - auto token_metadata = create_token_metadata(e1); - token_metadata->update_topology(e1, get_dc_rack(e1)); - token_metadata->update_topology(e2, get_dc_rack(e2)); - token_metadata->update_topology(e3, get_dc_rack(e3)); - token_metadata->update_topology(e4, get_dc_rack(e4)); - token_metadata->update_normal_tokens({t1000}, e1).get(); - token_metadata->update_normal_tokens({t1, t100}, e2).get(); - token_metadata->update_normal_tokens({t10}, e3).get(); - token_metadata->add_replacing_endpoint(e3, e4); + auto token_metadata = create_token_metadata(e1_id); + token_metadata->update_host_id(e1_id, e1); + token_metadata->update_host_id(e2_id, e2); + token_metadata->update_host_id(e3_id, e3); + token_metadata->update_host_id(e4_id, e4); + token_metadata->update_topology(e1_id, get_dc_rack(e1_id)); + token_metadata->update_topology(e2_id, get_dc_rack(e2_id)); + token_metadata->update_topology(e3_id, get_dc_rack(e3_id)); + token_metadata->update_topology(e4_id, get_dc_rack(e4_id)); + token_metadata->update_normal_tokens({t1000}, e1_id).get(); + token_metadata->update_normal_tokens({t1, t100}, e2_id).get(); + token_metadata->update_normal_tokens({t10}, e3_id).get(); + 
token_metadata->add_replacing_endpoint(e3_id, e4_id); auto erm = create_erm(token_metadata, {{"replication_factor", "2"}}); BOOST_REQUIRE_EQUAL(erm->get_pending_endpoints(dht::token::from_int64(100)), @@ -194,14 +225,20 @@ SEASTAR_THREAD_TEST_CASE(test_endpoints_for_reading_when_bootstrap_with_replicas const auto e1 = inet_address("192.168.0.1"); const auto e2 = inet_address("192.168.0.2"); const auto e3 = inet_address("192.168.0.3"); + const auto e1_id = gen_id(1); + const auto e2_id = gen_id(2); + const auto e3_id = gen_id(3); - auto token_metadata = create_token_metadata(e1); - token_metadata->update_topology(e1, get_dc_rack(e1)); - token_metadata->update_topology(e2, get_dc_rack(e2)); - token_metadata->update_topology(e3, get_dc_rack(e3)); - token_metadata->update_normal_tokens({t1, t1000}, e2).get(); - token_metadata->update_normal_tokens({t10}, e3).get(); - token_metadata->add_bootstrap_token(t100, e1); + auto token_metadata = create_token_metadata(e1_id); + token_metadata->update_host_id(e1_id, e1); + token_metadata->update_host_id(e2_id, e2); + token_metadata->update_host_id(e3_id, e3); + token_metadata->update_topology(e1_id, get_dc_rack(e1_id)); + token_metadata->update_topology(e2_id, get_dc_rack(e2_id)); + token_metadata->update_topology(e3_id, get_dc_rack(e3_id)); + token_metadata->update_normal_tokens({t1, t1000}, e2_id).get(); + token_metadata->update_normal_tokens({t10}, e3_id).get(); + token_metadata->add_bootstrap_token(t100, e1_id); auto check_endpoints = [](mutable_vnode_erm_ptr erm, int64_t t, inet_address_vector_replica_set expected_replicas, @@ -246,14 +283,24 @@ SEASTAR_THREAD_TEST_CASE(test_endpoints_for_reading_when_bootstrap_with_replicas SEASTAR_THREAD_TEST_CASE(test_replace_node_with_same_endpoint) { const auto t1 = dht::token::from_int64(1); const auto e1 = inet_address("192.168.0.1"); + const auto e1_id1 = gen_id(1); + const auto e1_id2 = gen_id(2); - auto token_metadata = create_token_metadata(e1); - 
token_metadata->update_topology(e1, get_dc_rack(e1)); - token_metadata->update_normal_tokens({t1}, e1).get(); - token_metadata->add_replacing_endpoint(e1, e1); + auto token_metadata = create_token_metadata(e1_id2); + token_metadata->update_host_id(e1_id1, e1); + token_metadata->update_topology(e1_id1, get_dc_rack(e1_id1), node::state::being_replaced); + token_metadata->update_normal_tokens({t1}, e1_id1).get(); + + token_metadata->update_topology(e1_id2, get_dc_rack(e1_id2), node::state::replacing); + token_metadata->update_host_id(e1_id2, e1); + + token_metadata->add_replacing_endpoint(e1_id1, e1_id2); auto erm = create_erm(token_metadata, {{"replication_factor", "2"}}); + BOOST_REQUIRE_EQUAL(token_metadata->get_host_id(e1), e1_id1); BOOST_REQUIRE_EQUAL(erm->get_pending_endpoints(dht::token::from_int64(1)), inet_address_vector_topology_change{e1}); - BOOST_REQUIRE_EQUAL(token_metadata->get_endpoint(t1), e1); + BOOST_REQUIRE_EQUAL(erm->get_natural_endpoints_without_node_being_replaced(dht::token::from_int64(1)), + inet_address_vector_replica_set{}); + BOOST_REQUIRE_EQUAL(token_metadata->get_endpoint(t1), e1_id1); } diff --git a/test/lib/cql_test_env.cc b/test/lib/cql_test_env.cc index edd09e2478..8ead224c20 100644 --- a/test/lib/cql_test_env.cc +++ b/test/lib/cql_test_env.cc @@ -643,8 +643,8 @@ private: locator::shared_token_metadata::mutate_on_all_shards(_token_metadata, [hostid = host_id, &cfg_in] (locator::token_metadata& tm) { auto& topo = tm.get_topology(); topo.set_host_id_cfg(hostid); - topo.add_or_update_endpoint(cfg_in.broadcast_address, - hostid, + topo.add_or_update_endpoint(hostid, + cfg_in.broadcast_address, std::nullopt, locator::node::state::normal, smp::count);