/*
 * Copyright (C) 2015 ScyllaDB
 */

/*
 * This file is part of Scylla.
 *
 * Scylla is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Scylla is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Scylla. If not, see <http://www.gnu.org/licenses/>.
 */

#include "utils/UUID.hh"
#include "token_metadata.hh"
// NOTE(review): the targets of the angle-bracket includes were garbled in
// this copy; reconstructed from usage below — confirm against the build.
#include <experimental/optional>
#include "locator/snitch_base.hh"
#include "locator/abstract_replication_strategy.hh"
#include "log.hh"
#include "stdx.hh"
#include "partition_range_compat.hh"
#include <unordered_map>
#include <algorithm>
#include <boost/icl/interval.hpp>
#include <boost/icl/interval_map.hpp>

namespace locator {

static logging::logger logger("token_metadata");

// Erase every entry of a map-like container whose mapped value equals
// `value`. std::remove_if cannot be used on associative containers, hence
// the manual erase loop.
template <typename C, typename V>
static void remove_by_value(C& container, V value) {
    for (auto it = container.begin(); it != container.end();) {
        if (it->second == value) {
            it = container.erase(it);
        } else {
            it++;
        }
    }
}

token_metadata::token_metadata(std::map<token, inet_address> token_to_endpoint_map,
        std::unordered_map<inet_address, utils::UUID> endpoints_map,
        topology topology)
        // Move the by-value parameters into the members instead of copying
        // them a second time.
        : _token_to_endpoint_map(std::move(token_to_endpoint_map))
        , _endpoint_to_host_id_map(std::move(endpoints_map))
        , _topology(std::move(topology)) {
    _sorted_tokens = sort_tokens();
}

// Collect all ring tokens. _token_to_endpoint_map is a std::map, so
// iteration order is already sorted by token.
std::vector<token> token_metadata::sort_tokens() {
    std::vector<token> sorted;
    sorted.reserve(_token_to_endpoint_map.size());
    for (auto&& i : _token_to_endpoint_map) {
        sorted.push_back(i.first);
    }
    return sorted;
}

const std::vector<token>& token_metadata::sorted_tokens() const {
    return _sorted_tokens;
}

// Return all tokens currently owned (normal state) by `addr`.
std::vector<token> token_metadata::get_tokens(const inet_address& addr) const {
    std::vector<token> res;
    for (auto&& i : _token_to_endpoint_map) {
        if (i.second == addr) {
            res.push_back(i.first);
        }
    }
    return res;
}

/**
 * Update token map with a single token/endpoint pair in normal state.
 */
void token_metadata::update_normal_token(token t, inet_address endpoint) {
    update_normal_tokens(std::unordered_set<token>({t}), endpoint);
}

void token_metadata::update_normal_tokens(std::unordered_set<token> tokens, inet_address endpoint) {
    if (tokens.empty()) {
        return;
    }
    std::unordered_map<inet_address, std::unordered_set<token>> endpoint_tokens({{endpoint, tokens}});
    update_normal_tokens(endpoint_tokens);
}

/**
 * Update token map with a set of token/endpoint pairs in normal state.
 *
 * Prefer this whenever there are multiple pairs to update, as each update (whether a single or multiple)
 * is expensive (CASSANDRA-3831).
 *
 * @param endpointTokens
 */
void token_metadata::update_normal_tokens(std::unordered_map<inet_address, std::unordered_set<token>>& endpoint_tokens) {
    if (endpoint_tokens.empty()) {
        return;
    }

    bool should_sort_tokens = false;
    for (auto&& i : endpoint_tokens) {
        inet_address endpoint = i.first;
        std::unordered_set<token>& tokens = i.second;

        assert(!tokens.empty());

        // The new token set replaces the endpoint's old one wholesale:
        // first drop every token it previously owned.
        for (auto it = _token_to_endpoint_map.begin(), ite = _token_to_endpoint_map.end(); it != ite;) {
            if (it->second == endpoint) {
                it = _token_to_endpoint_map.erase(it);
            } else {
                ++it;
            }
        }

        _topology.add_endpoint(endpoint);
        remove_by_value(_bootstrap_tokens, endpoint);
        _leaving_endpoints.erase(endpoint);
        remove_from_moving(endpoint); // also removing this endpoint from moving

        for (const token& t : tokens) {
            auto prev = _token_to_endpoint_map.insert(std::pair<token, inet_address>(t, endpoint));
            should_sort_tokens |= prev.second; // new token inserted -> sort
            if (prev.first->second != endpoint) {
                logger.warn("Token {} changing ownership from {} to {}", t, prev.first->second, endpoint);
                prev.first->second = endpoint;
            }
        }
    }

    if (should_sort_tokens) {
        _sorted_tokens = sort_tokens();
    }
}

// Index in _sorted_tokens of the first token >= start, wrapping to index 0
// past the last token — the ring is circular.
size_t token_metadata::first_token_index(const token& start) const {
    assert(_sorted_tokens.size() > 0);
    auto it = std::lower_bound(_sorted_tokens.begin(), _sorted_tokens.end(), start);
    if (it == _sorted_tokens.end()) {
        return 0;
    } else {
        return std::distance(_sorted_tokens.begin(), it);
    }
}

const token&
token_metadata::first_token(const token& start) const { return _sorted_tokens[first_token_index(start)]; } std::experimental::optional token_metadata::get_endpoint(const token& token) const { auto it = _token_to_endpoint_map.find(token); if (it == _token_to_endpoint_map.end()) { return std::experimental::nullopt; } else { return it->second; } } void token_metadata::debug_show() { auto reporter = std::make_shared>(); reporter->set_callback ([reporter, this] { print("Endpoint -> Token\n"); for (auto x : _token_to_endpoint_map) { print("inet_address=%s, token=%s\n", x.second, x.first); } print("Endpoint -> UUID\n"); for (auto x : _endpoint_to_host_id_map) { print("inet_address=%s, uuid=%s\n", x.first, x.second); } print("Sorted Token\n"); for (auto x : _sorted_tokens) { print("token=%s\n", x); } }); reporter->arm_periodic(std::chrono::seconds(1)); } void token_metadata::update_host_id(const UUID& host_id, inet_address endpoint) { #if 0 assert host_id != null; assert endpoint != null; InetAddress storedEp = _endpoint_to_host_id_map.inverse().get(host_id); if (storedEp != null) { if (!storedEp.equals(endpoint) && (FailureDetector.instance.isAlive(storedEp))) { throw new RuntimeException(String.format("Host ID collision between active endpoint %s and %s (id=%s)", storedEp, endpoint, host_id)); } } UUID storedId = _endpoint_to_host_id_map.get(endpoint); // if ((storedId != null) && (!storedId.equals(host_id))) logger.warn("Changing {}'s host ID from {} to {}", endpoint, storedId, host_id); #endif _endpoint_to_host_id_map[endpoint] = host_id; } utils::UUID token_metadata::get_host_id(inet_address endpoint) { if (!_endpoint_to_host_id_map.count(endpoint)) { std::runtime_error(sprint("host_id for endpoint %s is not found", endpoint)); } return _endpoint_to_host_id_map.at(endpoint); } std::experimental::optional token_metadata::get_endpoint_for_host_id(UUID host_id) { auto beg = _endpoint_to_host_id_map.cbegin(); auto end = _endpoint_to_host_id_map.cend(); auto it = 
std::find_if(beg, end, [host_id] (auto x) { return x.second == host_id; }); if (it == end) { return {}; } else { return (*it).first; } } const std::unordered_map& token_metadata::get_endpoint_to_host_id_map_for_reading() const{ return _endpoint_to_host_id_map; } bool token_metadata::is_member(inet_address endpoint) { auto beg = _token_to_endpoint_map.cbegin(); auto end = _token_to_endpoint_map.cend(); return end != std::find_if(beg, end, [endpoint] (const auto& x) { return x.second == endpoint; }); } void token_metadata::add_bootstrap_token(token t, inet_address endpoint) { std::unordered_set tokens{t}; add_bootstrap_tokens(tokens, endpoint); } boost::iterator_range token_metadata::ring_range( const std::experimental::optional& start, bool include_min) const { auto r = ring_range(start ? start->value().token() : dht::minimum_token(), include_min); if (!r.empty()) { // We should skip the first token if it's excluded by the range. if (start && !start->is_inclusive() && !start->value().has_key() && start->value().token() == *r.begin()) { r.pop_front(); } } return r; } void token_metadata::add_bootstrap_tokens(std::unordered_set tokens, inet_address endpoint) { for (auto t : tokens) { auto old_endpoint = _bootstrap_tokens.find(t); if (old_endpoint != _bootstrap_tokens.end() && (*old_endpoint).second != endpoint) { auto msg = sprint("Bootstrap Token collision between %s and %s (token %s", (*old_endpoint).second, endpoint, t); throw std::runtime_error(msg); } auto old_endpoint2 = _token_to_endpoint_map.find(t); if (old_endpoint2 != _token_to_endpoint_map.end() && (*old_endpoint2).second != endpoint) { auto msg = sprint("Bootstrap Token collision between %s and %s (token %s", (*old_endpoint2).second, endpoint, t); throw std::runtime_error(msg); } } // Unfortunately, std::remove_if does not work with std::map for (auto it = _bootstrap_tokens.begin(); it != _bootstrap_tokens.end();) { if ((*it).second == endpoint) { it = _bootstrap_tokens.erase(it); } else { it++; } } for 
(auto t : tokens) { _bootstrap_tokens[t] = endpoint; } } void token_metadata::remove_bootstrap_tokens(std::unordered_set tokens) { assert(!tokens.empty()); for (auto t : tokens) { _bootstrap_tokens.erase(t); } } bool token_metadata::is_leaving(inet_address endpoint) { return _leaving_endpoints.count(endpoint); } void token_metadata::remove_endpoint(inet_address endpoint) { remove_by_value(_bootstrap_tokens, endpoint); remove_by_value(_token_to_endpoint_map, endpoint); _topology.remove_endpoint(endpoint); _leaving_endpoints.erase(endpoint); _endpoint_to_host_id_map.erase(endpoint); _sorted_tokens = sort_tokens(); invalidate_cached_rings(); } void token_metadata::remove_from_moving(inet_address endpoint) { remove_by_value(_moving_endpoints, endpoint); invalidate_cached_rings(); } token token_metadata::get_predecessor(token t) { auto& tokens = sorted_tokens(); auto it = std::lower_bound(tokens.begin(), tokens.end(), t); assert(it != tokens.end() && *it == t); if (it == tokens.begin()) { // If the token is the first element, its preprocessor is the last element return tokens.back(); } else { return *(--it); } } dht::token_range_vector token_metadata::get_primary_ranges_for(std::unordered_set tokens) { dht::token_range_vector ranges; ranges.reserve(tokens.size() + 1); // one of the ranges will wrap for (auto right : tokens) { auto left = get_predecessor(right); compat::unwrap_into( wrapping_range(range_bound(left, false), range_bound(right)), dht::token_comparator(), [&] (auto&& rng) { ranges.push_back(std::move(rng)); }); } return ranges; } dht::token_range_vector token_metadata::get_primary_ranges_for(token right) { return get_primary_ranges_for(std::unordered_set{right}); } boost::icl::interval::interval_type token_metadata::range_to_interval(range r) { bool start_inclusive = false; bool end_inclusive = false; token start = dht::minimum_token(); token end = dht::maximum_token(); if (r.start()) { start = r.start()->value(); start_inclusive = r.start()->is_inclusive(); 
} if (r.end()) { end = r.end()->value(); end_inclusive = r.end()->is_inclusive(); } if (start_inclusive == false && end_inclusive == false) { return boost::icl::interval::open(std::move(start), std::move(end)); } else if (start_inclusive == false && end_inclusive == true) { return boost::icl::interval::left_open(std::move(start), std::move(end)); } else if (start_inclusive == true && end_inclusive == false) { return boost::icl::interval::right_open(std::move(start), std::move(end)); } else { return boost::icl::interval::closed(std::move(start), std::move(end)); } } range token_metadata::interval_to_range(boost::icl::interval::interval_type i) { bool start_inclusive; bool end_inclusive; auto bounds = i.bounds().bits(); if (bounds == boost::icl::interval_bounds::static_open) { start_inclusive = false; end_inclusive = false; } else if (bounds == boost::icl::interval_bounds::static_left_open) { start_inclusive = false; end_inclusive = true; } else if (bounds == boost::icl::interval_bounds::static_right_open) { start_inclusive = true; end_inclusive = false; } else if (bounds == boost::icl::interval_bounds::static_closed) { start_inclusive = true; end_inclusive = true; } else { throw std::runtime_error("Invalid boost::icl::interval bounds"); } return range({{i.lower(), start_inclusive}}, {{i.upper(), end_inclusive}}); } void token_metadata::set_pending_ranges(const sstring& keyspace_name, std::unordered_multimap, inet_address> new_pending_ranges) { if (new_pending_ranges.empty()) { _pending_ranges.erase(keyspace_name); _pending_ranges_map.erase(keyspace_name); _pending_ranges_interval_map.erase(keyspace_name); return; } std::unordered_map, std::unordered_set> map; for (const auto& x : new_pending_ranges) { map[x.first].emplace(x.second); } // construct a interval map to speed up the search _pending_ranges_interval_map[keyspace_name] = {}; for (const auto& m : map) { _pending_ranges_interval_map[keyspace_name] += std::make_pair(range_to_interval(m.first), m.second); } 
_pending_ranges[keyspace_name] = std::move(new_pending_ranges); _pending_ranges_map[keyspace_name] = std::move(map); } std::unordered_multimap, inet_address>& token_metadata::get_pending_ranges_mm(sstring keyspace_name) { return _pending_ranges[keyspace_name]; } const std::unordered_map, std::unordered_set>& token_metadata::get_pending_ranges(sstring keyspace_name) { return _pending_ranges_map[keyspace_name]; } std::vector> token_metadata::get_pending_ranges(sstring keyspace_name, inet_address endpoint) { std::vector> ret; for (auto x : get_pending_ranges_mm(keyspace_name)) { auto& range_token = x.first; auto& ep = x.second; if (ep == endpoint) { ret.push_back(range_token); } } return ret; } void token_metadata::calculate_pending_ranges(abstract_replication_strategy& strategy, const sstring& keyspace_name) { std::unordered_multimap, inet_address> new_pending_ranges; if (_bootstrap_tokens.empty() && _leaving_endpoints.empty() && _moving_endpoints.empty()) { logger.debug("No bootstrapping, leaving or moving nodes -> empty pending ranges for {}", keyspace_name); set_pending_ranges(keyspace_name, std::move(new_pending_ranges)); return; } std::unordered_multimap address_ranges = strategy.get_address_ranges(*this); // FIMXE // Copy of metadata reflecting the situation after all leave operations are finished. auto all_left_metadata = clone_after_all_left(); // get all ranges that will be affected by leaving nodes std::unordered_set> affected_ranges; for (auto endpoint : _leaving_endpoints) { auto r = address_ranges.equal_range(endpoint); for (auto x = r.first; x != r.second; x++) { affected_ranges.emplace(x->second); } } // for each of those ranges, find what new nodes will be responsible for the range when // all leaving nodes are gone. auto metadata = clone_only_token_map(); // don't do this in the loop! #7758 for (const auto& r : affected_ranges) { auto t = r.end() ? 
r.end()->value() : dht::maximum_token(); auto current_endpoints = strategy.calculate_natural_endpoints(t, metadata); auto new_endpoints = strategy.calculate_natural_endpoints(t, all_left_metadata); std::vector diff; std::sort(current_endpoints.begin(), current_endpoints.end()); std::sort(new_endpoints.begin(), new_endpoints.end()); std::set_difference(new_endpoints.begin(), new_endpoints.end(), current_endpoints.begin(), current_endpoints.end(), std::back_inserter(diff)); for (auto& ep : diff) { new_pending_ranges.emplace(r, ep); } } // At this stage newPendingRanges has been updated according to leave operations. We can // now continue the calculation by checking bootstrapping nodes. // For each of the bootstrapping nodes, simply add and remove them one by one to // allLeftMetadata and check in between what their ranges would be. std::unordered_multimap bootstrap_addresses; for (auto& x : _bootstrap_tokens) { bootstrap_addresses.emplace(x.second, x.first); } // TODO: share code with unordered_multimap_to_unordered_map std::unordered_map> tmp; for (auto& x : bootstrap_addresses) { auto& addr = x.first; auto& t = x.second; tmp[addr].insert(t); } for (auto& x : tmp) { auto& endpoint = x.first; auto& tokens = x.second; all_left_metadata.update_normal_tokens(tokens, endpoint); for (auto& x : strategy.get_address_ranges(all_left_metadata)) { if (x.first == endpoint) { new_pending_ranges.emplace(x.second, endpoint); } } all_left_metadata.remove_endpoint(endpoint); } // At this stage newPendingRanges has been updated according to leaving and bootstrapping nodes. // We can now finish the calculation by checking moving nodes. // For each of the moving nodes, we do the same thing we did for bootstrapping: // simply add and remove them one by one to allLeftMetadata and check in between what their ranges would be. 
for (auto& moving : _moving_endpoints) { auto& t = moving.first; auto& endpoint = moving.second; // address of the moving node // moving.left is a new token of the endpoint all_left_metadata.update_normal_token(t, endpoint); for (auto& x : strategy.get_address_ranges(all_left_metadata)) { if (x.first == endpoint) { new_pending_ranges.emplace(x.second, endpoint); } } all_left_metadata.remove_endpoint(endpoint); } set_pending_ranges(keyspace_name, std::move(new_pending_ranges)); if (logger.is_enabled(logging::log_level::debug)) { logger.debug("Pending ranges: {}", (_pending_ranges.empty() ? "" : print_pending_ranges())); } } sstring token_metadata::print_pending_ranges() { std::stringstream ss; for (auto& x : _pending_ranges) { auto& keyspace_name = x.first; ss << "\nkeyspace_name = " << keyspace_name << " {\n"; for (auto& m : x.second) { ss << m.second << " : " << m.first << "\n"; } ss << "}\n"; } return sstring(ss.str()); } void token_metadata::add_leaving_endpoint(inet_address endpoint) { _leaving_endpoints.emplace(endpoint); } token_metadata token_metadata::clone_after_all_settled() { token_metadata metadata = clone_only_token_map(); for (auto endpoint : _leaving_endpoints) { metadata.remove_endpoint(endpoint); } for (auto x : _moving_endpoints) { metadata.update_normal_token(x.first, x.second); } return metadata; } void token_metadata::add_moving_endpoint(token t, inet_address endpoint) { _moving_endpoints[t] = endpoint; } std::vector token_metadata::pending_endpoints_for(const token& token, const sstring& keyspace_name) { // Fast path 0: no pending ranges at all if (_pending_ranges_interval_map.empty()) { return {}; } // Fast path 1: no pending ranges for this keyspace_name if (_pending_ranges_interval_map[keyspace_name].empty()) { return {}; } // Slow path: lookup pending ranges std::vector endpoints; auto interval = range_to_interval(range(token)); auto it = _pending_ranges_interval_map[keyspace_name].find(interval); if (it != 
_pending_ranges_interval_map[keyspace_name].end()) { // interval_map does not work with std::vector, convert to std::vector of ips endpoints = std::vector(it->second.begin(), it->second.end()); } return endpoints; } std::map token_metadata::get_normal_and_bootstrapping_token_to_endpoint_map() { std::map ret(_token_to_endpoint_map.begin(), _token_to_endpoint_map.end()); ret.insert(_bootstrap_tokens.begin(), _bootstrap_tokens.end()); return ret; } std::multimap token_metadata::get_endpoint_to_token_map_for_reading() { std::multimap cloned; for (const auto& x : _token_to_endpoint_map) { cloned.emplace(x.second, x.first); } return cloned; } /////////////////// class topology ///////////////////////////////////////////// inline void topology::clear() { _dc_endpoints.clear(); _dc_racks.clear(); _current_locations.clear(); } topology::topology(const topology& other) { _dc_endpoints = other._dc_endpoints; _dc_racks = other._dc_racks; _current_locations = other._current_locations; } void topology::add_endpoint(const inet_address& ep) { auto& snitch = i_endpoint_snitch::get_local_snitch_ptr(); sstring dc = snitch->get_datacenter(ep); sstring rack = snitch->get_rack(ep); auto current = _current_locations.find(ep); if (current != _current_locations.end()) { if (current->second.dc == dc && current->second.rack == rack) { return; } _dc_racks[current->second.dc][current->second.rack].erase(ep); _dc_endpoints[current->second.dc].erase(ep); } _dc_endpoints[dc].insert(ep); _dc_racks[dc][rack].insert(ep); _current_locations[ep] = {dc, rack}; } void topology::update_endpoint(inet_address ep) { if (!_current_locations.count(ep) || !locator::i_endpoint_snitch::snitch_instance().local_is_initialized()) { return; } add_endpoint(ep); } void topology::remove_endpoint(inet_address ep) { auto cur_dc_rack = _current_locations.find(ep); if (cur_dc_rack == _current_locations.end()) { return; } _dc_endpoints[cur_dc_rack->second.dc].erase(ep); 
_dc_racks[cur_dc_rack->second.dc][cur_dc_rack->second.rack].erase(ep); _current_locations.erase(cur_dc_rack); } /////////////////// class topology end ///////////////////////////////////////// } // namespace locator