Files
scylladb/locator/token_metadata.hh
Tomasz Grabiec ef9e5e64a3 locator: token_metadata: Introduce topology barrier stall detector
When topology barrier is blocked for longer than configured threshold
(2s), stale versions are marked as stalled and when they get released
they report backtrace to the logs. This should help to identify what
was holding for token metadata pointer for too long.

Example log:

  token_metadata - topology version 30 held for 299.159 [s] past expiry, released at:  0x2397ae1 0x23a36b6 ...

Closes scylladb/scylladb#17427
2024-02-21 15:05:34 +02:00

448 lines
17 KiB
C++

/*
*
* Modified by ScyllaDB
* Copyright (C) 2015-present ScyllaDB
*/
/*
* SPDX-License-Identifier: (AGPL-3.0-or-later and Apache-2.0)
*/
#pragma once
#include <unordered_set>
#include <unordered_map>
#include "gms/inet_address.hh"
#include "dht/ring_position.hh"
#include <optional>
#include <memory>
#include <boost/range/iterator_range.hpp>
#include <boost/icl/interval.hpp>
#include "interval.hh"
#include <seastar/core/shared_future.hh>
#include <seastar/core/shared_ptr.hh>
#include <seastar/core/semaphore.hh>
#include <seastar/core/sharded.hh>
#include "utils/phased_barrier.hh"
#include "service/topology_state_machine.hh"
#include "locator/types.hh"
#include "locator/topology.hh"
#include "locator/token_metadata_fwd.hh"
// forward declaration since replica/database.hh includes this file
namespace replica {
class keyspace;
}
namespace locator {
class abstract_replication_strategy;
using token = dht::token;
class token_metadata;
class tablet_metadata;
struct host_id_or_endpoint {
host_id id;
gms::inet_address endpoint;
enum class param_type {
host_id,
endpoint,
auto_detect
};
host_id_or_endpoint(const sstring& s, param_type restrict = param_type::auto_detect);
bool has_host_id() const noexcept {
return bool(id);
}
bool has_endpoint() const noexcept {
return endpoint != gms::inet_address();
}
// Map the host_id to endpoint based on whichever of them is set,
// using the token_metadata
void resolve(const token_metadata& tm);
};
class token_metadata_impl;
struct topology_change_info;
struct version_tracker {
public:
friend class shared_token_metadata;
using link_base = boost::intrusive::list_member_hook<boost::intrusive::link_mode<boost::intrusive::auto_unlink>>;
struct link_type : link_base {
link_type() noexcept = default;
link_type(link_type&& o) noexcept { swap_nodes(o); }
link_type& operator=(link_type&& o) noexcept {
if (this != &o) {
unlink();
swap_nodes(o);
}
return *this;
}
};
private:
utils::phased_barrier::operation _op;
service::topology::version_t _version;
link_type _link;
// When engaged it means the version is no longer latest and should be released soon as to
// not block barriers. If this version dies past _log_threshold, it should self-report.
std::optional<std::chrono::steady_clock::time_point> _expired_at;
std::chrono::steady_clock::duration _log_threshold;
public:
version_tracker() = default;
version_tracker(utils::phased_barrier::operation op, service::topology::version_t version)
: _op(std::move(op)), _version(version) {}
version_tracker(version_tracker&&) noexcept = default;
version_tracker& operator=(version_tracker&&) noexcept = default;
version_tracker(const version_tracker&) = delete;
~version_tracker();
service::topology::version_t version() const {
return _version;
}
void mark_expired(std::chrono::steady_clock::duration log_threshold) {
if (!_expired_at) {
_expired_at = std::chrono::steady_clock::now();
_log_threshold = log_threshold;
}
}
};
class token_metadata final {
std::unique_ptr<token_metadata_impl> _impl;
private:
friend class token_metadata_ring_splitter;
class tokens_iterator {
public:
using iterator_category = std::input_iterator_tag;
using value_type = token;
using difference_type = std::ptrdiff_t;
using pointer = token*;
using reference = token&;
public:
tokens_iterator() = default;
tokens_iterator(const token& start, const token_metadata_impl* token_metadata);
bool operator==(const tokens_iterator& it) const;
const token& operator*() const;
tokens_iterator& operator++();
private:
std::vector<token>::const_iterator _cur_it;
size_t _remaining = 0;
const token_metadata_impl* _token_metadata = nullptr;
friend class token_metadata_impl;
};
public:
struct config {
topology::config topo_cfg;
};
using inet_address = gms::inet_address;
using version_t = service::topology::version_t;
using version_tracker_t = version_tracker;
token_metadata(config cfg);
explicit token_metadata(std::unique_ptr<token_metadata_impl> impl);
token_metadata(token_metadata&&) noexcept; // Can't use "= default;" - hits some static_assert in unique_ptr
token_metadata& operator=(token_metadata&&) noexcept;
~token_metadata();
const std::vector<token>& sorted_tokens() const;
const tablet_metadata& tablets() const;
tablet_metadata& tablets();
void set_tablets(tablet_metadata);
// Update token->endpoint mappings for a given \c endpoint.
// \c tokens are all the tokens that are now owned by \c endpoint.
//
// Note: the function is not exception safe!
// It must be called only on a temporary copy of the token_metadata
future<> update_normal_tokens(std::unordered_set<token> tokens, host_id endpoint);
const token& first_token(const token& start) const;
size_t first_token_index(const token& start) const;
std::optional<host_id> get_endpoint(const token& token) const;
std::vector<token> get_tokens(const host_id& addr) const;
const std::unordered_map<token, host_id>& get_token_to_endpoint() const;
const std::unordered_set<host_id>& get_leaving_endpoints() const;
const std::unordered_map<token, host_id>& get_bootstrap_tokens() const;
/**
* Update or add a node for a given host_id.
* The other arguments (dc, state, shard_count) are optional, i.e. the corresponding node
* fields won't be updated if std::nullopt is passed.
*/
void update_topology(host_id ep, std::optional<endpoint_dc_rack> opt_dr, std::optional<node::state> opt_st = std::nullopt,
std::optional<shard_id> shard_count = std::nullopt);
/**
* Creates an iterable range of the sorted tokens starting at the token t
* such that t >= start.
*
* @param start A token that will define the beginning of the range
*
* @return The requested range (see the description above)
*/
boost::iterator_range<tokens_iterator> ring_range(const token& start) const;
/**
* Returns a range of tokens such that the first token t satisfies dht::ring_position_view::ending_at(t) >= start.
*/
boost::iterator_range<tokens_iterator> ring_range(dht::ring_position_view start) const;
topology& get_topology();
const topology& get_topology() const;
void debug_show() const;
/**
* Store an end-point to host ID mapping. Each ID must be unique, and
* cannot be changed after the fact.
*
* @param hostId
* @param endpoint
*/
void update_host_id(const locator::host_id& host_id, inet_address endpoint);
/** Return the unique host ID for an end-point. */
host_id get_host_id(inet_address endpoint) const;
/// Return the unique host ID for an end-point or nullopt if not found.
std::optional<host_id> get_host_id_if_known(inet_address endpoint) const;
/** Return the end-point for a unique host ID or nullopt if not found. */
std::optional<inet_address> get_endpoint_for_host_id_if_known(locator::host_id host_id) const;
/** Return the end-point for a unique host ID */
inet_address get_endpoint_for_host_id(locator::host_id host_id) const;
/// Parses the \c host_id_string either as a host uuid or as an ip address and returns the mapping.
/// Throws std::invalid_argument on parse error or std::runtime_error if the host_id wasn't found.
host_id_or_endpoint parse_host_id_and_endpoint(const sstring& host_id_string) const;
/** @return a copy of the endpoint-to-id map for read-only operations */
std::unordered_map<inet_address, host_id> get_endpoint_to_host_id_map_for_reading() const;
/// Returns host_id of the local node.
host_id get_my_id() const;
void add_bootstrap_token(token t, host_id endpoint);
void add_bootstrap_tokens(std::unordered_set<token> tokens, host_id endpoint);
void remove_bootstrap_tokens(std::unordered_set<token> tokens);
void add_leaving_endpoint(host_id endpoint);
void del_leaving_endpoint(host_id endpoint);
void remove_endpoint(host_id endpoint);
// Checks if the node is part of the token ring. If yes, the node is one of
// the nodes that owns the tokens and inside the set _normal_token_owners.
bool is_normal_token_owner(host_id endpoint) const;
bool is_leaving(host_id endpoint) const;
// Is this node being replaced by another node
bool is_being_replaced(host_id endpoint) const;
// Is any node being replaced by another node
bool is_any_node_being_replaced() const;
void add_replacing_endpoint(host_id existing_node, host_id replacing_node);
void del_replacing_endpoint(host_id existing_node);
/**
* Create a full copy of token_metadata using asynchronous continuations.
* The caller must ensure that the cloned object will not change if
* the function yields.
*/
future<token_metadata> clone_async() const noexcept;
/**
* Create a copy of TokenMetadata with only tokenToEndpointMap. That is, pending ranges,
* bootstrap tokens and leaving endpoints are not included in the copy.
* The caller must ensure that the cloned object will not change if
* the function yields.
*/
future<token_metadata> clone_only_token_map() const noexcept;
/**
* Create a copy of TokenMetadata with tokenToEndpointMap reflecting situation after all
* current leave operations have finished.
* The caller must ensure that the cloned object will not change if
* the function yields.
*
* @return a future holding a new token metadata
*/
future<token_metadata> clone_after_all_left() const noexcept;
/**
* Gently clear the token_metadata members.
* Yield if needed to prevent reactor stalls.
*/
future<> clear_gently() noexcept;
/*
* Number of returned ranges = O(tokens.size())
*/
dht::token_range_vector get_primary_ranges_for(std::unordered_set<token> tokens) const;
/*
* Number of returned ranges = O(1)
*/
dht::token_range_vector get_primary_ranges_for(token right) const;
static boost::icl::interval<token>::interval_type range_to_interval(wrapping_interval<dht::token> r);
static wrapping_interval<dht::token> interval_to_range(boost::icl::interval<token>::interval_type i);
future<> update_topology_change_info(dc_rack_fn& get_dc_rack);
const std::optional<topology_change_info>& get_topology_change_info() const;
token get_predecessor(token t) const;
const std::unordered_set<host_id>& get_all_endpoints() const;
std::unordered_set<gms::inet_address> get_all_ips() const;
/* Returns the number of different endpoints that own tokens in the ring.
* Bootstrapping tokens are not taken into account. */
size_t count_normal_token_owners() const;
// Updates the read_new flag, switching read requests from
// the old endpoints to the new ones during topology changes:
// read_new_t::no - no read_endpoints will be stored on update_pending_ranges, all reads goes to normal endpoints;
// read_new_t::yes - triggers update_pending_ranges to compute and store new ranges for read requests.
// The value is preserved in all clone functions, the default is read_new_t::no.
using read_new_t = bool_class<class read_new_tag>;
void set_read_new(read_new_t value);
long get_ring_version() const;
void invalidate_cached_rings();
version_t get_version() const;
void set_version(version_t version);
friend class token_metadata_impl;
friend class shared_token_metadata;
private:
void set_version_tracker(version_tracker_t tracker);
};
struct topology_change_info {
lw_shared_ptr<token_metadata> target_token_metadata;
std::vector<dht::token> all_tokens;
token_metadata::read_new_t read_new;
topology_change_info(lw_shared_ptr<token_metadata> target_token_metadata_,
std::vector<dht::token> all_tokens_,
token_metadata::read_new_t read_new_);
future<> clear_gently();
};
using token_metadata_lock = semaphore_units<>;
using token_metadata_lock_func = noncopyable_function<future<token_metadata_lock>() noexcept>;
template <typename... Args>
mutable_token_metadata_ptr make_token_metadata_ptr(Args... args) {
return make_lw_shared<token_metadata>(std::forward<Args>(args)...);
}
class shared_token_metadata {
mutable_token_metadata_ptr _shared;
token_metadata_lock_func _lock_func;
std::chrono::steady_clock::duration _stall_detector_threshold = std::chrono::seconds(2);
// We use this barrier during the transition to a new token_metadata version to ensure that the
// system stops using previous versions. Here are the key points:
// * A new phase begins when a mutable_token_metadata_ptr passed to shared_token_metadata::set has
// a higher version than the current one.
// * Each shared_token_metadata::set call initiates an operation on the barrier. If multiple calls
// have the same version, multiple operations may be initiated with the same phase.
// * The operation is stored within the new token_metadata instance (token_metadata::set_version_tracker),
// and it completes when the instance is destroyed.
// * The method shared_token_metadata::stale_versions_in_use can be used to wait for the phase
// transition to complete. Once this future resolves, there will be no token_metadata instances
// with versions lower than the current one.
// * Multiple new phases (version upgrades) can be started before accessing stale_versions_in_use.
// However, stale_versions_in_use waits for all previous phases to finish, as advance_and_await
// includes its own invocation as an operation in the new phase.
utils::phased_barrier _versions_barrier;
shared_future<> _stale_versions_in_use{make_ready_future<>()};
token_metadata::version_t _fence_version = 0;
using version_tracker_list_type = boost::intrusive::list<version_tracker,
boost::intrusive::member_hook<version_tracker, version_tracker::link_type, &version_tracker::_link>,
boost::intrusive::constant_time_size<false>>;
version_tracker_list_type _trackers;
private:
version_tracker new_tracker(token_metadata::version_t);
public:
// used to construct the shared object as a sharded<> instance
// lock_func returns semaphore_units<>
explicit shared_token_metadata(token_metadata_lock_func lock_func, token_metadata::config cfg)
: _shared(make_token_metadata_ptr(std::move(cfg)))
, _lock_func(std::move(lock_func))
{
_shared->set_version_tracker(new_tracker(_shared->get_version()));
}
shared_token_metadata(const shared_token_metadata& x) = delete;
shared_token_metadata(shared_token_metadata&& x) = default;
token_metadata_ptr get() const noexcept {
return _shared;
}
void set(mutable_token_metadata_ptr tmptr) noexcept;
void set_stall_detector_threshold(std::chrono::steady_clock::duration threshold) {
_stall_detector_threshold = threshold;
}
future<> stale_versions_in_use() const {
return _stale_versions_in_use.get_future();
}
void update_fence_version(token_metadata::version_t version);
token_metadata::version_t get_fence_version() const noexcept {
return _fence_version;
}
// Token metadata changes are serialized
// using the schema_tables merge_lock.
//
// Must be called on shard 0.
future<token_metadata_lock> get_lock() const noexcept {
return _lock_func();
}
// mutate_token_metadata_on_all_shards acquires the shared_token_metadata lock,
// clones the token_metadata (using clone_async)
// and calls an asynchronous functor on
// the cloned copy of the token_metadata to mutate it.
//
// If the functor is successful, the mutated clone
// is set back to to the shared_token_metadata,
// otherwise, the clone is destroyed.
future<> mutate_token_metadata(seastar::noncopyable_function<future<> (token_metadata&)> func);
// mutate_token_metadata_on_all_shards acquires the shared_token_metadata lock,
// clones the token_metadata (using clone_async)
// and calls an asynchronous functor on
// the cloned copy of the token_metadata to mutate it.
//
// If the functor is successful, the mutated clone
// is set back to to the shared_token_metadata on all shards,
// otherwise, the clone is destroyed.
//
// Must be called on shard 0.
static future<> mutate_on_all_shards(sharded<shared_token_metadata>& stm, seastar::noncopyable_function<future<> (token_metadata&)> func);
};
}