/* * * Modified by ScyllaDB * Copyright (C) 2015-present ScyllaDB */ /* * SPDX-License-Identifier: (AGPL-3.0-or-later and Apache-2.0) */ #pragma once #include "unimplemented.hh" #include #include #include #include #include #include "utils/atomic_vector.hh" #include "utils/UUID.hh" #include "utils/fb_utilities.hh" #include "gms/failure_detector.hh" #include "gms/versioned_value.hh" #include "gms/application_state.hh" #include "gms/endpoint_state.hh" #include "gms/feature.hh" #include "gms/gossip_digest_syn.hh" #include "gms/gossip_digest.hh" #include "utils/loading_shared_values.hh" #include "utils/updateable_value.hh" #include "utils/in.hh" #include "message/messaging_service_fwd.hh" #include #include #include #include #include #include #include #include #include "locator/token_metadata.hh" namespace db { class config; class system_keyspace; } namespace gms { class gossip_digest_syn; class gossip_digest_ack; class gossip_digest_ack2; class gossip_digest; class inet_address; class i_endpoint_state_change_subscriber; class gossip_get_endpoint_states_request; class gossip_get_endpoint_states_response; class feature_service; using advertise_myself = bool_class; struct syn_msg_pending { bool pending = false; std::optional syn_msg; }; struct ack_msg_pending { bool pending = false; std::optional> ack_msg_digest; }; /* Settings handed to the gossiper constructor through its gossip_config parameter. The fields suffixed with _ms are presumably durations in milliseconds (TODO confirm against their users). NOTE: skip_wait_for_gossip_to_settle is unsigned, so its -1 default is stored as the maximum uint32_t value. */ struct gossip_config { seastar::scheduling_group gossip_scheduling_group = seastar::scheduling_group(); sstring cluster_name; std::set seeds; sstring partitioner; uint32_t ring_delay_ms = 30 * 1000; uint32_t shadow_round_ms = 300 * 1000; uint32_t shutdown_announce_ms = 2 * 1000; uint32_t skip_wait_for_gossip_to_settle = -1; }; /** * This module is responsible for Gossiping information for the local endpoint. This abstraction * maintains the list of live and dead endpoints. Periodically i.e. every 1 second this module * chooses a random node and initiates a round of Gossip with it. A round of Gossip involves 3 * rounds of messaging. 
For instance if node A wants to initiate a round of Gossip with node B * it starts off by sending node B a GossipDigestSynMessage. Node B on receipt of this message * sends node A a GossipDigestAckMessage. On receipt of this message node A sends node B a * GossipDigestAck2Message which completes a round of Gossip. This module as and when it hears one * of the three above mentioned messages updates the Failure Detector with the liveness information. * Upon hearing a GossipShutdownMessage, this module will instantly mark the remote node as down in * the Failure Detector. */ class gossiper : public seastar::async_sharded_service, public seastar::peering_sharded_service { public: using clk = seastar::lowres_system_clock; using ignore_features_of_local_node = bool_class; private: using messaging_verb = netw::messaging_verb; using messaging_service = netw::messaging_service; using msg_addr = netw::msg_addr; void init_messaging_service_handler(); future<> uninit_messaging_service_handler(); future<> handle_syn_msg(msg_addr from, gossip_digest_syn syn_msg); future<> handle_ack_msg(msg_addr from, gossip_digest_ack ack_msg); future<> handle_ack2_msg(msg_addr from, gossip_digest_ack2 msg); future<> handle_echo_msg(inet_address from, std::optional generation_number_opt); future<> handle_shutdown_msg(inet_address from, std::optional generation_number_opt); future<> do_send_ack_msg(msg_addr from, gossip_digest_syn syn_msg); future<> do_send_ack2_msg(msg_addr from, utils::chunked_vector ack_msg_digest); future handle_get_endpoint_states_msg(gossip_get_endpoint_states_request request); static constexpr uint32_t _default_cpuid = 0; msg_addr get_msg_addr(inet_address to) const noexcept; void do_sort(utils::chunked_vector& g_digest_list); timer _scheduled_gossip_task; bool _enabled = false; semaphore _callback_running{1}; semaphore _apply_state_locally_semaphore{100}; seastar::gate _background_msg; std::unordered_map _syn_handlers; std::unordered_map _ack_handlers; bool 
_advertise_myself = true; // Map ip address and generation number std::unordered_map _advertise_to_nodes; future<> _failure_detector_loop_done{make_ready_future<>()} ; rpc::no_wait_type background_msg(sstring type, noncopyable_function(gossiper&)> fn); public: // Get current generation number for the given nodes future> get_generation_for_nodes(std::list nodes); // Only respond echo message listed in nodes with the generation number future<> advertise_to_nodes(std::unordered_map advertise_to_nodes = {}); const sstring& get_cluster_name() const noexcept; const sstring& get_partitioner_name() const noexcept { return _gcfg.partitioner; } inet_address get_broadcast_address() const noexcept { return utils::fb_utilities::get_broadcast_address(); } const std::set& get_seeds() const noexcept; netw::messaging_service& get_local_messaging() const noexcept { return _messaging; } sharded& get_system_keyspace() const noexcept { return _sys_ks; } public: static clk::time_point inline now() noexcept { return clk::now(); } public: using endpoint_locks_map = utils::loading_shared_values; struct endpoint_permit { endpoint_locks_map::entry_ptr _ptr; semaphore_units<> _units; }; future lock_endpoint(inet_address); public: /* map where key is the endpoint and value is the state associated with the endpoint */ std::unordered_map endpoint_state_map; // Used for serializing changes to endpoint_state_map and running of associated change listeners. 
endpoint_locks_map endpoint_locks; const std::vector DEAD_STATES = { versioned_value::REMOVING_TOKEN, versioned_value::REMOVED_TOKEN, versioned_value::STATUS_LEFT, }; const std::vector SILENT_SHUTDOWN_STATES = { versioned_value::REMOVING_TOKEN, versioned_value::REMOVED_TOKEN, versioned_value::STATUS_LEFT, versioned_value::HIBERNATE, versioned_value::STATUS_BOOTSTRAPPING, versioned_value::STATUS_UNKNOWN, }; static constexpr std::chrono::milliseconds INTERVAL{1000}; static constexpr std::chrono::hours A_VERY_LONG_TIME{24 * 3}; static constexpr std::chrono::milliseconds GOSSIP_SETTLE_MIN_WAIT_MS{5000}; // Maximum difference between remote generation value and generation // value this node would get if this node were restarted that we are // willing to accept about a peer. static constexpr int64_t MAX_GENERATION_DIFFERENCE = 86400 * 365; std::chrono::milliseconds fat_client_timeout; std::chrono::milliseconds quarantine_delay() const noexcept; private: std::default_random_engine _random_engine{std::random_device{}()}; /** * subscribers for interest in EndpointState change */ atomic_vector> _subscribers; std::list> _endpoints_to_talk_with; /* live member set */ utils::chunked_vector _live_endpoints; uint64_t _live_endpoints_version = 0; /* nodes are being marked as alive */ std::unordered_set _pending_mark_alive_endpoints; /* unreachable member set */ std::unordered_map _unreachable_endpoints; /* initial seeds for joining the cluster */ std::set _seeds; /* map where key is endpoint and value is timestamp when this endpoint was removed from * gossip. 
We will ignore any gossip regarding these endpoints for QUARANTINE_DELAY time * after removal to prevent nodes from falsely reincarnating during the time when removal * gossip gets propagated to all nodes */ std::map _just_removed_endpoints; std::map _expire_time_endpoint_map; bool _in_shadow_round = false; std::unordered_map _shadow_unreachable_endpoints; utils::chunked_vector _shadow_live_endpoints; std::default_random_engine _e1{std::random_device{}()}; void run(); // Replicates given endpoint_state to all other shards. // The state doesn't have to be kept alive until the returned future completes. future<> replicate(inet_address, const endpoint_state&); // Replicates "states" from "src" to all other shards. // "src" and "states" must be kept alive until the returned future completes and must not change. future<> replicate(inet_address, const std::map& src, const utils::chunked_vector& states); // Replicates given value to all other shards. // The value must be kept alive until the returned future completes and must not change. future<> replicate(inet_address, application_state key, const versioned_value& value); public: explicit gossiper(abort_source& as, feature_service& features, const locator::shared_token_metadata& stm, netw::messaging_service& ms, sharded& sys_ks, const db::config& cfg, gossip_config gcfg); void check_seen_seeds(); /** * Register for interesting state changes. * * @param subscriber module which implements the IEndpointStateChangeSubscriber */ void register_(shared_ptr subscriber); /** * Unregister interest for state changes. 
 * * @param subscriber module which implements the IEndpointStateChangeSubscriber */ future<> unregister_(shared_ptr subscriber); std::set get_live_members(); std::set get_live_token_owners(); /** * @return a list of unreachable gossip participants, including fat clients */ std::set get_unreachable_members(); /** * @return a list of unreachable token owners */ std::set get_unreachable_token_owners(); int64_t get_endpoint_downtime(inet_address ep) const noexcept; /** * @param endpoint end point that is convicted. */ future<> convict(inet_address endpoint); /** * Return either: the greatest heartbeat or application state version * * @param state the endpoint state to examine (taken by value, so the caller's object is copied) * @return the greatest version as described above */ int get_max_endpoint_state_version(endpoint_state state) const noexcept; private: /** * Removes the endpoint from gossip completely * * @param endpoint endpoint to be removed from the current membership. */ future<> evict_from_membership(inet_address endpoint); public: /** * Removes the endpoint from Gossip but retains endpoint state */ future<> remove_endpoint(inet_address endpoint); future<> force_remove_endpoint(inet_address endpoint); private: /** * Quarantines the endpoint for QUARANTINE_DELAY * * @param endpoint */ void quarantine_endpoint(inet_address endpoint); /** * Quarantines the endpoint until quarantine_start + QUARANTINE_DELAY * * @param endpoint * @param quarantine_start */ void quarantine_endpoint(inet_address endpoint, clk::time_point quarantine_start); private: /** * The gossip digest is built based on randomization * rather than just looping through the collection of live endpoints. * * @param g_digests list of Gossip Digests. 
 */ void make_random_gossip_digest(utils::chunked_vector& g_digests); public: /** * This method will begin removing an existing endpoint from the cluster by spoofing its state * This should never be called unless this coordinator has had 'removenode' invoked * * @param endpoint - the endpoint being removed * @param host_id - the ID of the host being removed * @param local_host_id - my own host ID for replication coordination */ future<> advertise_removing(inet_address endpoint, utils::UUID host_id, utils::UUID local_host_id); /** * Handles switching the endpoint's state from REMOVING_TOKEN to REMOVED_TOKEN * This should only be called after advertise_removing * * @param endpoint * @param host_id */ future<> advertise_token_removed(inet_address endpoint, utils::UUID host_id); future<> unsafe_assassinate_endpoint(sstring address); /** * Do not call this method unless you know what you are doing. * It will try extremely hard to obliterate any endpoint from the ring, * even if it does not know about it. * * @param address * @throws UnknownHostException */ future<> assassinate_endpoint(sstring address); public: bool is_known_endpoint(inet_address endpoint) const noexcept; future get_current_generation_number(inet_address endpoint); future get_current_heart_beat_version(inet_address endpoint); bool is_gossip_only_member(inet_address endpoint); bool is_safe_for_bootstrap(inet_address endpoint); private: /** * Sends the given gossip digest SYN message to an endpoint chosen at random from epset. * * NOTE(review): this comment previously stated that the function returns true when the * chosen target is also a seed, but the declaration returns future<> and carries no * boolean result. Confirm against the definition and keep the two in sync. * * @param message the SYN message to send * @param epset a set of endpoints from which a random endpoint is chosen. 
 */ future<> send_gossip(gossip_digest_syn message, std::set epset); /* Sends a Gossip message to a live member */ future<> do_gossip_to_live_member(gossip_digest_syn message, inet_address ep); /* Sends a Gossip message to an unreachable member */ future<> do_gossip_to_unreachable_member(gossip_digest_syn message); future<> do_status_check(); public: clk::time_point get_expire_time_for_endpoint(inet_address endpoint) const noexcept; const endpoint_state* get_endpoint_state_for_endpoint_ptr(inet_address ep) const noexcept; endpoint_state& get_endpoint_state(inet_address ep); endpoint_state* get_endpoint_state_for_endpoint_ptr(inet_address ep) noexcept; const versioned_value* get_application_state_ptr(inet_address endpoint, application_state appstate) const noexcept; sstring get_application_state_value(inet_address endpoint, application_state appstate) const; // Use with caution, copies might be expensive (see #764) std::optional get_endpoint_state_for_endpoint(inet_address ep) const noexcept; // removes ALL endpoint states; should only be called after shadow gossip future<> reset_endpoint_state_map(); const std::unordered_map& get_endpoint_states() const noexcept; bool uses_host_id(inet_address endpoint) const; utils::UUID get_host_id(inet_address endpoint) const; std::optional get_state_for_version_bigger_than(inet_address for_endpoint, int version); /** * determine which endpoint started up earlier */ int compare_endpoint_startup(inet_address addr1, inet_address addr2); private: void update_timestamp_for_nodes(const std::map& map); void mark_alive(inet_address addr, endpoint_state& local_state); future<> real_mark_alive(inet_address addr, endpoint_state& local_state); future<> mark_dead(inet_address addr, endpoint_state& local_state); /** * This method is called whenever there is a "big" change in ep state (a generation change for a known node). 
 * * @param ep endpoint * @param ep_state EndpointState for the endpoint */ future<> handle_major_state_change(inet_address ep, const endpoint_state& eps); public: bool is_alive(inet_address ep) const; bool is_dead_state(const endpoint_state& eps) const; // Wait for nodes to be alive on all shards future<> wait_alive(std::vector nodes, std::chrono::milliseconds timeout); future<> apply_state_locally(std::map map); // filter `endpoints` by `local_rack` inet_address_vector_replica_set endpoint_filter(const sstring& local_rack, const std::unordered_map>& endpoints); private: future<> do_apply_state_locally(gms::inet_address node, const endpoint_state& remote_state, bool listener_notification); future<> apply_state_locally_without_listener_notification(std::unordered_map map); future<> apply_new_states(inet_address addr, endpoint_state& local_state, const endpoint_state& remote_state); // notify that a local application state is going to change (doesn't get triggered for remote changes) future<> do_before_change_notifications(inet_address addr, const endpoint_state& ep_state, const application_state& ap_state, const versioned_value& new_value); // notify that an application state has changed future<> do_on_change_notifications(inet_address addr, const application_state& state, const versioned_value& value); /* Request all the state for the endpoint in the g_digest */ void request_all(gossip_digest& g_digest, utils::chunked_vector& delta_gossip_digest_list, int remote_generation); /* Send all the data with version greater than max_remote_version */ void send_all(gossip_digest& g_digest, std::map& delta_ep_state_map, int max_remote_version); public: /* This method is used to figure the state that the Gossiper has but Gossipee doesn't. The delta digests and the delta state are built up. 
 */ void examine_gossiper(utils::chunked_vector& g_digest_list, utils::chunked_vector& delta_gossip_digest_list, std::map& delta_ep_state_map); public: /** * Start the gossiper with the generation number, preloading the map of application states before starting * * If advertise is set to false, gossip will not respond to gossip echo * message, so that other nodes will not mark this node as alive. * * Note 1: In practice, advertise is set to false only when the local node is * replacing a dead node using the same ip address of the dead node, i.e., * replacing_a_node_with_same_ip is set to true, because the issue (#7312) * that the advertise flag fixes is limited to replacing a node with the * same ip address only. * * Note 2: When a node with a new ip address joins the cluster, e.g., * replacing a dead node using the different ip address, with advertise = * false, existing nodes will not mark the node as up. So existing nodes * will not send gossip syn messages to the new node because the new node * is not in either live node list or unreachable node list. * * The new node will only include itself in the gossip syn messages, so the * syn message from new node to existing node will not exchange gossip * application states of existing nodes. Gossip exchanges node information * for node listed in SYN messages only. * * As a result, the new node will not learn other existing nodes in gossip * and existing nodes will learn the new node. * * Note 3: When a node replaces a dead node using the same ip address of * the dead node, with advertise = false, existing nodes will send syn * messages to the replacing node, because the replacing node is listed * in the unreachable node list. * * As a result, the replacing node will learn other existing nodes in * gossip and existing nodes will learn the new replacing node. Yes, * unreachable node is contacted with some probability, but all of the * existing nodes can talk to the replacing node. 
So the probability of * replacing node being talked to is pretty high. */ future<> start_gossiping(int generation_nbr, std::map preload_local_states = {}, gms::advertise_myself advertise = gms::advertise_myself::yes); public: /** * Do a single 'shadow' round of gossip, where we do not modify any state */ future<> do_shadow_round(std::unordered_set nodes = {}); private: void build_seeds_list(); public: // initialize local HB state if needed, i.e., if gossiper has never been started before. void maybe_initialize_local_state(int generation_nbr); /** * Add an endpoint we knew about previously, but whose state is unknown */ future<> add_saved_endpoint(inet_address ep); future<> add_local_application_state(application_state state, versioned_value value); /** * Applies all states in set "atomically", as in guaranteed monotonic versions and * inserted into endpoint state together (and assuming same grouping, overwritten together). */ future<> add_local_application_state(std::list>); /** * Intentionally overengineered to avoid very rare string copies. */ future<> add_local_application_state(std::initializer_list>>); future<> start(); future<> shutdown(); // Needed by seastar::sharded future<> stop(); future<> do_stop_gossiping(); public: bool is_enabled() const; void finish_shadow_round(); bool is_in_shadow_round() const; void goto_shadow_round(); public: void add_expire_time_for_endpoint(inet_address endpoint, clk::time_point expire_time); static clk::time_point compute_expire_time(); public: void dump_endpoint_state_map(); public: bool is_seed(const inet_address& endpoint) const; bool is_shutdown(const inet_address& endpoint) const; bool is_normal(const inet_address& endpoint) const; bool is_left(const inet_address& endpoint) const; // Check if a node is in NORMAL or SHUTDOWN status which means the node is // part of the token ring from the gossip point of view and operates in // normal status or was in normal status but is shutdown. 
bool is_normal_ring_member(const inet_address& endpoint) const; bool is_cql_ready(const inet_address& endpoint) const; bool is_silent_shutdown_state(const endpoint_state& ep_state) const; future<> mark_as_shutdown(const inet_address& endpoint); void force_newer_generation(); public: std::string_view get_gossip_status(const endpoint_state& ep_state) const noexcept; std::string_view get_gossip_status(const inet_address& endpoint) const noexcept; public: future<> wait_for_gossip_to_settle(); future<> wait_for_range_setup(); private: future<> wait_for_gossip(std::chrono::milliseconds, std::optional = {}); uint64_t _nr_run = 0; uint64_t _msg_processing = 0; bool _gossip_settled = false; class msg_proc_guard; private: abort_source& _abort_source; feature_service& _feature_service; const locator::shared_token_metadata& _shared_token_metadata; netw::messaging_service& _messaging; sharded& _sys_ks; utils::updateable_value _failure_detector_timeout_ms; utils::updateable_value _force_gossip_generation; gossip_config _gcfg; // Get features supported by a particular node std::set get_supported_features(inet_address endpoint) const; // Get features supported by all the nodes this node knows about std::set get_supported_features(const std::unordered_map& loaded_peer_features, ignore_features_of_local_node ignore_local_node) const; locator::token_metadata_ptr get_token_metadata_ptr() const noexcept; public: void check_knows_remote_features(std::set& local_features, const std::unordered_map& loaded_peer_features) const; future<> maybe_enable_features(); private: seastar::metrics::metric_groups _metrics; public: void append_endpoint_state(std::stringstream& ss, const endpoint_state& state); public: void check_snitch_name_matches() const; sstring get_all_endpoint_states(); std::map get_simple_states(); int get_down_endpoint_count() const noexcept; int get_up_endpoint_count() const noexcept; int get_all_endpoint_count() const noexcept; sstring get_endpoint_state(sstring address); 
private: future<> failure_detector_loop(); future<> failure_detector_loop_for_node(gms::inet_address node, int64_t gossip_generation, uint64_t live_endpoints_version); future<> update_live_endpoints_version(); }; /* The free functions below take any gossiper instance and forward the call to the instance on shard 0 through container().invoke_on(0, ...), returning the result as a future. */ inline future get_all_endpoint_states(gossiper& g) { return g.container().invoke_on(0, [] (gossiper& g) { return g.get_all_endpoint_states(); }); } /* The address string is copied into the lambda that runs on shard 0. */ inline future get_endpoint_state(gossiper& g, sstring address) { return g.container().invoke_on(0, [address] (gossiper& g) { return g.get_endpoint_state(address); }); } inline future> get_simple_states(gossiper& g) { return g.container().invoke_on(0, [] (gossiper& g) { return g.get_simple_states(); }); } inline future get_down_endpoint_count(gossiper& g) { return g.container().invoke_on(0, [] (gossiper& g) { return g.get_down_endpoint_count(); }); } inline future get_up_endpoint_count(gossiper& g) { return g.container().invoke_on(0, [] (gossiper& g) { return g.get_up_endpoint_count(); }); } /* Note: this helper derives the count from get_endpoint_states().size() instead of calling gossiper::get_all_endpoint_count(). */ inline future get_all_endpoint_count(gossiper& g) { return g.container().invoke_on(0, [] (gossiper& g) { return static_cast(g.get_endpoint_states().size()); }); } /* The three helpers below are placeholders: each one only submits a trivial task to shard 0. set_phi_convict_threshold captures phi but never uses it, get_phi_convict_threshold always yields 8, and get_arrival_samples yields a default-constructed value. 
*/ inline future<> set_phi_convict_threshold(double phi) { return smp::submit_to(0, [phi] { return make_ready_future<>(); }); } inline future get_phi_convict_threshold() { return smp::submit_to(0, [] { return make_ready_future(8); }); } inline future> get_arrival_samples() { return smp::submit_to(0, [] { return make_ready_future>(); }); } struct gossip_get_endpoint_states_request { // Application states the sender requested std::unordered_set application_states; }; struct gossip_get_endpoint_states_response { std::unordered_map endpoint_state_map; }; } // namespace gms