Files
scylladb/gms/gossiper.cc
Asias He bf3d6a4c06 gossip: Disable sleep and retry logic in do_status_check
We do not have the ThreadPoolExecutor logic. Disable the sleep and retry
logic.
2015-04-23 14:55:26 +08:00

1196 lines
50 KiB
C++

/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Modified by Cloudius Systems.
* Copyright 2015 Cloudius Systems.
*/
#include <algorithm>
#include <cstdint>
#include <map>
#include <random>
#include <set>
#include <stdexcept>
#include <vector>
#include "gms/inet_address.hh"
#include "gms/endpoint_state.hh"
#include "gms/gossip_digest.hh"
#include "gms/gossip_digest_syn.hh"
#include "gms/gossip_digest_ack.hh"
#include "gms/gossip_digest_ack2.hh"
#include "gms/versioned_value.hh"
#include "gms/gossiper.hh"
#include "gms/application_state.hh"
#include "gms/failure_detector.hh"
#include "gms/i_failure_detection_event_listener.hh"
#include "gms/i_endpoint_state_change_subscriber.hh"
#include "gms/i_failure_detector.hh"
namespace gms {
gossiper::gossiper()
: _scheduled_gossip_task([this] { run(); }) {
// half of QUARATINE_DELAY, to ensure _just_removed_endpoints has enough leeway to prevent re-gossip
fat_client_timeout = (int64_t) (QUARANTINE_DELAY / 2);
/* register with the Failure Detector for receiving Failure detector events */
get_local_failure_detector().register_failure_detection_event_listener(this->shared_from_this());
// Register this instance with JMX
init_messaging_service_handler();
}
/*
* First construct a map whose key is the endpoint in the GossipDigest and the value is the
* GossipDigest itself. Then build a list of version differences i.e difference between the
* version in the GossipDigest and the version in the local state for a given InetAddress.
* Sort this list. Now loop through the sorted list and retrieve the GossipDigest corresponding
* to the endpoint from the map that was initially constructed.
*/
void gossiper::do_sort(std::vector<gossip_digest>& g_digest_list) {
/* Construct a map of endpoint to GossipDigest. */
std::map<inet_address, gossip_digest> ep_to_digest_map;
for (auto g_digest : g_digest_list) {
ep_to_digest_map.emplace(g_digest.get_endpoint(), g_digest);
}
/*
* These digests have their maxVersion set to the difference of the version
* of the local EndpointState and the version found in the GossipDigest.
*/
std::vector<gossip_digest> diff_digests;
for (auto g_digest : g_digest_list) {
auto ep = g_digest.get_endpoint();
auto ep_state = this->get_endpoint_state_for_endpoint(ep);
int version = ep_state ? this->get_max_endpoint_state_version(*ep_state) : 0;
int diff_version = ::abs(version - g_digest.get_max_version());
diff_digests.emplace_back(gossip_digest(ep, g_digest.get_generation(), diff_version));
}
g_digest_list.clear();
std::sort(diff_digests.begin(), diff_digests.end());
int size = diff_digests.size();
/*
* Report the digests in descending order. This takes care of the endpoints
* that are far behind w.r.t this local endpoint
*/
for (int i = size - 1; i >= 0; --i) {
g_digest_list.emplace_back(ep_to_digest_map[diff_digests[i].get_endpoint()]);
}
}
void gossiper::init_messaging_service_handler() {
ms().register_handler(messaging_verb::ECHO, [this] (empty_msg msg) {
// TODO: Use time_point instead of long for timing.
this->set_last_processed_message_at(now_millis());
return make_ready_future<empty_msg>();
});
ms().register_handler_oneway(messaging_verb::GOSSIP_SHUTDOWN, [this] (inet_address from) {
this->set_last_processed_message_at(now_millis());
// TODO: Implement processing of incoming SHUTDOWN message
get_local_failure_detector().force_conviction(from);
return messaging_service::no_wait();
});
ms().register_handler(messaging_verb::GOSSIP_DIGEST_SYN, [this] (gossip_digest_syn syn_msg) {
this->set_last_processed_message_at(now_millis());
inet_address from;
if (!this->is_enabled()) {
return make_ready_future<gossip_digest_ack>(gossip_digest_ack());
}
/* If the message is from a different cluster throw it away. */
if (syn_msg.cluster_id() != get_cluster_name()) {
return make_ready_future<gossip_digest_ack>(gossip_digest_ack());
}
if (syn_msg.partioner() != "" && syn_msg.partioner() != get_partitioner_name()) {
return make_ready_future<gossip_digest_ack>(gossip_digest_ack());
}
auto g_digest_list = syn_msg.get_gossip_digests();
do_sort(g_digest_list);
std::vector<gossip_digest> delta_gossip_digest_list;
std::map<inet_address, endpoint_state> delta_ep_state_map;
this->examine_gossiper(g_digest_list, delta_gossip_digest_list, delta_ep_state_map);
gms::gossip_digest_ack ack_msg(std::move(delta_gossip_digest_list), std::move(delta_ep_state_map));
return make_ready_future<gossip_digest_ack>(std::move(ack_msg));
});
ms().register_handler_oneway(messaging_verb::GOSSIP_DIGEST_ACK2, [this] (gossip_digest_ack2 msg) {
this->set_last_processed_message_at(now_millis());
auto& remote_ep_state_map = msg.get_endpoint_state_map();
/* Notify the Failure Detector */
this->notify_failure_detector(remote_ep_state_map);
this->apply_state_locally(remote_ep_state_map);
return messaging_service::no_wait();
});
}
bool gossiper::send_gossip(gossip_digest_syn message, std::set<inet_address> epset) {
std::vector<inet_address> __live_endpoints(epset.begin(), epset.end());
size_t size = __live_endpoints.size();
if (size < 1) {
return false;
}
/* Generate a random number from 0 -> size */
std::uniform_int_distribution<int> dist(0, size - 1);
int index = dist(_random);
inet_address to = __live_endpoints[index];
// if (logger.isTraceEnabled())
// logger.trace("Sending a GossipDigestSyn to {} ...", to);
using RetMsg = gossip_digest_ack;
auto id = get_shard_id(to);
ms().send_message<RetMsg>(messaging_verb::GOSSIP_DIGEST_SYN, std::move(id), std::move(message)).then([this, id] (RetMsg ack_msg) {
this->set_last_processed_message_at(now_millis());
if (!this->is_enabled() && !this->is_in_shadow_round()) {
return make_ready_future<>();
}
auto g_digest_list = ack_msg.get_gossip_digest_list();
auto ep_state_map = ack_msg.get_endpoint_state_map();
if (ep_state_map.size() > 0) {
/* Notify the Failure Detector */
this->notify_failure_detector(ep_state_map);
this->apply_state_locally(ep_state_map);
}
if (this->is_in_shadow_round()) {
this->finish_shadow_round();
return make_ready_future<>(); // don't bother doing anything else, we have what we came for
}
/* Get the state required to send to this gossipee - construct GossipDigestAck2Message */
std::map<inet_address, endpoint_state> delta_ep_state_map;
for (auto g_digest : g_digest_list) {
inet_address addr = g_digest.get_endpoint();
auto local_ep_state_ptr = this->get_state_for_version_bigger_than(addr, g_digest.get_max_version());
if (local_ep_state_ptr) {
delta_ep_state_map.emplace(addr, *local_ep_state_ptr);
}
}
gms::gossip_digest_ack2 ack2_msg(std::move(delta_ep_state_map));
return ms().send_message_oneway<void>(messaging_verb::GOSSIP_DIGEST_ACK2, std::move(id), std::move(ack2_msg)).then([] () {
return make_ready_future<>();
});
});
return _seeds.count(to);
}
void gossiper::notify_failure_detector(inet_address endpoint, endpoint_state remote_endpoint_state) {
/*
* If the local endpoint state exists then report to the FD only
* if the versions workout.
*/
auto it = endpoint_state_map.find(endpoint);
if (it != endpoint_state_map.end()) {
auto& local_endpoint_state = it->second;
i_failure_detector& fd = get_local_failure_detector();
int local_generation = local_endpoint_state.get_heart_beat_state().get_generation();
int remote_generation = remote_endpoint_state.get_heart_beat_state().get_generation();
if (remote_generation > local_generation) {
local_endpoint_state.update_timestamp();
// this node was dead and the generation changed, this indicates a reboot, or possibly a takeover
// we will clean the fd intervals for it and relearn them
if (!local_endpoint_state.is_alive()) {
//logger.debug("Clearing interval times for {} due to generation change", endpoint);
fd.remove(endpoint);
}
fd.report(endpoint);
return;
}
if (remote_generation == local_generation) {
int local_version = get_max_endpoint_state_version(local_endpoint_state);
int remote_version = remote_endpoint_state.get_heart_beat_state().get_heart_beat_version();
if (remote_version > local_version) {
local_endpoint_state.update_timestamp();
// just a version change, report to the fd
fd.report(endpoint);
}
}
}
}
void gossiper::apply_state_locally(std::map<inet_address, endpoint_state>& map) {
for (auto& entry : map) {
auto& ep = entry.first;
if (ep == get_broadcast_address() && !is_in_shadow_round()) {
continue;
}
if (_just_removed_endpoints.count(ep)) {
// if (logger.isTraceEnabled())
// logger.trace("Ignoring gossip for {} because it is quarantined", ep);
continue;
}
/*
If state does not exist just add it. If it does then add it if the remote generation is greater.
If there is a generation tie, attempt to break it by heartbeat version.
*/
endpoint_state& remote_state = entry.second;
auto it = endpoint_state_map.find(ep);
if (it != endpoint_state_map.end()) {
endpoint_state& local_ep_state_ptr = it->second;
int local_generation = local_ep_state_ptr.get_heart_beat_state().get_generation();
int remote_generation = remote_state.get_heart_beat_state().get_generation();
// if (logger.isTraceEnabled()) {
// logger.trace("{} local generation {}, remote generation {}", ep, local_generation, remote_generation);
// }
if (local_generation != 0 && remote_generation > local_generation + MAX_GENERATION_DIFFERENCE) {
// assume some peer has corrupted memory and is broadcasting an unbelievable generation about another peer (or itself)
// logger.warn("received an invalid gossip generation for peer {}; local generation = {}, received generation = {}",
// ep, local_generation, remote_generation);
} else if (remote_generation > local_generation) {
// if (logger.isTraceEnabled())
// logger.trace("Updating heartbeat state generation to {} from {} for {}", remote_generation, local_generation, ep);
// major state change will handle the update by inserting the remote state directly
handle_major_state_change(ep, remote_state);
} else if (remote_generation == local_generation) { //generation has not changed, apply new states
/* find maximum state */
int local_max_version = get_max_endpoint_state_version(local_ep_state_ptr);
int remote_max_version = get_max_endpoint_state_version(remote_state);
if (remote_max_version > local_max_version) {
// apply states, but do not notify since there is no major change
apply_new_states(ep, local_ep_state_ptr, remote_state);
} else {
// if (logger.isTraceEnabled()) {
// logger.trace("Ignoring remote version {} <= {} for {}", remote_max_version, local_max_version, ep);
}
if (!local_ep_state_ptr.is_alive() && !is_dead_state(local_ep_state_ptr)) { // unless of course, it was dead
mark_alive(ep, local_ep_state_ptr);
}
} else {
// if (logger.isTraceEnabled())
// logger.trace("Ignoring remote generation {} < {}", remote_generation, local_generation);
}
} else {
// this is a new node, report it to the FD in case it is the first time we are seeing it AND it's not alive
get_local_failure_detector().report(ep);
handle_major_state_change(ep, remote_state);
}
}
}
void gossiper::remove_endpoint(inet_address endpoint) {
// do subscribers first so anything in the subscriber that depends on gossiper state won't get confused
for (auto& subscriber : _subscribers) {
subscriber->on_remove(endpoint);
}
if(_seeds.count(endpoint)) {
build_seeds_list();
_seeds.erase(endpoint);
//logger.info("removed {} from _seeds, updated _seeds list = {}", endpoint, _seeds);
}
_live_endpoints.erase(endpoint);
_unreachable_endpoints.erase(endpoint);
// do not remove endpointState until the quarantine expires
get_local_failure_detector().remove(endpoint);
// FIXME: MessagingService
//MessagingService.instance().resetVersion(endpoint);
warn(unimplemented::cause::GOSSIP);
quarantine_endpoint(endpoint);
// FIXME: MessagingService
//MessagingService.instance().destroyConnectionPool(endpoint);
// if (logger.isDebugEnabled())
// logger.debug("removing endpoint {}", endpoint);
}
void gossiper::do_status_check() {
// if (logger.isTraceEnabled())
// logger.trace("Performing status check ...");
int64_t now = now_millis();
for (auto& entry : endpoint_state_map) {
const inet_address& endpoint = entry.first;
if (endpoint == get_broadcast_address()) {
continue;
}
get_local_failure_detector().interpret(endpoint);
auto it = endpoint_state_map.find(endpoint);
if (it != endpoint_state_map.end()) {
endpoint_state& ep_state = it->second;
// check if this is a fat client. fat clients are removed automatically from
// gossip after FatClientTimeout. Do not remove dead states here.
if (is_gossip_only_member(endpoint)
&& !_just_removed_endpoints.count(endpoint)
&& ((now - ep_state.get_update_timestamp().time_since_epoch().count()) > fat_client_timeout)) {
// logger.info("FatClient {} has been silent for {}ms, removing from gossip", endpoint, FatClientTimeout);
remove_endpoint(endpoint); // will put it in _just_removed_endpoints to respect quarantine delay
evict_from_membershipg(endpoint); // can get rid of the state immediately
}
// check for dead state removal
int64_t expire_time = get_expire_time_for_endpoint(endpoint);
if (!ep_state.is_alive() && (now > expire_time)) {
/* && (!StorageService.instance.getTokenMetadata().isMember(endpoint))) */
// if (logger.isDebugEnabled()) {
// logger.debug("time is expiring for endpoint : {} ({})", endpoint, expire_time);
// }
evict_from_membershipg(endpoint);
}
}
}
for (auto it = _just_removed_endpoints.begin(); it != _just_removed_endpoints.end();) {
auto& t= it->second;
if ((now - t) > QUARANTINE_DELAY) {
// if (logger.isDebugEnabled())
// logger.debug("{} elapsed, {} gossip quarantine over", QUARANTINE_DELAY, entry.getKey());
it = _just_removed_endpoints.erase(it);
} else {
it++;
}
}
}
void gossiper::run() {
//wait on messaging service to start listening
// MessagingService.instance().waitUntilListening();
/* Update the local heartbeat counter. */
endpoint_state_map[get_broadcast_address()].get_heart_beat_state().update_heart_beat();
// if (logger.isTraceEnabled())
// logger.trace("My heartbeat is now {}", endpoint_state_map.get(FBUtilities.getBroadcastAddress()).get_heart_beat_state().get_heart_beat_version());
std::vector<gossip_digest> g_digests;
this->make_random_gossip_digest(g_digests);
if (g_digests.size() > 0) {
gossip_digest_syn message(get_cluster_name(), get_partitioner_name(), g_digests);
/* Gossip to some random live member */
bool gossiped_to_seed = do_gossip_to_live_member(message);
/* Gossip to some unreachable member with some probability to check if he is back up */
do_gossip_to_unreachable_member(message);
/* Gossip to a seed if we did not do so above, or we have seen less nodes
than there are seeds. This prevents partitions where each group of nodes
is only gossiping to a subset of the seeds.
The most straightforward check would be to check that all the seeds have been
verified either as live or unreachable. To avoid that computation each round,
we reason that:
either all the live nodes are seeds, in which case non-seeds that come online
will introduce themselves to a member of the ring by definition,
or there is at least one non-seed node in the list, in which case eventually
someone will gossip to it, and then do a gossip to a random seed from the
gossipedToSeed check.
See CASSANDRA-150 for more exposition. */
if (!gossiped_to_seed || _live_endpoints.size() < _seeds.size()) {
do_gossip_to_seed(message);
}
do_status_check();
}
}
bool gossiper::seen_any_seed() {
for (auto& entry : endpoint_state_map) {
if (_seeds.count(entry.first)) {
return true;
}
auto& state = entry.second;
if (state.get_application_state_map().count(application_state::INTERNAL_IP) &&
_seeds.count(inet_address(state.get_application_state(application_state::INTERNAL_IP)->value))) {
return true;
}
}
return false;
}
void gossiper::register_(i_endpoint_state_change_subscriber* subscriber) {
_subscribers.push_back(subscriber);
}
void gossiper::unregister_(i_endpoint_state_change_subscriber* subscriber) {
_subscribers.remove(subscriber);
}
std::set<inet_address> gossiper::get_live_members() {
std::set<inet_address> live_members(_live_endpoints);
if (!live_members.count(get_broadcast_address())) {
live_members.insert(get_broadcast_address());
}
return live_members;
}
std::set<inet_address> gossiper::get_live_token_owners() {
std::set<inet_address> token_owners;
for (auto& member : get_live_members()) {
auto it = endpoint_state_map.find(member);
// FIXME: StorageService.instance.getTokenMetadata
if (it != endpoint_state_map.end() && !is_dead_state(it->second) /* && StorageService.instance.getTokenMetadata().isMember(member) */) {
token_owners.insert(member);
}
warn(unimplemented::cause::GOSSIP);
}
return token_owners;
}
std::set<inet_address> gossiper::get_unreachable_token_owners() {
std::set<inet_address> token_owners;
for (auto&& x : _unreachable_endpoints) {
auto& endpoint = x.first;
warn(unimplemented::cause::GOSSIP);
if (true /* StorageService.instance.getTokenMetadata().isMember(endpoint) */) {
token_owners.insert(endpoint);
}
}
return token_owners;
}
int64_t gossiper::get_endpoint_downtime(inet_address ep) {
auto it = _unreachable_endpoints.find(ep);
if (it != _unreachable_endpoints.end()) {
auto& downtime = it->second;
return (now_nanos() - downtime) / 1000;
} else {
return 0L;
}
}
void gossiper::convict(inet_address endpoint, double phi) {
auto it = endpoint_state_map.find(endpoint);
if (it == endpoint_state_map.end()) {
return;
}
auto& state = it->second;
if (state.is_alive() && is_dead_state(state)) {
mark_dead(endpoint, state);
} else {
state.mark_dead();
}
}
std::set<inet_address> gossiper::get_unreachable_members() {
std::set<inet_address> ret;
for (auto&& x : _unreachable_endpoints) {
ret.insert(x.first);
}
return ret;
}
int gossiper::get_max_endpoint_state_version(endpoint_state state) {
int max_version = state.get_heart_beat_state().get_heart_beat_version();
for (auto& entry : state.get_application_state_map()) {
auto& value = entry.second;
max_version = std::max(max_version, value.version);
}
return max_version;
}
void gossiper::evict_from_membershipg(inet_address endpoint) {
_unreachable_endpoints.erase(endpoint);
endpoint_state_map.erase(endpoint);
_expire_time_endpoint_map.erase(endpoint);
quarantine_endpoint(endpoint);
// if (logger.isDebugEnabled())
// logger.debug("evicting {} from gossip", endpoint);
}
void gossiper::quarantine_endpoint(inet_address endpoint) {
quarantine_endpoint(endpoint, now_millis());
}
void gossiper::quarantine_endpoint(inet_address endpoint, int64_t quarantine_expiration) {
_just_removed_endpoints[endpoint] = quarantine_expiration;
}
void gossiper::replacement_quarantine(inet_address endpoint) {
// remember, quarantine_endpoint will effectively already add QUARANTINE_DELAY, so this is 2x
// logger.debug("");
quarantine_endpoint(endpoint, now_millis() + QUARANTINE_DELAY);
}
void gossiper::replaced_endpoint(inet_address endpoint) {
remove_endpoint(endpoint);
evict_from_membershipg(endpoint);
replacement_quarantine(endpoint);
}
void gossiper::make_random_gossip_digest(std::vector<gossip_digest>& g_digests) {
int generation = 0;
int max_version = 0;
// local epstate will be part of endpoint_state_map
std::vector<inet_address> endpoints;
for (auto&& x : endpoint_state_map) {
endpoints.push_back(x.first);
}
std::random_shuffle(endpoints.begin(), endpoints.end());
for (auto& endpoint : endpoints) {
auto it = endpoint_state_map.find(endpoint);
if (it != endpoint_state_map.end()) {
auto& eps = it->second;
generation = eps.get_heart_beat_state().get_generation();
max_version = get_max_endpoint_state_version(eps);
}
g_digests.push_back(gossip_digest(endpoint, generation, max_version));
}
#if 0
if (logger.isTraceEnabled()) {
StringBuilder sb = new StringBuilder();
for (GossipDigest g_digest : g_digests)
{
sb.append(g_digest);
sb.append(" ");
}
logger.trace("Gossip Digests are : {}", sb);
}
#endif
}
void gossiper::advertise_removing(inet_address endpoint, utils::UUID host_id, utils::UUID local_host_id) {
auto& state = endpoint_state_map.at(endpoint);
// remember this node's generation
int generation = state.get_heart_beat_state().get_generation();
// logger.info("Removing host: {}", host_id);
// logger.info("Sleeping for {}ms to ensure {} does not change", StorageService.RING_DELAY, endpoint);
// FIXME: sleep
warn(unimplemented::cause::GOSSIP);
// Uninterruptibles.sleepUninterruptibly(StorageService.RING_DELAY, TimeUnit.MILLISECONDS);
// make sure it did not change
auto& eps = endpoint_state_map.at(endpoint);
if (eps.get_heart_beat_state().get_generation() != generation) {
throw std::runtime_error(sprint("Endpoint %s generation changed while trying to remove it", endpoint));
}
// update the other node's generation to mimic it as if it had changed it itself
//logger.info("Advertising removal for {}", endpoint);
eps.update_timestamp(); // make sure we don't evict it too soon
eps.get_heart_beat_state().force_newer_generation_unsafe();
// FIXME: StorageService.instance.valueFactory
// eps.add_application_state(application_state::STATUS, StorageService.instance.valueFactory.removingNonlocal(host_id));
// eps.add_application_state(application_state::REMOVAL_COORDINATOR, StorageService.instance.valueFactory.removalCoordinator(local_host_id));
endpoint_state_map[endpoint] = eps;
}
void gossiper::advertise_token_removed(inet_address endpoint, utils::UUID host_id) {
auto& eps = endpoint_state_map.at(endpoint);
eps.update_timestamp(); // make sure we don't evict it too soon
eps.get_heart_beat_state().force_newer_generation_unsafe();
int64_t expire_time = compute_expire_time();
// FIXME: StorageService.instance.valueFactory.removedNonlocal
// eps.add_application_state(application_state::STATUS, StorageService.instance.valueFactory.removedNonlocal(host_id, expire_time));
//logger.info("Completing removal of {}", endpoint);
add_expire_time_for_endpoint(endpoint, expire_time);
endpoint_state_map[endpoint] = eps;
// ensure at least one gossip round occurs before returning
// FIXME: sleep
//Uninterruptibles.sleepUninterruptibly(INTERVAL_IN_MILLIS * 2, TimeUnit.MILLISECONDS);
warn(unimplemented::cause::GOSSIP);
}
void gossiper::unsafe_assassinate_endpoint(sstring address) {
//logger.warn("Gossiper.unsafeAssassinateEndpoint is deprecated and will be removed in the next release; use assassinate_endpoint instead");
assassinate_endpoint(address);
}
void gossiper::assassinate_endpoint(sstring address) {
inet_address endpoint(address);
auto is_exist = endpoint_state_map.count(endpoint);
endpoint_state&& ep_state = is_exist ? endpoint_state_map.at(endpoint) :
endpoint_state(heart_beat_state((int) ((now_millis() + 60000) / 1000), 9999));
//Collection<Token> tokens = null;
// logger.warn("Assassinating {} via gossip", endpoint);
if (is_exist) {
// FIXME:
warn(unimplemented::cause::GOSSIP);
#if 0
try {
tokens = StorageService.instance.getTokenMetadata().getTokens(endpoint);
} catch (Throwable th) {
JVMStabilityInspector.inspectThrowable(th);
// TODO this is broken
logger.warn("Unable to calculate tokens for {}. Will use a random one", address);
tokens = Collections.singletonList(StorageService.getPartitioner().getRandomToken());
}
#endif
int generation = ep_state.get_heart_beat_state().get_generation();
int heartbeat = ep_state.get_heart_beat_state().get_heart_beat_version();
//logger.info("Sleeping for {}ms to ensure {} does not change", StorageService.RING_DELAY, endpoint);
//Uninterruptibles.sleepUninterruptibly(StorageService.RING_DELAY, TimeUnit.MILLISECONDS);
// make sure it did not change
auto it = endpoint_state_map.find(endpoint);
if (it == endpoint_state_map.end()) {
// logger.warn("Endpoint {} disappeared while trying to assassinate, continuing anyway", endpoint);
} else {
auto& new_state = it->second;
if (new_state.get_heart_beat_state().get_generation() != generation) {
throw std::runtime_error(sprint("Endpoint still alive: %s generation changed while trying to assassinate it", endpoint));
} else if (new_state.get_heart_beat_state().get_heart_beat_version() != heartbeat) {
throw std::runtime_error(sprint("Endpoint still alive: %s heartbeat changed while trying to assassinate it", endpoint));
}
}
ep_state.update_timestamp(); // make sure we don't evict it too soon
ep_state.get_heart_beat_state().force_newer_generation_unsafe();
}
// do not pass go, do not collect 200 dollars, just gtfo
// FIXME: StorageService.instance and Sleep
// ep_state.add_application_state(application_state::STATUS, StorageService.instance.valueFactory.left(tokens, compute_expire_time()));
handle_major_state_change(endpoint, ep_state);
// Uninterruptibles.sleepUninterruptibly(INTERVAL_IN_MILLIS * 4, TimeUnit.MILLISECONDS);
//logger.warn("Finished assassinating {}", endpoint);
}
bool gossiper::is_known_endpoint(inet_address endpoint) {
return endpoint_state_map.count(endpoint);
}
int gossiper::get_current_generation_number(inet_address endpoint) {
return endpoint_state_map.at(endpoint).get_heart_beat_state().get_generation();
}
bool gossiper::do_gossip_to_live_member(gossip_digest_syn message) {
size_t size = _live_endpoints.size();
if (size == 0) {
return false;
}
return send_gossip(message, _live_endpoints);
}
void gossiper::do_gossip_to_unreachable_member(gossip_digest_syn message) {
double live_endpoint_count = _live_endpoints.size();
double unreachable_endpoint_count = _unreachable_endpoints.size();
if (unreachable_endpoint_count > 0) {
/* based on some probability */
double prob = unreachable_endpoint_count / (live_endpoint_count + 1);
std::uniform_real_distribution<double> dist(0, 1);
double rand_dbl = dist(_random);
if (rand_dbl < prob) {
std::set<inet_address> addrs;
for (auto&& x : _unreachable_endpoints) {
addrs.insert(x.first);
}
send_gossip(message, addrs);
}
}
}
void gossiper::do_gossip_to_seed(gossip_digest_syn prod) {
size_t size = _seeds.size();
if (size > 0) {
if (size == 1 && _seeds.count(get_broadcast_address())) {
return;
}
if (_live_endpoints.size() == 0) {
send_gossip(prod, _seeds);
} else {
/* Gossip with the seed with some probability. */
double probability = _seeds.size() / (double) (_live_endpoints.size() + _unreachable_endpoints.size());
std::uniform_real_distribution<double> dist(0, 1);
double rand_dbl = dist(_random);
if (rand_dbl <= probability) {
send_gossip(prod, _seeds);
}
}
}
}
bool gossiper::is_gossip_only_member(inet_address endpoint) {
auto it = endpoint_state_map.find(endpoint);
if (it == endpoint_state_map.end()) {
return false;
}
auto& eps = it->second;
// FIXME: StorageService.instance.getTokenMetadata
return !is_dead_state(eps) /* && !StorageService.instance.getTokenMetadata().isMember(endpoint); */;
}
int64_t gossiper::get_expire_time_for_endpoint(inet_address endpoint) {
/* default expire_time is A_VERY_LONG_TIME */
auto it = _expire_time_endpoint_map.find(endpoint);
if (it == _expire_time_endpoint_map.end()) {
return compute_expire_time();
} else {
int64_t stored_time = it->second;
return stored_time;
}
}
std::experimental::optional<endpoint_state> gossiper::get_endpoint_state_for_endpoint(inet_address ep) {
auto it = endpoint_state_map.find(ep);
if (it == endpoint_state_map.end()) {
return {};
} else {
return it->second;
}
}
void gossiper::reset_endpoint_state_map() {
endpoint_state_map.clear();
_unreachable_endpoints.clear();
_live_endpoints.clear();
}
std::map<inet_address, endpoint_state>&gms::gossiper::get_endpoint_states() {
return endpoint_state_map;
}
bool gossiper::uses_host_id(inet_address endpoint) {
// FIXME
warn(unimplemented::cause::GOSSIP);
if (true /* MessagingService.instance().knowsVersion(endpoint) */) {
return true;
} else if (get_endpoint_state_for_endpoint(endpoint)->get_application_state(application_state::NET_VERSION)) {
return true;
}
return false;
}
bool gossiper::uses_vnodes(inet_address endpoint) {
return uses_host_id(endpoint) && get_endpoint_state_for_endpoint(endpoint)->get_application_state(application_state::TOKENS);
}
utils::UUID gossiper::get_host_id(inet_address endpoint) {
if (!uses_host_id(endpoint)) {
throw std::runtime_error(sprint("Host %s does not use new-style tokens!", endpoint));
}
sstring uuid = get_endpoint_state_for_endpoint(endpoint)->get_application_state(application_state::HOST_ID)->value;
// FIXME: Add UUID(const sstring& id) constructor
warn(unimplemented::cause::GOSSIP);
return utils::UUID(0, 0);
}
std::experimental::optional<endpoint_state> gossiper::get_state_for_version_bigger_than(inet_address for_endpoint, int version) {
std::experimental::optional<endpoint_state> reqd_endpoint_state;
auto it = endpoint_state_map.find(for_endpoint);
if (it != endpoint_state_map.end()) {
auto& eps = it->second;
/*
* Here we try to include the Heart Beat state only if it is
* greater than the version passed in. It might happen that
* the heart beat version maybe lesser than the version passed
* in and some application state has a version that is greater
* than the version passed in. In this case we also send the old
* heart beat and throw it away on the receiver if it is redundant.
*/
int local_hb_version = eps.get_heart_beat_state().get_heart_beat_version();
if (local_hb_version > version) {
reqd_endpoint_state.emplace(eps.get_heart_beat_state());
// if (logger.isTraceEnabled())
// logger.trace("local heartbeat version {} greater than {} for {}", local_hb_version, version, for_endpoint);
}
/* Accumulate all application states whose versions are greater than "version" variable */
for (auto& entry : eps.get_application_state_map()) {
auto& value = entry.second;
if (value.version > version) {
if (!reqd_endpoint_state) {
reqd_endpoint_state.emplace(eps.get_heart_beat_state());
}
auto& key = entry.first;
// if (logger.isTraceEnabled())
// logger.trace("Adding state {}: {}" , key, value.value);
reqd_endpoint_state->add_application_state(key, value);
}
}
}
return reqd_endpoint_state;
}
int gossiper::compare_endpoint_startup(inet_address addr1, inet_address addr2) {
auto ep1 = get_endpoint_state_for_endpoint(addr1);
auto ep2 = get_endpoint_state_for_endpoint(addr2);
assert(ep1 && ep2);
return ep1->get_heart_beat_state().get_generation() - ep2->get_heart_beat_state().get_generation();
}
void gossiper::notify_failure_detector(std::map<inet_address, endpoint_state> remoteEpStateMap) {
for (auto& entry : remoteEpStateMap) {
notify_failure_detector(entry.first, entry.second);
}
}
void gossiper::mark_alive(inet_address addr, endpoint_state local_state) {
// if (MessagingService.instance().getVersion(addr) < MessagingService.VERSION_20) {
// real_mark_alive(addr, local_state);
// return;
// }
local_state.mark_dead();
//logger.trace("Sending a EchoMessage to {}", addr);
shard_id id = get_shard_id(addr);
ms().send_message<empty_msg>(messaging_verb::ECHO, id).then([this, addr, local_state = std::move(local_state)] (empty_msg msg) mutable {
this->set_last_processed_message_at(now_millis());
this->real_mark_alive(addr, local_state);
});
}
void gossiper::real_mark_alive(inet_address addr, endpoint_state local_state) {
    // Move the endpoint into the live set, clear its unreachable/expiry
    // bookkeeping, and notify subscribers that it is UP.
    // if (logger.isTraceEnabled())
    //     logger.trace("marking as alive {}", addr);
    // NOTE(review): local_state is passed by value, so mark_alive() and
    // update_timestamp() affect only this copy, not the entry stored in
    // endpoint_state_map — confirm this is intended.
    local_state.mark_alive();
    local_state.update_timestamp(); // prevents do_status_check from racing us and evicting if it was down > A_VERY_LONG_TIME
    _live_endpoints.insert(addr);
    _unreachable_endpoints.erase(addr);
    _expire_time_endpoint_map.erase(addr);
    // logger.debug("removing expire time for endpoint : {}", addr);
    // logger.info("inet_address {} is now UP", addr);
    for (auto& subscriber : _subscribers)
        subscriber->on_alive(addr, local_state);
    // if (logger.isTraceEnabled())
    //     logger.trace("Notified {}", _subscribers);
}
void gossiper::mark_dead(inet_address addr, endpoint_state local_state) {
    // Move the endpoint into the unreachable set (stamped with the current
    // time) and notify subscribers that it is DOWN.
    // if (logger.isTraceEnabled())
    //     logger.trace("marking as down {}", addr);
    // NOTE(review): local_state is a by-value copy, so mark_dead() does not
    // change the entry stored in endpoint_state_map — confirm intended.
    local_state.mark_dead();
    _live_endpoints.erase(addr);
    _unreachable_endpoints[addr] = now_nanos();
    // logger.info("inet_address {} is now DOWN", addr);
    for (auto& subscriber : _subscribers)
        subscriber->on_dead(addr, local_state);
    // if (logger.isTraceEnabled())
    //     logger.trace("Notified {}", _subscribers);
}
void gossiper::handle_major_state_change(inet_address ep, endpoint_state eps) {
    // Handle a generation change for ep (node restart) or a first sighting:
    // replace the stored state wholesale, then fire subscriber callbacks in
    // a fixed order: on_restart, then mark alive/dead, then on_join.
    if (!is_dead_state(eps)) {
        if (endpoint_state_map.count(ep)) {
            //logger.info("Node {} has restarted, now UP", ep);
        } else {
            //logger.info("Node {} is now part of the cluster", ep);
        }
    }
    // if (logger.isTraceEnabled())
    //     logger.trace("Adding endpoint state for {}", ep);
    endpoint_state_map[ep] = eps;
    // the node restarted: it is up to the subscriber to take whatever action is necessary
    for (auto& subscriber : _subscribers) {
        subscriber->on_restart(ep, eps);
    }
    // Transition the endpoint to UP or DOWN based on its STATUS app state.
    if (!is_dead_state(eps)) {
        mark_alive(ep, eps);
    } else {
        //logger.debug("Not marking {} alive due to dead state", ep);
        mark_dead(ep, eps);
    }
    for (auto& subscriber : _subscribers) {
        subscriber->on_join(ep, eps);
    }
}
bool gossiper::is_dead_state(endpoint_state eps) {
    // Returns true when the first comma-separated token of the endpoint's
    // STATUS application state matches one of DEAD_STATES. An endpoint with
    // no STATUS state is not considered dead.
    // Look STATUS up once instead of twice (the original performed two
    // identical get_application_state() calls).
    auto status = eps.get_application_state(application_state::STATUS);
    if (!status) {
        return false;
    }
    auto value = status->value;
    std::vector<sstring> pieces;
    // STATUS values are of the form "<state>[,<extra>...]".
    boost::split(pieces, value, boost::is_any_of(","));
    assert(pieces.size() > 0);
    sstring state = pieces[0];
    for (auto& deadstate : DEAD_STATES) {
        if (state == deadstate) {
            return true;
        }
    }
    return false;
}
void gossiper::apply_new_states(inet_address addr, endpoint_state& local_state, endpoint_state& remote_state) {
    // Adopt the remote heartbeat and application states into local_state,
    // then fire on_change notifications for every applied state.
    // don't assert here, since if the node restarts the version will go back to zero
    //int oldVersion = local_state.get_heart_beat_state().get_heart_beat_version();
    local_state.set_heart_beat_state(remote_state.get_heart_beat_state());
    // if (logger.isTraceEnabled()) {
    //     logger.trace("Updating heartbeat state version to {} from {} for {} ...",
    //         local_state.get_heart_beat_state().get_heart_beat_version(), oldVersion, addr);
    // }
    // we need to make two loops here, one to apply, then another to notify,
    // this way all states in an update are present and current when the notifications are received
    for (auto& remote_entry : remote_state.get_application_state_map()) {
        auto& remote_key = remote_entry.first;
        auto& remote_value = remote_entry.second;
        // The heartbeat above was just copied from remote_state, so the
        // generations must now agree.
        assert(remote_state.get_heart_beat_state().get_generation() == local_state.get_heart_beat_state().get_generation());
        local_state.add_application_state(remote_key, remote_value);
    }
    // Second pass: notify only after every state has been applied.
    for (auto& entry : remote_state.get_application_state_map()) {
        do_on_change_notifications(addr, entry.first, entry.second);
    }
}
void gossiper::do_before_change_notifications(inet_address addr, endpoint_state& ep_state, application_state& ap_state, versioned_value& new_value) {
    // Give every registered subscriber a chance to react before the
    // application state is actually changed.
    for (auto& sub : _subscribers) {
        sub->before_change(addr, ep_state, ap_state, new_value);
    }
}
void gossiper::do_on_change_notifications(inet_address addr, const application_state& state, versioned_value& value) {
    // Tell every registered subscriber that this application state changed.
    for (auto& sub : _subscribers) {
        sub->on_change(addr, state, value);
    }
}
void gossiper::request_all(gossip_digest& g_digest,
    std::vector<gossip_digest>& delta_gossip_digest_list, int remote_generation) {
    // We hold no local data for this endpoint, so ask the peer for
    // everything it has: a digest with max_version 0 requests all states.
    auto endpoint = g_digest.get_endpoint();
    delta_gossip_digest_list.emplace_back(endpoint, remote_generation, 0);
    // if (logger.isTraceEnabled())
    //     logger.trace("request_all for {}", g_digest.get_endpoint());
}
void gossiper::send_all(gossip_digest& g_digest,
    std::map<inet_address, endpoint_state>& delta_ep_state_map,
    int max_remote_version) {
    // Collect every local state newer than max_remote_version for the
    // digest's endpoint into the outgoing delta map (no entry if nothing
    // is newer).
    auto endpoint = g_digest.get_endpoint();
    auto newer_state = get_state_for_version_bigger_than(endpoint, max_remote_version);
    if (newer_state) {
        delta_ep_state_map[endpoint] = *newer_state;
    }
}
void gossiper::examine_gossiper(std::vector<gossip_digest>& g_digest_list,
    std::vector<gossip_digest>& delta_gossip_digest_list,
    std::map<inet_address, endpoint_state>& delta_ep_state_map) {
    // Compare each incoming digest against local state and fill in, for the
    // ACK reply: digests for data we want from the peer
    // (delta_gossip_digest_list) and state we should send to the peer
    // (delta_ep_state_map).
    if (g_digest_list.size() == 0) {
        /* we've been sent a *completely* empty syn, which should normally
         * never happen since an endpoint will at least send a syn with
         * itself. If this is happening then the node is attempting shadow
         * gossip, and we should reply with everything we know.
         */
        // logger.debug("Shadow request received, adding all states");
        for (auto& entry : endpoint_state_map) {
            g_digest_list.emplace_back(entry.first, 0, 0);
        }
    }
    for (gossip_digest& g_digest : g_digest_list) {
        int remote_generation = g_digest.get_generation();
        int max_remote_version = g_digest.get_max_version();
        /* Get state associated with the end point in digest */
        auto it = endpoint_state_map.find(g_digest.get_endpoint());
        /* Here we need to fire a GossipDigestAckMessage. If we have some
         * data associated with this endpoint locally then we follow the
         * "if" path of the logic. If we have absolutely nothing for this
         * endpoint we need to request all the data for this endpoint.
         */
        if (it != endpoint_state_map.end()) {
            endpoint_state& ep_state_ptr = it->second;
            int local_generation = ep_state_ptr.get_heart_beat_state().get_generation();
            /* get the max version of all keys in the state associated with this endpoint */
            int max_local_version = get_max_endpoint_state_version(ep_state_ptr);
            // Fully in sync with the peer for this endpoint: nothing to do.
            if (remote_generation == local_generation && max_remote_version == max_local_version) {
                continue;
            }
            if (remote_generation > local_generation) {
                /* we request everything from the gossiper */
                request_all(g_digest, delta_gossip_digest_list, remote_generation);
            } else if (remote_generation < local_generation) {
                /* send all data with generation = localgeneration and version > 0 */
                send_all(g_digest, delta_ep_state_map, 0);
            } else if (remote_generation == local_generation) {
                /*
                 * If the max remote version is greater then we request the
                 * remote endpoint send us all the data for this endpoint
                 * with version greater than the max version number we have
                 * locally for this endpoint.
                 *
                 * If the max remote version is lesser, then we send all
                 * the data we have locally for this endpoint with version
                 * greater than the max remote version.
                 */
                if (max_remote_version > max_local_version) {
                    delta_gossip_digest_list.emplace_back(g_digest.get_endpoint(), remote_generation, max_local_version);
                } else if (max_remote_version < max_local_version) {
                    /* send all data with generation = localgeneration and version > max_remote_version */
                    send_all(g_digest, delta_ep_state_map, max_remote_version);
                }
            }
        } else {
            /* We are here since we have no data for this endpoint locally so request everything. */
            request_all(g_digest, delta_gossip_digest_list, remote_generation);
        }
    }
}
void gossiper::start(int generation_number) {
start(generation_number, std::map<application_state, versioned_value>());
}
void gossiper::start(int generation_nbr, std::map<application_state, versioned_value> preload_local_states) {
    // Initialize local gossip state (seeds, heartbeat, preloaded app states)
    // and arm the periodic gossip task.
    build_seeds_list();
    /* initialize the heartbeat state for this localEndpoint */
    maybe_initialize_local_state(generation_nbr);
    endpoint_state& local_state = endpoint_state_map[get_broadcast_address()];
    for (auto& entry : preload_local_states) {
        local_state.add_application_state(entry.first, entry.second);
    }
    //notify snitches that Gossiper is about to start
#if 0
    DatabaseDescriptor.getEndpointSnitch().gossiperStarting();
    if (logger.isTraceEnabled())
        logger.trace("gossip started with generation {}", local_state.get_heart_beat_state().get_generation());
#endif
    // Run one gossip round every INTERVAL_IN_MILLIS milliseconds.
    std::chrono::milliseconds period(INTERVAL_IN_MILLIS);
    _scheduled_gossip_task.arm_periodic(period);
}
void gossiper::do_shadow_round() {
    // Shadow round: probe the seeds with an empty SYN (no digests) to learn
    // cluster state before actually joining gossip. The first ACK received
    // ends the round via finish_shadow_round().
    build_seeds_list();
    // send a completely empty syn
    std::vector<gossip_digest> g_digests;
    gossip_digest_syn message(get_cluster_name(), get_partitioner_name(), g_digests);
    _in_shadow_round = true;
    for (inet_address seed : _seeds) {
        auto id = get_shard_id(seed);
        ms().send_message<gossip_digest_ack>(messaging_verb::GOSSIP_DIGEST_SYN,
            std::move(id), std::move(message)).then([this, id] (gossip_digest_ack ack_msg) {
            this->set_last_processed_message_at(now_millis());
            if (this->is_in_shadow_round()) {
                this->finish_shadow_round();
            }
        });
    }
    // FIXME: Implemnt the wait logic below
#if 0
    int slept = 0;
    try {
        while (true) {
            Thread.sleep(1000);
            if (!_in_shadow_round)
                break;
            slept += 1000;
            if (slept > StorageService.RING_DELAY)
                throw new RuntimeException("Unable to gossip with any _seeds");
        }
    } catch (InterruptedException wtf) {
        throw new RuntimeException(wtf);
    }
#endif
}
void gossiper::build_seeds_list() {
for (inet_address seed : get_seeds() ) {
if (seed == get_broadcast_address()) {
continue;
}
_seeds.emplace(seed);
}
}
void gossiper::maybe_initialize_local_state(int generation_nbr) {
    // Seed the local endpoint's gossip state with a fresh heartbeat
    // generation, marked alive — but never clobber state that already
    // exists for this endpoint.
    heart_beat_state hb_state(generation_nbr);
    endpoint_state local_state(hb_state);
    local_state.mark_alive();
    inet_address ep = get_broadcast_address();
    // map::emplace is a no-op when the key already exists, so the original
    // separate find() probe was a redundant second lookup.
    endpoint_state_map.emplace(ep, local_state);
}
void gossiper::add_saved_endpoint(inet_address ep) {
    // Register an endpoint learned from persisted state (not from live
    // gossip): it starts dead, with generation 0, and is placed in the
    // unreachable set until gossip proves otherwise.
    if (ep == get_broadcast_address()) {
        // logger.debug("Attempt to add self as saved endpoint");
        return;
    }
    //preserve any previously known, in-memory data about the endpoint (such as DC, RACK, and so on)
    auto ep_state = endpoint_state(heart_beat_state(0));
    auto it = endpoint_state_map.find(ep);
    if (it != endpoint_state_map.end()) {
        ep_state = it->second;
        // logger.debug("not replacing a previous ep_state for {}, but reusing it: {}", ep, ep_state);
        // Reset the heartbeat so a live node's state always wins later.
        ep_state.set_heart_beat_state(heart_beat_state(0));
    }
    ep_state.mark_dead();
    endpoint_state_map[ep] = ep_state;
    _unreachable_endpoints[ep] = now_nanos();
    // if (logger.isTraceEnabled())
    //     logger.trace("Adding saved endpoint {} {}", ep, ep_state.get_heart_beat_state().get_generation());
}
void gossiper::add_local_application_state(application_state state, versioned_value value) {
    // Set one application state on this node's own endpoint state, firing
    // before_change notifications first and on_change notifications after.
    // Requires the local endpoint state to exist (see
    // maybe_initialize_local_state).
    inet_address ep_addr = get_broadcast_address();
    assert(endpoint_state_map.count(ep_addr));
    endpoint_state& ep_state = endpoint_state_map.at(ep_addr);
    // Fire "before change" notifications:
    do_before_change_notifications(ep_addr, ep_state, state, value);
    // Notifications may have taken some time, so preventively raise the version
    // of the new value, otherwise it could be ignored by the remote node
    // if another value with a newer version was received in the meantime:
    // FIXME:
    // value = StorageService.instance.valueFactory.cloneWithHigherVersion(value);
    // Add to local application state and fire "on change" notifications:
    ep_state.add_application_state(state, value);
    do_on_change_notifications(ep_addr, state, value);
}
void gossiper::add_lccal_application_states(std::list<std::pair<application_state, versioned_value> > states) {
    // Apply a batch of application states to the local endpoint, one at a
    // time through add_local_application_state.
    // Note: the taskLock from the Origin code is removed; a simpler data
    // structure probably suffices here.
    for (auto& state_and_value : states) {
        add_local_application_state(state_and_value.first, state_and_value.second);
    }
}
void gossiper::stop() {
    // Announce shutdown to all live peers. Task cancellation and the
    // pre-shutdown sleep from the Origin code are not yet implemented.
    warn(unimplemented::cause::GOSSIP);
    // if (scheduledGossipTask != null)
    //     scheduledGossipTask.cancel(false);
    // logger.info("Announcing shutdown");
    // Uninterruptibles.sleepUninterruptibly(INTERVAL_IN_MILLIS * 2, TimeUnit.MILLISECONDS);
    for (inet_address ep : _live_endpoints) {
        // Fire-and-forget: the continuation is empty and replies are ignored.
        ms().send_message_oneway<void>(messaging_verb::GOSSIP_SHUTDOWN, get_shard_id(ep), ep).then([]{
        });
    }
}
bool gossiper::is_enabled() {
    // FIXME: should report whether the periodic gossip task is scheduled and
    // not cancelled; until then gossip is always reported as enabled.
    //return (scheduledGossipTask != null) && (!scheduledGossipTask.isCancelled());
    warn(unimplemented::cause::GOSSIP);
    return true;
}
void gossiper::finish_shadow_round() {
    // End the shadow round started by do_shadow_round(); calling this when
    // no round is in progress is a no-op.
    if (!_in_shadow_round) {
        return;
    }
    _in_shadow_round = false;
}
bool gossiper::is_in_shadow_round() {
    // True while a shadow round started by do_shadow_round() is still
    // waiting for its first ACK.
    return _in_shadow_round;
}
void gossiper::add_expire_time_for_endpoint(inet_address endpoint, int64_t expire_time) {
    // Record (overwriting any previous value) the time at which this
    // endpoint's gossip state may be evicted.
    // if (logger.isDebugEnabled()) {
    //     logger.debug("adding expire time for endpoint : {} ({})", endpoint, expire_time);
    // }
    _expire_time_endpoint_map[endpoint] = expire_time;
}
int64_t gossiper::compute_expire_time() {
    // Default eviction deadline: A_VERY_LONG_TIME from now (milliseconds).
    auto expire_at = now_millis() + A_VERY_LONG_TIME;
    return expire_at;
}
void gossiper::dump_endpoint_state_map() {
    // Debugging aid: print every known endpoint and its gossip state.
    print("----------- endpoint_state_map: -----------\n");
    for (auto& entry : endpoint_state_map) {
        auto& endpoint = entry.first;
        auto& state = entry.second;
        print("ep=%s, eps=%s\n", endpoint, state);
    }
}
} // namespace gms