Files
scylladb/gms/failure_detector.cc
Pekka Enberg 38a54df863 Fix pre-ScyllaDB copyright statements
People keep tripping over the old copyrights and copy-pasting them to
new files. Search and replace "Cloudius Systems" with "ScyllaDB".

Message-Id: <1460013664-25966-1-git-send-email-penberg@scylladb.com>
2016-04-08 08:12:47 +03:00

302 lines
10 KiB
C++

/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Modified by ScyllaDB
* Copyright (C) 2015 ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#include "gms/failure_detector.hh"
#include "gms/gossiper.hh"
#include "gms/i_failure_detector.hh"
#include "gms/i_failure_detection_event_listener.hh"
#include "gms/endpoint_state.hh"
#include "gms/application_state.hh"
#include "gms/inet_address.hh"
#include "log.hh"
#include <chrono>
#include <iostream>
#include <utility>
namespace gms {
// File-local logger; all messages from this file are emitted under the
// "failure_detector" logger name.
static logging::logger logger("failure_detector");
// Out-of-class definition of the constexpr static data member; the initializer
// lives with its in-class declaration.
constexpr std::chrono::milliseconds failure_detector::DEFAULT_MAX_PAUSE;
// Shorthand for the clock type used by arrival_window throughout this file.
using clk = arrival_window::clk;
// Returns the default heartbeat inter-arrival interval.
//
// The Java code kept under "#if 0" is the untranslated origin implementation,
// which lets the "cassandra.fd_initial_value_ms" property override the value.
// That override is not implemented here: an "unimplemented" warning is issued
// for the GOSSIP cause and a fixed two seconds is returned.
static clk::duration get_initial_value() {
#if 0
    String newvalue = System.getProperty("cassandra.fd_initial_value_ms");
    if (newvalue == null)
    {
        return Gossiper.intervalInMillis * 2;
    }
    else
    {
        logger.info("Overriding FD INITIAL_VALUE to {}ms", newvalue);
        return Integer.parseInt(newvalue);
    }
#endif
    warn(unimplemented::cause::GOSSIP);
    const clk::duration initial_interval = std::chrono::seconds(2);
    return initial_interval;
}
// Returns the largest inter-arrival interval that add() will record.
//
// The Java code kept under "#if 0" is the untranslated origin implementation,
// which lets the "cassandra.fd_max_interval_ms" property override the value.
// That override is not implemented here: an "unimplemented" warning is issued
// for the GOSSIP cause and the result of get_initial_value() is returned.
clk::duration arrival_window::get_max_interval() {
#if 0
    sstring newvalue = System.getProperty("cassandra.fd_max_interval_ms");
    if (newvalue == null)
    {
        return failure_detector.INITIAL_VALUE_NANOS;
    }
    else
    {
        logger.info("Overriding FD MAX_INTERVAL to {}ms", newvalue);
        return TimeUnit.NANOSECONDS.convert(Integer.parseInt(newvalue), TimeUnit.MILLISECONDS);
    }
#endif
    warn(unimplemented::cause::GOSSIP);
    const auto max_interval = get_initial_value();
    return max_interval;
}
// Records the arrival of a heartbeat observed at `value` for endpoint `ep`.
//
// For the very first sample (no previous arrival time stored) the window is
// seeded with get_initial_value(). For later samples the gap since the
// previous arrival is recorded, unless it exceeds get_max_interval(), in
// which case it is only logged. The arrival time is always remembered.
void arrival_window::add(clk::time_point value, const gms::inet_address& ep) {
    const bool first_sample = !(_tlast > clk::time_point::min());
    if (first_sample) {
        // We use a very large initial interval since the "right" average depends on the cluster size
        // and it's better to err high (false negatives, which will be corrected by waiting a bit longer)
        // than low (false positives, which cause "flapping").
        _arrival_intervals.add(get_initial_value().count());
    } else {
        const auto gap = value - _tlast;
        if (gap <= get_max_interval()) {
            _arrival_intervals.add(gap.count());
        } else {
            logger.debug("failure_detector: Ignoring interval time of {} for {}", gap.count(), ep);
        }
    }
    _tlast = value;
}
// Returns the mean of the recorded inter-arrival intervals, as computed by
// the underlying interval container.
double arrival_window::mean() {
    const double average = _arrival_intervals.mean();
    return average;
}
// Computes the suspicion value at time `tnow`: the time elapsed since the
// last recorded arrival divided by the mean inter-arrival interval.
//
// Precondition (asserted): at least one sample has been added.
double arrival_window::phi(clk::time_point tnow) {
    assert(_arrival_intervals.size() > 0 && _tlast > clk::time_point::min()); // should not be called before any samples arrive
    const auto elapsed = (tnow - _tlast).count();
    const auto average = mean();
    const double ratio = elapsed / average;
    logger.debug("failure_detector: now={}, tlast={}, t={}, mean={}, phi={}",
            tnow.time_since_epoch().count(), _tlast.time_since_epoch().count(), elapsed, average, ratio);
    return ratio;
}
// Streams every recorded interval of the window, each followed by a space.
std::ostream& operator<<(std::ostream& os, const arrival_window& w) {
    for (const auto& interval : w._arrival_intervals.deque()) {
        os << interval << " ";
    }
    return os;
}
// Builds a textual dump of every endpoint in the local gossiper's endpoint
// state map: the endpoint on its own line, followed by the details written
// by append_endpoint_state().
sstring failure_detector::get_all_endpoint_states() {
    std::stringstream out;
    for (auto& kv : get_local_gossiper().endpoint_state_map) {
        out << kv.first << "\n";
        append_endpoint_state(out, kv.second);
    }
    return sstring(out.str());
}
std::map<sstring, sstring> failure_detector::get_simple_states() {
std::map<sstring, sstring> nodes_status;
for (auto& entry : get_local_gossiper().endpoint_state_map) {
auto& ep = entry.first;
auto& state = entry.second;
std::stringstream ss;
ss << ep;
if (state.is_alive())
nodes_status.emplace(sstring(ss.str()), "UP");
else
nodes_status.emplace(sstring(ss.str()), "DOWN");
}
return nodes_status;
}
// Counts the endpoints in the local gossiper's endpoint state map whose
// state is not alive.
int failure_detector::get_down_endpoint_count() {
    int down = 0;
    for (auto& kv : get_local_gossiper().endpoint_state_map) {
        down += kv.second.is_alive() ? 0 : 1;
    }
    return down;
}
// Counts the endpoints in the local gossiper's endpoint state map whose
// state is alive.
int failure_detector::get_up_endpoint_count() {
    int up = 0;
    for (auto& kv : get_local_gossiper().endpoint_state_map) {
        up += kv.second.is_alive() ? 1 : 0;
    }
    return up;
}
// Returns the textual state of the endpoint named by `address`, as written
// by append_endpoint_state(), or "unknown endpoint <address>" when the local
// gossiper has no state for it.
sstring failure_detector::get_endpoint_state(sstring address) {
    auto eps = get_local_gossiper().get_endpoint_state_for_endpoint(inet_address(address));
    if (!eps) {
        return sstring("unknown endpoint ") + address;
    }
    std::stringstream ss;
    append_endpoint_state(ss, *eps);
    return sstring(ss.str());
}
// Appends a human-readable description of `state` to `ss`:
//   - the heartbeat generation and heartbeat version,
//   - one "<state>:<version>:<value>" line per application state entry,
//     except TOKENS,
//   - a final TOKENS line that shows only the version (the value is replaced
//     by "<hidden>"), or "not present" when there is no TOKENS entry.
void failure_detector::append_endpoint_state(std::stringstream& ss, endpoint_state& state) {
    ss << " generation:" << state.get_heart_beat_state().get_generation() << "\n";
    ss << " heartbeat:" << state.get_heart_beat_state().get_heart_beat_version() << "\n";

    const auto& app_states = state.get_application_state_map();
    for (const auto& kv : app_states) {
        // TOKENS is reported separately below, without its value.
        if (kv.first == application_state::TOKENS) {
            continue;
        }
        ss << " " << kv.first << ":" << kv.second.version << ":" << kv.second.value << "\n";
    }

    const auto tokens = app_states.find(application_state::TOKENS);
    if (tokens != app_states.end()) {
        ss << " TOKENS:" << tokens->second.version << ":<hidden>\n";
    } else {
        ss << " TOKENS: not present" << "\n";
    }
}
void failure_detector::set_phi_convict_threshold(double phi) {
_phi = phi;
}
double failure_detector::get_phi_convict_threshold() {
return _phi;
}
// Reports whether `ep` is alive, delegating the question to the local gossiper.
bool failure_detector::is_alive(inet_address ep) {
    auto&& gossiper = get_local_gossiper();
    return gossiper.is_alive(ep);
}
// Records a heartbeat from `ep` observed now.
//
// If `ep` has no arrival window yet, a new one of SAMPLE_SIZE is created,
// seeded with this arrival and stored; otherwise the arrival is added to the
// existing window.
void failure_detector::report(inet_address ep) {
    logger.trace("failure_detector: reporting {}", ep);
    auto now = clk::now();
    auto it = _arrival_samples.find(ep);
    if (it == _arrival_samples.end()) {
        // avoid adding an empty ArrivalWindow to the Map
        auto heartbeat_window = arrival_window(SAMPLE_SIZE);
        heartbeat_window.add(now, ep);
        // The local window is not used again, so move it into the map
        // instead of copying its interval storage.
        _arrival_samples.emplace(ep, std::move(heartbeat_window));
    } else {
        it->second.add(now, ep);
    }
}
// Runs inside seastar::async context
//
// Evaluates whether `ep` should be convicted as down.
//
// Steps:
//   1. Return immediately if no arrival window exists for `ep`.
//   2. Measure the time since the previous interpret() call. If it exceeds
//      get_max_local_pause(), log a warning, record the pause time and
//      return without convicting.
//   3. Return without convicting while still within get_max_local_pause()
//      of the last recorded pause.
//   4. Compute phi for the endpoint; if PHI_FACTOR * phi exceeds the
//      conviction threshold, call convict(ep, phi) on every registered
//      listener.
void failure_detector::interpret(inet_address ep) {
    auto it = _arrival_samples.find(ep);
    if (it == _arrival_samples.end()) {
        return;
    }
    arrival_window& hb_wnd = it->second;
    auto now = clk::now();
    if (!_last_interpret) {
        // Assign to the holder itself, not through operator*: dereferencing
        // an empty holder is undefined behaviour and would leave it empty,
        // so the elapsed time below would always be measured against `now`.
        // NOTE(review): assumes _last_interpret is an optional<clk::time_point>
        // declared in the header - confirm.
        _last_interpret = now;
    }
    auto diff = std::chrono::duration_cast<std::chrono::milliseconds>(now - *_last_interpret);
    *_last_interpret = now;
    if (diff > get_max_local_pause()) {
        logger.warn("Not marking nodes down due to local pause of {} > {} (milliseconds)", diff.count(), get_max_local_pause().count());
        _last_paused = now;
        return;
    }
    if (clk::now() - _last_paused < get_max_local_pause()) {
        logger.debug("Still not marking nodes down due to local pause");
        return;
    }
    double phi = hb_wnd.phi(now);
    logger.trace("failure_detector: PHI for {} : {}", ep, phi);
    logger.trace("failure_detector: phi_convict_threshold={}", _phi);
    if (PHI_FACTOR * phi > get_phi_convict_threshold()) {
        logger.trace("failure_detector: notifying listeners that {} is down", ep);
        logger.trace("failure_detector: intervals: {} mean: {}", hb_wnd, hb_wnd.mean());
        for (auto& listener : _fd_evnt_listeners) {
            logger.debug("failure_detector: convict ep={} phi={}", ep, phi);
            listener->convict(ep, phi);
        }
    }
}
// Runs inside seastar::async context
//
// Unconditionally notifies every registered listener that `ep` is convicted,
// passing the current conviction threshold as the phi value.
void failure_detector::force_conviction(inet_address ep) {
    logger.debug("failure_detector: Forcing conviction of {}", ep);
    for (auto& subscriber : _fd_evnt_listeners) {
        // The threshold is re-read for each listener, as in the plain loop form.
        subscriber->convict(ep, get_phi_convict_threshold());
    }
}
// Drops the arrival window stored for `ep`, if any.
void failure_detector::remove(inet_address ep) {
    this->_arrival_samples.erase(ep);
}
// Appends `listener` to the list of listeners notified on conviction.
//
// Only the raw pointer is stored. No ownership transfer is expressed here,
// so the pointer is copied rather than wrapped in std::move (which is a
// no-op for a raw pointer and misleadingly suggests a hand-over).
void failure_detector::register_failure_detection_event_listener(i_failure_detection_event_listener* listener) {
    _fd_evnt_listeners.push_back(listener);
}
// Removes every occurrence of `listener` from the list of listeners notified
// on conviction.
void failure_detector::unregister_failure_detection_event_listener(i_failure_detection_event_listener* listener) {
    this->_fd_evnt_listeners.remove(listener);
}
std::ostream& operator<<(std::ostream& os, const failure_detector& x) {
for (auto& entry : x._arrival_samples) {
const inet_address& ep = entry.first;
const arrival_window& win = entry.second;
os << ep << " : " << win << "\n";
}
return os;
}
// Namespace-scope distributed<> container for the failure_detector service;
// defined here with external linkage.
distributed<failure_detector> _the_failure_detector;
} // namespace gms