Files
scylladb/raft/tracker.cc
Gleb Natapov 9d6bf7f351 raft: introduce leader stepdown procedure
Section 3.10 of the PhD describes two cases for which the extension can
be helpful:

1. Sometimes the leader must step down. For example, it may need to reboot
 for maintenance, or it may be removed from the cluster. When it steps
 down, the cluster will be idle for an election timeout until another
 server times out and wins an election. This brief unavailability can be
 avoided by having the leader transfer its leadership to another server
 before it steps down.

2. In some cases, one or more servers may be more suitable to lead the
 cluster than others. For example, a server with high load would not make
 a good leader, or in a WAN deployment, servers in a primary datacenter
 may be preferred in order to minimize the latency between clients and
 the leader. Other consensus algorithms may be able to accommodate these
 preferences during leader election, but Raft needs a server with a
 sufficiently up-to-date log to become leader, which might not be the
 most preferred one. Instead, a leader in Raft can periodically check
 to see whether one of its available followers would be more suitable,
 and if so, transfer its leadership to that server. (If only human leaders
 were so graceful.)

The patch here implements the extension and employs it automatically
when a leader removes itself from a cluster.
2021-03-22 10:28:43 +02:00

281 lines
9.0 KiB
C++

/*
* Copyright (C) 2020 ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#include "tracker.hh"
#include <seastar/core/coroutine.hh>
#include <seastar/core/on_internal_error.hh>
namespace raft {
// Tells whether a rejected append reply is a stray one that should be
// ignored, judging by the replication state this follower is
// currently in.
//
// Returns true if the reply is stray, false if it is a valid reject.
bool follower_progress::is_stray_reject(const append_reply::rejected& rejected) {
    switch (state) {
    case follower_progress::state::PIPELINE:
        // A rejected index that is not larger than the already
        // matched index means this is a stray reply.
        return rejected.non_matching_idx <= match_idx;
    case follower_progress::state::PROBE:
        // Only one append request is outstanding in the probe state,
        // so the reply is valid only if it refers to next_idx - 1.
        return rejected.non_matching_idx != index_t(next_idx - 1);
    case follower_progress::state::SNAPSHOT:
        // Any reject received during a snapshot transfer is stray.
        return true;
    default:
        assert(false);
    }
    return false;
}
// Enter the PROBE state. probe_sent is cleared, so can_send_to()
// reports that a probe may be sent.
void follower_progress::become_probe() {
    probe_sent = false;
    state = state::PROBE;
}
// Enter the PIPELINE state. This is a no-op when already pipelining,
// so the in-flight counter is reset only on an actual transition.
void follower_progress::become_pipeline() {
    if (state == state::PIPELINE) {
        return;
    }
    // A previous request was accepted, so the follower's log state is
    // now known: start pipelining with nothing in flight.
    in_flight = 0;
    state = state::PIPELINE;
}
// Enter the SNAPSHOT state for a snapshot taken at index `snp_idx`.
void follower_progress::become_snapshot(index_t snp_idx) {
    // Once the snapshot transfer succeeds, replication resumes from
    // the index right after the snapshot. If it fails, the follower's
    // index is learned again by sending a probe request.
    next_idx = snp_idx + index_t{1};
    state = state::SNAPSHOT;
}
// Tells whether another append request may be sent to this follower
// right now, depending on its replication state.
bool follower_progress::can_send_to() {
    if (state == state::PROBE) {
        // Only a single probe may be outstanding.
        return !probe_sent;
    }
    if (state == state::PIPELINE) {
        // allow `max_in_flight` outstanding indexes
        // FIXME: make it smarter
        return in_flight < follower_progress::max_in_flight;
    }
    if (state == state::SNAPSHOT) {
        // Waiting for a snapshot to be transferred before starting
        // to sync the log.
        return false;
    }
    // Not one of the known states.
    assert(false);
    return false;
}
// Rebuild the progress map and the voter sets for `configuration`.
//
// If this is called when a tracker is just created, the current
// progress is empty and an instance is simply created for each
// server. When switching configurations, progress of existing
// followers is preserved, progress for new ones is created starting
// at `next_idx`, and progress of non-members is removed (to make
// sure we don't send noise messages to them).
void tracker::set_configuration(const configuration& configuration, index_t next_idx) {
    _current_voters.clear();
    _previous_voters.clear();
    _leader_progress = nullptr;

    // Move the existing entries aside; only those which are still
    // present in the configuration are moved back below.
    progress old_progress = std::move(*this);

    auto add_members = [&](const server_address_set& members, std::unordered_set<server_id>& voter_ids) {
        for (const auto& srv : members) {
            if (srv.can_vote) {
                voter_ids.emplace(srv.id);
            }
            if (this->progress::find(srv.id) != this->progress::end()) {
                // Processing a joint configuration and an entry for
                // this id has already been added.
                continue;
            }
            auto saved = old_progress.find(srv.id);
            auto entry = saved != old_progress.end()
                    ? this->progress::emplace(srv.id, std::move(saved->second)).first
                    : this->progress::emplace(srv.id, follower_progress{srv.id, next_idx}).first;
            entry->second.can_vote = srv.can_vote;
            if (srv.id == _my_id) {
                // The leader is part of the current configuration.
                _leader_progress = &entry->second;
            }
        }
    };

    add_members(configuration.current, _current_voters);
    if (configuration.is_joint()) {
        add_members(configuration.previous, _previous_voters);
    }
}
// Collects the match indexes of a group of nodes and finds the pivot
// which serves as the commit index of the group.
class match_vector {
    std::vector<index_t> _match;
    // Number of pushed match indexes that are larger than the
    // previous commit index.
    size_t _count = 0;
    index_t _prev_commit_idx;
public:
    // `reserve_size` is the expected number of push_back() calls.
    explicit match_vector(index_t prev_commit_idx, size_t reserve_size)
            : _prev_commit_idx(prev_commit_idx) {
        _match.reserve(reserve_size);
    }

    // Record the match index of one node.
    void push_back(index_t match_idx) {
        _count += (match_idx > _prev_commit_idx) ? 1 : 0;
        _match.push_back(match_idx);
    }

    // True if a majority of the recorded nodes have a match index
    // larger than the previous commit index.
    bool committed() const {
        return _count > _match.size() / 2;
    }

    // Returns the match index of the pivot node. Reorders the
    // recorded indexes as a side effect.
    index_t commit_idx() {
        logger.trace("check committed count {} cluster size {}", _count, _match.size());
        // The index of the pivot node is selected so that all nodes
        // with a larger match index plus the pivot form a majority,
        // for example:
        //
        //   cluster size   pivot node   majority
        //        1             0           1
        //        2             0           2
        //        3             1           2
        //        4             1           3
        //        5             2           3
        //
        const auto pivot_it = _match.begin() + (_match.size() - 1) / 2;
        std::nth_element(_match.begin(), pivot_it, _match.end());
        return *pivot_it;
    }
};
// Computes the commit index of the group from the match indexes of
// the voters.
//
// Returns `prev_commit_idx` unchanged unless a majority of voters has
// a match index larger than it. In a joint configuration (the set of
// previous voters is not empty) this must hold for both the current
// and the previous voters, and the smaller of the two pivots is
// returned.
index_t tracker::committed(index_t prev_commit_idx) {
    match_vector current(prev_commit_idx, _current_voters.size());

    if (_previous_voters.empty()) {
        // Simple configuration: only the current voters count.
        for (const auto& [id, p] : *this) {
            if (_current_voters.contains(p.id)) {
                current.push_back(p.match_idx);
            }
        }
        return current.committed() ? current.commit_idx() : prev_commit_idx;
    }

    // Joint configuration: a node may be a voter in either or both of
    // the configurations.
    match_vector previous(prev_commit_idx, _previous_voters.size());
    for (const auto& [id, p] : *this) {
        if (_current_voters.contains(p.id)) {
            current.push_back(p.match_idx);
        }
        if (_previous_voters.contains(p.id)) {
            previous.push_back(p.match_idx);
        }
    }
    if (!(current.committed() && previous.committed())) {
        return prev_commit_idx;
    }
    const index_t current_pivot = current.commit_idx();
    const index_t previous_pivot = previous.commit_idx();
    return std::min(current_pivot, previous_pivot);
}
// Set up vote tracking for `configuration`. `_voters` ends up holding
// the voting members of the current configuration and, for a joint
// configuration, of the previous one as well.
votes::votes(configuration configuration)
        : _voters(configuration.current)
        , _current(configuration.current) {
    if (configuration.is_joint()) {
        const auto& old_members = configuration.previous;
        _previous.emplace(old_members);
        _voters.insert(old_members.begin(), old_members.end());
    }
    // Filter out non voting members
    auto is_non_voter = [] (const server_address& srv) {
        return !srv.can_vote;
    };
    std::erase_if(_voters, is_non_voter);
}
// Record a vote received from `from`. The vote is offered to the
// tracker of the current configuration and, if there is one, to the
// tracker of the previous configuration.
void votes::register_vote(server_id from, bool granted) {
    // Both trackers must always see the vote, so neither call may be
    // skipped once the other has accepted it.
    const bool in_current = _current.register_vote(from, granted);
    const bool in_previous = _previous && _previous->register_vote(from, granted);

    // We can get an outdated vote from a node that is now non-voting member.
    // Such vote should be ignored.
    if (!in_current && !in_previous) {
        logger.info("Got a vote from unregistered server {} during election", from);
    }
}
// Returns the current outcome of the election.
//
// If there is a previous configuration and its outcome is anything
// other than WON, that outcome is returned as is. Otherwise the
// outcome in the current configuration decides.
vote_result votes::tally_votes() const {
    if (_previous) {
        if (auto outcome = _previous->tally_votes(); outcome != vote_result::WON) {
            return outcome;
        }
    }
    return _current.tally_votes();
}
// Prints how many servers have responded and the granted count.
std::ostream& operator<<(std::ostream& os, const election_tracker& v) {
    return os << "responded: " << v._responded.size() << ", "
              << "granted: " << v._granted;
}
// Prints the vote tracker of the current configuration and, if
// present, the one of the previous configuration, one per line.
std::ostream& operator<<(std::ostream& os, const votes& v) {
    // Terminate lines with '\n' rather than std::endl: the emitted
    // characters are the same, but a formatting operator should not
    // force a flush of the caller's stream.
    os << "current: " << v._current << '\n';
    if (v._previous) {
        os << "previous: " << v._previous.value() << '\n';
    }
    return os;
}
// Prints the symbolic name of a vote_result value.
std::ostream& operator<<(std::ostream& os, const vote_result& v) {
    // Plain automatic variable rather than a function-local static:
    // a static would be written by every call (shared mutable state)
    // and, for a value outside of the enum, would still hold the name
    // chosen by an earlier call or a null pointer.
    const char* name = "INVALID";
    // No default label on purpose, so that the compiler can warn
    // when a new enumerator is not handled here.
    switch (v) {
    case vote_result::UNKNOWN:
        name = "UNKNOWN";
        break;
    case vote_result::WON:
        name = "WON";
        break;
    case vote_result::LOST:
        name = "LOST";
        break;
    }
    os << name;
    return os;
}
} // end of namespace raft