Section 3.10 of the PhD describes two cases for which the extension can be helpful: 1. Sometimes the leader must step down. For example, it may need to reboot for maintenance, or it may be removed from the cluster. When it steps down, the cluster will be idle for an election timeout until another server times out and wins an election. This brief unavailability can be avoided by having the leader transfer its leadership to another server before it steps down. 2. In some cases, one or more servers may be more suitable to lead the cluster than others. For example, a server with high load would not make a good leader, or in a WAN deployment, servers in a primary datacenter may be preferred in order to minimize the latency between clients and the leader. Other consensus algorithms may be able to accommodate these preferences during leader election, but Raft needs a server with a sufficiently up-to-date log to become leader, which might not be the most preferred one. Instead, a leader in Raft can periodically check to see whether one of its available followers would be more suitable, and if so, transfer its leadership to that server. (If only human leaders were so graceful.) The patch here implements the extension and employs it automatically when a leader removes itself from a cluster.
196 lines
6.2 KiB
C++
196 lines
6.2 KiB
C++
/*
|
|
* Copyright (C) 2020 ScyllaDB
|
|
*/
|
|
|
|
/*
|
|
* This file is part of Scylla.
|
|
*
|
|
* Scylla is free software: you can redistribute it and/or modify
|
|
* it under the terms of the GNU Affero General Public License as published by
|
|
* the Free Software Foundation, either version 3 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* Scylla is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
#pragma once
|
|
|
|
#include <seastar/core/condition-variable.hh>
|
|
#include "raft.hh"
|
|
#include "logical_clock.hh"
|
|
|
|
namespace raft {
|
|
|
|
// Leader's view of each follower, including self.
|
|
class follower_progress {
|
|
public:
|
|
// Id of this server
|
|
const server_id id;
|
|
// Index of the next log entry to send to this server.
|
|
index_t next_idx;
|
|
// Index of the highest log entry known to be replicated to this
|
|
// server.
|
|
index_t match_idx = index_t(0);
|
|
// Index that we know to be committed by the follower
|
|
index_t commit_idx = index_t(0);
|
|
// True if the follower is voting one
|
|
bool can_vote = true;
|
|
|
|
enum class state {
|
|
// In this state only one append entry is send until matching index is found
|
|
PROBE,
|
|
// In this state multiple append entries are sent optimistically
|
|
PIPELINE,
|
|
// In this state snapshot is been transfered
|
|
SNAPSHOT
|
|
};
|
|
state state = state::PROBE;
|
|
// true if a packet was sent already in a probe mode
|
|
bool probe_sent = false;
|
|
// number of in flight still un-acked append entries requests
|
|
size_t in_flight = 0;
|
|
static constexpr size_t max_in_flight = 10;
|
|
|
|
// check if a reject packet should be ignored because it was delayed
|
|
// or reordered
|
|
bool is_stray_reject(const append_reply::rejected&);
|
|
|
|
void become_probe();
|
|
void become_pipeline();
|
|
void become_snapshot(index_t snp_idx);
|
|
|
|
void accepted(index_t idx) {
|
|
// AppendEntries replies can arrive out of order.
|
|
match_idx = std::max(idx, match_idx);
|
|
// idx may be smaller if we increased next_idx
|
|
// optimistically in pipeline mode.
|
|
next_idx = std::max(idx + index_t{1}, next_idx);
|
|
}
|
|
|
|
// Return true if a new replication record can be sent to the follower.
|
|
bool can_send_to();
|
|
|
|
follower_progress(server_id id_arg, index_t next_idx_arg)
|
|
: id(id_arg), next_idx(next_idx_arg)
|
|
{}
|
|
};
|
|
|
|
using progress = std::unordered_map<server_id, follower_progress>;
|
|
|
|
class tracker: private progress {
|
|
// Copy of this server's id
|
|
server_id _my_id;
|
|
std::unordered_set<server_id> _current_voters;
|
|
std::unordered_set<server_id> _previous_voters;
|
|
|
|
// Not NULL if the leader is part of the current configuration.
|
|
//
|
|
// 4.2.2 Removing the current leader
|
|
// There will be a period of time (while it is committing
|
|
// C_new) when a leader can manage a cluster that does not
|
|
// include itself; it replicates log entries but does not
|
|
// count itself in majorities.
|
|
follower_progress *_leader_progress = nullptr;
|
|
public:
|
|
using progress::begin, progress::end, progress::cbegin, progress::cend, progress::size;
|
|
|
|
explicit tracker(server_id my_id)
|
|
: _my_id(my_id)
|
|
{}
|
|
|
|
// Return progress for a follower
|
|
// May return nullptr if the follower is not part of the current
|
|
// configuration any more. This may happen when handling
|
|
// messages from removed followers.
|
|
follower_progress* find(server_id dst) {
|
|
auto it = this->progress::find(dst);
|
|
return it == this->progress::end() ? nullptr : &it->second;
|
|
}
|
|
void set_configuration(const configuration& configuration, index_t next_idx);
|
|
// Return progress object for the current leader if it's
|
|
// part of the current configuration.
|
|
follower_progress* leader_progress() {
|
|
return _leader_progress;
|
|
}
|
|
// Calculate the current commit index based on the current
|
|
// simple or joint quorum.
|
|
index_t committed(index_t prev_commit_idx);
|
|
};
|
|
|
|
// Possible leader election outcomes.
|
|
enum class vote_result {
|
|
// We haven't got enough responses yet, either because
|
|
// the servers haven't voted or responses failed to arrive.
|
|
UNKNOWN = 0,
|
|
// This candidate has won the election
|
|
WON,
|
|
// The quorum of servers has voted against this candidate
|
|
LOST,
|
|
};
|
|
|
|
std::ostream& operator<<(std::ostream& os, const vote_result& v);
|
|
|
|
// State of election in a single quorum
|
|
class election_tracker {
|
|
// All eligible voters
|
|
std::unordered_set<server_id> _suffrage;
|
|
// Votes collected
|
|
std::unordered_set<server_id> _responded;
|
|
size_t _granted = 0;
|
|
public:
|
|
election_tracker(const server_address_set& configuration) {
|
|
for (const auto& a : configuration) {
|
|
if (a.can_vote) {
|
|
_suffrage.emplace(a.id);
|
|
}
|
|
}
|
|
}
|
|
|
|
bool register_vote(server_id from, bool granted) {
|
|
if (_suffrage.find(from) == _suffrage.end()) {
|
|
return false;
|
|
}
|
|
if (_responded.emplace(from).second) {
|
|
// Have not counted this vote yet
|
|
_granted += static_cast<int>(granted);
|
|
}
|
|
return true;
|
|
}
|
|
vote_result tally_votes() const {
|
|
auto quorum = _suffrage.size() / 2 + 1;
|
|
if (_granted >= quorum) {
|
|
return vote_result::WON;
|
|
}
|
|
assert(_responded.size() <= _suffrage.size());
|
|
auto unknown = _suffrage.size() - _responded.size();
|
|
return _granted + unknown >= quorum ? vote_result::UNKNOWN : vote_result::LOST;
|
|
}
|
|
friend std::ostream& operator<<(std::ostream& os, const election_tracker& v);
|
|
};
|
|
|
|
// Candidate's state specific to election
|
|
class votes {
|
|
server_address_set _voters;
|
|
election_tracker _current;
|
|
std::optional<election_tracker> _previous;
|
|
public:
|
|
votes(configuration configuration);
|
|
|
|
const server_address_set& voters() const {
|
|
return _voters;
|
|
}
|
|
|
|
void register_vote(server_id from, bool granted);
|
|
vote_result tally_votes() const;
|
|
|
|
friend std::ostream& operator<<(std::ostream& os, const votes& v);
|
|
};
|
|
|
|
} // namespace raft
|
|
|