Files
scylladb/raft/tracker.cc
Konstantin Osipov 3478389d60 raft: do not account for the same vote twice
While duplicate votes are not allowed by Raft rules, it is possible
that a vote message is delivered multiple times.

The current voting implementation does reject votes from non-members,
but doesn't check for duplicate votes.

Keep track of who has already voted, and reject duplicate votes.

A unit test follows.
2021-02-16 23:15:16 +03:00

268 lines
8.4 KiB
C++

/*
* Copyright (C) 2020 ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#include "tracker.hh"
#include <seastar/core/coroutine.hh>
#include <seastar/core/on_internal_error.hh>
namespace raft {
// Tells whether a rejected append reply should be ignored as stray,
// judging by the replication state this follower is currently in.
// Returns true if the reject is stray, false if it must be acted upon.
bool follower_progress::is_stray_reject(const append_reply::rejected& rejected) {
    switch (state) {
    case follower_progress::state::PIPELINE:
        // A reject for an index at or below what is already matched
        // cannot refer to anything still outstanding, so it is stray.
        return rejected.non_matching_idx <= match_idx;
    case follower_progress::state::PROBE:
        // Only one append request is outstanding while probing, so the
        // only valid reject is the one for next_idx - 1.
        return rejected.non_matching_idx != index_t(next_idx - 1);
    case follower_progress::state::SNAPSHOT:
        // Every reject received during a snapshot transfer is stray.
        return true;
    default:
        assert(false);
    }
    return false;
}
// Puts the follower into the PROBE state.
// The probe_sent flag is cleared as well: can_send_to() permits a send
// in PROBE state only while that flag is false.
void follower_progress::become_probe() {
    probe_sent = false;
    state = state::PROBE;
}
// Puts the follower into the PIPELINE state, used once a previous
// request was accepted and the follower's log state is therefore known.
// Does nothing if the follower is already pipelining, so an ongoing
// in_flight count is left untouched in that case.
void follower_progress::become_pipeline() {
    if (state == state::PIPELINE) {
        return;
    }
    state = state::PIPELINE;
    // Start counting outstanding requests from scratch.
    in_flight = 0;
}
void follower_progress::become_snapshot() {
state = state::SNAPSHOT;
}
// Tells whether another append request may be sent to this follower
// right now, depending on the replication state it is in.
bool follower_progress::can_send_to() {
    if (state == state::PROBE) {
        // A single probe at a time.
        return !probe_sent;
    }
    if (state == state::PIPELINE) {
        // Allow up to `max_in_flight` outstanding indexes.
        // FIXME: make it smarter
        return in_flight < follower_progress::max_in_flight;
    }
    if (state == state::SNAPSHOT) {
        // Waiting for the snapshot transfer to finish before
        // log synchronization starts; nothing may be sent.
        return false;
    }
    // Not one of the states handled above.
    assert(false);
    return false;
}
// Installs a new configuration and rebuilds the progress map for it.
//
// When called on a freshly created tracker the progress map is empty,
// so an instance is simply created for every server, starting at
// `next_idx`. When switching configurations, progress of servers that
// stay is preserved, progress for new servers is created, and progress
// of servers that are no longer members is dropped (to make sure we
// don't send noise messages to them).
//
// `_leader_progress` is pointed at this server's own entry if it is
// part of the configuration, and is left null otherwise.
void tracker::set_configuration(configuration configuration, index_t next_idx) {
    _configuration = std::move(configuration);
    _leader_progress = nullptr;
    // Move the existing entries aside, then re-add only the ones that
    // are still members.
    // NOTE(review): this relies on the moved-from map being empty.
    progress previous_entries = std::move(*this);

    auto add_members = [&](const server_address_set& members) {
        for (const auto& member : members) {
            if (this->progress::find(member.id) != this->progress::end()) {
                // Joint configuration: this id was already added
                // while processing the other member set.
                continue;
            }
            auto carried = previous_entries.find(member.id);
            auto inserted = (carried != previous_entries.end())
                ? this->progress::emplace(member.id, std::move(carried->second)).first
                : this->progress::emplace(member.id, follower_progress{member.id, next_idx}).first;
            if (member.id == _my_id) {
                // The leader is part of the current configuration.
                _leader_progress = &inserted->second;
            }
        }
    };

    add_members(_configuration.current);
    if (_configuration.is_joint()) {
        add_members(_configuration.previous);
    }
}
// Collects the match indexes of the members of one configuration and
// finds the pivot among them which serves as the commit index of the
// group.
class match_vector {
    // Match indexes pushed so far; partially reordered by commit_idx().
    std::vector<index_t> _match;
    // Number of pushed match indexes that are larger than the
    // previous commit index.
    size_t _count = 0;
    index_t _prev_commit_idx;
public:
    // `reserve_size` is the expected number of push_back() calls.
    explicit match_vector(index_t prev_commit_idx, size_t reserve_size)
            : _prev_commit_idx(prev_commit_idx) {
        _match.reserve(reserve_size);
    }
    // Records the match index of one more member.
    void push_back(index_t match_idx) {
        if (match_idx > _prev_commit_idx) {
            ++_count;
        }
        _match.push_back(match_idx);
    }
    // True if a majority of the recorded members has a match index
    // beyond the previous commit index.
    bool committed() const {
        const size_t majority = _match.size() / 2 + 1;
        return _count >= majority;
    }
    // Returns the match index of the pivot member.
    index_t commit_idx() {
        logger.trace("check committed count {} cluster size {}", _count, _match.size());
        // The pivot position is chosen so that the pivot together with
        // all members having a larger match index form a majority:
        //
        //   cluster size   pivot position   majority
        //        1               0              1
        //        2               0              2
        //        3               1              2
        //        4               1              3
        //        5               2              3
        //
        const auto pivot_pos = (_match.size() - 1) / 2;
        auto pivot_it = _match.begin() + pivot_pos;
        std::nth_element(_match.begin(), pivot_it, _match.end());
        return *pivot_it;
    }
};
// Computes the commit index of the group from the match indexes of the
// tracked servers. Returns `prev_commit_idx` unchanged unless a majority
// has advanced beyond it; in a joint configuration this must hold for
// both the current and the previous member set, and the smaller of the
// two pivots is returned.
index_t tracker::committed(index_t prev_commit_idx) {
    match_vector current(prev_commit_idx, _configuration.current.size());

    if (!_configuration.is_joint()) {
        // Simple configuration: every tracked server counts.
        for (const auto& entry : *this) {
            current.push_back(entry.second.match_idx);
        }
        return current.committed() ? current.commit_idx() : prev_commit_idx;
    }

    match_vector previous(prev_commit_idx, _configuration.previous.size());
    for (const auto& entry : *this) {
        const auto& fp = entry.second;
        const server_address addr{fp.id};
        // A server may belong to either member set, or to both.
        if (_configuration.current.find(addr) != _configuration.current.end()) {
            current.push_back(fp.match_idx);
        }
        if (_configuration.previous.find(addr) != _configuration.previous.end()) {
            previous.push_back(fp.match_idx);
        }
    }
    if (!(current.committed() && previous.committed())) {
        return prev_commit_idx;
    }
    return std::min(current.commit_idx(), previous.commit_idx());
}
// Prepares vote tracking for an election under `configuration`.
// `_voters` and `_current` start from the current member set. For a
// joint configuration the previous member set gets its own tracker in
// `_previous` and its members are added to `_voters` too.
votes::votes(configuration configuration)
        : _voters(configuration.current)
        , _current(configuration.current) {
    if (!configuration.is_joint()) {
        return;
    }
    const auto& old_members = configuration.previous;
    _previous.emplace(old_members);
    _voters.insert(old_members.begin(), old_members.end());
}
// Records the vote of server `from` (granted or not) with the tracker
// of the current member set and, if there is one, with the tracker of
// the previous member set. If neither of them registers the vote, this
// is reported as an internal error.
void votes::register_vote(server_id from, bool granted) {
    // Both trackers are always consulted; a server may be in either set.
    const bool in_current = _current.register_vote(from, granted);
    const bool in_previous = _previous && _previous->register_vote(from, granted);
    if (in_current || in_previous) {
        return;
    }
    // Should never receive a vote not requested, unless an RPC bug.
    seastar::on_internal_error(logger,
        format("Got a vote from unregistered server {} during election", from));
}
// Returns the outcome of the election so far. Without a previous member
// set this is the tally of the current one. With one, the previous set
// is tallied first: unless it is WON its result is returned as is,
// otherwise the tally of the current set decides.
vote_result votes::tally_votes() const {
    if (!_previous) {
        return _current.tally_votes();
    }
    const auto previous_result = _previous->tally_votes();
    return previous_result == vote_result::WON ? _current.tally_votes() : previous_result;
}
// Prints how many servers have responded and the granted count.
std::ostream& operator<<(std::ostream& os, const election_tracker& v) {
    return os << "responded: " << v._responded.size() << ", " << "granted: " << v._granted;
}
// Prints the tracker of the current member set and, if present, the
// tracker of the previous member set, one per line.
std::ostream& operator<<(std::ostream& os, const votes& v) {
    // Lines end with '\n' rather than std::endl: a formatting operator
    // should not force a flush of the caller's stream.
    os << "current: " << v._current << '\n';
    if (v._previous) {
        os << "previous: " << v._previous.value() << '\n';
    }
    return os;
}
// Prints the symbolic name of a vote result.
// A value matching none of the enumerators handled below is printed
// as "INVALID".
std::ostream& operator<<(std::ostream& os, const vote_result& v) {
    // A plain, initialized local: nothing is shared between calls and
    // a null pointer is never streamed, whatever the value of `v`.
    const char* name = "INVALID";
    switch (v) {
    case vote_result::UNKNOWN:
        name = "UNKNOWN";
        break;
    case vote_result::WON:
        name = "WON";
        break;
    case vote_result::LOST:
        name = "LOST";
        break;
    }
    os << name;
    return os;
}
} // end of namespace raft