This patch implements the Raft extension that allows performing linearisable reads by accessing the local state machine. The extension is described in section 6.4 of the PhD thesis. To sum it up: to perform a read barrier, a follower needs to ask the leader for the last committed index that the leader knows about. The leader must make sure that it is still the leader before answering, by communicating with a quorum. When the follower gets the index back, it waits for that index to be applied, and by that completes the read_barrier invocation. The patch adds three new RPCs: read_barrier, read_barrier_reply and execute_read_barrier_on_leader. The last one is the one a follower uses to ask the leader for the safe index it can read. The first two are used by the leader to communicate with a quorum.
302 lines
10 KiB
C++
302 lines
10 KiB
C++
/*
|
|
* Copyright (C) 2020-present ScyllaDB
|
|
*/
|
|
|
|
/*
|
|
* This file is part of Scylla.
|
|
*
|
|
* Scylla is free software: you can redistribute it and/or modify
|
|
* it under the terms of the GNU Affero General Public License as published by
|
|
* the Free Software Foundation, either version 3 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* Scylla is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
#include "tracker.hh"
|
|
#include <seastar/core/coroutine.hh>
|
|
|
|
namespace raft {
|
|
|
|
// Tells whether an append reject received from this follower is "stray",
// i.e. cannot correspond to the follower's current log state and should be
// ignored. Returns true for a stray reject, false otherwise.
//
// Precondition: we are the leader and `rejected.current_term` is equal to
// our term.
bool follower_progress::is_stray_reject(const append_reply::rejected& rejected) {
    // By definition of `match_idx`, at some point every entry up to and
    // including `match_idx` was identical in our log and in the follower's log.

    // Hence entry `rejected.non_matching_idx` (when it is <= `match_idx`) was
    // at some point identical in both logs, yet `rejected` claims it differs.
    // A follower cannot change an entry unless it enters a different term, but
    // `rejected.current_term` is equal to our term, so the reject is stray.
    if (rejected.non_matching_idx <= match_idx) {
        return true;
    }

    // Likewise, the follower once had an entry with index
    // `rejected.last_idx + 1` (`rejected.last_idx < match_idx` implies
    // `rejected.last_idx + 1 <= match_idx`), yet `rejected` claims it has no
    // such entry now. A follower cannot truncate a suffix of its log unless it
    // enters a different term, but `rejected.current_term` is equal to our
    // term, so the reject is stray.
    if (rejected.last_idx < match_idx) {
        return true;
    }

    switch (state) {
    case follower_progress::state::PIPELINE:
        // No additional check in this state.
        return false;
    case follower_progress::state::PROBE:
        // In PROBE state we send a single append request `req` with
        // `req.prev_log_idx == next_idx - 1`. When the follower generates a
        // rejected response `r`, it sets `r.non_matching_idx = req.prev_log_idx`.
        // So the reject either has `non_matching_idx == next_idx - 1` or is stray.
        return rejected.non_matching_idx != index_t(next_idx - 1);
    case follower_progress::state::SNAPSHOT:
        // Any reject during snapshot transfer is a stray one.
        return true;
    default:
        assert(false);
    }
    return false;
}
|
|
|
|
// Switches this follower to the PROBE state and resets `probe_sent`, so that
// `can_send_to()` reports true again in this state.
void follower_progress::become_probe() {
    probe_sent = false;
    state = state::PROBE;
}
|
|
|
|
void follower_progress::become_pipeline() {
|
|
if (state != state::PIPELINE) {
|
|
// If a previous request was accepted, move to "pipeline" state
|
|
// since we now know the follower's log state.
|
|
state = state::PIPELINE;
|
|
in_flight = 0;
|
|
}
|
|
}
|
|
|
|
// Switches this follower to the SNAPSHOT state for a snapshot ending at
// index `snp_idx`.
void follower_progress::become_snapshot(index_t snp_idx) {
    // If the snapshot transfer succeeds, replication resumes from the index
    // right after the snapshot; otherwise the follower's index will be learned
    // again by sending a probe request.
    next_idx = snp_idx + index_t{1};
    state = state::SNAPSHOT;
}
|
|
|
|
bool follower_progress::can_send_to() {
|
|
switch (state) {
|
|
case state::PROBE:
|
|
return !probe_sent;
|
|
case state::PIPELINE:
|
|
// allow `max_in_flight` outstanding indexes
|
|
// FIXME: make it smarter
|
|
return in_flight < follower_progress::max_in_flight;
|
|
case state::SNAPSHOT:
|
|
// In this state we are waiting
|
|
// for a snapshot to be transferred
|
|
// before starting to sync the log.
|
|
return false;
|
|
}
|
|
assert(false);
|
|
return false;
|
|
}
|
|
|
|
// If this is called when a tracker is just created, the current
|
|
// progress is empty and we should simply crate an instance for
|
|
// each follower.
|
|
// When switching configurations, we should preserve progress
|
|
// for existing followers, crate progress for new, and remove
|
|
// progress for non-members (to make sure we don't send noise
|
|
// messages to them).
|
|
void tracker::set_configuration(const configuration& configuration, index_t next_idx) {
|
|
_current_voters.clear();
|
|
_previous_voters.clear();
|
|
|
|
// Swap out the current progress and then re-add
|
|
// only those entries which are still present.
|
|
progress old_progress = std::move(*this);
|
|
|
|
auto emplace_simple_config = [&](const server_address_set& config, std::unordered_set<server_id>& voter_ids) {
|
|
for (const auto& s : config) {
|
|
if (s.can_vote) {
|
|
voter_ids.emplace(s.id);
|
|
}
|
|
auto newp = this->progress::find(s.id);
|
|
if (newp != this->progress::end()) {
|
|
// Processing joint configuration and already added
|
|
// an entry for this id.
|
|
continue;
|
|
}
|
|
auto oldp = old_progress.find(s.id);
|
|
if (oldp != old_progress.end()) {
|
|
newp = this->progress::emplace(s.id, std::move(oldp->second)).first;
|
|
} else {
|
|
newp = this->progress::emplace(s.id, follower_progress{s.id, next_idx}).first;
|
|
}
|
|
newp->second.can_vote = s.can_vote;
|
|
}
|
|
};
|
|
emplace_simple_config(configuration.current, _current_voters);
|
|
if (configuration.is_joint()) {
|
|
emplace_simple_config(configuration.previous, _previous_voters);
|
|
}
|
|
}
|
|
|
|
// A sorted array of node match indexes used to find
|
|
// the pivot which serves as commit index of the group.
|
|
template<typename Index>
|
|
class match_vector {
|
|
std::vector<Index> _match;
|
|
// How many elements in the match array have a match index
|
|
// larger than the previous commit index.
|
|
size_t _count = 0;
|
|
Index _prev_commit_idx;
|
|
public:
|
|
explicit match_vector(Index prev_commit_idx, size_t reserve_size)
|
|
: _prev_commit_idx(prev_commit_idx) {
|
|
_match.reserve(reserve_size);
|
|
}
|
|
|
|
void push_back(Index match_idx) {
|
|
if (match_idx > _prev_commit_idx) {
|
|
_count++;
|
|
}
|
|
_match.push_back(match_idx);
|
|
}
|
|
bool committed() const {
|
|
return _count >= _match.size()/2 + 1;
|
|
}
|
|
Index commit_idx() {
|
|
logger.trace("{}: check committed count {} cluster size {}", std::is_same_v<Index, index_t> ? "commit" : "read barrier", _count, _match.size());
|
|
// The index of the pivot node is selected so that all nodes
|
|
// with a larger match index plus the pivot form a majority,
|
|
// for example:
|
|
// cluster size pivot node majority
|
|
// 1 0 1
|
|
// 2 0 2
|
|
// 3 1 2
|
|
// 4 1 3
|
|
// 5 2 3
|
|
//
|
|
auto pivot = (_match.size() - 1) / 2;
|
|
std::nth_element(_match.begin(), _match.begin() + pivot, _match.end());
|
|
return _match[pivot];
|
|
}
|
|
};
|
|
|
|
template<typename T>
|
|
T tracker::committed(T prev_commit_idx) {
|
|
auto push_idx = [] (match_vector<T>& v, const follower_progress& p) {
|
|
if constexpr (std::is_same_v<T, index_t>) {
|
|
v.push_back(p.match_idx);
|
|
} else {
|
|
v.push_back(p.max_acked_read);
|
|
}
|
|
};
|
|
match_vector<T> current(prev_commit_idx, _current_voters.size());
|
|
|
|
if (!_previous_voters.empty()) {
|
|
match_vector<T> previous(prev_commit_idx, _previous_voters.size());
|
|
|
|
for (const auto& [id, p] : *this) {
|
|
if (_current_voters.contains(p.id)) {
|
|
push_idx(current, p);
|
|
}
|
|
if (_previous_voters.contains(p.id)) {
|
|
push_idx(previous, p);
|
|
}
|
|
}
|
|
if (!current.committed() || !previous.committed()) {
|
|
return prev_commit_idx;
|
|
}
|
|
return std::min(current.commit_idx(), previous.commit_idx());
|
|
} else {
|
|
for (const auto& [id, p] : *this) {
|
|
if (_current_voters.contains(p.id)) {
|
|
push_idx(current, p);
|
|
}
|
|
}
|
|
if (!current.committed()) {
|
|
return prev_commit_idx;
|
|
}
|
|
return current.commit_idx();
|
|
}
|
|
}
|
|
|
|
template index_t tracker::committed(index_t);
|
|
template read_id tracker::committed(read_id);
|
|
|
|
// Builds the vote bookkeeping for `configuration`: one election tracker for
// the current set and, for a joint configuration, one for the previous set.
// `_voters` ends up holding the members of either set that can vote.
votes::votes(configuration configuration)
        : _voters(configuration.current)
        , _current(configuration.current) {
    if (configuration.is_joint()) {
        auto& prev = configuration.previous;
        _voters.insert(prev.begin(), prev.end());
        _previous.emplace(prev);
    }
    // Drop the members that are not allowed to vote.
    std::erase_if(_voters, [] (const server_address& member) { return !member.can_vote; });
}
|
|
|
|
// Records a vote from `from` in the current tracker and, when present, in
// the previous one. Both trackers are always consulted.
void votes::register_vote(server_id from, bool granted) {
    const bool in_current = _current.register_vote(from, granted);
    const bool in_previous = _previous && _previous->register_vote(from, granted);

    // We can get an outdated vote from a node that is now a non-voting
    // member. Such a vote is ignored; it is only reported.
    if (!in_current && !in_previous) {
        logger.info("Got a vote from unregistered server {} during election", from);
    }
}
|
|
|
|
// Returns the overall result of the election. When a previous tracker
// exists, its result is returned unless it is WON; only then (or when there
// is no previous tracker) does the current tracker decide.
vote_result votes::tally_votes() const {
    if (!_previous) {
        return _current.tally_votes();
    }
    const auto previous_outcome = _previous->tally_votes();
    if (previous_outcome == vote_result::WON) {
        return _current.tally_votes();
    }
    return previous_outcome;
}
|
|
|
|
// Prints the number of servers that responded followed by the tracker's
// `_granted` value.
std::ostream& operator<<(std::ostream& os, const election_tracker& v) {
    return os << "responded: " << v._responded.size() << ", "
              << "granted: " << v._granted;
}
|
|
|
|
|
|
std::ostream& operator<<(std::ostream& os, const votes& v) {
|
|
os << "current: " << v._current << std::endl;
|
|
if (v._previous) {
|
|
os << "previous: " << v._previous.value() << std::endl;
|
|
}
|
|
return os;
|
|
}
|
|
|
|
std::ostream& operator<<(std::ostream& os, const vote_result& v) {
|
|
static const char *n;
|
|
switch (v) {
|
|
case vote_result::UNKNOWN:
|
|
n = "UNKNOWN";
|
|
break;
|
|
case vote_result::WON:
|
|
n = "WON";
|
|
break;
|
|
case vote_result::LOST:
|
|
n = "LOST";
|
|
break;
|
|
}
|
|
os << n;
|
|
return os;
|
|
}
|
|
|
|
} // end of namespace raft
|