assert() is traditionally disabled in release builds, but not in
ScyllaDB. This hasn't caused problems so far, but the latest abseil
release includes a commit [1] that causes a regression of 1000
instructions per operation (insn/op) when NDEBUG is not defined.
Clearly, we must move towards a build system where NDEBUG is defined in
release builds. But we can't just define it blindly without vetting
all the assert() calls, as some were written with the expectation that
they are enabled in release mode.
To solve the conundrum, change all assert() calls to a new SCYLLA_ASSERT()
macro in utils/assert.hh. This macro is always defined and is not conditional
on NDEBUG, so we can later (after vetting Seastar) enable NDEBUG in release
mode.
[1] 66ef711d68
Closes scylladb/scylladb#20006
295 lines
10 KiB
C++
295 lines
10 KiB
C++
/*
|
|
* Copyright (C) 2020-present ScyllaDB
|
|
*/
|
|
|
|
/*
|
|
* SPDX-License-Identifier: AGPL-3.0-or-later
|
|
*/
|
|
#include "utils/assert.hh"
|
|
#include "tracker.hh"
|
|
#include <seastar/core/coroutine.hh>
|
|
|
|
namespace raft {
|
|
|
|
bool follower_progress::is_stray_reject(const append_reply::rejected& rejected) {
    // Precondition: we are the leader and `rejected.current_term` equals our term.
    //
    // `match_idx` means that, at some point, every entry up to and including
    // `match_idx` was identical in our log and in the follower's log. Within a
    // single term a follower can neither overwrite an entry nor truncate a suffix
    // of its log, so a reject contradicting `match_idx` must be stray:
    //  - `non_matching_idx <= match_idx` claims that an entry we know matched
    //    is now different;
    //  - `last_idx < match_idx` implies `last_idx + 1 <= match_idx`, i.e. the
    //    follower claims to lack an entry it had to have at some point.
    if (rejected.non_matching_idx <= match_idx || rejected.last_idx < match_idx) {
        return true;
    }

    switch (state) {
    case follower_progress::state::SNAPSHOT:
        // Every reject received during a snapshot transfer is stray.
        return true;
    case follower_progress::state::PROBE:
        // While probing we send a single append request `req` with
        // `req.prev_log_idx == next_idx - 1`, and a rejecting follower answers
        // with `non_matching_idx = req.prev_log_idx`. Any other value means the
        // reject does not belong to the outstanding probe.
        return rejected.non_matching_idx != index_t(next_idx - 1);
    case follower_progress::state::PIPELINE:
        return false;
    default:
        SCYLLA_ASSERT(false);
    }
    return false;
}
|
|
|
|
// Switches this follower to the PROBE state and clears the `probe_sent` flag.
void follower_progress::become_probe() {
    probe_sent = false;
    state = state::PROBE;
}
|
|
|
|
void follower_progress::become_pipeline() {
|
|
if (state != state::PIPELINE) {
|
|
// If a previous request was accepted, move to "pipeline" state
|
|
// since we now know the follower's log state.
|
|
state = state::PIPELINE;
|
|
in_flight = 0;
|
|
}
|
|
}
|
|
|
|
// Switches this follower to the SNAPSHOT state for a snapshot at `snp_idx`.
void follower_progress::become_snapshot(index_t snp_idx) {
    // On a successful transfer replication resumes from the entry right
    // after the snapshot; on failure a probe request will be used to
    // learn the follower's index again.
    next_idx = snp_idx + index_t{1};
    state = state::SNAPSHOT;
}
|
|
|
|
bool follower_progress::can_send_to() {
|
|
switch (state) {
|
|
case state::PROBE:
|
|
return !probe_sent;
|
|
case state::PIPELINE:
|
|
// allow `max_in_flight` outstanding indexes
|
|
// FIXME: make it smarter
|
|
return in_flight < follower_progress::max_in_flight;
|
|
case state::SNAPSHOT:
|
|
// In this state we are waiting
|
|
// for a snapshot to be transferred
|
|
// before starting to sync the log.
|
|
return false;
|
|
}
|
|
SCYLLA_ASSERT(false);
|
|
return false;
|
|
}
|
|
|
|
// If this is called when a tracker is just created, the current
|
|
// progress is empty and we should simply create an instance for
|
|
// each follower.
|
|
// When switching configurations, we should preserve progress
|
|
// for existing followers, create progress for new, and remove
|
|
// progress for non-members (to make sure we don't send noise
|
|
// messages to them).
|
|
void tracker::set_configuration(const configuration& configuration, index_t next_idx) {
|
|
_current_voters.clear();
|
|
_previous_voters.clear();
|
|
|
|
// Swap out the current progress and then re-add
|
|
// only those entries which are still present.
|
|
progress old_progress = std::move(*this);
|
|
|
|
auto emplace_simple_config = [&](const config_member_set& config, std::unordered_set<server_id>& voter_ids) {
|
|
for (const auto& s : config) {
|
|
if (s.can_vote) {
|
|
voter_ids.emplace(s.addr.id);
|
|
}
|
|
auto newp = this->progress::find(s.addr.id);
|
|
if (newp != this->progress::end()) {
|
|
// Processing joint configuration and already added
|
|
// an entry for this id.
|
|
continue;
|
|
}
|
|
auto oldp = old_progress.find(s.addr.id);
|
|
if (oldp != old_progress.end()) {
|
|
newp = this->progress::emplace(s.addr.id, std::move(oldp->second)).first;
|
|
} else {
|
|
newp = this->progress::emplace(s.addr.id, follower_progress{s.addr.id, next_idx}).first;
|
|
}
|
|
newp->second.can_vote = s.can_vote;
|
|
}
|
|
};
|
|
emplace_simple_config(configuration.current, _current_voters);
|
|
if (configuration.is_joint()) {
|
|
emplace_simple_config(configuration.previous, _previous_voters);
|
|
}
|
|
}
|
|
|
|
// A sorted array of node match indexes used to find
|
|
// the pivot which serves as commit index of the group.
|
|
template<typename Index>
|
|
class match_vector {
|
|
std::vector<Index> _match;
|
|
// How many elements in the match array have a match index
|
|
// larger than the previous commit index.
|
|
size_t _count = 0;
|
|
Index _prev_commit_idx;
|
|
public:
|
|
explicit match_vector(Index prev_commit_idx, size_t reserve_size)
|
|
: _prev_commit_idx(prev_commit_idx) {
|
|
_match.reserve(reserve_size);
|
|
}
|
|
|
|
void push_back(Index match_idx) {
|
|
if (match_idx > _prev_commit_idx) {
|
|
_count++;
|
|
}
|
|
_match.push_back(match_idx);
|
|
}
|
|
bool committed() const {
|
|
return _count >= _match.size()/2 + 1;
|
|
}
|
|
Index commit_idx() {
|
|
logger.trace("{}: check committed count {} cluster size {}", std::is_same_v<Index, index_t> ? "commit" : "read barrier", _count, _match.size());
|
|
// The index of the pivot node is selected so that all nodes
|
|
// with a larger match index plus the pivot form a majority,
|
|
// for example:
|
|
// cluster size pivot node majority
|
|
// 1 0 1
|
|
// 2 0 2
|
|
// 3 1 2
|
|
// 4 1 3
|
|
// 5 2 3
|
|
//
|
|
auto pivot = (_match.size() - 1) / 2;
|
|
std::nth_element(_match.begin(), _match.begin() + pivot, _match.end());
|
|
return _match[pivot];
|
|
}
|
|
};
|
|
|
|
template<typename T>
|
|
T tracker::committed(T prev_commit_idx) {
|
|
auto push_idx = [] (match_vector<T>& v, const follower_progress& p) {
|
|
if constexpr (std::is_same_v<T, index_t>) {
|
|
v.push_back(p.match_idx);
|
|
} else {
|
|
v.push_back(p.max_acked_read);
|
|
}
|
|
};
|
|
match_vector<T> current(prev_commit_idx, _current_voters.size());
|
|
|
|
if (!_previous_voters.empty()) {
|
|
match_vector<T> previous(prev_commit_idx, _previous_voters.size());
|
|
|
|
for (const auto& [id, p] : *this) {
|
|
if (_current_voters.contains(p.id)) {
|
|
push_idx(current, p);
|
|
}
|
|
if (_previous_voters.contains(p.id)) {
|
|
push_idx(previous, p);
|
|
}
|
|
}
|
|
if (!current.committed() || !previous.committed()) {
|
|
return prev_commit_idx;
|
|
}
|
|
return std::min(current.commit_idx(), previous.commit_idx());
|
|
} else {
|
|
for (const auto& [id, p] : *this) {
|
|
if (_current_voters.contains(p.id)) {
|
|
push_idx(current, p);
|
|
}
|
|
}
|
|
if (!current.committed()) {
|
|
return prev_commit_idx;
|
|
}
|
|
return current.commit_idx();
|
|
}
|
|
}
|
|
|
|
// Explicit instantiation of tracker::committed() for index_t.
template index_t tracker::committed(index_t);
|
|
// Explicit instantiation of tracker::committed() for read_id.
template read_id tracker::committed(read_id);
|
|
|
|
// Builds the vote bookkeeping for `configuration`: collects the addresses
// of all voting members of both member sets and, for a joint
// configuration, additionally sets up tracking of the previous set.
votes::votes(configuration configuration)
        : _current(configuration.current) {
    const auto collect_voters = [this] (const auto& members) {
        for (const auto& member : members) {
            if (member.can_vote) {
                _voters.insert(member.addr);
            }
        }
    };
    // Previous members are inserted before the current ones.
    collect_voters(configuration.previous);
    collect_voters(configuration.current);

    if (configuration.is_joint()) {
        _previous.emplace(configuration.previous);
    }
}
|
|
|
|
// Passes the vote of `from` to the current tracker and, if present, to
// the previous one. Both are always consulted.
void votes::register_vote(server_id from, bool granted) {
    const bool in_current = _current.register_vote(from, granted);
    const bool in_previous = _previous && _previous->register_vote(from, granted);

    if (in_current || in_previous) {
        return;
    }
    // An outdated vote may arrive from a node that has become a
    // non-voting member in the meantime; it is ignored apart from the log.
    logger.info("Got a vote from unregistered server {} during election", from);
}
|
|
|
|
// Returns the overall election result. When a previous tracker exists,
// its result is returned unless it is WON; otherwise the result of the
// current tracker is returned.
vote_result votes::tally_votes() const {
    if (!_previous) {
        return _current.tally_votes();
    }
    const auto previous_result = _previous->tally_votes();
    return previous_result == vote_result::WON ? _current.tally_votes() : previous_result;
}
|
|
|
|
} // end of namespace raft
|
|
|
|
// Formats an election tracker as the number of responses and the value of `_granted`.
auto fmt::formatter<raft::election_tracker>::format(const raft::election_tracker& v, fmt::format_context& ctx) const
        -> decltype(ctx.out()) {
    const auto responded = v._responded.size();
    return fmt::format_to(ctx.out(), "responded: {}, granted: {}", responded, v._granted);
}
|
|
|
|
// Formats the current tracker on one line, followed by the previous
// tracker on a second line when there is one.
auto fmt::formatter<raft::votes>::format(const raft::votes& v, fmt::format_context& ctx) const
        -> decltype(ctx.out()) {
    auto out = fmt::format_to(ctx.out(), "current: {}\n", v._current);
    if (!v._previous) {
        return out;
    }
    return fmt::format_to(out, "previous: {}\n", *v._previous);
}
|
|
|
|
// Formats a vote result as the name of its enumerator; a value matching
// none of the enumerators is formatted as an empty string.
auto fmt::formatter<raft::vote_result>::format(const raft::vote_result& v, fmt::format_context& ctx) const
        -> decltype(ctx.out()) {
    const std::string_view name = [v] () -> std::string_view {
        switch (v) {
        case raft::vote_result::UNKNOWN:
            return "UNKNOWN";
        case raft::vote_result::WON:
            return "WON";
        case raft::vote_result::LOST:
            return "LOST";
        }
        return {};
    }();
    return formatter<string_view>::format(name, ctx);
}
|