Files
scylladb/raft/log.cc
Gleb Natapov a59779155f raft: fix matching of a snapshotted log on a follower
There can be a situation where a leader sends a follower entries
that the latter has already snapshotted. Currently a follower considers those
to be outdated appends and rejects them, but this may cause the
follower's progress to get stuck:

- A is a leader, B is a follower, there are other followers which A used to commit entries
- A remembers that the last matched entry for B is 10, so the next entry to send is 11. A managed to commit the 11 entry using other followers
- A sends entry 11 to B
- B receives it, accepts, and updates its commit index to 11. It sends a success reply to A, but it never reaches A due to a network partition
- B takes a snapshot at index 11
- A sends entry 11 to B again
- B rejects it since it is inside the snapshot
- A receives the reject and retries from the same entry
- The same thing happens again

We should not reject such outdated entries, since if they fall inside a
snapshot it means they match (according to the log matching property).
Accepting them restores liveness in the case above: the follower can make progress again.

Fixes #9552
2021-11-09 14:51:40 +02:00

279 lines
8.3 KiB
C++

/*
* Copyright (C) 2020-present ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#include "log.hh"
namespace raft {
// Returns a mutable reference to the entry stored for log index `i`.
// The position inside `_log` is the distance of `i` from `_first_idx`.
// No bounds checking is done here.
log_entry_ptr& log::get_entry(index_t i) {
    auto offset = i - _first_idx;
    return _log[offset];
}
// Const overload: returns a read-only reference to the entry stored for
// log index `i`, located at distance `i - _first_idx` inside `_log`.
// No bounds checking is done here.
const log_entry_ptr& log::get_entry(index_t i) const {
    auto offset = i - _first_idx;
    return _log[offset];
}
// Accesses the entry with log index `i`. This is a log index, not an
// offset into the underlying container.
//
// Preconditions (asserted): the log is not empty and `i` lies within
// [_first_idx, last_idx()]. Previously only the lower bound was checked,
// so an index past the end of the log went straight to an out-of-range
// container access.
log_entry_ptr& log::operator[](size_t i) {
    assert(!_log.empty());
    assert(index_t(i) >= _first_idx);
    assert(index_t(i) <= last_idx());
    return get_entry(index_t(i));
}
// Appends entry `e` to the end of the log. If the appended entry carries
// a configuration, the previously remembered last configuration index
// becomes the "previous" one and the new entry's index becomes the last.
void log::emplace_back(log_entry_ptr&& e) {
    _log.emplace_back(std::move(e));

    const bool is_config_entry = std::holds_alternative<configuration>(_log.back()->data);
    if (!is_config_entry) {
        return;
    }
    _prev_conf_idx = _last_conf_idx;
    _last_conf_idx = last_idx();
}
// True when the log holds no entries.
bool log::empty() const {
    return _log.size() == 0;
}
// Decides whether a log ending with entry (`idx`, `term`) is at least as
// up-to-date as this one.
//
// 3.6.1 Election restriction:
// the log whose last entry has the later term is more up-to-date; when
// the last terms are equal, the longer log is more up-to-date.
bool log::is_up_to_date(index_t idx, term_t term) const {
    const term_t my_last_term = last_term();
    if (term > my_last_term) {
        return true;
    }
    if (term == my_last_term) {
        return idx >= last_idx();
    }
    return false;
}
// Index of the last entry in the log: number of stored entries plus
// `_first_idx`, minus one.
index_t log::last_idx() const {
    auto entry_count = index_t(_log.size());
    return entry_count + _first_idx - index_t(1);
}
// Index that the next appended entry will get, i.e. one past last_idx().
index_t log::next_idx() const {
    auto last = last_idx();
    return last + index_t(1);
}
// Drops the entry with index `idx` and every entry after it, then
// adjusts the stable index and the tracked configuration indexes so
// that they do not point past the new end of the log.
void log::truncate_uncommitted(index_t idx) {
    assert(idx >= _first_idx);

    const auto first_removed = _log.begin() + (idx - _first_idx);
    _log.erase(first_removed, _log.end());

    // The stable index may not exceed the (new) last index.
    stable_to(std::min(_stable_idx, last_idx()));

    if (_last_conf_idx <= last_idx()) {
        // The last configuration entry survived the truncation.
        return;
    }
    // The last configuration entry was truncated; fall back to the
    // previous one. If _prev_conf_idx is 0, this log does not contain
    // any other configuration changes, since no two uncommitted
    // configuration changes can be in progress.
    assert(_prev_conf_idx < _last_conf_idx);
    _last_conf_idx = _prev_conf_idx;
    _prev_conf_idx = index_t{0};
}
// Scans the log from the newest entry towards the oldest and records
// the indexes of the two most recent configuration entries in
// `_last_conf_idx` and `_prev_conf_idx`. The scan stops at the entry
// whose index equals the snapshot index, or once both have been found.
void log::init_last_conf_idx() {
    for (auto rit = _log.rbegin(); rit != _log.rend(); ++rit) {
        const auto& entry = **rit;
        if (entry.idx == _snapshot.idx) {
            break;
        }
        if (!std::holds_alternative<configuration>(entry.data)) {
            continue;
        }
        if (_last_conf_idx == index_t{0}) {
            // First (newest) configuration entry seen.
            _last_conf_idx = entry.idx;
            continue;
        }
        // Second configuration entry seen: nothing more to look for.
        _prev_conf_idx = entry.idx;
        break;
    }
}
// Term of the last entry in the log; when the log has no entries the
// snapshot's term is returned instead.
term_t log::last_term() const {
    return _log.empty() ? _snapshot.term : _log.back()->term;
}
// Records `idx` as the stable index. `idx` must not be past the last
// entry of the log (asserted).
void log::stable_to(index_t idx) {
    assert(last_idx() >= idx);
    _stable_idx = idx;
}
// Checks whether the entry at `idx` in this log has term `term`.
//
// Returns a pair (matched, term):
//  - (true, ...)          the terms match (or a match is implied);
//  - (false, term_t(0))   this log has no entry at `idx` (there is a gap);
//  - (false, local term)  an entry exists at `idx` but with another term.
std::pair<bool, term_t> log::match_term(index_t idx, term_t term) const {
    if (idx == 0) {
        // Special case of empty log on leader,
        // TLA+ line 324.
        return std::make_pair(true, term_t(0));
    }
    if (idx < _snapshot.idx) {
        // We got an AppendEntries inside our snapshot; it has to match
        // by the log matching property.
        return std::make_pair(true, last_term());
    }

    // Start from the snapshot's term and replace it with the term of
    // the log entry unless `idx` is exactly the snapshot index.
    term_t local_term = _snapshot.term;
    if (idx != _snapshot.idx) {
        auto offset = idx - _first_idx;
        if (offset >= _log.size()) {
            // We have a gap between the follower and the leader.
            return std::make_pair(false, term_t(0));
        }
        local_term = _log[offset]->term;
    }

    if (local_term == term) {
        return std::make_pair(true, term_t(0));
    }
    return std::make_pair(false, local_term);
}
// Returns the term of the entry with index `idx`, if this log knows it:
//  - the term of the stored entry when `idx` lies within
//    [_first_idx, last_idx()] of a non-empty log;
//  - the snapshot's term when `idx` equals the snapshot index;
//  - an empty optional otherwise.
//
// The upper bound is checked explicitly: without it an `idx` past the
// last entry would index `_log` out of range instead of reporting
// "unknown".
std::optional<term_t> log::term_for(index_t idx) const {
    if (!_log.empty() && idx >= _first_idx && idx <= last_idx()) {
        return _log[idx - _first_idx]->term;
    }
    if (idx == _snapshot.idx) {
        return _snapshot.term;
    }
    return {};
}
// Returns the latest configuration: the one stored in the log entry at
// `_last_conf_idx` when that index is non-zero, otherwise the
// configuration kept in the snapshot.
const configuration& log::get_configuration() const {
    if (!_last_conf_idx) {
        return _snapshot.config;
    }
    return std::get<configuration>(get_entry(_last_conf_idx)->data);
}
// Returns the configuration in effect at log index `idx`, i.e. the most
// recent configuration found at or before `idx`; falls back to the
// snapshot's configuration when no configuration entry precedes `idx`.
//
// `idx` must lie within [_snapshot.idx, last_idx()] (asserted).
const configuration& log::last_conf_for(index_t idx) const {
    assert(last_idx() >= idx);
    assert(idx >= _snapshot.idx);

    // Reads the configuration stored in the entry at index `i`.
    auto config_at = [this] (index_t i) -> const configuration& {
        return std::get<configuration>(get_entry(i)->data);
    };

    if (!_last_conf_idx) {
        // No configuration entry in the log at all.
        assert(!_prev_conf_idx);
        return _snapshot.config;
    }
    if (idx >= _last_conf_idx) {
        return config_at(_last_conf_idx);
    }
    if (!_prev_conf_idx) {
        // There are no config entries between _snapshot and _last_conf_idx.
        return _snapshot.config;
    }
    if (idx >= _prev_conf_idx) {
        return config_at(_prev_conf_idx);
    }

    // `idx` is older than both tracked configuration entries: walk back
    // towards the snapshot looking for a configuration entry.
    while (idx > _snapshot.idx) {
        const auto* cfg = std::get_if<configuration>(&get_entry(idx)->data);
        if (cfg) {
            return *cfg;
        }
        --idx;
    }
    return _snapshot.config;
}
// Merges `entries` (must be non-empty) into the log and returns the
// index of the last entry in `entries`.
//
// For every incoming entry:
//  - if its index is below `_first_idx` it is skipped;
//  - if the log already holds an entry with the same index and term it
//    is skipped;
//  - if the log holds an entry with the same index but a different
//    term, that entry and everything after it is truncated and the
//    incoming entry is appended;
//  - otherwise it is appended at the end of the log.
index_t log::maybe_append(std::vector<log_entry_ptr>&& entries) {
    assert(!entries.empty());
    index_t last_new_idx = entries.back()->idx;

    // We must scan through all entries if the log already
    // contains them to ensure the terms match.
    for (auto& e : entries) {
        const bool within_log = e->idx <= last_idx();
        if (within_log) {
            if (e->idx < _first_idx) {
                logger.trace("append_entries: skipping entry with idx {} less than log start {}",
                    e->idx, _first_idx);
                continue;
            }
            const auto& existing = get_entry(e->idx);
            if (e->term == existing->term) {
                logger.trace("append_entries: entries with index {} has matching terms {}", e->idx, e->term);
                continue;
            }
            logger.trace("append_entries: entries with index {} has non matching terms e.term={}, _log[i].term = {}",
                e->idx, e->term, existing->term);
            // If an existing entry conflicts with a new one (same
            // index but different terms), delete the existing
            // entry and all that follow it (§5.3).
            assert(e->idx > _snapshot.idx);
            truncate_uncommitted(e->idx);
        }
        // Assert log monotonicity
        assert(e->idx == next_idx());
        emplace_back(std::move(e));
    }

    return last_new_idx;
}
// Returns a pointer to the configuration preceding the last one, or
// nullptr when there is none:
//  - the entry at `_prev_conf_idx` when that index is non-zero;
//  - the snapshot's configuration when the last configuration index is
//    above the snapshot index;
//  - nullptr otherwise (_last_conf_idx <= _snapshot.idx means we only
//    have the last configuration, from the snapshot).
const configuration* log::get_prev_configuration() const {
    if (!_prev_conf_idx) {
        return _last_conf_idx > _snapshot.idx ? &_snapshot.config : nullptr;
    }
    return &std::get<configuration>(get_entry(_prev_conf_idx)->data);
}
// Installs snapshot `snp`, which must be newer than the current one
// (asserted), and drops log entries covered by it, keeping up to
// `trailing` of the covered entries when the snapshot index is inside
// the log. Returns the number of entries removed from the log.
size_t log::apply_snapshot(snapshot_descriptor&& snp, size_t trailing) {
    assert (snp.idx > _snapshot.idx);

    auto snp_idx = snp.idx;
    size_t dropped = 0;

    if (snp_idx <= last_idx()) {
        // Entries up to and including the snapshot index are covered;
        // keep at most `trailing` of them.
        const size_t covered = _log.size() - (last_idx() - snp_idx);
        dropped = covered - std::min(trailing, covered);
        _log.erase(_log.begin(), _log.begin() + dropped);
        _first_idx = _first_idx + index_t{dropped};
    } else {
        // Remove all entries ignoring the 'trailing' argument,
        // since otherwise there would be a gap between old
        // entries and the next entry index.
        dropped = _log.size();
        _log.clear();
        _first_idx = snp_idx + index_t{1};
    }

    _stable_idx = std::max(snp_idx, _stable_idx);

    if (snp_idx >= _prev_conf_idx) {
        // The log cannot be truncated beyond snapshot index, so
        // if previous config index is smaller we can forget it.
        _prev_conf_idx = index_t{0};
        if (snp_idx >= _last_conf_idx) {
            // If last config index is included in the snapshot
            // use the config from the snapshot as last one
            _last_conf_idx = index_t{0};
        }
    }

    _snapshot = std::move(snp);
    return dropped;
}
// Prints a one-line summary of the log's index bookkeeping (first, last,
// next and stable index) and its last term to `os`.
std::ostream& operator<<(std::ostream& os, const log& l) {
    return os << "first idx: " << l._first_idx << ", "
              << "last idx: " << l.last_idx() << ", "
              << "next idx: " << l.next_idx() << ", "
              << "stable idx: " << l.stable_idx() << ", "
              << "last term: " << l.last_term();
}
} // end of namespace raft