There can be a situation where a leader sends a follower entries that the latter has already snapshotted. Currently the follower considers those to be outdated appends and rejects them, but this may cause the follower's progress to get stuck: - A is a leader, B is a follower, and there are other followers which A used to commit entries - A remembers that the last matched entry for B is 10, so the next entry to send is 11. A managed to commit entry 11 using the other followers - A sends entry 11 to B - B receives it, accepts it, and updates its commit index to 11. It sends a success reply to A, but the reply never reaches A due to a network partition - B takes a snapshot at index 11 - A sends entry 11 to B again - B rejects it since it is inside the snapshot - A receives the reject and retries from the same entry - The same thing happens again. We should not reject such outdated entries, since if they fall inside a snapshot it means they match (according to the log matching property). Accepting them restores liveness in the case above. Fixes #9552
279 lines
8.3 KiB
C++
279 lines
8.3 KiB
C++
/*
|
|
* Copyright (C) 2020-present ScyllaDB
|
|
*/
|
|
|
|
/*
|
|
* This file is part of Scylla.
|
|
*
|
|
* Scylla is free software: you can redistribute it and/or modify
|
|
* it under the terms of the GNU Affero General Public License as published by
|
|
* the Free Software Foundation, either version 3 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* Scylla is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
#include "log.hh"
|
|
|
|
namespace raft {
|
|
|
|
// Returns a mutable reference to the entry stored for Raft index `i`.
// The index is translated into a position inside `_log` by subtracting
// `_first_idx`; no bounds checking is done here.
log_entry_ptr& log::get_entry(index_t i) {
    auto pos = i - _first_idx;
    return _log[pos];
}
|
|
|
|
// Read-only counterpart of get_entry(): returns the entry stored for Raft
// index `i`, located at offset `i - _first_idx` inside `_log`.
// No bounds checking is done here.
const log_entry_ptr& log::get_entry(index_t i) const {
    auto pos = i - _first_idx;
    return _log[pos];
}
|
|
|
|
// Indexes the log by Raft index (not by container position).
// Asserts that the log is non-empty and that `i` is not below the first
// index kept in the log, then delegates to get_entry().
log_entry_ptr& log::operator[](size_t i) {
    assert(!_log.empty() && index_t(i) >= _first_idx);
    index_t entry_idx = index_t(i);
    return get_entry(entry_idx);
}
|
|
|
|
// Appends `e` to the end of the log. If the appended entry carries a
// configuration, the configuration bookkeeping is shifted: the previous
// "last" configuration index becomes `_prev_conf_idx` and the new entry's
// index becomes `_last_conf_idx`.
void log::emplace_back(log_entry_ptr&& e) {
    _log.emplace_back(std::move(e));

    const auto& appended = *_log.back();
    if (!std::holds_alternative<configuration>(appended.data)) {
        // Not a configuration entry: nothing else to track.
        return;
    }

    _prev_conf_idx = _last_conf_idx;
    _last_conf_idx = last_idx();
}
|
|
|
|
bool log::empty() const {
|
|
return _log.empty();
|
|
}
|
|
|
|
// 3.6.1 Election restriction
//
// Decides whether a log whose last entry is (`idx`, `term`) is at least as
// up-to-date as this one. The last terms are compared first: the log with
// the later last term wins. When the last terms are equal, the log that is
// at least as long (by last index) is considered up-to-date.
bool log::is_up_to_date(index_t idx, term_t term) const {
    const term_t my_last_term = last_term();

    if (term > my_last_term) {
        return true;
    }
    if (term == my_last_term) {
        // Same last term: fall back to comparing log length.
        return idx >= last_idx();
    }
    return false;
}
|
|
|
|
// Raft index of the last entry in the log, computed from the number of
// stored entries and the index of the first stored entry.
index_t log::last_idx() const {
    index_t entry_count = index_t(_log.size());
    return entry_count + _first_idx - index_t(1);
}
|
|
|
|
// Raft index that the next appended entry is expected to have, i.e. one
// past last_idx().
index_t log::next_idx() const {
    index_t tail = last_idx();
    return tail + index_t(1);
}
|
|
|
|
// Removes the entry at Raft index `idx` and every entry after it.
// Afterwards the stable index is clamped to the new last index and, if the
// latest configuration entry was among the removed ones, the configuration
// bookkeeping is rolled back to the previous configuration index.
void log::truncate_uncommitted(index_t idx) {
    assert(idx >= _first_idx);

    auto offset = idx - _first_idx;
    _log.erase(_log.begin() + offset, _log.end());

    const index_t new_last = last_idx();
    stable_to(std::min(_stable_idx, new_last));

    if (_last_conf_idx > new_last) {
        // If _prev_conf_idx is 0, this log does not contain any
        // other configuration changes, since no two uncommitted
        // configuration changes can be in progress.
        assert(_prev_conf_idx < _last_conf_idx);
        _last_conf_idx = _prev_conf_idx;
        _prev_conf_idx = index_t{0};
    }
}
|
|
|
|
void log::init_last_conf_idx() {
|
|
for (auto it = _log.rbegin(); it != _log.rend() && (**it).idx != _snapshot.idx; ++it) {
|
|
if (std::holds_alternative<configuration>((**it).data)) {
|
|
if (_last_conf_idx == index_t{0}) {
|
|
_last_conf_idx = (**it).idx;
|
|
} else {
|
|
_prev_conf_idx = (**it).idx;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Term of the last entry in the log; when the log holds no entries the
// snapshot's term is returned instead.
term_t log::last_term() const {
    return _log.empty() ? _snapshot.term : _log.back()->term;
}
|
|
|
|
// Records `idx` as the stable index of the log.
// The index must not be past the last entry currently in the log.
void log::stable_to(index_t idx) {
    assert(idx <= last_idx());

    _stable_idx = idx;
}
|
|
|
|
// Checks whether the local log agrees with the given (`idx`, `term`) pair.
//
// Returns a pair whose first element tells whether the terms match.
// On a mismatch the second element is the local term at `idx`, or
// term_t(0) when there is no local entry at `idx` (a gap).
std::pair<bool, term_t> log::match_term(index_t idx, term_t term) const {
    if (idx == 0) {
        // Special case of empty log on leader,
        // TLA+ line 324.
        return std::make_pair(true, term_t(0));
    }

    // We got an AppendEntries inside our snapshot, it has to match by
    // the log matching property.
    if (idx < _snapshot.idx) {
        return std::make_pair(true, last_term());
    }

    if (idx == _snapshot.idx) {
        // Exactly at the snapshot boundary: compare with the snapshot term.
        if (_snapshot.term == term) {
            return std::make_pair(true, term_t(0));
        }
        return std::make_pair(false, _snapshot.term);
    }

    auto pos = idx - _first_idx;
    if (pos >= _log.size()) {
        // We have a gap between the follower and the leader.
        return std::make_pair(false, term_t(0));
    }

    term_t local_term = _log[pos]->term;
    if (local_term == term) {
        return std::make_pair(true, term_t(0));
    }
    return std::make_pair(false, local_term);
}
|
|
|
|
// Returns the term of the entry at Raft index `idx`.
//
// - If `idx` falls inside the range of entries currently held in the log,
//   the term of that entry is returned.
// - Otherwise, if `idx` equals the snapshot index, the snapshot's term is
//   returned.
// - Otherwise an empty optional is returned.
//
// The upper bound is checked explicitly so that an index past the end of
// the log yields an empty optional instead of indexing `_log` out of range.
std::optional<term_t> log::term_for(index_t idx) const {
    if (!_log.empty() && idx >= _first_idx && idx <= last_idx()) {
        return _log[idx - _first_idx]->term;
    }
    if (idx == _snapshot.idx) {
        return _snapshot.term;
    }
    return {};
}
|
|
|
|
// Returns the latest configuration known to the log: the one stored in the
// entry at `_last_conf_idx` when that index is non-zero, otherwise the
// configuration kept in the snapshot.
const configuration& log::get_configuration() const {
    if (!_last_conf_idx) {
        return _snapshot.config;
    }
    const auto& conf_entry = _log[_last_conf_idx - _first_idx];
    return std::get<configuration>(conf_entry->data);
}
|
|
|
|
// Returns the configuration that is in effect at Raft index `idx`, i.e. the
// configuration of the closest configuration entry at or before `idx`, or
// the snapshot's configuration when there is no such entry after the
// snapshot. `idx` must lie between the snapshot index and the last index.
const configuration& log::last_conf_for(index_t idx) const {
    assert(last_idx() >= idx);
    assert(idx >= _snapshot.idx);

    // Fetches the configuration stored in the entry at the given index.
    const auto config_at = [this] (index_t where) -> const configuration& {
        return std::get<configuration>(get_entry(where)->data);
    };

    if (!_last_conf_idx) {
        // No configuration entry is tracked at all.
        assert(!_prev_conf_idx);
        return _snapshot.config;
    }

    if (idx >= _last_conf_idx) {
        return config_at(_last_conf_idx);
    }

    if (!_prev_conf_idx) {
        // There are no config entries between _snapshot and _last_conf_idx.
        return _snapshot.config;
    }

    if (idx >= _prev_conf_idx) {
        return config_at(_prev_conf_idx);
    }

    // `idx` precedes both tracked configuration entries: walk backwards
    // towards the snapshot looking for an older configuration entry.
    index_t cursor = idx;
    while (cursor > _snapshot.idx) {
        if (auto cfg = std::get_if<configuration>(&get_entry(cursor)->data)) {
            return *cfg;
        }
        --cursor;
    }

    return _snapshot.config;
}
|
|
|
|
// Merges `entries` into the log and returns the index of the last entry of
// the batch.
//
// Each incoming entry is handled as follows:
//  - entries below `_first_idx` are skipped;
//  - entries already present with the same term are skipped;
//  - an entry that conflicts with an existing one (same index, different
//    term) causes the existing entry and everything after it to be removed,
//    after which the incoming entry is appended;
//  - entries past the end of the log are appended.
index_t log::maybe_append(std::vector<log_entry_ptr>&& entries) {
    assert(!entries.empty());

    // Remember the result up front: entries are moved out of the vector
    // while they are being appended.
    index_t last_new_idx = entries.back()->idx;

    // We must scan through all entries if the log already
    // contains them to ensure the terms match.
    for (auto& e : entries) {
        const bool overlaps_log = e->idx <= last_idx();

        if (overlaps_log) {
            if (e->idx < _first_idx) {
                logger.trace("append_entries: skipping entry with idx {} less than log start {}",
                    e->idx, _first_idx);
                continue;
            }

            const auto& existing = get_entry(e->idx);
            if (e->term == existing->term) {
                logger.trace("append_entries: entries with index {} has matching terms {}", e->idx, e->term);
                continue;
            }

            logger.trace("append_entries: entries with index {} has non matching terms e.term={}, _log[i].term = {}",
                e->idx, e->term, existing->term);
            // If an existing entry conflicts with a new one (same
            // index but different terms), delete the existing
            // entry and all that follow it (§5.3).
            assert(e->idx > _snapshot.idx);
            truncate_uncommitted(e->idx);
        }

        // Assert log monotonicity
        assert(e->idx == next_idx());
        emplace_back(std::move(e));
    }

    return last_new_idx;
}
|
|
|
|
// Returns a pointer to the configuration preceding the latest one, or
// nullptr when there is none:
//  - if `_prev_conf_idx` is set, the configuration stored in that entry;
//  - else, if the latest configuration entry is newer than the snapshot,
//    the snapshot's configuration;
//  - else nullptr.
const configuration* log::get_prev_configuration() const {
    if (!_prev_conf_idx) {
        // _last_conf_idx <= _snapshot.idx means we only have the last
        // configuration (from the snapshot).
        return _last_conf_idx > _snapshot.idx ? &_snapshot.config : nullptr;
    }

    const auto& prev_entry = get_entry(_prev_conf_idx);
    return &std::get<configuration>(prev_entry->data);
}
|
|
|
|
// Installs `snp` as the log's snapshot and drops log entries it covers.
//
// `trailing` is the number of covered entries to keep in the log anyway; it
// is ignored when the snapshot index is past the end of the log.
// Returns the number of entries removed from the log.
size_t log::apply_snapshot(snapshot_descriptor&& snp, size_t trailing) {
    assert (snp.idx > _snapshot.idx);

    auto idx = snp.idx;
    size_t removed = 0;

    if (idx <= last_idx()) {
        // Entries up to and including `idx` are covered by the snapshot;
        // keep at most `trailing` of them.
        const size_t covered = _log.size() - (last_idx() - idx);
        removed = covered - std::min(trailing, covered);
        _log.erase(_log.begin(), _log.begin() + removed);
        _first_idx = _first_idx + index_t{removed};
    } else {
        // Remove all entries ignoring the 'trailing' argument,
        // since otherwise there would be a gap between old
        // entries and the next entry index.
        removed = _log.size();
        _log.clear();
        _first_idx = idx + index_t{1};
    }

    _stable_idx = std::max(idx, _stable_idx);

    const bool covers_prev_conf = idx >= _prev_conf_idx;
    const bool covers_last_conf = idx >= _last_conf_idx;
    if (covers_prev_conf) {
        // The log cannot be truncated beyond snapshot index, so
        // if previous config index is smaller we can forget it.
        _prev_conf_idx = index_t{0};
        if (covers_last_conf) {
            // If last config index is included in the snapshot
            // use the config from the snapshot as last one
            _last_conf_idx = index_t{0};
        }
    }

    _snapshot = std::move(snp);

    return removed;
}
|
|
|
|
// Prints a one-line summary of the log's indexes and last term, in the form
// "first idx: A, last idx: B, next idx: C, stable idx: D, last term: E".
std::ostream& operator<<(std::ostream& os, const log& l) {
    os << "first idx: " << l._first_idx << ", "
       << "last idx: " << l.last_idx() << ", "
       << "next idx: " << l.next_idx() << ", "
       << "stable idx: " << l.stable_idx() << ", "
       << "last term: " << l.last_term();
    return os;
}
|
|
|
|
} // end of namespace raft
|