The test will now, with probability 1/2, enable forwarding of entries by followers to leaders. This is possible thanks to the new abort_source& APIs which we use to ensure that no operations are running on servers before we destroy them.
2970 lines
114 KiB
C++
2970 lines
114 KiB
C++
/*
|
|
* Copyright (C) 2021-present ScyllaDB
|
|
*/
|
|
|
|
/*
|
|
* SPDX-License-Identifier: AGPL-3.0-or-later
|
|
*/
|
|
|
|
#include <seastar/core/reactor.hh>
|
|
#include <seastar/testing/test_case.hh>
|
|
#include <seastar/core/timed_out_error.hh>
|
|
#include <seastar/core/coroutine.hh>
|
|
#include <seastar/coroutine/maybe_yield.hh>
|
|
#include <seastar/core/gate.hh>
|
|
#include <seastar/core/queue.hh>
|
|
#include <seastar/core/future-util.hh>
|
|
#include <seastar/core/weak_ptr.hh>
|
|
#include <seastar/util/defer.hh>
|
|
|
|
#include "raft/server.hh"
|
|
#include "raft/logical_clock.hh"
|
|
#include "serializer.hh"
|
|
#include "serializer_impl.hh"
|
|
#include "idl/uuid.dist.hh"
|
|
#include "idl/uuid.dist.impl.hh"
|
|
|
|
#include "test/lib/random_utils.hh"
|
|
#include "test/raft/logical_timer.hh"
|
|
#include "test/raft/ticker.hh"
|
|
#include "test/raft/generator.hh"
|
|
|
|
#include "to_string.hh"
|
|
|
|
using namespace seastar;
|
|
using namespace std::chrono_literals;
|
|
|
|
seastar::logger tlogger("randomized_nemesis_test");
|
|
|
|
// A direct translation of a mathematical definition of a state machine
|
|
// (see e.g. Wikipedia) as a C++ concept. Implementations of this concept
|
|
// do not store the state, they only define the types, the transition function
|
|
// (which is a pure function), and the initial state (which is a constant).
|
|
template <typename M> concept PureStateMachine =
    requires (typename M::state_t state, typename M::input_t input) {
        // Set of all states the machine can be in.
        typename M::state_t;

        // Set of all inputs (commands) the machine accepts.
        typename M::input_t;

        // Set of all outputs the machine can produce.
        typename M::output_t;

        // The initial state: a constant of type `state_t`.
        M::init;
        requires std::same_as<decltype(M::init), const typename M::state_t>;

        // The transition function (meant to be pure - no side effects).
        // Given a state and an input it yields the pair
        // (next state, output produced by applying the input to that state).
        { M::delta(state, input) } -> std::same_as<std::pair<typename M::state_t, typename M::output_t>>;
    };
|
|
|
|
// Used to uniquely identify commands passed into `apply` in order to return
|
|
// the outputs of these commands. See `impure_state_machine` and `call`.
|
|
using cmd_id_t = utils::UUID;
|
|
|
|
// A set of in-memory snapshots maintained by a single Raft server.
|
|
// The different parts of the server (the state machine, persistence,
|
|
// rpc) will share a single `snapshots_t`.
|
|
template <typename State>
|
|
using snapshots_t = std::unordered_map<raft::snapshot_id, State>;
|
|
|
|
// To replicate a state machine, our Raft implementation requires it to
|
|
// be represented with the `raft::state_machine` interface.
|
|
//
|
|
// `impure_state_machine` is an implementation of `raft::state_machine`
|
|
// that wraps a `PureStateMachine`. It keeps a variable of type `state_t`
|
|
// representing the current state. In `apply` it deserializes the given
|
|
// command into `input_t`, uses the transition (`delta`) function to
|
|
// produce the next state and output, replaces its current state with the
|
|
// obtained state and returns the output (more on that below); it does so
|
|
// sequentially for every given command. We can think of `PureStateMachine`
|
|
// as the actual state machine - the business logic, and `impure_state_machine`
|
|
// as the ``boilerplate'' that allows the pure machine to be replicated
|
|
// by Raft and communicate with the external world.
|
|
//
|
|
// The interface also requires maintenance of snapshots. We use the
|
|
// `snapshots_t` introduced above; `impure_state_machine` keeps a reference to `snapshots_t`
|
|
// because it will share it with an implementation of `raft::persistence`.
|
|
template <PureStateMachine M>
|
|
class impure_state_machine : public raft::state_machine {
|
|
raft::server_id _id;
|
|
|
|
typename M::state_t _val;
|
|
snapshots_t<typename M::state_t>& _snapshots;
|
|
|
|
// Used to ensure that when `abort()` returns there are
|
|
// no more in-progress methods running on this object.
|
|
seastar::gate _gate;
|
|
|
|
// To obtain output from an applied command, the client (see `call`)
|
|
// first allocates a channel in this data structure by calling `with_output_channel`
|
|
// and makes the returned command ID a part of the command passed to Raft.
|
|
// When (if) we eventually apply the command, we use the ID to find the output channel
|
|
// here and push the output to the client waiting on the other end.
|
|
// The channel is allocated only on the local server where `with_output_channel`
|
|
// was called; other replicas of the state machine will therefore not find the ID
|
|
// in their instances of `_output_channels` so they just drop the output.
|
|
std::unordered_map<cmd_id_t, promise<typename M::output_t>> _output_channels;
|
|
|
|
public:
|
|
impure_state_machine(raft::server_id id, snapshots_t<typename M::state_t>& snapshots)
|
|
: _id(id), _val(M::init), _snapshots(snapshots) {}
|
|
|
|
future<> apply(std::vector<raft::command_cref> cmds) override {
|
|
co_await with_gate(_gate, [this, cmds = std::move(cmds)] () mutable -> future<> {
|
|
for (auto& cref : cmds) {
|
|
_gate.check();
|
|
|
|
auto is = ser::as_input_stream(cref);
|
|
auto cmd_id = ser::deserialize(is, boost::type<cmd_id_t>{});
|
|
auto input = ser::deserialize(is, boost::type<typename M::input_t>{});
|
|
auto [new_state, output] = M::delta(std::move(_val), std::move(input));
|
|
_val = std::move(new_state);
|
|
|
|
auto it = _output_channels.find(cmd_id);
|
|
if (it != _output_channels.end()) {
|
|
// We are on the leader server where the client submitted the command
|
|
// and waits for the output. Send it to them.
|
|
it->second.set_value(std::move(output));
|
|
_output_channels.erase(it);
|
|
} else {
|
|
// This is not the leader on which the command was submitted,
|
|
// or it is but the client already gave up on us and deallocated the channel.
|
|
// In any case we simply drop the output.
|
|
}
|
|
|
|
co_await coroutine::maybe_yield();
|
|
}
|
|
});
|
|
}
|
|
|
|
future<raft::snapshot_id> take_snapshot() override {
|
|
auto id = raft::snapshot_id::create_random_id();
|
|
assert(_snapshots.emplace(id, _val).second);
|
|
tlogger.trace("{}: took snapshot id {} val {}", _id, id, _val);
|
|
co_return id;
|
|
}
|
|
|
|
void drop_snapshot(raft::snapshot_id id) override {
|
|
_snapshots.erase(id);
|
|
}
|
|
|
|
future<> load_snapshot(raft::snapshot_id id) override {
|
|
auto it = _snapshots.find(id);
|
|
assert(it != _snapshots.end()); // dunno if the snapshot can actually be missing
|
|
tlogger.trace("{}: loading snapshot id {} prev val {} new val {}", _id, id, _val, it->second);
|
|
_val = it->second;
|
|
co_return;
|
|
}
|
|
|
|
future<> abort() override {
|
|
return _gate.close();
|
|
}
|
|
|
|
struct output_channel_dropped : public raft::error {
|
|
output_channel_dropped() : error("output channel dropped") {}
|
|
};
|
|
|
|
// Before sending a command to Raft, the client must obtain a command ID
|
|
// and an output channel using this function.
|
|
template <typename F>
|
|
future<typename M::output_t> with_output_channel(F f) {
|
|
return with_gate(_gate, [this, f = std::move(f)] () mutable -> future<typename M::output_t> {
|
|
promise<typename M::output_t> p;
|
|
auto fut = p.get_future();
|
|
auto cmd_id = utils::make_random_uuid();
|
|
assert(_output_channels.emplace(cmd_id, std::move(p)).second);
|
|
|
|
auto guard = defer([this, cmd_id] {
|
|
auto it = _output_channels.find(cmd_id);
|
|
if (it != _output_channels.end()) {
|
|
it->second.set_exception(output_channel_dropped{});
|
|
_output_channels.erase(it);
|
|
}
|
|
});
|
|
return f(cmd_id, std::move(fut)).finally([guard = std::move(guard)] {});
|
|
});
|
|
}
|
|
|
|
const typename M::state_t& state() const {
|
|
return _val;
|
|
}
|
|
};
|
|
|
|
// TODO: serializable concept?
|
|
template <typename Input>
|
|
raft::command make_command(const cmd_id_t& cmd_id, const Input& input) {
|
|
raft::command cmd;
|
|
ser::serialize(cmd, cmd_id);
|
|
ser::serialize(cmd, input);
|
|
return cmd;
|
|
}
|
|
|
|
// TODO: handle other errors?
|
|
template <PureStateMachine M>
|
|
using call_result_t = std::variant<typename M::output_t, timed_out_error, raft::not_a_leader, raft::dropped_entry, raft::commit_status_unknown, raft::stopped_error>;
|
|
|
|
// Wait for a future `f` to finish, but keep the result inside a `future`.
|
|
// Works for `future<void>` as well as for `future<T>`.
|
|
template <Future F>
|
|
future<F> wait(F f) {
|
|
// FIXME: using lambda as workaround for clang bug #50345
|
|
auto impl = [] (F f) -> future<F> {
|
|
struct container { F f; };
|
|
container c = co_await f.then_wrapped([] (F f) { return container{std::move(f)}; });
|
|
assert(c.f.available());
|
|
co_return std::move(c.f);
|
|
};
|
|
|
|
return impl(std::move(f));
|
|
}
|
|
|
|
template <std::invocable<abort_source&> F>
|
|
static futurize_t<std::invoke_result_t<F, abort_source&>>
|
|
with_timeout(logical_timer& t, raft::logical_clock::time_point tp, F&& fun) {
|
|
using future_t = futurize_t<std::invoke_result_t<F, abort_source&>>;
|
|
|
|
// FIXME: using lambda as workaround for clang bug #50345
|
|
auto impl = [] (logical_timer& t, raft::logical_clock::time_point tp, F&& fun) -> future_t {
|
|
abort_source timeout_as;
|
|
|
|
// Using lambda here as workaround for seastar#1005
|
|
future_t f = futurize_invoke([fun = std::move(fun)] (abort_source& as) mutable { return std::forward<F>(fun)(as); }, timeout_as);
|
|
|
|
auto sleep_and_abort = [] (raft::logical_clock::time_point tp, abort_source& timeout_as, logical_timer& t) -> future<> {
|
|
co_await t.sleep_until(tp, timeout_as);
|
|
if (!timeout_as.abort_requested()) {
|
|
// We resolved before `f`. Abort the operation.
|
|
timeout_as.request_abort();
|
|
}
|
|
}(tp, timeout_as, t);
|
|
|
|
f = co_await wait(std::move(f));
|
|
|
|
if (!timeout_as.abort_requested()) {
|
|
// `f` has already resolved, but abort the sleep.
|
|
timeout_as.request_abort();
|
|
}
|
|
|
|
// Wait on the sleep as well (it should return shortly, being aborted) so we don't discard the future.
|
|
try {
|
|
co_await std::move(sleep_and_abort);
|
|
} catch (const sleep_aborted&) {
|
|
// Expected (if `f` resolved first or we were externally aborted).
|
|
} catch (...) {
|
|
// There should be no other exceptions, but just in case... log it and discard,
|
|
// we want to propagate exceptions from `f`, not from sleep.
|
|
tlogger.error("unexpected exception from sleep_and_abort", std::current_exception());
|
|
}
|
|
|
|
// The future is available but cannot use `f.get()` as it doesn't handle void futures.
|
|
co_return co_await std::move(f);
|
|
};
|
|
|
|
return impl(t, tp, std::forward<F>(fun));
|
|
}
|
|
|
|
// Sends a given `input` as a command to `server`, waits until the command gets replicated
|
|
// and applied on that server and returns the produced output.
|
|
//
|
|
// The wait time is limited using `timeout` which is a logical time point referring to the
|
|
// logical clock used by `timer`. Standard way to use is to pass `timer.now() + X_t`
|
|
// as the time point, where `X` is the maximum number of ticks that we wait for.
|
|
//
|
|
// `sm` must be a reference to the state machine owned by `server`.
|
|
//
|
|
// The `server` may currently be a follower, in which case it will return a `not_a_leader` error.
|
|
template <PureStateMachine M>
|
|
future<call_result_t<M>> call(
|
|
typename M::input_t input,
|
|
raft::logical_clock::time_point timeout,
|
|
logical_timer& timer,
|
|
raft::server& server,
|
|
impure_state_machine<M>& sm) {
|
|
using output_channel_dropped = typename impure_state_machine<M>::output_channel_dropped;
|
|
using input_t = typename M::input_t;
|
|
using output_t = typename M::output_t;
|
|
|
|
return sm.with_output_channel([&, input = std::move(input), timeout] (cmd_id_t cmd_id, future<output_t> f) {
|
|
return with_timeout(timer, timeout, std::bind_front([&] (input_t input, future<output_t> f, abort_source& as) {
|
|
return server.add_entry(
|
|
make_command(std::move(cmd_id), std::move(input)),
|
|
raft::wait_type::applied,
|
|
&as
|
|
).then_wrapped([output_f = std::move(f)] (future<> add_entry_f) mutable {
|
|
if (add_entry_f.failed()) {
|
|
// We need to discard `output_f`; the only expected exception is:
|
|
(void)output_f.discard_result().handle_exception_type([] (const output_channel_dropped&) {});
|
|
std::rethrow_exception(add_entry_f.get_exception());
|
|
}
|
|
|
|
return std::move(output_f);
|
|
});
|
|
}, std::move(input), std::move(f)));
|
|
}).then([] (output_t output) {
|
|
return make_ready_future<call_result_t<M>>(std::move(output));
|
|
}).handle_exception([] (std::exception_ptr eptr) {
|
|
try {
|
|
std::rethrow_exception(eptr);
|
|
} catch (raft::not_a_leader e) {
|
|
return make_ready_future<call_result_t<M>>(e);
|
|
} catch (raft::dropped_entry e) {
|
|
return make_ready_future<call_result_t<M>>(e);
|
|
} catch (raft::commit_status_unknown e) {
|
|
return make_ready_future<call_result_t<M>>(e);
|
|
} catch (raft::stopped_error e) {
|
|
return make_ready_future<call_result_t<M>>(e);
|
|
} catch (raft::request_aborted&) {
|
|
return make_ready_future<call_result_t<M>>(timed_out_error{});
|
|
} catch (seastar::timed_out_error& e) {
|
|
return make_ready_future<call_result_t<M>>(e);
|
|
} catch (broken_promise&) {
|
|
// FIXME: workaround for #9688
|
|
return make_ready_future<call_result_t<M>>(raft::stopped_error{});
|
|
} catch (...) {
|
|
tlogger.error("unexpected exception from call: {}", std::current_exception());
|
|
assert(false);
|
|
}
|
|
});
|
|
}
|
|
|
|
// Allows a Raft server to communicate with other servers.
|
|
// The implementation is mostly boilerplate. It assumes that there exists a method of message passing
|
|
// given by a `send_message_t` function (passed in the constructor) for sending and by the `receive`
|
|
// function for receiving messages.
|
|
//
|
|
// We also keep a reference to a `snapshots_t` set to be shared with the `impure_state_machine`
|
|
// on the same server. We access this set when we receive or send a snapshot message.
|
|
template <typename State>
|
|
class rpc : public raft::rpc {
|
|
using reply_id_t = uint32_t;
|
|
|
|
struct snapshot_message {
|
|
raft::install_snapshot ins;
|
|
State snapshot_payload;
|
|
reply_id_t reply_id;
|
|
};
|
|
|
|
struct snapshot_reply_message {
|
|
raft::snapshot_reply reply;
|
|
reply_id_t reply_id;
|
|
};
|
|
|
|
struct execute_barrier_on_leader {
|
|
reply_id_t reply_id;
|
|
};
|
|
|
|
struct execute_barrier_on_leader_reply {
|
|
raft::read_barrier_reply reply;
|
|
reply_id_t reply_id;
|
|
};
|
|
|
|
struct add_entry_message {
|
|
raft::command cmd;
|
|
reply_id_t reply_id;
|
|
};
|
|
|
|
struct add_entry_reply_message {
|
|
raft::add_entry_reply reply;
|
|
reply_id_t reply_id;
|
|
};
|
|
|
|
struct modify_config_message {
|
|
std::vector<raft::server_address> add;
|
|
std::vector<raft::server_id> del;
|
|
reply_id_t reply_id;
|
|
};
|
|
|
|
public:
|
|
using message_t = std::variant<
|
|
snapshot_message,
|
|
snapshot_reply_message,
|
|
raft::append_request,
|
|
raft::append_reply,
|
|
raft::vote_request,
|
|
raft::vote_reply,
|
|
raft::timeout_now,
|
|
raft::read_quorum,
|
|
raft::read_quorum_reply,
|
|
execute_barrier_on_leader,
|
|
execute_barrier_on_leader_reply,
|
|
add_entry_message,
|
|
add_entry_reply_message,
|
|
modify_config_message
|
|
>;
|
|
|
|
using send_message_t = std::function<void(raft::server_id dst, message_t)>;
|
|
|
|
private:
|
|
raft::server_id _id;
|
|
|
|
snapshots_t<State>& _snapshots;
|
|
|
|
logical_timer _timer;
|
|
|
|
send_message_t _send;
|
|
|
|
// Before we send a snapshot apply request we create a promise-future pair,
|
|
// allocate a new ID, and put the promise here under that ID. We then send the ID
|
|
// together with the request and wait on the future.
|
|
// When (if) a reply returns, we take the ID from the reply (which is the same
|
|
// as the ID in the corresponding request), take the promise under that ID
|
|
// and push the reply through that promise.
|
|
using reply_promise = std::variant<
|
|
promise<raft::snapshot_reply>,
|
|
promise<raft::read_barrier_reply>,
|
|
promise<raft::add_entry_reply>
|
|
>;
|
|
std::unordered_map<reply_id_t, reply_promise> _reply_promises;
|
|
reply_id_t _counter = 0;
|
|
|
|
// Used to ensure that when `abort()` returns there are
|
|
// no more in-progress methods running on this object.
|
|
seastar::gate _gate;
|
|
|
|
size_t _snapshot_applications = 0;
|
|
size_t _read_barrier_executions = 0;
|
|
size_t _add_entry_executions = 0;
|
|
size_t _modify_config_executions = 0;
|
|
|
|
template <typename F>
|
|
auto with_gate(F&& f) -> decltype(f()) {
|
|
try {
|
|
co_return co_await seastar::with_gate(_gate, std::forward<F>(f));
|
|
} catch (const gate_closed_exception&) {
|
|
co_return coroutine::make_exception(raft::stopped_error{});
|
|
}
|
|
}
|
|
|
|
public:
|
|
rpc(raft::server_id id, snapshots_t<State>& snaps, send_message_t send)
|
|
: _id(id), _snapshots(snaps), _send(std::move(send)) {
|
|
}
|
|
|
|
// Message is delivered to us.
|
|
// The caller must ensure that `abort()` wasn't called yet.
|
|
void receive(raft::server_id src, message_t payload) {
|
|
assert(!_gate.is_closed());
|
|
assert(_client);
|
|
auto& c = *_client;
|
|
|
|
std::visit(make_visitor(
|
|
[&] (snapshot_message m) {
|
|
static const size_t max_concurrent_snapshot_applications = 5; // TODO: configurable
|
|
if (_snapshot_applications >= max_concurrent_snapshot_applications) {
|
|
tlogger.warn(
|
|
"{}: cannot apply snapshot from {} (id: {}) due to too many concurrent requests, dropping it",
|
|
_id, src, m.ins.snp.id);
|
|
// Should we send some message back instead?
|
|
return;
|
|
}
|
|
|
|
++_snapshot_applications;
|
|
(void)[] (rpc& self, raft::server_id src, snapshot_message m, gate::holder holder) -> future<> {
|
|
try {
|
|
self._snapshots.emplace(m.ins.snp.id, std::move(m.snapshot_payload));
|
|
auto reply = co_await self._client->apply_snapshot(src, std::move(m.ins));
|
|
|
|
self._send(src, snapshot_reply_message{
|
|
.reply = std::move(reply),
|
|
.reply_id = m.reply_id
|
|
});
|
|
} catch (...) {
|
|
tlogger.warn("{}: exception when applying snapshot from {}: {}", self._id, src, std::current_exception());
|
|
}
|
|
|
|
--self._snapshot_applications;
|
|
}(*this, src, std::move(m), _gate.hold());
|
|
},
|
|
[this] (snapshot_reply_message m) {
|
|
auto it = _reply_promises.find(m.reply_id);
|
|
if (it != _reply_promises.end()) {
|
|
std::get<promise<raft::snapshot_reply>>(it->second).set_value(std::move(m.reply));
|
|
}
|
|
},
|
|
[&] (raft::append_request m) {
|
|
c.append_entries(src, std::move(m));
|
|
},
|
|
[&] (raft::append_reply m) {
|
|
c.append_entries_reply(src, std::move(m));
|
|
},
|
|
[&] (raft::vote_request m) {
|
|
c.request_vote(src, std::move(m));
|
|
},
|
|
[&] (raft::vote_reply m) {
|
|
c.request_vote_reply(src, std::move(m));
|
|
},
|
|
[&] (raft::timeout_now m) {
|
|
c.timeout_now_request(src, std::move(m));
|
|
},
|
|
[&] (raft::read_quorum m) {
|
|
c.read_quorum_request(src, std::move(m));
|
|
},
|
|
[&] (raft::read_quorum_reply m) {
|
|
c.read_quorum_reply(src, std::move(m));
|
|
},
|
|
[&] (execute_barrier_on_leader m) {
|
|
static const size_t max_concurrent_read_barrier_executions = 100; // TODO: configurable
|
|
if (_read_barrier_executions >= max_concurrent_read_barrier_executions) {
|
|
tlogger.warn(
|
|
"{}: cannot execute read barrier for {} due to too many concurrent requests, dropping it",
|
|
_id, src);
|
|
// Should we send some message back instead?
|
|
return;
|
|
}
|
|
|
|
++_read_barrier_executions;
|
|
(void)[] (rpc& self, raft::server_id src, execute_barrier_on_leader m, gate::holder holder) -> future<> {
|
|
try {
|
|
auto reply = co_await self._client->execute_read_barrier(src, nullptr);
|
|
|
|
self._send(src, execute_barrier_on_leader_reply{
|
|
.reply = std::move(reply),
|
|
.reply_id = m.reply_id
|
|
});
|
|
} catch (...) {
|
|
tlogger.warn("{}: exception when executing read barrier for {}: {}", self._id, src, std::current_exception());
|
|
}
|
|
|
|
--self._read_barrier_executions;
|
|
}(*this, src, std::move(m), _gate.hold());
|
|
},
|
|
[this] (execute_barrier_on_leader_reply m) {
|
|
auto it = _reply_promises.find(m.reply_id);
|
|
if (it != _reply_promises.end()) {
|
|
std::get<promise<raft::read_barrier_reply>>(it->second).set_value(std::move(m.reply));
|
|
}
|
|
},
|
|
[&] (add_entry_message m) {
|
|
static const size_t max_concurrent_add_entry_executions = 100; // TODO: configurable
|
|
if (_add_entry_executions >= max_concurrent_add_entry_executions) {
|
|
tlogger.warn(
|
|
"{}: cannot execute add_entry for {} due to too many concurrent requests, dropping it",
|
|
_id, src);
|
|
// Should we send some message back instead?
|
|
return;
|
|
}
|
|
|
|
++_add_entry_executions;
|
|
(void)[] (rpc& self, raft::server_id src, add_entry_message m, gate::holder holder) -> future<> {
|
|
try {
|
|
auto reply = co_await self._client->execute_add_entry(src, std::move(m.cmd), nullptr);
|
|
|
|
self._send(src, add_entry_reply_message{
|
|
.reply = std::move(reply),
|
|
.reply_id = m.reply_id
|
|
});
|
|
} catch (...) {
|
|
tlogger.warn("{}: exception when executing add_entry for {}: {}", self._id, src, std::current_exception());
|
|
}
|
|
|
|
--self._add_entry_executions;
|
|
}(*this, src, std::move(m), _gate.hold());
|
|
},
|
|
[this] (add_entry_reply_message m) {
|
|
auto it = _reply_promises.find(m.reply_id);
|
|
if (it != _reply_promises.end()) {
|
|
std::get<promise<raft::add_entry_reply>>(it->second).set_value(std::move(m.reply));
|
|
}
|
|
},
|
|
[&] (modify_config_message m) {
|
|
static const size_t max_concurrent_modify_config_executions = 100; // TODO: configurable
|
|
if (_modify_config_executions >= max_concurrent_modify_config_executions) {
|
|
tlogger.warn(
|
|
"{}: cannot execute modify_config for {} due to too many concurrent requests, dropping it",
|
|
_id, src);
|
|
// Should we send some message back instead?
|
|
return;
|
|
}
|
|
|
|
++_modify_config_executions;
|
|
(void)[] (rpc& self, raft::server_id src, modify_config_message m, gate::holder holder) -> future<> {
|
|
try {
|
|
auto reply = co_await self._client->execute_modify_config(src, std::move(m.add), std::move(m.del), nullptr);
|
|
|
|
self._send(src, add_entry_reply_message{
|
|
.reply = std::move(reply),
|
|
.reply_id = m.reply_id
|
|
});
|
|
} catch (...) {
|
|
tlogger.warn("{}: exception when executing modify_config for {}: {}", self._id, src, std::current_exception());
|
|
}
|
|
|
|
--self._modify_config_executions;
|
|
}(*this, src, std::move(m), _gate.hold());
|
|
}
|
|
), std::move(payload));
|
|
}
|
|
|
|
struct snapshot_not_found {
|
|
raft::snapshot_id id;
|
|
};
|
|
|
|
virtual future<raft::snapshot_reply> send_snapshot(raft::server_id dst, const raft::install_snapshot& ins, seastar::abort_source&) override {
|
|
co_return co_await with_gate([&] () -> future<raft::snapshot_reply> {
|
|
auto it = _snapshots.find(ins.snp.id);
|
|
if (it == _snapshots.end()) {
|
|
throw snapshot_not_found{ .id = ins.snp.id };
|
|
}
|
|
|
|
auto id = _counter++;
|
|
promise<raft::snapshot_reply> p;
|
|
auto f = p.get_future();
|
|
_reply_promises.emplace(id, std::move(p));
|
|
auto guard = defer([this, id] { _reply_promises.erase(id); });
|
|
|
|
_send(dst, snapshot_message{
|
|
.ins = ins,
|
|
.snapshot_payload = it->second,
|
|
.reply_id = id
|
|
});
|
|
|
|
// The message receival function on the other side, when it receives the snapshot message,
|
|
// will apply the snapshot and send `id` back to us in the snapshot reply message (see `receive`,
|
|
// `snapshot_message` case). When we receive the reply, we shall find `id` in `_reply_promises`
|
|
// and push the reply through the promise, which will resolve `f` (see `receive`, `snapshot_reply_message`
|
|
// case).
|
|
|
|
// TODO configurable
|
|
static const raft::logical_clock::duration send_snapshot_timeout = 20_t;
|
|
|
|
// TODO: catch aborts from the abort_source as well
|
|
try {
|
|
co_return co_await _timer.with_timeout(_timer.now() + send_snapshot_timeout, std::move(f));
|
|
} catch (logical_timer::timed_out<raft::snapshot_reply>& e) {
|
|
// The future will probably get a broken_promise exception after we destroy the guard.
|
|
(void)e.get_future().discard_result().handle_exception_type([] (const broken_promise&) {});
|
|
throw timed_out_error{};
|
|
}
|
|
// co_await ensures that `guard` is destroyed before we leave `_gate`
|
|
});
|
|
}
|
|
|
|
virtual future<raft::add_entry_reply> send_add_entry(raft::server_id dst, const raft::command& cmd) override {
|
|
co_return co_await with_gate([&] () -> future<raft::add_entry_reply> {
|
|
auto id = _counter++;
|
|
promise<raft::add_entry_reply> p;
|
|
auto f = p.get_future();
|
|
_reply_promises.emplace(id, std::move(p));
|
|
auto guard = defer([this, id] { _reply_promises.erase(id); });
|
|
|
|
_send(dst, add_entry_message{
|
|
.cmd = cmd,
|
|
.reply_id = id
|
|
});
|
|
|
|
static const raft::logical_clock::duration send_add_entry_timeout = 20_t;
|
|
|
|
try {
|
|
co_return co_await _timer.with_timeout(_timer.now() + send_add_entry_timeout, std::move(f));
|
|
} catch (logical_timer::timed_out<raft::add_entry_reply>& e) {
|
|
(void) e.get_future().discard_result().handle_exception_type([] (const broken_promise&) { });
|
|
throw timed_out_error{};
|
|
}
|
|
});
|
|
}
|
|
virtual future<raft::add_entry_reply> send_modify_config(raft::server_id dst,
|
|
const std::vector<raft::server_address>& add,
|
|
const std::vector<raft::server_id>& del) override {
|
|
co_return co_await with_gate([&] () -> future<raft::add_entry_reply> {
|
|
auto id = _counter++;
|
|
promise<raft::add_entry_reply> p;
|
|
auto f = p.get_future();
|
|
_reply_promises.emplace(id, std::move(p));
|
|
auto guard = defer([this, id] { _reply_promises.erase(id); });
|
|
|
|
_send(dst, modify_config_message{
|
|
.add = add,
|
|
.del = del,
|
|
.reply_id = id
|
|
});
|
|
|
|
static const raft::logical_clock::duration send_modify_config_timeout = 200_t;
|
|
|
|
try {
|
|
co_return co_await _timer.with_timeout(_timer.now() + send_modify_config_timeout, std::move(f));
|
|
} catch (logical_timer::timed_out<raft::add_entry_reply>& e) {
|
|
(void) e.get_future().discard_result().handle_exception_type([] (const broken_promise&) { });
|
|
throw timed_out_error{};
|
|
}
|
|
});
|
|
}
|
|
virtual future<raft::read_barrier_reply> execute_read_barrier_on_leader(raft::server_id dst) override {
|
|
co_return co_await with_gate([&] () -> future<raft::read_barrier_reply> {
|
|
auto id = _counter++;
|
|
promise<raft::read_barrier_reply> p;
|
|
auto f = p.get_future();
|
|
_reply_promises.emplace(id, std::move(p));
|
|
auto guard = defer([this, id] { _reply_promises.erase(id); });
|
|
|
|
_send(dst, execute_barrier_on_leader {
|
|
.reply_id = id
|
|
});
|
|
|
|
// TODO configurable
|
|
static const raft::logical_clock::duration execute_read_barrier_on_leader_timeout = 20_t;
|
|
|
|
// TODO: catch aborts from the abort_source as well
|
|
co_return co_await _timer.with_timeout(_timer.now() + execute_read_barrier_on_leader_timeout, std::move(f));
|
|
// co_await ensures that `guard` is destroyed before we leave `_gate`
|
|
});
|
|
}
|
|
|
|
virtual future<> send_append_entries(raft::server_id dst, const raft::append_request& m) override {
|
|
_send(dst, m);
|
|
co_return;
|
|
}
|
|
|
|
virtual void send_append_entries_reply(raft::server_id dst, const raft::append_reply& m) override {
|
|
_send(dst, m);
|
|
}
|
|
|
|
virtual void send_vote_request(raft::server_id dst, const raft::vote_request& m) override {
|
|
_send(dst, m);
|
|
}
|
|
|
|
virtual void send_vote_reply(raft::server_id dst, const raft::vote_reply& m) override {
|
|
_send(dst, m);
|
|
}
|
|
|
|
virtual void send_timeout_now(raft::server_id dst, const raft::timeout_now& m) override {
|
|
_send(dst, m);
|
|
}
|
|
|
|
virtual void send_read_quorum(raft::server_id dst, const raft::read_quorum& m) override {
|
|
_send(dst, m);
|
|
}
|
|
|
|
virtual void send_read_quorum_reply(raft::server_id dst, const raft::read_quorum_reply& m) override {
|
|
_send(dst, m);
|
|
}
|
|
|
|
virtual void add_server(raft::server_id, raft::server_info) override {
|
|
}
|
|
|
|
virtual void remove_server(raft::server_id) override {
|
|
}
|
|
|
|
virtual future<> abort() override {
|
|
return _gate.close();
|
|
}
|
|
|
|
void tick() {
|
|
_timer.tick();
|
|
}
|
|
};
|
|
|
|
template <typename State>
|
|
class persistence {
|
|
std::pair<raft::snapshot_descriptor, State> _stored_snapshot;
|
|
std::pair<raft::term_t, raft::server_id> _stored_term_and_vote;
|
|
|
|
// Invariants:
|
|
// 1. for each entry except the first, the raft index is equal to the raft index of the previous entry plus one.
|
|
// 2. the index of the first entry is <= _stored_snapshot.first.idx + 1.
|
|
// 3. the index of the last entry is >= _stored_snapshot.first.idx.
|
|
// Informally, the last two invariants say that the stored log intersects or ``touches'' the snapshot ``on the right side''.
|
|
raft::log_entries _stored_entries;
|
|
|
|
// Returns an iterator to the entry in `_stored_entries` whose raft index is `idx` if the entry exists.
|
|
// If all entries in `_stored_entries` have greater indexes, returns the first one.
|
|
// If all entries have smaller indexes, returns end().
|
|
raft::log_entries::iterator find(raft::index_t idx) {
|
|
// The correctness of this depends on the `_stored_entries` invariant.
|
|
auto b = _stored_entries.begin();
|
|
if (b == _stored_entries.end() || (*b)->idx >= idx) {
|
|
return b;
|
|
}
|
|
return b + std::min((idx - (*b)->idx).get_value(), _stored_entries.size());
|
|
}
|
|
|
|
public:
|
|
// If this is the first server of a cluster, it must be initialized with a singleton configuration
|
|
// containing opnly this server's ID which must be also provided here as `init_config_id`.
|
|
// Otherwise it must be initialized with an empty configuration (it will be added to the cluster
|
|
// through a configuration change) and `init_config_id` must be `nullopt`.
|
|
persistence(std::optional<raft::server_id> init_config_id, State init_state)
|
|
: _stored_snapshot(
|
|
raft::snapshot_descriptor{
|
|
.config = init_config_id ? raft::configuration{*init_config_id} : raft::configuration{}
|
|
},
|
|
std::move(init_state))
|
|
, _stored_term_and_vote(raft::term_t{1}, raft::server_id{})
|
|
{}
|
|
|
|
void store_term_and_vote(raft::term_t term, raft::server_id vote) {
|
|
_stored_term_and_vote = std::pair{term, vote};
|
|
}
|
|
|
|
std::pair<raft::term_t, raft::server_id> load_term_and_vote() {
|
|
return _stored_term_and_vote;
|
|
}
|
|
|
|
void store_snapshot(const raft::snapshot_descriptor& snap, State snap_data, size_t preserve_log_entries) {
|
|
// The snapshot's index cannot be smaller than the index of the first stored entry minus one;
|
|
// that would create a ``gap'' in the log.
|
|
assert(_stored_entries.empty() || snap.idx + 1 >= _stored_entries.front()->idx);
|
|
|
|
_stored_snapshot = {snap, std::move(snap_data)};
|
|
|
|
if (!_stored_entries.empty() && snap.idx > _stored_entries.back()->idx) {
|
|
// Clear the log in order to not create a gap.
|
|
_stored_entries.clear();
|
|
return;
|
|
}
|
|
|
|
auto first_to_remain = snap.idx + 1 >= preserve_log_entries ? raft::index_t{snap.idx + 1 - preserve_log_entries} : raft::index_t{0};
|
|
_stored_entries.erase(_stored_entries.begin(), find(first_to_remain));
|
|
}
|
|
|
|
std::pair<raft::snapshot_descriptor, State> load_snapshot() {
|
|
return _stored_snapshot;
|
|
}
|
|
|
|
// Appends `entries` to the stored log. Does nothing for an empty vector.
//
// The raft server is supposed to provide entries in strictly increasing order, so we assert
// that the first new entry directly follows the last stored entry (or the stored snapshot
// if the log is empty) and that the new entries have consecutive indexes.
void store_log_entries(const std::vector<raft::log_entry_ptr>& entries) {
    if (entries.empty()) {
        return;
    }

    // Index that the next appended entry must have.
    auto expected_idx = (_stored_entries.empty()
            ? _stored_snapshot.first.idx
            : _stored_entries.back()->idx) + 1;

    for (const auto& entry : entries) {
        assert(entry->idx == expected_idx);
        _stored_entries.push_back(entry);
        expected_idx = entry->idx + 1;
    }
}
|
|
|
|
// Returns a copy of the stored log entries.
raft::log_entries load_log() {
    raft::log_entries log_copy = _stored_entries;
    return log_copy;
}
|
|
|
|
// Drops the suffix of the stored log that starts at the position `find(idx)` points to.
void truncate_log(raft::index_t idx) {
    const auto first_dropped = find(idx);
    _stored_entries.erase(first_dropped, _stored_entries.end());
}
|
|
};
|
|
|
|
template <typename State>
|
|
class persistence_proxy : public raft::persistence {
|
|
snapshots_t<State>& _snapshots;
|
|
lw_shared_ptr<::persistence<State>> _persistence;
|
|
|
|
public:
|
|
persistence_proxy(snapshots_t<State>& snaps, lw_shared_ptr<::persistence<State>> persistence)
|
|
: _snapshots(snaps)
|
|
, _persistence(std::move(persistence))
|
|
{}
|
|
|
|
virtual future<> store_term_and_vote(raft::term_t term, raft::server_id vote) override {
|
|
_persistence->store_term_and_vote(term, vote);
|
|
co_return;
|
|
}
|
|
|
|
virtual future<std::pair<raft::term_t, raft::server_id>> load_term_and_vote() override {
|
|
co_return _persistence->load_term_and_vote();
|
|
}
|
|
|
|
virtual future<> store_commit_idx(raft::index_t) override {
|
|
co_return;
|
|
}
|
|
|
|
virtual future<raft::index_t> load_commit_idx() override {
|
|
co_return raft::index_t{0};
|
|
}
|
|
|
|
// Stores not only the snapshot descriptor but also the corresponding snapshot.
|
|
virtual future<> store_snapshot_descriptor(const raft::snapshot_descriptor& snap, size_t preserve_log_entries) override {
|
|
auto it = _snapshots.find(snap.id);
|
|
assert(it != _snapshots.end());
|
|
|
|
_persistence->store_snapshot(snap, it->second, preserve_log_entries);
|
|
co_return;
|
|
}
|
|
|
|
// Loads not only the snapshot descriptor but also the corresponding snapshot.
|
|
virtual future<raft::snapshot_descriptor> load_snapshot_descriptor() override {
|
|
auto [snap, state] = _persistence->load_snapshot();
|
|
_snapshots.insert_or_assign(snap.id, std::move(state));
|
|
co_return snap;
|
|
}
|
|
|
|
virtual future<> store_log_entries(const std::vector<raft::log_entry_ptr>& entries) override {
|
|
_persistence->store_log_entries(entries);
|
|
co_return;
|
|
}
|
|
|
|
virtual future<raft::log_entries> load_log() override {
|
|
co_return _persistence->load_log();
|
|
}
|
|
|
|
virtual future<> truncate_log(raft::index_t idx) override {
|
|
_persistence->truncate_log(idx);
|
|
co_return;
|
|
}
|
|
|
|
virtual future<> abort() override {
|
|
// There are no yields anywhere in our methods so no need to wait for anything.
|
|
// We assume that our methods won't be called after `abort()`.
|
|
// TODO: is this assumption correct?
|
|
co_return;
|
|
}
|
|
};
|
|
|
|
// A failure detector using heartbeats for deciding whether to convict a server
|
|
// as failed. We convict a server if we don't receive a heartbeat for a long enough time.
|
|
// `failure_detector` assumes a message-passing method given by a `send_heartbeat_t` function
|
|
// through the constructor for sending heartbeats and assumes that `receive_heartbeat` is called
|
|
// whenever another server sends a message to us.
|
|
// To decide who to send heartbeats to we use the ``current knowledge'' of servers in the network
|
|
// which is updated through `add_server` and `remove_server` functions.
|
|
class failure_detector : public raft::failure_detector {
|
|
public:
|
|
using send_heartbeat_t = std::function<void(raft::server_id dst)>;
|
|
|
|
private:
|
|
raft::logical_clock _clock;
|
|
|
|
// The set of known servers, used to broadcast heartbeats.
|
|
std::unordered_set<raft::server_id> _known;
|
|
|
|
// The last time we received a heartbeat from a server.
|
|
std::unordered_map<raft::server_id, raft::logical_clock::time_point> _last_heard;
|
|
|
|
// The last time we sent a heartbeat.
|
|
raft::logical_clock::time_point _last_beat;
|
|
|
|
// How long from the last received heartbeat does it take to convict a node as dead.
|
|
const raft::logical_clock::duration _convict_threshold;
|
|
|
|
send_heartbeat_t _send_heartbeat;
|
|
|
|
public:
|
|
failure_detector(raft::logical_clock::duration convict_threshold, send_heartbeat_t f)
|
|
: _convict_threshold(convict_threshold), _send_heartbeat(std::move(f))
|
|
{
|
|
send_heartbeats();
|
|
assert(_last_beat == _clock.now());
|
|
}
|
|
|
|
void receive_heartbeat(raft::server_id src) {
|
|
assert(_known.contains(src));
|
|
_last_heard[src] = std::max(_clock.now(), _last_heard[src]);
|
|
}
|
|
|
|
void tick() {
|
|
_clock.advance();
|
|
|
|
// TODO: make it adjustable
|
|
static const raft::logical_clock::duration _heartbeat_period = 10_t;
|
|
|
|
if (_last_beat + _heartbeat_period <= _clock.now()) {
|
|
send_heartbeats();
|
|
}
|
|
}
|
|
|
|
void send_heartbeats() {
|
|
for (auto& dst : _known) {
|
|
_send_heartbeat(dst);
|
|
}
|
|
_last_beat = _clock.now();
|
|
}
|
|
|
|
// We expect a server to be added through this function before we receive a heartbeat from it.
|
|
void add_server(raft::server_id id) {
|
|
_known.insert(id);
|
|
}
|
|
|
|
void remove_server(raft::server_id id) {
|
|
_known.erase(id);
|
|
_last_heard.erase(id);
|
|
}
|
|
|
|
bool is_alive(raft::server_id id) override {
|
|
return _clock.now() < _last_heard[id] + _convict_threshold;
|
|
}
|
|
};
|
|
|
|
// `network` is a simple priority queue of `event`s, where an `event` is a message associated
|
|
// with its planned delivery time. The queue uses a logical clock to decide when to deliver messages.
|
|
// It delives all messages whose associated times are smaller than the ``current time'', the latter
|
|
// determined by the number of `tick()` calls.
|
|
template <typename Payload>
|
|
class network {
|
|
public:
|
|
// When the time comes to deliver a message we use this function.
|
|
using deliver_t = std::function<void(raft::server_id src, raft::server_id dst, const Payload&)>;
|
|
|
|
private:
|
|
struct message {
|
|
raft::server_id src;
|
|
raft::server_id dst;
|
|
|
|
// shared ptr to implement duplication of messages
|
|
lw_shared_ptr<Payload> payload;
|
|
};
|
|
|
|
struct event {
|
|
raft::logical_clock::time_point time;
|
|
message msg;
|
|
};
|
|
|
|
deliver_t _deliver;
|
|
|
|
// A min-heap of event occurences compared by their time points.
|
|
std::vector<event> _events;
|
|
|
|
// Comparator for the `_events` min-heap.
|
|
static bool cmp(const event& o1, const event& o2) {
|
|
return o1.time > o2.time;
|
|
}
|
|
|
|
// A pair (dst, [src1, src2, ...]) in this set denotes that `dst`
|
|
// does not receive messages from src1, src2, ...
|
|
std::unordered_map<raft::server_id, std::unordered_set<raft::server_id>> _grudges;
|
|
|
|
raft::logical_clock _clock;
|
|
|
|
// How long does it take to deliver a message?
|
|
std::uniform_int_distribution<raft::logical_clock::rep> _delivery_delay;
|
|
std::mt19937 _rnd;
|
|
|
|
public:
|
|
network(std::uniform_int_distribution<raft::logical_clock::rep> delivery_delay, std::mt19937 rnd, deliver_t f)
|
|
: _deliver(std::move(f)), _delivery_delay(std::move(delivery_delay)), _rnd(std::move(rnd)) {}
|
|
|
|
void send(raft::server_id src, raft::server_id dst, Payload payload) {
|
|
// Predict the delivery time in advance.
|
|
// Our prediction may be wrong if a grudge exists at this expected moment of delivery.
|
|
// Messages may also be reordered.
|
|
auto delivery_time = _clock.now() + raft::logical_clock::duration{_delivery_delay(_rnd)};
|
|
|
|
_events.push_back(event{delivery_time, message{src, dst, make_lw_shared<Payload>(std::move(payload))}});
|
|
std::push_heap(_events.begin(), _events.end(), cmp);
|
|
}
|
|
|
|
void tick() {
|
|
_clock.advance();
|
|
deliver();
|
|
}
|
|
|
|
void add_grudge(raft::server_id src, raft::server_id dst) {
|
|
_grudges[dst].insert(src);
|
|
}
|
|
|
|
void remove_grudge(raft::server_id src, raft::server_id dst) {
|
|
_grudges[dst].erase(src);
|
|
}
|
|
|
|
private:
|
|
void deliver() {
|
|
// Deliver every message whose time has come.
|
|
while (!_events.empty() && _events.front().time <= _clock.now()) {
|
|
auto& [_, m] = _events.front();
|
|
if (!_grudges[m.dst].contains(m.src)) {
|
|
_deliver(m.src, m.dst, *m.payload);
|
|
} else {
|
|
// A grudge means that we drop the message.
|
|
}
|
|
|
|
std::pop_heap(_events.begin(), _events.end(), cmp);
|
|
_events.pop_back();
|
|
}
|
|
}
|
|
};
|
|
|
|
using reconfigure_result_t = std::variant<std::monostate,
|
|
timed_out_error, raft::not_a_leader, raft::dropped_entry, raft::commit_status_unknown, raft::conf_change_in_progress, raft::stopped_error>;
|
|
|
|
// Asks `server` to change the cluster configuration to exactly the servers in `ids`
// (via `raft::server::set_configuration`), giving up at `timeout` as measured by `timer`.
//
// Returns `std::monostate` on success. Expected failures are returned as values rather
// than thrown:
//  - the raft errors listed in `reconfigure_result_t` are returned as-is,
//  - `broken_promise` is reported as `raft::stopped_error` (workaround, see FIXME below),
//  - `raft::request_aborted` is reported as `timed_out_error`.
// Any other exception is logged and fails an assertion.
future<reconfigure_result_t> reconfigure(
        const std::vector<raft::server_id>& ids,
        raft::logical_clock::time_point timeout,
        logical_timer& timer,
        raft::server& server) {
    raft::server_address_set config;
    for (auto id : ids) {
        config.insert(raft::server_address { .id = id });
    }

    try {
        // The lambda must be `mutable`: otherwise the captured `config` is const inside
        // the call operator and `std::move(config)` silently degrades to a copy.
        co_await with_timeout(timer, timeout, [&server, config = std::move(config)] (abort_source& as) mutable {
            return server.set_configuration(std::move(config), &as);
        });
        co_return std::monostate{};
    } catch (const raft::not_a_leader& e) {
        co_return e;
    } catch (const raft::dropped_entry& e) {
        co_return e;
    } catch (const raft::commit_status_unknown& e) {
        co_return e;
    } catch (const raft::conf_change_in_progress& e) {
        co_return e;
    } catch (const broken_promise&) {
        // FIXME: workaround for #9688
        co_return raft::stopped_error{};
    } catch (const raft::stopped_error& e) {
        co_return e;
    } catch (const raft::request_aborted&) {
        co_return timed_out_error{};
    } catch (...) {
        tlogger.error("unexpected exception from set_configuration: {}", std::current_exception());
        assert(false);
    }
}
|
|
|
|
// Asks `server` to add the servers in `added` to and remove the servers in `deleted` from
// the cluster configuration (via `raft::server::modify_config`), giving up at `timeout`
// as measured by `timer`.
//
// Returns `std::monostate` on success. Expected failures are returned as values rather
// than thrown:
//  - the raft errors listed in `reconfigure_result_t` are returned as-is,
//  - `raft::request_aborted` is reported as `timed_out_error`.
// Any other exception is logged and fails an assertion.
future<reconfigure_result_t> modify_config(
        const std::vector<raft::server_id>& added,
        std::vector<raft::server_id> deleted,
        raft::logical_clock::time_point timeout,
        logical_timer& timer,
        raft::server& server) {
    std::vector<raft::server_address> added_set;
    added_set.reserve(added.size());
    for (auto id : added) {
        added_set.push_back(raft::server_address { .id = id });
    }

    try {
        // `mutable` so that the captured vectors can really be moved into the call.
        co_await with_timeout(timer, timeout, [&server, added_set = std::move(added_set), deleted = std::move(deleted)] (abort_source& as) mutable {
            return server.modify_config(std::move(added_set), std::move(deleted), &as);
        });
        co_return std::monostate{};
    } catch (const raft::not_a_leader& e) {
        co_return e;
    } catch (const raft::dropped_entry& e) {
        co_return e;
    } catch (const raft::commit_status_unknown& e) {
        co_return e;
    } catch (const raft::conf_change_in_progress& e) {
        co_return e;
    } catch (const raft::stopped_error& e) {
        co_return e;
    } catch (const raft::request_aborted&) {
        co_return timed_out_error{};
    } catch (...) {
        tlogger.error("unexpected exception from modify_config: {}", std::current_exception());
        assert(false);
    }
}
|
|
|
|
// Contains a `raft::server` and other facilities needed for it and the underlying
|
|
// modules (persistence, rpc, etc.) to run, and to communicate with the external environment.
|
|
template <PureStateMachine M>
|
|
class raft_server {
|
|
raft::server_id _id;
|
|
|
|
std::unique_ptr<snapshots_t<typename M::state_t>> _snapshots;
|
|
std::unique_ptr<raft::server> _server;
|
|
|
|
// The following objects are owned by _server:
|
|
impure_state_machine<M>& _sm;
|
|
rpc<typename M::state_t>& _rpc;
|
|
|
|
bool _started = false;
|
|
bool _stopped = false;
|
|
|
|
// Used to ensure that when `abort()` returns there are
|
|
// no more in-progress methods running on this object.
|
|
seastar::gate _gate;
|
|
|
|
public:
|
|
// Create a `raft::server` with the given `id` and all other facilities required
|
|
// by the server (the state machine, RPC instance and so on). The server will use
|
|
// `send_rpc` to send RPC messages to other servers and `fd` for failure detection.
|
|
//
|
|
// The server is started with `persistence` as its underlying persistent storage.
|
|
// This can be used to simulate a server that is restarting by giving it a `persistence`
|
|
// that was previously used by a different instance of `raft_server<M>` (but make sure
|
|
// they had the same `id` and that the previous instance is no longer using this
|
|
// `persistence`).
|
|
//
|
|
// The created server is not started yet; use `start` for that.
|
|
static std::unique_ptr<raft_server> create(
|
|
raft::server_id id,
|
|
lw_shared_ptr<persistence<typename M::state_t>> persistence,
|
|
shared_ptr<failure_detector> fd,
|
|
raft::server::configuration cfg,
|
|
typename rpc<typename M::state_t>::send_message_t send_rpc) {
|
|
using state_t = typename M::state_t;
|
|
|
|
auto snapshots = std::make_unique<snapshots_t<state_t>>();
|
|
auto sm = std::make_unique<impure_state_machine<M>>(id, *snapshots);
|
|
auto rpc_ = std::make_unique<rpc<state_t>>(id, *snapshots, std::move(send_rpc));
|
|
auto persistence_ = std::make_unique<persistence_proxy<state_t>>(*snapshots, std::move(persistence));
|
|
|
|
auto& sm_ref = *sm;
|
|
auto& rpc_ref = *rpc_;
|
|
|
|
auto server = raft::create_server(
|
|
id, std::move(rpc_), std::move(sm), std::move(persistence_), std::move(fd),
|
|
std::move(cfg));
|
|
|
|
return std::make_unique<raft_server>(initializer{
|
|
._id = id,
|
|
._snapshots = std::move(snapshots),
|
|
._server = std::move(server),
|
|
._sm = sm_ref,
|
|
._rpc = rpc_ref
|
|
});
|
|
}
|
|
|
|
~raft_server() {
|
|
assert(!_started || _stopped);
|
|
}
|
|
|
|
raft_server(const raft_server&&) = delete;
|
|
raft_server(raft_server&&) = delete;
|
|
|
|
// Start the server. Can be called at most once.
|
|
future<> start() {
|
|
assert(!_started);
|
|
_started = true;
|
|
|
|
co_await _server->start();
|
|
}
|
|
|
|
// Stop the given server. Must be called before the server is destroyed
|
|
// (unless it was never started in the first place).
|
|
future<> abort() {
|
|
auto f = _gate.close();
|
|
// Abort everything before waiting on the gate close future
|
|
// so currently running operations finish earlier.
|
|
if (_started) {
|
|
co_await _server->abort();
|
|
}
|
|
co_await std::move(f);
|
|
_stopped = true;
|
|
}
|
|
|
|
void tick() {
|
|
assert(_started);
|
|
_rpc.tick();
|
|
_server->tick();
|
|
}
|
|
|
|
future<call_result_t<M>> call(
|
|
typename M::input_t input,
|
|
raft::logical_clock::time_point timeout,
|
|
logical_timer& timer) {
|
|
assert(_started);
|
|
try {
|
|
co_return co_await with_gate(_gate, [this, input = std::move(input), timeout, &timer] {
|
|
return ::call(std::move(input), timeout, timer, *_server, _sm);
|
|
});
|
|
} catch (const gate_closed_exception&) {
|
|
co_return raft::stopped_error{};
|
|
}
|
|
}
|
|
|
|
future<reconfigure_result_t> reconfigure(
|
|
const std::vector<raft::server_id>& ids,
|
|
raft::logical_clock::time_point timeout,
|
|
logical_timer& timer) {
|
|
assert(_started);
|
|
try {
|
|
co_return co_await with_gate(_gate, [this, &ids, timeout, &timer] {
|
|
return ::reconfigure(ids, timeout, timer, *_server);
|
|
});
|
|
} catch (const gate_closed_exception&) {
|
|
co_return raft::stopped_error{};
|
|
}
|
|
}
|
|
|
|
future<reconfigure_result_t> modify_config(
|
|
const std::vector<raft::server_id>& added,
|
|
std::vector<raft::server_id> deleted,
|
|
raft::logical_clock::time_point timeout,
|
|
logical_timer& timer) {
|
|
assert(_started);
|
|
try {
|
|
co_return co_await with_gate(_gate, [this, &added, deleted = std::move(deleted), timeout, &timer] {
|
|
return ::modify_config(added, std::move(deleted), timeout, timer, *_server);
|
|
});
|
|
} catch (const gate_closed_exception&) {
|
|
co_return raft::stopped_error{};
|
|
}
|
|
}
|
|
|
|
bool is_leader() const {
|
|
return _server->is_leader();
|
|
}
|
|
|
|
raft::server_id id() const {
|
|
return _id;
|
|
}
|
|
|
|
const typename M::state_t& state() const {
|
|
return _sm.state();
|
|
}
|
|
|
|
raft::configuration get_configuration() const {
|
|
return _server->get_configuration();
|
|
}
|
|
|
|
void deliver(raft::server_id src, const typename rpc<typename M::state_t>::message_t& m) {
|
|
assert(_started);
|
|
if (!_gate.is_closed()) {
|
|
_rpc.receive(src, m);
|
|
}
|
|
}
|
|
|
|
private:
|
|
struct initializer {
|
|
raft::server_id _id;
|
|
|
|
std::unique_ptr<snapshots_t<typename M::state_t>> _snapshots;
|
|
std::unique_ptr<raft::server> _server;
|
|
|
|
impure_state_machine<M>& _sm;
|
|
rpc<typename M::state_t>& _rpc;
|
|
};
|
|
|
|
raft_server(initializer i)
|
|
: _id(i._id)
|
|
, _snapshots(std::move(i._snapshots))
|
|
, _server(std::move(i._server))
|
|
, _sm(i._sm)
|
|
, _rpc(i._rpc) {
|
|
}
|
|
|
|
friend std::unique_ptr<raft_server> std::make_unique<raft_server, raft_server::initializer>(initializer&&);
|
|
};
|
|
|
|
// Converts a positive integer into a Raft server ID whose UUID is built as `utils::UUID{0, id}`.
// `id` must be non-zero because Raft uses UUID 0 as a special case.
static raft::server_id to_raft_id(size_t id) {
    assert(id > 0);
    const auto uuid = utils::UUID{0, id};
    return raft::server_id{uuid};
}
|
|
|
|
struct environment_config {
|
|
std::mt19937 rnd;
|
|
std::uniform_int_distribution<raft::logical_clock::rep> network_delay;
|
|
raft::logical_clock::duration fd_convict_threshold;
|
|
};
|
|
|
|
// A set of `raft_server`s connected by a `network`.
|
|
//
|
|
// The `network` is initialized with a message delivery function
|
|
// which notifies the destination's failure detector on each message
|
|
// and if the message contains an RPC payload, pushes it into the destination's
|
|
// `delivery_queue`.
|
|
//
|
|
// Needs to be periodically `tick()`ed which ticks the network
|
|
// and underlying servers.
|
|
template <PureStateMachine M>
|
|
class environment : public seastar::weakly_referencable<environment<M>> {
|
|
using input_t = typename M::output_t;
|
|
using state_t = typename M::state_t;
|
|
using output_t = typename M::output_t;
|
|
|
|
// Invariant: if `_server` is engaged then it uses `_persistence` and `_fd`
|
|
// underneath and is initialized using `_cfg`.
|
|
struct route {
|
|
raft::server::configuration _cfg;
|
|
lw_shared_ptr<persistence<state_t>> _persistence;
|
|
std::unique_ptr<raft_server<M>> _server;
|
|
shared_ptr<failure_detector> _fd;
|
|
};
|
|
|
|
// Passed to newly created failure detectors.
|
|
const raft::logical_clock::duration _fd_convict_threshold;
|
|
|
|
// Used to deliver messages coming from the network to appropriate servers and their failure detectors.
|
|
// Also keeps the servers and the failure detectors alive (owns them).
|
|
// Before we show a Raft server to others we must add it to this map.
|
|
std::unordered_map<raft::server_id, route> _routes;
|
|
|
|
// Used to create a new ID in `new_server`.
|
|
size_t _next_id = 1;
|
|
|
|
// Engaged optional: RPC message, nullopt: heartbeat
|
|
using message_t = std::optional<typename rpc<state_t>::message_t>;
|
|
network<message_t> _network;
|
|
|
|
bool _stopped = false;
|
|
|
|
// Used to ensure that when `abort()` returns there are
|
|
// no more in-progress methods running on this object.
|
|
seastar::gate _gate;
|
|
|
|
// Used to implement `crash`.
|
|
//
|
|
// We cannot destroy a server immediately in order to simulate a crash:
|
|
// there may be fibers running that use the server's internals.
|
|
// We move these 'crashed' servers into continuations attached to this fiber
|
|
// and abort them there before destruction.
|
|
future<> _crash_fiber = make_ready_future<>();
|
|
|
|
// Servers that are aborting in the background (in `_crash_fiber`).
|
|
// We need these pointers so we keep ticking the servers
|
|
// (in general, `abort()` requires the server to be ticked in order to finish).
|
|
// One downside of this is that ticks may cause the servers to output traces.
|
|
// Hopefully these crashing servers abort quickly so they don't stay too long
|
|
// and make the logs unreadable...
|
|
std::unordered_set<raft_server<M>*> _crashing_servers;
|
|
|
|
public:
|
|
environment(environment_config cfg)
|
|
: _fd_convict_threshold(cfg.fd_convict_threshold)
|
|
, _network(std::move(cfg.network_delay), std::move(cfg.rnd),
|
|
[this] (raft::server_id src, raft::server_id dst, const message_t& m) {
|
|
auto& n = _routes.at(dst);
|
|
assert(n._persistence);
|
|
assert(n._fd);
|
|
|
|
if (n._server) {
|
|
n._fd->receive_heartbeat(src);
|
|
if (m) {
|
|
n._server->deliver(src, *m);
|
|
}
|
|
}
|
|
}) {
|
|
}
|
|
|
|
~environment() {
|
|
assert(_routes.empty() || _stopped);
|
|
}
|
|
|
|
environment(const environment&) = delete;
|
|
environment(environment&&) = delete;
|
|
|
|
void tick_network() {
|
|
_network.tick();
|
|
}
|
|
|
|
template <std::invocable<raft::server_id, raft_server<M>*, failure_detector&> F>
|
|
void for_each_server(F&& f) {
|
|
for (auto& [id, r]: _routes) {
|
|
assert(r._fd);
|
|
f(id, r._server.get(), *r._fd);
|
|
}
|
|
}
|
|
|
|
// Call this periodically so `abort()` can finish for 'crashed' servers.
|
|
void tick_crashing_servers() {
|
|
for (auto& srv: _crashing_servers) {
|
|
srv->tick();
|
|
}
|
|
}
|
|
|
|
void tick_servers() {
|
|
for_each_server([] (raft::server_id, raft_server<M>* srv, failure_detector& fd) {
|
|
if (srv) {
|
|
srv->tick();
|
|
}
|
|
fd.tick();
|
|
});
|
|
|
|
tick_crashing_servers();
|
|
}
|
|
|
|
// A 'node' is a container for a Raft server, its storage ('persistence') and failure detector.
|
|
// At a given point in time at most one Raft server instance can be running on a node.
|
|
// Different instances may be running at different points in time, but they will all have
|
|
// the same ID (returned by `new_node`) and will reuse the same storage and failure detector
|
|
// (this can be used to simulate a server that is restarting).
|
|
//
|
|
// The storage is initialized when the node is created and will be used by the first started server.
|
|
// If `first == true` the storage is created with a singleton server configuration containing only
|
|
// the ID returned from the function. Otherwise it is created with an empty configuration
|
|
// (a server started on this node will have to be joined to an existing cluster in this case).
|
|
raft::server_id new_node(bool first, raft::server::configuration cfg) {
|
|
_gate.check();
|
|
|
|
auto id = to_raft_id(_next_id++);
|
|
auto [it, inserted] = _routes.emplace(id, route{
|
|
._cfg = std::move(cfg),
|
|
._persistence = make_lw_shared<persistence<state_t>>(first ? std::optional{id} : std::nullopt, M::init),
|
|
._server = nullptr,
|
|
._fd = nullptr,
|
|
});
|
|
assert(inserted);
|
|
auto& n = it->second;
|
|
|
|
n._fd = seastar::make_shared<failure_detector>(_fd_convict_threshold,
|
|
[id, &n, this] (raft::server_id dst) {
|
|
// Ping others only if a server is running.
|
|
if (n._server) {
|
|
_network.send(id, dst, std::nullopt);
|
|
}
|
|
});
|
|
|
|
// Add us to other servers' failure detectors.
|
|
for (auto& [_, r] : _routes) {
|
|
r._fd->add_server(id);
|
|
}
|
|
|
|
// Add other servers to our failure detector.
|
|
for (auto& [id, _] : _routes) {
|
|
n._fd->add_server(id);
|
|
}
|
|
|
|
return id;
|
|
}
|
|
|
|
// Starts a server on node `id`.
|
|
// Assumes node with `id` exists (i.e. an earlier `new_node` call returned `id`) and that no server is running on node `id`.
|
|
future<> start_server(raft::server_id id) {
|
|
return with_gate(_gate, [this, id] () -> future<> {
|
|
auto& n = _routes.at(id);
|
|
assert(n._persistence);
|
|
assert(n._fd);
|
|
assert(!n._server);
|
|
|
|
lw_shared_ptr<raft_server<M>*> this_srv_addr = make_lw_shared<raft_server<M>*>(nullptr);
|
|
auto srv = raft_server<M>::create(id, n._persistence, n._fd, n._cfg,
|
|
[id, this_srv_addr, &n, this] (raft::server_id dst, typename rpc<state_t>::message_t m) {
|
|
// Allow the message out only if we are still the currently running server on this node.
|
|
if (*this_srv_addr == n._server.get()) {
|
|
_network.send(id, dst, {std::move(m)});
|
|
}
|
|
});
|
|
*this_srv_addr = srv.get();
|
|
|
|
co_await srv->start();
|
|
n._server = std::move(srv);
|
|
});
|
|
}
|
|
|
|
// Creates a new node, connects it to the network, starts a server on it and returns its ID.
|
|
//
|
|
// If `first == true` the node is created with a singleton configuration containing only its ID.
|
|
// Otherwise it is created with an empty configuration. The user must explicitly ask for a configuration change
|
|
// if they want to make a cluster (group) out of this server and other existing servers.
|
|
// The user should be able to create multiple clusters by calling `new_server` multiple times with `first = true`.
|
|
// (`first` means ``first in group'').
|
|
future<raft::server_id> new_server(bool first, raft::server::configuration cfg = {}) {
|
|
auto id = new_node(first, std::move(cfg));
|
|
// not using co_await here due to miscompile
|
|
return start_server(id).then([id] () { return id; });
|
|
}
|
|
|
|
// Gracefully stop a running server.
|
|
// Assumes a server is currently running on the node `id`.
|
|
// When the future resolves, a new server may be started on this node. It will reuse the storage
|
|
// of the previously running server (so the Raft log etc. will be preserved).
|
|
future<> stop(raft::server_id id) {
|
|
return with_gate(_gate, [this, id] () -> future<> {
|
|
auto& n = _routes.at(id);
|
|
assert(n._persistence);
|
|
assert(n._server);
|
|
assert(n._fd);
|
|
|
|
co_await n._server->abort();
|
|
n._server = nullptr;
|
|
});
|
|
}
|
|
|
|
// Immediately stop a running server.
|
|
// Assumes a server is currently running on the node `id`.
|
|
// A new server may be started on this node when the function returns. It will reuse the storage
|
|
// of the previously running server (so the Raft log etc. will be preserved).
|
|
void crash(raft::server_id id) {
|
|
_gate.check();
|
|
|
|
auto& n = _routes.at(id);
|
|
assert(n._persistence);
|
|
assert(n._server);
|
|
assert(n._fd);
|
|
|
|
// Let the 'crashed' server continue working on its copy of persistence;
|
|
// none of that work will be seen by later servers restarted on this node
|
|
// since they'll use a separate copy.
|
|
n._persistence = make_lw_shared<persistence<state_t>>(*n._persistence);
|
|
// Setting `n._server` to nullptr cuts out the network access both for the server and failure detector.
|
|
// Even though the server will continue running for some time (in order to be gracefully aborted),
|
|
// none of that work will be seen by the rest of the environment. From others' point of view
|
|
// the server is immediately gone.
|
|
auto srv = std::exchange(n._server, nullptr);
|
|
_crashing_servers.insert(srv.get());
|
|
|
|
auto f = std::bind_front([] (environment<M>& self, std::unique_ptr<raft_server<M>> srv) -> future<> {
|
|
tlogger.trace("crash fiber: aborting {}", srv->id());
|
|
co_await srv->abort();
|
|
tlogger.trace("crash fiber: finished aborting {}", srv->id());
|
|
self._crashing_servers.erase(srv.get());
|
|
// abort() ensures there are no in-progress calls on the server, so we can destroy it.
|
|
}, std::ref(*this), std::move(srv));
|
|
|
|
// Cannot do `.then(std::move(f))`, because that would try to use `f()`, which is ill-formed (seastar#1005).
|
|
_crash_fiber = _crash_fiber.then([f = std::move(f)] () mutable { return std::move(f)(); });
|
|
}
|
|
|
|
bool is_leader(raft::server_id id) {
|
|
auto& n = _routes.at(id);
|
|
if (!n._server) {
|
|
return false;
|
|
}
|
|
return n._server->is_leader();
|
|
}
|
|
|
|
future<call_result_t<M>> call(
|
|
raft::server_id id,
|
|
typename M::input_t input,
|
|
raft::logical_clock::time_point timeout,
|
|
logical_timer& timer) {
|
|
auto& n = _routes.at(id);
|
|
if (!n._server) {
|
|
// A 'remote' caller doesn't know in general if the server is down or just slow to respond.
|
|
// Simulate this by timing out the call.
|
|
co_await timer.sleep_until(timeout);
|
|
co_return timed_out_error{};
|
|
}
|
|
|
|
auto srv = n._server.get();
|
|
auto res = co_await srv->call(std::move(input), timeout, timer);
|
|
|
|
if (srv != n._server.get()) {
|
|
// The server stopped while the call was happening.
|
|
// As above, we simulate a 'remote' call by timing it out in this case.
|
|
co_await timer.sleep_until(timeout);
|
|
co_return timed_out_error{};
|
|
}
|
|
co_return res;
|
|
}
|
|
|
|
future<reconfigure_result_t> reconfigure(
|
|
raft::server_id id,
|
|
const std::vector<raft::server_id>& ids,
|
|
raft::logical_clock::time_point timeout,
|
|
logical_timer& timer) {
|
|
auto& n = _routes.at(id);
|
|
if (!n._server) {
|
|
// A 'remote' caller doesn't know in general if the server is down or just slow to respond.
|
|
// Simulate this by timing out the call.
|
|
co_await timer.sleep_until(timeout);
|
|
co_return timed_out_error{};
|
|
}
|
|
|
|
auto srv = n._server.get();
|
|
auto res = co_await srv->reconfigure(ids, timeout, timer);
|
|
|
|
if (srv != n._server.get()) {
|
|
// The server stopped while the call was happening.
|
|
// As above, we simulate a 'remote' call by timing it out in this case.
|
|
co_await timer.sleep_until(timeout);
|
|
co_return timed_out_error{};
|
|
}
|
|
co_return res;
|
|
}
|
|
|
|
future<reconfigure_result_t> modify_config(
|
|
raft::server_id id,
|
|
const std::vector<raft::server_id>& added,
|
|
std::vector<raft::server_id> deleted,
|
|
raft::logical_clock::time_point timeout,
|
|
logical_timer& timer) {
|
|
auto& n = _routes.at(id);
|
|
if (!n._server) {
|
|
// A 'remote' caller doesn't know in general if the server is down or just slow to respond.
|
|
// Simulate this by timing out the call.
|
|
co_await timer.sleep_until(timeout);
|
|
co_return timed_out_error{};
|
|
}
|
|
|
|
auto srv = n._server.get();
|
|
auto res = co_await srv->modify_config(added, std::move(deleted), timeout, timer);
|
|
|
|
if (srv != n._server.get()) {
|
|
// The server stopped while the call was happening.
|
|
// As above, we simulate a 'remote' call by timing it out in this case.
|
|
co_await timer.sleep_until(timeout);
|
|
co_return timed_out_error{};
|
|
}
|
|
co_return res;
|
|
}
|
|
|
|
std::optional<raft::configuration> get_configuration(raft::server_id id) {
|
|
auto& n = _routes.at(id);
|
|
if (!n._server) {
|
|
return std::nullopt;
|
|
}
|
|
return n._server->get_configuration();
|
|
}
|
|
|
|
network<message_t>& get_network() {
|
|
return _network;
|
|
}
|
|
|
|
// Must be called before we are destroyed unless `new_server` was never called.
|
|
future<> abort() {
|
|
// Close the gate before iterating over _routes to prevent concurrent modification by other methods.
|
|
co_await _gate.close();
|
|
for (auto& [_, r] : _routes) {
|
|
if (r._server) {
|
|
co_await r._server->abort();
|
|
r._server = nullptr;
|
|
}
|
|
}
|
|
co_await std::move(_crash_fiber);
|
|
_stopped = true;
|
|
}
|
|
};
|
|
|
|
template <PureStateMachine M, std::invocable<environment<M>&, ticker&> F>
|
|
auto with_env_and_ticker(environment_config cfg, F f) {
|
|
return do_with(std::move(f), std::make_unique<environment<M>>(std::move(cfg)), std::make_unique<ticker>(tlogger),
|
|
[] (F& f, std::unique_ptr<environment<M>>& env, std::unique_ptr<ticker>& t) {
|
|
return f(*env, *t).finally([&env_ = env, &t_ = t] () mutable -> future<> {
|
|
// move into coroutine body so they don't get destroyed with the lambda (on first co_await)
|
|
auto& env = env_;
|
|
auto& t = t_;
|
|
|
|
// We abort the environment before the ticker as the environment may require time to advance
|
|
// in order to finish (e.g. some operations may need to timeout).
|
|
tlogger.info("aborting environment");
|
|
co_await env->abort();
|
|
tlogger.info("environment aborted, aborting ticker");
|
|
co_await t->abort();
|
|
tlogger.info("ticker aborted");
|
|
});
|
|
});
|
|
}
|
|
|
|
// A pure state machine modelling a single 32-bit register that supports
// `read` and `exchange` operations.
struct ExReg {
    // Replaces the state with `x` and returns the previous state.
    struct exchange { int32_t x; };

    // Returns the state.
    struct read {};

    // Return value for `exchange` or `read`.
    struct ret { int32_t x; };

    using state_t = int32_t;
    using input_t = std::variant<read, exchange>;
    using output_t = ret;

    // Transition function: given the current state and an input, returns
    // the pair (next state, output). The output is always the state from before the transition.
    static std::pair<state_t, output_t> delta(state_t curr, input_t input) {
        using res_t = std::pair<state_t, output_t>;

        if (const auto* xchg = std::get_if<exchange>(&input)) {
            // exchange: the register takes the new value.
            return res_t{xchg->x, ret{curr}};
        }
        // read: the register is left untouched.
        return res_t{curr, ret{curr}};
    }

    // The initial state of the register.
    static const state_t init;
};

const ExReg::state_t ExReg::init = 0;
|
|
|
|
namespace ser {
|
|
template <>
|
|
struct serializer<ExReg::exchange> {
|
|
template <typename Output>
|
|
static void write(Output& buf, const ExReg::exchange& op) { serializer<int32_t>::write(buf, op.x); };
|
|
|
|
template <typename Input>
|
|
static ExReg::exchange read(Input& buf) { return { serializer<int32_t>::read(buf) }; }
|
|
|
|
template <typename Input>
|
|
static void skip(Input& buf) { serializer<int32_t>::skip(buf); }
|
|
};
|
|
|
|
template <>
|
|
struct serializer<ExReg::read> {
|
|
template <typename Output>
|
|
static void write(Output& buf, const ExReg::read&) {};
|
|
|
|
template <typename Input>
|
|
static ExReg::read read(Input& buf) { return {}; }
|
|
|
|
template <typename Input>
|
|
static void skip(Input& buf) {}
|
|
};
|
|
}
|
|
|
|
// Two `ExReg::ret` values compare equal when they carry the same integer.
bool operator==(ExReg::ret a, ExReg::ret b) {
    return a.x == b.x;
}
|
|
|
|
// Prints an `ExReg::ret` as `ret{<x>}`.
std::ostream& operator<<(std::ostream& os, const ExReg::ret& r) {
    os << format("ret{{{}}}", r.x);
    return os;
}
|
|
|
|
// Prints an `ExReg::read` as the literal text `read`.
std::ostream& operator<<(std::ostream& os, const ExReg::read&) {
    os << "read";
    return os;
}
|
|
|
|
// Prints an `ExReg::exchange` as `xng{<x>}`.
std::ostream& operator<<(std::ostream& os, const ExReg::exchange& e) {
    os << format("xng{{{}}}", e.x);
    return os;
}
|
|
|
|
// Wait until either one of `nodes` in `env` becomes a leader, or time point `timeout` is reached according to `timer` (whichever happens first).
|
|
// If the leader is found, returns it. Otherwise throws a `logical_timer::timed_out` exception.
|
|
//
|
|
// Note: the returned node may have been a leader the moment we found it, but may have just stepped down
|
|
// the moment we return it. It may be useful to call this function multiple times during cluster
|
|
// stabilization periods in order to find a node that will successfully answer calls.
|
|
template <PureStateMachine M>
|
|
struct wait_for_leader {
|
|
// FIXME: change into free function after clang bug #50345 is fixed
|
|
future<raft::server_id> operator()(
|
|
environment<M>& env,
|
|
std::vector<raft::server_id> nodes,
|
|
logical_timer& timer,
|
|
raft::logical_clock::time_point timeout) {
|
|
auto l = co_await timer.with_timeout(timeout, [] (weak_ptr<environment<M>> env, std::vector<raft::server_id> nodes) -> future<raft::server_id> {
|
|
while (true) {
|
|
if (!env) {
|
|
co_return raft::server_id{};
|
|
}
|
|
|
|
auto it = std::find_if(nodes.begin(), nodes.end(), [&env] (raft::server_id id) { return env->is_leader(id); });
|
|
if (it != nodes.end()) {
|
|
co_return *it;
|
|
}
|
|
|
|
co_await seastar::yield();
|
|
}
|
|
}(env.weak_from_this(), std::move(nodes)));
|
|
|
|
assert(l != raft::server_id{});
|
|
|
|
// Note: `l` may no longer be a leader at this point if there was a yield at the `co_await` above
|
|
// and `l` decided to step down, was restarted, or just got removed from the configuration.
|
|
|
|
co_return l;
|
|
}
|
|
};
|
|
|
|
// Smoke test: runs exchanges against a single-server cluster, grows it to three servers,
// then repeats the exchanges and finishes with a batch of concurrent reads.
//
// Every call that has side effects is awaited into a local before being checked:
// an expression inside `assert(...)` is not evaluated at all when NDEBUG is defined,
// which would silently drop the calls themselves.
SEASTAR_TEST_CASE(basic_test) {
    logical_timer timer;
    environment_config cfg {
        .rnd{0},
        .network_delay{5, 5},
        .fd_convict_threshold = 50_t,
    };
    co_await with_env_and_ticker<ExReg>(cfg, [&timer] (environment<ExReg>& env, ticker& t) -> future<> {
        using output_t = typename ExReg::output_t;

        // The network and the timer advance on every tick, the servers on every 10th tick.
        t.start([&] (uint64_t tick) {
            env.tick_network();
            timer.tick();
            if (tick % 10 == 0) {
                env.tick_servers();
            }
        }, 10'000);

        auto leader_id = co_await env.new_server(true);

        // Wait at most 1000 ticks for the server to elect itself as a leader.
        auto elected = co_await wait_for_leader<ExReg>{}(env, {leader_id}, timer, timer.now() + 1000_t);
        assert(elected == leader_id);

        auto call = [&] (ExReg::input_t input, raft::logical_clock::duration timeout) {
            return env.call(leader_id, std::move(input), timer.now() + timeout, timer);
        };

        auto eq = [] (const call_result_t<ExReg>& r, const output_t& expected) {
            return std::holds_alternative<output_t>(r) && std::get<output_t>(r) == expected;
        };

        // Each exchange must return the value stored by the previous one.
        for (int i = 1; i <= 100; ++i) {
            auto res = co_await call(ExReg::exchange{i}, 100_t);
            assert(eq(res, ExReg::ret{i - 1}));
        }

        tlogger.debug("100 exchanges - single server - passed");

        auto id2 = co_await env.new_server(false);
        auto id3 = co_await env.new_server(false);

        tlogger.debug("Started 2 more servers, changing configuration");

        auto reconf_res = co_await env.reconfigure(leader_id, {leader_id, id2, id3}, timer.now() + 100_t, timer);
        assert(std::holds_alternative<std::monostate>(reconf_res));

        tlogger.debug("Configuration changed");

        // Reset the register so the loop below can reuse the same expectations.
        co_await call(ExReg::exchange{0}, 100_t);
        for (int i = 1; i <= 100; ++i) {
            auto res = co_await call(ExReg::exchange{i}, 100_t);
            assert(eq(res, ExReg::ret{i - 1}));
        }

        tlogger.debug("100 exchanges - three servers - passed");

        // concurrent calls
        std::vector<future<call_result_t<ExReg>>> futs;
        futs.reserve(100);
        for (int i = 0; i < 100; ++i) {
            futs.push_back(call(ExReg::read{}, 100_t));
            co_await timer.sleep(2_t);
        }
        for (int i = 0; i < 100; ++i) {
            auto res = co_await std::move(futs[i]);
            assert(eq(res, ExReg::ret{100}));
        }

        tlogger.debug("100 concurrent reads - three servers - passed");
    });

    tlogger.debug("Finished");
}
|
|
|
|
// A snapshot was being taken with the wrong term (current term instead of the term at the snapshotted index).
|
|
// This is a regression test for that bug.
|
|
SEASTAR_TEST_CASE(snapshot_uses_correct_term_test) {
|
|
logical_timer timer;
|
|
environment_config cfg {
|
|
.rnd{0},
|
|
.network_delay{1, 1},
|
|
.fd_convict_threshold = 10_t,
|
|
};
|
|
co_await with_env_and_ticker<ExReg>(cfg, [&timer] (environment<ExReg>& env, ticker& t) -> future<> {
|
|
t.start([&] (uint64_t tick) {
|
|
env.tick_network();
|
|
timer.tick();
|
|
if (tick % 10 == 0) {
|
|
env.tick_servers();
|
|
}
|
|
}, 10'000);
|
|
|
|
auto id1 = co_await env.new_server(true,
|
|
raft::server::configuration{
|
|
// It's easier to catch the problem when we send entries one by one, not in batches.
|
|
.append_request_threshold = 1,
|
|
});
|
|
assert(co_await wait_for_leader<ExReg>{}(env, {id1}, timer, timer.now() + 1000_t) == id1);
|
|
|
|
auto id2 = co_await env.new_server(false,
|
|
raft::server::configuration{
|
|
.append_request_threshold = 1,
|
|
});
|
|
|
|
assert(std::holds_alternative<std::monostate>(
|
|
co_await env.reconfigure(id1, {id1, id2}, timer.now() + 100_t, timer)));
|
|
|
|
// Append a bunch of entries
|
|
for (int i = 1; i <= 10; ++i) {
|
|
assert(std::holds_alternative<typename ExReg::ret>(
|
|
co_await env.call(id1, ExReg::exchange{0}, timer.now() + 100_t, timer)));
|
|
}
|
|
|
|
assert(env.is_leader(id1));
|
|
|
|
// Force a term increase by partitioning the network and waiting for the leader to step down
|
|
tlogger.trace("add grudge");
|
|
env.get_network().add_grudge(id2, id1);
|
|
env.get_network().add_grudge(id1, id2);
|
|
|
|
while (env.is_leader(id1)) {
|
|
co_await seastar::yield();
|
|
}
|
|
|
|
tlogger.trace("remove grudge");
|
|
env.get_network().remove_grudge(id2, id1);
|
|
env.get_network().remove_grudge(id1, id2);
|
|
|
|
auto l = co_await wait_for_leader<ExReg>{}(env, {id1, id2}, timer, timer.now() + 1000_t);
|
|
tlogger.trace("last leader: {}", l);
|
|
|
|
// Now the current term is greater than the term of the first couple of entries.
|
|
// Join another server with a small snapshot_threshold.
|
|
// The leader will send entries to this server one by one (due to small append_request_threshold),
|
|
// so the joining server will apply entries one by one or in small batches (depends on the timing),
|
|
// making it likely that it decides to take a snapshot at an entry with term lower than the current one.
|
|
// If we are (un)lucky and we take a snapshot at the last appended entry, the node will refuse all
|
|
// later append_entries requests due to non-matching term at the last appended entry. Note: due to this
|
|
// requirement, the test is nondeterministic and doesn't always catch the bug (it depends on a race
|
|
// between applier_fiber and io_fiber), but it does catch it in a significant number of runs.
|
|
// It's also a lot easier to catch this in dev than in debug, for instance.
|
|
// If we catch the bug, the reconfigure request below will time out.
|
|
|
|
auto id3 = co_await env.new_server(false,
|
|
raft::server::configuration{
|
|
.snapshot_threshold = 5,
|
|
.snapshot_trailing = 2,
|
|
});
|
|
assert(std::holds_alternative<std::monostate>(
|
|
co_await env.reconfigure(l, {l, id3}, timer.now() + 1000_t, timer)));
|
|
});
|
|
}
|
|
|
|
// Regression test for the following bug: when we took a snapshot, we forgot to save the configuration.
|
|
// This caused each node in the cluster to eventually forget the cluster configuration.
|
|
SEASTAR_TEST_CASE(snapshotting_preserves_config_test) {
|
|
logical_timer timer;
|
|
environment_config cfg {
|
|
.rnd{0},
|
|
.network_delay{1, 1},
|
|
.fd_convict_threshold = 10_t,
|
|
};
|
|
co_await with_env_and_ticker<ExReg>(cfg, [&timer] (environment<ExReg>& env, ticker& t) -> future<> {
|
|
t.start([&] (uint64_t tick) {
|
|
env.tick_network();
|
|
timer.tick();
|
|
if (tick % 10 == 0) {
|
|
env.tick_servers();
|
|
}
|
|
}, 10'000);
|
|
|
|
auto id1 = co_await env.new_server(true,
|
|
raft::server::configuration{
|
|
.snapshot_threshold = 5,
|
|
.snapshot_trailing = 1,
|
|
});
|
|
assert(co_await wait_for_leader<ExReg>{}(env, {id1}, timer, timer.now() + 1000_t) == id1);
|
|
|
|
auto id2 = co_await env.new_server(false,
|
|
raft::server::configuration{
|
|
.snapshot_threshold = 5,
|
|
.snapshot_trailing = 1,
|
|
});
|
|
|
|
assert(std::holds_alternative<std::monostate>(
|
|
co_await env.reconfigure(id1, {id1, id2}, timer.now() + 100_t, timer)));
|
|
|
|
// Append a bunch of entries
|
|
for (int i = 1; i <= 10; ++i) {
|
|
assert(std::holds_alternative<typename ExReg::ret>(
|
|
co_await env.call(id1, ExReg::exchange{0}, timer.now() + 100_t, timer)));
|
|
}
|
|
|
|
assert(env.is_leader(id1));
|
|
|
|
// Partition the network, forcing the leader to step down.
|
|
tlogger.trace("add grudge");
|
|
env.get_network().add_grudge(id2, id1);
|
|
env.get_network().add_grudge(id1, id2);
|
|
|
|
while (env.is_leader(id1)) {
|
|
co_await seastar::yield();
|
|
}
|
|
|
|
tlogger.trace("remove grudge");
|
|
env.get_network().remove_grudge(id2, id1);
|
|
env.get_network().remove_grudge(id1, id2);
|
|
|
|
// With the bug this would timeout, the cluster is unable to elect a leader without the configuration.
|
|
auto l = co_await wait_for_leader<ExReg>{}(env, {id1, id2}, timer, timer.now() + 1000_t);
|
|
tlogger.trace("last leader: {}", l);
|
|
});
|
|
}
|
|
|
|
// Regression test for #9981.
|
|
SEASTAR_TEST_CASE(removed_follower_with_forwarding_learns_about_removal) {
|
|
logical_timer timer;
|
|
environment_config cfg {
|
|
.rnd{0},
|
|
.network_delay{1, 1},
|
|
.fd_convict_threshold = 10_t,
|
|
};
|
|
co_await with_env_and_ticker<ExReg>(cfg, [&timer] (environment<ExReg>& env, ticker& t) -> future<> {
|
|
t.start([&] (uint64_t tick) {
|
|
env.tick_network();
|
|
timer.tick();
|
|
if (tick % 10 == 0) {
|
|
env.tick_servers();
|
|
}
|
|
}, 10'000);
|
|
|
|
raft::server::configuration cfg {
|
|
.enable_forwarding = true,
|
|
};
|
|
|
|
auto id1 = co_await env.new_server(true, cfg);
|
|
assert(co_await wait_for_leader<ExReg>{}(env, {id1}, timer, timer.now() + 1000_t) == id1);
|
|
|
|
auto id2 = co_await env.new_server(false, cfg);
|
|
assert(std::holds_alternative<std::monostate>(
|
|
co_await env.reconfigure(id1, {id1, id2}, timer.now() + 100_t, timer)));
|
|
|
|
// Server 2 forwards the entry that removes it to server 1.
|
|
// We want server 2 to eventually learn from server 1 that it was removed,
|
|
// so the call finishes (no timeout).
|
|
assert(std::holds_alternative<std::monostate>(
|
|
co_await env.modify_config(id2, {}, {id2}, timer.now() + 100_t, timer)));
|
|
});
|
|
}
|
|
|
|
// Given a function `F` which takes a `raft::server_id` argument and returns a variant type
|
|
// which contains `not_a_leader`, repeatedly calls `F` until it returns something else than
|
|
// `not_a_leader` or until we reach a limit, whichever happens first.
|
|
// The maximum number of calls until we give up is specified by `bounces`.
|
|
// The initial `raft::server_id` argument provided to `F` is specified as an argument
|
|
// to this function (`srv_id`). If the initial call returns `not_a_leader`, then:
|
|
// - if the result contained a different leader ID and we didn't already try that ID,
|
|
// we will use it in the next call, sleeping for `known_leader_delay` first,
|
|
// - otherwise we will take the next ID from the `known` set, sleeping for
|
|
// `unknown_leader_delay` first; no ID will be tried twice.
|
|
// The returned result contains the result of the last call to `F` and the last
|
|
// server ID passed to `F`.
|
|
template <typename F>
|
|
struct bouncing {
|
|
using future_type = std::invoke_result_t<F, raft::server_id>;
|
|
using value_type = typename future_type::value_type;
|
|
|
|
static_assert(boost::mp11::mp_contains<value_type, raft::not_a_leader>::value);
|
|
|
|
F _f;
|
|
|
|
bouncing(F f) : _f(std::move(f)) {}
|
|
|
|
// FIXME: change this into a free function after clang bug #50345 is fixed.
|
|
future<std::pair<value_type, raft::server_id>> operator()(
|
|
logical_timer& timer,
|
|
std::unordered_set<raft::server_id> known,
|
|
raft::server_id srv_id,
|
|
size_t bounces,
|
|
raft::logical_clock::duration known_leader_delay,
|
|
raft::logical_clock::duration unknown_leader_delay
|
|
) {
|
|
tlogger.trace("bouncing call: starting with {}", srv_id);
|
|
std::unordered_set<raft::server_id> tried;
|
|
while (true) {
|
|
auto res = co_await _f(srv_id);
|
|
tried.insert(srv_id);
|
|
known.erase(srv_id);
|
|
|
|
if (auto n_a_l = std::get_if<raft::not_a_leader>(&res); n_a_l && bounces) {
|
|
--bounces;
|
|
|
|
if (n_a_l->leader) {
|
|
assert(n_a_l->leader != srv_id);
|
|
if (!tried.contains(n_a_l->leader)) {
|
|
co_await timer.sleep(known_leader_delay);
|
|
srv_id = n_a_l->leader;
|
|
tlogger.trace("bouncing call: got `not_a_leader`, rerouted to {}", srv_id);
|
|
continue;
|
|
}
|
|
}
|
|
|
|
if (!known.empty()) {
|
|
srv_id = *known.begin();
|
|
if (n_a_l->leader) {
|
|
tlogger.trace("bouncing call: got `not_a_leader`, rerouted to {}, but already tried it; trying {}", n_a_l->leader, srv_id);
|
|
} else {
|
|
tlogger.trace("bouncing call: got `not_a_leader`, no reroute, trying {}", srv_id);
|
|
}
|
|
continue;
|
|
}
|
|
}
|
|
|
|
co_return std::pair{res, srv_id};
|
|
}
|
|
}
|
|
};
|
|
|
|
// An operation representing a call to the Raft cluster with a specific state machine input.
|
|
// We may bounce a number of times if the server returns `not_a_leader` before giving up.
|
|
template <PureStateMachine M>
|
|
struct raft_call {
|
|
typename M::input_t input;
|
|
raft::logical_clock::duration timeout;
|
|
|
|
using result_type = call_result_t<M>;
|
|
|
|
struct state_type {
|
|
environment<M>& env;
|
|
|
|
// The set of servers that may be part of the current configuration.
|
|
// Sometimes we don't know the exact configuration, e.g. after a failed configuration change.
|
|
const std::unordered_set<raft::server_id>& known;
|
|
|
|
logical_timer& timer;
|
|
};
|
|
|
|
future<result_type> execute(state_type& s, const operation::context& ctx) {
|
|
// TODO a stable contact point used by a given thread would be preferable;
|
|
// the thread would switch only if necessary (the contact point left the configuration).
|
|
// Currently we choose the contact point randomly each time.
|
|
assert(s.known.size() > 0);
|
|
static std::mt19937 engine{0};
|
|
|
|
auto it = s.known.begin();
|
|
std::advance(it, std::uniform_int_distribution<size_t>{0, s.known.size() - 1}(engine));
|
|
auto contact = *it;
|
|
|
|
tlogger.debug("db call start inp {} tid {} start time {} current time {} contact {}", input, ctx.thread, ctx.start, s.timer.now(), contact);
|
|
|
|
auto [res, last] = co_await bouncing{[input = input, timeout = s.timer.now() + timeout, &timer = s.timer, &env = s.env] (raft::server_id id) {
|
|
return env.call(id, input, timeout, timer);
|
|
}}(s.timer, s.known, contact, 6, 10_t, 10_t);
|
|
tlogger.debug("db call end inp {} tid {} start time {} current time {} last contact {}", input, ctx.thread, ctx.start, s.timer.now(), last);
|
|
|
|
co_return res;
|
|
}
|
|
|
|
friend std::ostream& operator<<(std::ostream& os, const raft_call& c) {
|
|
return os << format("raft_call{{input:{},timeout:{}}}", c.input, c.timeout);
|
|
}
|
|
};
|
|
|
|
// An operation that partitions the network in half.
|
|
// During the partition, no server from one half can contact any server from the other;
|
|
// the partition is symmetric.
|
|
// For odd number of nodes, ensures that the current leader (if there is one) is in the minority.
|
|
template <PureStateMachine M>
|
|
class network_majority_grudge {
|
|
raft::logical_clock::duration _duration;
|
|
|
|
public:
|
|
struct state_type {
|
|
environment<M>& env;
|
|
const std::unordered_set<raft::server_id>& known;
|
|
logical_timer& timer;
|
|
std::mt19937 rnd;
|
|
};
|
|
|
|
using result_type = std::monostate;
|
|
|
|
network_majority_grudge(raft::logical_clock::duration d) : _duration(d) {
|
|
static_assert(operation::Executable<network_majority_grudge<M>>);
|
|
}
|
|
|
|
future<result_type> execute(state_type& s, const operation::context& ctx) {
|
|
std::vector<raft::server_id> nodes{s.known.begin(), s.known.end()};
|
|
std::shuffle(nodes.begin(), nodes.end(), s.rnd);
|
|
|
|
auto mid = nodes.begin() + (nodes.size() / 2);
|
|
if (nodes.size() % 2) {
|
|
// Odd number of nodes, let's ensure that the leader (if there is one) is in the minority
|
|
auto it = std::find_if(mid, nodes.end(), [&env = s.env] (raft::server_id id) { return env.is_leader(id); });
|
|
if (it != nodes.end()) {
|
|
std::swap(*nodes.begin(), *it);
|
|
}
|
|
}
|
|
|
|
// Note: creating the grudges has O(n^2) complexity, where n is the cluster size.
|
|
// May be problematic for (very) large clusters.
|
|
for (auto x = nodes.begin(); x != mid; ++x) {
|
|
for (auto y = mid; y != nodes.end(); ++y) {
|
|
s.env.get_network().add_grudge(*x, *y);
|
|
s.env.get_network().add_grudge(*y, *x);
|
|
}
|
|
}
|
|
|
|
tlogger.debug("network_majority_grudge start tid {} start time {} current time {} duration {} grudge: {} vs {}",
|
|
ctx.thread, ctx.start, s.timer.now(),
|
|
_duration,
|
|
std::vector<raft::server_id>{nodes.begin(), mid},
|
|
std::vector<raft::server_id>{mid, nodes.end()});
|
|
|
|
co_await s.timer.sleep(_duration);
|
|
|
|
tlogger.debug("network_majority_grudge end tid {} start time {} current time {}", ctx.thread, ctx.start, s.timer.now());
|
|
|
|
// Some servers in `nodes` may already be gone at this point but network doesn't care.
|
|
// It's safe to call `remove_grudge`.
|
|
for (auto x = nodes.begin(); x != mid; ++x) {
|
|
for (auto y = mid; y != nodes.end(); ++y) {
|
|
s.env.get_network().remove_grudge(*x, *y);
|
|
s.env.get_network().remove_grudge(*y, *x);
|
|
}
|
|
}
|
|
|
|
co_return std::monostate{};
|
|
}
|
|
|
|
friend std::ostream& operator<<(std::ostream& os, const network_majority_grudge& p) {
|
|
return os << format("network_majority_grudge{{duration:{}}}", p._duration);
|
|
}
|
|
};
|
|
|
|
// Must be executed sequentially.
|
|
template <PureStateMachine M>
|
|
struct reconfiguration {
|
|
raft::logical_clock::duration timeout;
|
|
|
|
struct state_type {
|
|
const std::vector<raft::server_id> all_servers;
|
|
environment<M>& env;
|
|
// a subset of all_servers that we modify;
|
|
// the set of servers which may potentially be in the current configuration
|
|
std::unordered_set<raft::server_id>& known;
|
|
logical_timer& timer;
|
|
std::mt19937 rnd;
|
|
};
|
|
|
|
using result_type = reconfigure_result_t;
|
|
|
|
future<result_type> execute(state_type& s, const operation::context& ctx) {
|
|
assert(s.all_servers.size() > 1);
|
|
std::vector<raft::server_id> nodes{s.all_servers.begin(), s.all_servers.end()};
|
|
|
|
std::shuffle(nodes.begin(), nodes.end(), s.rnd);
|
|
nodes.resize(std::uniform_int_distribution<size_t>{1, nodes.size()}(s.rnd));
|
|
|
|
assert(s.known.size() > 0);
|
|
auto [res, last] = co_await bouncing{[&nodes, timeout = s.timer.now() + timeout, &timer = s.timer, &env = s.env] (raft::server_id id) {
|
|
return env.reconfigure(id, nodes, timeout, timer);
|
|
}}(s.timer, s.known, *s.known.begin(), 10, 10_t, 10_t);
|
|
|
|
std::visit(make_visitor(
|
|
[&, last = last] (std::monostate) {
|
|
tlogger.debug("reconfig successful from {} to {} by {}", s.known, nodes, last);
|
|
s.known = std::unordered_set<raft::server_id>{nodes.begin(), nodes.end()};
|
|
// TODO: include the old leader as well in case it's not part of the new config?
|
|
// it may remain a leader for some time...
|
|
},
|
|
[&, last = last] (raft::not_a_leader& e) {
|
|
tlogger.debug("reconfig failed, not a leader: {} tried {} by {}", e, nodes, last);
|
|
},
|
|
[&, last = last] (auto& e) {
|
|
s.known.merge(std::unordered_set<raft::server_id>{nodes.begin(), nodes.end()});
|
|
tlogger.debug("reconfig failed: {}, tried {} after merge {} by {}", e, nodes, s.known, last);
|
|
}
|
|
), res);
|
|
|
|
co_return res;
|
|
}
|
|
|
|
friend std::ostream& operator<<(std::ostream& os, const reconfiguration& r) {
|
|
return os << format("reconfiguration{{timeout:{}}}", r.timeout);
|
|
}
|
|
};
|
|
|
|
template <PureStateMachine M>
|
|
struct stop_crash {
|
|
raft::logical_clock::duration restart_delay;
|
|
|
|
struct state_type {
|
|
environment<M>& env;
|
|
std::unordered_set<raft::server_id>& known;
|
|
logical_timer& timer;
|
|
std::mt19937 rnd;
|
|
};
|
|
|
|
struct result_type {};
|
|
|
|
future<result_type> execute(state_type& s, const operation::context& ctx) {
|
|
assert(s.known.size() > 0);
|
|
auto it = s.known.begin();
|
|
std::advance(it, std::uniform_int_distribution<size_t>{0, s.known.size() - 1}(s.rnd));
|
|
auto srv = *it;
|
|
|
|
static std::bernoulli_distribution bdist{0.5};
|
|
if (bdist(s.rnd)) {
|
|
tlogger.debug("Crashing server {}", srv);
|
|
s.env.crash(srv);
|
|
} else {
|
|
tlogger.debug("Stopping server {}...", srv);
|
|
co_await s.env.stop(srv);
|
|
tlogger.debug("Server {} stopped", srv);
|
|
}
|
|
co_await s.timer.sleep(restart_delay);
|
|
tlogger.debug("Restarting server {}", srv);
|
|
co_await s.env.start_server(srv);
|
|
|
|
co_return result_type{};
|
|
}
|
|
|
|
friend std::ostream& operator<<(std::ostream& os, const stop_crash& c) {
|
|
return os << format("stop_crash{{delay:{}}}", c.restart_delay);
|
|
}
|
|
|
|
friend std::ostream& operator<<(std::ostream& os, const result_type&) {
|
|
return os << "";
|
|
}
|
|
};
|
|
|
|
// Stream printers for `std::monostate` and `std::variant`.
// NOTE(review): these overloads are declared inside namespace `std`, presumably so that
// argument-dependent lookup finds them; the standard does not sanction this — confirm before moving.
namespace std {

// A `monostate` prints as nothing.
std::ostream& operator<<(std::ostream& os, const std::monostate&) {
    os << "";
    return os;
}

// A variant prints as whichever alternative it currently holds.
template <typename T, typename... Ts>
std::ostream& operator<<(std::ostream& os, const std::variant<T, Ts...>& v) {
    std::visit([&os] (const auto& alternative) { os << alternative; }, v);
    return os;
}

} // namespace std
|
|
|
|
namespace operation {
|
|
|
|
std::ostream& operator<<(std::ostream& os, const thread_id& tid) {
|
|
return os << format("thread_id{{{}}}", tid.id);
|
|
}
|
|
|
|
} // namespace operation
|
|
|
|
// An immutable sequence of integers.
|
|
class append_seq {
|
|
public:
|
|
using elem_t = int32_t;
|
|
|
|
private:
|
|
// This represents the sequence of integers from _seq->begin() to _seq->begin() + _end.
|
|
// The underlying vector *_seq may however be shared by other instances of `append_seq`.
|
|
// If only one instance is appending, the operation is O(1). However, each subsequent
|
|
// append performed by another instance sharing this vector must perform a copy.
|
|
|
|
lw_shared_ptr<std::vector<elem_t>> _seq; // always engaged
|
|
size_t _end; // <= _seq.size()
|
|
elem_t _digest; // sum of all elements modulo `magic`
|
|
|
|
static const elem_t magic = 54313;
|
|
|
|
public:
|
|
append_seq(std::vector<elem_t> v) : _seq{make_lw_shared<std::vector<elem_t>>(std::move(v))}, _end{_seq->size()}, _digest{0} {
|
|
for (auto x : *_seq) {
|
|
_digest = digest_append(_digest, x);
|
|
}
|
|
}
|
|
|
|
static elem_t digest_append(elem_t d, elem_t x) {
|
|
assert(0 <= d < magic);
|
|
|
|
auto y = (d + x) % magic;
|
|
assert(digest_remove(y, x) == d);
|
|
return y;
|
|
}
|
|
|
|
static elem_t digest_remove(elem_t d, elem_t x) {
|
|
assert(0 <= d < magic);
|
|
auto y = (d - x) % magic;
|
|
return y < 0 ? y + magic : y;
|
|
}
|
|
|
|
elem_t digest() const {
|
|
return _digest;
|
|
}
|
|
|
|
append_seq append(elem_t x) const {
|
|
assert(_seq);
|
|
assert(_end <= _seq->size());
|
|
|
|
auto seq = _seq;
|
|
if (_end < seq->size()) {
|
|
// The shared sequence was already appended beyond _end by someone else.
|
|
// We need to copy everything so we don't break the other guy.
|
|
seq = make_lw_shared<std::vector<elem_t>>(seq->begin(), seq->begin() + _end);
|
|
}
|
|
|
|
seq->push_back(x);
|
|
return {std::move(seq), _end + 1, digest_append(_digest, x)};
|
|
}
|
|
|
|
elem_t operator[](size_t idx) const {
|
|
assert(_seq);
|
|
assert(idx < _end);
|
|
assert(_end <= _seq->size());
|
|
return (*_seq)[idx];
|
|
}
|
|
|
|
bool empty() const {
|
|
return _end == 0;
|
|
}
|
|
|
|
size_t size() const {
|
|
assert(_end <= _seq->size());
|
|
return _end;
|
|
}
|
|
|
|
std::pair<append_seq, elem_t> pop() const {
|
|
assert(_seq);
|
|
assert(_end <= _seq->size());
|
|
assert(0 < _end);
|
|
|
|
return {{_seq, _end - 1, digest_remove(_digest, (*_seq)[_end - 1])}, (*_seq)[_end - 1]};
|
|
}
|
|
|
|
friend std::ostream& operator<<(std::ostream& os, const append_seq& s) {
|
|
// TODO: don't copy the elements
|
|
std::vector<elem_t> v{s._seq->begin(), s._seq->begin() + s._end};
|
|
return os << format("seq({} _end {})", v, s._end);
|
|
}
|
|
|
|
private:
|
|
append_seq(lw_shared_ptr<std::vector<elem_t>> seq, size_t end, elem_t d)
|
|
: _seq(std::move(seq)), _end(end), _digest(d) {}
|
|
};
|
|
|
|
struct AppendReg {
|
|
struct append { int32_t x; };
|
|
struct ret { int32_t x; append_seq prev; };
|
|
|
|
using state_t = append_seq;
|
|
using input_t = append;
|
|
using output_t = ret;
|
|
|
|
static std::pair<state_t, output_t> delta(const state_t& curr, input_t input) {
|
|
return {curr.append(input.x), {input.x, curr}};
|
|
}
|
|
|
|
static thread_local const state_t init;
|
|
};
|
|
|
|
thread_local const AppendReg::state_t AppendReg::init{{0}};
|
|
|
|
namespace ser {
|
|
template <>
|
|
struct serializer<AppendReg::append> {
|
|
template <typename Output>
|
|
static void write(Output& buf, const AppendReg::append& op) { serializer<int32_t>::write(buf, op.x); };
|
|
|
|
template <typename Input>
|
|
static AppendReg::append read(Input& buf) { return { serializer<int32_t>::read(buf) }; }
|
|
|
|
template <typename Input>
|
|
static void skip(Input& buf) { serializer<int32_t>::skip(buf); }
|
|
};
|
|
}
|
|
|
|
// Thrown when an observed result contradicts the model; `what` holds the human-readable description.
struct inconsistency {
    std::string what;
};
|
|
|
|
struct append_reg_model {
|
|
using elem_t = typename append_seq::elem_t;
|
|
|
|
struct entry {
|
|
elem_t elem;
|
|
elem_t digest;
|
|
};
|
|
|
|
friend std::ostream& operator<<(std::ostream& os, const entry& e) {
|
|
return os << e.elem;
|
|
}
|
|
|
|
std::vector<entry> seq{{0, 0}};
|
|
std::unordered_map<elem_t, size_t> index{{0, 0}};
|
|
std::unordered_set<elem_t> banned;
|
|
std::unordered_set<elem_t> returned;
|
|
std::unordered_set<elem_t> in_progress;
|
|
|
|
void invocation(elem_t x) {
|
|
assert(!index.contains(x));
|
|
assert(!in_progress.contains(x));
|
|
in_progress.insert(x);
|
|
}
|
|
|
|
void return_success(elem_t x, append_seq prev) {
|
|
assert(!returned.contains(x));
|
|
assert(x != 0);
|
|
assert(!prev.empty());
|
|
try {
|
|
completion(x, prev);
|
|
} catch (inconsistency& e) {
|
|
e.what += format("\nwhen completing elem: {}\nprev: {}\nmodel: {}", x, prev, seq);
|
|
throw;
|
|
}
|
|
returned.insert(x);
|
|
}
|
|
|
|
void return_failure(elem_t x) {
|
|
assert(!index.contains(x));
|
|
assert(in_progress.contains(x));
|
|
banned.insert(x);
|
|
in_progress.erase(x);
|
|
}
|
|
|
|
private:
|
|
void completion(elem_t x, append_seq prev) {
|
|
if (prev.empty()) {
|
|
assert(x == 0);
|
|
return;
|
|
}
|
|
|
|
assert(x != 0);
|
|
assert(!banned.contains(x));
|
|
assert(in_progress.contains(x) || index.contains(x));
|
|
|
|
auto [prev_prev, prev_x] = prev.pop();
|
|
|
|
if (auto it = index.find(x); it != index.end()) {
|
|
// This element was already completed.
|
|
auto idx = it->second;
|
|
assert(0 < idx);
|
|
assert(idx < seq.size());
|
|
|
|
if (prev_x != seq[idx - 1].elem) {
|
|
throw inconsistency{format(
|
|
"elem {} completed again (existing at idx {}), but prev elem does not match existing model"
|
|
"\nprev elem: {}\nmodel prev elem: {}\nprev: {} model up to idx: {}",
|
|
x, idx, prev_x, seq[idx - 1].elem, prev, std::vector<entry>{seq.begin(), seq.begin()+idx})};
|
|
}
|
|
|
|
if (prev.digest() != seq[idx - 1].digest) {
|
|
auto err = format(
|
|
"elem {} completed again (existing at idx {}), but prev does not match existing model"
|
|
"\n prev: {}\nmodel up to idx: {}",
|
|
x, idx, prev, std::vector<entry>{seq.begin(), seq.begin()+idx});
|
|
|
|
auto min_len = std::min(prev.size(), idx);
|
|
for (size_t i = 0; i < min_len; ++i) {
|
|
if (prev[i] != seq[i].elem) {
|
|
err += format("\nmismatch at idx {} prev {} model {}", i, prev[i], seq[i].elem);
|
|
}
|
|
}
|
|
|
|
throw inconsistency{std::move(err)};
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
// A new completion.
|
|
// First, recursively complete the previous elements...
|
|
completion(prev_x, std::move(prev_prev));
|
|
|
|
// Check that the existing tail matches our tail.
|
|
assert(!seq.empty());
|
|
if (prev_x != seq.back().elem) {
|
|
throw inconsistency{format(
|
|
"new completion (elem: {}) but prev elem does not match existing model"
|
|
"\nprev elem: {}\nmodel prev elem: {}\nprev: {}\n model: {}",
|
|
x, prev_x, seq.back().elem, prev, seq)};
|
|
}
|
|
if (prev.digest() != seq.back().digest) {
|
|
auto err = format(
|
|
"new completion (elem: {}) but prev does not match existing model"
|
|
"\nprev: {}\n model: {}",
|
|
x, prev, seq);
|
|
|
|
auto min_len = std::min(prev.size(), seq.size());
|
|
for (size_t i = 0; i < min_len; ++i) {
|
|
if (prev[i] != seq[i].elem) {
|
|
err += format("\nmismatch at idx {} prev {} model {}", i, prev[i], seq[i].elem);
|
|
}
|
|
}
|
|
|
|
throw inconsistency{std::move(err)};
|
|
}
|
|
|
|
// All previous elements were completed, so the new element belongs at the end.
|
|
index.emplace(x, seq.size());
|
|
seq.push_back(entry{x, append_seq::digest_append(seq.back().digest, x)});
|
|
in_progress.erase(x);
|
|
}
|
|
};
|
|
|
|
// Prints an `AppendReg::append` as `append{<x>}`.
std::ostream& operator<<(std::ostream& os, const AppendReg::append& a) {
    os << format("append{{{}}}", a.x);
    return os;
}
|
|
|
|
// Prints an `AppendReg::ret` as `ret{<x>, <prev>}`.
std::ostream& operator<<(std::ostream& os, const AppendReg::ret& r) {
    os << format("ret{{{}, {}}}", r.x, r.prev);
    return os;
}
|
|
|
|
// Randomized nemesis test for Raft driving an `AppendReg` state machine.
//
// Outline:
//  1. Create an environment with 10 servers; the first one bootstraps the cluster
//     and the initial configuration is set to the first 5 servers.
//  2. Run a generator of at most 500 operations mixing client appends with three
//     nemeses, each pinned to its own thread: network partitions, reconfigurations
//     and server stop/crash.
//  3. Feed every invocation/completion to a consistency checker backed by
//     `append_reg_model`; an `inconsistency` fails the test.
//  4. Liveness check: after the nemeses have stopped we must eventually obtain
//     a successful response to one final append.
//
// Forwarding of entries by followers and frequent snapshotting are each enabled
// with probability 1/2.
SEASTAR_TEST_CASE(basic_generator_test) {
    using op_type = operation::invocable<operation::either_of<
            raft_call<AppendReg>,
            network_majority_grudge<AppendReg>,
            reconfiguration<AppendReg>,
            stop_crash<AppendReg>
        >>;

    static_assert(operation::Invocable<op_type>);

    auto seed = tests::random::get_int<int32_t>();
    std::mt19937 random_engine{seed};

    logical_timer timer;
    environment_config cfg {
        .rnd{random_engine},
        .network_delay{0, 6},
        .fd_convict_threshold = 50_t,
    };
    co_await with_env_and_ticker<AppendReg>(cfg, [&] (environment<AppendReg>& env, ticker& t) -> future<> {
        // Second argument is presumably the maximum number of ticks -- confirm against `ticker::start`.
        t.start([&, dist = std::uniform_int_distribution<size_t>(0, 9)] (uint64_t) mutable {
            env.tick_network();
            timer.tick();
            env.for_each_server([&] (raft::server_id, raft_server<AppendReg>* srv, failure_detector& fd) {
                // Tick each server with probability 1/10.
                // Thus each server is ticked, on average, once every 10 timer/network ticks.
                // On the other hand, we now have servers running at different speeds.
                if (srv && dist(random_engine) == 0) {
                    srv->tick();
                    fd.tick();
                }
            });
            env.tick_crashing_servers();
        }, 200'000);

        std::bernoulli_distribution bdist{0.5};

        // With probability 1/2 enable forwarding: when we send a command to a follower, it automatically
        // forwards it to the known leader or waits for learning about a leader instead of returning
        // `not_a_leader`.
        // Note: the order of the two draws below determines the random stream; do not reorder.
        bool forwarding = bdist(random_engine);

        // With probability 1/2, run the servers with a configuration which causes frequent snapshotting.
        // Note: with the default configuration we won't observe any snapshots at all, since the default
        // threshold is 1024 log commands and we perform only 500 ops.
        bool frequent_snapshotting = bdist(random_engine);

        // TODO: randomize the snapshot thresholds between different servers for more chaos.
        auto srv_cfg = frequent_snapshotting
            ? raft::server::configuration {
                .snapshot_threshold{10},
                .snapshot_trailing{5},
                .max_log_size{20},
                .enable_forwarding{forwarding},
            }
            : raft::server::configuration {
                .enable_forwarding{forwarding},
            };

        tlogger.info("basic_generator_test: forwarding: {}, frequent snapshotting: {}", forwarding, frequent_snapshotting);

        auto leader_id = co_await env.new_server(true, srv_cfg);

        // Wait for the server to elect itself as a leader.
        // The wait is performed outside of `assert` so that it still happens when asserts are compiled out.
        [[maybe_unused]] auto first_leader = co_await wait_for_leader<AppendReg>{}(
                env, {leader_id}, timer, timer.now() + 1000_t);
        assert(first_leader == leader_id);

        size_t no_all_servers = 10;
        std::vector<raft::server_id> all_servers{leader_id};
        for (size_t i = 1; i < no_all_servers; ++i) {
            all_servers.push_back(co_await env.new_server(false, srv_cfg));
        }

        size_t no_init_servers = 5;

        // `known_config` represents the set of servers that may potentially be in the cluster configuration.
        //
        // It is not possible to determine in general what the 'true' current configuration is (if even such notion
        // makes sense at all). Given a sequence of reconfiguration requests, assuming that all except possibly the last
        // requests have finished, then:
        // - if the last request has finished successfully, then the current configuration must be equal
        //   to the one chosen in the last request;
        // - but if it hasn't finished yet, or it finished with a failure, the current configuration may contain servers
        //   from the one chosen in the last request or from the previously known set of servers.
        //
        // The situation is even worse considering that requests may never 'finish', i.e. we may never get a response
        // to a reconfiguration request (in which case we eventually timeout). These requests may in theory execute
        // at any point in the future. We take a practical approach when updating `known_config`: we assume
        // that our timeouts for reconfiguration requests are large enough so that if a reconfiguration request
        // has timed out, it has either already finished or it never will.
        // TODO: this may not be true and we may end up with `known_config` that does not contain the current leader
        // (not observed in practice yet though... I think) Come up with a better approach.
        std::unordered_set<raft::server_id> known_config;

        for (size_t i = 0; i < no_init_servers; ++i) {
            known_config.insert(all_servers[i]);
        }

        // Apply the initial configuration. The reconfiguration itself must not live inside `assert`:
        // otherwise it would silently disappear in builds with asserts disabled.
        [[maybe_unused]] auto init_reconfig_result = co_await env.reconfigure(leader_id,
                std::vector<raft::server_id>{known_config.begin(), known_config.end()}, timer.now() + 100_t, timer);
        assert(std::holds_alternative<std::monostate>(init_reconfig_result));

        // One thread per server for client calls plus one dedicated thread for each of the three nemeses.
        auto threads = operation::make_thread_set(all_servers.size() + 3);
        auto [partition_thread, reconfig_thread, crash_thread] = take<3>(threads);

        raft_call<AppendReg>::state_type db_call_state {
            .env = env,
            .known = known_config,
            .timer = timer
        };

        network_majority_grudge<AppendReg>::state_type network_majority_grudge_state {
            .env = env,
            .known = known_config,
            .timer = timer,
            .rnd = std::mt19937{seed}
        };

        reconfiguration<AppendReg>::state_type reconfiguration_state {
            .all_servers = all_servers,
            .env = env,
            .known = known_config,
            .timer = timer,
            .rnd = std::mt19937{seed}
        };

        stop_crash<AppendReg>::state_type crash_state {
            .env = env,
            .known = known_config,
            .timer = timer,
            .rnd = std::mt19937{seed}
        };

        auto init_state = op_type::state_type{
            std::move(db_call_state),
            std::move(network_majority_grudge_state),
            std::move(reconfiguration_state),
            std::move(crash_state)
        };

        using namespace generator;

        // For reference to ``real life'' suppose 1_t ~= 10ms. Then:
        // 10_t (server tick) ~= 100ms
        // network delay = 3_t ~= 30ms
        // election timeout = 10 server ticks = 100_t ~= 1s
        // thus, to enforce leader election, need a majority to convict the current leader for > 100_t ~= 1s,
        // failure detector convict threshold = 50 srv ticks = 500_t ~= 5s
        // so need to partition for > 600_t ~= 6s
        // choose network partition duration uniformly from [600_t-600_t/3, 600_t+600_t/3] = [400_t, 800_t]
        // ~= [4s, 8s] -> ~1/2 partitions should cause an election
        // we will set request timeout 600_t ~= 6s and partition every 1200_t ~= 12s

        auto gen = op_limit(500,
            pin(partition_thread,
                stagger(seed, timer.now() + 200_t, 1200_t, 1200_t,
                    random(seed, [] (std::mt19937& engine) {
                        static std::uniform_int_distribution<raft::logical_clock::rep> dist{400, 800};
                        return op_type{network_majority_grudge<AppendReg>{raft::logical_clock::duration{dist(engine)}}};
                    })
                ),
                pin(reconfig_thread,
                    stagger(seed, timer.now() + 1000_t, 500_t, 500_t,
                        constant([] () { return op_type{reconfiguration<AppendReg>{500_t}}; })
                    ),
                    pin(crash_thread,
                        stagger(seed, timer.now() + 200_t, 100_t, 200_t,
                            random(seed, [] (std::mt19937& engine) {
                                static std::uniform_int_distribution<raft::logical_clock::rep> dist{0, 100};
                                return op_type{stop_crash<AppendReg>{raft::logical_clock::duration{dist(engine)}}};
                            })
                        ),
                        stagger(seed, timer.now(), 0_t, 50_t,
                            sequence(1, [] (int32_t i) {
                                assert(i > 0);
                                return op_type{raft_call<AppendReg>{AppendReg::append{i}, 200_t}};
                            })
                        )
                    )
                )
            )
        );

        // Counters of client (`raft_call`) operations, reported after the generator run.
        struct statistics {
            size_t invocations{0};
            size_t successes{0};
            size_t failures{0};
        };

        // Receives every invocation and completion from the interpreter, updates the statistics
        // and feeds `raft_call` operations into the `append_reg_model`.
        class consistency_checker {
            append_reg_model _model;
            statistics& _stats;

        public:
            explicit consistency_checker(statistics& s) : _model{}, _stats(s) {}

            // Called on each invocation; only `raft_call` invocations are registered in the model.
            void operator()(op_type o) {
                tlogger.debug("invocation {}", o);

                if (auto call_op = std::get_if<raft_call<AppendReg>>(&o.op)) {
                    ++_stats.invocations;
                    _model.invocation(call_op->input.x);
                }
            }

            // Called on each completion; successful `raft_call` results are passed to the model,
            // every other `raft_call` result is counted as a failure.
            void operator()(operation::completion<op_type> c) {
                auto res = std::get_if<op_type::result_type>(&c.result);
                assert(res);

                if (auto call_res = std::get_if<raft_call<AppendReg>::result_type>(res)) {
                    std::visit(make_visitor(
                    [this] (AppendReg::output_t& out) {
                        tlogger.debug("completion x: {} prev digest: {}", out.x, out.prev.digest());

                        ++_stats.successes;
                        _model.return_success(out.x, std::move(out.prev));
                    },
                    [this] (raft::not_a_leader&) {
                        // TODO: this is a definite failure, mark it
                        // _model.return_failure(...)
                        ++_stats.failures;
                    },
                    [this] (raft::commit_status_unknown&) {
                        // TODO assert: only allowed if reconfigurations happen?
                        // assert(false); TODO debug this
                        ++_stats.failures;
                    },
                    [this] (auto&) {
                        ++_stats.failures;
                    }
                    ), *call_res);
                } else {
                    tlogger.debug("completion {}", c);
                }

                // TODO: check consistency of reconfiguration completions
                // (there's not much to check, but for example: we should not get back `conf_change_in_progress`
                //  if our last reconfiguration was successful?).
            }
        };

        statistics stats;
        interpreter<op_type, decltype(gen), consistency_checker> interp{
            std::move(gen), std::move(threads), 1_t, std::move(init_state), timer,
            consistency_checker{stats}};
        try {
            co_await interp.run();
        } catch (const inconsistency& e) {
            // Dump the state of every server to help with debugging before failing the test.
            tlogger.error("inconsistency: {}", e.what);
            env.for_each_server([&] (raft::server_id id, raft_server<AppendReg>* srv, failure_detector&) {
                if (srv) {
                    tlogger.info("server {} state machine state: {}", id, srv->state());
                } else {
                    tlogger.info("node {} currently missing server", id);
                }
            });

            assert(false);
        }

        tlogger.info("Finished generator run, time: {}, invocations: {}, successes: {}, failures: {}, total: {}",
                timer.now(), stats.invocations, stats.successes, stats.failures, stats.successes + stats.failures);

        // Liveness check: we must be able to obtain a final response after all the nemeses have stopped.
        // Due to possible multiple leaders at this point and the cluster stabilizing (for example there
        // may be no leader right now, the current leader may be stepping down etc.) we may need to try
        // sending requests multiple times to different servers to obtain the last result.

        auto limit = timer.now() + 10000_t;
        size_t cnt = 0;
        for (; timer.now() < limit; ++cnt) {
            tlogger.info("Trying to obtain last result: attempt number {}", cnt + 1);

            auto now = timer.now();
            auto leader = co_await wait_for_leader<AppendReg>{}(env,
                    std::vector<raft::server_id>{all_servers.begin(), all_servers.end()}, timer, limit)
                .handle_exception_type([&timer, now] (logical_timer::timed_out<raft::server_id>) -> raft::server_id {
                    tlogger.error("Failed to find a leader after {} ticks at the end of test.", timer.now() - now);
                    assert(false);
                });

            if (env.is_leader(leader)) {
                tlogger.info("Leader {} found after {} ticks", leader, timer.now() - now);
            } else {
                tlogger.warn("Leader {} found after {} ticks, but suddenly lost leadership", leader, timer.now() - now);
                continue;
            }

            auto config = env.get_configuration(leader);
            assert(config);
            tlogger.info("Leader {} configuration: current {} previous {}", leader, config->current, config->previous);

            // Report any other server which also believes it is a leader.
            for (auto& s: all_servers) {
                if (env.is_leader(s) && s != leader) {
                    auto conf = env.get_configuration(s);
                    assert(conf);
                    tlogger.info("There is another leader: {}, configuration: current {} previous {}", s, conf->current, conf->previous);
                }
            }

            tlogger.info("From the clients' point of view, the possible cluster members are: {}", known_config);

            auto [res, last_attempted_server] = co_await bouncing{[&timer, &env] (raft::server_id id) {
                return env.call(id, AppendReg::append{-1}, timer.now() + 200_t, timer);
            }}(timer, known_config, leader, known_config.size() + 1, 10_t, 10_t);

            if (std::holds_alternative<typename AppendReg::ret>(res)) {
                tlogger.info("Obtained last result");
                tlogger.debug("Last result: {}", res);
                co_return;
            }

            tlogger.warn("Failed to obtain last result at end of test: {} returned by {}", res, last_attempted_server);
        }

        tlogger.error("Failed to obtain a final successful response at the end of the test. Number of attempts: {}", cnt);
        assert(false);
    });
}
|