/*
 * Copyright (C) 2021-present ScyllaDB
 */

/*
 * SPDX-License-Identifier: AGPL-3.0-or-later
 */

// NOTE(review): this copy of the file appears to have lost every `<...>` span
// (include targets, template parameter/argument lists). The code tokens below are
// kept exactly as found; restore the missing parts from the upstream source.
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "raft/server.hh"
#include "raft/logical_clock.hh"
#include "serializer.hh"
#include "serializer_impl.hh"
#include "idl/uuid.dist.hh"
#include "idl/uuid.dist.impl.hh"
#include "test/lib/random_utils.hh"
#include "test/raft/logical_timer.hh"
#include "test/raft/ticker.hh"
#include "test/raft/generator.hh"
#include "to_string.hh"

using namespace seastar;
using namespace std::chrono_literals;

// Logger shared by everything in this test file.
seastar::logger tlogger("randomized_nemesis_test");

// A direct translation of a mathematical definition of a state machine
// (see e.g. Wikipedia) as a C++ concept. Implementations of this concept
// do not store the state, they only define the types, the transition function
// (which is a pure function), and the initial state (which is a constant).
template concept PureStateMachine =
requires (typename M::state_t s, typename M::input_t i) {
    // The type of all possible states.
    typename M::state_t;

    // The type of all possible inputs (commands).
    typename M::input_t;

    // The type of all possible outputs.
    typename M::output_t;

    // The transition function (a pure function - no side effects). It takes a state
    // and an input, and returns the next state and the output produced
    // by applying the input to the given state.
    { M::delta(s, i) } -> std::same_as>;

    // The initial state, of type `state_t`.
    M::init;
    requires std::is_same_v;
};

// Used to uniquely identify commands passed into `apply` in order to return
// the outputs of these commands. See `impure_state_machine` and `call`.
using cmd_id_t = utils::UUID;

// A set of in-memory snapshots maintained by a single Raft server.
// The different parts of the server (the state machine, persistence,
// rpc) will share a single `snapshots_t`.
template using snapshots_t = std::unordered_map; // To replicate a state machine, our Raft implementation requires it to // be represented with the `raft::state_machine` interface. // // `impure_state_machine` is an implementation of `raft::state_machine` // that wraps a `PureStateMachine`. It keeps a variable of type `state_t` // representing the current state. In `apply` it deserializes the given // command into `input_t`, uses the transition (`delta`) function to // produce the next state and output, replaces its current state with the // obtained state and returns the output (more on that below); it does so // sequentially for every given command. We can think of `PureStateMachine` // as the actual state machine - the business logic, and `impure_state_machine` // as the ``boilerplate'' that allows the pure machine to be replicated // by Raft and communicate with the external world. // // The interface also requires maintainance of snapshots. We use the // `snapshots_t` introduced above; `impure_state_machine` keeps a reference to `snapshots_t` // because it will share it with an implementation of `raft::persistence`. template class impure_state_machine : public raft::state_machine { raft::server_id _id; typename M::state_t _val; snapshots_t& _snapshots; // Used to ensure that when `abort()` returns there are // no more in-progress methods running on this object. seastar::gate _gate; // To obtain output from an applied command, the client (see `call`) // first allocates a channel in this data structure by calling `with_output_channel` // and makes the returned command ID a part of the command passed to Raft. // When (if) we eventually apply the command, we use the ID to find the output channel // here and push the output to the client waiting on the other end. 
// The channel is allocated only on the local server where `with_output_channel` // was called; other replicas of the state machine will therefore not find the ID // in their instances of `_output_channels` so they just drop the output. std::unordered_map> _output_channels; public: impure_state_machine(raft::server_id id, snapshots_t& snapshots) : _id(id), _val(M::init), _snapshots(snapshots) {} future<> apply(std::vector cmds) override { co_await with_gate(_gate, [this, cmds = std::move(cmds)] () mutable -> future<> { for (auto& cref : cmds) { _gate.check(); auto is = ser::as_input_stream(cref); auto cmd_id = ser::deserialize(is, boost::type{}); auto input = ser::deserialize(is, boost::type{}); auto [new_state, output] = M::delta(std::move(_val), std::move(input)); _val = std::move(new_state); auto it = _output_channels.find(cmd_id); if (it != _output_channels.end()) { // We are on the leader server where the client submitted the command // and waits for the output. Send it to them. it->second.set_value(std::move(output)); _output_channels.erase(it); } else { // This is not the leader on which the command was submitted, // or it is but the client already gave up on us and deallocated the channel. // In any case we simply drop the output. 
} co_await coroutine::maybe_yield(); } }); } future take_snapshot() override { auto id = raft::snapshot_id::create_random_id(); assert(_snapshots.emplace(id, _val).second); tlogger.trace("{}: took snapshot id {} val {}", _id, id, _val); co_return id; } void drop_snapshot(raft::snapshot_id id) override { _snapshots.erase(id); } future<> load_snapshot(raft::snapshot_id id) override { auto it = _snapshots.find(id); assert(it != _snapshots.end()); // dunno if the snapshot can actually be missing tlogger.trace("{}: loading snapshot id {} prev val {} new val {}", _id, id, _val, it->second); _val = it->second; co_return; } future<> abort() override { return _gate.close(); } struct output_channel_dropped : public raft::error { output_channel_dropped() : error("output channel dropped") {} }; // Before sending a command to Raft, the client must obtain a command ID // and an output channel using this function. template future with_output_channel(F f) { return with_gate(_gate, [this, f = std::move(f)] () mutable -> future { promise p; auto fut = p.get_future(); auto cmd_id = utils::make_random_uuid(); assert(_output_channels.emplace(cmd_id, std::move(p)).second); auto guard = defer([this, cmd_id] { auto it = _output_channels.find(cmd_id); if (it != _output_channels.end()) { it->second.set_exception(output_channel_dropped{}); _output_channels.erase(it); } }); return f(cmd_id, std::move(fut)).finally([guard = std::move(guard)] {}); }); } const typename M::state_t& state() const { return _val; } }; // TODO: serializable concept? template raft::command make_command(const cmd_id_t& cmd_id, const Input& input) { raft::command cmd; ser::serialize(cmd, cmd_id); ser::serialize(cmd, input); return cmd; } // TODO: handle other errors? template using call_result_t = std::variant; // Wait for a future `f` to finish, but keep the result inside a `future`. // Works for `future` as well as for `future`. 
template future wait(F f) { // FIXME: using lambda as workaround for clang bug #50345 auto impl = [] (F f) -> future { struct container { F f; }; container c = co_await f.then_wrapped([] (F f) { return container{std::move(f)}; }); assert(c.f.available()); co_return std::move(c.f); }; return impl(std::move(f)); } template F> static futurize_t> with_timeout(logical_timer& t, raft::logical_clock::time_point tp, F&& fun) { using future_t = futurize_t>; // FIXME: using lambda as workaround for clang bug #50345 auto impl = [] (logical_timer& t, raft::logical_clock::time_point tp, F&& fun) -> future_t { abort_source timeout_as; // Using lambda here as workaround for seastar#1005 future_t f = futurize_invoke([fun = std::move(fun)] (abort_source& as) mutable { return std::forward(fun)(as); }, timeout_as); auto sleep_and_abort = [] (raft::logical_clock::time_point tp, abort_source& timeout_as, logical_timer& t) -> future<> { co_await t.sleep_until(tp, timeout_as); if (!timeout_as.abort_requested()) { // We resolved before `f`. Abort the operation. timeout_as.request_abort(); } }(tp, timeout_as, t); f = co_await wait(std::move(f)); if (!timeout_as.abort_requested()) { // `f` has already resolved, but abort the sleep. timeout_as.request_abort(); } // Wait on the sleep as well (it should return shortly, being aborted) so we don't discard the future. try { co_await std::move(sleep_and_abort); } catch (const sleep_aborted&) { // Expected (if `f` resolved first or we were externally aborted). } catch (...) { // There should be no other exceptions, but just in case... log it and discard, // we want to propagate exceptions from `f`, not from sleep. tlogger.error("unexpected exception from sleep_and_abort", std::current_exception()); } // The future is available but cannot use `f.get()` as it doesn't handle void futures. 
co_return co_await std::move(f); }; return impl(t, tp, std::forward(fun)); } // Sends a given `input` as a command to `server`, waits until the command gets replicated // and applied on that server and returns the produced output. // // The wait time is limited using `timeout` which is a logical time point referring to the // logical clock used by `timer`. Standard way to use is to pass `timer.now() + X_t` // as the time point, where `X` is the maximum number of ticks that we wait for. // // `sm` must be a reference to the state machine owned by `server`. // // The `server` may currently be a follower, in which case it will return a `not_a_leader` error. template future> call( typename M::input_t input, raft::logical_clock::time_point timeout, logical_timer& timer, raft::server& server, impure_state_machine& sm) { using output_channel_dropped = typename impure_state_machine::output_channel_dropped; using input_t = typename M::input_t; using output_t = typename M::output_t; return sm.with_output_channel([&, input = std::move(input), timeout] (cmd_id_t cmd_id, future f) { return with_timeout(timer, timeout, std::bind_front([&] (input_t input, future f, abort_source& as) { return server.add_entry( make_command(std::move(cmd_id), std::move(input)), raft::wait_type::applied, &as ).then_wrapped([output_f = std::move(f)] (future<> add_entry_f) mutable { if (add_entry_f.failed()) { // We need to discard `output_f`; the only expected exception is: (void)output_f.discard_result().handle_exception_type([] (const output_channel_dropped&) {}); std::rethrow_exception(add_entry_f.get_exception()); } return std::move(output_f); }); }, std::move(input), std::move(f))); }).then([] (output_t output) { return make_ready_future>(std::move(output)); }).handle_exception([] (std::exception_ptr eptr) { try { std::rethrow_exception(eptr); } catch (raft::not_a_leader e) { return make_ready_future>(e); } catch (raft::dropped_entry e) { return make_ready_future>(e); } catch 
(raft::commit_status_unknown e) { return make_ready_future>(e); } catch (raft::stopped_error e) { return make_ready_future>(e); } catch (raft::request_aborted&) { return make_ready_future>(timed_out_error{}); } catch (seastar::timed_out_error& e) { return make_ready_future>(e); } catch (broken_promise&) { // FIXME: workaround for #9688 return make_ready_future>(raft::stopped_error{}); } catch (...) { tlogger.error("unexpected exception from call: {}", std::current_exception()); assert(false); } }); } // Allows a Raft server to communicate with other servers. // The implementation is mostly boilerplate. It assumes that there exists a method of message passing // given by a `send_message_t` function (passed in the constructor) for sending and by the `receive` // function for receiving messages. // // We also keep a reference to a `snapshots_t` set to be shared with the `impure_state_machine` // on the same server. We access this set when we receive or send a snapshot message. template class rpc : public raft::rpc { using reply_id_t = uint32_t; struct snapshot_message { raft::install_snapshot ins; State snapshot_payload; reply_id_t reply_id; }; struct snapshot_reply_message { raft::snapshot_reply reply; reply_id_t reply_id; }; struct execute_barrier_on_leader { reply_id_t reply_id; }; struct execute_barrier_on_leader_reply { raft::read_barrier_reply reply; reply_id_t reply_id; }; struct add_entry_message { raft::command cmd; reply_id_t reply_id; }; struct add_entry_reply_message { raft::add_entry_reply reply; reply_id_t reply_id; }; struct modify_config_message { std::vector add; std::vector del; reply_id_t reply_id; }; public: using message_t = std::variant< snapshot_message, snapshot_reply_message, raft::append_request, raft::append_reply, raft::vote_request, raft::vote_reply, raft::timeout_now, raft::read_quorum, raft::read_quorum_reply, execute_barrier_on_leader, execute_barrier_on_leader_reply, add_entry_message, add_entry_reply_message, modify_config_message >; 
using send_message_t = std::function; private: raft::server_id _id; snapshots_t& _snapshots; logical_timer _timer; send_message_t _send; // Before we send a snapshot apply request we create a promise-future pair, // allocate a new ID, and put the promise here under that ID. We then send the ID // together with the request and wait on the future. // When (if) a reply returns, we take the ID from the reply (which is the same // as the ID in the corresponding request), take the promise under that ID // and push the reply through that promise. using reply_promise = std::variant< promise, promise, promise >; std::unordered_map _reply_promises; reply_id_t _counter = 0; // Used to ensure that when `abort()` returns there are // no more in-progress methods running on this object. seastar::gate _gate; size_t _snapshot_applications = 0; size_t _read_barrier_executions = 0; size_t _add_entry_executions = 0; size_t _modify_config_executions = 0; template auto with_gate(F&& f) -> decltype(f()) { try { co_return co_await seastar::with_gate(_gate, std::forward(f)); } catch (const gate_closed_exception&) { co_return coroutine::make_exception(raft::stopped_error{}); } } public: rpc(raft::server_id id, snapshots_t& snaps, send_message_t send) : _id(id), _snapshots(snaps), _send(std::move(send)) { } // Message is delivered to us. // The caller must ensure that `abort()` wasn't called yet. void receive(raft::server_id src, message_t payload) { assert(!_gate.is_closed()); assert(_client); auto& c = *_client; std::visit(make_visitor( [&] (snapshot_message m) { static const size_t max_concurrent_snapshot_applications = 5; // TODO: configurable if (_snapshot_applications >= max_concurrent_snapshot_applications) { tlogger.warn( "{}: cannot apply snapshot from {} (id: {}) due to too many concurrent requests, dropping it", _id, src, m.ins.snp.id); // Should we send some message back instead? 
return; } ++_snapshot_applications; (void)[] (rpc& self, raft::server_id src, snapshot_message m, gate::holder holder) -> future<> { try { self._snapshots.emplace(m.ins.snp.id, std::move(m.snapshot_payload)); auto reply = co_await self._client->apply_snapshot(src, std::move(m.ins)); self._send(src, snapshot_reply_message{ .reply = std::move(reply), .reply_id = m.reply_id }); } catch (...) { tlogger.warn("{}: exception when applying snapshot from {}: {}", self._id, src, std::current_exception()); } --self._snapshot_applications; }(*this, src, std::move(m), _gate.hold()); }, [this] (snapshot_reply_message m) { auto it = _reply_promises.find(m.reply_id); if (it != _reply_promises.end()) { std::get>(it->second).set_value(std::move(m.reply)); } }, [&] (raft::append_request m) { c.append_entries(src, std::move(m)); }, [&] (raft::append_reply m) { c.append_entries_reply(src, std::move(m)); }, [&] (raft::vote_request m) { c.request_vote(src, std::move(m)); }, [&] (raft::vote_reply m) { c.request_vote_reply(src, std::move(m)); }, [&] (raft::timeout_now m) { c.timeout_now_request(src, std::move(m)); }, [&] (raft::read_quorum m) { c.read_quorum_request(src, std::move(m)); }, [&] (raft::read_quorum_reply m) { c.read_quorum_reply(src, std::move(m)); }, [&] (execute_barrier_on_leader m) { static const size_t max_concurrent_read_barrier_executions = 100; // TODO: configurable if (_read_barrier_executions >= max_concurrent_read_barrier_executions) { tlogger.warn( "{}: cannot execute read barrier for {} due to too many concurrent requests, dropping it", _id, src); // Should we send some message back instead? return; } ++_read_barrier_executions; (void)[] (rpc& self, raft::server_id src, execute_barrier_on_leader m, gate::holder holder) -> future<> { try { auto reply = co_await self._client->execute_read_barrier(src, nullptr); self._send(src, execute_barrier_on_leader_reply{ .reply = std::move(reply), .reply_id = m.reply_id }); } catch (...) 
{ tlogger.warn("{}: exception when executing read barrier for {}: {}", self._id, src, std::current_exception()); } --self._read_barrier_executions; }(*this, src, std::move(m), _gate.hold()); }, [this] (execute_barrier_on_leader_reply m) { auto it = _reply_promises.find(m.reply_id); if (it != _reply_promises.end()) { std::get>(it->second).set_value(std::move(m.reply)); } }, [&] (add_entry_message m) { static const size_t max_concurrent_add_entry_executions = 100; // TODO: configurable if (_add_entry_executions >= max_concurrent_add_entry_executions) { tlogger.warn( "{}: cannot execute add_entry for {} due to too many concurrent requests, dropping it", _id, src); // Should we send some message back instead? return; } ++_add_entry_executions; (void)[] (rpc& self, raft::server_id src, add_entry_message m, gate::holder holder) -> future<> { try { auto reply = co_await self._client->execute_add_entry(src, std::move(m.cmd), nullptr); self._send(src, add_entry_reply_message{ .reply = std::move(reply), .reply_id = m.reply_id }); } catch (...) { tlogger.warn("{}: exception when executing add_entry for {}: {}", self._id, src, std::current_exception()); } --self._add_entry_executions; }(*this, src, std::move(m), _gate.hold()); }, [this] (add_entry_reply_message m) { auto it = _reply_promises.find(m.reply_id); if (it != _reply_promises.end()) { std::get>(it->second).set_value(std::move(m.reply)); } }, [&] (modify_config_message m) { static const size_t max_concurrent_modify_config_executions = 100; // TODO: configurable if (_modify_config_executions >= max_concurrent_modify_config_executions) { tlogger.warn( "{}: cannot execute modify_config for {} due to too many concurrent requests, dropping it", _id, src); // Should we send some message back instead? 
return; } ++_modify_config_executions; (void)[] (rpc& self, raft::server_id src, modify_config_message m, gate::holder holder) -> future<> { try { auto reply = co_await self._client->execute_modify_config(src, std::move(m.add), std::move(m.del), nullptr); self._send(src, add_entry_reply_message{ .reply = std::move(reply), .reply_id = m.reply_id }); } catch (...) { tlogger.warn("{}: exception when executing modify_config for {}: {}", self._id, src, std::current_exception()); } --self._modify_config_executions; }(*this, src, std::move(m), _gate.hold()); } ), std::move(payload)); } struct snapshot_not_found { raft::snapshot_id id; }; virtual future send_snapshot(raft::server_id dst, const raft::install_snapshot& ins, seastar::abort_source&) override { co_return co_await with_gate([&] () -> future { auto it = _snapshots.find(ins.snp.id); if (it == _snapshots.end()) { throw snapshot_not_found{ .id = ins.snp.id }; } auto id = _counter++; promise p; auto f = p.get_future(); _reply_promises.emplace(id, std::move(p)); auto guard = defer([this, id] { _reply_promises.erase(id); }); _send(dst, snapshot_message{ .ins = ins, .snapshot_payload = it->second, .reply_id = id }); // The message receival function on the other side, when it receives the snapshot message, // will apply the snapshot and send `id` back to us in the snapshot reply message (see `receive`, // `snapshot_message` case). When we receive the reply, we shall find `id` in `_reply_promises` // and push the reply through the promise, which will resolve `f` (see `receive`, `snapshot_reply_message` // case). // TODO configurable static const raft::logical_clock::duration send_snapshot_timeout = 20_t; // TODO: catch aborts from the abort_source as well try { co_return co_await _timer.with_timeout(_timer.now() + send_snapshot_timeout, std::move(f)); } catch (logical_timer::timed_out& e) { // The future will probably get a broken_promise exception after we destroy the guard. 
(void)e.get_future().discard_result().handle_exception_type([] (const broken_promise&) {}); throw timed_out_error{}; } // co_await ensures that `guard` is destroyed before we leave `_gate` }); } virtual future send_add_entry(raft::server_id dst, const raft::command& cmd) override { co_return co_await with_gate([&] () -> future { auto id = _counter++; promise p; auto f = p.get_future(); _reply_promises.emplace(id, std::move(p)); auto guard = defer([this, id] { _reply_promises.erase(id); }); _send(dst, add_entry_message{ .cmd = cmd, .reply_id = id }); static const raft::logical_clock::duration send_add_entry_timeout = 20_t; try { co_return co_await _timer.with_timeout(_timer.now() + send_add_entry_timeout, std::move(f)); } catch (logical_timer::timed_out& e) { (void) e.get_future().discard_result().handle_exception_type([] (const broken_promise&) { }); throw timed_out_error{}; } }); } virtual future send_modify_config(raft::server_id dst, const std::vector& add, const std::vector& del) override { co_return co_await with_gate([&] () -> future { auto id = _counter++; promise p; auto f = p.get_future(); _reply_promises.emplace(id, std::move(p)); auto guard = defer([this, id] { _reply_promises.erase(id); }); _send(dst, modify_config_message{ .add = add, .del = del, .reply_id = id }); static const raft::logical_clock::duration send_modify_config_timeout = 200_t; try { co_return co_await _timer.with_timeout(_timer.now() + send_modify_config_timeout, std::move(f)); } catch (logical_timer::timed_out& e) { (void) e.get_future().discard_result().handle_exception_type([] (const broken_promise&) { }); throw timed_out_error{}; } }); } virtual future execute_read_barrier_on_leader(raft::server_id dst) override { co_return co_await with_gate([&] () -> future { auto id = _counter++; promise p; auto f = p.get_future(); _reply_promises.emplace(id, std::move(p)); auto guard = defer([this, id] { _reply_promises.erase(id); }); _send(dst, execute_barrier_on_leader { .reply_id = id }); // 
TODO configurable static const raft::logical_clock::duration execute_read_barrier_on_leader_timeout = 20_t; // TODO: catch aborts from the abort_source as well co_return co_await _timer.with_timeout(_timer.now() + execute_read_barrier_on_leader_timeout, std::move(f)); // co_await ensures that `guard` is destroyed before we leave `_gate` }); } virtual future<> send_append_entries(raft::server_id dst, const raft::append_request& m) override { _send(dst, m); co_return; } virtual void send_append_entries_reply(raft::server_id dst, const raft::append_reply& m) override { _send(dst, m); } virtual void send_vote_request(raft::server_id dst, const raft::vote_request& m) override { _send(dst, m); } virtual void send_vote_reply(raft::server_id dst, const raft::vote_reply& m) override { _send(dst, m); } virtual void send_timeout_now(raft::server_id dst, const raft::timeout_now& m) override { _send(dst, m); } virtual void send_read_quorum(raft::server_id dst, const raft::read_quorum& m) override { _send(dst, m); } virtual void send_read_quorum_reply(raft::server_id dst, const raft::read_quorum_reply& m) override { _send(dst, m); } virtual void add_server(raft::server_id, raft::server_info) override { } virtual void remove_server(raft::server_id) override { } virtual future<> abort() override { return _gate.close(); } void tick() { _timer.tick(); } }; template class persistence { std::pair _stored_snapshot; std::pair _stored_term_and_vote; // Invariants: // 1. for each entry except the first, the raft index is equal to the raft index of the previous entry plus one. // 2. the index of the first entry is <= _stored_snapshot.first.idx + 1. // 3. the index of the last entry is >= _stored_snapshot.first.idx. // Informally, the last two invariants say that the stored log intersects or ``touches'' the snapshot ``on the right side''. raft::log_entries _stored_entries; // Returns an iterator to the entry in `_stored_entries` whose raft index is `idx` if the entry exists. 
// If all entries in `_stored_entries` have greater indexes, returns the first one. // If all entries have smaller indexes, returns end(). raft::log_entries::iterator find(raft::index_t idx) { // The correctness of this depends on the `_stored_entries` invariant. auto b = _stored_entries.begin(); if (b == _stored_entries.end() || (*b)->idx >= idx) { return b; } return b + std::min((idx - (*b)->idx).get_value(), _stored_entries.size()); } public: // If this is the first server of a cluster, it must be initialized with a singleton configuration // containing opnly this server's ID which must be also provided here as `init_config_id`. // Otherwise it must be initialized with an empty configuration (it will be added to the cluster // through a configuration change) and `init_config_id` must be `nullopt`. persistence(std::optional init_config_id, State init_state) : _stored_snapshot( raft::snapshot_descriptor{ .config = init_config_id ? raft::configuration{*init_config_id} : raft::configuration{} }, std::move(init_state)) , _stored_term_and_vote(raft::term_t{1}, raft::server_id{}) {} void store_term_and_vote(raft::term_t term, raft::server_id vote) { _stored_term_and_vote = std::pair{term, vote}; } std::pair load_term_and_vote() { return _stored_term_and_vote; } void store_snapshot(const raft::snapshot_descriptor& snap, State snap_data, size_t preserve_log_entries) { // The snapshot's index cannot be smaller than the index of the first stored entry minus one; // that would create a ``gap'' in the log. assert(_stored_entries.empty() || snap.idx + 1 >= _stored_entries.front()->idx); _stored_snapshot = {snap, std::move(snap_data)}; if (!_stored_entries.empty() && snap.idx > _stored_entries.back()->idx) { // Clear the log in order to not create a gap. _stored_entries.clear(); return; } auto first_to_remain = snap.idx + 1 >= preserve_log_entries ? 
raft::index_t{snap.idx + 1 - preserve_log_entries} : raft::index_t{0}; _stored_entries.erase(_stored_entries.begin(), find(first_to_remain)); } std::pair load_snapshot() { return _stored_snapshot; } void store_log_entries(const std::vector& entries) { if (entries.empty()) { return; } // The raft server is supposed to provide entries in strictly increasing order, // hence the following assertions. if (_stored_entries.empty()) { assert(entries.front()->idx == _stored_snapshot.first.idx + 1); } else { assert(entries.front()->idx == _stored_entries.back()->idx + 1); } _stored_entries.push_back(entries[0]); for (size_t i = 1; i < entries.size(); ++i) { assert(entries[i]->idx == entries[i-1]->idx + 1); _stored_entries.push_back(entries[i]); } } raft::log_entries load_log() { return _stored_entries; } void truncate_log(raft::index_t idx) { _stored_entries.erase(find(idx), _stored_entries.end()); } }; template class persistence_proxy : public raft::persistence { snapshots_t& _snapshots; lw_shared_ptr<::persistence> _persistence; public: persistence_proxy(snapshots_t& snaps, lw_shared_ptr<::persistence> persistence) : _snapshots(snaps) , _persistence(std::move(persistence)) {} virtual future<> store_term_and_vote(raft::term_t term, raft::server_id vote) override { _persistence->store_term_and_vote(term, vote); co_return; } virtual future> load_term_and_vote() override { co_return _persistence->load_term_and_vote(); } virtual future<> store_commit_idx(raft::index_t) override { co_return; } virtual future load_commit_idx() override { co_return raft::index_t{0}; } // Stores not only the snapshot descriptor but also the corresponding snapshot. 
virtual future<> store_snapshot_descriptor(const raft::snapshot_descriptor& snap, size_t preserve_log_entries) override { auto it = _snapshots.find(snap.id); assert(it != _snapshots.end()); _persistence->store_snapshot(snap, it->second, preserve_log_entries); co_return; } // Loads not only the snapshot descriptor but also the corresponding snapshot. virtual future load_snapshot_descriptor() override { auto [snap, state] = _persistence->load_snapshot(); _snapshots.insert_or_assign(snap.id, std::move(state)); co_return snap; } virtual future<> store_log_entries(const std::vector& entries) override { _persistence->store_log_entries(entries); co_return; } virtual future load_log() override { co_return _persistence->load_log(); } virtual future<> truncate_log(raft::index_t idx) override { _persistence->truncate_log(idx); co_return; } virtual future<> abort() override { // There are no yields anywhere in our methods so no need to wait for anything. // We assume that our methods won't be called after `abort()`. // TODO: is this assumption correct? co_return; } }; // A failure detector using heartbeats for deciding whether to convict a server // as failed. We convict a server if we don't receive a heartbeat for a long enough time. // `failure_detector` assumes a message-passing method given by a `send_heartbeat_t` function // through the constructor for sending heartbeats and assumes that `receive_heartbeat` is called // whenever another server sends a message to us. // To decide who to send heartbeats to we use the ``current knowledge'' of servers in the network // which is updated through `add_server` and `remove_server` functions. class failure_detector : public raft::failure_detector { public: using send_heartbeat_t = std::function; private: raft::logical_clock _clock; // The set of known servers, used to broadcast heartbeats. std::unordered_set _known; // The last time we received a heartbeat from a server. 
std::unordered_map _last_heard; // The last time we sent a heartbeat. raft::logical_clock::time_point _last_beat; // How long from the last received heartbeat does it take to convict a node as dead. const raft::logical_clock::duration _convict_threshold; send_heartbeat_t _send_heartbeat; public: failure_detector(raft::logical_clock::duration convict_threshold, send_heartbeat_t f) : _convict_threshold(convict_threshold), _send_heartbeat(std::move(f)) { send_heartbeats(); assert(_last_beat == _clock.now()); } void receive_heartbeat(raft::server_id src) { assert(_known.contains(src)); _last_heard[src] = std::max(_clock.now(), _last_heard[src]); } void tick() { _clock.advance(); // TODO: make it adjustable static const raft::logical_clock::duration _heartbeat_period = 10_t; if (_last_beat + _heartbeat_period <= _clock.now()) { send_heartbeats(); } } void send_heartbeats() { for (auto& dst : _known) { _send_heartbeat(dst); } _last_beat = _clock.now(); } // We expect a server to be added through this function before we receive a heartbeat from it. void add_server(raft::server_id id) { _known.insert(id); } void remove_server(raft::server_id id) { _known.erase(id); _last_heard.erase(id); } bool is_alive(raft::server_id id) override { return _clock.now() < _last_heard[id] + _convict_threshold; } }; // `network` is a simple priority queue of `event`s, where an `event` is a message associated // with its planned delivery time. The queue uses a logical clock to decide when to deliver messages. // It delives all messages whose associated times are smaller than the ``current time'', the latter // determined by the number of `tick()` calls. template class network { public: // When the time comes to deliver a message we use this function. 
using deliver_t = std::function; private: struct message { raft::server_id src; raft::server_id dst; // shared ptr to implement duplication of messages lw_shared_ptr payload; }; struct event { raft::logical_clock::time_point time; message msg; }; deliver_t _deliver; // A min-heap of event occurences compared by their time points. std::vector _events; // Comparator for the `_events` min-heap. static bool cmp(const event& o1, const event& o2) { return o1.time > o2.time; } // A pair (dst, [src1, src2, ...]) in this set denotes that `dst` // does not receive messages from src1, src2, ... std::unordered_map> _grudges; raft::logical_clock _clock; // How long does it take to deliver a message? std::uniform_int_distribution _delivery_delay; std::mt19937 _rnd; public: network(std::uniform_int_distribution delivery_delay, std::mt19937 rnd, deliver_t f) : _deliver(std::move(f)), _delivery_delay(std::move(delivery_delay)), _rnd(std::move(rnd)) {} void send(raft::server_id src, raft::server_id dst, Payload payload) { // Predict the delivery time in advance. // Our prediction may be wrong if a grudge exists at this expected moment of delivery. // Messages may also be reordered. auto delivery_time = _clock.now() + raft::logical_clock::duration{_delivery_delay(_rnd)}; _events.push_back(event{delivery_time, message{src, dst, make_lw_shared(std::move(payload))}}); std::push_heap(_events.begin(), _events.end(), cmp); } void tick() { _clock.advance(); deliver(); } void add_grudge(raft::server_id src, raft::server_id dst) { _grudges[dst].insert(src); } void remove_grudge(raft::server_id src, raft::server_id dst) { _grudges[dst].erase(src); } private: void deliver() { // Deliver every message whose time has come. while (!_events.empty() && _events.front().time <= _clock.now()) { auto& [_, m] = _events.front(); if (!_grudges[m.dst].contains(m.src)) { _deliver(m.src, m.dst, *m.payload); } else { // A grudge means that we drop the message. 
} std::pop_heap(_events.begin(), _events.end(), cmp); _events.pop_back(); } } }; using reconfigure_result_t = std::variant; future reconfigure( const std::vector& ids, raft::logical_clock::time_point timeout, logical_timer& timer, raft::server& server) { raft::server_address_set config; for (auto id : ids) { config.insert(raft::server_address { .id = id }); } try { co_await with_timeout(timer, timeout, [&server, config = std::move(config)] (abort_source& as) { return server.set_configuration(std::move(config), &as); }); co_return std::monostate{}; } catch (raft::not_a_leader e) { co_return e; } catch (raft::dropped_entry e) { co_return e; } catch (raft::commit_status_unknown e) { co_return e; } catch (raft::conf_change_in_progress e) { co_return e; } catch (broken_promise&) { // FIXME: workaround for #9688 co_return raft::stopped_error{}; } catch (raft::stopped_error e) { co_return e; } catch (raft::request_aborted&) { co_return timed_out_error{}; } catch (...) { tlogger.error("unexpected exception from set_configuration: {}", std::current_exception()); assert(false); } } future modify_config( const std::vector& added, std::vector deleted, raft::logical_clock::time_point timeout, logical_timer& timer, raft::server& server) { std::vector added_set; for (auto id : added) { added_set.push_back(raft::server_address { .id = id }); } try { co_await with_timeout(timer, timeout, [&server, added_set = std::move(added_set), deleted = std::move(deleted)] (abort_source& as) mutable { return server.modify_config(std::move(added_set), std::move(deleted), &as); }); co_return std::monostate{}; } catch (raft::not_a_leader e) { co_return e; } catch (raft::dropped_entry e) { co_return e; } catch (raft::commit_status_unknown e) { co_return e; } catch (raft::conf_change_in_progress e) { co_return e; } catch (raft::stopped_error e) { co_return e; } catch (raft::request_aborted&) { co_return timed_out_error{}; } catch (...) 
{ tlogger.error("unexpected exception from modify_config: {}", std::current_exception()); assert(false); } } // Contains a `raft::server` and other facilities needed for it and the underlying // modules (persistence, rpc, etc.) to run, and to communicate with the external environment. template class raft_server { raft::server_id _id; std::unique_ptr> _snapshots; std::unique_ptr _server; // The following objects are owned by _server: impure_state_machine& _sm; rpc& _rpc; bool _started = false; bool _stopped = false; // Used to ensure that when `abort()` returns there are // no more in-progress methods running on this object. seastar::gate _gate; public: // Create a `raft::server` with the given `id` and all other facilities required // by the server (the state machine, RPC instance and so on). The server will use // `send_rpc` to send RPC messages to other servers and `fd` for failure detection. // // The server is started with `persistence` as its underlying persistent storage. // This can be used to simulate a server that is restarting by giving it a `persistence` // that was previously used by a different instance of `raft_server` (but make sure // they had the same `id` and that the previous instance is no longer using this // `persistence`). // // The created server is not started yet; use `start` for that. 
static std::unique_ptr create( raft::server_id id, lw_shared_ptr> persistence, shared_ptr fd, raft::server::configuration cfg, typename rpc::send_message_t send_rpc) { using state_t = typename M::state_t; auto snapshots = std::make_unique>(); auto sm = std::make_unique>(id, *snapshots); auto rpc_ = std::make_unique>(id, *snapshots, std::move(send_rpc)); auto persistence_ = std::make_unique>(*snapshots, std::move(persistence)); auto& sm_ref = *sm; auto& rpc_ref = *rpc_; auto server = raft::create_server( id, std::move(rpc_), std::move(sm), std::move(persistence_), std::move(fd), std::move(cfg)); return std::make_unique(initializer{ ._id = id, ._snapshots = std::move(snapshots), ._server = std::move(server), ._sm = sm_ref, ._rpc = rpc_ref }); } ~raft_server() { assert(!_started || _stopped); } raft_server(const raft_server&&) = delete; raft_server(raft_server&&) = delete; // Start the server. Can be called at most once. future<> start() { assert(!_started); _started = true; co_await _server->start(); } // Stop the given server. Must be called before the server is destroyed // (unless it was never started in the first place). future<> abort() { auto f = _gate.close(); // Abort everything before waiting on the gate close future // so currently running operations finish earlier. 
if (_started) { co_await _server->abort(); } co_await std::move(f); _stopped = true; } void tick() { assert(_started); _rpc.tick(); _server->tick(); } future> call( typename M::input_t input, raft::logical_clock::time_point timeout, logical_timer& timer) { assert(_started); try { co_return co_await with_gate(_gate, [this, input = std::move(input), timeout, &timer] { return ::call(std::move(input), timeout, timer, *_server, _sm); }); } catch (const gate_closed_exception&) { co_return raft::stopped_error{}; } } future reconfigure( const std::vector& ids, raft::logical_clock::time_point timeout, logical_timer& timer) { assert(_started); try { co_return co_await with_gate(_gate, [this, &ids, timeout, &timer] { return ::reconfigure(ids, timeout, timer, *_server); }); } catch (const gate_closed_exception&) { co_return raft::stopped_error{}; } } future modify_config( const std::vector& added, std::vector deleted, raft::logical_clock::time_point timeout, logical_timer& timer) { assert(_started); try { co_return co_await with_gate(_gate, [this, &added, deleted = std::move(deleted), timeout, &timer] { return ::modify_config(added, std::move(deleted), timeout, timer, *_server); }); } catch (const gate_closed_exception&) { co_return raft::stopped_error{}; } } bool is_leader() const { return _server->is_leader(); } raft::server_id id() const { return _id; } const typename M::state_t& state() const { return _sm.state(); } raft::configuration get_configuration() const { return _server->get_configuration(); } void deliver(raft::server_id src, const typename rpc::message_t& m) { assert(_started); if (!_gate.is_closed()) { _rpc.receive(src, m); } } private: struct initializer { raft::server_id _id; std::unique_ptr> _snapshots; std::unique_ptr _server; impure_state_machine& _sm; rpc& _rpc; }; raft_server(initializer i) : _id(i._id) , _snapshots(std::move(i._snapshots)) , _server(std::move(i._server)) , _sm(i._sm) , _rpc(i._rpc) { } friend std::unique_ptr 
std::make_unique(initializer&&); }; static raft::server_id to_raft_id(size_t id) { // Raft uses UUID 0 as special case. assert(id > 0); return raft::server_id{utils::UUID{0, id}}; } struct environment_config { std::mt19937 rnd; std::uniform_int_distribution network_delay; raft::logical_clock::duration fd_convict_threshold; }; // A set of `raft_server`s connected by a `network`. // // The `network` is initialized with a message delivery function // which notifies the destination's failure detector on each message // and if the message contains an RPC payload, pushes it into the destination's // `delivery_queue`. // // Needs to be periodically `tick()`ed which ticks the network // and underlying servers. template class environment : public seastar::weakly_referencable> { using input_t = typename M::output_t; using state_t = typename M::state_t; using output_t = typename M::output_t; // Invariant: if `_server` is engaged then it uses `_persistence` and `_fd` // underneath and is initialized using `_cfg`. struct route { raft::server::configuration _cfg; lw_shared_ptr> _persistence; std::unique_ptr> _server; shared_ptr _fd; }; // Passed to newly created failure detectors. const raft::logical_clock::duration _fd_convict_threshold; // Used to deliver messages coming from the network to appropriate servers and their failure detectors. // Also keeps the servers and the failure detectors alive (owns them). // Before we show a Raft server to others we must add it to this map. std::unordered_map _routes; // Used to create a new ID in `new_server`. size_t _next_id = 1; // Engaged optional: RPC message, nullopt: heartbeat using message_t = std::optional::message_t>; network _network; bool _stopped = false; // Used to ensure that when `abort()` returns there are // no more in-progress methods running on this object. seastar::gate _gate; // Used to implement `crash`. 
// // We cannot destroy a server immediately in order to simulate a crash: // there may be fibers running that use the server's internals. // We move these 'crashed' servers into continuations attached to this fiber // and abort them there before destruction. future<> _crash_fiber = make_ready_future<>(); // Servers that are aborting in the background (in `_crash_fiber`). // We need these pointers so we keep ticking the servers // (in general, `abort()` requires the server to be ticked in order to finish). // One downside of this is that ticks may cause the servers to output traces. // Hopefully these crashing servers abort quickly so they don't stay too long // and make the logs unreadable... std::unordered_set*> _crashing_servers; public: environment(environment_config cfg) : _fd_convict_threshold(cfg.fd_convict_threshold) , _network(std::move(cfg.network_delay), std::move(cfg.rnd), [this] (raft::server_id src, raft::server_id dst, const message_t& m) { auto& n = _routes.at(dst); assert(n._persistence); assert(n._fd); if (n._server) { n._fd->receive_heartbeat(src); if (m) { n._server->deliver(src, *m); } } }) { } ~environment() { assert(_routes.empty() || _stopped); } environment(const environment&) = delete; environment(environment&&) = delete; void tick_network() { _network.tick(); } template *, failure_detector&> F> void for_each_server(F&& f) { for (auto& [id, r]: _routes) { assert(r._fd); f(id, r._server.get(), *r._fd); } } // Call this periodically so `abort()` can finish for 'crashed' servers. void tick_crashing_servers() { for (auto& srv: _crashing_servers) { srv->tick(); } } void tick_servers() { for_each_server([] (raft::server_id, raft_server* srv, failure_detector& fd) { if (srv) { srv->tick(); } fd.tick(); }); tick_crashing_servers(); } // A 'node' is a container for a Raft server, its storage ('persistence') and failure detector. // At a given point in time at most one Raft server instance can be running on a node. 
// Different instances may be running at different points in time, but they will all have // the same ID (returned by `new_node`) and will reuse the same storage and failure detector // (this can be used to simulate a server that is restarting). // // The storage is initialized when the node is created and will be used by the first started server. // If `first == true` the storage is created with a singleton server configuration containing only // the ID returned from the function. Otherwise it is created with an empty configuration // (a server started on this node will have to be joined to an existing cluster in this case). raft::server_id new_node(bool first, raft::server::configuration cfg) { _gate.check(); auto id = to_raft_id(_next_id++); auto [it, inserted] = _routes.emplace(id, route{ ._cfg = std::move(cfg), ._persistence = make_lw_shared>(first ? std::optional{id} : std::nullopt, M::init), ._server = nullptr, ._fd = nullptr, }); assert(inserted); auto& n = it->second; n._fd = seastar::make_shared(_fd_convict_threshold, [id, &n, this] (raft::server_id dst) { // Ping others only if a server is running. if (n._server) { _network.send(id, dst, std::nullopt); } }); // Add us to other servers' failure detectors. for (auto& [_, r] : _routes) { r._fd->add_server(id); } // Add other servers to our failure detector. for (auto& [id, _] : _routes) { n._fd->add_server(id); } return id; } // Starts a server on node `id`. // Assumes node with `id` exists (i.e. an earlier `new_node` call returned `id`) and that no server is running on node `id`. 
future<> start_server(raft::server_id id) { return with_gate(_gate, [this, id] () -> future<> { auto& n = _routes.at(id); assert(n._persistence); assert(n._fd); assert(!n._server); lw_shared_ptr*> this_srv_addr = make_lw_shared*>(nullptr); auto srv = raft_server::create(id, n._persistence, n._fd, n._cfg, [id, this_srv_addr, &n, this] (raft::server_id dst, typename rpc::message_t m) { // Allow the message out only if we are still the currently running server on this node. if (*this_srv_addr == n._server.get()) { _network.send(id, dst, {std::move(m)}); } }); *this_srv_addr = srv.get(); co_await srv->start(); n._server = std::move(srv); }); } // Creates a new node, connects it to the network, starts a server on it and returns its ID. // // If `first == true` the node is created with a singleton configuration containing only its ID. // Otherwise it is created with an empty configuration. The user must explicitly ask for a configuration change // if they want to make a cluster (group) out of this server and other existing servers. // The user should be able to create multiple clusters by calling `new_server` multiple times with `first = true`. // (`first` means ``first in group''). future new_server(bool first, raft::server::configuration cfg = {}) { auto id = new_node(first, std::move(cfg)); // not using co_await here due to miscompile return start_server(id).then([id] () { return id; }); } // Gracefully stop a running server. // Assumes a server is currently running on the node `id`. // When the future resolves, a new server may be started on this node. It will reuse the storage // of the previously running server (so the Raft log etc. will be preserved). future<> stop(raft::server_id id) { return with_gate(_gate, [this, id] () -> future<> { auto& n = _routes.at(id); assert(n._persistence); assert(n._server); assert(n._fd); co_await n._server->abort(); n._server = nullptr; }); } // Immediately stop a running server. 
// Assumes a server is currently running on the node `id`. // A new server may be started on this node when the function returns. It will reuse the storage // of the previously running server (so the Raft log etc. will be preserved). void crash(raft::server_id id) { _gate.check(); auto& n = _routes.at(id); assert(n._persistence); assert(n._server); assert(n._fd); // Let the 'crashed' server continue working on its copy of persistence; // none of that work will be seen by later servers restarted on this node // since they'll use a separate copy. n._persistence = make_lw_shared>(*n._persistence); // Setting `n._server` to nullptr cuts out the network access both for the server and failure detector. // Even though the server will continue running for some time (in order to be gracefully aborted), // none of that work will be seen by the rest of the environment. From others' point of view // the server is immediately gone. auto srv = std::exchange(n._server, nullptr); _crashing_servers.insert(srv.get()); auto f = std::bind_front([] (environment& self, std::unique_ptr> srv) -> future<> { tlogger.trace("crash fiber: aborting {}", srv->id()); co_await srv->abort(); tlogger.trace("crash fiber: finished aborting {}", srv->id()); self._crashing_servers.erase(srv.get()); // abort() ensures there are no in-progress calls on the server, so we can destroy it. }, std::ref(*this), std::move(srv)); // Cannot do `.then(std::move(f))`, because that would try to use `f()`, which is ill-formed (seastar#1005). _crash_fiber = _crash_fiber.then([f = std::move(f)] () mutable { return std::move(f)(); }); } bool is_leader(raft::server_id id) { auto& n = _routes.at(id); if (!n._server) { return false; } return n._server->is_leader(); } future> call( raft::server_id id, typename M::input_t input, raft::logical_clock::time_point timeout, logical_timer& timer) { auto& n = _routes.at(id); if (!n._server) { // A 'remote' caller doesn't know in general if the server is down or just slow to respond. 
// Simulate this by timing out the call. co_await timer.sleep_until(timeout); co_return timed_out_error{}; } auto srv = n._server.get(); auto res = co_await srv->call(std::move(input), timeout, timer); if (srv != n._server.get()) { // The server stopped while the call was happening. // As above, we simulate a 'remote' call by timing it out in this case. co_await timer.sleep_until(timeout); co_return timed_out_error{}; } co_return res; } future reconfigure( raft::server_id id, const std::vector& ids, raft::logical_clock::time_point timeout, logical_timer& timer) { auto& n = _routes.at(id); if (!n._server) { // A 'remote' caller doesn't know in general if the server is down or just slow to respond. // Simulate this by timing out the call. co_await timer.sleep_until(timeout); co_return timed_out_error{}; } auto srv = n._server.get(); auto res = co_await srv->reconfigure(ids, timeout, timer); if (srv != n._server.get()) { // The server stopped while the call was happening. // As above, we simulate a 'remote' call by timing it out in this case. co_await timer.sleep_until(timeout); co_return timed_out_error{}; } co_return res; } future modify_config( raft::server_id id, const std::vector& added, std::vector deleted, raft::logical_clock::time_point timeout, logical_timer& timer) { auto& n = _routes.at(id); if (!n._server) { // A 'remote' caller doesn't know in general if the server is down or just slow to respond. // Simulate this by timing out the call. co_await timer.sleep_until(timeout); co_return timed_out_error{}; } auto srv = n._server.get(); auto res = co_await srv->modify_config(added, std::move(deleted), timeout, timer); if (srv != n._server.get()) { // The server stopped while the call was happening. // As above, we simulate a 'remote' call by timing it out in this case. 
co_await timer.sleep_until(timeout); co_return timed_out_error{}; } co_return res; } std::optional get_configuration(raft::server_id id) { auto& n = _routes.at(id); if (!n._server) { return std::nullopt; } return n._server->get_configuration(); } network& get_network() { return _network; } // Must be called before we are destroyed unless `new_server` was never called. future<> abort() { // Close the gate before iterating over _routes to prevent concurrent modification by other methods. co_await _gate.close(); for (auto& [_, r] : _routes) { if (r._server) { co_await r._server->abort(); r._server = nullptr; } } co_await std::move(_crash_fiber); _stopped = true; } }; template &, ticker&> F> auto with_env_and_ticker(environment_config cfg, F f) { return do_with(std::move(f), std::make_unique>(std::move(cfg)), std::make_unique(tlogger), [] (F& f, std::unique_ptr>& env, std::unique_ptr& t) { return f(*env, *t).finally([&env_ = env, &t_ = t] () mutable -> future<> { // move into coroutine body so they don't get destroyed with the lambda (on first co_await) auto& env = env_; auto& t = t_; // We abort the environment before the ticker as the environment may require time to advance // in order to finish (e.g. some operations may need to timeout). tlogger.info("aborting environment"); co_await env->abort(); tlogger.info("environment aborted, aborting ticker"); co_await t->abort(); tlogger.info("ticker aborted"); }); }); } struct ExReg { // Replaces the state with `x` and returns the previous state. struct exchange { int32_t x; }; // Returns the state. struct read {}; // Return value for `exchange` or `read`. 
struct ret { int32_t x; }; using state_t = int32_t; using input_t = std::variant; using output_t = ret; static std::pair delta(state_t curr, input_t input) { using res_t = std::pair; return std::visit(make_visitor( [&curr] (const exchange& w) -> res_t { return {w.x, ret{curr}}; }, [&curr] (const read&) -> res_t { return {curr, ret{curr}}; } ), input); } static const state_t init; }; const ExReg::state_t ExReg::init = 0; namespace ser { template <> struct serializer { template static void write(Output& buf, const ExReg::exchange& op) { serializer::write(buf, op.x); }; template static ExReg::exchange read(Input& buf) { return { serializer::read(buf) }; } template static void skip(Input& buf) { serializer::skip(buf); } }; template <> struct serializer { template static void write(Output& buf, const ExReg::read&) {}; template static ExReg::read read(Input& buf) { return {}; } template static void skip(Input& buf) {} }; } bool operator==(ExReg::ret a, ExReg::ret b) { return a.x == b.x; } std::ostream& operator<<(std::ostream& os, const ExReg::ret& r) { return os << format("ret{{{}}}", r.x); } std::ostream& operator<<(std::ostream& os, const ExReg::read&) { return os << "read"; } std::ostream& operator<<(std::ostream& os, const ExReg::exchange& e) { return os << format("xng{{{}}}", e.x); } // Wait until either one of `nodes` in `env` becomes a leader, or time point `timeout` is reached according to `timer` (whichever happens first). // If the leader is found, returns it. Otherwise throws a `logical_timer::timed_out` exception. // // Note: the returned node may have been a leader the moment we found it, but may have just stepped down // the moment we return it. It may be useful to call this function multiple times during cluster // stabilization periods in order to find a node that will successfully answer calls. 
template struct wait_for_leader { // FIXME: change into free function after clang bug #50345 is fixed future operator()( environment& env, std::vector nodes, logical_timer& timer, raft::logical_clock::time_point timeout) { auto l = co_await timer.with_timeout(timeout, [] (weak_ptr> env, std::vector nodes) -> future { while (true) { if (!env) { co_return raft::server_id{}; } auto it = std::find_if(nodes.begin(), nodes.end(), [&env] (raft::server_id id) { return env->is_leader(id); }); if (it != nodes.end()) { co_return *it; } co_await seastar::yield(); } }(env.weak_from_this(), std::move(nodes))); assert(l != raft::server_id{}); // Note: `l` may no longer be a leader at this point if there was a yield at the `co_await` above // and `l` decided to step down, was restarted, or just got removed from the configuration. co_return l; } }; SEASTAR_TEST_CASE(basic_test) { logical_timer timer; environment_config cfg { .rnd{0}, .network_delay{5, 5}, .fd_convict_threshold = 50_t, }; co_await with_env_and_ticker(cfg, [&timer] (environment& env, ticker& t) -> future<> { using output_t = typename ExReg::output_t; t.start([&] (uint64_t tick) { env.tick_network(); timer.tick(); if (tick % 10 == 0) { env.tick_servers(); } }, 10'000); auto leader_id = co_await env.new_server(true); // Wait at most 1000 ticks for the server to elect itself as a leader. 
assert(co_await wait_for_leader{}(env, {leader_id}, timer, timer.now() + 1000_t) == leader_id); auto call = [&] (ExReg::input_t input, raft::logical_clock::duration timeout) { return env.call(leader_id, std::move(input), timer.now() + timeout, timer); }; auto eq = [] (const call_result_t& r, const output_t& expected) { return std::holds_alternative(r) && std::get(r) == expected; }; for (int i = 1; i <= 100; ++i) { assert(eq(co_await call(ExReg::exchange{i}, 100_t), ExReg::ret{i - 1})); } tlogger.debug("100 exchanges - single server - passed"); auto id2 = co_await env.new_server(false); auto id3 = co_await env.new_server(false); tlogger.debug("Started 2 more servers, changing configuration"); assert(std::holds_alternative( co_await env.reconfigure(leader_id, {leader_id, id2, id3}, timer.now() + 100_t, timer))); tlogger.debug("Configuration changed"); co_await call(ExReg::exchange{0}, 100_t); for (int i = 1; i <= 100; ++i) { assert(eq(co_await call(ExReg::exchange{i}, 100_t), ExReg::ret{i - 1})); } tlogger.debug("100 exchanges - three servers - passed"); // concurrent calls std::vector>> futs; for (int i = 0; i < 100; ++i) { futs.push_back(call(ExReg::read{}, 100_t)); co_await timer.sleep(2_t); } for (int i = 0; i < 100; ++i) { assert(eq(co_await std::move(futs[i]), ExReg::ret{100})); } tlogger.debug("100 concurrent reads - three servers - passed"); }); tlogger.debug("Finished"); } // A snapshot was being taken with the wrong term (current term instead of the term at the snapshotted index). // This is a regression test for that bug. 
// Scenario: two-server cluster, a forced term change via a temporary network partition,
// then a third server with a small snapshot threshold joins.
//
// Every operation is evaluated outside of `assert` so that it still runs in builds
// where `assert` is compiled out.
SEASTAR_TEST_CASE(snapshot_uses_correct_term_test) {
    logical_timer timer;
    environment_config cfg {
        .rnd{0},
        .network_delay{1, 1},
        .fd_convict_threshold = 10_t,
    };
    co_await with_env_and_ticker<ExReg>(cfg, [&timer] (environment<ExReg>& env, ticker& t) -> future<> {
        // The network and the timer advance on every tick, the servers on every 10th tick.
        t.start([&] (uint64_t tick) {
            env.tick_network();
            timer.tick();
            if (tick % 10 == 0) {
                env.tick_servers();
            }
        }, 10'000);

        auto id1 = co_await env.new_server(true,
                raft::server::configuration{
                    // It's easier to catch the problem when we send entries one by one, not in batches.
                    .append_request_threshold = 1,
                });
        [[maybe_unused]] auto first_leader = co_await wait_for_leader<ExReg>{}(env, {id1}, timer, timer.now() + 1000_t);
        assert(first_leader == id1);

        auto id2 = co_await env.new_server(false,
                raft::server::configuration{
                    .append_request_threshold = 1,
                });

        [[maybe_unused]] auto join_res = co_await env.reconfigure(id1, {id1, id2}, timer.now() + 100_t, timer);
        assert(std::holds_alternative<std::monostate>(join_res));

        // Append a bunch of entries
        for (int i = 1; i <= 10; ++i) {
            [[maybe_unused]] auto res = co_await env.call(id1, ExReg::exchange{0}, timer.now() + 100_t, timer);
            assert(std::holds_alternative<ExReg::ret>(res));
        }

        assert(env.is_leader(id1));

        // Force a term increase by partitioning the network and waiting for the leader to step down
        tlogger.trace("add grudge");
        env.get_network().add_grudge(id2, id1);
        env.get_network().add_grudge(id1, id2);

        while (env.is_leader(id1)) {
            co_await seastar::yield();
        }

        tlogger.trace("remove grudge");
        env.get_network().remove_grudge(id2, id1);
        env.get_network().remove_grudge(id1, id2);

        auto l = co_await wait_for_leader<ExReg>{}(env, {id1, id2}, timer, timer.now() + 1000_t);
        tlogger.trace("last leader: {}", l);

        // Now the current term is greater than the term of the first couple of entries.
        // Join another server with a small snapshot_threshold.
        // The leader will send entries to this server one by one (due to small append_request_threshold),
        // so the joining server will apply entries one by one or in small batches (depends on the timing),
        // making it likely that it decides to take a snapshot at an entry with term lower than the current one.
        // If we are (un)lucky and we take a snapshot at the last appended entry, the node will refuse all
        // later append_entries requests due to non-matching term at the last appended entry. Note: due to this
        // requirement, the test is nondeterministic and doesn't always catch the bug (it depends on a race
        // between applier_fiber and io_fiber), but it does catch it in a significant number of runs.
        // It's also a lot easier to catch this in dev than in debug, for instance.
        // If we catch the bug, the reconfigure request below will time out.

        auto id3 = co_await env.new_server(false,
                raft::server::configuration{
                    .snapshot_threshold = 5,
                    .snapshot_trailing = 2,
                });
        [[maybe_unused]] auto final_res = co_await env.reconfigure(l, {l, id3}, timer.now() + 1000_t, timer);
        assert(std::holds_alternative<std::monostate>(final_res));
    });
}

// Regression test for the following bug: when we took a snapshot, we forgot to save the configuration.
// This caused each node in the cluster to eventually forget the cluster configuration.
//
// Every operation is evaluated outside of `assert` so that it still runs in builds
// where `assert` is compiled out.
SEASTAR_TEST_CASE(snapshotting_preserves_config_test) {
    logical_timer timer;
    environment_config cfg {
        .rnd{0},
        .network_delay{1, 1},
        .fd_convict_threshold = 10_t,
    };
    co_await with_env_and_ticker<ExReg>(cfg, [&timer] (environment<ExReg>& env, ticker& t) -> future<> {
        // The network and the timer advance on every tick, the servers on every 10th tick.
        t.start([&] (uint64_t tick) {
            env.tick_network();
            timer.tick();
            if (tick % 10 == 0) {
                env.tick_servers();
            }
        }, 10'000);

        auto id1 = co_await env.new_server(true,
                raft::server::configuration{
                    .snapshot_threshold = 5,
                    .snapshot_trailing = 1,
                });
        [[maybe_unused]] auto first_leader = co_await wait_for_leader<ExReg>{}(env, {id1}, timer, timer.now() + 1000_t);
        assert(first_leader == id1);

        auto id2 = co_await env.new_server(false,
                raft::server::configuration{
                    .snapshot_threshold = 5,
                    .snapshot_trailing = 1,
                });

        [[maybe_unused]] auto join_res = co_await env.reconfigure(id1, {id1, id2}, timer.now() + 100_t, timer);
        assert(std::holds_alternative<std::monostate>(join_res));

        // Append a bunch of entries
        for (int i = 1; i <= 10; ++i) {
            [[maybe_unused]] auto res = co_await env.call(id1, ExReg::exchange{0}, timer.now() + 100_t, timer);
            assert(std::holds_alternative<ExReg::ret>(res));
        }

        assert(env.is_leader(id1));

        // Partition the network, forcing the leader to step down.
        tlogger.trace("add grudge");
        env.get_network().add_grudge(id2, id1);
        env.get_network().add_grudge(id1, id2);

        while (env.is_leader(id1)) {
            co_await seastar::yield();
        }

        tlogger.trace("remove grudge");
        env.get_network().remove_grudge(id2, id1);
        env.get_network().remove_grudge(id1, id2);

        // With the bug this would timeout, the cluster is unable to elect a leader without the configuration.
        auto l = co_await wait_for_leader<ExReg>{}(env, {id1, id2}, timer, timer.now() + 1000_t);
        tlogger.trace("last leader: {}", l);
    });
}

// Regression test for #9981.
//
// Every operation is evaluated outside of `assert` so that it still runs in builds
// where `assert` is compiled out.
SEASTAR_TEST_CASE(removed_follower_with_forwarding_learns_about_removal) {
    logical_timer timer;
    environment_config cfg {
        .rnd{0},
        .network_delay{1, 1},
        .fd_convict_threshold = 10_t,
    };
    co_await with_env_and_ticker<ExReg>(cfg, [&timer] (environment<ExReg>& env, ticker& t) -> future<> {
        // The network and the timer advance on every tick, the servers on every 10th tick.
        t.start([&] (uint64_t tick) {
            env.tick_network();
            timer.tick();
            if (tick % 10 == 0) {
                env.tick_servers();
            }
        }, 10'000);

        // Both servers use the same Raft configuration, with forwarding enabled.
        raft::server::configuration srv_cfg {
            .enable_forwarding = true,
        };

        auto id1 = co_await env.new_server(true, srv_cfg);
        [[maybe_unused]] auto first_leader = co_await wait_for_leader<ExReg>{}(env, {id1}, timer, timer.now() + 1000_t);
        assert(first_leader == id1);

        auto id2 = co_await env.new_server(false, srv_cfg);
        [[maybe_unused]] auto join_res = co_await env.reconfigure(id1, {id1, id2}, timer.now() + 100_t, timer);
        assert(std::holds_alternative<std::monostate>(join_res));

        // Server 2 forwards the entry that removes it to server 1.
        // We want server 2 to eventually learn from server 1 that it was removed,
        // so the call finishes (no timeout).
        [[maybe_unused]] auto remove_res = co_await env.modify_config(id2, {}, {id2}, timer.now() + 100_t, timer);
        assert(std::holds_alternative<std::monostate>(remove_res));
    });
}

// Given a function `F` which takes a `raft::server_id` argument and returns a variant type
// which contains `not_a_leader`, repeatedly calls `F` until it returns something else than
// `not_a_leader` or until we reach a limit, whichever happens first.
// The maximum number of calls until we give up is specified by `bounces`.
// The initial `raft::server_id` argument provided to `F` is specified as an argument // to this function (`srv_id`). If the initial call returns `not_a_leader`, then: // - if the result contained a different leader ID and we didn't already try that ID, // we will use it in the next call, sleeping for `known_leader_delay` first, // - otherwise we will take the next ID from the `known` set, sleeping for // `unknown_leader_delay` first; no ID will be tried twice. // The returned result contains the result of the last call to `F` and the last // server ID passed to `F`. template struct bouncing { using future_type = std::invoke_result_t; using value_type = typename future_type::value_type; static_assert(boost::mp11::mp_contains::value); F _f; bouncing(F f) : _f(std::move(f)) {} // FIXME: change this into a free function after clang bug #50345 is fixed. future> operator()( logical_timer& timer, std::unordered_set known, raft::server_id srv_id, size_t bounces, raft::logical_clock::duration known_leader_delay, raft::logical_clock::duration unknown_leader_delay ) { tlogger.trace("bouncing call: starting with {}", srv_id); std::unordered_set tried; while (true) { auto res = co_await _f(srv_id); tried.insert(srv_id); known.erase(srv_id); if (auto n_a_l = std::get_if(&res); n_a_l && bounces) { --bounces; if (n_a_l->leader) { assert(n_a_l->leader != srv_id); if (!tried.contains(n_a_l->leader)) { co_await timer.sleep(known_leader_delay); srv_id = n_a_l->leader; tlogger.trace("bouncing call: got `not_a_leader`, rerouted to {}", srv_id); continue; } } if (!known.empty()) { srv_id = *known.begin(); if (n_a_l->leader) { tlogger.trace("bouncing call: got `not_a_leader`, rerouted to {}, but already tried it; trying {}", n_a_l->leader, srv_id); } else { tlogger.trace("bouncing call: got `not_a_leader`, no reroute, trying {}", srv_id); } continue; } } co_return std::pair{res, srv_id}; } } }; // An operation representing a call to the Raft cluster with a specific state machine input. 
// We may bounce a number of times if the server returns `not_a_leader` before giving up. template struct raft_call { typename M::input_t input; raft::logical_clock::duration timeout; using result_type = call_result_t; struct state_type { environment& env; // The set of servers that may be part of the current configuration. // Sometimes we don't know the exact configuration, e.g. after a failed configuration change. const std::unordered_set& known; logical_timer& timer; }; future execute(state_type& s, const operation::context& ctx) { // TODO a stable contact point used by a given thread would be preferable; // the thread would switch only if necessary (the contact point left the configuration). // Currently we choose the contact point randomly each time. assert(s.known.size() > 0); static std::mt19937 engine{0}; auto it = s.known.begin(); std::advance(it, std::uniform_int_distribution{0, s.known.size() - 1}(engine)); auto contact = *it; tlogger.debug("db call start inp {} tid {} start time {} current time {} contact {}", input, ctx.thread, ctx.start, s.timer.now(), contact); auto [res, last] = co_await bouncing{[input = input, timeout = s.timer.now() + timeout, &timer = s.timer, &env = s.env] (raft::server_id id) { return env.call(id, input, timeout, timer); }}(s.timer, s.known, contact, 6, 10_t, 10_t); tlogger.debug("db call end inp {} tid {} start time {} current time {} last contact {}", input, ctx.thread, ctx.start, s.timer.now(), last); co_return res; } friend std::ostream& operator<<(std::ostream& os, const raft_call& c) { return os << format("raft_call{{input:{},timeout:{}}}", c.input, c.timeout); } }; // An operation that partitions the network in half. // During the partition, no server from one half can contact any server from the other; // the partition is symmetric. // For odd number of nodes, ensures that the current leader (if there is one) is in the minority. 
template class network_majority_grudge { raft::logical_clock::duration _duration; public: struct state_type { environment& env; const std::unordered_set& known; logical_timer& timer; std::mt19937 rnd; }; using result_type = std::monostate; network_majority_grudge(raft::logical_clock::duration d) : _duration(d) { static_assert(operation::Executable>); } future execute(state_type& s, const operation::context& ctx) { std::vector nodes{s.known.begin(), s.known.end()}; std::shuffle(nodes.begin(), nodes.end(), s.rnd); auto mid = nodes.begin() + (nodes.size() / 2); if (nodes.size() % 2) { // Odd number of nodes, let's ensure that the leader (if there is one) is in the minority auto it = std::find_if(mid, nodes.end(), [&env = s.env] (raft::server_id id) { return env.is_leader(id); }); if (it != nodes.end()) { std::swap(*nodes.begin(), *it); } } // Note: creating the grudges has O(n^2) complexity, where n is the cluster size. // May be problematic for (very) large clusters. for (auto x = nodes.begin(); x != mid; ++x) { for (auto y = mid; y != nodes.end(); ++y) { s.env.get_network().add_grudge(*x, *y); s.env.get_network().add_grudge(*y, *x); } } tlogger.debug("network_majority_grudge start tid {} start time {} current time {} duration {} grudge: {} vs {}", ctx.thread, ctx.start, s.timer.now(), _duration, std::vector{nodes.begin(), mid}, std::vector{mid, nodes.end()}); co_await s.timer.sleep(_duration); tlogger.debug("network_majority_grudge end tid {} start time {} current time {}", ctx.thread, ctx.start, s.timer.now()); // Some servers in `nodes` may already be gone at this point but network doesn't care. // It's safe to call `remove_grudge`. 
for (auto x = nodes.begin(); x != mid; ++x) { for (auto y = mid; y != nodes.end(); ++y) { s.env.get_network().remove_grudge(*x, *y); s.env.get_network().remove_grudge(*y, *x); } } co_return std::monostate{}; } friend std::ostream& operator<<(std::ostream& os, const network_majority_grudge& p) { return os << format("network_majority_grudge{{duration:{}}}", p._duration); } }; // Must be executed sequentially. template struct reconfiguration { raft::logical_clock::duration timeout; struct state_type { const std::vector all_servers; environment& env; // a subset of all_servers that we modify; // the set of servers which may potentially be in the current configuration std::unordered_set& known; logical_timer& timer; std::mt19937 rnd; }; using result_type = reconfigure_result_t; future execute(state_type& s, const operation::context& ctx) { assert(s.all_servers.size() > 1); std::vector nodes{s.all_servers.begin(), s.all_servers.end()}; std::shuffle(nodes.begin(), nodes.end(), s.rnd); nodes.resize(std::uniform_int_distribution{1, nodes.size()}(s.rnd)); assert(s.known.size() > 0); auto [res, last] = co_await bouncing{[&nodes, timeout = s.timer.now() + timeout, &timer = s.timer, &env = s.env] (raft::server_id id) { return env.reconfigure(id, nodes, timeout, timer); }}(s.timer, s.known, *s.known.begin(), 10, 10_t, 10_t); std::visit(make_visitor( [&, last = last] (std::monostate) { tlogger.debug("reconfig successful from {} to {} by {}", s.known, nodes, last); s.known = std::unordered_set{nodes.begin(), nodes.end()}; // TODO: include the old leader as well in case it's not part of the new config? // it may remain a leader for some time... 
}, [&, last = last] (raft::not_a_leader& e) { tlogger.debug("reconfig failed, not a leader: {} tried {} by {}", e, nodes, last); }, [&, last = last] (auto& e) { s.known.merge(std::unordered_set{nodes.begin(), nodes.end()}); tlogger.debug("reconfig failed: {}, tried {} after merge {} by {}", e, nodes, s.known, last); } ), res); co_return res; } friend std::ostream& operator<<(std::ostream& os, const reconfiguration& r) { return os << format("reconfiguration{{timeout:{}}}", r.timeout); } }; template struct stop_crash { raft::logical_clock::duration restart_delay; struct state_type { environment& env; std::unordered_set& known; logical_timer& timer; std::mt19937 rnd; }; struct result_type {}; future execute(state_type& s, const operation::context& ctx) { assert(s.known.size() > 0); auto it = s.known.begin(); std::advance(it, std::uniform_int_distribution{0, s.known.size() - 1}(s.rnd)); auto srv = *it; static std::bernoulli_distribution bdist{0.5}; if (bdist(s.rnd)) { tlogger.debug("Crashing server {}", srv); s.env.crash(srv); } else { tlogger.debug("Stopping server {}...", srv); co_await s.env.stop(srv); tlogger.debug("Server {} stopped", srv); } co_await s.timer.sleep(restart_delay); tlogger.debug("Restarting server {}", srv); co_await s.env.start_server(srv); co_return result_type{}; } friend std::ostream& operator<<(std::ostream& os, const stop_crash& c) { return os << format("stop_crash{{delay:{}}}", c.restart_delay); } friend std::ostream& operator<<(std::ostream& os, const result_type&) { return os << ""; } }; namespace std { std::ostream& operator<<(std::ostream& os, const std::monostate&) { return os << ""; } template std::ostream& operator<<(std::ostream& os, const std::variant& v) { std::visit([&os] (auto& arg) { os << arg; }, v); return os; } } // namespace std namespace operation { std::ostream& operator<<(std::ostream& os, const thread_id& tid) { return os << format("thread_id{{{}}}", tid.id); } } // namespace operation // An immutable sequence of 
integers. class append_seq { public: using elem_t = int32_t; private: // This represents the sequence of integers from _seq->begin() to _seq->begin() + _end. // The underlying vector *_seq may however be shared by other instances of `append_seq`. // If only one instance is appending, the operation is O(1). However, each subsequent // append performed by another instance sharing this vector must perform a copy. lw_shared_ptr> _seq; // always engaged size_t _end; // <= _seq.size() elem_t _digest; // sum of all elements modulo `magic` static const elem_t magic = 54313; public: append_seq(std::vector v) : _seq{make_lw_shared>(std::move(v))}, _end{_seq->size()}, _digest{0} { for (auto x : *_seq) { _digest = digest_append(_digest, x); } } static elem_t digest_append(elem_t d, elem_t x) { assert(0 <= d < magic); auto y = (d + x) % magic; assert(digest_remove(y, x) == d); return y; } static elem_t digest_remove(elem_t d, elem_t x) { assert(0 <= d < magic); auto y = (d - x) % magic; return y < 0 ? y + magic : y; } elem_t digest() const { return _digest; } append_seq append(elem_t x) const { assert(_seq); assert(_end <= _seq->size()); auto seq = _seq; if (_end < seq->size()) { // The shared sequence was already appended beyond _end by someone else. // We need to copy everything so we don't break the other guy. 
seq = make_lw_shared>(seq->begin(), seq->begin() + _end); } seq->push_back(x); return {std::move(seq), _end + 1, digest_append(_digest, x)}; } elem_t operator[](size_t idx) const { assert(_seq); assert(idx < _end); assert(_end <= _seq->size()); return (*_seq)[idx]; } bool empty() const { return _end == 0; } size_t size() const { assert(_end <= _seq->size()); return _end; } std::pair pop() const { assert(_seq); assert(_end <= _seq->size()); assert(0 < _end); return {{_seq, _end - 1, digest_remove(_digest, (*_seq)[_end - 1])}, (*_seq)[_end - 1]}; } friend std::ostream& operator<<(std::ostream& os, const append_seq& s) { // TODO: don't copy the elements std::vector v{s._seq->begin(), s._seq->begin() + s._end}; return os << format("seq({} _end {})", v, s._end); } private: append_seq(lw_shared_ptr> seq, size_t end, elem_t d) : _seq(std::move(seq)), _end(end), _digest(d) {} }; struct AppendReg { struct append { int32_t x; }; struct ret { int32_t x; append_seq prev; }; using state_t = append_seq; using input_t = append; using output_t = ret; static std::pair delta(const state_t& curr, input_t input) { return {curr.append(input.x), {input.x, curr}}; } static thread_local const state_t init; }; thread_local const AppendReg::state_t AppendReg::init{{0}}; namespace ser { template <> struct serializer { template static void write(Output& buf, const AppendReg::append& op) { serializer::write(buf, op.x); }; template static AppendReg::append read(Input& buf) { return { serializer::read(buf) }; } template static void skip(Input& buf) { serializer::skip(buf); } }; } struct inconsistency { std::string what; }; struct append_reg_model { using elem_t = typename append_seq::elem_t; struct entry { elem_t elem; elem_t digest; }; friend std::ostream& operator<<(std::ostream& os, const entry& e) { return os << e.elem; } std::vector seq{{0, 0}}; std::unordered_map index{{0, 0}}; std::unordered_set banned; std::unordered_set returned; std::unordered_set in_progress; void invocation(elem_t x) 
{ assert(!index.contains(x)); assert(!in_progress.contains(x)); in_progress.insert(x); } void return_success(elem_t x, append_seq prev) { assert(!returned.contains(x)); assert(x != 0); assert(!prev.empty()); try { completion(x, prev); } catch (inconsistency& e) { e.what += format("\nwhen completing elem: {}\nprev: {}\nmodel: {}", x, prev, seq); throw; } returned.insert(x); } void return_failure(elem_t x) { assert(!index.contains(x)); assert(in_progress.contains(x)); banned.insert(x); in_progress.erase(x); } private: void completion(elem_t x, append_seq prev) { if (prev.empty()) { assert(x == 0); return; } assert(x != 0); assert(!banned.contains(x)); assert(in_progress.contains(x) || index.contains(x)); auto [prev_prev, prev_x] = prev.pop(); if (auto it = index.find(x); it != index.end()) { // This element was already completed. auto idx = it->second; assert(0 < idx); assert(idx < seq.size()); if (prev_x != seq[idx - 1].elem) { throw inconsistency{format( "elem {} completed again (existing at idx {}), but prev elem does not match existing model" "\nprev elem: {}\nmodel prev elem: {}\nprev: {} model up to idx: {}", x, idx, prev_x, seq[idx - 1].elem, prev, std::vector{seq.begin(), seq.begin()+idx})}; } if (prev.digest() != seq[idx - 1].digest) { auto err = format( "elem {} completed again (existing at idx {}), but prev does not match existing model" "\n prev: {}\nmodel up to idx: {}", x, idx, prev, std::vector{seq.begin(), seq.begin()+idx}); auto min_len = std::min(prev.size(), idx); for (size_t i = 0; i < min_len; ++i) { if (prev[i] != seq[i].elem) { err += format("\nmismatch at idx {} prev {} model {}", i, prev[i], seq[i].elem); } } throw inconsistency{std::move(err)}; } return; } // A new completion. // First, recursively complete the previous elements... completion(prev_x, std::move(prev_prev)); // Check that the existing tail matches our tail. 
assert(!seq.empty()); if (prev_x != seq.back().elem) { throw inconsistency{format( "new completion (elem: {}) but prev elem does not match existing model" "\nprev elem: {}\nmodel prev elem: {}\nprev: {}\n model: {}", x, prev_x, seq.back().elem, prev, seq)}; } if (prev.digest() != seq.back().digest) { auto err = format( "new completion (elem: {}) but prev does not match existing model" "\nprev: {}\n model: {}", x, prev, seq); auto min_len = std::min(prev.size(), seq.size()); for (size_t i = 0; i < min_len; ++i) { if (prev[i] != seq[i].elem) { err += format("\nmismatch at idx {} prev {} model {}", i, prev[i], seq[i].elem); } } throw inconsistency{std::move(err)}; } // All previous elements were completed, so the new element belongs at the end. index.emplace(x, seq.size()); seq.push_back(entry{x, append_seq::digest_append(seq.back().digest, x)}); in_progress.erase(x); } }; std::ostream& operator<<(std::ostream& os, const AppendReg::append& a) { return os << format("append{{{}}}", a.x); } std::ostream& operator<<(std::ostream& os, const AppendReg::ret& r) { return os << format("ret{{{}, {}}}", r.x, r.prev); } SEASTAR_TEST_CASE(basic_generator_test) { using op_type = operation::invocable, network_majority_grudge, reconfiguration, stop_crash >>; using history_t = utils::chunked_vector>>; static_assert(operation::Invocable); auto seed = tests::random::get_int(); std::mt19937 random_engine{seed}; logical_timer timer; environment_config cfg { .rnd{random_engine}, .network_delay{0, 6}, .fd_convict_threshold = 50_t, }; co_await with_env_and_ticker(cfg, [&] (environment& env, ticker& t) -> future<> { t.start([&, dist = std::uniform_int_distribution(0, 9)] (uint64_t tick) mutable { env.tick_network(); timer.tick(); env.for_each_server([&] (raft::server_id, raft_server* srv, failure_detector& fd) { // Tick each server with probability 1/10. // Thus each server is ticked, on average, once every 10 timer/network ticks. 
// On the other hand, we now have servers running at different speeds. if (srv && dist(random_engine) == 0) { srv->tick(); fd.tick(); } }); env.tick_crashing_servers(); }, 200'000); std::bernoulli_distribution bdist{0.5}; // With probability 1/2 enable forwarding: when we send a command to a follower, it automatically // forwards it to the known leader or waits for learning about a leader instead of returning // `not_a_leader`. bool forwarding = bdist(random_engine); // With probability 1/2, run the servers with a configuration which causes frequent snapshotting. // Note: with the default configuration we won't observe any snapshots at all, since the default // threshold is 1024 log commands and we perform only 500 ops. bool frequent_snapshotting = bdist(random_engine); // TODO: randomize the snapshot thresholds between different servers for more chaos. auto srv_cfg = frequent_snapshotting ? raft::server::configuration { .snapshot_threshold{10}, .snapshot_trailing{5}, .max_log_size{20}, .enable_forwarding{forwarding}, } : raft::server::configuration { .enable_forwarding{forwarding}, }; tlogger.info("basic_generator_test: forwarding: {}, frequent snapshotting: {}", forwarding, frequent_snapshotting); auto leader_id = co_await env.new_server(true, srv_cfg); // Wait for the server to elect itself as a leader. assert(co_await wait_for_leader{}(env, {leader_id}, timer, timer.now() + 1000_t) == leader_id); size_t no_all_servers = 10; std::vector all_servers{leader_id}; for (size_t i = 1; i < no_all_servers; ++i) { all_servers.push_back(co_await env.new_server(false, srv_cfg)); } size_t no_init_servers = 5; // `known_config` represents the set of servers that may potentially be in the cluster configuration. // // It is not possible to determine in general what the 'true' current configuration is (if even such notion // makes sense at all). 
Given a sequence of reconfiguration requests, assuming that all except possibly the last // requests have finished, then: // - if the last request has finished successfully, then the current configuration must be equal // to the one chosen in the last request; // - but if it hasn't finished yet, or it finished with a failure, the current configuration may contain servers // from the one chosen in the last request or from the previously known set of servers. // // The situation is even worse considering that requests may never 'finish', i.e. we may never get a response // to a reconfiguration request (in which case we eventually timeout). These requests may in theory execute // at any point in the future. We take a practical approach when updating `known_config`: we assume // that our timeouts for reconfiguration requests are large enough so that if a reconfiguration request // has timed out, it has either already finished or it never will. // TODO: this may not be true and we may end up with `known_config` that does not contain the current leader // (not observed in practice yet though... I think) Come up with a better approach. 
std::unordered_set known_config; for (size_t i = 0; i < no_init_servers; ++i) { known_config.insert(all_servers[i]); } assert(std::holds_alternative( co_await env.reconfigure(leader_id, std::vector{known_config.begin(), known_config.end()}, timer.now() + 100_t, timer))); auto threads = operation::make_thread_set(all_servers.size() + 3); auto [partition_thread, reconfig_thread, crash_thread] = take<3>(threads); raft_call::state_type db_call_state { .env = env, .known = known_config, .timer = timer }; network_majority_grudge::state_type network_majority_grudge_state { .env = env, .known = known_config, .timer = timer, .rnd = std::mt19937{seed} }; reconfiguration::state_type reconfiguration_state { .all_servers = all_servers, .env = env, .known = known_config, .timer = timer, .rnd = std::mt19937{seed} }; stop_crash::state_type crash_state { .env = env, .known = known_config, .timer = timer, .rnd = std::mt19937{seed} }; auto init_state = op_type::state_type{ std::move(db_call_state), std::move(network_majority_grudge_state), std::move(reconfiguration_state), std::move(crash_state) }; using namespace generator; // For reference to ``real life'' suppose 1_t ~= 10ms. 
Then: // 10_t (server tick) ~= 100ms // network delay = 3_t ~= 30ms // election timeout = 10 server ticks = 100_t ~= 1s // thus, to enforce leader election, need a majority to convict the current leader for > 100_t ~= 1s, // failure detector convict threshold = 50 srv ticks = 500_t ~= 5s // so need to partition for > 600_t ~= 6s // choose network partition duration uniformly from [600_t-600_t/3, 600_t+600_t/3] = [400_t, 800_t] // ~= [4s, 8s] -> ~1/2 partitions should cause an election // we will set request timeout 600_t ~= 6s and partition every 1200_t ~= 12s auto gen = op_limit(500, pin(partition_thread, stagger(seed, timer.now() + 200_t, 1200_t, 1200_t, random(seed, [] (std::mt19937& engine) { static std::uniform_int_distribution dist{400, 800}; return op_type{network_majority_grudge{raft::logical_clock::duration{dist(engine)}}}; }) ), pin(reconfig_thread, stagger(seed, timer.now() + 1000_t, 500_t, 500_t, constant([] () { return op_type{reconfiguration{500_t}}; }) ), pin(crash_thread, stagger(seed, timer.now() + 200_t, 100_t, 200_t, random(seed, [] (std::mt19937& engine) { static std::uniform_int_distribution dist{0, 100}; return op_type{stop_crash{raft::logical_clock::duration{dist(engine)}}}; }) ), stagger(seed, timer.now(), 0_t, 50_t, sequence(1, [] (int32_t i) { assert(i > 0); return op_type{raft_call{AppendReg::append{i}, 200_t}}; }) ) ) ) ) ); struct statistics { size_t invocations{0}; size_t successes{0}; size_t failures{0}; }; class consistency_checker { append_reg_model _model; statistics& _stats; public: consistency_checker(statistics& s) : _model{}, _stats(s) {} void operator()(op_type o) { tlogger.debug("invocation {}", o); if (auto call_op = std::get_if>(&o.op)) { ++_stats.invocations; _model.invocation(call_op->input.x); } } void operator()(operation::completion c) { auto res = std::get_if(&c.result); assert(res); if (auto call_res = std::get_if::result_type>(res)) { std::visit(make_visitor( [this] (AppendReg::output_t& out) { 
tlogger.debug("completion x: {} prev digest: {}", out.x, out.prev.digest()); ++_stats.successes; _model.return_success(out.x, std::move(out.prev)); }, [this] (raft::not_a_leader& e) { // TODO: this is a definite failure, mark it // _model.return_failure(...) ++_stats.failures; }, [this] (raft::commit_status_unknown& e) { // TODO assert: only allowed if reconfigurations happen? // assert(false); TODO debug this ++_stats.failures; }, [this] (auto&) { ++_stats.failures; } ), *call_res); } else { tlogger.debug("completion {}", c); } // TODO: check consistency of reconfiguration completions // (there's not much to check, but for example: we should not get back `conf_change_in_progress` // if our last reconfiguration was successful?). } }; statistics stats; history_t history; interpreter interp{ std::move(gen), std::move(threads), 1_t, std::move(init_state), timer, consistency_checker{stats}}; try { co_await interp.run(); } catch (inconsistency& e) { tlogger.error("inconsistency: {}", e.what); env.for_each_server([&] (raft::server_id id, raft_server* srv, failure_detector&) { if (srv) { tlogger.info("server {} state machine state: {}", id, srv->state()); } else { tlogger.info("node {} currently missing server", id); } }); assert(false); } tlogger.info("Finished generator run, time: {}, invocations: {}, successes: {}, failures: {}, total: {}", timer.now(), stats.invocations, stats.successes, stats.failures, stats.successes + stats.failures); // Liveness check: we must be able to obtain a final response after all the nemeses have stopped. // Due to possible multiple leaders at this point and the cluster stabilizing (for example there // may be no leader right now, the current leader may be stepping down etc.) we may need to try // sending requests multiple times to different servers to obtain the last result. 
auto limit = timer.now() + 10000_t; size_t cnt = 0; for (; timer.now() < limit; ++cnt) { tlogger.info("Trying to obtain last result: attempt number {}", cnt + 1); auto now = timer.now(); auto leader = co_await wait_for_leader{}(env, std::vector{all_servers.begin(), all_servers.end()}, timer, limit) .handle_exception_type([&timer, now] (logical_timer::timed_out) -> raft::server_id { tlogger.error("Failed to find a leader after {} ticks at the end of test.", timer.now() - now); assert(false); }); if (env.is_leader(leader)) { tlogger.info("Leader {} found after {} ticks", leader, timer.now() - now); } else { tlogger.warn("Leader {} found after {} ticks, but suddenly lost leadership", leader, timer.now() - now); continue; } auto config = env.get_configuration(leader); assert(config); tlogger.info("Leader {} configuration: current {} previous {}", leader, config->current, config->previous); for (auto& s: all_servers) { if (env.is_leader(s) && s != leader) { auto conf = env.get_configuration(s); assert(conf); tlogger.info("There is another leader: {}, configuration: current {} previous {}", s, conf->current, conf->previous); } } tlogger.info("From the clients' point of view, the possible cluster members are: {}", known_config); auto [res, last_attempted_server] = co_await bouncing{[&timer, &env] (raft::server_id id) { return env.call(id, AppendReg::append{-1}, timer.now() + 200_t, timer); }}(timer, known_config, leader, known_config.size() + 1, 10_t, 10_t); if (std::holds_alternative(res)) { tlogger.info("Obtained last result"); tlogger.debug("Last result: {}", res); co_return; } tlogger.warn("Failed to obtain last result at end of test: {} returned by {}", res, last_attempted_server); } tlogger.error("Failed to obtain a final successful response at the end of the test. Number of attempts: {}", cnt); assert(false); }); }