/* * Copyright (C) 2020-present ScyllaDB */ /* * SPDX-License-Identifier: AGPL-3.0-or-later */ #define BOOST_TEST_MODULE raft #include "test/raft/helpers.hh" using namespace raft; BOOST_AUTO_TEST_CASE(test_votes) { auto id1 = id(); raft::votes votes(raft::configuration({id1})); BOOST_CHECK_EQUAL(votes.tally_votes(), raft::vote_result::UNKNOWN); BOOST_CHECK_EQUAL(votes.voters().size(), 1); // Try a vote from an unknown server, it should be ignored. votes.register_vote(id(), true); votes.register_vote(id1, false); // Quorum votes against the decision BOOST_CHECK_EQUAL(votes.tally_votes(), raft::vote_result::LOST); // Another vote from the same server is ignored votes.register_vote(id1, true); votes.register_vote(id1, true); BOOST_CHECK_EQUAL(votes.tally_votes(), raft::vote_result::LOST); auto id2 = id(); votes = raft::votes(raft::configuration({id1, id2})); BOOST_CHECK_EQUAL(votes.voters().size(), 2); votes.register_vote(id1, true); // We need a quorum of participants to win an election BOOST_CHECK_EQUAL(votes.tally_votes(), raft::vote_result::UNKNOWN); votes.register_vote(id2, false); // At this point it's clear we don't have enough votes BOOST_CHECK_EQUAL(votes.tally_votes(), raft::vote_result::LOST); auto id3 = id(); // Joint configuration votes = raft::votes(raft::configuration(address_set({id1}), address_set({id2, id3}))); BOOST_CHECK_EQUAL(votes.voters().size(), 3); BOOST_CHECK_EQUAL(votes.tally_votes(), raft::vote_result::UNKNOWN); votes.register_vote(id2, true); votes.register_vote(id3, true); BOOST_CHECK_EQUAL(votes.tally_votes(), raft::vote_result::UNKNOWN); votes.register_vote(id1, false); BOOST_CHECK_EQUAL(votes.tally_votes(), raft::vote_result::LOST); votes = raft::votes(raft::configuration(address_set({id1}), address_set({id2, id3}))); votes.register_vote(id2, true); votes.register_vote(id3, true); votes.register_vote(id1, true); BOOST_CHECK_EQUAL(votes.tally_votes(), raft::vote_result::WON); votes = raft::votes(raft::configuration(address_set({id1, id2, id3}), address_set({id1}))); BOOST_CHECK_EQUAL(votes.voters().size(), 3); votes.register_vote(id1, true); BOOST_CHECK_EQUAL(votes.tally_votes(), raft::vote_result::UNKNOWN); // This gives us a majority in both new and old // configurations. votes.register_vote(id2, true); BOOST_CHECK_EQUAL(votes.tally_votes(), raft::vote_result::WON); // Basic voting test for 4 nodes auto id4 = id(); votes = raft::votes(raft::configuration({id1, id2, id3, id4})); votes.register_vote(id1, true); votes.register_vote(id2, true); BOOST_CHECK_EQUAL(votes.tally_votes(), raft::vote_result::UNKNOWN); votes.register_vote(id3, false); BOOST_CHECK_EQUAL(votes.tally_votes(), raft::vote_result::UNKNOWN); votes.register_vote(id4, false); BOOST_CHECK_EQUAL(votes.tally_votes(), raft::vote_result::LOST); auto id5 = id(); // Basic voting test for 5 nodes votes = raft::votes(raft::configuration(address_set({id1, id2, id3, id4, id5}), address_set({id1, id2, id3}))); votes.register_vote(id1, false); votes.register_vote(id2, false); BOOST_CHECK_EQUAL(votes.tally_votes(), raft::vote_result::LOST); votes.register_vote(id3, true); votes.register_vote(id4, true); votes.register_vote(id5, true); BOOST_CHECK_EQUAL(votes.tally_votes(), raft::vote_result::LOST); // Basic voting test with tree voters and one no-voter votes = raft::votes(raft::configuration({{.id = id1}, {.id = id2}, {.id = id3}, {id4, false}})); votes.register_vote(id1, true); votes.register_vote(id2, true); BOOST_CHECK_EQUAL(votes.tally_votes(), raft::vote_result::WON); // Basic test that non-voting votes are ignored votes = raft::votes(raft::configuration({{.id = id1}, {.id = id2}, {.id = id3}, {id4, false}})); votes.register_vote(id1, true); votes.register_vote(id4, true); BOOST_CHECK_EQUAL(votes.tally_votes(), raft::vote_result::UNKNOWN); votes.register_vote(id3, true); BOOST_CHECK_EQUAL(votes.tally_votes(), raft::vote_result::WON); // Joint configuration with non voting members votes = raft::votes(raft::configuration({{.id = id1}}, {{.id = id2}, {.id = id3}, {id4, false}})); BOOST_CHECK_EQUAL(votes.voters().size(), 3); BOOST_CHECK_EQUAL(votes.tally_votes(), raft::vote_result::UNKNOWN); votes.register_vote(id2, true); votes.register_vote(id3, true); votes.register_vote(id4, true); BOOST_CHECK_EQUAL(votes.tally_votes(), raft::vote_result::UNKNOWN); votes.register_vote(id1, true); BOOST_CHECK_EQUAL(votes.tally_votes(), raft::vote_result::WON); // Same node is voting in one config and non voting in another votes = raft::votes(raft::configuration({{.id = id1}, {.id = id4}}, {{.id = id2}, {.id = id3}, {id4, false}})); votes.register_vote(id2, true); votes.register_vote(id1, true); votes.register_vote(id4, true); BOOST_CHECK_EQUAL(votes.tally_votes(), raft::vote_result::UNKNOWN); votes.register_vote(id3, true); BOOST_CHECK_EQUAL(votes.tally_votes(), raft::vote_result::WON); } BOOST_AUTO_TEST_CASE(test_tracker) { auto id1 = id(); raft::tracker tracker; raft::configuration cfg({id1}); tracker.set_configuration(cfg, index_t{1}); BOOST_CHECK_NE(tracker.find(id1), nullptr); // The node with id set during construction is assumed to be // the leader, since otherwise we wouldn't create a tracker // in the first place. BOOST_CHECK_EQUAL(tracker.committed(index_t{0}), index_t{0}); // Avoid keeping a reference, follower_progress address may // change with configuration change auto pr = [&tracker](raft::server_id id) -> raft::follower_progress* { return tracker.find(id); }; BOOST_CHECK_EQUAL(pr(id1)->match_idx, index_t{0}); BOOST_CHECK_EQUAL(pr(id1)->next_idx, index_t{1}); pr(id1)->accepted(index_t{1}); BOOST_CHECK_EQUAL(pr(id1)->match_idx, index_t{1}); BOOST_CHECK_EQUAL(pr(id1)->next_idx, index_t{2}); BOOST_CHECK_EQUAL(tracker.committed(index_t{0}), index_t{1}); pr(id1)->accepted(index_t{10}); BOOST_CHECK_EQUAL(pr(id1)->match_idx, index_t{10}); BOOST_CHECK_EQUAL(pr(id1)->next_idx, index_t{11}); BOOST_CHECK_EQUAL(tracker.committed(index_t{0}), index_t{10}); // Out of order confirmation is OK // pr(id1)->accepted(index_t{5}); BOOST_CHECK_EQUAL(pr(id1)->match_idx, index_t{10}); BOOST_CHECK_EQUAL(pr(id1)->next_idx, index_t{11}); BOOST_CHECK_EQUAL(tracker.committed(index_t{5}), index_t{10}); // Enter joint configuration {A,B,C} auto id2 = id(), id3 = id(); cfg.enter_joint(address_set({id1, id2, id3})); tracker.set_configuration(cfg, index_t{1}); BOOST_CHECK_EQUAL(tracker.committed(index_t{10}), index_t{10}); pr(id2)->accepted(index_t{11}); BOOST_CHECK_EQUAL(tracker.committed(index_t{10}), index_t{10}); pr(id3)->accepted(index_t{12}); BOOST_CHECK_EQUAL(tracker.committed(index_t{10}), index_t{10}); pr(id1)->accepted(index_t{13}); BOOST_CHECK_EQUAL(tracker.committed(index_t{10}), index_t{12}); pr(id1)->accepted(index_t{14}); BOOST_CHECK_EQUAL(tracker.committed(index_t{13}), index_t{13}); // Leave joint configuration, final configuration is {A,B,C} cfg.leave_joint(); tracker.set_configuration(cfg, index_t{1}); BOOST_CHECK_EQUAL(tracker.committed(index_t{13}), index_t{13}); auto id4 = id(), id5 = id(); cfg.enter_joint(address_set({id3, id4, id5})); tracker.set_configuration(cfg, index_t{1}); BOOST_CHECK_EQUAL(tracker.committed(index_t{13}), index_t{13}); pr(id1)->accepted(index_t{15}); BOOST_CHECK_EQUAL(tracker.committed(index_t{13}), index_t{13}); pr(id5)->accepted(index_t{15}); BOOST_CHECK_EQUAL(tracker.committed(index_t{13}), index_t{13}); pr(id3)->accepted(index_t{15}); BOOST_CHECK_EQUAL(tracker.committed(index_t{13}), index_t{15}); // This does not advance the joint quorum pr(id1)->accepted(index_t{16}); pr(id4)->accepted(index_t{17}); pr(id5)->accepted(index_t{18}); BOOST_CHECK_EQUAL(tracker.committed(index_t{15}), index_t{15}); cfg.leave_joint(); tracker.set_configuration(cfg, index_t{1}); // Leaving joint configuration commits more entries BOOST_CHECK_EQUAL(tracker.committed(index_t{15}), index_t{17}); // cfg.enter_joint(address_set({id1})); cfg.leave_joint(); cfg.enter_joint(address_set({id2})); tracker.set_configuration(cfg, index_t{1}); // Sic: we're in a weird state. The joint commit index // is actually 1, since id2 is at position 1. But in // unwinding back the commit index would be weird, // so we report back the hint (prev_commit_idx). // As soon as the cluster enters joint configuration, // and old quorum is insufficient, the leader won't be able to // commit new entries until the new members catch up. BOOST_CHECK_EQUAL(tracker.committed(index_t{17}), index_t{17}); pr(id1)->accepted(index_t{18}); BOOST_CHECK_EQUAL(tracker.committed(index_t{17}), index_t{17}); pr(id2)->accepted(index_t{19}); BOOST_CHECK_EQUAL(tracker.committed(index_t{17}), index_t{18}); pr(id1)->accepted(index_t{20}); BOOST_CHECK_EQUAL(tracker.committed(index_t{18}), index_t{19}); // Check that non voting member is not counted for the quorum in simple config cfg.enter_joint({{.id = id1}, {.id = id2}, {id3, false}}); cfg.leave_joint(); tracker.set_configuration(cfg, index_t{1}); pr(id1)->accepted(index_t{30}); pr(id2)->accepted(index_t{25}); pr(id3)->accepted(index_t{30}); BOOST_CHECK_EQUAL(tracker.committed(index_t{0}), index_t{25}); // Check that non voting member is not counted for the quorum in joint config cfg.enter_joint({{.id = id4}, {.id = id5}}); tracker.set_configuration(cfg, index_t{1}); pr(id4)->accepted(index_t{30}); pr(id5)->accepted(index_t{30}); BOOST_CHECK_EQUAL(tracker.committed(index_t{0}), index_t{25}); // Check the case where the same node is in both config but different voting rights cfg.leave_joint(); cfg.enter_joint({{.id = id1}, {.id = id2}, {id5, false}}); BOOST_CHECK_EQUAL(tracker.committed(index_t{0}), index_t{25}); } BOOST_AUTO_TEST_CASE(test_log_last_conf_idx) { // last_conf_idx, prev_conf_idx are initialized correctly, // and maintained during truncate head/truncate tail server_id id1 = id(); raft::configuration cfg({id1}); raft::log log{raft::snapshot_descriptor{.config = cfg}}; BOOST_CHECK_EQUAL(log.last_conf_idx(), 0); add_entry(log, cfg); BOOST_CHECK_EQUAL(log.last_conf_idx(), 1); add_entry(log, log_entry::dummy{}); add_entry(log, cfg); BOOST_CHECK_EQUAL(log.last_conf_idx(), 3); // apply snapshot truncates the log and resets last_conf_idx() log.apply_snapshot(log_snapshot(log, log.last_idx()), 0); BOOST_CHECK_EQUAL(log.last_conf_idx(), log.get_snapshot().idx); // log::last_term() is maintained correctly by truncate_head/truncate_tail() (snapshotting) BOOST_CHECK_EQUAL(log.last_term(), log.get_snapshot().term); BOOST_CHECK(log.term_for(log.get_snapshot().idx)); BOOST_CHECK_EQUAL(log.term_for(log.get_snapshot().idx).value(), log.get_snapshot().term); BOOST_CHECK(! log.term_for(log.last_idx() - index_t{1})); add_entry(log, log_entry::dummy{}); BOOST_CHECK(log.term_for(log.last_idx())); add_entry(log, log_entry::dummy{}); const size_t GAP = 10; // apply_snapshot with a log gap, this should clear all log // entries, despite that trailing is given, a gap // between old log entries and a snapshot would violate // log continuity. log.apply_snapshot(log_snapshot(log, log.last_idx() + index_t{GAP}), GAP * 2); BOOST_CHECK(log.empty()); BOOST_CHECK_EQUAL(log.next_idx(), log.get_snapshot().idx + index_t{1}); add_entry(log, log_entry::dummy{}); BOOST_CHECK_EQUAL(log.in_memory_size(), 1); add_entry(log, log_entry::dummy{}); BOOST_CHECK_EQUAL(log.in_memory_size(), 2); // Set trailing longer than the length of the log. log.apply_snapshot(log_snapshot(log, log.last_idx()), 3); BOOST_CHECK_EQUAL(log.in_memory_size(), 2); // Set trailing the same length as the current log length add_entry(log, log_entry::dummy{}); BOOST_CHECK_EQUAL(log.in_memory_size(), 3); log.apply_snapshot(log_snapshot(log, log.last_idx()), 3); BOOST_CHECK_EQUAL(log.in_memory_size(), 3); BOOST_CHECK_EQUAL(log.last_conf_idx(), log.get_snapshot().idx); add_entry(log, log_entry::dummy{}); // Set trailing shorter than the length of the log log.apply_snapshot(log_snapshot(log, log.last_idx()), 1); BOOST_CHECK_EQUAL(log.in_memory_size(), 1); // check that configuration from snapshot is used and not config entries from a trailing add_entry(log, cfg); add_entry(log, cfg); add_entry(log, log_entry::dummy{}); auto snp_idx = log.last_idx(); log.apply_snapshot(log_snapshot(log, snp_idx), 10); BOOST_CHECK_EQUAL(log.last_conf_idx(), snp_idx); // Check that configuration from the log is used if it has higher index then snapshot idx add_entry(log, log_entry::dummy{}); snp_idx = log.last_idx(); add_entry(log, cfg); add_entry(log, cfg); log.apply_snapshot(log_snapshot(log, snp_idx), 10); BOOST_CHECK_EQUAL(log.last_conf_idx(), log.last_idx()); } void test_election_single_node_helper(raft::fsm_config fcfg) { server_id id1 = id(); raft::configuration cfg({id1}); raft::log log{raft::snapshot_descriptor{.config = cfg}}; raft::fsm fsm(id1, term_t{}, server_id{}, std::move(log), trivial_failure_detector, fcfg); election_timeout(fsm); // Immediately converts from leader to follower if quorum=1 BOOST_CHECK(fsm.is_leader()); auto output = fsm.get_output(); BOOST_CHECK(output.term_and_vote); BOOST_CHECK(output.term_and_vote->first); BOOST_CHECK(output.term_and_vote->second); BOOST_CHECK(output.messages.empty()); // A new leader applies one dummy entry BOOST_CHECK_EQUAL(output.log_entries.size(), 1); if (output.log_entries.size()) { BOOST_CHECK(std::holds_alternative(output.log_entries[0]->data)); } BOOST_CHECK(output.committed.empty()); // The leader does not become candidate simply because // a timeout has elapsed, i.e. there are no spurious // elections. election_timeout(fsm); BOOST_CHECK(fsm.is_leader()); output = fsm.get_output(); BOOST_CHECK(!output.term_and_vote); BOOST_CHECK(output.messages.empty()); BOOST_CHECK(output.log_entries.empty()); // Dummy entry is now committed BOOST_CHECK_EQUAL(output.committed.size(), 1); if (output.committed.size()) { BOOST_CHECK(std::holds_alternative(output.committed[0]->data)); } } BOOST_AUTO_TEST_CASE(test_election_single_node) { test_election_single_node_helper(fsm_cfg); } // Test that adding an entry to a single-node cluster // does not lead to RPC BOOST_AUTO_TEST_CASE(test_single_node_is_quiet) { server_id id1 = id(); raft::configuration cfg({id1}); raft::log log{raft::snapshot_descriptor{.config = cfg}}; auto fsm = create_follower(id1, std::move(log)); election_timeout(fsm); // Immediately converts from leader to follower if quorum=1 BOOST_CHECK(fsm.is_leader()); (void) fsm.get_output(); fsm.add_entry(raft::command{}); BOOST_CHECK(fsm.get_output().messages.empty()); fsm.tick(); BOOST_CHECK(fsm.get_output().messages.empty()); } BOOST_AUTO_TEST_CASE(test_snapshot_follower_is_quiet) { server_id id1 = id(), id2 = id(); raft::configuration cfg({id1, id2}); raft::log log(raft::snapshot_descriptor{.idx = index_t{999}, .config = cfg}); log.emplace_back(seastar::make_lw_shared(raft::log_entry{term_t{10}, index_t{1000}})); log.stable_to(log.last_idx()); fsm_debug fsm(id1, term_t{10}, server_id{}, std::move(log), trivial_failure_detector, fsm_cfg); // become leader election_timeout(fsm); fsm.step(id2, raft::vote_reply{fsm.get_current_term(), true}); BOOST_CHECK(fsm.is_leader()); // clear output (void) fsm.get_output(); // reply with reject pointing into the snapshot fsm.step(id2, raft::append_reply{fsm.get_current_term(), raft::index_t{1}, raft::append_reply::rejected{raft::index_t{1000}, raft::index_t{1}}}); BOOST_CHECK(fsm.get_progress(id2).state == raft::follower_progress::state::SNAPSHOT); // clear output (void) fsm.get_output(); for (int i = 0; i < 100; i++) { fsm.tick(); BOOST_CHECK(fsm.get_output().messages.empty()); } } BOOST_AUTO_TEST_CASE(test_election_two_nodes) { discrete_failure_detector fd; server_id id1 = id(), id2 = id(); raft::configuration cfg({id1, id2}); raft::log log{raft::snapshot_descriptor{.config = cfg}}; auto fsm = create_follower(id1, std::move(log), fd); // Initial state is follower BOOST_CHECK(fsm.is_follower()); // After election timeout, a follower becomes a candidate election_timeout(fsm); BOOST_CHECK(fsm.is_candidate()); // If nothing happens, the candidate stays this way election_timeout(fsm); BOOST_CHECK(fsm.is_candidate()); auto output = fsm.get_output(); BOOST_CHECK(output.term_and_vote); // After a favourable reply, we become a leader (quorum is 2) fsm.step(id2, raft::vote_reply{output.term_and_vote->first, true}); BOOST_CHECK(fsm.is_leader()); // Out of order response to the previous election is ignored fsm.step(id2, raft::vote_reply{output.term_and_vote->first - term_t{1}, false}); BOOST_CHECK(fsm.is_leader()); // Any message with a newer term after election timeout // -> immediately convert to follower fd.mark_all_dead(); election_threshold(fsm); // Use current_term + 2 to switch fsm to follower // even if it itself switched to a candidate fsm.step(id2, raft::vote_request{output.term_and_vote->first + term_t{2}}); BOOST_CHECK(fsm.is_follower()); // Check that the candidate converts to a follower as well election_timeout(fsm); BOOST_CHECK(fsm.is_candidate()); output = fsm.get_output(); BOOST_CHECK(output.term_and_vote); fsm.step(id2, raft::vote_request{output.term_and_vote->first + term_t{1}}); BOOST_CHECK(fsm.is_follower()); // Test that a node doesn't cast a vote if it has voted for // self already (void) fsm.get_output(); while (fsm.is_follower()) { fsm.tick(); } BOOST_CHECK(fsm.is_candidate()); output = fsm.get_output(); auto msg = std::get(output.messages.back().second); fsm.step(id2, std::move(msg)); // We could figure out this round is going to a nowhere, but // we're not that smart and simply wait for a vote_reply. BOOST_CHECK(fsm.is_candidate()); output = fsm.get_output(); auto reply = std::get(output.messages.back().second); BOOST_CHECK(!reply.vote_granted); } BOOST_AUTO_TEST_CASE(test_election_four_nodes) { discrete_failure_detector fd; server_id id1 = id(), id2 = id(), id3 = id(), id4 = id(); raft::configuration cfg({id1, id2, id3, id4}); raft::log log{raft::snapshot_descriptor{.config = cfg}}; auto fsm = create_follower(id1, std::move(log), fd); // Initial state is follower BOOST_CHECK(fsm.is_follower()); // Inform FSM about a new leader at a new term fsm.step(id4, raft::append_request{term_t{1}, index_t{1}, term_t{1}}); (void) fsm.get_output(); // Request a vote during the same term. Even though // we haven't voted, we should deny a vote because we // know about a leader for this term. fsm.step(id3, raft::vote_request{term_t{1}, index_t{1}, term_t{1}}); auto output = fsm.get_output(); auto reply = std::get(output.messages.back().second); BOOST_CHECK(!reply.vote_granted); // Run out of steam for this term. Start a new one. fd.mark_all_dead(); election_timeout(fsm); BOOST_CHECK(fsm.is_candidate()); output = fsm.get_output(); BOOST_CHECK(output.term_and_vote); auto current_term = output.term_and_vote->first; // Add a favourable reply, not enough for quorum fsm.step(id2, raft::vote_reply{current_term, true}); BOOST_CHECK(fsm.is_candidate()); // Add another one, this adds up to quorum fsm.step(id3, raft::vote_reply{current_term, true}); BOOST_CHECK(fsm.is_leader()); } BOOST_AUTO_TEST_CASE(test_election_single_node_prevote) { auto fcfg = fsm_cfg; fcfg.enable_prevoting = true; test_election_single_node_helper(fcfg); } BOOST_AUTO_TEST_CASE(test_election_two_nodes_prevote) { auto fcfg = fsm_cfg; fcfg.enable_prevoting = true; server_id id1 = id(), id2 = id(); raft::configuration cfg({id1, id2}); raft::log log{raft::snapshot_descriptor{.config = cfg}}; raft::fsm fsm(id1, term_t{}, server_id{}, std::move(log), trivial_failure_detector, fcfg); // Initial state is follower BOOST_CHECK(fsm.is_follower()); // After election timeout, a follower becomes a prevote candidate election_timeout(fsm); BOOST_CHECK(fsm.is_prevote_candidate()); // Term was not increased BOOST_CHECK_EQUAL(fsm.get_current_term(), term_t{}); // If nothing happens, the candidate stays this way election_timeout(fsm); BOOST_CHECK(fsm.is_prevote_candidate()); BOOST_CHECK_EQUAL(fsm.get_current_term(), term_t{}); auto output = fsm.get_output(); // After a favourable prevote reply, we become a regular candidate (quorum is 2) BOOST_CHECK(!output.term_and_vote); fsm.step(id2, raft::vote_reply{fsm.get_current_term(), true, true}); BOOST_CHECK(fsm.is_candidate() && !fsm.is_prevote_candidate()); // And increased our term this time BOOST_CHECK_EQUAL(fsm.get_current_term(), term_t{1}); election_timeout(fsm); // Check that rejected prevote with higher term causes prevote candidate move to follower fsm.step(id2, raft::vote_reply{term_t{2}, false, true}); BOOST_CHECK(fsm.is_follower()); BOOST_CHECK_EQUAL(fsm.get_current_term(), term_t{2}); election_timeout(fsm); (void)fsm.get_output(); // Check that receiving prevote with smaller term generate reject with newer term fsm.step(id2, raft::vote_request{term_t{1}, index_t{}, term_t{}, true}); output = fsm.get_output(); BOOST_CHECK_EQUAL(output.messages.size(), 1); auto msg = std::get(output.messages.back().second); BOOST_CHECK(msg.current_term == term_t{2} && !msg.vote_granted); // Check that prevote with higer term get a reply with term in the future // and does not change local term. // Move to follower again fsm.step(id2, raft::vote_reply{term_t{3}, false, true}); BOOST_CHECK(fsm.is_follower()); // Send prevote with higher term fsm.step(id2, raft::vote_request{term_t{4}, index_t{}, term_t{}, true}); output = fsm.get_output(); BOOST_CHECK_EQUAL(output.messages.size(), 1); // Reply has request's term msg = std::get(output.messages.back().second); BOOST_CHECK(msg.current_term == term_t{4} && msg.vote_granted); // But fsm current term stays the same BOOST_CHECK_EQUAL(fsm.get_current_term(), term_t{3}); } BOOST_AUTO_TEST_CASE(test_election_four_nodes_prevote) { auto fcfg = fsm_cfg; fcfg.enable_prevoting = true; discrete_failure_detector fd; server_id id1 = id(), id2 = id(), id3 = id(), id4 = id(); raft::configuration cfg({id1, id2, id3, id4}); raft::log log{raft::snapshot_descriptor{.config = cfg}}; raft::fsm fsm(id1, term_t{}, server_id{}, std::move(log), fd, fcfg); // Initial state is follower BOOST_CHECK(fsm.is_follower()); // Inform FSM about a new leader at a new term fsm.step(id4, raft::append_request{term_t{1}, index_t{1}, term_t{1}}); (void) fsm.get_output(); // Request a prevote during the same term. Even though // we haven't voted, we should deny a vote because we // know about a leader for this term. fsm.step(id3, raft::vote_request{term_t{1}, index_t{1}, term_t{1}, true}); auto output = fsm.get_output(); auto reply = std::get(output.messages.back().second); BOOST_CHECK(!reply.vote_granted && reply.is_prevote); // Run out of steam for this term. Start a new one. fd.mark_all_dead(); election_timeout(fsm); BOOST_CHECK(fsm.is_candidate() && fsm.is_prevote_candidate()); output = fsm.get_output(); // Add a favourable prevote reply, not enough for quorum BOOST_CHECK(!output.term_and_vote); fsm.step(id2, raft::vote_reply{fsm.get_current_term() + term_t{1}, true, true}); BOOST_CHECK(fsm.is_candidate() && fsm.is_prevote_candidate()); // Add another one, this adds up to quorum fsm.step(id3, raft::vote_reply{fsm.get_current_term() + term_t{1}, true, true}); BOOST_CHECK(fsm.is_candidate() && !fsm.is_prevote_candidate()); // Check that prevote with future term is answered even if we voted already // Request regular vote fsm.step(id2, raft::vote_request{fsm.get_current_term(), index_t{1}, term_t{1}, false}); // Clear message queue (void)fsm.get_output(); // Ask for prevote with future term fsm.step(id3, raft::vote_request{fsm.get_current_term() + term_t{1}, index_t{1}, term_t{1}, true}); output = fsm.get_output(); BOOST_CHECK_EQUAL(output.messages.size(), 1); reply = std::get(output.messages.back().second); BOOST_CHECK(reply.vote_granted && reply.is_prevote); } BOOST_AUTO_TEST_CASE(test_log_matching_rule) { server_id id1 = id(), id2 = id(), id3 = id(); raft::configuration cfg({id1, id2, id3}); raft::log log(raft::snapshot_descriptor{.idx = index_t{999}, .config = cfg}); log.emplace_back(seastar::make_lw_shared(raft::log_entry{term_t{10}, index_t{1000}})); log.stable_to(log.last_idx()); raft::fsm fsm(id1, term_t{10}, server_id{}, std::move(log), trivial_failure_detector, fsm_cfg); // Initial state is follower BOOST_CHECK(fsm.is_follower()); (void) fsm.get_output(); fsm.step(id2, raft::vote_request{term_t{9}, index_t{1001}, term_t{11}}); // Current term is too old - vote is not granted auto output = fsm.get_output(); BOOST_CHECK(output.messages.empty()); auto request_vote = [&](term_t term, index_t last_log_idx, term_t last_log_term) -> raft::vote_reply { fsm.step(id2, raft::vote_request{term, last_log_idx, last_log_term}); auto output = fsm.get_output(); return std::get(output.messages.back().second); }; // Last stable index is too small - vote is not granted BOOST_CHECK(!request_vote(term_t{11}, index_t{999}, term_t{10}).vote_granted); // Last stable term is too small - vote is not granted BOOST_CHECK(!request_vote(term_t{12}, index_t{1002}, term_t{9}).vote_granted); // Last stable term and index are equal to the voter's - vote // is granted BOOST_CHECK(request_vote(term_t{13}, index_t{1000}, term_t{10}).vote_granted); // Last stable term is the same, index is greater to the voter's - vote // is granted BOOST_CHECK(request_vote(term_t{14}, index_t{1001}, term_t{10}).vote_granted); // Both term and index are greater than the voter's - vote // is granted BOOST_CHECK(request_vote(term_t{15}, index_t{1001}, term_t{11}).vote_granted); } BOOST_AUTO_TEST_CASE(test_confchange_add_node) { server_id id1 = id(), id2 = id(), id3 = id(); raft::configuration cfg({id1, id2}); raft::log log(raft::snapshot_descriptor{.idx = index_t{100}, .config = cfg}); auto fsm = create_follower(id1, std::move(log)); // Initial state is follower BOOST_CHECK(fsm.is_follower()); // Turn to a leader election_timeout(fsm); BOOST_CHECK(fsm.is_candidate()); auto output = fsm.get_output(); BOOST_CHECK(output.term_and_vote); fsm.step(id2, raft::vote_reply{output.term_and_vote->first, true}); BOOST_CHECK(fsm.is_leader()); output = fsm.get_output(); // A new leader applies one dummy entry BOOST_CHECK_EQUAL(output.log_entries.size(), 1); if (output.log_entries.size()) { BOOST_CHECK(std::holds_alternative(output.log_entries[0]->data)); } BOOST_CHECK(output.committed.empty()); // accept dummy entry, otherwise no more entries will be sent BOOST_CHECK_EQUAL(output.messages.size(), 1); auto msg = std::get(output.messages.back().second); auto idx = msg.entries.back()->idx; fsm.step(id2, raft::append_reply{msg.current_term, idx, raft::append_reply::accepted{idx}}); raft::configuration newcfg({id1, id2, id3}); // Suggest a confchange. fsm.add_entry(newcfg); // Can't have two confchanges in progress. BOOST_CHECK_THROW(fsm.add_entry(newcfg), raft::conf_change_in_progress); // Entered joint configuration immediately. BOOST_CHECK(fsm.get_configuration().is_joint()); BOOST_CHECK_EQUAL(fsm.get_configuration().previous.size(), 2); BOOST_CHECK_EQUAL(fsm.get_configuration().current.size(), 3); output = fsm.get_output(); // The output contains a log entry to be committed. // Once it's committed, it will be replicated. // The output must contain messages both for id2 and id3 BOOST_CHECK_EQUAL(output.log_entries.size(), 1); // Append entry for id2 and id3 BOOST_CHECK_EQUAL(output.messages.size(), 2); msg = std::get(output.messages.back().second); idx = msg.entries.back().get()->idx; // In order to accept a configuration change // we need one ACK, since there is a quorum overlap. // Strictly speaking the new node needs to install a snapshot, // first, for simplicity let's assume it's happened already. fsm.step(id2, raft::append_reply{msg.current_term, idx, raft::append_reply::accepted{idx}}); // One reply is enough to commit the joint configuration, // since there is a quorum overlap between the two // configurations. BOOST_CHECK(! fsm.get_configuration().is_joint()); // Still can't have two confchanges in progress, even though // we left joint already, the final configuration is not // committed yet. BOOST_CHECK_THROW(fsm.add_entry(newcfg), raft::conf_change_in_progress); output = fsm.get_output(); // A log entry for the final configuration BOOST_CHECK_EQUAL(output.log_entries.size(), 1); // AppendEntries messages for the final configuration BOOST_CHECK(output.messages.size() >= 1); msg = std::get(output.messages.back().second); idx = msg.entries.back().get()->idx; // Ack AppendEntries for the final configuration fsm.step(id2, raft::append_reply{msg.current_term, idx, raft::append_reply::accepted{idx}}); BOOST_CHECK_EQUAL(fsm.get_configuration().current.size(), 3); fsm.step(id3, raft::append_reply{msg.current_term, idx, raft::append_reply::accepted{idx}}); // Check that we can start a new confchange raft::configuration newcfg2({id1, id2}); fsm.add_entry(newcfg); } BOOST_AUTO_TEST_CASE(test_confchange_remove_node) { server_id id1 = id(), id2 = id(), id3 = id(); raft::configuration cfg({id1, id2, id3}); raft::log log(raft::snapshot_descriptor{.idx = index_t{100}, .config = cfg}); auto fsm = create_follower(id1, std::move(log)); // Initial state is follower BOOST_CHECK(fsm.is_follower()); // Turn to a leader election_timeout(fsm); BOOST_CHECK(fsm.is_candidate()); auto output = fsm.get_output(); // Vote requests to id2 and id3 BOOST_CHECK_EQUAL(output.messages.size(), 2); if (output.messages.size() > 0) { BOOST_CHECK(std::holds_alternative(output.messages[0].second)); } if (output.messages.size() > 1) { BOOST_CHECK(std::holds_alternative(output.messages[1].second)); } BOOST_CHECK(output.term_and_vote); fsm.step(id2, raft::vote_reply{output.term_and_vote->first, true}); BOOST_CHECK(fsm.is_leader()); output = fsm.get_output(); BOOST_CHECK_EQUAL(output.log_entries.size(), 1); if (output.log_entries.size()) { BOOST_CHECK(std::holds_alternative(output.log_entries[0]->data)); } // accept dummy entry, otherwise no more entries will be sent BOOST_CHECK_EQUAL(output.messages.size(), 2); auto msg = std::get(output.messages.back().second); auto idx = msg.entries.back()->idx; fsm.step(id2, raft::append_reply{msg.current_term, idx, raft::append_reply::accepted{idx}}); fsm.step(id3, raft::append_reply{msg.current_term, idx, raft::append_reply::accepted{idx}}); raft::configuration newcfg({id1, id2}); // Suggest a confchange. fsm.add_entry(newcfg); // Entered joint configuration immediately. BOOST_CHECK(fsm.get_configuration().is_joint()); BOOST_CHECK_EQUAL(fsm.get_configuration().current.size(), 2); BOOST_CHECK_EQUAL(fsm.get_configuration().previous.size(), 3); output = fsm.get_output(); // The output contains a log entry to be committed. // Once it's committed, it will be replicated. BOOST_CHECK_EQUAL(output.log_entries.size(), 1); if (output.log_entries.size()) { BOOST_CHECK(std::holds_alternative(output.log_entries[0]->data)); } BOOST_CHECK_EQUAL(output.messages.size(), 2); // Configuration change sent to id2 and id3 BOOST_REQUIRE_NO_THROW(msg = std::get(output.messages[0].second)); BOOST_CHECK_EQUAL(msg.entries.size(), 1); BOOST_CHECK(std::holds_alternative(msg.entries[0]->data)); idx = msg.entries.back().get()->idx; BOOST_CHECK_EQUAL(idx, 102); // Ack AppendEntries for the joint configuration // In order to accept a configuration change // we need one ACK, since there is a quorum overlap. fsm.step(id2, raft::append_reply{msg.current_term, idx, raft::append_reply::accepted{idx}}); // Final configuration is proposed output = fsm.get_output(); // AppendEntries messages for the final configuration BOOST_CHECK_EQUAL(output.messages.size(), 1); BOOST_REQUIRE_NO_THROW(msg = std::get(output.messages[0].second)); // A log entry for the final configuration BOOST_CHECK_EQUAL(msg.entries.size(), 1); if (output.log_entries.size()) { BOOST_CHECK(std::holds_alternative(msg.entries[0]->data)); } idx = msg.entries.back().get()->idx; BOOST_CHECK_EQUAL(idx, 103); BOOST_CHECK_EQUAL(fsm.get_configuration().current.size(), 2); BOOST_CHECK(!fsm.get_configuration().is_joint()); // Ack AppendEntries for final configuration fsm.step(id2, raft::append_reply{msg.current_term, idx, raft::append_reply::accepted{idx}}); // Check that we can start a new confchange raft::configuration newcfg2({id1, id2, id3}); fsm.add_entry(newcfg); BOOST_CHECK_EQUAL(fsm.get_configuration().current.size(), 2); } BOOST_AUTO_TEST_CASE(test_confchange_replace_node) { server_id id1 = id(), id2 = id(), id3 = id(), id4 = id(); raft::configuration cfg({id1, id2, id3}); raft::log log(raft::snapshot_descriptor{.idx = index_t{100}, .config = cfg}); auto fsm = create_follower(id1, std::move(log)); // Initial state is follower BOOST_CHECK(fsm.is_follower()); // Turn to a leader election_timeout(fsm); BOOST_CHECK(fsm.is_candidate()); auto output = fsm.get_output(); BOOST_CHECK(output.term_and_vote); fsm.step(id2, raft::vote_reply{output.term_and_vote->first, true}); BOOST_CHECK(fsm.is_leader()); output = fsm.get_output(); BOOST_CHECK_EQUAL(output.log_entries.size(), 1); if (output.log_entries.size()) { BOOST_CHECK(std::holds_alternative(output.log_entries[0]->data)); } BOOST_CHECK(output.committed.empty()); // accept dummy entry, otherwise no more entries will be sent BOOST_CHECK_EQUAL(output.messages.size(), 2); auto msg = std::get(output.messages.back().second); auto idx = msg.entries.back()->idx; fsm.step(id2, raft::append_reply{msg.current_term, idx, raft::append_reply::accepted{idx}}); fsm.step(id3, raft::append_reply{msg.current_term, idx, raft::append_reply::accepted{idx}}); raft::configuration newcfg({id1, id2, id4}); // Suggest a confchange. fsm.add_entry(newcfg); // Entered joint configuration immediately. BOOST_CHECK(fsm.get_configuration().is_joint()); BOOST_CHECK_EQUAL(fsm.get_configuration().current.size(), 3); BOOST_CHECK_EQUAL(fsm.get_configuration().previous.size(), 3); output = fsm.get_output(); BOOST_REQUIRE_NO_THROW(msg = std::get(output.messages[0].second)); idx = msg.entries.back().get()->idx; // In order to accept a configuration change // we need two ACK, since there is a quorum overlap. fsm.step(id2, raft::append_reply{msg.current_term, idx, raft::append_reply::accepted{idx}}); BOOST_CHECK(!fsm.get_configuration().is_joint()); // final config to log output = fsm.get_output(); BOOST_CHECK_EQUAL(output.log_entries.size(), 1); if (output.log_entries.size()) { BOOST_CHECK(std::holds_alternative(output.log_entries[0]->data)); } // AppendEntries messages for the final configuration BOOST_CHECK(output.messages.size() >= 1); msg = std::get(output.messages.back().second); idx = msg.entries.back().get()->idx; // Ack AppendEntries for the final configuration fsm.step(id2, raft::append_reply{msg.current_term, idx, raft::append_reply::accepted{idx}}); BOOST_CHECK_EQUAL(fsm.get_configuration().current.size(), 3); BOOST_CHECK(!fsm.get_configuration().is_joint()); } BOOST_AUTO_TEST_CASE(test_leader_stepdown) { server_id id1 = id(), id2 = id(), id3 = id(); raft::configuration cfg({{id1}, {id2}, {id3, false}}); raft::log log(raft::snapshot_descriptor{.config = cfg}); raft::fsm fsm(id1, term_t{1}, /* voted for */ server_id{}, std::move(log), trivial_failure_detector, fsm_cfg); // Check that we move to candidate state on timeout_now message fsm.step(id2, raft::timeout_now{fsm.get_current_term()}); BOOST_CHECK(fsm.is_candidate()); auto output = fsm.get_output(); auto vote_request = std::get(output.messages.back().second); // Check that vote_request has `force` flag set. BOOST_CHECK(vote_request.force); // Turn to a leader fsm.step(id2, raft::vote_reply{fsm.get_current_term(), true}); BOOST_CHECK(fsm.is_leader()); // make id2's match idx to be up-to-date output = fsm.get_output(); auto append = std::get(output.messages.back().second); auto idx = append.entries.back()->idx; fsm.step(id2, raft::append_reply{fsm.get_current_term(), index_t{}, raft::append_reply::accepted{idx}}); // start leadership transfer while there is a fully up-to-date follower fsm.transfer_leadership(); // Check that timeout_now message is sent output = fsm.get_output(); BOOST_CHECK_EQUAL(output.messages.size(), 1); BOOST_CHECK(std::holds_alternative(output.messages.back().second)); // Turn to a leader again // ... first turn to a follower fsm.step(id2, raft::vote_request{fsm.get_current_term() + term_t{1}, index_t{10}, term_t{}, false, true}); BOOST_CHECK(fsm.is_follower()); (void)fsm.get_output(); // ... and now leader election_timeout(fsm); BOOST_CHECK(fsm.is_candidate()); output = fsm.get_output(); fsm.step(id2, raft::vote_reply{fsm.get_current_term(), true}); BOOST_CHECK(fsm.is_leader()); output = fsm.get_output(); append = std::get(output.messages.back().second); idx = append.entries.back()->idx; // start leadership transfer while there is no fully up-to-date follower // (dummy entry appended by become_leader is not replicated yet) fsm.transfer_leadership(); // check that no timeout_now message was sent output = fsm.get_output(); BOOST_CHECK_EQUAL(output.messages.size(), 0); // Now make non voting follower match the log and see that timeout_now is not sent fsm.step(id3, raft::append_reply{fsm.get_current_term(), index_t{}, raft::append_reply::accepted{idx}}); output = fsm.get_output(); BOOST_CHECK_EQUAL(output.messages.size(), 0); // Now make voting follower match the log and see that timeout_now is sent fsm.step(id2, raft::append_reply{fsm.get_current_term(), index_t{}, raft::append_reply::accepted{idx}}); output = fsm.get_output(); BOOST_CHECK_EQUAL(output.messages.size(), 1); BOOST_CHECK(std::holds_alternative(output.messages.back().second)); // Turn to a leader yet again // ... first turn to a follower fsm.step(id2, raft::vote_request{fsm.get_current_term() + term_t{1}, index_t{10}, term_t{}, false, true}); BOOST_CHECK(fsm.is_follower()); (void)fsm.get_output(); // ... and now leader election_timeout(fsm); BOOST_CHECK(fsm.is_candidate()); output = fsm.get_output(); fsm.step(id2, raft::vote_reply{fsm.get_current_term(), true}); BOOST_CHECK(fsm.is_leader()); // Commit dummy entry output = fsm.get_output(); append = std::get(output.messages.back().second); idx = append.entries.back()->idx; fsm.step(id2, raft::append_reply{fsm.get_current_term(), idx, raft::append_reply::accepted{idx}}); // Drop the leader from the current config and see that stepdown message is sent raft::configuration newcfg({{id2}, {id3, false}}); fsm.add_entry(newcfg); output = fsm.get_output(); append = std::get(output.messages.back().second); idx = append.entries.back()->idx; // Accept joint config entry on id2 fsm.step(id2, raft::append_reply{fsm.get_current_term(), idx, raft::append_reply::accepted{idx}}); // fms added new config to the log output = fsm.get_output(); append = std::get(output.messages.back().second); idx = append.entries.back()->idx; // Accept new config entry on id2 fsm.step(id2, raft::append_reply{fsm.get_current_term(), idx, raft::append_reply::accepted{idx}}); // And check that the deposed leader sent timeout_now output = fsm.get_output(); BOOST_CHECK_EQUAL(output.messages.size(), 1); BOOST_CHECK(std::holds_alternative(output.messages.back().second)); /// Check that leader stepdown works when the leader is removed from the config and there are entries above C_new in its log raft::configuration cfg2({{id1}, {id2}, {id3}}); raft::log log2(raft::snapshot_descriptor{.config = cfg}); raft::fsm fsm2(id1, term_t{1}, /* voted for */ server_id{}, std::move(log2), trivial_failure_detector, fsm_cfg); election_timeout(fsm2); // Turn to a leader fsm2.step(id2, raft::vote_reply{fsm2.get_current_term(), true}); BOOST_CHECK(fsm2.is_leader()); output = fsm2.get_output(); append = std::get(output.messages.back().second); idx = append.entries.back()->idx; // Accept the dummy on id2 fsm2.step(id2, raft::append_reply{fsm2.get_current_term(), idx, raft::append_reply::accepted{idx}}); // Accept the dummy on id3 fsm2.step(id3, raft::append_reply{fsm2.get_current_term(), idx, raft::append_reply::accepted{idx}}); // Drop the leader from the current config and see that stepdown message is sent raft::configuration newcfg2({{id2}, {id3}}); fsm2.add_entry(newcfg2); output = fsm2.get_output(); append = std::get(output.messages.back().second); idx = append.entries.back()->idx; // Accept joint config entry on id2 fsm2.step(id2, raft::append_reply{fsm2.get_current_term(), idx, raft::append_reply::accepted{idx}}); // Accept joint config entry on id3 fsm2.step(id3, raft::append_reply{fsm2.get_current_term(), idx, raft::append_reply::accepted{idx}}); // fsm added new config entry output = fsm2.get_output(); append = std::get(output.messages.back().second); idx = append.entries.back()->idx; fsm2.add_entry(raft::command{}); // add one more command that will be not replicated yet // Accept new config entry on id2 fsm2.step(id2, raft::append_reply{fsm2.get_current_term(), idx, raft::append_reply::accepted{idx}}); // Accept new config entry on id3 fsm2.step(id3, raft::append_reply{fsm2.get_current_term(), idx, raft::append_reply::accepted{idx}}); // C_new is now commited output = fsm2.get_output(); // this sends out the entry submitted after C_new append = std::get(output.messages.back().second); idx = append.entries.back()->idx; // Accept the entry fsm2.step(id2, raft::append_reply{fsm2.get_current_term(), idx, raft::append_reply::accepted{idx}}); // And check that the deposed leader sent timeout_now output = fsm2.get_output(); BOOST_CHECK_EQUAL(output.messages.size(), 1); BOOST_CHECK(std::holds_alternative(output.messages.back().second)); /// End test } BOOST_AUTO_TEST_CASE(test_empty_configuration) { // When a server is joining an existing cluster, its configuration is empty. // The leader sends its configuration over in AppendEntries or // ApplySnapshot RPC. Test this scenario. server_id id1 = id(); raft::configuration cfg({}); raft::log log(raft::snapshot_descriptor{.idx = index_t{0}, .config = cfg}); auto follower = create_follower(id1, std::move(log)); // Initial state is follower BOOST_CHECK(follower.is_follower()); election_timeout(follower); BOOST_CHECK(follower.is_follower()); auto output = follower.get_output(); BOOST_CHECK_EQUAL(output.log_entries.size(), 0); BOOST_CHECK_EQUAL(output.messages.size(), 0); BOOST_CHECK_EQUAL(follower.get_current_term(), 0); server_id id2 = id(); auto log2 = raft::log(raft::snapshot_descriptor{.idx = index_t{0}, .config = raft::configuration({id2})}); auto leader = create_follower(id2, std::move(log2)); election_timeout(leader); BOOST_CHECK(leader.is_leader()); // Transitioning to an empty configuration is not supported. BOOST_CHECK_THROW(leader.add_entry(raft::configuration({})), std::invalid_argument); leader.add_entry(raft::configuration({id1, id2})); communicate(leader, follower); BOOST_CHECK_EQUAL(follower.get_current_term(), 1); BOOST_CHECK_EQUAL(follower.in_memory_log_size(), leader.in_memory_log_size()); BOOST_CHECK_EQUAL(leader.get_configuration().is_joint(), false); } BOOST_AUTO_TEST_CASE(test_confchange_a_to_b) { // Test we can transition from a single-server configuration // {A} to a single server configuration {B} server_id A_id = id(); raft::log log(raft::snapshot_descriptor{.idx = index_t{0}, .config = raft::configuration{A_id}}); auto A = create_follower(A_id, log); election_timeout(A); BOOST_CHECK(A.is_leader()); // Let's have a non-empty log at A A.add_entry(log_entry::dummy{}); server_id B_id = id(); auto B = create_follower(B_id, log); A.add_entry(raft::configuration({B_id})); communicate(A, B); BOOST_CHECK_EQUAL(A.get_current_term(), 1); BOOST_CHECK(A.is_follower()); // A is not part of the current configuration BOOST_CHECK(B.is_leader()); BOOST_CHECK_EQUAL(B.get_current_term(), 2); BOOST_CHECK_EQUAL(B.get_configuration().is_joint(), false); BOOST_CHECK_EQUAL(B.get_configuration().current.size(), 1); BOOST_CHECK(B.get_configuration().current.contains(raft::server_address{B_id})); // Let's try the same configuration change now, but let's // restart the leader after persisting the joint // configuration. log = raft::log(raft::snapshot_descriptor{.idx = B.log_last_idx(), .term = B.log_last_term(), .config = B.get_configuration()}); // A somewhat awkward way to obtain B's log for restart log.emplace_back(make_lw_shared(B.add_entry(raft::configuration({A_id})))); log.stable_to(log.last_idx()); raft::fsm B_1(B_id, B.get_current_term(), B_id, std::move(log), trivial_failure_detector, fsm_cfg); election_timeout(B_1); communicate(A, B_1); BOOST_CHECK(B_1.is_follower()); election_timeout(A); BOOST_CHECK(A.is_leader()); // B_1 must be quiet after an election timeout and doesn't // disrupt the new configuration election_timeout(B_1); BOOST_CHECK(B_1.is_follower()); BOOST_CHECK_EQUAL(B_1.get_output().messages.size(), 0); } BOOST_AUTO_TEST_CASE(test_confchange_ab_to_cd) { // Similar to A -> B change, but with many nodes, // so C_new has to campaign after configuration change. server_id A_id = id(), B_id = id(), C_id = id(), D_id = id(); raft::log log(raft::snapshot_descriptor{.idx = index_t{0}, .config = raft::configuration{A_id, B_id}}); auto A = create_follower(A_id, log); auto B = create_follower(B_id, log); election_timeout(A); communicate(A, B); BOOST_CHECK(A.is_leader()); auto C = create_follower(C_id, log); auto D = create_follower(D_id, log); A.add_entry(raft::configuration({C_id, D_id})); communicate(A, B, C, D); BOOST_CHECK_EQUAL(A.get_current_term(), 1); // A and B are not part of the current configuration BOOST_CHECK(A.is_follower()); BOOST_CHECK(B.is_follower()); election_timeout(C); election_threshold(D); communicate(A, B, C, D); BOOST_CHECK_EQUAL(C.get_current_term(), 2); BOOST_CHECK(C.is_leader()); BOOST_CHECK_EQUAL(C.get_configuration().is_joint(), false); BOOST_CHECK_EQUAL(C.get_configuration().current.size(), 2); } BOOST_AUTO_TEST_CASE(test_confchange_abc_to_cde) { // Check configuration changes when C_old and C_new have no // common quorum, test leader change during configuration // change discrete_failure_detector fd; server_id A_id = id(), B_id = id(), C_id = id(), D_id = id(), E_id = id(); raft::log log(raft::snapshot_descriptor{.idx = index_t{0}, .config = raft::configuration{A_id, B_id, C_id}}); auto A = create_follower(A_id, log); auto B = create_follower(B_id, log, fd); auto C = create_follower(C_id, log, fd); election_timeout(A); communicate(A, B, C); BOOST_CHECK(A.is_leader()); auto D = create_follower(D_id, log); auto E = create_follower(E_id, log); A.add_entry(raft::configuration({C_id, D_id, E_id})); // Make sure C gets a new (joint) configuration entry. // It is stable, but not committed, because we need D or E // to commit it. communicate(A, B, C); // Leader change while committing a joint configuration fd.mark_dead(A_id); election_timeout(C); BOOST_CHECK(C.is_candidate()); // Ticking for election_threshold at B is // necessary for B to vote for C but not become // candidate itself. election_threshold(B); communicate(B, C, D, E); BOOST_CHECK(A.is_leader()); BOOST_CHECK_EQUAL(A.get_current_term(), 1); BOOST_CHECK(B.is_follower()); BOOST_CHECK(C.is_leader()); BOOST_CHECK(D.is_follower()); BOOST_CHECK(E.is_follower()); BOOST_CHECK(C.get_current_term() >= 2); BOOST_CHECK_EQUAL(C.get_configuration().is_joint(), false); BOOST_CHECK_EQUAL(C.get_configuration().current.size(), 3); } BOOST_AUTO_TEST_CASE(test_confchange_abcdef_to_abcgh) { // Test configuration changes in presence of down nodes in C_old discrete_failure_detector fd; server_id A_id = id(), B_id = id(), C_id = id(), D_id = id(), E_id = id(), F_id = id(), G_id = id(), H_id = id(); raft::log log(raft::snapshot_descriptor{.idx = index_t{0}, .config = raft::configuration{A_id, B_id, C_id, D_id, E_id, F_id}}); auto A = create_follower(A_id, log, fd); auto B = create_follower(B_id, log, fd); auto C = create_follower(C_id, log, fd); auto D = create_follower(D_id, log, fd); auto E = create_follower(E_id, log, fd); auto F = create_follower(F_id, log, fd); election_timeout(D); communicate(A, D, E, F); BOOST_CHECK(D.is_leader()); auto G = create_follower(G_id, log); auto H = create_follower(H_id, log); D.add_entry(raft::configuration({A_id, B_id, C_id, G_id, H_id})); // We can't transition to C_new in absence of C_old majority communicate(B, C, D, G, H); BOOST_CHECK(D.is_leader()); BOOST_CHECK(D.get_configuration().is_joint()); D.tick(); communicate(B, C, E, D, G, H); BOOST_CHECK(D.is_follower()); auto leader = select_leader(A, B, C, G, H); BOOST_CHECK_EQUAL(leader->get_configuration().is_joint(), false); BOOST_CHECK_EQUAL(leader->get_configuration().current.size(), 5); fd.mark_all_dead(); election_timeout(D); election_timeout(A); communicate(A, B, C, D, E, F, G, H); BOOST_CHECK(leader->is_leader()); } BOOST_AUTO_TEST_CASE(test_confchange_abcde_abcdefg) { // Check configuration changes work fine with many nodes down discrete_failure_detector fd; server_id A_id = id(), B_id = id(), C_id = id(), D_id = id(), E_id = id(), F_id = id(), G_id = id(); raft::log log(raft::snapshot_descriptor{.idx = index_t{0}, .config = raft::configuration{A_id, B_id, C_id, D_id, E_id}}); auto A = create_follower(A_id, log, fd); auto B = create_follower(B_id, log, fd); auto C = create_follower(C_id, log, fd); auto D = create_follower(D_id, log, fd); auto E = create_follower(E_id, log, fd); election_timeout(A); communicate(A, D, E); BOOST_CHECK(A.is_leader()); auto F = create_follower(F_id, log); auto G = create_follower(G_id, log); // Wrap configuration entry into some traffic A.add_entry(log_entry::dummy{}); A.add_entry(raft::configuration({A_id, B_id, C_id, D_id, E_id, F_id, G_id})); A.add_entry(log_entry::dummy{}); // Without tick() A won't re-try communication with nodes it // believes are down (B, C). A.tick(); // 4 is enough to transition to the new configuration communicate(A, B, C, G); BOOST_CHECK(A.is_leader()); BOOST_CHECK_EQUAL(A.get_configuration().is_joint(), false); BOOST_CHECK_EQUAL(A.get_configuration().current.size(), 7); A.tick(); communicate(A, B, C, D, E, F, G); BOOST_CHECK_EQUAL(A.log_last_idx(), B.log_last_idx()); BOOST_CHECK_EQUAL(A.log_last_idx(), C.log_last_idx()); BOOST_CHECK_EQUAL(A.log_last_idx(), D.log_last_idx()); BOOST_CHECK_EQUAL(A.log_last_idx(), E.log_last_idx()); BOOST_CHECK_EQUAL(A.log_last_idx(), F.log_last_idx()); BOOST_CHECK_EQUAL(A.log_last_idx(), G.log_last_idx()); BOOST_CHECK(A.is_leader()); BOOST_CHECK_EQUAL(A.get_configuration().is_joint(), false); BOOST_CHECK_EQUAL(A.get_configuration().current.size(), 7); } BOOST_AUTO_TEST_CASE(test_election_during_confchange) { server_id A_id = id(), B_id = id(), C_id = id(), D_id = id(), E_id = id(); raft::log log(raft::snapshot_descriptor{.idx = index_t{0}, .config = raft::configuration{A_id, B_id, C_id}}); // Joint config has reached old majority, the leader is // from new majority discrete_failure_detector fd; auto A = create_follower(A_id, log, fd); auto B = create_follower(B_id, log, fd); auto C = create_follower(C_id, log, fd); election_timeout(A); communicate(A, B, C); A.add_entry(raft::configuration({C_id, D_id, E_id})); communicate(A, B, C); fd.mark_dead(A_id); auto D = create_follower(D_id, log, fd); auto E = create_follower(E_id, log, fd); election_timeout(C); election_threshold(B); communicate_until([&C]() { return C.is_leader(); }, B, C, D, E); BOOST_CHECK_EQUAL(C.get_configuration().is_joint(), true); fd.mark_alive(A.id()); communicate(D, A, B, E); fd.mark_alive(C.id()); communicate_until([&C]() { return C.get_configuration().is_joint() == false; }, B, C, D, E); communicate(C, D); fd.mark_dead(C.id()); election_timeout(D); // E may still be in joint. It must vote for D anyway. D is in C_new // and will replicate C_new to E after becoming a leader election_threshold(E); A.tick(); communicate(A, D, E); BOOST_CHECK(D.is_leader()); BOOST_CHECK(A.is_follower()); BOOST_CHECK_EQUAL(D.get_configuration().is_joint(), false); BOOST_CHECK_EQUAL(D.get_configuration().current.size(), 3); } BOOST_AUTO_TEST_CASE(test_reply_from_removed_follower) { // Messages from followers may be delayed. Check they don't // upset the leader when they are delivered past configuration // change server_id A_id = id(), B_id = id(); raft::log log(raft::snapshot_descriptor{.idx = index_t{0}, .config = raft::configuration{A_id, B_id}}); auto A = create_follower(A_id, log); auto B = create_follower(B_id, log); election_timeout(A); communicate(A, B); A.add_entry(raft::configuration({A_id})); communicate(A, B); BOOST_CHECK(A.is_leader()); BOOST_CHECK_EQUAL(A.get_configuration().is_joint(), false); BOOST_CHECK_EQUAL(A.get_configuration().current.size(), 1); auto idx = A.log_last_idx(); A.step(B.id(), raft::append_reply{A.get_current_term(), idx, raft::append_reply::accepted{idx}}); A.step(B.id(), raft::append_reply{A.get_current_term(), idx, raft::append_reply::rejected{idx}}); A.step(B.id(), raft::snapshot_reply{A.get_current_term(), true}); BOOST_CHECK(A.is_leader()); } BOOST_AUTO_TEST_CASE(test_leader_ignores_messages_with_current_term) { // Check that the leader properly handles InstallSnapshot/AppendRequest/VoteRequest // messages carrying its own term. discrete_failure_detector fd; server_id A_id = id(), B_id = id(); raft::log log(raft::snapshot_descriptor{.idx = index_t{0}, .config = raft::configuration{A_id, B_id}}); auto A = create_follower(A_id, log, fd); auto B = create_follower(B_id, log, fd); election_timeout(A); communicate(A, B); BOOST_CHECK(A.is_leader()); // Check that InstallSnapshot with current term gets negative reply A.step(B.id(), raft::install_snapshot{A.get_current_term()}); auto output = A.get_output(); BOOST_CHECK_EQUAL(output.messages.size(), 1); raft::snapshot_reply msg; BOOST_REQUIRE_NO_THROW(msg = std::get(output.messages[0].second)); BOOST_CHECK(!msg.success); // Check that AppendRequest with current term is ignired by the leader A.step(B.id(), raft::append_request{A.get_current_term()}); output = A.get_output(); BOOST_CHECK_EQUAL(output.messages.size(), 0); // Check that VoteRequest with current term is not granted A.step(B.id(), raft::vote_request{A.get_current_term(), index_t{}, term_t{}, false, false}); output = A.get_output(); BOOST_CHECK_EQUAL(output.messages.size(), 1); raft::vote_reply msg1; BOOST_REQUIRE_NO_THROW(msg1 = std::get(output.messages[0].second)); BOOST_CHECK(!msg1.vote_granted); } BOOST_AUTO_TEST_CASE(test_leader_read_quorum) { discrete_failure_detector fd; server_id A_id = id(), B_id = id(), C_id = id(), D_id = id(); // 4 nodes 3 voting 1 non voting (quorum is 2) raft::server_address_set nodes{raft::server_address{A_id}, raft::server_address{B_id}, raft::server_address{C_id}, raft::server_address{D_id, false}}; raft::log log(raft::snapshot_descriptor{.idx = index_t{0}, .config = raft::configuration(nodes)}); auto A = create_follower(A_id, log, fd); auto B = create_follower(B_id, log, fd); auto C = create_follower(C_id, log, fd); auto D = create_follower(D_id, log, fd); election_timeout(A); communicate(A, B, C, D); BOOST_CHECK(A.is_leader()); // Just because timeout passes the leader does not stepdown if quorum of nodes is alive election_timeout(A); BOOST_CHECK(A.is_leader()); // One of voting members dies but the leader is still not steepping down because there is // a quorum of nodes that are still alive fd.mark_dead(C_id); election_timeout(A); BOOST_CHECK(A.is_leader()); // Non voting member dies and the leader is still not stepping down (there two voting members still) fd.mark_dead(D_id); election_timeout(A); BOOST_CHECK(A.is_leader()); // One more voting members dies and the leader becomes a follower now fd.mark_dead(B_id); election_timeout(A); BOOST_CHECK(!A.is_leader()); } BOOST_AUTO_TEST_CASE(test_zero) { server_id id{}; BOOST_CHECK_THROW(raft::configuration cfg({id}), std::invalid_argument); BOOST_CHECK_THROW(raft::configuration cfg(raft::server_address_set{raft::server_address{id}}), std::invalid_argument); BOOST_CHECK_THROW(create_follower(id, raft::log(raft::snapshot_descriptor{})), std::invalid_argument); } BOOST_AUTO_TEST_CASE(test_reordered_reject) { auto id1 = id(); raft::fsm fsm1(id1, term_t{1}, server_id{}, raft::log{raft::snapshot_descriptor{.config = {{{id1}}}}}, trivial_failure_detector, fsm_cfg); while (!fsm1.is_leader()) { fsm1.tick(); } fsm1.add_entry(log_entry::dummy{}); (void)fsm1.get_output(); auto id2 = id(); raft::fsm fsm2(id2, term_t{1}, server_id{}, raft::log{raft::snapshot_descriptor{.config = raft::configuration{}}}, trivial_failure_detector, fsm_cfg); raft_routing_map routes{{fsm1.id(), &fsm1}, {fsm2.id(), &fsm2}}; fsm1.add_entry(raft::configuration{{{fsm1.id()}, {fsm2.id()}}}); fsm1.tick(); // fsm1 sends append_entries with idx=2 to fsm2 auto append_idx2_1 = fsm1.get_output(); fsm1.tick(); // fsm1 sends append_entries with idx=2 to fsm2 (again) auto append_idx2_2 = fsm1.get_output(); raft::logger.trace("delivering first append idx=2"); deliver(routes, fsm1.id(), std::move(append_idx2_1.messages)); // fsm2 rejects the first idx=2 append auto reject_1 = fsm2.get_output(); raft::logger.trace("delivering second append idx=2"); deliver(routes, fsm1.id(), std::move(append_idx2_2.messages)); // fsm2 rejects the second idx=2 append auto reject_2 = fsm2.get_output(); raft::logger.trace("delivering first reject"); deliver(routes, fsm2.id(), std::move(reject_1.messages)); // fsm1 sends append_entries with idx=1 to fsm2 auto append_idx1 = fsm1.get_output(); raft::logger.trace("delivering append idx=1"); deliver(routes, fsm1.id(), std::move(append_idx1.messages)); // fsm2 accepts the idx=1 append auto accept = fsm2.get_output(); raft::logger.trace("delivering accept for append idx=1"); deliver(routes, fsm2.id(), std::move(accept.messages)); raft::logger.trace("delivering second reject"); deliver(routes, fsm2.id(), std::move(reject_2.messages)); } BOOST_AUTO_TEST_CASE(test_non_voter_stays_pipeline) { // Check that a node stays in PIPELINE mode // through configuration changes. server_id A_id = id(), B_id = id(); raft::server_address_set addrset{raft::server_address{A_id}, raft::server_address{B_id, false}}; raft::configuration cfg(addrset); raft::log log(raft::snapshot_descriptor{.idx = index_t{0}, .config = cfg}); auto A = create_follower(A_id, log); auto B = create_follower(B_id, log); election_timeout(A); communicate(A); BOOST_CHECK(A.is_leader()); BOOST_CHECK(A.get_progress(B_id).state == raft::follower_progress::state::PROBE); A.add_entry(log_entry::dummy{}); // We need to deliver a probe from A to B so that B gets // a change to respond and A can switch B to PIPELINE mode. A.tick(); communicate(A, B); BOOST_CHECK(A.get_progress(B_id).state == raft::follower_progress::state::PIPELINE); raft::configuration newcfg({A_id, B_id}); A.add_entry(newcfg); communicate(A, B); BOOST_CHECK_EQUAL(A.get_configuration().is_joint(), false); BOOST_CHECK_EQUAL(A.get_configuration().current.find(raft::server_address{B_id})->can_vote, true); BOOST_CHECK(A.get_progress(B_id).state == raft::follower_progress::state::PIPELINE); A.add_entry(cfg); auto tick_occasionally = [&A, &B]() { if (rolladice()) { A.tick(); B.tick(); } return false; }; communicate_until(tick_occasionally, A, B); BOOST_CHECK_EQUAL(A.get_configuration().is_joint(), false); BOOST_CHECK_EQUAL(A.get_configuration().current.find(raft::server_address{B_id})->can_vote, false); BOOST_CHECK(A.get_progress(B_id).state == raft::follower_progress::state::PIPELINE); } BOOST_AUTO_TEST_CASE(test_leader_change_to_non_voter) { // Test a two-node cluster, change a leader to a non-voter. server_id A_id = id(), B_id = id(); raft::server_address_set oldset{raft::server_address{A_id, true}, raft::server_address{B_id, false}}; raft::configuration cfg(oldset); raft::log log(raft::snapshot_descriptor{.idx = index_t{0}, .config = cfg}); auto A = create_follower(A_id, log); auto B = create_follower(B_id, log); election_timeout(A); communicate(A, B); BOOST_CHECK(A.is_leader()); raft::server_address_set newset{raft::server_address{A_id, false}, raft::server_address{B_id, true}}; raft::configuration newcfg(newset); A.add_entry(newcfg); A.tick(); communicate(A, B); BOOST_CHECK(A.is_follower()); BOOST_CHECK(B.is_leader()); // Try to switch the leader to a non-voter, leaving no other voters. newset = raft::server_address_set{raft::server_address{A_id, false}, raft::server_address{B_id, false}}; newcfg = raft::configuration(newset); BOOST_CHECK_THROW(B.add_entry(newcfg), std::invalid_argument); // Try to remove the last remaining voter newset = raft::server_address_set{raft::server_address{B_id, false}}; newcfg = raft::configuration(newset); BOOST_CHECK_THROW(B.add_entry(newcfg), std::invalid_argument); } BOOST_AUTO_TEST_CASE(test_non_voter_gets_timeout_now) { // Test that even if a non-voter gets timeout now, there is no // elections and later this learner can rejoin the cluster, // although it does disrupt the cluster a bit (through // leader's having to increase its term). server_id A_id = id(), B_id = id(), C_id = id(); raft::configuration cfg(raft::server_address_set{ raft::server_address{A_id}, raft::server_address{B_id}, raft::server_address{C_id, false}}); raft::log log(raft::snapshot_descriptor{.idx = index_t{0}, .config = cfg}); auto A = create_follower(A_id, log); auto B = create_follower(B_id, log); auto C = create_follower(C_id, log); election_timeout(A); communicate(A, B, C); BOOST_CHECK(A.is_leader()); C.step(A.id(), raft::timeout_now{.current_term = A.get_current_term()}); C.tick(); auto output = C.get_output(); BOOST_CHECK(C.is_follower()); BOOST_CHECK_EQUAL(output.messages.size(), 0); BOOST_CHECK(!output.term_and_vote); A.add_entry(log_entry::dummy{}); communicate(A, B, C); BOOST_CHECK_EQUAL(A.log_last_idx(), C.log_last_idx()); BOOST_CHECK_EQUAL(A.get_current_term(), C.get_current_term()); BOOST_CHECK(A.is_leader()); } BOOST_AUTO_TEST_CASE(test_non_voter_election_timeout) { // Test that non-voter doesn't start election even if its // election timeout expires and it doesn't see a valid leader. discrete_failure_detector fd; server_id A_id = id(), B_id = id(), C_id = id(); raft::configuration cfg(raft::server_address_set{ raft::server_address{A_id}, raft::server_address{B_id}, raft::server_address{C_id, false}}); raft::log log(raft::snapshot_descriptor{.idx = index_t{0}, .config = cfg}); auto A = create_follower(A_id, log, fd); auto B = create_follower(B_id, log, fd); auto C = create_follower(C_id, log, fd); election_timeout(A); communicate(A, B, C); BOOST_CHECK(A.is_leader()); fd.mark_all_dead(); auto C_term = C.get_current_term(); election_timeout(C); BOOST_CHECK(C.is_follower()); BOOST_CHECK_EQUAL(C_term, C.get_current_term()); } BOOST_AUTO_TEST_CASE(test_non_voter_voter_loop) { // Test voter-non-voter change in a loop server_id A_id = id(), B_id = id(), C_id = id(); raft::configuration cfg({A_id, B_id, C_id}); raft::configuration cfg_with_non_voter(raft::server_address_set{ raft::server_address{A_id}, raft::server_address{B_id}, raft::server_address{C_id, false}}); raft::log log(raft::snapshot_descriptor{.idx = index_t{0}, .config = cfg}); auto A = create_follower(A_id, log); auto B = create_follower(B_id, log); auto C = create_follower(C_id, log); election_timeout(A); communicate(A, B, C); BOOST_CHECK(A.is_leader()); for (int i = 0; i < 100; ++i) { A.add_entry(i % 2 ? cfg_with_non_voter : cfg); if (rolladice()) { A.add_entry(log_entry::dummy{}); } communicate(A, B, C); if (rolladice()) { A.add_entry(log_entry::dummy()); communicate(A, B, C); } // If iteration count is large, this helps save some // memory if (rolladice(1./1000)) { A.get_log().apply_snapshot(log_snapshot(A.get_log(), A.log_last_idx()), 0); } if (rolladice(1./100)) { B.get_log().apply_snapshot(log_snapshot(A.get_log(), B.log_last_idx()), 0); } if (rolladice(1./5000)) { C.get_log().apply_snapshot(log_snapshot(A.get_log(), B.log_last_idx()), 0); } } BOOST_CHECK(A.is_leader()); BOOST_CHECK_EQUAL(A.get_current_term(), C.get_current_term()); BOOST_CHECK_EQUAL(A.log_last_idx(), C.log_last_idx()); } BOOST_AUTO_TEST_CASE(test_non_voter_confchange_in_snapshot) { // Test non-voter learns it's a non-voter via snapshot discrete_failure_detector fd; server_id A_id = id(), B_id = id(), C_id = id(); raft::configuration cfg({A_id, B_id, C_id}); raft::log log(raft::snapshot_descriptor{.idx = index_t{0}, .config = cfg}); auto A = create_follower(A_id, log, fd); auto B = create_follower(B_id, log, fd); auto C = create_follower(C_id, log, fd); election_timeout(A); communicate(A, B, C); BOOST_CHECK(A.is_leader()); A.add_entry(log_entry::dummy{}); raft::configuration cfg_with_non_voter(raft::server_address_set{ raft::server_address{A_id}, raft::server_address{B_id}, raft::server_address{C_id, false}}); A.tick(); A.add_entry(cfg_with_non_voter); A.tick(); // Majority commits the configuration change communicate(A, B); BOOST_CHECK_EQUAL(A.get_configuration().is_joint(), false); BOOST_CHECK_EQUAL(A.get_configuration().current.find(raft::server_address{C_id})->can_vote, false); A.tick(); raft::snapshot_descriptor A_snp{.idx = A.log_last_idx(), .term = A.log_last_term(), .config = A.get_configuration()}; A.apply_snapshot(A_snp, 0, true); A.tick(); communicate(A, B, C); BOOST_CHECK(A.is_leader()); BOOST_CHECK_EQUAL(A.get_current_term(), C.get_current_term()); BOOST_CHECK_EQUAL(A.log_last_idx(), C.log_last_idx()); // A non-voter doesn't become candidate on election timeout fd.mark_all_dead(); election_timeout(C); BOOST_CHECK(C.is_follower()); // Now try the same trick, but this time convert a non-voter // to a voter with a snapshot fd.mark_all_alive(); A.tick(); for (int i = 0; i < 100; i++) { A.add_entry(log_entry::dummy{}); } A.add_entry(cfg); A.tick(); // Majority commits the configuration change communicate(A, B); BOOST_CHECK_EQUAL(A.get_configuration().is_joint(), false); BOOST_CHECK_EQUAL(A.get_configuration().current.find(raft::server_address{C_id})->can_vote, true); A.tick(); A_snp = raft::snapshot_descriptor{.idx = A.log_last_idx(), .term = A.log_last_term(), .config = A.get_configuration()}; A.apply_snapshot(A_snp, 0, true); A.tick(); communicate(A, B, C); BOOST_CHECK(A.is_leader()); BOOST_CHECK_EQUAL(A.get_current_term(), C.get_current_term()); BOOST_CHECK_EQUAL(A.log_last_idx(), C.log_last_idx()); fd.mark_all_dead(); election_timeout(C); BOOST_CHECK(C.is_candidate()); // Check an ex-voter can become a leader alright (LearnerPromotion) election_threshold(B); communicate(C, B); BOOST_CHECK(C.is_leader()); } BOOST_AUTO_TEST_CASE(test_non_voter_can_vote) { // Test non-voter can vote when it is requested to - it may // not be aware of the configuration in which it is a voter discrete_failure_detector fd; server_id A_id = id(), B_id = id(), C_id = id(); raft::configuration cfg(raft::server_address_set{ raft::server_address{A_id}, raft::server_address{B_id}, raft::server_address{C_id, false}}); raft::log log(raft::snapshot_descriptor{.idx = index_t{0}, .config = cfg}); auto A = create_follower(A_id, log, fd); auto B = create_follower(B_id, log, fd); auto C = create_follower(C_id, log, fd); election_timeout(A); communicate(A, B, C); BOOST_CHECK(A.is_leader()); raft::configuration cfg_all_voters({A_id, B_id, C_id}); A.add_entry(cfg_all_voters); // Majority commits the configuration change communicate(A, B); BOOST_CHECK_EQUAL(A.get_configuration().is_joint(), false); BOOST_CHECK_EQUAL(A.get_configuration().current.find(raft::server_address{C_id})->can_vote, true); BOOST_CHECK_EQUAL(A.log_last_idx(), B.log_last_idx()); fd.mark_dead(A_id); election_timeout(B); election_threshold(C); // B and C are enough to elect B in the new configuration. communicate(B, C); BOOST_CHECK(B.is_leader()); BOOST_CHECK_EQUAL(B.get_current_term(), C.get_current_term()); BOOST_CHECK_EQUAL(B.log_last_idx(), C.log_last_idx()); } BOOST_AUTO_TEST_CASE(test_leader_transferee_dies_upon_receiving_timeout_now) { /// 4-node cluster (A, B, C, D). A is initially elected a leader. /// The leader adds a new configuration entry, that removes it from the /// cluster (B, C, D). /// Communicate the cluster up to the point where A starts to resign /// its leadership (calls `transfer_leadership()`). /// At this point, A should send a `timeout_now` message to one /// the remaining nodes (B, C or D) and the new configuration should be /// committed. But no nodes actually have received the `timeout_now` message /// yet. /// /// Determine on which node the message should arrive, accept the /// `timeout_now` message and disconnect the target from the rest of the /// group. /// /// Check that after that the cluster, which has only two live members, /// could progress and elect a new leader through a normal election process. discrete_failure_detector fd; raft::server_id A_id = id(), B_id = id(), C_id = id(), D_id = id(); raft::log log(raft::snapshot_descriptor{.idx = raft::index_t{0}, .config = raft::configuration({A_id, B_id, C_id, D_id})}); auto A = create_follower(A_id, log, fd); auto B = create_follower(B_id, log, fd); auto C = create_follower(C_id, log, fd); auto D = create_follower(D_id, log, fd); raft_routing_map map; map.emplace(A_id, &A); map.emplace(B_id, &B); map.emplace(C_id, &C); map.emplace(D_id, &D); // A becomes leader election_timeout(A); communicate(A, B, C, D); BOOST_CHECK(A.is_leader()); // Add a cfg entry on leader that removes it from the cluster ({B_id, C_id, D_id}) raft::configuration newcfg({B_id, C_id, D_id}); A.add_entry(newcfg); // Commit new config and stop communicating right after A steps down due to // starting leadership transfer. communicate_until([&A] { return !A.is_leader(); }, A, B, C, D); // At this point A should have a `timeout_now` message in its message queue. BOOST_CHECK(A.is_follower()); // We cannot assume which node will be selected as the target for // `timeout_now` message, because the order in which A should test each // follower whether it's an eligible target for `timeout_now` is // unspecified. Let's call it X. X can be either B, C, or D. // // Maintain the routing map state since it will be used later to // determine which two nodes will remain in the cluster after partitioning // `timeout_now` target node (X) away. map.erase(A_id); // We don't really care on which node `timeout_now` message arrives so adapt // in a dynamic fashion. // // Check that A has sent the `timeout_now` message and determine to whom it was sent auto output = A.get_output(); BOOST_CHECK_EQUAL(output.messages.size(), 1); BOOST_CHECK(std::holds_alternative(output.messages.back().second)); auto timeout_now_target_id = output.messages.back().first; auto timeout_now_msg = std::get(output.messages.back().second); // Accept the message on target node... map[timeout_now_target_id]->step(A_id, std::move(timeout_now_msg)); // ...and immediately cut it from the rest of the cluster so that others think it's dead. fd.mark_dead(timeout_now_target_id); map.erase(timeout_now_target_id); // Two more nodes should remain in the cluster. // Again, we don't care which node from these two would like to become a leader, // so just select the first one in the list of remaining nodes. // // Wait for standard election_timeout() on the first node, and for election_threshold() // on the second. // Then, check, that the new leader is elected among these two remaining nodes. auto first_fsm = map.begin(); auto second_fsm = ++map.begin(); election_timeout(*first_fsm->second); election_threshold(*second_fsm->second); communicate(B, C, D); auto final_leader = select_leader(B, C, D); BOOST_CHECK(final_leader->id() == first_fsm->first || final_leader->id() == second_fsm->first); } BOOST_AUTO_TEST_CASE(test_leader_transfer_lost_timeout_now) { /// 3-node cluster (A, B, C). A is initially elected a leader. /// The leader adds a new configuration entry, that removes it from the /// cluster (B, C). /// /// Wait up until the former leader commits the new configuration and starts /// leader transfer procedure, sending out the `timeout_now` message to /// one of the remaining nodes. But at that point it haven't received it yet. /// /// Lose this message and verify that the rest of the cluster (B, C) /// can make progress and elect a new leader. raft::server_id A_id = id(), B_id = id(), C_id = id(); raft::log log(raft::snapshot_descriptor{.idx = raft::index_t{0}, .config = raft::configuration({A_id, B_id, C_id})}); auto A = create_follower(A_id, log); auto B = create_follower(B_id, log); auto C = create_follower(C_id, log); // A becomes leader election_timeout(A); communicate(A, B, C); BOOST_CHECK(A.is_leader()); // Add a cfg entry on leader that removes it from the cluster ({B_id, C_id}) raft::configuration newcfg({B_id, C_id}); A.add_entry(newcfg); // Commit new config and stop communicating right after A steps down due to // starting leadership transfer. communicate_until([&A] { return !A.is_leader(); }, A, B, C); // We don't really care on which node `timeout_now` message should arrive, // since it'll be lost, anyway. // // Check that the `timeout_now` message was sent... auto output = A.get_output(); BOOST_CHECK_EQUAL(output.messages.size(), 1); BOOST_CHECK(std::holds_alternative(output.messages.back().second)); auto timeout_now_msg = std::get(output.messages.back().second); (void)timeout_now_msg; // ... and lose it. // By now, B and C should both remain in the follower state. // Check that and attept to go forward with a normal election process to see // that the cluster operates normally after `timeout_now` has been lost. BOOST_CHECK(B.is_follower()); BOOST_CHECK(C.is_follower()); // Elect B a leader and check that normal election proceeds as expected. election_timeout(B); election_threshold(C); communicate(B, C); BOOST_CHECK(B.is_leader()); } BOOST_AUTO_TEST_CASE(test_leader_transfer_lost_force_vote_request) { /// 3-node cluster (A, B, C). A is initially elected a leader. /// The leader adds a new configuration entry, that removes it from the /// cluster (B, C). /// /// Wait up until the former leader commits the new configuration and starts /// leader transfer procedure, sending out the `timeout_now` message to /// one of the remaining nodes. But at that point it haven't received it yet. /// /// Deliver the `timeout_now` message to the target but lose all the /// `vote_request(force)` messages it attempts to send. /// This should halt the election process. /// Then wait for election timeout so that candidate node starts another /// normal election (without `force` flag for vote requests). /// /// Check that this candidate further makes progress and is elected a /// leader. raft::server_id A_id = id(), B_id = id(), C_id = id(); raft::log log(raft::snapshot_descriptor{.idx = raft::index_t{0}, .config = raft::configuration({A_id, B_id, C_id})}); auto A = create_follower(A_id, log); auto B = create_follower(B_id, log); auto C = create_follower(C_id, log); raft_routing_map map; map.emplace(A_id, &A); map.emplace(B_id, &B); map.emplace(C_id, &C); // A becomes leader election_timeout(A); communicate(A, B, C); BOOST_CHECK(A.is_leader()); // Add a cfg entry on leader that removes it from the cluster ({B_id, C_id}) raft::configuration newcfg({B_id, C_id}); A.add_entry(newcfg); // Commit new config and stop communicating right after A steps down due to // starting leadership transfer. communicate_until([&A] { return !A.is_leader(); }, A, B, C); map.erase(A_id); // We don't really care on which node `timeout_now` message arrives so adapt // in a dynamic fashion. // // Check that A has sent the `timeout_now` message and determine to whom it was sent auto output = A.get_output(); BOOST_CHECK_EQUAL(output.messages.size(), 1); BOOST_CHECK(std::holds_alternative(output.messages.back().second)); auto timeout_now_target_id = output.messages.back().first; auto timeout_now_msg = std::get(output.messages.back().second); // Accept the message on the node selected by A to be eligible for leadership transfer. auto& timeout_now_target = *map[timeout_now_target_id]; timeout_now_target.step(A_id, std::move(timeout_now_msg)); // New candidate should've sent a vote_request with force flag set output = timeout_now_target.get_output(); BOOST_CHECK_EQUAL(output.messages.size(), 1); BOOST_CHECK(std::holds_alternative(output.messages.front().second)); auto vote_req1 = std::get(output.messages.front().second); BOOST_CHECK(vote_req1.force); // Lose the forced vote request so that the candidates' election is halted. // After election timeout has passed it should become a regular candidate and // then proceed with non-force vote requests to elect itself a leader through // the normal election process. election_timeout(timeout_now_target); output = timeout_now_target.get_output(); BOOST_CHECK_EQUAL(output.messages.size(), 1); BOOST_CHECK(std::holds_alternative(output.messages.front().second)); // These requests will be sent after election threshold passes for other remaining nodes. auto vote_req1_regular = std::get(output.messages.front().second); auto vote_req1_regular_target = output.messages.front().first; BOOST_CHECK(!vote_req1_regular.force); // Pass election threshold for remaining node and send pending regular vote request election_threshold(*map[vote_req1_regular_target]); map[vote_req1_regular_target]->step(timeout_now_target_id, std::move(vote_req1_regular)); communicate(B, C); auto final_leader = select_leader(B, C); BOOST_CHECK(final_leader->id() == timeout_now_target_id); } // A follower should reject remote snapshots that are behind its current commit index. BOOST_AUTO_TEST_CASE(test_reject_outdated_remote_snapshot) { server_id A_id = id(), B_id = id(); raft::configuration cfg({A_id, B_id}); raft::log log(raft::snapshot_descriptor{.idx = index_t{0}, .config = cfg}); auto A = create_follower(A_id, log); auto B = create_follower(B_id, log); election_timeout(A); communicate(A, B); BOOST_CHECK(A.is_leader()); A.add_entry(log_entry::dummy{}); A.add_entry(log_entry::dummy{}); communicate(A, B); auto snp_idx = index_t{1}; BOOST_CHECK(B.log_last_idx() > snp_idx); auto snp_term = B.get_log().term_for(snp_idx); BOOST_CHECK(snp_term); auto snp = raft::snapshot_descriptor{.idx = index_t{1}, .term = *snp_term, .config = cfg}; BOOST_CHECK(!B.apply_snapshot(snp, 0, false)); // But it should apply this snapshot if it's locally generated BOOST_CHECK(B.apply_snapshot(snp, 0, true)); } // A server should sometimes become a candidate even though it is outside the current configuration, // for example if it's the only server that can become a leader (due to log lengths). BOOST_AUTO_TEST_CASE(test_candidate_outside_configuration) { server_id A_id = id(), B_id = id(); raft::server_address_set addrset{raft::server_address{A_id}, raft::server_address{B_id}}; raft::configuration cfg(addrset); raft::log log(raft::snapshot_descriptor{.idx = index_t{0}, .config = cfg}); discrete_failure_detector fd; auto A = create_follower(A_id, log, fd); auto B = create_follower(B_id, log, fd); election_timeout(A); communicate(A, B); BOOST_CHECK(A.is_leader()); raft::configuration newcfg({B_id}); A.add_entry(newcfg); BOOST_CHECK(!B.get_log().get_configuration().is_joint()); communicate_until([&A, &B] () { return !A.get_configuration().is_joint() && B.get_log().get_configuration().is_joint(); }, A, B); BOOST_CHECK(!A.get_configuration().is_joint()); BOOST_CHECK(B.get_log().get_configuration().is_joint()); fd.mark_dead(B_id); election_timeout(A); // A steps down because it cannot communicate with a quorum in the current configuration ({B}). BOOST_CHECK(!A.is_leader()); fd.mark_alive(B_id); election_timeout(A); // A should become a candidate - it is the only server that can become a leader; // B's configuration is joint and it can't receive a vote from A due to shorter log. BOOST_CHECK(A.is_candidate()); communicate_until([&A] () { return A.is_leader(); }, A, B); BOOST_CHECK(A.is_leader()); communicate(A, B); BOOST_CHECK(B.is_leader()); } BOOST_AUTO_TEST_CASE(test_read_barrier) { raft::server_id A_id = id(), B_id = id(), C_id = id(), D_id = id(), E_id = id(); raft::log log(raft::snapshot_descriptor{.idx = raft::index_t{0}, .config = raft::configuration({A_id, B_id, C_id, D_id})}); auto A = create_follower(A_id, log); auto B = create_follower(B_id, log); auto C = create_follower(C_id, log); auto D = create_follower(D_id, log); auto E = create_follower(E_id, log); // A becomes leader election_timeout(A); communicate(A, B, C, D); BOOST_CHECK(A.is_leader()); // propagate commit index A.tick(); communicate(A, B, C, D); // Check that a node outside of config cannot start read barrier BOOST_CHECK_THROW(A.start_read_barrier(E_id), std::runtime_error); // start read barrier auto rid = A.start_read_barrier(A_id); BOOST_CHECK(rid); // Check that read_quorum was broadcasted to other nodes auto output = A.get_output(); BOOST_CHECK_EQUAL(output.messages.size(), 3); BOOST_CHECK(std::holds_alternative(output.messages[0].second)); BOOST_CHECK(std::holds_alternative(output.messages[1].second)); BOOST_CHECK(std::holds_alternative(output.messages[2].second)); // Check that it gets re-broadcasted on leader's tick A.tick(); output = A.get_output(); BOOST_CHECK_EQUAL(output.messages.size(), 3); BOOST_CHECK(std::holds_alternative(output.messages[0].second)); BOOST_CHECK(std::holds_alternative(output.messages[1].second)); BOOST_CHECK(std::holds_alternative(output.messages[2].second)); auto read_quorum_msg = std::get(output.messages[0].second); // check that read id is correct BOOST_CHECK_EQUAL(read_quorum_msg.id, rid->first); // Check that a leader ignores read_barrier with its own term A.step(B_id, std::move(read_quorum_msg)); output = A.get_output(); BOOST_CHECK_EQUAL(output.messages.size(), 0); // Check that a follower replies to read_barrier with read_quorum_reply B.step(A_id, std::move(read_quorum_msg)); output = B.get_output(); BOOST_CHECK_EQUAL(output.messages.size(), 1); BOOST_CHECK(std::holds_alternative(output.messages[0].second)); auto read_quorum_reply_msg = std::get(output.messages[0].second); // Ack barrier from B and check that this is not enough to complete a read A.step(B_id, std::move(read_quorum_reply_msg)); output = A.get_output(); BOOST_CHECK(!output.max_read_id_with_quorum); // Ack from B one more time and check that ack is not counted twice A.step(B_id, std::move(read_quorum_reply_msg)); output = A.get_output(); BOOST_CHECK(!output.max_read_id_with_quorum); // Ack from C and check that the read barrier is completed A.step(C_id, std::move(read_quorum_reply_msg)); output = A.get_output(); BOOST_CHECK(output.max_read_id_with_quorum); // Enter joint config raft::configuration newcfg({A_id, E_id}); A.add_entry(newcfg); // Process log storing event and drop append_entries messages output = A.get_output(); // start read barrier rid = A.start_read_barrier(A_id); BOOST_CHECK(rid); // check that read_barrier is broadcasted to all nodes output = A.get_output(); BOOST_CHECK_EQUAL(output.messages.size(), 4); BOOST_CHECK(std::holds_alternative(output.messages[0].second)); BOOST_CHECK(std::holds_alternative(output.messages[1].second)); BOOST_CHECK(std::holds_alternative(output.messages[2].second)); BOOST_CHECK(std::holds_alternative(output.messages[3].second)); // Ack in only old quorum and check that the read is not completed A.step(B_id, read_quorum_reply{A.get_current_term(), index_t{0}, rid->first}); A.step(C_id, read_quorum_reply{A.get_current_term(), index_t{0}, rid->first}); A.step(D_id, read_quorum_reply{A.get_current_term(), index_t{0}, rid->first}); output = A.get_output(); BOOST_CHECK(!output.max_read_id_with_quorum); // Ack in new config as well and see that it is committed now A.step(E_id, read_quorum_reply{A.get_current_term(), index_t{0}, rid->first}); output = A.get_output(); BOOST_CHECK(output.max_read_id_with_quorum); // check that read_barrier with lower term does not depose the leader A.step(E_id, read_quorum{term_t{A.get_current_term() - 1}, index_t{10}, rid->first}); BOOST_CHECK(A.is_leader()); // check that read_barrier with higher term leads to leader // step down A.step(E_id, read_quorum{term_t{A.get_current_term() + 1}, index_t{10}, rid->first}); BOOST_CHECK(!A.is_leader()); // create one node cluster raft::log log1(raft::snapshot_descriptor{.idx = raft::index_t{0}, .config = raft::configuration({A_id})}); auto AA = create_follower(A_id, log1); // Make AA a leader election_timeout(AA); BOOST_CHECK(AA.is_leader()); output = AA.get_output(); // execute read barrier rid = AA.start_read_barrier(A_id); BOOST_CHECK(rid); // check that it completes immediately output = AA.get_output(); BOOST_CHECK(output.max_read_id_with_quorum); } BOOST_AUTO_TEST_CASE(test_append_entry_inside_snapshot) { server_id A_id = id(), B_id = id(), C_id = id(); raft::log log(raft::snapshot_descriptor{.idx = index_t{0}, .config = raft::configuration{A_id, B_id, C_id}}); auto A = create_follower(A_id, log); auto B = create_follower(B_id, log); auto C = create_follower(C_id, log); election_timeout(A); communicate(A, B, C); A.add_entry(log_entry::dummy{}); A.add_entry(log_entry::dummy{}); A.add_entry(log_entry::dummy{}); communicate(A, B, C); // Add new entry and commit it with B A.add_entry(log_entry::dummy{}); auto output = A.get_output(); BOOST_CHECK_EQUAL(output.messages.size(), 2); auto append = std::get(output.messages.back().second); B.step(A_id, std::move(append)); output = B.get_output(); BOOST_CHECK_EQUAL(output.messages.size(), 1); auto reply = std::get(output.messages.back().second); A.step(B_id, std::move(reply)); // A commits last entry here // propagate commit index to B A.tick(); communicate(A, B); // generate new message for C, first one will be empty // so feed it back to A and get next one A.tick(); output = A.get_output(); BOOST_CHECK_EQUAL(output.messages.size(), 1); append = std::get(output.messages.back().second); C.step(A_id, std::move(append)); output = C.get_output(); BOOST_CHECK_EQUAL(output.messages.size(), 1); reply = std::get(output.messages.back().second); A.step(C_id, std::move(reply)); output = A.get_output(); BOOST_CHECK_EQUAL(output.messages.size(), 1); append = std::get(output.messages.back().second); // Now send it to C and ignore the reply C.step(A_id, std::move(append)); (void)C.get_output(); // C snapshots the log C.apply_snapshot(log_snapshot(C.get_log(), C.log_last_idx()), 0, true); // Try to add one more entry A.add_entry(log_entry::dummy{}); A.tick(); communicate(A, B, C); BOOST_CHECK(!C.get_log().empty()); } BOOST_AUTO_TEST_CASE(test_ping_leader) { discrete_failure_detector fd; server_id A_id = id(), B_id = id(), C_id = id(); raft::configuration cfg(raft::server_address_set{ raft::server_address{A_id}, raft::server_address{B_id}, raft::server_address{C_id, false}}); raft::log log(raft::snapshot_descriptor{.idx = index_t{0}, .config = cfg}); auto A = create_follower(A_id, log, fd); auto B = create_follower(B_id, log, fd); auto C = create_follower(C_id, log, fd); election_timeout(A); communicate(A, B, C); BOOST_CHECK(A.is_leader()); // Check that non voter forgot a leader after election timeout. // It does not have to be this way, but currently our impl behaves this // way. fd.mark_all_dead(); election_timeout(C); BOOST_CHECK(!C.current_leader()); // Check that without any new input a node will not find out who leader is // after network repairs. fd.mark_all_alive(); communicate(A, B, C); BOOST_CHECK(!C.current_leader()); // Check that is we request leader ping then a node is able to find out // the leader after communicating with the cluster. C.ping_leader(); C.tick(); communicate(A, B, C); BOOST_CHECK(C.current_leader()); }