Files
scylladb/raft/server.hh
Petr Gusev 27e60ecbf4 raft server, log size limit in bytes
Before this patch we could get an OOM if we
received several big commands. The number of
commands was small, but their total size
in bytes was large.

snapshot_trailing_size is needed to guarantee
progress. Without this limit the fsm could
get stuck if the size of the next item is
greater than max_log_size - (size of trailing entries).
2022-09-26 13:10:10 +04:00

199 lines
10 KiB
C++

/*
* Copyright (C) 2020-present ScyllaDB
*/
/*
* SPDX-License-Identifier: AGPL-3.0-or-later
*/
#pragma once
#include <seastar/core/abort_source.hh>
#include "raft.hh"
namespace raft {
enum class wait_type {
committed,
applied
};
// A single uniquely identified participant of a Raft group.
class server {
public:
struct configuration {
// automatically snapshot state machine after applying
// this number of entries
size_t snapshot_threshold = 1024;
// Automatically snapshot state machine if the log memory usage exceeds this value.
// The value is in bytes.
// Must be smaller than max_log_size.
// It is recommended to set this value to no more than half of the max_log_size,
// so that snapshots are taken in advance and there is no backpressure due to max_log_size.
size_t snapshot_threshold_log_size = 2 * 1024 * 1024;
// how many entries to leave in the log after taking a snapshot
size_t snapshot_trailing = 200;
// Limit on the total number of bytes, consumed by snapshot trailing entries.
// Must be smaller than snapshot_threshold_log_size.
// It is recommended to set this value to no more than half of snapshot_threshold_log_size
// so that not all memory is held for trailing when taking a snapshot.
size_t snapshot_trailing_size = 1 * 1024 * 1024;
// max size of appended entries in bytes
size_t append_request_threshold = 100000;
// Limit in bytes on the size of in-memory part of the log after
// which requests are stopped to be admitted until the log
// is shrunk back by a snapshot.
// The following condition must be satisfied:
// max_command_size <= max_log_size - snapshot_trailing_size
// this ensures that trailing log entries won't block incoming commands and at least
// one command can fit in the log
size_t max_log_size = 4 * 1024 * 1024;
// If set to true will enable prevoting stage during election
bool enable_prevoting = true;
// If set to true, forward configuration and entries from
// follower to the leader autmatically. This guarantees
// add_entry()/modify_config() never throws not_a_leader,
// but makes timed_out_error more likely.
bool enable_forwarding = true;
// Max size of a single command, add_entry with a bigger command will throw command_is_too_big_error.
// The following condition must be satisfied:
// max_command_size <= max_log_size - snapshot_trailing_size
// this ensures that trailing log entries won't block incoming commands and at least
// one command can fit in the log
size_t max_command_size = 100 * 1024;
// A callback to invoke if one of internal server
// background activities has stopped because of an error.
std::function<void(std::exception_ptr e)> on_background_error;
};
virtual ~server() {}
// Add command to replicated log
// Returned future is resolved depending on wait_type parameter:
// 'committed' - when the entry is committed
// 'applied' - when the entry is applied (happens after it is committed)
// May fail because of an internal error or because leader changed and an entry was either
// replaced by the new leader or the server lost track of it. The former will result in
// dropped_entry exception the later in commit_status_unknown.
//
// If forwarding is enabled and this is a follower, and the returned future resolves without exception,
// this means that the entry is committed/applied locally (depending on the wait type).
// Applied locally means the local state machine replica applied this command;
// committed locally means simply that the commit index is beyond this entry's index.
//
// The caller may pass a pointer to an abort_source to make the operation abortable.
// If abort is requested before the operation finishes, the future will contain `raft::request_aborted` exception.
//
// Successfull `add_entry` with `wait_type::committed` does not guarantee that `state_machine::apply` will be called
// locally for this entry. Between the commit and the application we may receive a snapshot containing this entry,
// so the state machine's state 'jumps' forward in time, skipping the entry application.
// However, for `wait_type::applied`, we guarantee that the entry will be applied locally with `state_machine::apply`.
// If a snapshot causes the state machine to jump over the entry, `add_entry` will return `commit_status_unknown`
// (even if the snapshot included that entry).
virtual future<> add_entry(command command, wait_type type, seastar::abort_source* as = nullptr) = 0;
// Set a new cluster configuration. If the configuration is
// identical to the previous one does nothing.
//
// Does not preempt before adding entry to this server's
// in-memory log.
//
// Provided node_info is passed to rpc::add_server() for each
// new server and rpc::remove_server() is called for each
// departing server.
// struct node_info is expected to contain connection
// information/credentials which is then used by RPC.
// Can be called on a leader only, otherwise throws not_a_leader.
// Cannot be called until previous set_configuration() completes
// otherwise throws conf_change_in_progress exception.
//
// Waits until configuration completes, i.e. the server left the joint
// configuration. The server will apply a dummy entry to
// make sure this happens.
//
// Note: committing a dummy entry extends the opportunity for
// uncertainty, thus commit_status_unknown exception may be
// returned even in case of a successful config change.
//
// The caller may pass a pointer to an abort_source to make the operation abortable.
// If abort is requested before the operation finishes, the future will contain `raft::request_aborted` exception.
virtual future<> set_configuration(config_member_set c_new, seastar::abort_source* as = nullptr) = 0;
// A simplified wrapper around set_configuration() which adds
// and deletes servers. Unlike set_configuration(),
// works on a follower (if forwarding is enabled) as well as on a leader
// (forwards the request to the current leader). If the added servers are
// already part of the configuration, or deleted are not
// present, does nothing. The implementation forwards the
// list of added or removed servers to the leader, where they
// are transformed to a new configuration, which is then
// applied to the leader's log without preemption, bypassing
// the log limiter semaphore.
// This makes it possible to retry this command without
// adverse effects to the configuration.
//
// If forwarding is enabled and this is a follower, and the returned future resolves without exception,
// this means that a dummy entry appended after non-joint configuration entry was committed by the leader.
// The local commit index is not necessarily up-to-date yet and the state of the local state machine
// replica may still come from before the configuration entry.
// (exception: if no server was actually added or removed, then nothing gets committed and the leader responds immediately).
//
// The caller may pass a pointer to an abort_source to make the operation abortable.
// If abort is requested before the operation finishes, the future will contain `raft::request_aborted` exception.
virtual future<> modify_config(std::vector<config_member> add,
std::vector<server_id> del, seastar::abort_source* as = nullptr) = 0;
// Return the currently known configuration
virtual raft::configuration get_configuration() const = 0;
// Load persisted state and start background work that needs
// to run for this Raft server to function; The object cannot
// be used until the returned future is resolved.
virtual future<> start() = 0;
// Stop this Raft server, all submitted but not completed
// operations will get an error and callers will not be able
// to know if they succeeded or not. If this server was
// a leader it will relinquish its leadership and cease
// replication.
virtual future<> abort(sstring reason = "") = 0;
// Return Raft protocol current term.
virtual term_t get_current_term() const = 0;
// May be called before attempting a read from the local state
// machine. The read should proceed only after the returned
// future has resolved successfully.
//
// The caller may pass a pointer to an abort_source to make the operation abortable.
// If abort is requested before the operation finishes, the future will contain `raft::request_aborted` exception.
virtual future<> read_barrier(seastar::abort_source* as = nullptr) = 0;
// Initiate leader stepdown process.
// If the node is not a leader returns not_a_leader exception.
// In case of a timeout returns timeout_error.
virtual future<> stepdown(logical_clock::duration timeout) = 0;
// Register metrics for this server. Metric are global but their names
// depend on the server's ID, so it is possible to register metrics
// of two servers iff their IDs are different.
virtual void register_metrics() = 0;
// Ad hoc functions for testing
virtual void wait_until_candidate() = 0;
virtual future<> wait_election_done() = 0;
virtual future<> wait_log_idx_term(std::pair<index_t, term_t> idx_log) = 0;
virtual std::pair<index_t, term_t> log_last_idx_term() = 0;
virtual void elapse_election() = 0;
virtual bool is_leader() = 0;
virtual void tick() = 0;
// Server id of this server
virtual raft::server_id id() const = 0;
virtual void set_applier_queue_max_size(size_t queue_max_size) = 0;
};
std::unique_ptr<server> create_server(server_id uuid, std::unique_ptr<rpc> rpc,
std::unique_ptr<state_machine> state_machine, std::unique_ptr<persistence> persistence,
seastar::shared_ptr<failure_detector> failure_detector, server::configuration config);
} // namespace raft