mirror of
https://github.com/scylladb/scylladb.git
synced 2026-04-25 11:00:35 +00:00
The direct failure detector operates on abstract `endpoint_id`s for
pinging. The `pinger` interface is responsible for translating these IDs
to 'real' addresses.
Earlier we used two types of addresses: IP addresses in 'production'
code (`gms::gossiper::direct_fd_pinger`) and `raft::server_id`s in test
code (in `randomized_nemesis_test`). For each of these use cases we
would maintain mappings between `endpoint_id`s and the address type.
In recent commits we switched the 'production' code to also operate on
Raft server IDs, which are UUIDs underneath.
In this commit we switch `endpoint_id`s from `unsigned` type to
`utils::UUID`. Because each use case operates on Raft server IDs, we can
perform a simple translation: `raft_id.uuid()` to get an `endpoint_id`
from a Raft ID, `raft::server_id{ep_id}` to obtain a Raft ID from
an `endpoint_id`. We no longer have to maintain complex sharded data
structures to store the mappings.
177 lines
6.9 KiB
C++
177 lines
6.9 KiB
C++
/*
|
|
* Copyright (C) 2022-present ScyllaDB
|
|
*/
|
|
|
|
/*
|
|
* SPDX-License-Identifier: AGPL-3.0-or-later
|
|
*/
|
|
#pragma once
|
|
|
|
#include "utils/UUID.hh"
|
|
|
|
#include <seastar/core/sharded.hh>
|
|
|
|
// NOTE(review): a `using namespace` directive at header scope is visible in
// every translation unit that includes this header. It is kept because the
// declarations in this file spell `future` and `abort_source` unqualified.
using namespace seastar;

namespace seastar {
// Forward declaration: this header only passes `abort_source` by reference.
class abort_source;
}
|
|
|
|
namespace direct_failure_detector {
|
|
|
|
class pinger {
|
|
public:
|
|
// Opaque endpoint ID.
|
|
// A specific implementation of `pinger` maps those IDs to 'real' addresses.
|
|
using endpoint_id = utils::UUID;
|
|
|
|
// Send a message to `ep` and wait until it responds.
|
|
// The wait can be aborted using `as`.
|
|
// Abort should be signalized with `abort_requested_exception`.
|
|
//
|
|
// If the ping fails in an expected way (e.g. the endpoint is down and refuses to connect),
|
|
// returns `false`. If it succeeds, returns `true`.
|
|
virtual future<bool> ping(endpoint_id ep, abort_source& as) = 0;
|
|
|
|
protected:
|
|
// The `pinger` object must not be destroyed through the `pinger` interface.
|
|
// `failure_detector` does not take ownership of `pinger`, only a non-owning reference.
|
|
~pinger() = default;
|
|
};
|
|
|
|
// A clock that uses abstract units to measure time.
|
|
// The implementation is responsible for periodically advancing the clock.
|
|
//
|
|
// The timepoints used by the clock do not have to be related to wall-clock time. The clock is only used as a 'stopwatch' (for measuring intervals).
|
|
// For useful failure detection the clock should be steady:
|
|
// it should be monotonic and the time elapsed between two consecutive ticks should be ~constant.
|
|
class clock {
|
|
public:
|
|
using timepoint_t = int64_t;
|
|
using interval_t = int64_t;
|
|
|
|
// Returns current time.
|
|
// The returned values should be non-decreasing.
|
|
virtual timepoint_t now() noexcept = 0;
|
|
|
|
// Wait until `now()` >= `tp`.
|
|
// The condition is guaranteed to be true when the future resolves unless the sleep was aborted using `as`.
|
|
// If `now()` >= `tp` when the function is called, it returns a ready future.
|
|
// Aborts should be signalized using `seastar::sleep_aborted`.
|
|
virtual future<> sleep_until(timepoint_t tp, abort_source& as) = 0;
|
|
|
|
protected:
|
|
// The `clock` object must not be destroyed through the `clock` interface.
|
|
// `failure_detector` does not take ownership of `clock`, only a non-owning reference.
|
|
~clock() = default;
|
|
};
|
|
|
|
class listener {
|
|
public:
|
|
// Called when an endpoint in the detected set (added by `failure_detector::add_endpoint`) responds to a ping
|
|
// after being previously marked dead.
|
|
virtual future<> mark_alive(pinger::endpoint_id) = 0;
|
|
|
|
// Called when an endpoint in the detected set does not respond to a ping for a long enough time
|
|
// after being previously marked 'alive'.
|
|
// The time threshold is specified when registering the listener.
|
|
//
|
|
// A newly added endpoint is considered dead until it first responds.
|
|
//
|
|
// When an alive endpoint is removed from the detected set, a final mark_dead notification is sent.
|
|
virtual future<> mark_dead(pinger::endpoint_id) = 0;
|
|
|
|
protected:
|
|
// The `listener` object must not be destroyed through the `listener` interface.
|
|
// `failure_detector` does not take ownership of `listener`s, only non-owning references.
|
|
~listener() = default;
|
|
};
|
|
|
|
class failure_detector;
|
|
|
|
// A RAII object returned when registering a `listener`.
|
|
// The listener is unregistered when this object is destroyed.
|
|
//
|
|
// All subscriptions must be destroyed before the failure detector service is stopped.
|
|
class subscription {
|
|
failure_detector& _fd;
|
|
listener* _listener;
|
|
|
|
subscription(failure_detector&, listener&) noexcept;
|
|
friend class failure_detector;
|
|
|
|
public:
|
|
subscription(subscription&&) noexcept;
|
|
|
|
// Unregisters the listener.
|
|
~subscription();
|
|
};
|
|
|
|
// Failure detector service: pings the endpoints in its 'detected set' through a `pinger`,
// measures time with a `clock`, and notifies registered `listener`s about liveness changes.
class failure_detector : public seastar::peering_sharded_service<failure_detector> {
    class impl;
    std::unique_ptr<impl> _impl;

    // NOTE(review): `endpoint_worker` is not declared in this header; presumably it is
    // an implementation detail defined next to `impl`.
    friend struct endpoint_worker;
    // Uses the same class-key as the `class subscription` declaration.
    friend class subscription;

public:
    failure_detector(
        pinger& pinger, clock& clock,

        // Every endpoint in the detected set will be periodically pinged every `ping_period`,
        // assuming that the pings return in a timely manner. A ping may take longer than `ping_period`
        // before it's aborted (up to a certain multiple of `ping_period`), in which case the next ping
        // will start immediately.
        //
        // `ping_period` should be chosen so that during normal operation, a ping takes significantly
        // less time than `ping_period` (preferably at least an order of magnitude less).
        //
        // The passed-in value must be the same on every shard.
        clock::interval_t ping_period
    );

    ~failure_detector();

    // Stop all tasks. Must be called before the object is destroyed.
    // All listener subscriptions must be destroyed before `stop()` is called.
    future<> stop();

    // Receive updates about endpoints in the detected set, where an endpoint is considered dead
    // if the duration between last `ping()` response and `clock.now()` passes `threshold`.
    // Note: the `mark_dead` notification may be sent earlier if we know ahead of time
    // that `threshold` will be crossed before the next `ping()` can start.
    //
    // Listeners must be added before any endpoints are added to the detected set (e.g. during Scylla startup).
    // The same listener& must not be registered twice (even if it was deregistered before the second time).
    //
    // The listener stops being called when the returned subscription is destroyed.
    // The subscription must be destroyed before service is stopped.
    //
    // `threshold` should be significantly larger than `ping_period`, preferably at least an order of magnitude larger.
    //
    // Different listeners may use different thresholds, depending on the use case:
    // some listeners may want to mark endpoints as dead more aggressively if fast reaction times are important
    // (e.g. it's important to elect a new Raft leader quickly when the current one is dead),
    // while others may prefer stability (e.g. if availability is not lost even though a dead endpoint is considered alive).
    //
    // Listeners can be registered on any shard. If your use case requires a listener on multiple shards,
    // you need to register a separate listener on each shard.
    future<subscription> register_listener(
        listener&,
        clock::interval_t threshold);

    // Add this endpoint to the detected set.
    // Has no effect if the endpoint is already there.
    // The newly added endpoint is initially considered dead for all listeners.
    // Run only on shard 0.
    void add_endpoint(pinger::endpoint_id);

    // Remove this endpoint from the detected set.
    // Has no effect if the endpoint is not there.
    // If the endpoint is considered alive when removed, a final mark_dead notification is sent to all listeners.
    // Run only on shard 0.
    void remove_endpoint(pinger::endpoint_id);
};
|
|
|
|
} // namespace direct_failure_detector
|