Merge 'gossiper: failure_detector_loop_for_node: abort send_gossip_echo using abort_source' from Benny Halevy

Currently send_gossip_echo has a 22 seconds timeout
during which _abort_source is ignored.

Use a function-local abort_source to abort
send_gossip_echo either on timeout or if
_abort_source requested abort, and co_return in
the latter case.

Closes scylladb/scylladb#12296

* github.com:scylladb/scylladb:
  gossiper: make send_gossip_echo cancellable
  gossiper: add send_echo helper
  idl, message: make with_timeout and cancellable verb attributes composable
  gossiper: failure_detector_loop_for_node: ignore abort_requested_exception
  gossiper: failure_detector_loop_for_node: check if abort_requested in loop condition
This commit is contained in:
Botond Dénes
2025-05-12 15:35:30 +03:00
6 changed files with 53 additions and 19 deletions

View File

@@ -954,17 +954,21 @@ future<std::set<inet_address>> gossiper::get_unreachable_members_synchronized()
});
}
future<> gossiper::send_echo(locator::host_id host_id, std::chrono::milliseconds timeout_ms, int64_t generation_number, bool notify_up) {
return ser::gossip_rpc_verbs::send_gossip_echo(&_messaging, host_id, netw::messaging_service::clock_type::now() + timeout_ms, _abort_source, generation_number, notify_up);
}
future<> gossiper::failure_detector_loop_for_node(locator::host_id host_id, generation_type gossip_generation, uint64_t live_endpoints_version) {
auto last = gossiper::clk::now();
auto diff = gossiper::clk::duration(0);
auto echo_interval = std::chrono::seconds(2);
auto max_duration = echo_interval + std::chrono::milliseconds(_gcfg.failure_detector_timeout_ms());
auto node = _address_map.get(host_id);
while (is_enabled()) {
while (is_enabled() && !_abort_source.abort_requested()) {
bool failed = false;
try {
logger.debug("failure_detector_loop: Send echo to node {}/{}, status = started", host_id, node);
co_await ser::gossip_rpc_verbs::send_gossip_echo(&_messaging, host_id, netw::messaging_service::clock_type::now() + max_duration, gossip_generation.value(), false);
co_await send_echo(host_id, max_duration, gossip_generation.value(), false);
logger.debug("failure_detector_loop: Send echo to node {}/{}, status = ok", host_id, node);
} catch (...) {
failed = true;
@@ -992,7 +996,7 @@ future<> gossiper::failure_detector_loop_for_node(locator::host_id host_id, gene
host_id, node, _live_endpoints, _live_endpoints_version, live_endpoints_version);
co_return;
} else {
co_await sleep_abortable(echo_interval, _abort_source);
co_await sleep_abortable(echo_interval, _abort_source).handle_exception_type([] (const abort_requested_exception&) {});
}
}
co_return;
@@ -1688,7 +1692,7 @@ future<> gossiper::notify_nodes_on_up(std::unordered_set<locator::host_id> dsts)
if (dst != _gcfg.host_id) {
try {
auto generation = my_endpoint_state().get_heart_beat_state().get_generation();
co_await ser::gossip_rpc_verbs::send_gossip_echo(&_messaging, dst, netw::messaging_service::clock_type::now() + std::chrono::seconds(10), generation.value(), true);
co_await send_echo(dst, std::chrono::seconds(10), generation.value(), true);
} catch (...) {
logger.warn("Failed to notify node {} that I am UP: {}", dst, std::current_exception());
}
@@ -1724,7 +1728,7 @@ void gossiper::mark_alive(endpoint_state_ptr node) {
// Enter the _background_msg gate so stop() would wait on it
auto gh = _background_msg.hold();
logger.debug("Sending a EchoMessage to {}/{}, with generation_number={}", id, addr, generation);
(void) ser::gossip_rpc_verbs::send_gossip_echo(&_messaging, id, netw::messaging_service::clock_type::now() + std::chrono::seconds(15), generation.value(), false).then([this, id] {
(void) send_echo(id, std::chrono::seconds(15), generation.value(), false).then([this, id] {
logger.trace("Got EchoMessage Reply");
return real_mark_alive(id);
}).handle_exception([addr, gh = std::move(gh), unmark_pending = std::move(unmark_pending), id] (auto ep) {

View File

@@ -109,6 +109,7 @@ private:
future<> handle_shutdown_msg(locator::host_id from, std::optional<int64_t> generation_number_opt);
future<> do_send_ack_msg(locator::host_id from, gossip_digest_syn syn_msg);
future<> do_send_ack2_msg(locator::host_id from, utils::chunked_vector<gossip_digest> ack_msg_digest);
future<> send_echo(locator::host_id host_id, std::chrono::milliseconds timeout_ms, int64_t generation_number, bool notify_up);
future<gossip_get_endpoint_states_response> handle_get_endpoint_states_msg(gossip_get_endpoint_states_request request);
static constexpr uint32_t _default_cpuid = 0;
void do_sort(utils::chunked_vector<gossip_digest>& g_digest_list) const;

View File

@@ -479,11 +479,9 @@ class RpcVerb(ASTBase):
- [[with_timeout]] - an additional time_point parameter is supplied
to the handler function and send* method uses send_message_*_timeout
variant of internal function to actually send the message.
Incompatible with [[cancellable]].
- [[cancellable]] - an additional abort_source& parameter is supplied
to the handler function and send* method uses send_message_*_cancellable
variant of internal function to actually send the message.
Incompatible with [[with_timeout]].
- [[one_way]] - the handler function is annotated by
future<rpc::no_wait_type> return type to designate that a client
doesn't need to wait for an answer.
@@ -697,8 +695,6 @@ def rpc_verb_parse_action(tokens):
one_way = not raw_attrs.empty() and 'one_way' in raw_attrs.attr_items
if one_way and 'return_values' in tokens:
raise Exception(f"Invalid return type specification for one-way RPC verb '{name}'")
if with_timeout and cancellable:
raise Exception(f"Error in verb {name}: [[with_timeout]] cannot be used together with [[cancellable]] in the same verb")
return RpcVerb(name=name, parameters=params, return_values=tokens.get('return_values'), with_client_info=with_client_info, with_timeout=with_timeout, cancellable=cancellable, one_way=one_way, ip=ip)

View File

@@ -9,7 +9,7 @@
#include "gms/gossip_digest_syn.hh"
namespace gms {
verb [[with_client_info, with_timeout]] gossip_echo (int64_t generation_number [[version 4.6.0]], bool notify_up [[version 6.1.0]])
verb [[with_client_info, with_timeout, cancellable]] gossip_echo (int64_t generation_number [[version 4.6.0]], bool notify_up [[version 6.1.0]])
verb [[with_client_info, one_way]] gossip_shutdown (gms::inet_address from, int64_t generation_number [[version 4.6.0]])
verb [[with_client_info, one_way, ip]] gossip_digest_syn (gms::gossip_digest_syn syn)
verb [[with_client_info, one_way]] gossip_digest_ack (gms::gossip_digest_ack ask)

View File

@@ -250,6 +250,41 @@ auto send_message_cancellable(messaging_service* ms, messaging_verb verb, locato
return send_message_cancellable<MsgIn, MsgOut...>(ms, verb, std::optional{id}, ms->addr_for_host_id(id), as, std::forward<MsgOut>(msg)...);
}
template <typename MsgIn, typename Timeout, typename... MsgOut>
auto send_message_timeout_cancellable(messaging_service* ms, messaging_verb verb, locator::host_id host_id, Timeout timeout, abort_source& as, MsgOut&&... msg) {
auto rpc_handler = ms->rpc()->make_client<MsgIn(MsgOut...)>(verb);
using futurator = futurize<std::invoke_result_t<decltype(rpc_handler), rpc_protocol::client&, MsgOut...>>;
if (ms->is_shutting_down()) {
return futurator::make_exception_future(rpc::closed_error());
}
auto rpc_client_ptr = ms->get_rpc_client(verb, ms->addr_for_host_id(host_id), host_id);
auto& rpc_client = *rpc_client_ptr;
auto c = std::make_unique<seastar::rpc::cancellable>();
auto& c_ref = *c;
auto sub = as.subscribe([c = std::move(c)] () noexcept {
c->cancel();
});
if (!sub) {
return futurator::make_exception_future(abort_requested_exception{});
}
return rpc_handler(rpc_client, timeout, c_ref, std::forward<MsgOut>(msg)...).handle_exception([ms = ms->shared_from_this(), host_id, verb, rpc_client_ptr = std::move(rpc_client_ptr), sub = std::move(sub)] (std::exception_ptr&& eptr) {
ms->increment_dropped_messages(verb);
if (try_catch<rpc::closed_error>(eptr)) {
// This is a transport error
ms->remove_error_rpc_client(verb, host_id);
return futurator::make_exception_future(std::move(eptr));
} else if (try_catch<rpc::canceled_error>(eptr)) {
// Translate low-level canceled_error into high-level abort_requested_exception.
return futurator::make_exception_future(abort_requested_exception{});
} else {
// This is expected to be a rpc server error, e.g., the rpc handler throws a std::runtime_error.
return futurator::make_exception_future(std::move(eptr));
}
});
}
// Send one way message for verb
template <typename... MsgOut>
auto send_message_oneway(messaging_service* ms, messaging_verb verb, msg_addr id, MsgOut&&... msg) {

View File

@@ -13,6 +13,7 @@
#include <seastar/core/app-template.hh>
#include <seastar/core/sstring.hh>
#include <seastar/core/thread.hh>
#include <seastar/core/coroutine.hh>
#include <seastar/rpc/rpc_types.hh>
#include <seastar/util/closeable.hh>
#include "db/config.hh"
@@ -157,15 +158,12 @@ public:
future<> test_echo() {
test_logger.info("=== {} ===", __func__);
int64_t gen = 0x1;
return ser::gossip_rpc_verbs::send_gossip_echo(&ms, _server_id, netw::messaging_service::clock_type::now() + std::chrono::seconds(10), gen, false).then_wrapped([] (auto&& f) {
try {
f.get();
return make_ready_future<>();
} catch (std::runtime_error& e) {
test_logger.error("test_echo: {}", e.what());
}
return make_ready_future<>();
});
abort_source as;
try {
co_await ser::gossip_rpc_verbs::send_gossip_echo(&ms, _server_id, netw::messaging_service::clock_type::now() + std::chrono::seconds(10), as, gen, false);
} catch (...) {
test_logger.error("test_echo: {}", std::current_exception());
}
}
};