mirror of
https://github.com/scylladb/scylladb.git
synced 2026-04-24 18:40:38 +00:00
These changes are complementary to those from a recent commit where we
handled aborting ongoing operations during tablet events, such as
tablet migration. In this commit, we consider the case of shutting down
a node.
When a node is shutting down, we eventually close the connections. When
the client can no longer get a response from the server, it makes no
sense to continue with the queries. We'd like to cancel them at that
point.
We leverage the abort source passed via `client_state` down to the
strongly consistent coordinator. This way, the transport layer can
communicate with it and signal that the queries should be canceled.
The abort source is triggered by the CQL server (cf.
`generic_server::server::{stop,shutdown}`).
---
Note that this is not an optional change. In fact, if we don't abort
those requests, we might hang for an indefinite amount of time when
executing the following code in `main.cc`:
```
// Register at_exit last, so that storage_service::drain_on_shutdown will be called first
auto do_drain = defer_verbose_shutdown("local storage", [&ss] {
ss.local().drain_on_shutdown().get();
});
```
The problem boils down to the fact that `generic_server::server::stop`
will wait for all connections to be closed, but that won't happen until
all ongoing operations (at least those to strongly consistent tables)
are finished.
It's important to highlight that even though we hang on this, the
client can no longer get any response. Thus, it's crucial that at that
point we simply abort ongoing operations to proceed with the rest of
shutdown.
---
Two tests are added to verify that the implementation is correct:
one focusing on local operations, the other -- on a forwarded write.
Difference in time spent on the whole test file
`test_strong_consistency.py` on my local machine, in dev mode:
Before:
```
real 0m31.775s
user 1m4.475s
sys 0m22.615s
```
After:
```
real 0m32.024s
user 1m10.751s
sys 0m23.871s
```
Individual runs of the added tests:
test_queries_when_shutting_down:
```
real 0m12.818s
user 0m36.726s
sys 0m4.577s
```
test_abort_forwarded_write_upon_shutdown:
```
real 0m12.930s
user 0m36.622s
sys 0m4.752s
```
449 lines · 18 KiB · C++
/*
|
|
* Copyright (C) 2021-present ScyllaDB
|
|
*/
|
|
|
|
/*
|
|
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
|
*/
|
|
|
|
#include "generic_server.hh"
|
|
|
|
#include <exception>
|
|
#include <fmt/ranges.h>
|
|
#include <seastar/core/when_all.hh>
|
|
#include <seastar/coroutine/parallel_for_each.hh>
|
|
#include <seastar/core/reactor.hh>
|
|
#include <seastar/core/smp.hh>
|
|
#include <seastar/coroutine/maybe_yield.hh>
|
|
#include <seastar/coroutine/switch_to.hh>
|
|
#include <utility>
|
|
|
|
namespace generic_server {
|
|
|
|
class counted_data_source_impl : public data_source_impl {
|
|
data_source _ds;
|
|
connection::cpu_concurrency_t& _cpu_concurrency;
|
|
|
|
template <typename F>
|
|
future<temporary_buffer<char>> invoke_with_counting(F&& fun) {
|
|
if (_cpu_concurrency.stopped) {
|
|
return fun();
|
|
}
|
|
size_t units = _cpu_concurrency.units.count();
|
|
_cpu_concurrency.units.return_all();
|
|
return fun().finally([this, units] () {
|
|
_cpu_concurrency.units.adopt(consume_units(_cpu_concurrency.semaphore, units));
|
|
});
|
|
};
|
|
public:
|
|
counted_data_source_impl(data_source ds, connection::cpu_concurrency_t& cpu_concurrency) : _ds(std::move(ds)), _cpu_concurrency(cpu_concurrency) {};
|
|
virtual ~counted_data_source_impl() = default;
|
|
virtual future<temporary_buffer<char>> get() override {
|
|
return invoke_with_counting([this] {return _ds.get();});
|
|
};
|
|
virtual future<temporary_buffer<char>> skip(uint64_t n) override {
|
|
return invoke_with_counting([this, n] {return _ds.skip(n);});
|
|
};
|
|
virtual future<> close() override {
|
|
return _ds.close();
|
|
};
|
|
};
|
|
|
|
class counted_data_sink_impl : public data_sink_impl {
|
|
data_sink _ds;
|
|
connection::cpu_concurrency_t& _cpu_concurrency;
|
|
|
|
template <typename F>
|
|
future<> invoke_with_counting(F&& fun) {
|
|
if (_cpu_concurrency.stopped) {
|
|
return fun();
|
|
}
|
|
size_t units = _cpu_concurrency.units.count();
|
|
_cpu_concurrency.units.return_all();
|
|
return fun().finally([this, units] () {
|
|
_cpu_concurrency.units.adopt(consume_units(_cpu_concurrency.semaphore, units));
|
|
});
|
|
};
|
|
public:
|
|
counted_data_sink_impl(data_sink ds, connection::cpu_concurrency_t& cpu_concurrency) : _ds(std::move(ds)), _cpu_concurrency(cpu_concurrency) {};
|
|
virtual ~counted_data_sink_impl() = default;
|
|
virtual temporary_buffer<char> allocate_buffer(size_t size) override {
|
|
return _ds.allocate_buffer(size);
|
|
}
|
|
virtual future<> put(std::span<temporary_buffer<char>> data) override {
|
|
if (_cpu_concurrency.stopped) {
|
|
return _ds.put(std::move(data));
|
|
}
|
|
size_t units = _cpu_concurrency.units.count();
|
|
_cpu_concurrency.units.return_all();
|
|
return _ds.put(std::move(data)).finally([this, units] {
|
|
_cpu_concurrency.units.adopt(consume_units(_cpu_concurrency.semaphore, units));
|
|
});
|
|
}
|
|
virtual future<> flush() override {
|
|
return invoke_with_counting([this] (void) mutable {
|
|
return _ds.flush();
|
|
});
|
|
}
|
|
virtual future<> close() override {
|
|
return _ds.close();
|
|
}
|
|
virtual size_t buffer_size() const noexcept override {
|
|
return _ds.buffer_size();
|
|
}
|
|
virtual bool can_batch_flushes() const noexcept override {
|
|
return _ds.can_batch_flushes();
|
|
}
|
|
virtual void on_batch_flush_error() noexcept override {
|
|
_ds.on_batch_flush_error();
|
|
}
|
|
};
|
|
|
|
// Constructs a connection wrapping the accepted socket `fd`.
// `sem`/`initial_sem_units` implement the per-CPU connection concurrency
// limit: the units stay associated with the connection until
// on_connection_ready() stops the accounting and returns them.
// The socket's input/output streams are wrapped in the counted
// data-source/sink adaptors above so the units can be released while the
// connection blocks on socket I/O.
// NOTE(review): the initializer list must match the member declaration
// order in the header — confirm when reordering.
connection::connection(server& server, connected_socket&& fd, named_semaphore& sem, semaphore_units<named_semaphore_exception_factory> initial_sem_units)
    : _conns_cpu_concurrency{sem, std::move(initial_sem_units), false}
    , _server{server}
    , _connections_list_entry(_server._connections_list.emplace(*this)) // register in the server's connection list
    , _fd{std::move(fd)}
    , _read_buf(data_source(std::make_unique<counted_data_source_impl>(_fd.input().detach(), _conns_cpu_concurrency)))
    , _write_buf(output_stream<char>(data_sink(std::make_unique<counted_data_sink_impl>(_fd.output().detach(), _conns_cpu_concurrency)), 8192, output_stream_options{.batch_flushes = true}))
    , _pending_requests_gate("generic_server::connection")
    , _hold_server(_server._gate) // keep the server gate open while this connection exists
{
    ++_server._total_connections;
}
|
|
|
|
// Out-of-line empty destructor; all members (streams, gate holder, the
// connection-list entry) clean up through their own destructors.
connection::~connection()
{
}
|
|
|
|
connection::execute_under_tenant_type
connection::no_tenant() {
    // The "no tenant" executor is a plain pass-through: it runs the
    // processing loop as-is, with no scheduling-group switching.
    auto passthrough = [] (connection_process_loop loop) {
        return loop();
    };
    return passthrough;
}
|
|
|
|
// Installs a new tenant executor. The executor is swapped in first; the
// flag then makes process_until_tenant_switch() terminate its inner loop
// so process() re-enters under the freshly installed executor.
void connection::switch_tenant(execute_under_tenant_type exec) {
    _execute_under_current_tenant = std::move(exec);
    _tenant_switch = true;
}
|
|
|
|
// Applies `fn` to every live connection, yielding between elements via the
// connection list's gentle iteration.
future<> server::for_each_gently(noncopyable_function<void(connection&)> fn) {
    auto visit = [fn = std::move(fn)] (std::reference_wrapper<connection> conn_ref) {
        // Unwrap the reference_wrapper before handing it to the callback.
        fn(conn_ref.get());
    };
    return _connections_list.for_each_gently(std::move(visit));
}
|
|
|
|
static bool is_broken_pipe_or_connection_reset(std::exception_ptr ep) {
|
|
try {
|
|
std::rethrow_exception(ep);
|
|
} catch (const std::system_error& e) {
|
|
auto& code = e.code();
|
|
if (code.category() == std::system_category() && (code.value() == EPIPE || code.value() == ECONNRESET)) {
|
|
return true;
|
|
}
|
|
if (code.category() == tls::error_category()) {
|
|
// Typically ECONNRESET
|
|
if (code.value() == tls::ERROR_PREMATURE_TERMINATION) {
|
|
return true;
|
|
}
|
|
// If we got an actual EPIPE in push/pull of gnutls, it is _not_ translated
|
|
// to anything more useful than generic push/pull error. Need to look at
|
|
// nested exception.
|
|
if (code.value() == tls::ERROR_PULL || code.value() == tls::ERROR_PUSH) {
|
|
if (auto p = dynamic_cast<const std::nested_exception*>(std::addressof(e))) {
|
|
return is_broken_pipe_or_connection_reset(p->nested_ptr());
|
|
}
|
|
}
|
|
}
|
|
return false;
|
|
} catch (...) {}
|
|
return false;
|
|
}
|
|
|
|
// Processes requests on this connection until either the input stream
// reaches EOF or a tenant switch is requested via switch_tenant().
// (The original wrapped the return statement in a redundant bare scope;
// removed.)
future<> connection::process_until_tenant_switch() {
    _tenant_switch = false;
    return do_until([this] {
        return _read_buf.eof() || _tenant_switch;
    }, [this] {
        return process_request();
    });
}
|
|
|
|
// Main per-connection loop: keeps processing requests (under the current
// tenant's executor) until the input stream reaches EOF, then closes the
// pending-requests gate, waits for outstanding responses, and finally
// closes the write buffer.
future<> connection::process()
{
    return with_gate(_pending_requests_gate, [this] {
        return do_until([this] {
            return _read_buf.eof();
        }, [this] {
            // The inner loop returns early when switch_tenant() installs a
            // new executor; the outer do_until then re-enters it under the
            // newly installed one.
            return _execute_under_current_tenant([this] {
                return process_until_tenant_switch();
            });
        }).then_wrapped([this] (future<> f) {
            handle_error(std::move(f));
        });
    }).finally([this] {
        // Runs regardless of how the loop ended: stop accepting new
        // requests, drain responses, then close the output stream.
        return _pending_requests_gate.close().then([this] {
            return _ready_to_respond.handle_exception([] (std::exception_ptr ep) {
                if (is_broken_pipe_or_connection_reset(ep)) {
                    // expected if another side closes a connection or we're shutting down
                    return;
                }
                std::rethrow_exception(ep);
            }).finally([this] {
                return _write_buf.close();
            });
        });
    });
}
|
|
|
|
// Called once the connection is fully established. Stops the per-operation
// CPU-concurrency accounting (the counted data source/sink adaptors check
// the `stopped` flag) and returns all held semaphore units, so an
// established connection no longer counts against the limit. The flag must
// be set before the units are returned.
void connection::on_connection_ready()
{
    _conns_cpu_concurrency.stopped = true;
    _conns_cpu_concurrency.units.return_all();
}
|
|
|
|
void connection::shutdown()
|
|
{
|
|
shutdown_input();
|
|
shutdown_output();
|
|
}
|
|
|
|
bool connection::shutdown_input() {
|
|
try {
|
|
_fd.shutdown_input();
|
|
} catch (...) {
|
|
_server._logger.warn("Error shutting down input side of connection {}->{}, exception: {}", _fd.remote_address(), _fd.local_address(), std::current_exception());
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool connection::shutdown_output() {
|
|
try {
|
|
_fd.shutdown_output();
|
|
} catch (...) {
|
|
_server._logger.warn("Error shutting down output side of connection {}->{}, exception: {}", _fd.remote_address(), _fd.local_address(), std::current_exception());
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// Constructs the server shell: name, logger, the gate guarding shutdown,
// and the CPU-concurrency machinery for uninitialized connections.
// NOTE(review): the initializer list relies on these members being declared
// in this order in the class definition — in particular
// _prev_conns_cpu_concurrency must be initialized before
// _conns_cpu_concurrency_semaphore, which reads it. Confirm against the
// header when changing.
server::server(const sstring& server_name, logging::logger& logger, config cfg)
    : _server_name{server_name}
    , _logger{logger}
    , _gate("generic_server::server")
    , _conns_cpu_concurrency(cfg.uninitialized_connections_semaphore_cpu_concurrency)
    // React to live config updates of the concurrency limit by adjusting
    // the semaphore capacity by the delta (signal to grow, consume to shrink).
    , _conns_cpu_concurrency_observer(_conns_cpu_concurrency.observe([this] (const uint32_t &concurrency) {
        if (concurrency == _prev_conns_cpu_concurrency) {
            return;
        }
        _logger.info("Updating uninitialized_connections_semaphore_cpu_concurrency from {} to {} due to config update", _prev_conns_cpu_concurrency, concurrency);

        if (concurrency > _prev_conns_cpu_concurrency) {
            _conns_cpu_concurrency_semaphore.signal(concurrency - _prev_conns_cpu_concurrency);
        } else {
            _conns_cpu_concurrency_semaphore.consume(_prev_conns_cpu_concurrency - concurrency);
        }
        _prev_conns_cpu_concurrency = concurrency;
    }))
    , _prev_conns_cpu_concurrency(_conns_cpu_concurrency)
    // Total semaphore capacity is concurrency - 1 + nr_listeners;
    // start with concurrency - 1, each listen() adds 1.
    , _conns_cpu_concurrency_semaphore(_prev_conns_cpu_concurrency - 1, named_semaphore_exception_factory{"connections cpu concurrency semaphore"})
    , _shutdown_timeout(std::chrono::seconds{cfg.shutdown_timeout_in_seconds})
{
}
|
|
|
|
// Stops the server: runs the full shutdown sequence, then waits for all
// connection processing loops to finish. The exchange must happen after
// shutdown(), which is what populates _all_connections_stopped.
future<> server::stop() {
    co_await shutdown();
    co_await std::exchange(_all_connections_stopped, make_ready_future<>());
}
|
|
|
|
// Multi-phase shutdown: stop accepting, stop the RX side of existing
// connections, give in-flight requests a bounded grace period, then force
// a full connection shutdown and signal the abort source so any remaining
// server-side work observes the shutdown. Idempotent: a second call
// returns immediately once the gate is closed.
future<> server::shutdown() {
    if (_gate.is_closed()) {
        co_return;
    }

    // Closing the gate yields a future that resolves when all connection
    // processing (each holding the gate) has finished; stash it for stop().
    shared_future<> connections_stopped{_gate.close()};
    _all_connections_stopped = connections_stopped.get_future();

    // Stop all listeners.
    size_t nr = 0;
    size_t nr_total = _listeners.size();
    _logger.debug("abort accept nr_total={}", nr_total);
    for (auto&& l : _listeners) {
        l.abort_accept();
        _logger.debug("abort accept {} out of {} done", ++nr, nr_total);
    }
    co_await std::move(_listeners_stopped);

    // Shutdown RX side of the connections, so no new requests could be received.
    // Leave the TX side so the responses to ongoing requests could be sent.
    _logger.debug("Shutting down RX side of {} connections", _connections_list.size());
    co_await for_each_gently([](auto& connection) {
        if (!connection.shutdown_input()) {
            // If failed to shutdown the input side, then attempt to shutdown the output side which should do a complete shutdown of the connection.
            connection.shutdown_output();
        }
    });

    // Wait for the remaining requests to finish.
    _logger.debug("Waiting for connections to stop");
    try {
        // Bounded wait: don't hang forever on requests that never complete.
        co_await connections_stopped.get_future(seastar::lowres_clock::now() + _shutdown_timeout);
    } catch (const timed_out_error& _) {
        _logger.info("Timed out waiting for connections shutdown.");
    }

    // Either all requests stopped or a timeout occurred, do the full shutdown of the connections.
    size_t nr_conn = 0;
    auto nr_conn_total = _connections_list.size();
    _logger.debug("shutdown connection nr_total={}", nr_conn_total);
    co_await for_each_gently([this, &nr_conn, nr_conn_total] (std::reference_wrapper<connection> c) {
        c.get().shutdown();
        _logger.debug("shutdown connection {} out of {} done", ++nr_conn, nr_conn_total);
    });
    // Signal consumers of the server's abort source (e.g. ongoing queries
    // that can no longer be answered) so they cancel instead of hanging.
    _abort_source.request_abort();

    _logger.debug("generic_server::shutdown completed");
}
|
|
|
|
// Starts listening on `addr`, optionally over TLS (when `builder` is
// provided), and kicks off the background accept loop for the new listener.
// With `get_shard_instance` set, shard 0 builds reloadable credentials and
// propagates rebuilt certs to the other shards on reload.
// (Improvement: the original computed `is_tls` via a comma operator inside
// a conditional expression — `is_tls = true, seastar::tls::listen(...)` —
// which obscured the logic; replaced with an explicit equivalent.)
future<>
server::listen(socket_address addr, std::shared_ptr<seastar::tls::credentials_builder> builder, bool is_shard_aware, bool keepalive, std::optional<file_permissions> unix_domain_socket_permissions, bool proxy_protocol, std::function<server&()> get_shard_instance) {
    // Note: We are making the assumption that if builder is provided it will be the same for each
    // invocation, regardless of address etc. In general, only CQL server will call this multiple times,
    // and if TLS, it will use the same cert set.
    // Could hold certs in a map<addr, certs> and ensure separation, but then we will for all
    // current uses of this class create duplicate reloadable certs for shard 0, which is
    // kind of what we wanted to avoid in the first place...
    if (builder && !_credentials) {
        if (!get_shard_instance || this_shard_id() == 0) {
            _credentials = co_await builder->build_reloadable_server_credentials([this, get_shard_instance = std::move(get_shard_instance)](const tls::credentials_builder& b, const std::unordered_set<sstring>& files, std::exception_ptr ep) -> future<> {
                if (ep) {
                    _logger.warn("Exception loading {}: {}", files, ep);
                } else {
                    if (get_shard_instance) {
                        // Push the rebuilt credentials to the per-shard instances.
                        co_await smp::invoke_on_others([&]() {
                            auto& s = get_shard_instance();
                            if (s._credentials) {
                                b.rebuild(*s._credentials);
                            }
                        });
                    }
                    _logger.info("Reloaded {}", files);
                }
            });
        } else {
            _credentials = builder->build_server_credentials();
        }
    }
    listen_options lo;
    lo.reuse_address = true;
    lo.unix_domain_socket_permissions = unix_domain_socket_permissions;
    lo.proxy_protocol = proxy_protocol;
    if (is_shard_aware) {
        lo.lba = server_socket::load_balancing_algorithm::port;
    }
    server_socket ss;
    // TLS is in use exactly when a credentials builder was supplied.
    const bool is_tls = bool(builder);
    try {
        ss = is_tls
            ? seastar::tls::listen(_credentials, addr, lo)
            : seastar::listen(addr, lo);
    } catch (...) {
        throw std::runtime_error(format("{} error while listening on {} -> {}", _server_name, addr, std::current_exception()));
    }
    _listeners.emplace_back(std::move(ss));
    // Each listener's do_accepts loop needs at least 1 unit to accept.
    _conns_cpu_concurrency_semaphore.signal(1);
    _listeners_stopped = when_all(std::move(_listeners_stopped), do_accepts(_listeners.size() - 1, keepalive, addr, is_tls)).discard_result();
}
|
|
|
|
// Accept loop for listener `which`. Before each accept it takes one unit
// from the CPU-concurrency semaphore (unless the limit is "unlimited",
// i.e. uint32_t max); if none is available within a minute, the connection
// is shed: it is still accepted and then immediately shut down so the
// client gets notified instead of hanging. Each accepted connection is
// processed in the background; the loop runs until the server gate closes.
future<> server::do_accepts(int which, bool keepalive, socket_address server_addr, bool is_tls) {
    co_await coroutine::switch_to(get_scheduling_group_for_new_connection());
    while (!_gate.is_closed()) {
        seastar::gate::holder holder(_gate);
        bool shed = false;
        size_t waiters_at_block = 0;
        try {
            // Start with zero units; fill in below if a limit is configured.
            semaphore_units<named_semaphore_exception_factory> units(_conns_cpu_concurrency_semaphore, 0);
            if (_conns_cpu_concurrency != std::numeric_limits<uint32_t>::max()) {
                auto u = try_get_units(_conns_cpu_concurrency_semaphore, 1);
                if (u) {
                    units = std::move(*u);
                } else {
                    _blocked_connections++;
                    waiters_at_block = _conns_cpu_concurrency_semaphore.waiters();
                    try {
                        // Wait up to a minute for a unit before shedding.
                        units = co_await get_units(_conns_cpu_concurrency_semaphore, 1, std::chrono::minutes(1));
                    } catch (const semaphore_timed_out&) {
                        shed = true;
                    }
                }
            }
            accept_result cs_sa = co_await _listeners[which].accept();
            // accept() may complete on a different scheduling group; go back.
            co_await coroutine::switch_to(get_scheduling_group_for_new_connection());
            if (_gate.is_closed()) {
                break;
            }
            auto fd = std::move(cs_sa.connection);
            auto addr = std::move(cs_sa.remote_address);
            fd.set_nodelay(true);
            fd.set_keepalive(keepalive);
            auto conn = make_connection(server_addr, std::move(fd), std::move(addr),
                    _conns_cpu_concurrency_semaphore, std::move(units));
            if (shed) {
                // We establish a connection even during shedding to notify the client;
                // otherwise, they might hang waiting for a response.
                _shed_connections++;
                static thread_local logger::rate_limit rate_limit{std::chrono::seconds(10)};
                _logger.log(log_level::warn, rate_limit,
                    "too many in-flight connection attempts: {}, connection dropped",
                    waiters_at_block);
                conn->shutdown();
                continue;
            }
            conn->_ssl_enabled = is_tls;
            // Move the processing into the background.
            (void)futurize_invoke([this, conn, is_tls] {
                // For TLS connections, record protocol/cipher first;
                // failures there abort processing of this connection.
                return (is_tls
                    ? tls::get_protocol_version(conn->_fd).then([conn](const sstring& protocol) {
                        return tls::get_cipher_suite(conn->_fd).then(
                                [conn, protocol](const sstring& cipher_suite) mutable {
                            conn->_ssl_protocol = protocol;
                            conn->_ssl_cipher_suite = cipher_suite;
                            return make_ready_future<bool>(true);
                        });
                    }).handle_exception([conn](std::exception_ptr ep) {
                        return seastar::make_exception_future<bool>(std::runtime_error(fmt::format("Inspecting TLS connection failed: {}", ep)));
                    })
                    : make_ready_future<bool>(true)
                ).then([conn] (bool ok){
                    // Block while monitoring for lifetime/errors.
                    return ok ? conn->process() : make_ready_future<>();
                }).then_wrapped([this, conn](auto f) {
                    try {
                        f.get();
                    } catch (...) {
                        auto ep = std::current_exception();
                        if (!is_broken_pipe_or_connection_reset(ep)) {
                            // some exceptions are expected if another side closes a connection
                            // or we're shutting down
                            _logger.info("exception while processing connection: {}", ep);
                        }
                    }
                });
            });
        } catch (...) {
            // Expected during shutdown, when abort_accept() makes accept()
            // fail; also covers transient accept errors.
            _logger.debug("accept failed: {}", std::current_exception());
        }
    }
}
|
|
|
|
}
|