// scylladb/transport/generic_server.cc

/*
 * Copyright (C) 2021-present ScyllaDB
 */

/*
 * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
 */

#include "generic_server.hh"

#include <exception>
#include <fmt/ranges.h>
#include <seastar/core/when_all.hh>
#include <seastar/coroutine/parallel_for_each.hh>
#include <seastar/core/reactor.hh>
#include <seastar/core/smp.hh>
#include <seastar/coroutine/maybe_yield.hh>
#include <seastar/coroutine/switch_to.hh>
#include <utility>

namespace generic_server {

/// Data source wrapper that hands the connection's CPU-concurrency units back
/// to the semaphore for the duration of every read and takes them back once
/// the read resolves.
///
/// While `_cpu_concurrency.stopped` is set the wrapper forwards straight to the
/// wrapped source and does not touch the semaphore.
class counted_data_source_impl : public data_source_impl {
    data_source _ds;
    connection::cpu_concurrency_t& _cpu_concurrency;

    /// Runs `fun` (which starts an operation on the wrapped source) with the
    /// connection's units released; re-acquires the same number of units when
    /// the returned future resolves, whether it succeeded or failed.
    template <typename F>
    future<temporary_buffer<char>> invoke_with_counting(F&& fun) {
        if (_cpu_concurrency.stopped) {
            return fun();
        }
        size_t units = _cpu_concurrency.units.count();
        _cpu_concurrency.units.return_all();
        return fun().finally([this, units] () {
            // Counting may have been stopped while the read was pending
            // (connection::on_connection_ready() sets `stopped` and returns all
            // units). Nothing releases units after that point, so re-acquiring
            // here would keep them away from the semaphore for the rest of the
            // connection's lifetime.
            if (_cpu_concurrency.stopped) {
                return;
            }
            _cpu_concurrency.units.adopt(consume_units(_cpu_concurrency.semaphore, units));
        });
    }
public:
    counted_data_source_impl(data_source ds, connection::cpu_concurrency_t& cpu_concurrency) : _ds(std::move(ds)), _cpu_concurrency(cpu_concurrency) {}
    virtual ~counted_data_source_impl() = default;
    /// Reads the next buffer from the wrapped source, with counting.
    virtual future<temporary_buffer<char>> get() override {
        return invoke_with_counting([this] {return _ds.get();});
    }
    /// Skips `n` bytes on the wrapped source, with counting.
    virtual future<temporary_buffer<char>> skip(uint64_t n) override {
        return invoke_with_counting([this, n] {return _ds.skip(n);});
    }
    /// Closes the wrapped source; the semaphore is not involved.
    virtual future<> close() override {
        return _ds.close();
    }
};

/// Data sink wrapper that hands the connection's CPU-concurrency units back to
/// the semaphore for the duration of every write/flush and takes them back
/// once the operation resolves.
///
/// While `_cpu_concurrency.stopped` is set the wrapper forwards straight to the
/// wrapped sink and does not touch the semaphore.
class counted_data_sink_impl : public data_sink_impl {
    data_sink _ds;
    connection::cpu_concurrency_t& _cpu_concurrency;

    /// Runs `fun` (which starts an operation on the wrapped sink) with the
    /// connection's units released; re-acquires the same number of units when
    /// the returned future resolves, whether it succeeded or failed.
    template <typename F>
    future<> invoke_with_counting(F&& fun) {
        if (_cpu_concurrency.stopped) {
            return fun();
        }
        size_t units = _cpu_concurrency.units.count();
        _cpu_concurrency.units.return_all();
        return fun().finally([this, units] () {
            // Counting may have been stopped while the operation was pending
            // (connection::on_connection_ready() sets `stopped` and returns all
            // units). Nothing releases units after that point, so re-acquiring
            // here would keep them away from the semaphore for the rest of the
            // connection's lifetime.
            if (_cpu_concurrency.stopped) {
                return;
            }
            _cpu_concurrency.units.adopt(consume_units(_cpu_concurrency.semaphore, units));
        });
    }
public:
    counted_data_sink_impl(data_sink ds, connection::cpu_concurrency_t& cpu_concurrency) : _ds(std::move(ds)), _cpu_concurrency(cpu_concurrency) {}
    virtual ~counted_data_sink_impl() = default;
    /// Buffer allocation is delegated unchanged.
    virtual temporary_buffer<char> allocate_buffer(size_t size) override {
        return _ds.allocate_buffer(size);
    }
    /// Writes `data` to the wrapped sink, with counting. Shares the
    /// release/re-acquire logic with flush() instead of duplicating it.
    virtual future<> put(std::span<temporary_buffer<char>> data)  override {
        return invoke_with_counting([this, data] () mutable {
            return _ds.put(std::move(data));
        });
    }
    /// Flushes the wrapped sink, with counting.
    virtual future<> flush() override {
        return invoke_with_counting([this] (void) mutable {
            return _ds.flush();
        });
    }
    /// Closes the wrapped sink; the semaphore is not involved.
    virtual future<> close() override {
        return _ds.close();
    }
    virtual size_t buffer_size() const noexcept override {
        return _ds.buffer_size();
    }
    virtual bool can_batch_flushes() const noexcept override {
        return _ds.can_batch_flushes();
    }
    virtual void on_batch_flush_error() noexcept override {
        _ds.on_batch_flush_error();
    }
};

// Builds a connection around an accepted socket.
//
// `sem` and `initial_sem_units` seed the per-connection CPU-concurrency state
// (created with `stopped == false`), which is shared by reference with the
// counting wrappers installed around the socket's input and output streams.
// The connection registers itself in the server's connection list and bumps
// the server's total-connections counter.
//
// NOTE(review): `_read_buf`/`_write_buf` are built from `_fd` and
// `_conns_cpu_concurrency`, so this relies on those members being declared
// earlier in the class — confirm against the header.
connection::connection(server& server, connected_socket&& fd, named_semaphore& sem, semaphore_units<named_semaphore_exception_factory> initial_sem_units)
    : _conns_cpu_concurrency{sem, std::move(initial_sem_units), false}
    , _server{server}
    // Make this connection visible to server::for_each_gently().
    , _connections_list_entry(_server._connections_list.emplace(*this))
    , _fd{std::move(fd)}
    // Detach the raw socket streams and wrap them in the counting impls.
    , _read_buf(data_source(std::make_unique<counted_data_source_impl>(_fd.input().detach(), _conns_cpu_concurrency)))
    , _write_buf(output_stream<char>(data_sink(std::make_unique<counted_data_sink_impl>(_fd.output().detach(), _conns_cpu_concurrency)), 8192, output_stream_options{.batch_flushes = true}))
    , _pending_requests_gate("generic_server::connection")
    // Presumably keeps the server's gate open for as long as this connection exists.
    , _hold_server(_server._gate)
{
    ++_server._total_connections;
}

// Nothing to do beyond destroying the members; defined out of line.
connection::~connection() = default;

connection::execute_under_tenant_type
connection::no_tenant() {
    // Pass-through executor: invokes the process loop directly, with no
    // scheduling group games.
    return [] (connection_process_loop run_loop) {
        return run_loop();
    };
}

// Installs `exec` as the executor used by process() for subsequent iterations
// and raises the flag that makes process_until_tenant_switch() leave its loop.
void connection::switch_tenant(execute_under_tenant_type exec) {
    _tenant_switch = true;
    _execute_under_current_tenant = std::move(exec);
}

// Applies `fn` to every connection registered in `_connections_list`,
// delegating the iteration to the list's own for_each_gently().
future<> server::for_each_gently(noncopyable_function<void(connection&)> fn) {
    auto visit = [fn = std::move(fn)] (std::reference_wrapper<connection> entry) {
        connection& conn = entry;
        fn(conn);
    };
    return _connections_list.for_each_gently(std::move(visit));
}

// Returns true when `ep` denotes the peer going away: EPIPE/ECONNRESET from
// the system category, a premature TLS termination, or a TLS push/pull error
// whose nested exception is itself one of those.
//
// A null `ep` yields false. std::rethrow_exception() has undefined behaviour
// for a null pointer, and the recursive call below passes
// std::nested_exception::nested_ptr(), which can be null.
static bool is_broken_pipe_or_connection_reset(std::exception_ptr ep) {
    if (!ep) {
        return false;
    }
    try {
        std::rethrow_exception(ep);
    } catch (const std::system_error& e) {
        auto& code = e.code();
        if (code.category() == std::system_category() && (code.value() == EPIPE || code.value() == ECONNRESET)) {
            return true;
        }
        if (code.category() == tls::error_category()) {
            // Typically ECONNRESET
            if (code.value() == tls::ERROR_PREMATURE_TERMINATION) {
                return true;
            }
            // If we got an actual EPIPE in push/pull of gnutls, it is _not_ translated
            // to anything more useful than generic push/pull error. Need to look at
            // nested exception.
            if (code.value() == tls::ERROR_PULL || code.value() == tls::ERROR_PUSH) {
                if (auto p = dynamic_cast<const std::nested_exception*>(std::addressof(e))) {
                    return is_broken_pipe_or_connection_reset(p->nested_ptr());
                }
            }
        }
        return false;
    } catch (...) {
        // Any other exception type is not a broken pipe / connection reset.
    }
    return false;
}

// Clears the tenant-switch flag, then processes requests one at a time until
// the input reaches EOF or switch_tenant() raises the flag again.
future<> connection::process_until_tenant_switch() {
    _tenant_switch = false;
    auto finished = [this] {
        return _read_buf.eof() || _tenant_switch;
    };
    auto handle_one = [this] {
        return process_request();
    };
    return do_until(std::move(finished), std::move(handle_one));
}

// Main loop of a connection.
//
// Inside `_pending_requests_gate`, repeatedly runs process_until_tenant_switch()
// through the currently installed tenant executor until the input reaches EOF;
// the executor is re-read on every iteration, so a switch_tenant() call takes
// effect on the next one. The outcome of the loop (success or failure) is
// handed to handle_error().
//
// Cleanup always runs afterwards: the gate is closed, `_ready_to_respond` is
// awaited, and the output stream is closed.
future<> connection::process()
{
    return with_gate(_pending_requests_gate, [this] {
        return do_until([this] {
            return _read_buf.eof();
        }, [this] {
            return _execute_under_current_tenant([this] {
                return process_until_tenant_switch();
            });
        }).then_wrapped([this] (future<> f) {
            // handle_error() is defined elsewhere; it receives the loop's
            // future whether it resolved or failed.
            handle_error(std::move(f));
        });
    }).finally([this] {
        // Presumably waits for requests that still hold the gate before
        // looking at the response chain — confirm against process_request().
        return _pending_requests_gate.close().then([this] {
            return _ready_to_respond.handle_exception([] (std::exception_ptr ep) {
                if (is_broken_pipe_or_connection_reset(ep)) {
                    // expected if another side closes a connection or we're shutting down
                    return;
                }
                // Anything else is propagated to the caller of process().
                std::rethrow_exception(ep);
            }).finally([this] {
                 // The output stream is closed regardless of the outcome above.
                 return _write_buf.close();
            });
        });
    });
}

// Stops CPU-concurrency accounting for this connection: marks the shared state
// as stopped (so the counting stream wrappers become pass-through) and gives
// every unit still held back to the semaphore.
void connection::on_connection_ready()
{
    auto& concurrency = _conns_cpu_concurrency;
    concurrency.stopped = true;
    concurrency.units.return_all();
}

// Shuts down both directions of the socket, input first. Failures are logged
// by the helpers; their boolean results are intentionally ignored here.
void connection::shutdown()
{
    static_cast<void>(shutdown_input());
    static_cast<void>(shutdown_output());
}

// Shuts down the receive side of the socket.
// Returns true on success; on failure logs a warning and returns false.
bool connection::shutdown_input() {
    try {
        _fd.shutdown_input();
        return true;
    } catch (...) {
        _server._logger.warn("Error shutting down input side of connection {}->{}, exception: {}", _fd.remote_address(), _fd.local_address(), std::current_exception());
    }
    return false;
}

// Shuts down the transmit side of the socket.
// Returns true on success; on failure logs a warning and returns false.
bool connection::shutdown_output() {
    try {
        _fd.shutdown_output();
        return true;
    } catch (...) {
        _server._logger.warn("Error shutting down output side of connection {}->{}, exception: {}", _fd.remote_address(), _fd.local_address(), std::current_exception());
    }
    return false;
}

// Constructs a server that is not yet listening.
//
// `cfg.uninitialized_connections_semaphore_cpu_concurrency` drives the size of
// the semaphore passed to every new connection; an observer keeps the
// semaphore in step with later config updates. `cfg.shutdown_timeout_in_seconds`
// bounds how long shutdown() waits for connections to stop.
//
// NOTE(review): the observer callback reads `_prev_conns_cpu_concurrency` and
// `_conns_cpu_concurrency_semaphore`, which appear later in this initializer
// list. This is only safe if observe() does not invoke the callback
// immediately and/or the members are declared in a suitable order — confirm
// against the header.
server::server(const sstring& server_name, logging::logger& logger, config cfg)
    : _server_name{server_name}
    , _logger{logger}
    , _gate("generic_server::server")
    , _conns_cpu_concurrency(cfg.uninitialized_connections_semaphore_cpu_concurrency)
    , _conns_cpu_concurrency_observer(_conns_cpu_concurrency.observe([this] (const uint32_t &concurrency) {
        // No-op when the value did not actually change.
        if (concurrency == _prev_conns_cpu_concurrency) {
            return;
        }
        _logger.info("Updating uninitialized_connections_semaphore_cpu_concurrency from {} to {} due to config update", _prev_conns_cpu_concurrency, concurrency);

        // Apply only the difference, so units currently held by connections
        // stay accounted for.
        if (concurrency > _prev_conns_cpu_concurrency) {
            _conns_cpu_concurrency_semaphore.signal(concurrency - _prev_conns_cpu_concurrency);
        } else {
            _conns_cpu_concurrency_semaphore.consume(_prev_conns_cpu_concurrency - concurrency);
        }
        _prev_conns_cpu_concurrency = concurrency;
    }))
    , _prev_conns_cpu_concurrency(_conns_cpu_concurrency)
    // Total semaphore capacity is concurrency - 1 + nr_listeners;
    // start with concurrency - 1, each listen() adds 1.
    // NOTE(review): if the configured concurrency can be 0 and the member is
    // unsigned, `- 1` wraps to a very large count — confirm the value is
    // validated elsewhere.
    , _conns_cpu_concurrency_semaphore(_prev_conns_cpu_concurrency - 1, named_semaphore_exception_factory{"connections cpu concurrency semaphore"})
    , _shutdown_timeout(std::chrono::seconds{cfg.shutdown_timeout_in_seconds})
{
}

// Runs shutdown() and then waits for the future recorded in
// `_all_connections_stopped`, leaving a ready future in its place so that a
// repeated stop() has nothing left to wait for.
future<> server::stop() {
    co_await shutdown();
    auto connections_done = std::exchange(_all_connections_stopped, make_ready_future<>());
    co_await std::move(connections_done);
}

// Shuts the server down in stages:
//   1. close the gate and abort every listener's accept,
//   2. shut down the RX side of every connection,
//   3. wait (up to `_shutdown_timeout`) for the gate to drain,
//   4. fully shut down whatever connections remain and request abort.
// Returns immediately if the gate is already closed.
future<> server::shutdown() {
    if (_gate.is_closed()) {
        co_return;
    }

    // Start closing the gate and keep its future shareable: stop() waits on
    // one copy, the timed wait below uses another.
    shared_future<> connections_stopped{_gate.close()};
    _all_connections_stopped = connections_stopped.get_future();

    // Stop all listeners.
    const size_t nr_total = _listeners.size();
    size_t nr = 0;
    _logger.debug("abort accept nr_total={}", nr_total);
    for (auto&& listener : _listeners) {
        listener.abort_accept();
        ++nr;
        _logger.debug("abort accept {} out of {} done", nr, nr_total);
    }
    co_await std::move(_listeners_stopped);

    // Shutdown RX side of the connections, so no new requests could be received.
    // Leave the TX side so the responses to ongoing requests could be sent.
    _logger.debug("Shutting down RX side of {} connections", _connections_list.size());
    co_await for_each_gently([] (connection& conn) {
        if (conn.shutdown_input()) {
            return;
        }
        // If failed to shutdown the input side, then attempt to shutdown the output side which should do a complete shutdown of the connection.
        conn.shutdown_output();
    });

    // Wait for the remaining requests to finish.
    _logger.debug("Waiting for connections to stop");
    try {
        co_await connections_stopped.get_future(seastar::lowres_clock::now() + _shutdown_timeout);
    } catch (const timed_out_error&) {
        _logger.info("Timed out waiting for connections shutdown.");
    }

    // Either all requests stopped or a timeout occurred, do the full shutdown of the connections.
    const auto nr_conn_total = _connections_list.size();
    size_t nr_conn = 0;
    _logger.debug("shutdown connection nr_total={}", nr_conn_total);
    co_await for_each_gently([this, &nr_conn, nr_conn_total] (connection& conn) {
        conn.shutdown();
        ++nr_conn;
        _logger.debug("shutdown connection {} out of {} done", nr_conn, nr_conn_total);
    });
    _abort_source.request_abort();
}

// Starts listening on `addr` and launches the accept loop for the new
// listener. When `builder` is set the listener is a TLS one; the server
// credentials are built once, on the first such call, and reused afterwards.
// Throws std::runtime_error if creating the listening socket fails.
future<>
server::listen(socket_address addr, std::shared_ptr<seastar::tls::credentials_builder> builder, bool is_shard_aware, bool keepalive, std::optional<file_permissions> unix_domain_socket_permissions, bool proxy_protocol, std::function<server&()> get_shard_instance) {
    // Note: We are making the assumption that if builder is provided it will be the same for each
    // invocation, regardless of address etc. In general, only CQL server will call this multiple times,
    // and if TLS, it will use the same cert set.
    // Could hold certs in a map<addr, certs> and ensure separation, but then we will for all
    // current uses of this class create duplicate reloadable certs for shard 0, which is
    // kind of what we wanted to avoid in the first place...
    if (builder && !_credentials) {
        // Reloadable credentials are built when no per-shard accessor was
        // given, or on shard 0; other shards get plain credentials that the
        // reload callback below rebuilds.
        const bool build_reloadable = !get_shard_instance || this_shard_id() == 0;
        if (build_reloadable) {
            _credentials = co_await builder->build_reloadable_server_credentials([this, get_shard_instance = std::move(get_shard_instance)](const tls::credentials_builder& b, const std::unordered_set<sstring>& files, std::exception_ptr ep) -> future<> {
                if (ep) {
                    _logger.warn("Exception loading {}: {}", files, ep);
                    co_return;
                }
                if (get_shard_instance) {
                    // Propagate the reloaded material to the other shards' credentials.
                    co_await smp::invoke_on_others([&]() {
                        auto& shard_server = get_shard_instance();
                        if (shard_server._credentials) {
                            b.rebuild(*shard_server._credentials);
                        }
                    });
                }
                _logger.info("Reloaded {}", files);
            });
        } else {
            _credentials = builder->build_server_credentials();
        }
    }

    listen_options lo;
    lo.reuse_address = true;
    lo.unix_domain_socket_permissions = unix_domain_socket_permissions;
    lo.proxy_protocol = proxy_protocol;
    if (is_shard_aware) {
        lo.lba = server_socket::load_balancing_algorithm::port;
    }

    // TLS is used exactly when a credentials builder was supplied.
    const bool is_tls = static_cast<bool>(builder);
    server_socket ss;
    try {
        if (is_tls) {
            ss = seastar::tls::listen(_credentials, addr, lo);
        } else {
            ss = seastar::listen(addr, lo);
        }
    } catch (...) {
        throw std::runtime_error(format("{} error while listening on {} -> {}", _server_name, addr, std::current_exception()));
    }
    _listeners.emplace_back(std::move(ss));
    // Each listener's do_accepts loop needs at least 1 unit to accept.
    _conns_cpu_concurrency_semaphore.signal(1);
    // Chain this listener's accept loop onto the future awaited by shutdown().
    _listeners_stopped = when_all(std::move(_listeners_stopped), do_accepts(_listeners.size() - 1, keepalive, addr, is_tls)).discard_result();
}

// Accept loop for listener number `which`.
//
// Runs until the server gate is closed. Each iteration:
//   - holds the server gate,
//   - unless the concurrency setting is the uint32_t maximum, obtains one unit
//     from the connections CPU-concurrency semaphore, waiting up to one minute
//     if none is immediately available (a timeout marks the next connection
//     for shedding),
//   - accepts a connection and wraps it via make_connection(),
//   - either sheds it (shut down right away) or starts processing it in the
//     background.
// Exceptions raised inside an iteration are logged at debug level and the
// loop continues.
future<> server::do_accepts(int which, bool keepalive, socket_address server_addr, bool is_tls) {
    co_await coroutine::switch_to(get_scheduling_group_for_new_connection());
    while (!_gate.is_closed()) {
        seastar::gate::holder holder(_gate);
        bool shed = false;
        // Number of semaphore waiters observed when this iteration had to
        // block; only used in the shedding log message.
        size_t waiters_at_block = 0;
        try {
            // Starts with zero units; stays that way when limiting is
            // disabled or when the wait below times out.
            semaphore_units<named_semaphore_exception_factory> units(_conns_cpu_concurrency_semaphore, 0);
            if (_conns_cpu_concurrency != std::numeric_limits<uint32_t>::max()) {
                auto u = try_get_units(_conns_cpu_concurrency_semaphore, 1);
                if (u) {
                    units = std::move(*u);
                } else {
                    _blocked_connections++;
                    waiters_at_block = _conns_cpu_concurrency_semaphore.waiters();
                    try {
                        units = co_await get_units(_conns_cpu_concurrency_semaphore, 1, std::chrono::minutes(1));
                    } catch (const semaphore_timed_out&) {
                        shed = true;
                    }
                }
            }
            accept_result cs_sa = co_await _listeners[which].accept();
            // Switch to the new-connection scheduling group again after the accept.
            co_await coroutine::switch_to(get_scheduling_group_for_new_connection());
            if (_gate.is_closed()) {
                break;
            }
            auto fd = std::move(cs_sa.connection);
            auto addr = std::move(cs_sa.remote_address);
            fd.set_nodelay(true);
            fd.set_keepalive(keepalive);
            // The connection takes over whatever units were obtained above.
            auto conn = make_connection(server_addr, std::move(fd), std::move(addr),
                    _conns_cpu_concurrency_semaphore, std::move(units));
            if (shed) {
                // We establish a connection even during shedding to notify the client;
                // otherwise, they might hang waiting for a response.
                _shed_connections++;
                static thread_local logger::rate_limit rate_limit{std::chrono::seconds(10)};
                _logger.log(log_level::warn, rate_limit,
                        "too many in-flight connection attempts: {}, connection dropped",
                        waiters_at_block);
                conn->shutdown();
                continue;
            }
            conn->_ssl_enabled = is_tls;
            // Move the processing into the background.
            (void)futurize_invoke([this, conn, is_tls] {
                // For TLS, record the negotiated protocol version and cipher
                // suite on the connection before processing starts; a failure
                // to obtain them becomes a std::runtime_error.
                return (is_tls
                    ? tls::get_protocol_version(conn->_fd).then([conn](const sstring& protocol) {
                            return tls::get_cipher_suite(conn->_fd).then(
                                [conn, protocol](const sstring& cipher_suite) mutable {
                                    conn->_ssl_protocol = protocol;
                                    conn->_ssl_cipher_suite = cipher_suite;
                                    return make_ready_future<bool>(true);
                                });
                        }).handle_exception([conn](std::exception_ptr ep) {
                            return seastar::make_exception_future<bool>(std::runtime_error(fmt::format("Inspecting TLS connection failed: {}", ep)));
                        })
                    : make_ready_future<bool>(true)
                ).then([conn] (bool ok){
                    // Block while monitoring for lifetime/errors.
                    return ok ? conn->process() : make_ready_future<>();
                }).then_wrapped([this, conn](auto f) {
                    // Consume the result so the background future never
                    // carries an unhandled exception.
                    try {
                        f.get();
                    } catch (...) {
                        auto ep = std::current_exception();
                        if (!is_broken_pipe_or_connection_reset(ep)) {
                            // some exceptions are expected if another side closes a connection
                            // or we're shutting down
                            _logger.info("exception while processing connection: {}", ep);
                        }
                    }
                });
            });
        } catch (...) {
            _logger.debug("accept failed: {}", std::current_exception());
        }
    }
}

}