Following DynamoDB, Alternator also places a 16 MB limit on the size of a request. Such a limit is necessary to avoid running out of memory - because the AWS message authentication protocol requires reading the entire request into memory before its signature can be verified. Our implementation of this limit used the Seastar HTTP server's content_length_limit feature. However, this Seastar feature is incomplete - it only works when the request uses the Content-Length header, and doesn't do anything if the request doesn't have a Content-Length (it may use chunked encoding, or have no length at all). So malicious users can cause Scylla to OOM by sending a huge request without a Content-Length. So in this patch we stop using the incomplete Seastar feature, and implement the length limit in Scylla in a way that works correctly with or without Content-Length: We read from the input stream and if we go over 16 MB, we generate an error. Because we dropped Seastar's protection against a long Content-Length, we also need to fix a piece of code which used Content-Length to reserve some semaphore units to prevent reading many large requests in parallel. We fix two problems in the code: 1. If Content-Length is over the limit, we shouldn't attempt to reserve semaphore units - this should just be a Payload Too Large error. 2. If Content-Length is missing, the existing code did nothing and only had a TODO saying that it should. In this patch we implement what was suggested in that TODO: We temporarily reserve the whole 16 MB limit, and after reading the actual request, we return part of the reservation according to the real request size. That last fix is important, because typically the largest requests will be BatchWriteItem where a well-written client would want to use chunked encoding, not Content-Length, to avoid materializing the entire request up-front. For such clients, the memory use semaphore did nothing, and now it does the right thing.
Note that this patch does *not* solve issue #12166, which existed with Seastar's length-limiting implementation and still exists in the new in-Scylla length-limiting implementation: The fact that we send an error response in the middle of the request and then close the connection, while the client continues to send the request, can lead to an RST being sent by the server kernel. Usually this will be fine - well-written client libraries will be able to read the response before the RST. But even with a well-written library, with some rare timings the client may get the RST before the response, and will miss the response, and get an empty or partial response or "connection reset by peer". This issue existed before this patch, and still exists, but is probably of minor impact. Fixes #8196 Signed-off-by: Nadav Har'El <nyh@scylladb.com> Closes scylladb/scylladb#23434
119 lines
4.9 KiB
C++
119 lines
4.9 KiB
C++
/*
|
|
* Copyright 2019-present ScyllaDB
|
|
*/
|
|
|
|
/*
|
|
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
|
*/
|
|
|
|
#pragma once
|
|
|
|
#include "alternator/executor.hh"
|
|
#include "utils/scoped_item_list.hh"
|
|
#include <seastar/core/future.hh>
|
|
#include <seastar/core/condition-variable.hh>
|
|
#include <seastar/http/httpd.hh>
|
|
#include <seastar/net/tls.hh>
|
|
#include <optional>
|
|
#include "alternator/auth.hh"
|
|
#include "service/qos/service_level_controller.hh"
|
|
#include "utils/small_vector.hh"
|
|
#include "utils/updateable_value.hh"
|
|
#include <seastar/core/units.hh>
|
|
|
|
struct client_data;
|
|
|
|
namespace alternator {
|
|
|
|
// Shorthand for rjson::chunked_content. In this header it is the type of the
// raw document handed to server::json_parser::parse() and of the content
// argument of server::verify_signature().
using chunked_content = rjson::chunked_content;
|
|
|
|
// Alternator's request server. It holds one httpd::http_server for HTTP and
// one for HTTPS, references to the services it needs (executor, storage
// proxy, gossiper, auth service, service-level controller), and the
// per-request state declared below.
//
// Lifecycle, as far as this header shows: construct, then init() to supply
// the listening address/ports, TLS credentials and tunables, then stop().
// get_client_data() is invoked on each shard separately, so an instance is
// expected to exist per shard (see peering_sharded_service<server>).
class server : public peering_sharded_service<server> {
    // Upper bound, in bytes, on the size of a request body that Alternator
    // accepts. It exists as a safety measure so that a client sending a very
    // large request cannot make Alternator run out of memory. DynamoDB uses
    // the same 16 MB limit.
    static constexpr size_t request_content_length_limit = 16*MB;

    // Signature shared by all API operation handlers stored in _callbacks.
    using alternator_callback = std::function<future<executor::request_return_type>(executor&, executor::client_state&,
            tracing::trace_state_ptr, service_permit, rjson::value, std::unique_ptr<http::request>)>;
    // Lookup table from a string key to its handler.
    // NOTE(review): the key is presumably the API operation name - confirm
    // against the constructor in server.cc.
    using alternator_callbacks_map = std::unordered_map<std::string_view, alternator_callback>;

    // NOTE: the declaration order of the data members below is significant,
    // because it is also their initialization order. Do not reorder.
    httpd::http_server _http_server;
    httpd::http_server _https_server;
    executor& _executor;
    service::storage_proxy& _proxy;
    gms::gossiper& _gossiper;
    auth::service& _auth_service;
    qos::service_level_controller& _sl_controller;

    key_cache _key_cache;
    // Live-updateable settings; init() takes parameters of the same types and
    // names (without the leading underscore).
    utils::updateable_value<bool> _enforce_authorization;
    utils::updateable_value<uint64_t> _max_users_query_size_in_trace_output;
    // References to the http_server objects that are in use. At most two can
    // be stored inline (the HTTP one and the HTTPS one).
    utils::small_vector<std::reference_wrapper<seastar::httpd::http_server>, 2> _enabled_servers;
    // Gate for in-flight requests.
    // NOTE(review): presumably closed by stop() to wait for them - confirm.
    named_gate _pending_requests;
    // In some places we need a CQL updateable_timeout_config object, even
    // though it is not really relevant for Alternator, which defines its own
    // timeouts separately. Keeping it as a member lets us create it only once.
    updateable_timeout_config _timeout_config;

    alternator_callbacks_map _callbacks;

    // Non-owning pointer to a semaphore supplied by the caller of init()
    // (see its `memory_limiter` parameter).
    // NOTE(review): presumably used to bound the memory consumed by requests
    // being read in parallel - confirm in server.cc.
    // Explicitly null until it is set, so that the pointer never holds an
    // indeterminate value between construction and init().
    semaphore* _memory_limiter = nullptr;
    utils::updateable_value<uint32_t> _max_concurrent_requests;

    ::shared_ptr<seastar::tls::server_credentials> _credentials;

    // Helper that turns a raw request document (chunked_content) into a
    // parsed rjson::value.
    // NOTE(review): the members suggest a single background parsing task
    // (_run_parse_json_thread) fed one document at a time (_parsing_sem is
    // created with one unit) and coordinated through the two condition
    // variables; the exact protocol lives in server.cc - confirm there.
    class json_parser {
        // NOTE(review): presumably the document size above which parsing is
        // done in a way that can yield - confirm in server.cc.
        static constexpr size_t yieldable_parsing_threshold = 16*KB;
        chunked_content _raw_document;
        rjson::value _parsed_document;
        std::exception_ptr _current_exception;
        semaphore _parsing_sem{1};
        condition_variable _document_waiting;
        condition_variable _document_parsed;
        abort_source _as;
        future<> _run_parse_json_thread;
    public:
        json_parser();
        // The chunked_content is taken by rvalue reference on purpose: moving
        // it into parse() allows parse() to free each chunk as soon as it has
        // been parsed, so when chunks are relatively small we never need to
        // hold the sum of the unparsed and parsed sizes in memory.
        future<rjson::value> parse(chunked_content&& content);
        future<> stop();
    };
    json_parser _json_parser;

    // The server maintains a list of ongoing requests - those currently being
    // handled by handle_api_request(). The list is consulted by
    // get_client_data(), which is called when the "system.clients" virtual
    // table is read.
    struct ongoing_request {
        socket_address _client_address;
        sstring _user_agent;
        sstring _username;
        scheduling_group _scheduling_group;
        bool _is_https;
        // Builds a client_data value describing this request.
        client_data make_client_data() const;
    };
    utils::scoped_item_list<ongoing_request> _ongoing_requests;

public:
    server(executor& executor, service::storage_proxy& proxy, gms::gossiper& gossiper, auth::service& service, qos::service_level_controller& sl_controller);

    // Configures the server. `port` and `https_port` are each optional, as
    // are the TLS credentials; the remaining arguments correspond to the
    // members of the same name.
    future<> init(net::inet_address addr, std::optional<uint16_t> port, std::optional<uint16_t> https_port, std::optional<tls::credentials_builder> creds,
            utils::updateable_value<bool> enforce_authorization, utils::updateable_value<uint64_t> max_users_query_size_in_trace_output,
            semaphore* memory_limiter, utils::updateable_value<uint32_t> max_concurrent_requests);
    future<> stop();
    // get_client_data() is called (on each shard separately) when the virtual
    // table "system.clients" is read. It is expected to generate a list of
    // the clients connected to this server (on this shard). This function is
    // called by alternator::controller::get_client_data().
    future<utils::chunked_vector<client_data>> get_client_data();
private:
    void set_routes(seastar::httpd::routes& r);
    // If verification succeeds, returns the authenticated user's username.
    future<std::string> verify_signature(const seastar::http::request&, const chunked_content&);
    // Handles a single API request; requests being handled here are the ones
    // tracked in _ongoing_requests.
    future<executor::request_return_type> handle_api_request(std::unique_ptr<http::request> req);
};
|
|
|
|
}
|
|
|