diff --git a/alternator/error.hh b/alternator/error.hh index a063d59399..b507649d22 100644 --- a/alternator/error.hh +++ b/alternator/error.hh @@ -94,6 +94,9 @@ public: static api_error internal(std::string msg) { return api_error("InternalServerError", std::move(msg), http::reply::status_type::internal_server_error); } + static api_error payload_too_large(std::string msg) { + return api_error("PayloadTooLarge", std::move(msg), status_type::payload_too_large); + } // Provide the "std::exception" interface, to make it easier to print this // exception in log messages. Note that this function is *not* used to diff --git a/alternator/server.cc b/alternator/server.cc index ddfc0b2786..4a4ae64b75 100644 --- a/alternator/server.cc +++ b/alternator/server.cc @@ -462,25 +462,81 @@ static tracing::trace_state_ptr maybe_trace_query(service::client_state& client_ return trace_state; } +// This read_entire_stream() is similar to Seastar's read_entire_stream() +// which reads the given content_stream until its end into non-contiguous +// memory. The difference is that this implementation takes an extra length +// limit, and throws an error if we read more than this limit. +// This length-limited variant would not have been needed if Seastar's HTTP +// server's set_content_length_limit() worked in every case, but unfortunately +// it does not - it only works if the request has a Content-Length header (see +// issue #8196). In contrast this function can limit the request's length no +// matter how it's encoded. We need this limit to protect Alternator from +// oversized requests that can deplete memory. +static future +read_entire_stream(input_stream& inp, size_t length_limit) { + chunked_content ret; + // We try to read length_limit + 1 bytes, so that we can throw an + // exception if we managed to read more than length_limit. 
+ ssize_t remain = length_limit + 1; + do { + temporary_buffer buf = co_await inp.read_up_to(remain); + if (buf.empty()) { + break; + } + remain -= buf.size(); + ret.push_back(std::move(buf)); + } while (remain > 0); + // If we read the full length_limit + 1 bytes, we went over the limit: + if (remain <= 0) { + // By throwing an error here, we may send a reply (the error message) + // without having read the full request body. Seastar's httpd will + // realize that we have not read the entire content stream, and + // correctly mark the connection unreusable, i.e., close it. + // This means we are currently exposed to issue #12166 (caused by + // Seastar issue 1325), where the client may get an RST instead of + // a FIN, and may rarely get a "Connection reset by peer" before + // reading the error we send. + throw api_error::payload_too_large(fmt::format("Request content length limit of {} bytes exceeded", length_limit)); + } + co_return ret; +} + future server::handle_api_request(std::unique_ptr req) { _executor._stats.total_operations++; sstring target = req->get_header("X-Amz-Target"); // target is DynamoDB API version followed by a dot '.' and operation type (e.g. CreateTable) auto dot = target.find('.'); std::string_view op = (dot == sstring::npos) ? std::string_view() : std::string_view(target).substr(dot+1); + if (req->content_length > request_content_length_limit) { + // If we have a Content-Length header and know the request will be too + // long, we don't need to wait for read_entire_stream() below to + // discover it. And we definitely mustn't try to get_units() below + // for such a size. + co_return api_error::payload_too_large(fmt::format("Request content length limit of {} bytes exceeded", request_content_length_limit)); + } // JSON parsing can allocate up to roughly 2x the size of the raw // document, + a couple of bytes for maintenance. - // TODO: consider the case where req->content_length is missing. 
Maybe - // we need to take the content_length_limit and return some of the units - // when we finish read_content_and_verify_signature? - size_t mem_estimate = req->content_length * 2 + 8000; + // If the Content-Length of the request is not available, we assume + // the largest possible request (request_content_length_limit, i.e., 16 MB) + // and after reading the request we return_units() the excess. + size_t mem_estimate = (req->content_length ? req->content_length : request_content_length_limit) * 2 + 8000; auto units_fut = get_units(*_memory_limiter, mem_estimate); if (_memory_limiter->waiters()) { ++_executor._stats.requests_blocked_memory; } auto units = co_await std::move(units_fut); SCYLLA_ASSERT(req->content_stream); - chunked_content content = co_await util::read_entire_stream(*req->content_stream); + chunked_content content = co_await read_entire_stream(*req->content_stream, request_content_length_limit); + // If the request had no Content-Length, we reserved too many units + // so need to return some + if (req->content_length == 0) { + size_t content_length = 0; + for (const auto& chunk : content) { + content_length += chunk.size(); + } + size_t new_mem_estimate = content_length * 2 + 8000; + units.return_units(mem_estimate - new_mem_estimate); + } auto username = co_await verify_signature(*req, content); // As long as the system_clients_entry object is alive, this request will // be visible in the "system.clients" virtual table. 
When requested, this @@ -659,14 +715,12 @@ future<> server::init(net::inet_address addr, std::optional port, std: if (port) { set_routes(_http_server._routes); - _http_server.set_content_length_limit(server::content_length_limit); _http_server.set_content_streaming(true); _http_server.listen(socket_address{addr, *port}).get(); _enabled_servers.push_back(std::ref(_http_server)); } if (https_port) { set_routes(_https_server._routes); - _https_server.set_content_length_limit(server::content_length_limit); _https_server.set_content_streaming(true); if (this_shard_id() == 0) { diff --git a/alternator/server.hh b/alternator/server.hh index 2e2ef595fb..d966f24095 100644 --- a/alternator/server.hh +++ b/alternator/server.hh @@ -28,7 +28,11 @@ namespace alternator { using chunked_content = rjson::chunked_content; class server : public peering_sharded_service { - static constexpr size_t content_length_limit = 16*MB; + // The maximum size of a request body that Alternator will accept, + // in bytes. This is a safety measure to prevent Alternator from + // running out of memory when a client sends a very large request. + // DynamoDB also has the same limit set to 16 MB. + static constexpr size_t request_content_length_limit = 16*MB; using alternator_callback = std::function(executor&, executor::client_state&, tracing::trace_state_ptr, service_permit, rjson::value, std::unique_ptr)>; using alternator_callbacks_map = std::unordered_map; diff --git a/test/alternator/test_manual_requests.py b/test/alternator/test_manual_requests.py index c509cb3230..d7526faec9 100644 --- a/test/alternator/test_manual_requests.py +++ b/test/alternator/test_manual_requests.py @@ -110,8 +110,15 @@ def test_too_large_request(dynamodb, test_table): # 1. An over-long request is rejected no matter if it is sent using a # Content-Length header or chunked encoding (reproduces issue #8196). # 2. 
The client should be able to recognize this error as a 413 error, not -# some I/O error like broken pipe (reproduces issue #8195). -@pytest.mark.xfail(reason="issue #8196, #12166") +# some I/O error like broken pipe. Reproduces issue #8195 and, rarely, +# #12166. +# +# Because issue #12166 is still open, currently we can (in theory, but +# almost never in practice) get a "connection reset by peer" instead of a +# clean 413 reply if the packets get reordered and the RST arrives before +# the reply. So we accept this failure mode too to avoid test flakiness. +# When #12166 is fixed, we should stop allowing that failure mode. + def test_too_large_request_chunked(dynamodb, test_table): if Version(urllib3.__version__) < Version('1.26'): pytest.skip("urllib3 before 1.26.0 threw broken pipe and did not read response and cause issue #8195. Fixed by pull request urllib3/urllib3#1524") @@ -121,20 +128,32 @@ def test_too_large_request_chunked(dynamodb, test_table): '{"TableName": "' + test_table.name + '", ' + spaces + '"Item": {"p": {"S": "x"}, "c": {"S": "x"}}}') def generator(s): yield s - response = requests.post(req.url, headers=req.headers, data=generator(req.body), verify=False) + try: + response = requests.post(req.url, headers=req.headers, data=generator(req.body), verify=False) + # Until #12166 is fixed, we need this except. See comment above why. + except requests.exceptions.ConnectionError as e: + return # In issue #8196, Alternator did not recognize the request is too long # because it uses chunked encoding instead of Content-Length, so the # request succeeded, and the status_code was 200 instead of 413. assert response.status_code == 413 -@pytest.mark.xfail(reason="issue #12166. 
Note that only fails very rairly, will usually xpass") -def test_too_large_request_content_length(dynamodb, test_table): +# 17 MB is enough to verify that Alternator set a 16 MB limit on request +# length (as DynamoDB does), but let's also try a bigger size, which +# at some point during the development caused us to reserve too much memory +# and hang. +@pytest.mark.parametrize("mb", [17, 50]) +def test_too_large_request_content_length(dynamodb, test_table, mb): if Version(urllib3.__version__) < Version('1.26'): pytest.skip("urllib3 before 1.26.0 threw broken pipe and did not read response and cause issue #8195. Fixed by pull request urllib3/urllib3#1524") - spaces = ' ' * (17 * 1024 * 1024) + spaces = ' ' * (mb * 1024 * 1024) req = get_signed_request(dynamodb, 'PutItem', '{"TableName": "' + test_table.name + '", ' + spaces + '"Item": {"p": {"S": "x"}, "c": {"S": "x"}}}') - response = requests.post(req.url, headers=req.headers, data=req.body, verify=False) + try: + response = requests.post(req.url, headers=req.headers, data=req.body, verify=False) + # Until #12166 is fixed, we need this except. See comment above why. + except requests.exceptions.ConnectionError as e: + return # In issue #8195, Alternator closed the connection early, causing the # library to incorrectly throw an exception (Broken Pipe) instead noticing # the error code 413 which the server did send.