From a86928caa16498ccab66bf35bcda4ca1399b1433 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Paszkowski?= Date: Sat, 28 Mar 2026 20:40:33 +0100 Subject: [PATCH 1/2] alternator: check concurrency limit before memory acquisition The concurrency limit check in the Alternator server was positioned after memory acquisition (get_units), request body reading (read_entire_stream), signature verification, and decompression. This allowed unlimited requests to pile up consuming memory before being rejected, exhausting LSA memory and causing logalloc::bad_alloc errors that cascade into Raft applier and topology coordinator failures, breaking subsequent operations. Without this fix, test_limit_concurrent_requests on a 1GB node produces 50 logalloc::bad_alloc errors and cascading failures: reads from system.scylla_local fail, the Raft applier fiber stops, the topology coordinator stops, and all subsequent CreateTable operations fail with InternalServerError (500). With this fix, the cascade is eliminated -- admitted requests may still cause LSA pressure on a memory-constrained node, but the server remains functional. Move the concurrency check to right after the content-length early-out, before any memory acquisition or I/O. This mirrors the CQL transport which correctly checks concurrency before memory acquisition (transport/server.cc). The concurrency check was originally added in 1b8c946ad7 (Sep 2020) *before* memory acquisition, which at the time lived inside with_gate (after the concurrency gate). The ordering was inverted by f41dac2a3a (Mar 2021, "avoid large contiguous allocation for request body"), which moved get_units() earlier in the function to reserve memory before reading the newly-introduced content stream -- but inadvertently also moved it before the concurrency check. 
c3593462a4 (Mar 2025) further worsened the situation by adding a 16MB fallback reservation for requests without Content-Length and ungzip/deflate decompression steps -- all before the concurrency check -- greatly increasing the memory consumed by requests that would ultimately be rejected. --- alternator/server.cc | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/alternator/server.cc b/alternator/server.cc index 3133f15360..e74ea15054 100644 --- a/alternator/server.cc +++ b/alternator/server.cc @@ -699,6 +699,17 @@ future server::handle_api_request(std::unique_ptr // for such a size. co_return api_error::payload_too_large(fmt::format("Request content length limit of {} bytes exceeded", request_content_length_limit)); } + // Check the concurrency limit early, before acquiring memory and + // reading the request body, to avoid piling up memory from excess + // requests that will be rejected anyway. This mirrors the CQL + // transport which also checks concurrency before memory acquisition + // (transport/server.cc). + if (_pending_requests.get_count() >= _max_concurrent_requests) { + _executor._stats.requests_shed++; + co_return api_error::request_limit_exceeded(format("too many in-flight requests (configured via max_concurrent_requests_per_shard): {}", _pending_requests.get_count())); + } + _pending_requests.enter(); + auto leave = defer([this] () noexcept { _pending_requests.leave(); }); // JSON parsing can allocate up to roughly 2x the size of the raw // document, + a couple of bytes for maintenance. 
// If the Content-Length of the request is not available, we assume @@ -760,12 +771,6 @@ future server::handle_api_request(std::unique_ptr _executor._stats.unsupported_operations++; co_return api_error::unknown_operation(fmt::format("Unsupported operation {}", op)); } - if (_pending_requests.get_count() >= _max_concurrent_requests) { - _executor._stats.requests_shed++; - co_return api_error::request_limit_exceeded(format("too many in-flight requests (configured via max_concurrent_requests_per_shard): {}", _pending_requests.get_count())); - } - _pending_requests.enter(); - auto leave = defer([this] () noexcept { _pending_requests.leave(); }); executor::client_state client_state(service::client_state::external_tag(), _auth_service, &_sl_controller, _timeout_config.current_values(), req->get_client_address()); if (!username.empty()) { From b8e3ef0c64b5a12930546d0d110cb89e7d9b8cc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Paszkowski?= Date: Sat, 28 Mar 2026 20:40:33 +0100 Subject: [PATCH 2/2] test: reduce concurrent-request-limit test pressure to avoid LSA exhaustion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The test_limit_concurrent_requests dtest uses concurrent CreateTable requests to verify Alternator's concurrency limiting. Each admitted CreateTable triggers Raft consensus, schema mutations, and memtable flushes—all of which consume LSA memory. On the 1 GB test node (2 SMP × 512 MB), the original settings (limit=5, 25 threads) created enough flush pressure to exhaust the LSA emergency reserve, producing logalloc::bad_alloc errors in the node log. The test was always marginal under these settings and became flaky as new system tables increased baseline LSA usage over time. Lower concurrent_requests_limit from 5 to 3 and the thread multiplier from 5 to 2 (6 threads total). 
This still issues twice as many concurrent requests as the per-shard limit, which is sufficient to reliably trigger RequestLimitExceeded, while keeping flush pressure within what 512 MB per shard can sustain. --- test/cluster/dtest/alternator_tests.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/cluster/dtest/alternator_tests.py b/test/cluster/dtest/alternator_tests.py index e59349ec97..a823835b7f 100644 --- a/test/cluster/dtest/alternator_tests.py +++ b/test/cluster/dtest/alternator_tests.py @@ -481,12 +481,14 @@ class TesterAlternator(BaseAlternator): 2) Issue Alternator 'heavy' requests concurrently (create-table) 3) wait for RequestLimitExceeded error response. """ - concurrent_requests_limit = 5 + # Keep the limit low to avoid exhausting LSA memory on the 1GB test node + # when multiple CreateTable requests (Raft + schema + flush) run concurrently. + concurrent_requests_limit = 3 extra_config = {"max_concurrent_requests_per_shard": concurrent_requests_limit, "num_tokens": 1} self.prepare_dynamodb_cluster(num_of_nodes=1, extra_config=extra_config) node1 = self.cluster.nodelist()[0] create_tables_threads = [] - for tables_num in range(concurrent_requests_limit * 5): + for tables_num in range(concurrent_requests_limit * 2): create_tables_threads.append(self.run_create_table_thread()) @retrying(num_attempts=150, sleep_time=0.2, allowed_exceptions=ConcurrencyLimitNotExceededError, message="Running create-table request")