From 3d167dd36e1c237caa8afe2c2691d80924baa472 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Botond=20D=C3=A9nes?= Date: Mon, 16 Mar 2026 17:18:00 +0200 Subject: [PATCH] Merge 'Alternator: add per-table batch latency metrics and test coverage' from Amnon Heiman This series fixes a metrics visibility gap in Alternator and adds regression coverage. Until now, BatchGetItem and BatchWriteItem updated global latency histograms but did not consistently update per-table latency histograms. As a result, table-level latency dashboards could miss batch traffic. It updates the batch read/write paths to compute request duration once and record it in both global and per-table latency metrics. It adds the missing tests, including a metric-agnostic helper and a dedicated per-table latency test that verifies latency counters increase for item and batch operations. This change is metrics-only (no API/behavior change for requests) and improves observability consistency between global and per-table views. Fixes #28721 **We assume the alternator per-table metrics exist, but the batch ones are not updated** Closes scylladb/scylladb#28732 * github.com:scylladb/scylladb: test(alternator): add per-table latency coverage for item and batch ops alternator: track per-table latency for batch get/write operations (cherry picked from commit 035aa90d4b07b24a57381d27595e6afb69176205) Closes scylladb/scylladb#29067 --- alternator/executor.cc | 13 +++++++++++-- test/alternator/test_metrics.py | 21 ++++++++++++++++++--- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/alternator/executor.cc b/alternator/executor.cc index b054e6d223..03b7b6455d 100644 --- a/alternator/executor.cc +++ b/alternator/executor.cc @@ -3464,7 +3464,11 @@ future executor::batch_write_item(client_state& c if (should_add_wcu) { rjson::add(ret, "ConsumedCapacity", std::move(consumed_capacity)); } - _stats.api_operations.batch_write_item_latency.mark(std::chrono::steady_clock::now() - start_time); + auto duration = 
std::chrono::steady_clock::now() - start_time; + _stats.api_operations.batch_write_item_latency.mark(duration); + for (const auto& w : per_table_wcu) { + w.first->api_operations.batch_write_item_latency.mark(duration); + } co_return rjson::print(std::move(ret)); } @@ -4975,7 +4979,12 @@ future executor::batch_get_item(client_state& cli if (!some_succeeded && eptr) { co_await coroutine::return_exception_ptr(std::move(eptr)); } - _stats.api_operations.batch_get_item_latency.mark(std::chrono::steady_clock::now() - start_time); + auto duration = std::chrono::steady_clock::now() - start_time; + _stats.api_operations.batch_get_item_latency.mark(duration); + for (const table_requests& rs : requests) { + lw_shared_ptr per_table_stats = get_stats_from_schema(_proxy, *rs.schema); + per_table_stats->api_operations.batch_get_item_latency.mark(duration); + } if (is_big(response)) { co_return make_streamed(std::move(response)); } else { diff --git a/test/alternator/test_metrics.py b/test/alternator/test_metrics.py index 3e01cc0002..f62d93171b 100644 --- a/test/alternator/test_metrics.py +++ b/test/alternator/test_metrics.py @@ -423,14 +423,17 @@ def test_streams_operations(test_table_s, dynamodbstreams, metrics): # to update latencies for one kind of operation (#17616, and compare #9406), # and to do that checking that ..._count increases for that op is enough. 
@contextmanager -def check_sets_latency(metrics, operation_names): +def check_sets_latency_by_metric(metrics, operation_names, metric_name): the_metrics = get_metrics(metrics) - saved_latency_count = { x: get_metric(metrics, 'scylla_alternator_op_latency_count', {'op': x}, the_metrics) for x in operation_names } + saved_latency_count = { x: get_metric(metrics, f'{metric_name}_count', {'op': x}, the_metrics) for x in operation_names } yield the_metrics = get_metrics(metrics) for op in operation_names: # The total "count" on all shards should strictly increase - assert saved_latency_count[op] < get_metric(metrics, 'scylla_alternator_op_latency_count', {'op': op}, the_metrics) + assert saved_latency_count[op] < get_metric(metrics, f'{metric_name}_count', {'op': op}, the_metrics) + +def check_sets_latency(metrics, operation_names): + return check_sets_latency_by_metric(metrics, operation_names, 'scylla_alternator_op_latency') # Test latency metrics for PutItem, GetItem, DeleteItem, UpdateItem. # We can't check what exactly the latency is - just that it gets updated. 
@@ -446,6 +449,18 @@ def test_item_latency(test_table_s, metrics): test_table_s.meta.client.batch_get_item(RequestItems = { test_table_s.name: {'Keys': [{'p': random_string()}], 'ConsistentRead': True}}) +def test_item_latency_per_table(test_table_s, metrics): + with check_sets_latency_by_metric(metrics, ['DeleteItem', 'GetItem', 'PutItem', 'UpdateItem', 'BatchWriteItem', 'BatchGetItem'], 'scylla_alternator_table_op_latency'): + p = random_string() + test_table_s.put_item(Item={'p': p}) + test_table_s.get_item(Key={'p': p}) + test_table_s.delete_item(Key={'p': p}) + test_table_s.update_item(Key={'p': p}) + test_table_s.meta.client.batch_write_item(RequestItems = { + test_table_s.name: [{'PutRequest': {'Item': {'p': random_string(), 'a': 'hi'}}}]}) + test_table_s.meta.client.batch_get_item(RequestItems = { + test_table_s.name: {'Keys': [{'p': random_string()}], 'ConsistentRead': True}}) + # Test latency metrics for GetRecords. Other Streams-related operations - # ListStreams, DescribeStream, and GetShardIterator, have an operation # count (tested above) but do NOT currently have a latency histogram.