Merge 'Alternator: add per-table batch latency metrics and test coverage' from Amnon Heiman

This series fixes a metrics visibility gap in Alternator and adds regression coverage. Until now, BatchGetItem and BatchWriteItem updated global latency histograms but did not consistently update per-table latency histograms. As a result, table-level latency dashboards could miss batch traffic. It updates the batch read/write paths to compute request duration once and record it in both global and per-table latency metrics. Add the missing tests, including a metric-agnostic helper and a dedicated per-table latency test that verifies latency counters increase for item and batch operations. This change is metrics-only (no API/behavior change for requests) and improves observability consistency between global and per-table views. Fixes #28721 **We assume the alternator per-table metrics exist, but the batch ones are not updated** Closes scylladb/scylladb#28732 * github.com:scylladb/scylladb: test(alternator): add per-table latency coverage for item and batch ops alternator: track per-table latency for batch get/write operations (cherry picked from commit 035aa90d4b) Closes scylladb/scylladb#29067
2026-04-25 02:50:33 +00:00 · 2026-03-16 17:18:00 +02:00
parent c42799fb01
commit 3d167dd36e
2 changed files with 29 additions and 5 deletions
--- a/alternator/executor.cc
+++ b/alternator/executor.cc
@@ -3464,7 +3464,11 @@ future<executor::request_return_type> executor::batch_write_item(client_state& c
    if (should_add_wcu) {
        rjson::add(ret, "ConsumedCapacity", std::move(consumed_capacity));
    }
-    _stats.api_operations.batch_write_item_latency.mark(std::chrono::steady_clock::now() - start_time);
+    auto duration = std::chrono::steady_clock::now() - start_time;
+    _stats.api_operations.batch_write_item_latency.mark(duration);
+    for (const auto& w : per_table_wcu) {
+        w.first->api_operations.batch_write_item_latency.mark(duration);
+    }
    co_return rjson::print(std::move(ret));
 }

@@ -4975,7 +4979,12 @@ future<executor::request_return_type> executor::batch_get_item(client_state& cli
    if (!some_succeeded && eptr) {
        co_await coroutine::return_exception_ptr(std::move(eptr));
    }
-    _stats.api_operations.batch_get_item_latency.mark(std::chrono::steady_clock::now() - start_time);
+    auto duration = std::chrono::steady_clock::now() - start_time;
+    _stats.api_operations.batch_get_item_latency.mark(duration);
+    for (const table_requests& rs : requests) {
+        lw_shared_ptr<stats> per_table_stats = get_stats_from_schema(_proxy, *rs.schema);
+        per_table_stats->api_operations.batch_get_item_latency.mark(duration);
+    }
    if (is_big(response)) {
        co_return make_streamed(std::move(response));
    } else {
--- a/test/alternator/test_metrics.py
+++ b/test/alternator/test_metrics.py
@@ -423,14 +423,17 @@ def test_streams_operations(test_table_s, dynamodbstreams, metrics):
 # to update latencies for one kind of operation (#17616, and compare #9406),
 # and to do that checking that ..._count increases for that op is enough.
@contextmanager
-def check_sets_latency(metrics, operation_names):
+def check_sets_latency_by_metric(metrics, operation_names, metric_name):
    the_metrics = get_metrics(metrics)
-    saved_latency_count = { x: get_metric(metrics, 'scylla_alternator_op_latency_count', {'op': x}, the_metrics) for x in operation_names }
+    saved_latency_count = { x: get_metric(metrics, f'{metric_name}_count', {'op': x}, the_metrics) for x in operation_names }
    yield
    the_metrics = get_metrics(metrics)
    for op in operation_names:
        # The total "count" on all shards should strictly increase
-        assert saved_latency_count[op] < get_metric(metrics, 'scylla_alternator_op_latency_count', {'op': op}, the_metrics)
+        assert saved_latency_count[op] < get_metric(metrics, f'{metric_name}_count', {'op': op}, the_metrics)
+
+def check_sets_latency(metrics, operation_names):
+    return check_sets_latency_by_metric(metrics, operation_names, 'scylla_alternator_op_latency')

 # Test latency metrics for PutItem, GetItem, DeleteItem, UpdateItem.
 # We can't check what exactly the latency is - just that it gets updated.
@@ -446,6 +449,18 @@ def test_item_latency(test_table_s, metrics):
        test_table_s.meta.client.batch_get_item(RequestItems = {
            test_table_s.name: {'Keys': [{'p': random_string()}], 'ConsistentRead': True}})

+def test_item_latency_per_table(test_table_s, metrics):
+    with check_sets_latency_by_metric(metrics, ['DeleteItem', 'GetItem', 'PutItem', 'UpdateItem', 'BatchWriteItem', 'BatchGetItem'], 'scylla_alternator_table_op_latency'):
+        p = random_string()
+        test_table_s.put_item(Item={'p': p})
+        test_table_s.get_item(Key={'p': p})
+        test_table_s.delete_item(Key={'p': p})
+        test_table_s.update_item(Key={'p': p})
+        test_table_s.meta.client.batch_write_item(RequestItems = {
+            test_table_s.name: [{'PutRequest': {'Item': {'p': random_string(), 'a': 'hi'}}}]})
+        test_table_s.meta.client.batch_get_item(RequestItems = {
+            test_table_s.name: {'Keys': [{'p': random_string()}], 'ConsistentRead': True}})
+
 # Test latency metrics for GetRecords. Other Streams-related operations -
 # ListStreams, DescribeStream, and GetShardIterator, have an operation
 # count (tested above) but do NOT currently have a latency histogram.