From 3d167dd36e1c237caa8afe2c2691d80924baa472 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Botond=20D=C3=A9nes?= Date: Mon, 16 Mar 2026 17:18:00 +0200 Subject: [PATCH] Merge 'Alternator: add per-table batch latency metrics and test coverage' from Amnon Heiman This series fixes a metrics visibility gap in Alternator and adds regression coverage. Until now, BatchGetItem and BatchWriteItem updated global latency histograms but did not consistently update per-table latency histograms. As a result, table-level latency dashboards could miss batch traffic. It updates the batch read/write paths to compute request duration once and record it in both global and per-table latency metrics. It adds the missing tests, including a metric-agnostic helper and a dedicated per-table latency test that verifies latency counters increase for item and batch operations. This change is metrics-only (no API/behavior change for requests) and improves observability consistency between global and per-table views. Fixes #28721 **We assume the alternator per-table metrics exist, but the batch ones are not updated** Closes scylladb/scylladb#28732 * github.com:scylladb/scylladb: test(alternator): add per-table latency coverage for item and batch ops alternator: track per-table latency for batch get/write operations (cherry picked from commit 035aa90d4b07b24a57381d27595e6afb69176205) Closes scylladb/scylladb#29067 --- alternator/executor.cc | 13 +++++++++++-- test/alternator/test_metrics.py | 21 ++++++++++++++++++--- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/alternator/executor.cc b/alternator/executor.cc index b054e6d223..03b7b6455d 100644 --- a/alternator/executor.cc +++ b/alternator/executor.cc @@ -3464,7 +3464,11 @@ future executor::batch_write_item(client_state& c if (should_add_wcu) { rjson::add(ret, "ConsumedCapacity", std::move(consumed_capacity)); } - _stats.api_operations.batch_write_item_latency.mark(std::chrono::steady_clock::now() - start_time); + auto duration = 
std::chrono::steady_clock::now() - start_time; + _stats.api_operations.batch_write_item_latency.mark(duration); + for (const auto& w : per_table_wcu) { + w.first->api_operations.batch_write_item_latency.mark(duration); + } co_return rjson::print(std::move(ret)); } @@ -4975,7 +4979,12 @@ future executor::batch_get_item(client_state& cli if (!some_succeeded && eptr) { co_await coroutine::return_exception_ptr(std::move(eptr)); } - _stats.api_operations.batch_get_item_latency.mark(std::chrono::steady_clock::now() - start_time); + auto duration = std::chrono::steady_clock::now() - start_time; + _stats.api_operations.batch_get_item_latency.mark(duration); + for (const table_requests& rs : requests) { + lw_shared_ptr per_table_stats = get_stats_from_schema(_proxy, *rs.schema); + per_table_stats->api_operations.batch_get_item_latency.mark(duration); + } if (is_big(response)) { co_return make_streamed(std::move(response)); } else { diff --git a/test/alternator/test_metrics.py b/test/alternator/test_metrics.py index 3e01cc0002..f62d93171b 100644 --- a/test/alternator/test_metrics.py +++ b/test/alternator/test_metrics.py @@ -423,14 +423,17 @@ def test_streams_operations(test_table_s, dynamodbstreams, metrics): # to update latencies for one kind of operation (#17616, and compare #9406), # and to do that checking that ..._count increases for that op is enough. 
@contextmanager -def check_sets_latency(metrics, operation_names): +def check_sets_latency_by_metric(metrics, operation_names, metric_name): the_metrics = get_metrics(metrics) - saved_latency_count = { x: get_metric(metrics, 'scylla_alternator_op_latency_count', {'op': x}, the_metrics) for x in operation_names } + saved_latency_count = { x: get_metric(metrics, f'{metric_name}_count', {'op': x}, the_metrics) for x in operation_names } yield the_metrics = get_metrics(metrics) for op in operation_names: # The total "count" on all shards should strictly increase - assert saved_latency_count[op] < get_metric(metrics, 'scylla_alternator_op_latency_count', {'op': op}, the_metrics) + assert saved_latency_count[op] < get_metric(metrics, f'{metric_name}_count', {'op': op}, the_metrics) + +def check_sets_latency(metrics, operation_names): + return check_sets_latency_by_metric(metrics, operation_names, 'scylla_alternator_op_latency') # Test latency metrics for PutItem, GetItem, DeleteItem, UpdateItem. # We can't check what exactly the latency is - just that it gets updated. 
@@ -446,6 +449,18 @@ def test_item_latency(test_table_s, metrics): test_table_s.meta.client.batch_get_item(RequestItems = { test_table_s.name: {'Keys': [{'p': random_string()}], 'ConsistentRead': True}}) +def test_item_latency_per_table(test_table_s, metrics): + with check_sets_latency_by_metric(metrics, ['DeleteItem', 'GetItem', 'PutItem', 'UpdateItem', 'BatchWriteItem', 'BatchGetItem'], 'scylla_alternator_table_op_latency'): + p = random_string() + test_table_s.put_item(Item={'p': p}) + test_table_s.get_item(Key={'p': p}) + test_table_s.delete_item(Key={'p': p}) + test_table_s.update_item(Key={'p': p}) + test_table_s.meta.client.batch_write_item(RequestItems = { + test_table_s.name: [{'PutRequest': {'Item': {'p': random_string(), 'a': 'hi'}}}]}) + test_table_s.meta.client.batch_get_item(RequestItems = { + test_table_s.name: {'Keys': [{'p': random_string()}], 'ConsistentRead': True}}) + # Test latency metrics for GetRecords. Other Streams-related operations - # ListStreams, DescribeStream, and GetShardIterator, have an operation # count (tested above) but do NOT currently have a latency histogram.