Merge 'Alternator: add per-table batch latency metrics and test coverage' from Amnon Heiman
This series fixes a metrics visibility gap in Alternator and adds regression coverage.
Until now, BatchGetItem and BatchWriteItem updated the global latency histograms but did not consistently update the per-table latency histograms, so table-level latency dashboards could miss batch traffic.
The series updates the batch read/write paths to compute the request duration once and record it in both the global and the per-table latency metrics.
It also adds the missing tests, including a metric-agnostic helper and a dedicated per-table latency test that verifies the latency counters increase for both item and batch operations.
The change is metrics-only (no API or behavior change for requests) and makes the global and per-table views consistent with each other.
Fixes #28721
**We assume the Alternator per-table metrics exist, but the batch operations do not update them**
Closes scylladb/scylladb#28732
* github.com:scylladb/scylladb:
test(alternator): add per-table latency coverage for item and batch ops
alternator: track per-table latency for batch get/write operations
(cherry picked from commit 035aa90d4b)
Closes scylladb/scylladb#29067
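As a rough illustration of the pattern the fix applies — a minimal Python sketch with hypothetical `Histogram` and stats names (the actual fix is the C++ diff below): sample the clock once, then mark the global histogram and the histogram of every table the batch touched with the same duration.

```python
# Minimal sketch of "compute the duration once, mark it everywhere".
# Histogram and the variable names here are hypothetical stand-ins for
# Scylla's internal stats types, not its real API.
import time

class Histogram:
    def __init__(self):
        self.count = 0
        self.total_seconds = 0.0

    def mark(self, seconds):
        self.count += 1
        self.total_seconds += seconds

global_batch_write_latency = Histogram()
per_table_batch_write_latency = {'tbl': Histogram()}

def batch_write(tables):
    start = time.monotonic()
    # ... the batch write against `tables` would run here ...
    duration = time.monotonic() - start          # computed exactly once
    global_batch_write_latency.mark(duration)    # global histogram
    for t in tables:                             # same sample, per table
        per_table_batch_write_latency[t].mark(duration)

batch_write(['tbl'])
assert global_batch_write_latency.count == per_table_batch_write_latency['tbl'].count
```

Recording one sample in both places keeps the global and per-table counters in lockstep, which is exactly what the new per-table test below asserts.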
@@ -3464,7 +3464,11 @@ future<executor::request_return_type> executor::batch_write_item(client_state& c
     if (should_add_wcu) {
         rjson::add(ret, "ConsumedCapacity", std::move(consumed_capacity));
     }
-    _stats.api_operations.batch_write_item_latency.mark(std::chrono::steady_clock::now() - start_time);
+    auto duration = std::chrono::steady_clock::now() - start_time;
+    _stats.api_operations.batch_write_item_latency.mark(duration);
+    for (const auto& w : per_table_wcu) {
+        w.first->api_operations.batch_write_item_latency.mark(duration);
+    }
     co_return rjson::print(std::move(ret));
 }
 
@@ -4975,7 +4979,12 @@ future<executor::request_return_type> executor::batch_get_item(client_state& cli
     if (!some_succeeded && eptr) {
         co_await coroutine::return_exception_ptr(std::move(eptr));
     }
-    _stats.api_operations.batch_get_item_latency.mark(std::chrono::steady_clock::now() - start_time);
+    auto duration = std::chrono::steady_clock::now() - start_time;
+    _stats.api_operations.batch_get_item_latency.mark(duration);
+    for (const table_requests& rs : requests) {
+        lw_shared_ptr<stats> per_table_stats = get_stats_from_schema(_proxy, *rs.schema);
+        per_table_stats->api_operations.batch_get_item_latency.mark(duration);
+    }
     if (is_big(response)) {
         co_return make_streamed(std::move(response));
     } else {
@@ -423,14 +423,17 @@ def test_streams_operations(test_table_s, dynamodbstreams, metrics):
 # to update latencies for one kind of operation (#17616, and compare #9406),
 # and to do that checking that ..._count increases for that op is enough.
 @contextmanager
-def check_sets_latency(metrics, operation_names):
+def check_sets_latency_by_metric(metrics, operation_names, metric_name):
     the_metrics = get_metrics(metrics)
-    saved_latency_count = { x: get_metric(metrics, 'scylla_alternator_op_latency_count', {'op': x}, the_metrics) for x in operation_names }
+    saved_latency_count = { x: get_metric(metrics, f'{metric_name}_count', {'op': x}, the_metrics) for x in operation_names }
     yield
     the_metrics = get_metrics(metrics)
     for op in operation_names:
         # The total "count" on all shards should strictly increase
-        assert saved_latency_count[op] < get_metric(metrics, 'scylla_alternator_op_latency_count', {'op': op}, the_metrics)
+        assert saved_latency_count[op] < get_metric(metrics, f'{metric_name}_count', {'op': op}, the_metrics)
+
+def check_sets_latency(metrics, operation_names):
+    return check_sets_latency_by_metric(metrics, operation_names, 'scylla_alternator_op_latency')
 
 # Test latency metrics for PutItem, GetItem, DeleteItem, UpdateItem.
 # We can't check what exactly the latency is - just that it gets updated.
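The refactor above keeps check_sets_latency as a thin wrapper and moves the logic into a metric-agnostic context manager, so the same check works for both the global and the per-table histograms. A self-contained sketch of that strictly-increasing-counter pattern, with a stub counter standing in for the suite's real get_metrics()/get_metric() helpers:

```python
# Self-contained sketch of the pattern check_sets_latency_by_metric uses.
# `counters` is a stub for the real Prometheus-scraped ..._count series.
from contextlib import contextmanager

counters = {'BatchGetItem': 0}

@contextmanager
def check_counter_increases(operation_names):
    saved = {op: counters[op] for op in operation_names}
    yield  # the test body runs here
    for op in operation_names:
        # The count, summed over all shards, must strictly increase.
        assert saved[op] < counters[op], f'{op} latency count did not grow'

with check_counter_increases(['BatchGetItem']):
    counters['BatchGetItem'] += 1  # stands in for a real BatchGetItem call
```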
@@ -446,6 +449,18 @@ def test_item_latency(test_table_s, metrics):
     test_table_s.meta.client.batch_get_item(RequestItems = {
         test_table_s.name: {'Keys': [{'p': random_string()}], 'ConsistentRead': True}})
 
+def test_item_latency_per_table(test_table_s, metrics):
+    with check_sets_latency_by_metric(metrics, ['DeleteItem', 'GetItem', 'PutItem', 'UpdateItem', 'BatchWriteItem', 'BatchGetItem'], 'scylla_alternator_table_op_latency'):
+        p = random_string()
+        test_table_s.put_item(Item={'p': p})
+        test_table_s.get_item(Key={'p': p})
+        test_table_s.delete_item(Key={'p': p})
+        test_table_s.update_item(Key={'p': p})
+        test_table_s.meta.client.batch_write_item(RequestItems = {
+            test_table_s.name: [{'PutRequest': {'Item': {'p': random_string(), 'a': 'hi'}}}]})
+        test_table_s.meta.client.batch_get_item(RequestItems = {
+            test_table_s.name: {'Keys': [{'p': random_string()}], 'ConsistentRead': True}})
+
 # Test latency metrics for GetRecords. Other Streams-related operations -
 # ListStreams, DescribeStream, and GetShardIterator, have an operation
 # count (tested above) but do NOT currently have a latency histogram.
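Outside the test suite, the new per-table counter can also be inspected straight off a node's Prometheus endpoint. A hedged sketch, assuming the default metrics port 9180; the label set beyond 'op' is not shown in this diff:

```python
# Hedged sketch of reading the per-table latency count by hand. Assumes
# Scylla's Prometheus endpoint at localhost:9180 (the default); adjust to
# your deployment.
import urllib.request

def read_latency_count(op, metric='scylla_alternator_table_op_latency'):
    body = urllib.request.urlopen('http://localhost:9180/metrics').read().decode()
    total = 0.0
    for line in body.splitlines():
        # Sum the _count series across shards for the requested operation.
        if line.startswith(f'{metric}_count') and f'op="{op}"' in line:
            total += float(line.rsplit(' ', 1)[1])
    return total

# before = read_latency_count('BatchWriteItem')
# ... issue a BatchWriteItem request, then re-read and expect growth ...
```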