Files
scylladb/test/alternator/test_metrics.py
Andrei Chekun 93b9b85c12 [test.py] Refactor alternator, nodetool, rest_api
Make alternator, nodetool and rest_api test directories as python packages.
Move scylla-gdb to scylla_gdb and make it python package.
2024-06-13 13:56:10 +02:00

377 lines
20 KiB
Python

# Copyright 2021-present ScyllaDB
#
# SPDX-License-Identifier: AGPL-3.0-or-later
##############################################################################
# Tests for Scylla's metrics (see docs/dev/metrics.md) for Alternator
# queries. Reproduces issues #9406 and #17615, where although metrics were
# implemented for Alternator requests, they were missing for some operations
# (BatchGetItem).
# In the tests here we attempt to ensure that the metrics continue to work
# for the relevant operations as the code evolves.
#
# Note that all tests in this file test Scylla-specific features, and are
# "skipped" when not running against Scylla, or when unable to retrieve
# metrics through out-of-band HTTP requests to Scylla's Prometheus port (9180).
#
# IMPORTANT: we do not want these tests to assume that are not running in
# parallel with any other tests or workload - because such an assumption
# would limit our test deployment options in the future. NOT making this
# assumption means that these tests can't check that a certain operation
# increases a certain counter by exactly 1 - because other concurrent
# operations might increase it further! So our test can only check that the
# counter increases.
##############################################################################
import re
import time
from contextlib import contextmanager
import pytest
import requests
from botocore.exceptions import ClientError
from test.alternator.test_manual_requests import get_signed_request
from test.alternator.util import random_string, new_test_table, is_aws
# Fixture for checking if we are able to test Scylla metrics. Scylla metrics
# are not available on AWS (of course), but may also not be available for
# Scylla if for some reason we have only access to the Alternator protocol
# port but no access to the metrics port (9180).
# If metrics are *not* available, tests using this fixture will be skipped.
# Tests using this fixture may call get_metrics(metrics).
@pytest.fixture(scope="module")
def metrics(dynamodb):
    """Yield the URL of Scylla's Prometheus metrics endpoint.

    Skips the test when running against AWS, or when the metrics port
    (9180) cannot be reached.
    """
    if dynamodb.meta.client._endpoint.host.endswith('.amazonaws.com'):
        pytest.skip('Scylla-only feature not supported by AWS')
    url = dynamodb.meta.client._endpoint.host
    # The Prometheus API is on port 9180, and always http, not https.
    # Re-emit the captured group so that whatever followed the port (a
    # trailing slash, or nothing) is preserved instead of being dropped.
    url = re.sub(r':[0-9]+(/|$)', r':9180\1', url)
    url = re.sub(r'^https:', 'http:', url)
    url = url + '/metrics'
    resp = requests.get(url)
    if resp.status_code != 200:
        pytest.skip('Metrics port 9180 is not available')
    yield url
# Utility function for fetching all metrics from Scylla, using an HTTP request
# to port 9180. The response format is defined by the Prometheus protocol.
# get_metrics() needs the test to use the "metrics" fixture.
def get_metrics(metrics):
    """Return the complete Prometheus-format metrics dump as text."""
    resp = requests.get(metrics)
    assert resp.status_code == 200
    return resp.text
# Utility function for fetching a metric with a given name and optionally a
# given sub-metric label (which should be a name-value map). If multiple
# matches are found, they are summed - this is useful for summing up the
# counts from multiple shards.
def get_metric(metrics, name, requested_labels=None, the_metrics=None):
    """Return the summed value of the metric called *name*.

    If requested_labels (a name->value dict) is given, only samples whose
    label set contains all the requested labels are summed. If the_metrics
    (a previously fetched Prometheus text dump) is given, it is parsed
    instead of fetching a fresh dump from the metrics URL.
    """
    # Only fetch when no dump was supplied at all - an explicitly-passed
    # empty string should be parsed (to zero matches), not refetched.
    if the_metrics is None:
        the_metrics = get_metrics(metrics)
    total = 0.0
    # Each sample line looks like: name{label="v",...} value
    lines = re.compile('^' + name + '{.*$', re.MULTILINE)
    for match in lines.findall(the_metrics):
        fields = match.split()
        metric = fields[0]
        val = float(fields[1])
        if requested_labels:
            # We know metric begins with name{ and ends with } - the labels
            # are what we have between those.
            got_labels = metric[len(name)+1:-1].split(',')
            # Skip this sample unless every requested label is present:
            if not all(f'{k}="{v}"' in got_labels
                       for k, v in requested_labels.items()):
                continue
        total += val
    return total
# context manager for checking that a certain piece of code increases each
# of the specified metrics. Helps reduce the amount of code duplication
# below.
@contextmanager
def check_increases_metric(metrics, metric_names):
    snapshot = get_metrics(metrics)
    before = {name: get_metric(metrics, name, None, snapshot)
              for name in metric_names}
    yield
    snapshot = get_metrics(metrics)
    for name in metric_names:
        after = get_metric(metrics, name, None, snapshot)
        assert before[name] < after, f'metric {name} did not increase'
@contextmanager
def check_increases_operation(metrics, operation_names):
    """Context manager asserting that each named Alternator operation's
    scylla_alternator_operation counter strictly increases across the
    managed block."""
    the_metrics = get_metrics(metrics)
    saved_metrics = { x: get_metric(metrics, 'scylla_alternator_operation', {'op': x}, the_metrics) for x in operation_names }
    yield
    the_metrics = get_metrics(metrics)
    for op in operation_names:
        # Name the failing operation in the message, consistent with
        # check_increases_metric above.
        assert saved_metrics[op] < get_metric(metrics, 'scylla_alternator_operation', {'op': op}, the_metrics), f'operation {op} did not increase'
###### Test for metrics that count DynamoDB API operations:
def test_batch_write_item(test_table_s, metrics):
    # A single-item BatchWriteItem must bump the BatchWriteItem counter.
    with check_increases_operation(metrics, ['BatchWriteItem']):
        item = {'p': random_string(), 'a': 'hi'}
        test_table_s.meta.client.batch_write_item(
            RequestItems={test_table_s.name: [{'PutRequest': {'Item': item}}]})
# Reproduces issue #9406:
def test_batch_get_item(test_table_s, metrics):
    # The key doesn't need to exist - a miss is still a counted request.
    with check_increases_operation(metrics, ['BatchGetItem']):
        key = {'p': random_string()}
        test_table_s.meta.client.batch_get_item(
            RequestItems={test_table_s.name: {'Keys': [key], 'ConsistentRead': True}})
# Test counters for CreateTable, DescribeTable, UpdateTable and DeleteTable
def test_table_operations(dynamodb, metrics):
    ops = ['CreateTable', 'DescribeTable', 'UpdateTable', 'DeleteTable']
    with check_increases_operation(metrics, ops):
        with new_test_table(dynamodb,
                KeySchema=[{'AttributeName': 'p', 'KeyType': 'HASH'}],
                AttributeDefinitions=[{'AttributeName': 'p', 'AttributeType': 'S'}]) as table:
            # Issue DescribeTable and UpdateTable against the fresh table
            # so those counters move too. new_test_table already calls
            # DescribeTable internally (to wait for the table), but we
            # don't want to rely on that implementation detail.
            dynamodb.meta.client.describe_table(TableName=table.name)
            dynamodb.meta.client.update_table(TableName=table.name)
# Test counters for DeleteItem, GetItem, PutItem and UpdateItem:
def test_item_operations(test_table_s, metrics):
    with check_increases_operation(metrics, ['DeleteItem', 'GetItem', 'PutItem', 'UpdateItem']):
        key = random_string()
        # One call of each flavor; the item's content is irrelevant.
        test_table_s.put_item(Item={'p': key})
        test_table_s.get_item(Key={'p': key})
        test_table_s.delete_item(Key={'p': key})
        test_table_s.update_item(Key={'p': key})
# Test counters for Query and Scan:
def test_scan_operations(test_table_s, metrics):
    with check_increases_operation(metrics, ['Query', 'Scan']):
        # The table's content is irrelevant - we only need the requests
        # to be counted, not to return anything.
        test_table_s.scan(Limit=1)
        test_table_s.query(
            Limit=1,
            KeyConditionExpression='p=:p',
            ExpressionAttributeValues={':p': 'dog'})
# Test counters for DescribeEndpoints:
def test_describe_endpoints_operations(dynamodb, metrics):
    # A single DescribeEndpoints call must bump its operation counter.
    with check_increases_operation(metrics, ['DescribeEndpoints']):
        dynamodb.meta.client.describe_endpoints()
# Test counters for ListTables:
def test_list_tables_operations(dynamodb, metrics):
    # A single ListTables call must bump its operation counter.
    with check_increases_operation(metrics, ['ListTables']):
        dynamodb.meta.client.list_tables(Limit=1)
# Test counters for TagResource, UntagResource, ListTagsOfResource:
def test_tag_operations(test_table_s, metrics):
    with check_increases_operation(metrics, ['TagResource', 'UntagResource', 'ListTagsOfResource']):
        client = test_table_s.meta.client
        # The tag APIs take the table's ARN, not its name:
        arn = client.describe_table(TableName=test_table_s.name)['Table']['TableArn']
        client.list_tags_of_resource(ResourceArn=arn)
        tag_key = random_string()
        client.tag_resource(ResourceArn=arn, Tags=[{'Key': tag_key, 'Value': 'dog'}])
        client.untag_resource(ResourceArn=arn, TagKeys=[tag_key])
# Test counters for DescribeTimeToLive, UpdateTimeToLive:
def test_ttl_operations(test_table_s, metrics):
    with check_increases_operation(metrics, ['DescribeTimeToLive', 'UpdateTimeToLive']):
        client = test_table_s.meta.client
        client.describe_time_to_live(TableName=test_table_s.name)
        # Even a failing UpdateTimeToLive (no TimeToLiveSpecification) is
        # counted, so there's no need to actually enable TTL here.
        with pytest.raises(ClientError, match='ValidationException'):
            client.update_time_to_live(TableName=test_table_s.name)
# Test counters for ListStreams, DescribeStream, GetShardIterator, GetRecords:
def test_streams_operations(test_table_s, dynamodbstreams, metrics):
    # These operations only need to be *counted*, not to succeed. So we
    # can run them against a table without streams enabled, using bogus
    # ARNs and expecting errors. That is much faster than setting up a
    # real working stream (test_streams.py covers those thoroughly).
    ops = ['ListStreams', 'DescribeStream', 'GetShardIterator', 'GetRecords']
    with check_increases_operation(metrics, ops):
        dynamodbstreams.list_streams(TableName=test_table_s.name)
        bogus_arn = random_string(37)
        with pytest.raises(ClientError, match='ValidationException'):
            dynamodbstreams.describe_stream(StreamArn=bogus_arn)
        bogus_shard = random_string(28)
        with pytest.raises(ClientError, match='ValidationException'):
            dynamodbstreams.get_shard_iterator(StreamArn=bogus_arn,
                ShardId=bogus_shard, ShardIteratorType='LATEST')
        with pytest.raises(ClientError, match='ValidationException'):
            dynamodbstreams.get_records(ShardIterator='garbage')
# TODO: The following DynamoDB operations are not yet supported by Alternator,
# so we can't increment their metrics so we don't have tests for them:
# CreateBackup, DeleteBackup, DescribeBackup, DescribeContinuousBackups,
# ListBackups, RestoreTableFromBackup, RestoreTableToPointInTime,
# UpdateContinuousBackups, TransactGetItems, TransactWriteItems,
# CreateGlobalTable, DescribeGlobalTable, DescribeGlobalTableSettings,
# ListGlobalTables, UpdateGlobalTable, UpdateGlobalTableSettings,
# DescribeLimits.
###### Test metrics that are latency histograms of DynamoDB API operations:
# Check that an operation sets the desired latency histograms to non-zero.
# We don't know what this latency should be, but we can still check that
# the desired latency metric gets updated.
#
# A latency histogram metric "scylla_alternator_op_latency" is actually three
# separate metrics: scylla_alternator_op_latency_count, ..._sum and ..._bucket.
# As discussed in issue #18847, we cannot assume that the _sum must increase
# on every request (it is possible for it to stay unchanged if the request is
# very short and to sum until now was high). But we can rely on the _count
# to change when the latency is updated, so that's what we'll check here.
# Remember that the goal of this test isn't to verify that the histogram
# metrics code is correct, but to verify that Alternator didn't forget
# to update latencies for one kind of operation (#17616, and compare #9406),
# and to do that checking that ..._count increases for that op is enough.
@contextmanager
def check_sets_latency(metrics, operation_names):
    """Context manager asserting that each named operation's latency
    histogram sample count strictly increases across the managed block."""
    the_metrics = get_metrics(metrics)
    saved_latency_count = { x: get_metric(metrics, 'scylla_alternator_op_latency_count', {'op': x}, the_metrics) for x in operation_names }
    yield
    the_metrics = get_metrics(metrics)
    for op in operation_names:
        # The total "count" on all shards should strictly increase.
        # Name the failing operation in the message, consistent with
        # check_increases_metric above.
        assert saved_latency_count[op] < get_metric(metrics, 'scylla_alternator_op_latency_count', {'op': op}, the_metrics), f'latency count for {op} did not increase'
# Test latency metrics for PutItem, GetItem, DeleteItem, UpdateItem.
# We can't check what exactly the latency is - just that it gets updated.
def test_item_latency(test_table_s, metrics):
    with check_sets_latency(metrics, ['DeleteItem', 'GetItem', 'PutItem', 'UpdateItem']):
        key = random_string()
        # One call of each flavor; each must update its latency histogram.
        test_table_s.put_item(Item={'p': key})
        test_table_s.get_item(Key={'p': key})
        test_table_s.delete_item(Key={'p': key})
        test_table_s.update_item(Key={'p': key})
# Test latency metrics for GetRecords. Other Streams-related operations -
# ListStreams, DescribeStream, and GetShardIterator, have an operation
# count (tested above) but do NOT currently have a latency histogram.
def test_streams_latency(dynamodb, dynamodbstreams, metrics):
    # Whereas the *count* metric for various operations also counts failed
    # operations so we could write this test without a real stream, the
    # latency metrics are only updated for *successful* operations so we
    # need to use a real Alternator Stream in this test.
    with new_test_table(dynamodb,
        # Alternator Streams is expected to fail with tablets due to #16317.
        # To ensure that this test still runs, instead of xfailing it, we
        # temporarily coerce Alternator to avoid using the default tablets
        # setting, even if it's available. We do this by using the following
        # tags when creating the table:
        Tags=[{'Key': 'experimental:initial_tablets', 'Value': 'none'}],
        KeySchema=[{ 'AttributeName': 'p', 'KeyType': 'HASH' }],
        AttributeDefinitions=[{ 'AttributeName': 'p', 'AttributeType': 'S' }],
        StreamSpecification={ 'StreamEnabled': True, 'StreamViewType': 'NEW_AND_OLD_IMAGES'}
    ) as table:
        # Wait for the stream to become active. This should be instantaneous
        # in Alternator, so the following loop won't wait.
        stream_enabled = False
        start_time = time.time()
        while time.time() < start_time + 60:
            desc = table.meta.client.describe_table(TableName=table.name)['Table']
            if 'LatestStreamArn' in desc:
                arn = desc['LatestStreamArn']
                desc = dynamodbstreams.describe_stream(StreamArn=arn)
                if desc['StreamDescription']['StreamStatus'] == 'ENABLED':
                    stream_enabled = True
                    break
            time.sleep(1)
        assert stream_enabled
        # Read from the stream's first shard; a successful GetRecords must
        # update the GetRecords latency histogram.
        desc = dynamodbstreams.describe_stream(StreamArn=arn)
        shard_id = desc['StreamDescription']['Shards'][0]['ShardId']
        it = dynamodbstreams.get_shard_iterator(StreamArn=arn, ShardId=shard_id, ShardIteratorType='LATEST')['ShardIterator']
        with check_sets_latency(metrics, ['GetRecords']):
            dynamodbstreams.get_records(ShardIterator=it)
###### Test for other metrics, not counting specific DynamoDB API operations:
# Test that unsupported operations operations increment a counter. Instead
# using an actual unsupported operation - which can change in the future as
# Alternator supports more operations - let's use a bogus operation name
# which will never be implemented.
# An unsupported operation also increments the total_operations counter.
def test_unsupported_operation(dynamodb, metrics):
    counters = ['scylla_alternator_unsupported_operations',
                'scylla_alternator_total_operations']
    with check_increases_metric(metrics, counters):
        req = get_signed_request(dynamodb, 'BoguousOperationName', '{}')
        requests.post(req.url, headers=req.headers, data=req.body, verify=False)
# Test that also supported operations (such as DescribeEndPoints in this
# example) increment the total_operations metric:
def test_total_operations(dynamodb, metrics):
    # Any counted request will do; DescribeEndpoints is the cheapest.
    with check_increases_metric(metrics, ['scylla_alternator_total_operations']):
        dynamodb.meta.client.describe_endpoints()
# A fixture to read alternator-ttl-period-in-seconds from Scylla's
# configuration. If we're testing something which isn't Scylla, or
# this configuration does not exist, skip this test. If the configuration
# isn't low enough (it is more than one second), skip this test unless
# the "--runveryslow" option is used.
@pytest.fixture(scope="session")
def alternator_ttl_period_in_seconds(dynamodb, request):
    """Return Scylla's alternator_ttl_period_in_seconds setting, or skip."""
    # If not running on Scylla, skip the test
    if is_aws(dynamodb):
        pytest.skip('Scylla-only test skipped')
    # In Scylla, we can inspect the configuration via a system table
    # (which is also visible in Alternator)
    config_table = dynamodb.Table('.scylla.alternator.system.config')
    resp = config_table.query(
        KeyConditionExpression='#key=:val',
        ExpressionAttributeNames={'#key': 'name'},
        ExpressionAttributeValues={':val': 'alternator_ttl_period_in_seconds'})
    # Guard against both a missing 'Items' key and an empty result list -
    # the original "'Items' in resp" check alone would let an empty list
    # through and crash on resp['Items'][0] below.
    if not resp.get('Items'):
        pytest.skip('missing TTL feature, skipping test')
    period = float(resp['Items'][0]['value'])
    if period > 1 and not request.config.getoption('runveryslow'):
        pytest.skip('need --runveryslow option to run')
    return period
# Test metrics of the background expiration thread run for Alternator's TTL
# feature. The metrics tested in this test are scylla_expiration_scan_passes,
# scylla_expiration_scan_table and scylla_expiration_items_deleted. The
# metric scylla_expiration_secondary_ranges_scanned is not tested in this
# test - testing it requires a multi-node cluster because it counts the
# number of times that this node took over another node's expiration duty.
#
# Unfortunately, to see TTL expiration in action this test may need to wait
# up to the setting of alternator_ttl_period_in_seconds. test/alternator/run
# sets this to 1 second, which becomes the maximum delay of this test, but
# if it is set higher we skip this test unless --runveryslow is enabled.
# This test fails with tablets due to #16567, so to temporarily ensure that
# Alternator TTL is still being tested, we use the following TAGS to
# coerce Alternator to create the test table without tablets.
TAGS = [{'Key': 'experimental:initial_tablets', 'Value': 'none'}]
def test_ttl_stats(dynamodb, metrics, alternator_ttl_period_in_seconds):
    with check_increases_metric(metrics, ['scylla_expiration_scan_passes', 'scylla_expiration_scan_table', 'scylla_expiration_items_deleted']):
        with new_test_table(dynamodb,
            Tags = TAGS,
            KeySchema=[ { 'AttributeName': 'p', 'KeyType': 'HASH' }, ],
            AttributeDefinitions=[ { 'AttributeName': 'p', 'AttributeType': 'S' } ]) as table:
            # Insert one already-expired item, and then enable TTL:
            p0 = random_string()
            table.put_item(Item={'p': p0, 'expiration': int(time.time())-60})
            assert 'Item' in table.get_item(Key={'p': p0})
            client = table.meta.client
            client.update_time_to_live(TableName=table.name,
                TimeToLiveSpecification= {'AttributeName': 'expiration', 'Enabled': True})
            # After alternator_ttl_period_in_seconds, we expect the metrics
            # to have increased. Add some time to that, to account for
            # extremely overloaded test machines.
            start_time = time.time()
            while time.time() < start_time + alternator_ttl_period_in_seconds + 120:
                if 'Item' not in table.get_item(Key={'p': p0}):
                    break
                time.sleep(0.1)
            assert 'Item' not in table.get_item(Key={'p': p0})
# TODO: there are additional metrics which we don't yet test here. At the
# time of this writing they are:
# reads_before_write, write_using_lwt, shard_bounce_for_lwt,
# requests_blocked_memory, requests_shed