Merge 'Alternator: Run more TTL tests by default (and add a test for metrics)' from Nadav Har'El

We had quite a few tests for Alternator TTL in test/alternator, but most
of them did not run as part of the usual Jenkins test suite, because
they were considered "very slow" (and require a special "--runveryslow"
flag to run).

In this series we enable six tests which run quickly enough to run by
default, without an additional flag. We also make them even quicker -
the six tests now take around 2.5 seconds.

I also noticed that we don't have a test for the Alternator TTL metrics
- and added one.

Fixes #11374.
Refs https://github.com/scylladb/scylla-monitoring/issues/1783

Closes #11384

* github.com:scylladb/scylladb:
  test/alternator: insert test names into Scylla logs
  rest api: add a new /system/log operation
  alternator ttl: log warning if scan took too long.
  alternator,ttl: allow sub-second TTL scanning period, for tests
  test/alternator: skip fewer Alternator TTL tests
  test/alternator: test Alternator TTL metrics
This commit is contained in:
Piotr Sarna
2022-09-22 09:47:50 +02:00
9 changed files with 201 additions and 47 deletions

View File

@@ -768,13 +768,15 @@ future<> expiration_service::run() {
// in the next iteration by reducing the scanner's scheduling-group
// share (if using a separate scheduling group), or introduce
// finer-grain sleeps into the scanning code.
std::chrono::seconds scan_duration(std::chrono::duration_cast<std::chrono::seconds>(lowres_clock::now() - start));
std::chrono::seconds period(_db.get_config().alternator_ttl_period_in_seconds());
std::chrono::milliseconds scan_duration(std::chrono::duration_cast<std::chrono::milliseconds>(lowres_clock::now() - start));
std::chrono::milliseconds period(long(_db.get_config().alternator_ttl_period_in_seconds() * 1000));
if (scan_duration < period) {
try {
tlogger.info("sleeping {} seconds until next period", (period - scan_duration).count());
tlogger.info("sleeping {} seconds until next period", (period - scan_duration).count()/1000.0);
co_await seastar::sleep_abortable(period - scan_duration, _abort_source);
} catch(seastar::sleep_aborted&) {}
} else {
tlogger.warn("scan took {} seconds, longer than period - not sleeping", scan_duration.count()/1000.0);
}
}
}

View File

@@ -52,6 +52,45 @@
}
]
},
{
"path":"/system/log",
"operations":[
{
"method":"POST",
"summary":"Write a message to the Scylla log",
"type":"void",
"nickname":"write_log_message",
"produces":[
"application/json"
],
"parameters":[
{
"name":"message",
"description":"The message to write to the log",
"required":true,
"allowMultiple":false,
"type":"string",
"paramType":"query"
},
{
"name":"level",
"description":"The logging level to use",
"required":true,
"allowMultiple":false,
"type":"string",
"enum":[
"error",
"warn",
"info",
"debug",
"trace"
],
"paramType":"query"
}
]
}
]
},
{
"path":"/system/drop_sstable_caches",
"operations":[

View File

@@ -61,6 +61,16 @@ void set_system(http_context& ctx, routes& r) {
return json::json_void();
});
hs::write_log_message.set(r, [](const_req req) {
try {
logging::log_level level = boost::lexical_cast<logging::log_level>(std::string(req.get_query_param("level")));
apilog.log(level, "/system/log: {}", std::string(req.get_query_param("message")));
} catch (boost::bad_lexical_cast& e) {
throw bad_param_exception("Unknown logging level " + req.get_query_param("level"));
}
return json::json_void();
});
hs::drop_sstable_caches.set(r, [&ctx](std::unique_ptr<request> req) {
apilog.info("Dropping sstable caches");
return ctx.db.invoke_on_all([] (replica::database& db) {

View File

@@ -359,7 +359,7 @@ public:
named_value<sstring> alternator_write_isolation;
named_value<uint32_t> alternator_streams_time_window_s;
named_value<uint32_t> alternator_timeout_in_ms;
named_value<uint32_t> alternator_ttl_period_in_seconds;
named_value<double> alternator_ttl_period_in_seconds;
named_value<bool> abort_on_ebadf;

View File

@@ -13,7 +13,7 @@ import pytest
import boto3
import requests
import re
from util import create_test_table, is_aws
from util import create_test_table, is_aws, scylla_log
# When tests are run with HTTPS, the server often won't have its SSL
# certificate signed by a known authority. So we will disable certificate
@@ -112,7 +112,8 @@ def dynamodbstreams(request):
# that the server is still alive - and if not report the test which crashed
# it and stop running any more tests.
@pytest.fixture(scope="function", autouse=True)
def dynamodb_test_connection(dynamodb, request):
def dynamodb_test_connection(dynamodb, request, optional_rest_api):
scylla_log(optional_rest_api, f'test/alternator: Starting {request.node.parent.name}::{request.node.name}', 'info')
yield
try:
# We want to run a do-nothing DynamoDB command. The health-check
@@ -122,6 +123,7 @@ def dynamodb_test_connection(dynamodb, request):
assert response.ok
except:
pytest.exit(f"Scylla appears to have crashed in test {request.node.parent.name}::{request.node.name}")
scylla_log(optional_rest_api, f'test/alternator: Ended {request.node.parent.name}::{request.node.name}', 'info')
# "test_table" fixture: Create and return a temporary table to be used in tests
@@ -275,9 +277,14 @@ def test_table_s_forbid_rmw(dynamodb, scylla_only):
# the test using this fixture will be skipped with a message about the REST
# API not being available.
@pytest.fixture(scope="session")
def rest_api(dynamodb):
def rest_api(dynamodb, optional_rest_api):
if optional_rest_api is None:
pytest.skip('Cannot connect to Scylla REST API')
return optional_rest_api
@pytest.fixture(scope="session")
def optional_rest_api(dynamodb):
if is_aws(dynamodb):
pytest.skip('Scylla-only REST API not supported by AWS')
return None
url = dynamodb.meta.client._endpoint.host
# The REST API is on port 10000, and always http, not https.
url = re.sub(r':[0-9]+(/|$)', ':10000', url)
@@ -287,5 +294,5 @@ def rest_api(dynamodb):
try:
requests.get(f'{url}/column_family/name/keyspace', timeout=1).raise_for_status()
except:
pytest.skip('Cannot connect to Scylla REST API')
return None
return url

View File

@@ -43,7 +43,7 @@ def run_alternator_cmd(pid, dir):
'--alternator-write-isolation', 'always_use_lwt',
'--alternator-streams-time-window-s', '0',
'--alternator-timeout-in-ms', '30000',
'--alternator-ttl-period-in-seconds', '1',
'--alternator-ttl-period-in-seconds', '0.5',
# Allow testing experimental features. Following issue #9467, we need
# to add here specific experimental features as they are introduced.
# We only list here Alternator-specific experimental features - CQL

View File

@@ -25,10 +25,11 @@
import pytest
import requests
import re
import time
from contextlib import contextmanager
from botocore.exceptions import ClientError
from util import random_string, new_test_table
from util import random_string, new_test_table, is_aws
# Fixture for checking if we are able to test Scylla metrics. Scylla metrics
# are not available on AWS (of course), but may also not be available for
@@ -96,7 +97,7 @@ def check_increases_metric(metrics, metric_names):
yield
the_metrics = get_metrics(metrics)
for n in metric_names:
assert saved_metrics[n] < get_metric(metrics, n, None, the_metrics)
assert saved_metrics[n] < get_metric(metrics, n, None, the_metrics), f'metric {n} did not increase'
@contextmanager
def check_increases_operation(metrics, operation_names):
@@ -226,6 +227,64 @@ def test_total_operations(dynamodb, metrics):
with check_increases_metric(metrics, ['scylla_alternator_total_operations']):
dynamodb.meta.client.describe_endpoints()
# A fixture to read alternator-ttl-period-in-seconds from Scylla's
# configuration. If we're testing something which isn't Scylla, or
# this configuration does not exist, skip this test. If the configuration
# isn't low enough (it is more than one second), skip this test unless
# the "--runveryslow" option is used.
# Session-scoped fixture returning Scylla's alternator_ttl_period_in_seconds
# configuration value, read through the Alternator-visible system table.
# Skips the requesting test when not running against Scylla, when the TTL
# feature is missing, or when the period is too long (more than a second)
# and --runveryslow was not given.
@pytest.fixture(scope="session")
def alternator_ttl_period_in_seconds(dynamodb, request):
    # Only Scylla exposes this configuration; skip when testing AWS.
    if is_aws(dynamodb):
        pytest.skip('Scylla-only test skipped')
    # Scylla's configuration is queryable through a special system table,
    # which Alternator also exposes.
    response = dynamodb.Table('.scylla.alternator.system.config').query(
        KeyConditionExpression='#key=:val',
        ExpressionAttributeNames={'#key': 'name'},
        ExpressionAttributeValues={':val': 'alternator_ttl_period_in_seconds'})
    if 'Items' not in response:
        pytest.skip('missing TTL feature, skipping test')
    period = float(response['Items'][0]['value'])
    if period > 1 and not request.config.getoption('runveryslow'):
        pytest.skip('need --runveryslow option to run')
    return period
# Test metrics of the background expiration thread run for Alternator's TTL
# feature. The metrics tested in this test are scylla_expiration_scan_passes,
# scylla_expiration_scan_table and scylla_expiration_items_deleted. The
# metric scylla_expiration_secondary_ranges_scanned is not tested in this
# test - testing it requires a multi-node cluster because it counts the
# number of times that this node took over another node's expiration duty.
#
# Unfortunately, to see TTL expiration in action this test may need to wait
# up to the setting of alternator_ttl_period_in_seconds. test/alternator/run
# sets this to 1 second, which becomes the maximum delay of this test, but
# if it is set higher we skip this test unless --runveryslow is enabled.
def test_ttl_stats(dynamodb, metrics, alternator_ttl_period_in_seconds):
    # Verify that the TTL expiration-scanner metrics increase when an
    # expired item is deleted: a scan pass happens, a table is scanned,
    # and one item is deleted. (Removed a leftover debug print of
    # alternator_ttl_period_in_seconds that polluted test output.)
    with check_increases_metric(metrics, ['scylla_expiration_scan_passes', 'scylla_expiration_scan_table', 'scylla_expiration_items_deleted']):
        with new_test_table(dynamodb,
            KeySchema=[ { 'AttributeName': 'p', 'KeyType': 'HASH' }, ],
            AttributeDefinitions=[ { 'AttributeName': 'p', 'AttributeType': 'S' } ]) as table:
            # Insert one already-expired item, and then enable TTL:
            p0 = random_string()
            table.put_item(Item={'p': p0, 'expiration': int(time.time())-60})
            assert 'Item' in table.get_item(Key={'p': p0})
            client = table.meta.client
            client.update_time_to_live(TableName=table.name,
                TimeToLiveSpecification= {'AttributeName': 'expiration', 'Enabled': True})
            # After alternator_ttl_period_in_seconds, we expect the metrics
            # to have increased. Add some time to that, to account for
            # extremely overloaded test machines.
            start_time = time.time()
            while time.time() < start_time + alternator_ttl_period_in_seconds + 60:
                if not 'Item' in table.get_item(Key={'p': p0}):
                    break
                time.sleep(0.1)
            assert not 'Item' in table.get_item(Key={'p': p0})
# TODO: there are additional metrics which we don't yet test here. At the
# time of this writing they are: latency histograms
# ({put,get,delete,update}_item_latency, get_records_latency),

View File

@@ -36,6 +36,39 @@ def passes_or_raises(expected_exception, match=None):
except Exception as err:
pytest.fail(f"Got unexpected exception type {type(err).__name__} instead of {expected_exception.__name__}: {err}")
# Some of the TTL tests below can be very slow on DynamoDB, or even in
# Scylla in some configurations. The following two fixtures allow tests to
# run when they can be run reasonably quickly, but be skipped otherwise -
# unless the "--runveryslow" option is passed to pytest. This is similar
# to our @pytest.mark.veryslow, but the latter is a blunter instrument.
# The waits_for_expiration fixture indicates that this test waits for expired
# items to be deleted. This is always very slow on AWS, and whether it is
# very slow on Scylla or reasonably fast depends on the
# alternator_ttl_period_in_seconds configuration - test/alternator/run sets
# it very low, but Scylla may have been run manually.
# Fixture for tests that wait for expired items to actually be deleted.
# Such waits are always very slow on AWS (requires --runveryslow), and on
# Scylla are fast only if alternator_ttl_period_in_seconds is low enough.
@pytest.fixture(scope="session")
def waits_for_expiration(dynamodb, request):
    if is_aws(dynamodb):
        if not request.config.getoption('runveryslow'):
            pytest.skip('need --runveryslow option to run')
        # The Scylla-only configuration table queried below does not exist
        # on AWS, so don't try to read it there.
        return
    config_table = dynamodb.Table('.scylla.alternator.system.config')
    resp = config_table.query(
        KeyConditionExpression='#key=:val',
        ExpressionAttributeNames={'#key': 'name'},
        ExpressionAttributeValues={':val': 'alternator_ttl_period_in_seconds'})
    # Be consistent with the alternator_ttl_period_in_seconds fixture:
    # if the configuration item is missing, skip instead of crashing.
    if not 'Items' in resp:
        pytest.skip('missing TTL feature, skipping test')
    period = float(resp['Items'][0]['value'])
    if period > 1 and not request.config.getoption('runveryslow'):
        pytest.skip('need --runveryslow option to run')
# The veryslow_on_aws says that this test is very slow on AWS, but
# always reasonably fast on Scylla. If fastness on Scylla requires a
# specific setting of alternator_ttl_period_in_seconds, don't use this
# fixture - use the above waits_for_expiration instead.
@pytest.fixture(scope="session")
def veryslow_on_aws(dynamodb, request):
    """Skip the requesting test when running against AWS without --runveryslow.

    Marks tests that are very slow on DynamoDB but always reasonably fast
    on Scylla, regardless of configuration.
    """
    if is_aws(dynamodb):
        if not request.config.getoption('runveryslow'):
            pytest.skip('need --runveryslow option to run')
# Test the DescribeTimeToLive operation on a table where the time-to-live
# feature was *not* enabled.
def test_describe_ttl_without_ttl(test_table):
@@ -110,12 +143,11 @@ def test_ttl_disable_errors(dynamodb):
client.update_time_to_live(TableName=table.name,
TimeToLiveSpecification={'AttributeName': 'expiration', 'Enabled': False})
# Test *successful* disabling of TTL. This is an extremely slow test, as
# DynamoDB refuses to disable TTL if it was enabled in the last half hour
# Test *successful* disabling of TTL. This is an extremely slow test on AWS,
# as DynamoDB refuses to disable TTL if it was enabled in the last half hour
# (the documentation suggests a full hour, but in practice it seems 30
# minutes).
@pytest.mark.veryslow
def test_ttl_disable(dynamodb):
# minutes). But on Scylla it is currently almost instantaneous.
def test_ttl_disable(dynamodb, veryslow_on_aws):
with new_test_table(dynamodb,
KeySchema=[ { 'AttributeName': 'p', 'KeyType': 'HASH' }, ],
AttributeDefinitions=[ { 'AttributeName': 'p', 'AttributeType': 'S' } ]) as table:
@@ -312,12 +344,12 @@ def test_ttl_expiration(dynamodb):
# such a table as well. This test is simpler than test_ttl_expiration because
# all we want to check is that the deletion works - not the expiration time
# parsing and semantics.
@pytest.mark.veryslow
def test_ttl_expiration_with_rangekey(dynamodb):
def test_ttl_expiration_with_rangekey(dynamodb, waits_for_expiration):
# Note that unlike test_ttl_expiration, this test doesn't have a fixed
# duration - it finishes as soon as the item we expect to be expired
# is expired.
max_duration = 1200 if is_aws(dynamodb) else 10
max_duration = 1200 if is_aws(dynamodb) else 120
sleep = 30 if is_aws(dynamodb) else 0.1
with new_test_table(dynamodb,
KeySchema=[ { 'AttributeName': 'p', 'KeyType': 'HASH' },
{ 'AttributeName': 'c', 'KeyType': 'RANGE' } ],
@@ -344,7 +376,7 @@ def test_ttl_expiration_with_rangekey(dynamodb):
if ('Item' in table.get_item(Key={'p': p1, 'c': c1})) and not ('Item' in table.get_item(Key={'p': p2, 'c': c2})):
# p2 expired, p1 didn't. We're done
break
time.sleep(max_duration/15.0)
time.sleep(sleep)
assert 'Item' in table.get_item(Key={'p': p1, 'c': c1})
assert not 'Item' in table.get_item(Key={'p': p2, 'c': c2})
@@ -352,10 +384,10 @@ def test_ttl_expiration_with_rangekey(dynamodb):
# expiration-time attribute *may* be the hash or range key attributes.
# So let's test that this indeed works and items indeed expire
# Just like test_ttl_expiration() above, these tests are extremely slow
# because DynamoDB delays expiration by around 10 minutes.
@pytest.mark.veryslow
def test_ttl_expiration_hash(dynamodb):
duration = 1200 if is_aws(dynamodb) else 10
# on AWS because DynamoDB delays expiration by around 10 minutes.
def test_ttl_expiration_hash(dynamodb, waits_for_expiration):
max_duration = 1200 if is_aws(dynamodb) else 120
sleep = 30 if is_aws(dynamodb) else 0.1
with new_test_table(dynamodb,
KeySchema=[ { 'AttributeName': 'p', 'KeyType': 'HASH' }, ],
AttributeDefinitions=[ { 'AttributeName': 'p', 'AttributeType': 'N' } ]) as table:
@@ -373,7 +405,7 @@ def test_ttl_expiration_hash(dynamodb):
def check_expired():
return not 'Item' in table.get_item(Key={'p': p1}) and 'Item' in table.get_item(Key={'p': p2})
while time.time() < start_time + duration:
while time.time() < start_time + max_duration:
print(f"--- {int(time.time()-start_time)} seconds")
if 'Item' in table.get_item(Key={'p': p1}):
print("p1 alive")
@@ -381,13 +413,13 @@ def test_ttl_expiration_hash(dynamodb):
print("p2 alive")
if check_expired():
break
time.sleep(duration/15)
time.sleep(sleep)
# After the delay, p2 should be alive, p1 should not
assert check_expired()
@pytest.mark.veryslow
def test_ttl_expiration_range(dynamodb):
duration = 1200 if is_aws(dynamodb) else 10
def test_ttl_expiration_range(dynamodb, waits_for_expiration):
max_duration = 1200 if is_aws(dynamodb) else 120
sleep = 30 if is_aws(dynamodb) else 0.1
with new_test_table(dynamodb,
KeySchema=[ { 'AttributeName': 'p', 'KeyType': 'HASH' }, { 'AttributeName': 'c', 'KeyType': 'RANGE' } ],
AttributeDefinitions=[ { 'AttributeName': 'p', 'AttributeType': 'S' }, { 'AttributeName': 'c', 'AttributeType': 'N' } ]) as table:
@@ -406,7 +438,7 @@ def test_ttl_expiration_range(dynamodb):
def check_expired():
return not 'Item' in table.get_item(Key={'p': p, 'c': c1}) and 'Item' in table.get_item(Key={'p': p, 'c': c2})
while time.time() < start_time + duration:
while time.time() < start_time + max_duration:
print(f"--- {int(time.time()-start_time)} seconds")
if 'Item' in table.get_item(Key={'p': p, 'c': c1}):
print("c1 alive")
@@ -414,7 +446,7 @@ def test_ttl_expiration_range(dynamodb):
print("c2 alive")
if check_expired():
break
time.sleep(duration/15)
time.sleep(sleep)
# After the delay, c2 should be alive, c1 should not
assert check_expired()
@@ -442,7 +474,7 @@ def test_ttl_expiration_hash_wrong_type(dynamodb):
print(f"--- {int(time.time()-start_time)} seconds")
if 'Item' in table.get_item(Key={'p': p1}):
print("p1 alive")
time.sleep(duration/15)
time.sleep(duration/30)
# After the delay, p2 should be alive, p1 should not
assert 'Item' in table.get_item(Key={'p': p1})
@@ -453,13 +485,13 @@ def test_ttl_expiration_hash_wrong_type(dynamodb):
# and can test just one expiring item. This also allows us to finish the
# test as soon as the item expires, instead of after a fixed "duration" as
# in the above tests.
@pytest.mark.veryslow
def test_ttl_expiration_gsi_lsi(dynamodb):
def test_ttl_expiration_gsi_lsi(dynamodb, waits_for_expiration):
# In our experiments we noticed that when a table has secondary indexes,
# items are expired with significant delay. Whereas a 10 minute delay
# for regular tables was typical, in the table we have here we saw
# a typical delay of 30 minutes before items expired.
max_duration = 3600 if is_aws(dynamodb) else 10
max_duration = 3600 if is_aws(dynamodb) else 120
sleep = 30 if is_aws(dynamodb) else 0.1
with new_test_table(dynamodb,
KeySchema=[
{ 'AttributeName': 'p', 'KeyType': 'HASH' },
@@ -524,7 +556,7 @@ def test_ttl_expiration_gsi_lsi(dynamodb):
# test is done - and successful:
if not base_alive and not gsi_alive and gsi_was_alive and not lsi_alive:
return
time.sleep(max_duration/15)
time.sleep(sleep)
pytest.fail('base, gsi, or lsi not expired')
# Above in test_ttl_expiration_hash() and test_ttl_expiration_range() we
@@ -541,11 +573,11 @@ def test_ttl_expiration_gsi_lsi(dynamodb):
# and make their keys regular attributes - to support adding GSIs after a
# table already exists - so after such a change GSIs will no longer be a
# special case.
@pytest.mark.veryslow
def test_ttl_expiration_lsi_key(dynamodb):
def test_ttl_expiration_lsi_key(dynamodb, waits_for_expiration):
# In my experiments, a 30-minute (1800 seconds) is the typical delay
# for expiration in this test for DynamoDB
max_duration = 3600 if is_aws(dynamodb) else 10
max_duration = 3600 if is_aws(dynamodb) else 120
sleep = 30 if is_aws(dynamodb) else 0.1
with new_test_table(dynamodb,
KeySchema=[
{ 'AttributeName': 'p', 'KeyType': 'HASH' },
@@ -586,7 +618,7 @@ def test_ttl_expiration_lsi_key(dynamodb):
else:
# test is done - and successful:
return
time.sleep(max_duration/15)
time.sleep(max_duration/200)
pytest.fail('base not expired')
# Check that in the DynamoDB Streams API, an event appears about an item
@@ -702,7 +734,7 @@ def read_entire_stream(dynamodbstreams, table):
# in more than one page - so verifies that the expiration scanner service
# does the paging properly.
@pytest.mark.veryslow
def test_ttl_expiration_long(dynamodb):
def test_ttl_expiration_long(dynamodb, waits_for_expiration):
# Write 100*N items to the table, 1/100th of them are expired. The test
# completes when all of them are expired (or on timeout).
# To complicate matters for the paging that we're trying to test,
@@ -730,19 +762,17 @@ def test_ttl_expiration_long(dynamodb):
'p': p, 'c': c, 'expiration': expiration })
start_time = time.time()
while time.time() < start_time + max_duration:
# We could have used Select=COUNT here, but Alternator doesn't
# implement it yet (issue #5058).
count = 0
response = table.scan(ConsistentRead=True)
response = table.scan(ConsistentRead=True, Select='COUNT')
if 'Count' in response:
count += response['Count']
while 'LastEvaluatedKey' in response:
response = table.scan(ExclusiveStartKey=response['LastEvaluatedKey'],
ConsistentRead=True)
ConsistentRead=True, Select='COUNT')
if 'Count' in response:
count += response['Count']
print(count)
if count == 99*N:
break
time.sleep(max_duration/15.0)
time.sleep(max_duration/100.0)
assert count == 99*N

View File

@@ -231,3 +231,10 @@ def scylla_inject_error(rest_api, err, one_shot=False):
finally:
print("Disabling error injection", err)
response = requests.delete(f'{rest_api}/v2/error_injection/injection/{err}')
# Send a message to the Scylla log. E.g., we can write a message to the log
# indicating that a test has started, which will make it easier to see which
# test caused which errors in the log.
def scylla_log(optional_rest_api, message, level):
    """Write a message, at the given level, into Scylla's log via the REST API.

    Does nothing when the REST API is unavailable (optional_rest_api is None),
    so tests can call this unconditionally.
    """
    if not optional_rest_api:
        return
    quoted_message = requests.utils.quote(message)
    requests.post(f'{optional_rest_api}/system/log?message={quoted_message}&level={level}')