From 297109f6ee51fbc561c282516432bac4633f23f5 Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Thu, 25 Aug 2022 13:08:44 +0300 Subject: [PATCH 1/6] test/alternator: test Alternator TTL metrics This patch adds a test for the metrics generated by the background expiration thread run for Alternator's TTL feature. We test three of the four metrics: scylla_expiration_scan_passes, scylla_expiration_scan_table and scylla_expiration_items_deleted. The fourth metric, scylla_expiration_secondary_ranges_scanned, counts the number of times that this node took over another node's expiration duty. so requires a multi-node cluster to test, and we can't test it in the single-node cluster test framework. To see TTL expiration in action this test may need to wait up to the setting of alternator_ttl_period_in_seconds. For a setting of 1 second (the default set by test/alternator/run), this means this test can take up to 1 second to run. If alternator_ttl_period_in_seconds is set higher, the test is skipped unless --runveryslow is requested. Signed-off-by: Nadav Har'El --- test/alternator/test_metrics.py | 63 +++++++++++++++++++++++++++++++-- 1 file changed, 61 insertions(+), 2 deletions(-) diff --git a/test/alternator/test_metrics.py b/test/alternator/test_metrics.py index 21ba1ec529..f7209367a4 100644 --- a/test/alternator/test_metrics.py +++ b/test/alternator/test_metrics.py @@ -25,10 +25,11 @@ import pytest import requests import re +import time from contextlib import contextmanager from botocore.exceptions import ClientError -from util import random_string, new_test_table +from util import random_string, new_test_table, is_aws # Fixture for checking if we are able to test Scylla metrics. 
Scylla metrics # are not available on AWS (of course), but may also not be available for @@ -96,7 +97,7 @@ def check_increases_metric(metrics, metric_names): yield the_metrics = get_metrics(metrics) for n in metric_names: - assert saved_metrics[n] < get_metric(metrics, n, None, the_metrics) + assert saved_metrics[n] < get_metric(metrics, n, None, the_metrics), f'metric {n} did not increase' @contextmanager def check_increases_operation(metrics, operation_names): @@ -226,6 +227,64 @@ def test_total_operations(dynamodb, metrics): with check_increases_metric(metrics, ['scylla_alternator_total_operations']): dynamodb.meta.client.describe_endpoints() +# A fixture to read alternator-ttl-period-in-seconds from Scylla's +# configuration. If we're testing something which isn't Scylla, or +# this configuration does not exist, skip this test. If the configuration +# isn't low enough (it is more than one second), skip this test unless +# the "--runveryslow" option is used. +@pytest.fixture(scope="session") +def alternator_ttl_period_in_seconds(dynamodb, request): + # If not running on Scylla, skip the test + if is_aws(dynamodb): + pytest.skip('Scylla-only test skipped') + # In Scylla, we can inspect the configuration via a system table + # (which is also visible in Alternator) + config_table = dynamodb.Table('.scylla.alternator.system.config') + resp = config_table.query( + KeyConditionExpression='#key=:val', + ExpressionAttributeNames={'#key': 'name'}, + ExpressionAttributeValues={':val': 'alternator_ttl_period_in_seconds'}) + if not 'Items' in resp: + pytest.skip('missing TTL feature, skipping test') + period = float(resp['Items'][0]['value']) + if period > 1 and not request.config.getoption('runveryslow'): + pytest.skip('need --runveryslow option to run') + return period + +# Test metrics of the background expiration thread run for Alternator's TTL +# feature. 
The metrics tested in this test are scylla_expiration_scan_passes, +# scylla_expiration_scan_table and scylla_expiration_items_deleted. The +# metric scylla_expiration_secondary_ranges_scanned is not tested in this +# test - testing it requires a multi-node cluster because it counts the +# number of times that this node took over another node's expiration duty. +# +# Unfortunately, to see TTL expiration in action this test may need to wait +# up to the setting of alternator_ttl_period_in_seconds. test/alternator/run +# sets this to 1 second, which becomes the maximum delay of this test, but +# if it is set higher we skip this test unless --runveryslow is enabled. +def test_ttl_stats(dynamodb, metrics, alternator_ttl_period_in_seconds): + print(alternator_ttl_period_in_seconds) + with check_increases_metric(metrics, ['scylla_expiration_scan_passes', 'scylla_expiration_scan_table', 'scylla_expiration_items_deleted']): + with new_test_table(dynamodb, + KeySchema=[ { 'AttributeName': 'p', 'KeyType': 'HASH' }, ], + AttributeDefinitions=[ { 'AttributeName': 'p', 'AttributeType': 'S' } ]) as table: + # Insert one already-expired item, and then enable TTL: + p0 = random_string() + table.put_item(Item={'p': p0, 'expiration': int(time.time())-60}) + assert 'Item' in table.get_item(Key={'p': p0}) + client = table.meta.client + client.update_time_to_live(TableName=table.name, + TimeToLiveSpecification= {'AttributeName': 'expiration', 'Enabled': True}) + # After alternator_ttl_period_in_seconds, we expect the metrics + # to have increased. Add some time to that, to account for + # extremely overloaded test machines. + start_time = time.time() + while time.time() < start_time + alternator_ttl_period_in_seconds + 60: + if not 'Item' in table.get_item(Key={'p': p0}): + break + time.sleep(0.1) + assert not 'Item' in table.get_item(Key={'p': p0}) + # TODO: there are additional metrics which we don't yet test here. 
At the # time of this writing they are: latency histograms # ({put,get,delete,update}_item_latency, get_records_latency), From 746c4bd9eb657dfc61fec56e6b2d38ead91cf9d6 Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Thu, 25 Aug 2022 15:56:28 +0300 Subject: [PATCH 2/6] test/alternator: skip fewer Alternator TTL tests Most of the Alternator TTL tests are extremely slow on DynamoDB because item expiration may be delayed up to 24 hours (!), and in practice for 10 to 30 minutes. Because of this, we marked most of these tests with the "veryslow" mark, causing them to be skipped by default - unless pytest is given the "--runveryslow" option. The result was that the TTL tests were not run in the normal test runs, which can allow regressions to be introduced (luckily, this hasn't happened). However, this "veryslow" mark was excessive. Many of the tests are very slow only on DynamoDB, but aren't very slow on Scylla. In particular, many of the tests involve waiting for an item to expire, something that happens after the configurable alternator_ttl_period_in_seconds, which is just one second in our tests. So in this patch, we remove the "veryslow" mark from 6 of the Alternator TTL tests, and instead use two new fixtures - waits_for_expiration and veryslow_on_aws - to only skip the test when running on DynamoDB or when alternator_ttl_period_in_seconds is high - but in our usual test environment they will not get skipped. Because 5 of these 6 tests wait for an item to expire, they take one second each and this patch adds 5 seconds to the Alternator test runtime. This is unfortunate (it's more than 25% of the total Alternator test runtime!) but not a disaster, and we plan to reduce this 5 second time further in the following patch, by decreasing the TTL scanning period even further. This patch also increases the timeout of several of these tests, to 120 seconds from the previous 10 seconds. 
As mentioned above, normally, these tests should always finish in alternator_ttl_period_in_seconds (1 second) with a single scan taking less than 0.2 seconds, but in extreme cases of debug builds on overloaded test machines, we saw even 60 seconds being passed, so let's increase the maximum. I also needed to make the sleep time between retries smaller, not a function of the new (unrealistic) timeout. 4 more tests remain "veryslow" (and won't run by default) because they take 5-10 seconds each (e.g., a test which waits to see that an item does *not* get expired, and a test involving writing a lot of data). We should reconsider this in the future - to perhaps run these tests in our normal test runs - but even for now, the 6 extra tests that we start running are a much better protection against regressions than what we had until now. Fixes #11374 Signed-off-by: Nadav Har'El --- test/alternator/test_ttl.py | 100 +++++++++++++++++++++++------------- 1 file changed, 65 insertions(+), 35 deletions(-) diff --git a/test/alternator/test_ttl.py b/test/alternator/test_ttl.py index dadb46f880..748898d014 100644 --- a/test/alternator/test_ttl.py +++ b/test/alternator/test_ttl.py @@ -36,6 +36,39 @@ def passes_or_raises(expected_exception, match=None): except Exception as err: pytest.fail(f"Got unexpected exception type {type(err).__name__} instead of {expected_exception.__name__}: {err}") +# Some of the TTL tests below can be very slow on DynamoDB, or even in +# Scylla in some configurations. The following two fixtures allow tests to +# run when they can be run reasonably quickly, but be skipped otherwise - +# unless the "--runveryslow" option is passed to pytest. This is similar +# to our @pytest.mark.veryslow, but the latter is a blunter instrument. + +# The waits_for_expiration fixture indicates that this test waits for expired +# items to be deleted. 
This is always very slow on AWS, and whether it is +# very slow on Scylla or reasonably fast depends on the +# alternator_ttl_period_in_seconds configuration - test/alternator/run sets +# it very low, but Scylla may have been run manually. +@pytest.fixture(scope="session") +def waits_for_expiration(dynamodb, request): + if is_aws(dynamodb) and not request.config.getoption('runveryslow'): + pytest.skip('need --runveryslow option to run') + config_table = dynamodb.Table('.scylla.alternator.system.config') + resp = config_table.query( + KeyConditionExpression='#key=:val', + ExpressionAttributeNames={'#key': 'name'}, + ExpressionAttributeValues={':val': 'alternator_ttl_period_in_seconds'}) + period = float(resp['Items'][0]['value']) + if period > 1 and not request.config.getoption('runveryslow'): + pytest.skip('need --runveryslow option to run') + +# The veryslow_on_aws says that this test is very slow on AWS, but +# always reasonably fast on Scylla. If fastness on Scylla requires a +# specific setting of alternator_ttl_period_in_seconds, don't use this +# fixture - use the above waits_for_expiration instead. +@pytest.fixture(scope="session") +def veryslow_on_aws(dynamodb, request): + if is_aws(dynamodb) and not request.config.getoption('runveryslow'): + pytest.skip('need --runveryslow option to run') + # Test the DescribeTimeToLive operation on a table where the time-to-live # feature was *not* enabled. def test_describe_ttl_without_ttl(test_table): @@ -110,12 +143,11 @@ def test_ttl_disable_errors(dynamodb): client.update_time_to_live(TableName=table.name, TimeToLiveSpecification={'AttributeName': 'expiration', 'Enabled': False}) -# Test *sucessful* disabling of TTL. This is an extremely slow test, as -# DynamoDB refuses to disable TTL if it was enabled in the last half hour +# Test *sucessful* disabling of TTL. 
This is an extremely slow test on AWS, +# as DynamoDB refuses to disable TTL if it was enabled in the last half hour # (the documentation suggests a full hour, but in practice it seems 30 -# minutes). -@pytest.mark.veryslow -def test_ttl_disable(dynamodb): +# minutes). But on Scylla it is currently almost instantaneous. +def test_ttl_disable(dynamodb, veryslow_on_aws): with new_test_table(dynamodb, KeySchema=[ { 'AttributeName': 'p', 'KeyType': 'HASH' }, ], AttributeDefinitions=[ { 'AttributeName': 'p', 'AttributeType': 'S' } ]) as table: @@ -312,12 +344,12 @@ def test_ttl_expiration(dynamodb): # such a table as well. This test is simpler than test_ttl_expiration because # all we want to check is that the deletion works - not the expiration time # parsing and semantics. -@pytest.mark.veryslow -def test_ttl_expiration_with_rangekey(dynamodb): +def test_ttl_expiration_with_rangekey(dynamodb, waits_for_expiration): # Note that unlike test_ttl_expiration, this test doesn't have a fixed # duration - it finishes as soon as the item we expect to be expired # is expired. - max_duration = 1200 if is_aws(dynamodb) else 10 + max_duration = 1200 if is_aws(dynamodb) else 120 + sleep = 30 if is_aws(dynamodb) else 0.1 with new_test_table(dynamodb, KeySchema=[ { 'AttributeName': 'p', 'KeyType': 'HASH' }, { 'AttributeName': 'c', 'KeyType': 'RANGE' } ], @@ -344,7 +376,7 @@ def test_ttl_expiration_with_rangekey(dynamodb): if ('Item' in table.get_item(Key={'p': p1, 'c': c1})) and not ('Item' in table.get_item(Key={'p': p2, 'c': c2})): # p2 expired, p1 didn't. We're done break - time.sleep(max_duration/15.0) + time.sleep(sleep) assert 'Item' in table.get_item(Key={'p': p1, 'c': c1}) assert not 'Item' in table.get_item(Key={'p': p2, 'c': c2}) @@ -352,10 +384,10 @@ def test_ttl_expiration_with_rangekey(dynamodb): # expiration-time attribute *may* be the hash or range key attributes. 
# So let's test that this indeed works and items indeed expire # Just like test_ttl_expiration() above, these tests are extremely slow -# because DynamoDB delays expiration by around 10 minutes. -@pytest.mark.veryslow -def test_ttl_expiration_hash(dynamodb): - duration = 1200 if is_aws(dynamodb) else 10 +# on AWS because DynamoDB delays expiration by around 10 minutes. +def test_ttl_expiration_hash(dynamodb, waits_for_expiration): + max_duration = 1200 if is_aws(dynamodb) else 120 + sleep = 30 if is_aws(dynamodb) else 0.1 with new_test_table(dynamodb, KeySchema=[ { 'AttributeName': 'p', 'KeyType': 'HASH' }, ], AttributeDefinitions=[ { 'AttributeName': 'p', 'AttributeType': 'N' } ]) as table: @@ -373,7 +405,7 @@ def test_ttl_expiration_hash(dynamodb): def check_expired(): return not 'Item' in table.get_item(Key={'p': p1}) and 'Item' in table.get_item(Key={'p': p2}) - while time.time() < start_time + duration: + while time.time() < start_time + max_duration: print(f"--- {int(time.time()-start_time)} seconds") if 'Item' in table.get_item(Key={'p': p1}): print("p1 alive") @@ -381,13 +413,13 @@ def test_ttl_expiration_hash(dynamodb): print("p2 alive") if check_expired(): break - time.sleep(duration/15) + time.sleep(sleep) # After the delay, p2 should be alive, p1 should not assert check_expired() -@pytest.mark.veryslow -def test_ttl_expiration_range(dynamodb): - duration = 1200 if is_aws(dynamodb) else 10 +def test_ttl_expiration_range(dynamodb, waits_for_expiration): + max_duration = 1200 if is_aws(dynamodb) else 120 + sleep = 30 if is_aws(dynamodb) else 0.1 with new_test_table(dynamodb, KeySchema=[ { 'AttributeName': 'p', 'KeyType': 'HASH' }, { 'AttributeName': 'c', 'KeyType': 'RANGE' } ], AttributeDefinitions=[ { 'AttributeName': 'p', 'AttributeType': 'S' }, { 'AttributeName': 'c', 'AttributeType': 'N' } ]) as table: @@ -406,7 +438,7 @@ def test_ttl_expiration_range(dynamodb): def check_expired(): return not 'Item' in table.get_item(Key={'p': p, 'c': c1}) and 'Item' 
in table.get_item(Key={'p': p, 'c': c2}) - while time.time() < start_time + duration: + while time.time() < start_time + max_duration: print(f"--- {int(time.time()-start_time)} seconds") if 'Item' in table.get_item(Key={'p': p, 'c': c1}): print("c1 alive") @@ -414,7 +446,7 @@ def test_ttl_expiration_range(dynamodb): print("c2 alive") if check_expired(): break - time.sleep(duration/15) + time.sleep(sleep) # After the delay, c2 should be alive, c1 should not assert check_expired() @@ -442,7 +474,7 @@ def test_ttl_expiration_hash_wrong_type(dynamodb): print(f"--- {int(time.time()-start_time)} seconds") if 'Item' in table.get_item(Key={'p': p1}): print("p1 alive") - time.sleep(duration/15) + time.sleep(duration/30) # After the delay, p2 should be alive, p1 should not assert 'Item' in table.get_item(Key={'p': p1}) @@ -453,13 +485,13 @@ def test_ttl_expiration_hash_wrong_type(dynamodb): # and can test just one expiring item. This also allows us to finish the # test as soon as the item expires, instead of after a fixed "duration" as # in the above tests. -@pytest.mark.veryslow -def test_ttl_expiration_gsi_lsi(dynamodb): +def test_ttl_expiration_gsi_lsi(dynamodb, waits_for_expiration): # In our experiments we noticed that when a table has secondary indexes, # items are expired with significant delay. Whereas a 10 minute delay # for regular tables was typical, in the table we have here we saw # a typical delay of 30 minutes before items expired. 
- max_duration = 3600 if is_aws(dynamodb) else 10 + max_duration = 3600 if is_aws(dynamodb) else 120 + sleep = 30 if is_aws(dynamodb) else 0.1 with new_test_table(dynamodb, KeySchema=[ { 'AttributeName': 'p', 'KeyType': 'HASH' }, @@ -524,7 +556,7 @@ def test_ttl_expiration_gsi_lsi(dynamodb): # test is done - and successful: if not base_alive and not gsi_alive and gsi_was_alive and not lsi_alive: return - time.sleep(max_duration/15) + time.sleep(sleep) pytest.fail('base, gsi, or lsi not expired') # Above in test_ttl_expiration_hash() and test_ttl_expiration_range() we @@ -541,11 +573,11 @@ def test_ttl_expiration_gsi_lsi(dynamodb): # and make their keys regular attributes - to support adding GSIs after a # table already exists - so after such a change GSIs will no longer be a # special case. -@pytest.mark.veryslow -def test_ttl_expiration_lsi_key(dynamodb): +def test_ttl_expiration_lsi_key(dynamodb, waits_for_expiration): # In my experiments, a 30-minute (1800 seconds) is the typical delay # for expiration in this test for DynamoDB - max_duration = 3600 if is_aws(dynamodb) else 10 + max_duration = 3600 if is_aws(dynamodb) else 120 + sleep = 30 if is_aws(dynamodb) else 0.1 with new_test_table(dynamodb, KeySchema=[ { 'AttributeName': 'p', 'KeyType': 'HASH' }, @@ -586,7 +618,7 @@ def test_ttl_expiration_lsi_key(dynamodb): else: # test is done - and successful: return - time.sleep(max_duration/15) + time.sleep(max_duration/200) pytest.fail('base not expired') # Check that in the DynamoDB Streams API, an event appears about an item @@ -702,7 +734,7 @@ def read_entire_stream(dynamodbstreams, table): # in more than one page - so verifies that the expiration scanner service # does the paging properly. @pytest.mark.veryslow -def test_ttl_expiration_long(dynamodb): +def test_ttl_expiration_long(dynamodb, waits_for_expiration): # Write 100*N items to the table, 1/100th of them are expired. The test # completes when all of them are expired (or on timeout). 
# To compilicate matter for the paging that we're trying to test, @@ -730,19 +762,17 @@ def test_ttl_expiration_long(dynamodb): 'p': p, 'c': c, 'expiration': expiration }) start_time = time.time() while time.time() < start_time + max_duration: - # We could have used Select=COUNT here, but Alternator doesn't - # implement it yet (issue #5058). count = 0 - response = table.scan(ConsistentRead=True) + response = table.scan(ConsistentRead=True, Select='COUNT') if 'Count' in response: count += response['Count'] while 'LastEvaluatedKey' in response: response = table.scan(ExclusiveStartKey=response['LastEvaluatedKey'], - ConsistentRead=True) + ConsistentRead=True, Select='COUNT') if 'Count' in response: count += response['Count'] print(count) if count == 99*N: break - time.sleep(max_duration/15.0) + time.sleep(max_duration/100.0) assert count == 99*N From e7e9adc519db0d5addfead654be66157e1c40add Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Thu, 25 Aug 2022 17:04:45 +0300 Subject: [PATCH 3/6] alternator,ttl: allow sub-second TTL scanning period, for tests Alternator has the "alternator_ttl_period_in_seconds" parameter for controlling how often the expiration thread looks for expired items to delete. It is usually a very large number of seconds, but for tests to finish quickly, we set it to 1 second. With 1 second expiration latency, test/alternator/test_ttl.py took 5 seconds to run. In this patch, we change the parameter to allow a floating-point number of seconds instead of just an integer. Then, this allows us to halve the TTL period used by tests to 0.5 seconds, and as a result, the run time of test_ttl.py halves to 2.5 seconds. I think this is fast enough for now. I verified that even if I change the period to 0.1, there is no noticable slowdown to other Alternator tests, so 0.5 is definitely safe. 
Signed-off-by: Nadav Har'El --- alternator/ttl.cc | 6 +++--- db/config.hh | 2 +- test/alternator/run | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/alternator/ttl.cc b/alternator/ttl.cc index 9d7afb905c..f2f2b4abe7 100644 --- a/alternator/ttl.cc +++ b/alternator/ttl.cc @@ -769,11 +769,11 @@ future<> expiration_service::run() { // in the next iteration by reducing the scanner's scheduling-group // share (if using a separate scheduling group), or introduce // finer-grain sleeps into the scanning code. - std::chrono::seconds scan_duration(std::chrono::duration_cast(lowres_clock::now() - start)); - std::chrono::seconds period(_db.get_config().alternator_ttl_period_in_seconds()); + std::chrono::milliseconds scan_duration(std::chrono::duration_cast(lowres_clock::now() - start)); + std::chrono::milliseconds period(long(_db.get_config().alternator_ttl_period_in_seconds() * 1000)); if (scan_duration < period) { try { - tlogger.info("sleeping {} seconds until next period", (period - scan_duration).count()); + tlogger.info("sleeping {} seconds until next period", (period - scan_duration).count()/1000.0); co_await seastar::sleep_abortable(period - scan_duration, _abort_source); } catch(seastar::sleep_aborted&) {} } diff --git a/db/config.hh b/db/config.hh index 94e437b595..52c51958db 100644 --- a/db/config.hh +++ b/db/config.hh @@ -358,7 +358,7 @@ public: named_value alternator_write_isolation; named_value alternator_streams_time_window_s; named_value alternator_timeout_in_ms; - named_value alternator_ttl_period_in_seconds; + named_value alternator_ttl_period_in_seconds; named_value abort_on_ebadf; diff --git a/test/alternator/run b/test/alternator/run index 386c6661d3..084db62cbe 100755 --- a/test/alternator/run +++ b/test/alternator/run @@ -43,7 +43,7 @@ def run_alternator_cmd(pid, dir): '--alternator-write-isolation', 'always_use_lwt', '--alternator-streams-time-window-s', '0', '--alternator-timeout-in-ms', '30000', - 
'--alternator-ttl-period-in-seconds', '1', + '--alternator-ttl-period-in-seconds', '0.5', # Allow testing experimental features. Following issue #9467, we need # to add here specific experimental features as they are introduced. # We only list here Alternator-specific experimental features - CQL From b9792ffb0609811b9c98ac5563910d28f4efd9ac Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Mon, 29 Aug 2022 15:59:11 +0300 Subject: [PATCH 4/6] alternator ttl: log warning if scan took too long. Currently, we log at "info" level how much time remained at the end of a full TTL scan until the next scanning period (we sleep for that time). If the scan was slower than the period, we didn't print anything. Let's print a warning in this case - it can be useful for debugging, and also users should know when their desired scan period is not being honored because the full scan is taking longer than the desired scan period. Signed-off-by: Nadav Har'El --- alternator/ttl.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/alternator/ttl.cc b/alternator/ttl.cc index f2f2b4abe7..91b7bfce5f 100644 --- a/alternator/ttl.cc +++ b/alternator/ttl.cc @@ -776,6 +776,8 @@ future<> expiration_service::run() { tlogger.info("sleeping {} seconds until next period", (period - scan_duration).count()/1000.0); co_await seastar::sleep_abortable(period - scan_duration, _abort_source); } catch(seastar::sleep_aborted&) {} + } else { + tlogger.warn("scan took {} seconds, longer than period - not sleeping", scan_duration.count()/1000.0); } } } From a81310e23d5455192a8069441763abc8b1ee82a7 Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Mon, 29 Aug 2022 17:59:28 +0300 Subject: [PATCH 5/6] rest api: add a new /system/log operation Add a new REST API operation, taking a log level and a message, and printing it into the Scylla log. This can be useful when a test wants to mark certain positions in the log (e.g., to see which other log messages we get between the two positions). 
An alternative way to achieve this could have been for the test to write directly into the log file - but an on-disk log file is only one of the logging options that Scylla support, and the approach in this patch allows to add log message regardless of how Scylla keeps the logs. In motivation of this feature is that in the following patch the test/alternator framework will add log messages when starting and ending tests, which can help debug test failures. Signed-off-by: Nadav Har'El --- api/api-doc/system.json | 39 +++++++++++++++++++++++++++++++++++++++ api/system.cc | 10 ++++++++++ 2 files changed, 49 insertions(+) diff --git a/api/api-doc/system.json b/api/api-doc/system.json index 61606ffa6e..8ac40f2879 100644 --- a/api/api-doc/system.json +++ b/api/api-doc/system.json @@ -52,6 +52,45 @@ } ] }, + { + "path":"/system/log", + "operations":[ + { + "method":"POST", + "summary":"Write a message to the Scylla log", + "type":"void", + "nickname":"write_log_message", + "produces":[ + "application/json" + ], + "parameters":[ + { + "name":"message", + "description":"The message to write to the log", + "required":true, + "allowMultiple":false, + "type":"string", + "paramType":"query" + }, + { + "name":"level", + "description":"The logging level to use", + "required":true, + "allowMultiple":false, + "type":"string", + "enum":[ + "error", + "warn", + "info", + "debug", + "trace" + ], + "paramType":"query" + } + ] + } + ] + }, { "path":"/system/drop_sstable_caches", "operations":[ diff --git a/api/system.cc b/api/system.cc index a109705558..8ac5c3ef37 100644 --- a/api/system.cc +++ b/api/system.cc @@ -61,6 +61,16 @@ void set_system(http_context& ctx, routes& r) { return json::json_void(); }); + hs::write_log_message.set(r, [](const_req req) { + try { + logging::log_level level = boost::lexical_cast(std::string(req.get_query_param("level"))); + apilog.log(level, "/system/log: {}", std::string(req.get_query_param("message"))); + } catch (boost::bad_lexical_cast& e) { + throw 
bad_param_exception("Unknown logging level " + req.get_query_param("level")); + } + return json::json_void(); + }); + hs::drop_sstable_caches.set(r, [&ctx](std::unique_ptr req) { apilog.info("Dropping sstable caches"); return ctx.db.invoke_on_all([] (replica::database& db) { From b0371b6bf8f1e80c43967d599dee4dd83db0f71d Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Mon, 29 Aug 2022 18:05:18 +0300 Subject: [PATCH 6/6] test/alternator: insert test names into Scylla logs The output of test/alternator/run ends in Scylla's full log file, where it is hard to understand which log messages are related to which test. In this patch, we add a log message (using the new /system/log REST API) every time a test is started and ends. The messages look like this: INFO 2022-08-29 18:07:15,926 [shard 0] api - /system/log: test/alternator: Starting test_ttl.py::test_describe_ttl_without_ttl ... INFO 2022-08-29 18:07:15,930 [shard 0] api - /system/log: test/alternator: Ended test_ttl.py::test_describe_ttl_without_ttl Signed-off-by: Nadav Har'El --- test/alternator/conftest.py | 17 ++++++++++++----- test/alternator/util.py | 7 +++++++ 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/test/alternator/conftest.py b/test/alternator/conftest.py index 1ad938b703..90eaef42f6 100644 --- a/test/alternator/conftest.py +++ b/test/alternator/conftest.py @@ -13,7 +13,7 @@ import pytest import boto3 import requests import re -from util import create_test_table, is_aws +from util import create_test_table, is_aws, scylla_log # When tests are run with HTTPS, the server often won't have its SSL # certificate signed by a known authority. So we will disable certificate @@ -112,7 +112,8 @@ def dynamodbstreams(request): # that the server is still alive - and if not report the test which crashed # it and stop running any more tests. 
@pytest.fixture(scope="function", autouse=True) -def dynamodb_test_connection(dynamodb, request): +def dynamodb_test_connection(dynamodb, request, optional_rest_api): + scylla_log(optional_rest_api, f'test/alternator: Starting {request.node.parent.name}::{request.node.name}', 'info') yield try: # We want to run a do-nothing DynamoDB command. The health-check @@ -122,6 +123,7 @@ def dynamodb_test_connection(dynamodb, request): assert response.ok except: pytest.exit(f"Scylla appears to have crashed in test {request.node.parent.name}::{request.node.name}") + scylla_log(optional_rest_api, f'test/alternator: Ended {request.node.parent.name}::{request.node.name}', 'info') # "test_table" fixture: Create and return a temporary table to be used in tests @@ -275,9 +277,14 @@ def test_table_s_forbid_rmw(dynamodb, scylla_only): # the test using this fixture will be skipped with a message about the REST # API not being available. @pytest.fixture(scope="session") -def rest_api(dynamodb): +def rest_api(dynamodb, optional_rest_api): + if optional_rest_api is None: + pytest.skip('Cannot connect to Scylla REST API') + return optional_rest_api +@pytest.fixture(scope="session") +def optional_rest_api(dynamodb): if is_aws(dynamodb): - pytest.skip('Scylla-only REST API not supported by AWS') + return None url = dynamodb.meta.client._endpoint.host # The REST API is on port 10000, and always http, not https. 
url = re.sub(r':[0-9]+(/|$)', ':10000', url) @@ -287,5 +294,5 @@ def rest_api(dynamodb): try: requests.get(f'{url}/column_family/name/keyspace', timeout=1).raise_for_status() except: - pytest.skip('Cannot connect to Scylla REST API') + return None return url diff --git a/test/alternator/util.py b/test/alternator/util.py index a7897ec698..08bcb1e702 100644 --- a/test/alternator/util.py +++ b/test/alternator/util.py @@ -231,3 +231,10 @@ def scylla_inject_error(rest_api, err, one_shot=False): finally: print("Disabling error injection", err) response = requests.delete(f'{rest_api}/v2/error_injection/injection/{err}') + +# Send a message to the Scylla log. E.g., we can write a message to the log +# indicating that a test has started, which will make it easier to see which +# test caused which errors in the log. +def scylla_log(optional_rest_api, message, level): + if optional_rest_api: + requests.post(f'{optional_rest_api}/system/log?message={requests.utils.quote(message)}&level={level}')