From 568f20396aedd5ab100c9eedea53dbd189dcbf84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Jadwiszczak?= Date: Tue, 7 Apr 2026 15:45:20 +0200 Subject: [PATCH] test: fix flaky test_create_index_synchronous_updates trace event race The test_create_index_synchronous_updates test in test_secondary_index_properties.py was intermittently failing with 'assert found_wanted_trace' because the expected trace event 'Forcing ... view update to be synchronous' was missing from the trace events returned by get_query_trace(). Root cause: trace events are written asynchronously to system_traces.events. The Python driver's populate() method considers a trace complete once the session row in system_traces.sessions has duration IS NOT NULL, then reads events exactly once. Since the session row and event rows are written as separate mutations with no transactional guarantee, the driver can read an incomplete set of events. Evidence from the failed CI run logs: - The entire test (CREATE TABLE through DROP TABLE) completed in ~300ms (01:38:54,859 - 01:38:55,157) - The INSERT with tracing happened in a ~50ms window between the second CREATE INDEX completing (01:38:55,108) and DROP TABLE starting (01:38:55,157) - The 'Forcing ... synchronous' trace message is generated during the INSERT write path (db/view/view.cc:2061), so it was produced, but not yet flushed to system_traces.events when the driver read them - This matches the known limitation documented in test/alternator/ test_tracing.py: 'we have no way to know whether the tracing events returned is the entire trace' Fix: replace the single-shot trace.events read with a retry loop that directly queries system_traces.events until the expected event appears (with a 30s timeout). Use ConsistencyLevel.ONE since system_traces has RF=2 and cqlpy tests run on a single-node cluster. The same race condition pattern exists in test_mv_synchronous_updates in test_materialized_view.py (which this test was modeled after), so the same fix is proactively applied there as well. Fixes SCYLLADB-1314 Closes scylladb/scylladb#29374 --- test/cqlpy/test_materialized_view.py | 35 ++++++++++++++----- test/cqlpy/test_secondary_index_properties.py | 27 ++++++++++---- 2 files changed, 47 insertions(+), 15 deletions(-) diff --git a/test/cqlpy/test_materialized_view.py b/test/cqlpy/test_materialized_view.py index 4a0a00af69..9f9b4a7105 100644 --- a/test/cqlpy/test_materialized_view.py +++ b/test/cqlpy/test_materialized_view.py @@ -366,15 +366,32 @@ def test_mv_synchronous_updates(cql, test_keyspace, scylla_only): unwanted_trace1 = f"Forcing {async_mv} view update to be synchronous" unwanted_trace2 = f"Forcing {async_mv_altered} view update to be synchronous" - wanted_traces_were_found = [False, False] - for event in trace.events: - assert unwanted_trace1 not in event.description - assert unwanted_trace2 not in event.description - if wanted_trace1 in event.description: - wanted_traces_were_found[0] = True - if wanted_trace2 in event.description: - wanted_traces_were_found[1] = True - assert all(wanted_traces_were_found) + # Trace events are written asynchronously to system_traces.events. + # The trace session may be marked complete before all events are + # flushed, so the driver's get_query_trace() can return an + # incomplete set of events. Retry reading events directly until + # the expected ones appear. + deadline = time.time() + 30 + while True: + rows = list(cql.execute(SimpleStatement( + f"SELECT activity FROM system_traces.events WHERE session_id = {trace.trace_id}", + consistency_level=ConsistencyLevel.ONE))) + activities = [row.activity for row in rows] + + for activity in activities: + assert unwanted_trace1 not in activity + assert unwanted_trace2 not in activity + + found = [ + any(wanted_trace1 in a for a in activities), + any(wanted_trace2 in a for a in activities), + ] + if all(found): + break + + assert time.time() < deadline, \ + f"Timed out waiting for trace events" + time.sleep(0.1) # Reproduces #8627: # Whereas regular columns values are limited in size to 2GB, key columns are diff --git a/test/cqlpy/test_secondary_index_properties.py b/test/cqlpy/test_secondary_index_properties.py index 8ded73283e..276dceb145 100644 --- a/test/cqlpy/test_secondary_index_properties.py +++ b/test/cqlpy/test_secondary_index_properties.py @@ -6,9 +6,11 @@ import itertools import pytest +import time import uuid from cassandra.protocol import SyntaxException, InvalidRequest, ConfigurationException +from cassandra.query import SimpleStatement, ConsistencyLevel from test.cqlpy.util import new_test_table, unique_name # Verify that creating a named index with simple valid view properties finishes successfully, @@ -266,14 +268,27 @@ def test_create_index_synchronous_updates(cql, test_keyspace, scylla_only): wanted_trace = f"Forcing {test_keyspace}.{s_view_name} view update to be synchronous" unwanted_trace = f"Forcing {test_keyspace}.{as_view_name} view update to be synchronous" - found_wanted_trace = False + # Trace events are written asynchronously to system_traces.events. + # The trace session may be marked complete before all events are + # flushed, so the driver's get_query_trace() can return an incomplete + # set of events. Retry reading events directly until the expected + # one appears. + deadline = time.time() + 30 + while True: + rows = list(cql.execute(SimpleStatement( + f"SELECT activity FROM system_traces.events WHERE session_id = {trace.trace_id}", + consistency_level=ConsistencyLevel.ONE))) + activities = [row.activity for row in rows] - for event in trace.events: - assert unwanted_trace not in event.description - if wanted_trace in event.description: - found_wanted_trace = True + for activity in activities: + assert unwanted_trace not in activity - assert found_wanted_trace + if any(wanted_trace in a for a in activities): + break + + assert time.time() < deadline, \ + f"Timed out waiting for trace event '{wanted_trace}'" + time.sleep(0.1) # Verify that we cannot create an index with CDC enabled. def test_create_index_cdc(cql, test_keyspace, scylla_only):