From 568f20396aedd5ab100c9eedea53dbd189dcbf84 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Jadwiszczak?=
 <michal.jadwiszczak@scylladb.com>
Date: Tue, 7 Apr 2026 15:45:20 +0200
Subject: [PATCH] test: fix flaky test_create_index_synchronous_updates trace
 event race

The test_create_index_synchronous_updates test in test_secondary_index_properties.py
was intermittently failing with 'assert found_wanted_trace' because the expected
trace event 'Forcing ... view update to be synchronous' was missing from the
trace events returned by get_query_trace().

Root cause: trace events are written asynchronously to system_traces.events.
The Python driver's populate() method considers a trace complete once the
session row in system_traces.sessions has duration IS NOT NULL, then reads
events exactly once. Since the session row and event rows are written as
separate mutations with no transactional guarantee, the driver can read an
incomplete set of events.

Evidence from the failed CI run logs:
- The entire test (CREATE TABLE through DROP TABLE) completed in ~300ms
  (01:38:54,859 - 01:38:55,157)
- The INSERT with tracing happened in a ~50ms window between the second
  CREATE INDEX completing (01:38:55,108) and DROP TABLE starting
  (01:38:55,157)
- The 'Forcing ... synchronous' trace message is generated during the
  INSERT write path (db/view/view.cc:2061), so it was produced, but
  not yet flushed to system_traces.events when the driver read them
- This matches the known limitation documented in test/alternator/
  test_tracing.py: 'we have no way to know whether the tracing events
  returned is the entire trace'

Fix: replace the single-shot trace.events read with a retry loop that
directly queries system_traces.events until the expected event appears
(with a 30s timeout). Use ConsistencyLevel.ONE since system_traces has
RF=2 and cqlpy tests run on a single-node cluster.

The same race condition pattern exists in test_mv_synchronous_updates in
test_materialized_view.py (which this test was modeled after), so the
same fix is proactively applied there as well.

Fixes SCYLLADB-1314

Closes scylladb/scylladb#29374
---
 test/cqlpy/test_materialized_view.py          | 35 ++++++++++++++-----
 test/cqlpy/test_secondary_index_properties.py | 27 ++++++++++----
 2 files changed, 47 insertions(+), 15 deletions(-)

diff --git a/test/cqlpy/test_materialized_view.py b/test/cqlpy/test_materialized_view.py
index 4a0a00af69..9f9b4a7105 100644
--- a/test/cqlpy/test_materialized_view.py
+++ b/test/cqlpy/test_materialized_view.py
@@ -366,15 +366,32 @@ def test_mv_synchronous_updates(cql, test_keyspace, scylla_only):
             unwanted_trace1 = f"Forcing {async_mv} view update to be synchronous"
             unwanted_trace2 = f"Forcing {async_mv_altered} view update to be synchronous"
 
-            wanted_traces_were_found = [False, False]
-            for event in trace.events:
-                assert unwanted_trace1 not in event.description
-                assert unwanted_trace2 not in event.description
-                if wanted_trace1 in event.description:
-                    wanted_traces_were_found[0] = True
-                if wanted_trace2 in event.description:
-                    wanted_traces_were_found[1] = True
-            assert all(wanted_traces_were_found)
+            # Trace events are written asynchronously to system_traces.events.
+            # The trace session may be marked complete before all events are
+            # flushed, so the driver's get_query_trace() can return an
+            # incomplete set of events. Retry reading events directly until
+            # the expected ones appear.
+            deadline = time.time() + 30
+            while True:
+                rows = list(cql.execute(SimpleStatement(
+                    f"SELECT activity FROM system_traces.events WHERE session_id = {trace.trace_id}",
+                    consistency_level=ConsistencyLevel.ONE)))
+                activities = [row.activity for row in rows]
+
+                for activity in activities:
+                    assert unwanted_trace1 not in activity
+                    assert unwanted_trace2 not in activity
+
+                found = [
+                    any(wanted_trace1 in a for a in activities),
+                    any(wanted_trace2 in a for a in activities),
+                ]
+                if all(found):
+                    break
+
+                assert time.time() < deadline, \
+                    f"Timed out waiting for trace events"
+                time.sleep(0.1)
 
 # Reproduces #8627:
 # Whereas regular columns values are limited in size to 2GB, key columns are
diff --git a/test/cqlpy/test_secondary_index_properties.py b/test/cqlpy/test_secondary_index_properties.py
index 8ded73283e..276dceb145 100644
--- a/test/cqlpy/test_secondary_index_properties.py
+++ b/test/cqlpy/test_secondary_index_properties.py
@@ -6,9 +6,11 @@
 
 import itertools
 import pytest
+import time
 import uuid
 
 from cassandra.protocol import SyntaxException, InvalidRequest, ConfigurationException
+from cassandra.query import SimpleStatement, ConsistencyLevel
 from test.cqlpy.util import new_test_table, unique_name
 
 # Verify that creating a named index with simple valid view properties finishes successfully,
@@ -266,14 +268,27 @@ def test_create_index_synchronous_updates(cql, test_keyspace, scylla_only):
         wanted_trace = f"Forcing {test_keyspace}.{s_view_name} view update to be synchronous"
         unwanted_trace = f"Forcing {test_keyspace}.{as_view_name} view update to be synchronous"
 
-        found_wanted_trace = False
+        # Trace events are written asynchronously to system_traces.events.
+        # The trace session may be marked complete before all events are
+        # flushed, so the driver's get_query_trace() can return an incomplete
+        # set of events. Retry reading events directly until the expected
+        # one appears.
+        deadline = time.time() + 30
+        while True:
+            rows = list(cql.execute(SimpleStatement(
+                f"SELECT activity FROM system_traces.events WHERE session_id = {trace.trace_id}",
+                consistency_level=ConsistencyLevel.ONE)))
+            activities = [row.activity for row in rows]
 
-        for event in trace.events:
-            assert unwanted_trace not in event.description
-            if wanted_trace in event.description:
-                found_wanted_trace = True
+            for activity in activities:
+                assert unwanted_trace not in activity
 
-        assert found_wanted_trace
+            if any(wanted_trace in a for a in activities):
+                break
+
+            assert time.time() < deadline, \
+                f"Timed out waiting for trace event '{wanted_trace}'"
+            time.sleep(0.1)
 
 # Verify that we cannot create an index with CDC enabled.
 def test_create_index_cdc(cql, test_keyspace, scylla_only):