From d280517e2772fe454e8544cbd943d07201b382be Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Botond=20D=C3=A9nes?= <bdenes@scylladb.com>
Date: Wed, 15 Apr 2026 13:49:33 +0300
Subject: [PATCH] test/cluster/test_incremental_repair: fix flaky
 do_tablet_incremental_repair_and_ops

The log grep in get_sst_status searched from the beginning of the log
(no from_mark), so the second-repair assertions were checking cumulative
counts across both repairs rather than counts for the second repair alone.

The expected values (sst_add==2, sst_mark==2) relied on this cumulative
behaviour: 1 from the first repair + 1 from the second = 2. This works
when the second repair encounters exactly one unrepaired sstable, but
fails whenever the second repair sees two.

The second repair can see two unrepaired sstables when the 100 keys
inserted before it (via asyncio.gather) trigger a background auto-flush
before take_storage_snapshot runs. take_storage_snapshot always flushes
the memtable itself, so if an auto-flush already split the batch into two
sstables on disk, the second repair's snapshot contains both and logs
"Added sst" twice, making the cumulative count 3 instead of 2.

Fix: take a log mark per-server before each repair call and pass it to
get_sst_status so each check counts only the entries produced by that
repair. The expected values become 1/0/1 and 1/1/1 respectively,
independent of how many sstables happened to exist beforehand.

get_sst_status gains an optional from_mark parameter (default None)
which preserves existing call sites that intentionally grep from the
start of the log.

Fixes: SCYLLADB-1086

Closes scylladb/scylladb#29484
---
 test/cluster/test_incremental_repair.py | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/test/cluster/test_incremental_repair.py b/test/cluster/test_incremental_repair.py
index ba3ad5ef39..7bd277accc 100644
--- a/test/cluster/test_incremental_repair.py
+++ b/test/cluster/test_incremental_repair.py
@@ -44,10 +44,10 @@ def get_sstables(workdir, ks, table):
     sstables = glob.glob(base_pattern)
     return sstables
 
-async def get_sst_status(run, log):
-    sst_add = await log.grep(rf'.*Added sst.*for incremental repair')
-    sst_skip = await log.grep(rf'.*Skipped adding sst.*for incremental repair')
-    sst_mark = await log.grep(rf'.*Marking.*for incremental repair')
+async def get_sst_status(run, log, from_mark=None):
+    sst_add = await log.grep(rf'.*Added sst.*for incremental repair', from_mark=from_mark)
+    sst_skip = await log.grep(rf'.*Skipped adding sst.*for incremental repair', from_mark=from_mark)
+    sst_mark = await log.grep(rf'.*Marking.*for incremental repair', from_mark=from_mark)
     logging.info(f'{run=}: {sst_add=} {sst_skip=} {sst_mark=}');
     logging.info(f'{run=}: {len(sst_add)=} {len(sst_skip)=} {len(sst_mark)=}');
     return sst_add, sst_skip, sst_mark
@@ -292,10 +292,11 @@ async def do_tablet_incremental_repair_and_ops(manager: ManagerClient, ops: str)
     servers, cql, hosts, ks, table_id, logs, repaired_keys, unrepaired_keys, current_key, token = await prepare_cluster_for_incremental_repair(manager, nr_keys, cmdline=['--logger-log-level', 'compaction=debug'])
     token = -1
 
+    marks = [await log.mark() for log in logs]
     await manager.api.tablet_repair(servers[0].ip_addr, ks, "test", token, incremental_mode='incremental')
     # 1 add 0 skip 1 mark
-    for log in logs:
-        sst_add, sst_skip, sst_mark = await get_sst_status("First", log)
+    for log, mark in zip(logs, marks):
+        sst_add, sst_skip, sst_mark = await get_sst_status("First", log, from_mark=mark)
         assert len(sst_add) == 1
         assert len(sst_skip) == 0
         assert len(sst_mark) == 1
@@ -318,13 +319,14 @@ async def do_tablet_incremental_repair_and_ops(manager: ManagerClient, ops: str)
         else:
             assert False # Wrong ops
 
+    marks = [await log.mark() for log in logs]
     await manager.api.tablet_repair(servers[0].ip_addr, ks, "test", token, incremental_mode='incremental')
 
     # 1 add 1 skip 1 mark
-    for log in logs:
-        sst_add, sst_skip, sst_mark = await get_sst_status("Second", log)
-        assert len(sst_add) == 2
-        assert len(sst_mark) == 2
+    for log, mark in zip(logs, marks):
+        sst_add, sst_skip, sst_mark = await get_sst_status("Second", log, from_mark=mark)
+        assert len(sst_add) == 1
+        assert len(sst_mark) == 1
         assert len(sst_skip) == 1
 
 @pytest.mark.asyncio