From 168973622397bc1f8b0c7638be1cbb64ac37dfd5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Botond=20D=C3=A9nes?= <bdenes@scylladb.com>
Date: Tue, 24 Mar 2026 13:39:56 +0200
Subject: [PATCH] =?UTF-8?q?Merge=20'test:=20cluster:=20Deflake=20test=5Fwr?=
 =?UTF-8?q?ite=5Fcl=5Fany=5Fto=5Fdead=5Fnode=5Fgenerates=5Fhints'=20from?=
 =?UTF-8?q?=20Dawid=20M=C4=99drek?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Before these changes, we would send mutations to the node and
immediately query the metrics to see how many hints had been written.
However, that could lead to random failures of the test: even if the
mutations have finished executing, hints are stored asynchronously, so
we don't have a guarantee they have already been processed.

To prevent such failures, we rewrite the check: we will perform multiple
checks against the metrics until we have confirmed that the hints have
indeed been written or we hit the timeout.

We're generous with the timeout: we give the test 60 seconds. That
should be enough time to avoid flakiness even on super slow machines,
and if the test does fail, we will know something is really wrong.

As a bonus, we improve the test in general too. We explicitly express
the preconditions we rely on, as well as bump the log level. If the
test fails in the future, it might be very difficult do debug it
without this additional information.

Fixes SCYLLADB-1133

Backport: The test is present on all supported branches. To avoid
          running into more failures, we should backport these changes
          to them.

Closes scylladb/scylladb#29191

* github.com:scylladb/scylladb:
  test: cluster: Increase log level in test_write_cl_any_to_dead_node_generates_hints
  test: cluster: Await all mutations concurrently in test_write_cl_any_to_dead_node_generates_hints
  test: cluster: Specify min_tablet_count in test_write_cl_any_to_dead_node_generates_hints
  test: cluster: Use new_test_table in test_write_cl_any_to_dead_node_generates_hints
  test: cluster: Introduce auxiliary function keyspace_has_tablets
  test: cluster: Deflake test_write_cl_any_to_dead_node_generates_hints

(cherry picked from commit ffd58ca1f04cbd67beaa1ffb14c373195cd40adb)

Closes scylladb/scylladb#29246
---
 test/cluster/test_hints.py | 46 +++++++++++++++++++++++++-------------
 test/cluster/util.py       | 11 +++++++++
 2 files changed, 41 insertions(+), 16 deletions(-)

diff --git a/test/cluster/test_hints.py b/test/cluster/test_hints.py
index 3ef32e352f..c292211da2 100644
--- a/test/cluster/test_hints.py
+++ b/test/cluster/test_hints.py
@@ -17,10 +17,10 @@ from test.pylib.manager_client import ManagerClient
 from test.pylib.rest_client import ScyllaMetricsClient, TCPRESTClient, inject_error
 from test.pylib.tablets import get_tablet_replicas
 from test.pylib.scylla_cluster import ReplaceConfig
-from test.pylib.util import wait_for
+from test.pylib.util import gather_safely, wait_for
 
 from test.cluster.conftest import skip_mode
-from test.cluster.util import get_topology_coordinator, find_server_by_host_id, new_test_keyspace
+from test.cluster.util import get_topology_coordinator, find_server_by_host_id, keyspace_has_tablets, new_test_keyspace, new_test_table
 
 
 logger = logging.getLogger(__name__)
@@ -52,28 +52,42 @@ async def await_sync_point(client: TCPRESTClient, server_ip: IPAddress, sync_poi
 @pytest.mark.asyncio
 async def test_write_cl_any_to_dead_node_generates_hints(manager: ManagerClient):
     node_count = 2
-    servers = await manager.servers_add(node_count)
+    cmdline = ["--logger-log-level", "hints_manager=trace"]
+    servers = await manager.servers_add(node_count, cmdline=cmdline)
+
+    async def wait_for_hints_written(min_hint_count: int, timeout: int):
+        async def aux():
+            hints_written = await get_hint_metrics(manager.metrics, servers[0].ip_addr, "written")
+            if hints_written >= min_hint_count:
+                return True
+            return None
+        assert await wait_for(aux, time.time() + timeout)
 
     cql = manager.get_cql()
     async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1}") as ks:
-        table = f"{ks}.t"
-        await cql.run_async(f"CREATE TABLE {table} (pk int primary key, v int)")
+        uses_tablets = await keyspace_has_tablets(manager, ks)
+        # If the keyspace uses tablets, let's explicitly require the table to use multiple tablets.
+        # Otherwise, it could happen that all mutations would target servers[0] only, which would
+        # ultimately lead to a test failure here. We rely on the assumption that mutations will be
+        # distributed more or less uniformly!
+        extra_opts = "WITH tablets = {'min_tablet_count': 16}" if uses_tablets else ""
+        async with new_test_table(manager, ks, "pk int PRIMARY KEY, v int", extra_opts) as table:
+            await manager.server_stop_gracefully(servers[1].server_id)
 
-        await manager.server_stop_gracefully(servers[1].server_id)
+            hints_before = await get_hint_metrics(manager.metrics, servers[0].ip_addr, "written")
 
-        hints_before = await get_hint_metrics(manager.metrics, servers[0].ip_addr, "written")
+            stmt = cql.prepare(f"INSERT INTO {table} (pk, v) VALUES (?, ?)")
+            stmt.consistency_level = ConsistencyLevel.ANY
 
-        # Some of the inserts will be targeted to the dead node.
-        # The coordinator doesn't have live targets to send the write to, but it should write a hint.
-        for i in range(100):
-            await cql.run_async(SimpleStatement(f"INSERT INTO {table} (pk, v) VALUES ({i}, {i+1})", consistency_level=ConsistencyLevel.ANY))
+            # Some of the inserts will be targeted to the dead node.
+            # The coordinator doesn't have live targets to send the write to, but it should write a hint.
+            await gather_safely(*[cql.run_async(stmt, (i, i + 1)) for i in range(100)])
 
-        # Verify hints are written
-        hints_after = await get_hint_metrics(manager.metrics, servers[0].ip_addr, "written")
-        assert hints_after > hints_before
+            # Verify hints are written
+            await wait_for_hints_written(hints_before + 1, timeout=60)
 
-        # For dropping the keyspace
-        await manager.server_start(servers[1].server_id)
+            # For dropping the keyspace
+            await manager.server_start(servers[1].server_id)
 
 @pytest.mark.asyncio
 async def test_limited_concurrency_of_writes(manager: ManagerClient):
diff --git a/test/cluster/util.py b/test/cluster/util.py
index 025b69a86f..820dc025cb 100644
--- a/test/cluster/util.py
+++ b/test/cluster/util.py
@@ -614,6 +614,17 @@ async def new_materialized_view(manager: ManagerClient, table, select, pk, where
         await manager.get_cql().run_async(f"DROP MATERIALIZED VIEW {mv}")
 
 
+async def keyspace_has_tablets(manager: ManagerClient, keyspace: str) -> bool:
+    """
+    Checks whether the given keyspace uses tablets.
+    Adapted from its counterpart in the cqlpy test: cqlpy/util.py::keyspace_has_tablets.
+    """
+    cql = manager.get_cql()
+    rows_iter = await cql.run_async(f"SELECT * FROM system_schema.scylla_keyspaces WHERE keyspace_name='{keyspace}'")
+    rows = list(rows_iter)
+    return len(rows) > 0 and getattr(rows[0], "initial_tablets", None) is not None
+
+
 async def get_raft_log_size(cql, host) -> int:
     query = "select count(\"index\") from system.raft"
     return (await cql.run_async(query, host=host))[0][0]