From 6caaead4aca424d13baf72078f52f5575da7e352 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Chojnowski?= <michocho98@gmail.com>
Date: Mon, 16 Dec 2024 23:06:47 +0100
Subject: [PATCH] cache_algorithm_test: harden against stats being confused by
 background activity

Cache stats are global, so there's no good way to reliably
verify that e.g. a given read causes 0 cache misses,
because something done by Scylla in a background can trigger a cache miss.

This can cause the test to fail spuriously.

With how the test framework and the cache are designed, there's probably
no good way to test this properly. It would require ensuring that cache
stats are per-read, or at least per-table, and that Scylla's background
activity doesn't cause enough memory pressure to evict the tested rows.

This patch tries to deal with the flakiness without deleting the test
altogether by letting it retry after a failure if it notices that it
can be explained by a read which wasn't done by the test.
(Though, if the test can't be written well, maybe it just shouldn't be written...)
---
 test/boost/cache_algorithm_test.cc | 48 ++++++++++++++++++++++++++----
 1 file changed, 42 insertions(+), 6 deletions(-)

diff --git a/test/boost/cache_algorithm_test.cc b/test/boost/cache_algorithm_test.cc
index d505637ae4..09a66a6ca7 100644
--- a/test/boost/cache_algorithm_test.cc
+++ b/test/boost/cache_algorithm_test.cc
@@ -9,6 +9,7 @@
 #include <fmt/ranges.h>
 #include <seastar/testing/test_case.hh>
 #include "test/lib/cql_test_env.hh"
+#include "test/lib/log.hh"
 #include "db/config.hh"
 
 // These tests are slow, and tuned to a particular amount of memory
@@ -82,7 +83,9 @@ SEASTAR_TEST_CASE(test_index_doesnt_flood_cache_in_small_partition_workload) {
         // Flushing makes reasoning easier.
         e.db().invoke_on_all(&replica::database::flush_all_memtables).get();
 
-
+        int retries = 0;
+        constexpr int max_retries = 100;
+    retry:
         constexpr uint64_t hot_subset_size = 1000;
         // Sanity check. The test assumes that the total *hot* index size is significantly bigger than RAM.
         //
@@ -97,18 +100,32 @@ SEASTAR_TEST_CASE(test_index_doesnt_flood_cache_in_small_partition_workload) {
         auto get_misses = [&e] { return e.local_db().row_cache_tracker().get_stats().partition_misses; };
         uint64_t misses_before = get_misses();
         for (size_t i = 0; i < hot_subset_size; ++i) {
-            e.execute_prepared(select_query, {{cql3::raw_value::make_value(make_key(i))}}).get();
-            // Sanity check. If a single query is causing multiple partition misses,
-            // something unexpected is happening.
-            BOOST_REQUIRE_LE(get_misses() - misses_before, i + 1);
+            e.execute_prepared(select_query, {{cql3::raw_value::make_value(make_key(hot_subset_size * retries + i))}}).get();
         }
 
         misses_before = get_misses();
+        uint64_t reads_before = e.local_db().row_cache_tracker().get_stats().reads;
+        uint64_t reads_expected = reads_before;
+
         // The rows we just read have a small total size. They should be perfectly cached.
         for (size_t repeat = 0; repeat < 3; ++repeat) {
             for (size_t i = 0; i < hot_subset_size; ++i) {
-                e.execute_prepared(select_query, {{cql3::raw_value::make_value(make_key(i))}}).get();
+                e.execute_prepared(select_query, {{cql3::raw_value::make_value(make_key(hot_subset_size * retries + i))}}).get();
+                ++reads_expected;
                 // If the rows were perfectly cached, there were no new misses.
+                uint64_t reads_after = e.local_db().row_cache_tracker().get_stats().reads_done;
+                uint64_t misses_after = get_misses();
+                if (get_misses() != misses_before) {
+                    // Cache misses are allowed only if they were done by something outside of the test.
+                    BOOST_REQUIRE_GT(reads_after, reads_expected);
+                    if (retries < max_retries) {
+                        ++retries;
+                        testlog.warn("Detected extra cache misses (actual={}, expected={}, repeat={}, i={}), but they can be explained by extra reads (after={}, before={}, expected={}) done by something in the background, so retrying. (retries={})", misses_after, misses_before, repeat, i, reads_after, reads_before, reads_expected, retries);
+                        goto retry;
+                    } else {
+                        BOOST_FAIL("Test failed due to too much background noise");
+                    }
+                }
                 BOOST_REQUIRE_EQUAL(get_misses(), misses_before);
             }
         }
@@ -168,12 +185,31 @@ SEASTAR_TEST_CASE(test_index_is_cached_in_big_partition_workload) {
             }
         }
 
+        int retries = 0;
+        constexpr int max_retries = 100;
+    retry: 
         // The index is small and used once every few reads, so it should be perfectly cached.
         auto get_misses = [&e] { return e.local_db().row_cache_tracker().get_partition_index_cache_stats().misses; };
         uint64_t misses_before = get_misses();
+        uint64_t reads_before = e.local_db().row_cache_tracker().get_stats().reads;
+        uint64_t reads_expected = reads_before;
         for (size_t ck = 0; ck < ck_number; ++ck) {
             for (size_t pk = 0; pk < pk_number; ++pk) {
                 e.execute_prepared(select_query, {{cql3::raw_value::make_value(make_key(pk))}, {cql3::raw_value::make_value(make_key(ck))}}).get();
+                ++reads_expected;
+            }
+        }
+        uint64_t reads_after = e.local_db().row_cache_tracker().get_stats().reads_done;
+        uint64_t misses_after = get_misses();
+        if (misses_after != misses_before) {
+            // Cache misses are allowed only if they were done by something outside of the test.
+            BOOST_REQUIRE_GT(reads_after, reads_expected);
+            if (retries < max_retries) {
+                ++retries;
+                testlog.warn("Detected extra cache misses (actual={}, expected={}), but they can be explained by extra reads (after={}, before={}, expected={}) done by something in the background, so retrying. (retries={})", misses_after, misses_before, reads_after, reads_before, reads_expected, retries);
+                goto retry;
+            } else {
+                BOOST_FAIL("Test failed due to too much background noise");
             }
         }
         BOOST_REQUIRE_EQUAL(get_misses(), misses_before);