test_refresh_deletes_uploaded_sstables should wait for sstables to get deleted

SSTable unlinking is asynchronous, so in some cases the upload dir may not be empty immediately after refresh is done. This patch adjusts test_refresh_deletes_uploaded_sstables so that it waits, with a timeout, until the upload dir becomes empty instead of assuming the API synchronizes on the sstables being gone.

Fixes SCYLLADB-1190
Signed-off-by: Robert Bindar <robert.bindar@scylladb.com>
Closes scylladb/scylladb#29215
Merge 'test: audit: clean up test helper class naming' from Dario Mirovic
2026-03-26 08:43:14 +03:00 · 2026-03-25 15:30:16 +01:00 · 2026-03-25 13:21:08 +01:00 · 2026-03-25 13:18:37 +01:00 · 2026-03-25 11:45:53 +02:00 · 2026-03-24 23:49:49 +02:00
31 changed files with 1162 additions and 562 deletions
--- a/.github/workflows/trigger-scylla-ci.yaml
+++ b/.github/workflows/trigger-scylla-ci.yaml
@@ -1,4 +1,6 @@
 name: Trigger Scylla CI Route
+permissions:
+  contents: read

 on:
  issue_comment:
--- a/.github/workflows/trigger_jenkins.yaml
+++ b/.github/workflows/trigger_jenkins.yaml
@@ -1,5 +1,8 @@
 name: Trigger next gating

+permissions:
+  contents: read
+
 on:
  push:
    branches:
--- a/ent/encryption/encrypted_file_impl.cc
+++ b/ent/encryption/encrypted_file_impl.cc
@@ -727,7 +727,12 @@ public:

        // now we need one page more to be able to save one for next lap
        auto fill_size = align_up(buf1.size(), block_size) + block_size - buf1.size();
-        auto buf2 = co_await _input.read_exactly(fill_size);
+        // If the underlying stream is already at EOF (e.g. buf1 came from
+        // cached _next while the previous read_exactly drained the source),
+        // skip the read_exactly call — it would return empty anyway.
+        auto buf2 = _input.eof()
+            ? temporary_buffer<char>()
+            : co_await _input.read_exactly(fill_size);

        temporary_buffer<char> output(buf1.size() + buf2.size());

--- a/locator/everywhere_replication_strategy.cc
+++ b/locator/everywhere_replication_strategy.cc
@@ -42,7 +42,14 @@ void everywhere_replication_strategy::validate_options(const gms::feature_servic

 sstring everywhere_replication_strategy::sanity_check_read_replicas(const effective_replication_map& erm, const host_id_vector_replica_set& read_replicas) const {
    const auto replication_factor = erm.get_replication_factor();
-    if (read_replicas.size() > replication_factor) {
+    if (const auto& topo_info = erm.get_token_metadata().get_topology_change_info(); topo_info && topo_info->read_new) {
+        if (read_replicas.size() > replication_factor + 1) {
+            return seastar::format(
+                    "everywhere_replication_strategy: the number of replicas for everywhere_replication_strategy is {}, "
+                    "cannot be higher than replication factor {} + 1 during the 'read from new replicas' stage of a topology change",
+                    read_replicas.size(), replication_factor);
+        }
+    } else if (read_replicas.size() > replication_factor) {
        return seastar::format("everywhere_replication_strategy: the number of replicas for everywhere_replication_strategy is {}, cannot be higher than replication factor {}", read_replicas.size(), replication_factor);
    }
    return {};
--- a/pgo/pgo.py
+++ b/pgo/pgo.py
@@ -15,6 +15,7 @@ from typing import Any, Optional
 import asyncio
 import contextlib
 import glob
+import hashlib
 import json
 import logging
 import os
@@ -364,12 +365,14 @@ async def start_node(executable: PathLike, cluster_workdir: PathLike, addr: str,
    llvm_profile_file = f"{addr}-%m.profraw"
    scylla_workdir = f"{addr}"
    logfile = f"{addr}.log"
+    socket = maintenance_socket_path(cluster_workdir, addr)
    command = [
        "env",
        f"LLVM_PROFILE_FILE={llvm_profile_file}",
        f"SCYLLA_HOME={os.path.realpath(os.getcwd())}", # We assume that the script has Scylla's `conf/` as its filesystem neighbour.
        os.path.realpath(executable),
        f"--workdir={scylla_workdir}",
+        f"--maintenance-socket={socket}",
        "--ring-delay-ms=0",
        "--developer-mode=yes",
        "--memory=1G",
@@ -391,6 +394,7 @@ async def start_node(executable: PathLike, cluster_workdir: PathLike, addr: str,
        f"--authenticator=PasswordAuthenticator",
        f"--authorizer=CassandraAuthorizer",
    ] + list(extra_opts)
+    training_logger.info(f"Using maintenance socket {socket}")
    return await run(['bash', '-c', fr"""exec {shlex.join(command)} >{q(logfile)} 2>&1"""], cwd=cluster_workdir)

 async def start_cluster(executable: PathLike, addrs: list[str], cpusets: Optional[list[str]], workdir: PathLike, cluster_name: str, extra_opts: list[str]) -> list[Process]:
@@ -433,16 +437,25 @@ async def start_cluster(executable: PathLike, addrs: list[str], cpusets: Optiona
            procs.append(proc)
            await wait_for_node(proc, addrs[i], timeout)
    except:
-        await stop_cluster(procs, addrs)
+        await stop_cluster(procs, addrs, cluster_workdir=workdir)
        raise
    return procs

-async def stop_cluster(procs: list[Process], addrs: list[str]) -> None:
+async def stop_cluster(procs: list[Process], addrs: list[str], cluster_workdir: PathLike) -> None:
    """Stops a Scylla cluster started with start_cluster().
    Doesn't return until all nodes exit, even if stop_cluster() is cancelled.

    """
    await clean_gather(*[cancel_process(p, timeout=60) for p in procs])
+    _cleanup_short_sockets(cluster_workdir, addrs)
+
+def _cleanup_short_sockets(cluster_workdir: PathLike, addrs: list[str]) -> None:
+    """Remove short maintenance socket files created in /tmp."""
+    for addr in addrs:
+        try:
+            os.unlink(maintenance_socket_path(cluster_workdir, addr))
+        except OSError:
+            pass

 async def wait_for_port(addr: str, port: int) -> None:
    await bash(fr'until printf "" >>/dev/tcp/{addr}/{port}; do sleep 0.1; done 2>/dev/null')
@@ -453,12 +466,17 @@ async def merge_profraw(directory: PathLike) -> None:
        await bash(fr"llvm-profdata merge {q(directory)}/*.profraw -output {q(directory)}/prof.profdata")

 def maintenance_socket_path(cluster_workdir: PathLike, addr: str) -> str:
-    """Returns the absolute path of the maintenance socket for a given node.
+    """Return the maintenance socket path for a node.

-    With ``maintenance_socket: workdir`` in scylla.yaml the socket lives at
-    ``<node-workdir>/cql.m``, i.e. ``<cluster_workdir>/<addr>/cql.m``.
+    Returns a short deterministic path in /tmp (derived from an MD5 hash of
+    the natural ``<cluster_workdir>/<addr>/cql.m`` path) to stay within the
+    Unix domain socket length limit.
+    The same path is passed to Scylla via ``--maintenance-socket`` in
+    ``start_node()``.
    """
-    return os.path.realpath(f"{cluster_workdir}/{addr}/cql.m")
+    natural = os.path.realpath(f"{cluster_workdir}/{addr}/cql.m")
+    path_hash = hashlib.md5(natural.encode()).hexdigest()[:12]
+    return os.path.join(tempfile.gettempdir(), f'pgo-{path_hash}.m')

 async def setup_cassandra_user(workdir: PathLike, addr: str) -> None:
    """Create the ``cassandra`` superuser via the maintenance socket.
@@ -525,7 +543,7 @@ async def with_cluster(executable: PathLike, workdir: PathLike, cpusets: Optiona
        yield addrs, procs
    finally:
        training_logger.info(f"Stopping the cluster in {workdir}")
-        await stop_cluster(procs, addrs)
+        await stop_cluster(procs, addrs, cluster_workdir=workdir)
        training_logger.info(f"Stopped the cluster in {workdir}")

 ################################################################################
--- a/pgo/profiles/x86_64/profile.profdata.xz
+++ b/pgo/profiles/x86_64/profile.profdata.xz
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:34a0955d2c5a88e18ddab0f1df085e10a17e14129c3e21de91e4f27ef949b6c4
-size 6502668
+oid sha256:d424ce6cc7f65338c34dd35881d23f5ad3425651d66e47dc2c3a20dc798848d4
+size 6598648
--- a/replica/logstor/segment_manager.cc
+++ b/replica/logstor/segment_manager.cc
@@ -1622,14 +1622,14 @@ future<> segment_manager_impl::do_recovery(replica::database& db) {
    size_t next_file_id = 0;
    for (auto file_id : found_file_ids) {
        if (file_id != next_file_id) {
-            throw std::runtime_error(fmt::format("Missing log segment file(s) detected during recovery: file {} missing", _file_mgr.get_file_path(next_file_id)));
+            throw std::runtime_error(fmt::format("Missing log segment file(s) detected during recovery: file {} missing", _file_mgr.get_file_path(next_file_id).string()));
        }
        next_file_id++;
    }

    // populate index from all segments. keep the latest record for each key.
    for (auto file_id : found_file_ids) {
-        logstor_logger.info("Recovering segments from file {}: {}%", _file_mgr.get_file_path(file_id), (file_id + 1) * 100 / found_file_ids.size());
+        logstor_logger.info("Recovering segments from file {}: {}%", _file_mgr.get_file_path(file_id).string(), (file_id + 1) * 100 / found_file_ids.size());
        co_await max_concurrent_for_each(segments_in_file(file_id), 32,
            [this, &db] (log_segment_id seg_id) {
                return recover_segment(db, seg_id);
--- a/replica/table.cc
+++ b/replica/table.cc
@@ -4860,13 +4860,14 @@ table::query(schema_ptr query_schema,
    }

    std::optional<full_position> last_pos;
-    if (querier_opt && querier_opt->current_position()) {
-        last_pos.emplace(*querier_opt->current_position());
-    }
-
-    if (!saved_querier || (querier_opt && !querier_opt->are_limits_reached() && !qs.builder.is_short_read())) {
-        co_await querier_opt->close();
-        querier_opt = {};
+    if (querier_opt) {
+        if (querier_opt->current_position()) {
+            last_pos.emplace(*querier_opt->current_position());
+        }
+        if (!saved_querier || (!querier_opt->are_limits_reached() && !qs.builder.is_short_read())) {
+            co_await querier_opt->close();
+            querier_opt = {};
+        }
    }
    if (saved_querier) {
        *saved_querier = std::move(querier_opt);
--- a/rust/CMakeLists.txt
+++ b/rust/CMakeLists.txt
@@ -87,6 +87,11 @@ target_include_directories(wasmtime_bindings
 target_link_libraries(wasmtime_bindings
  INTERFACE Rust::rust_combined)
 if (Scylla_USE_PRECOMPILED_HEADER_USE)
+  # The PCH from scylla-precompiled-header is compiled with Seastar's compile
+  # flags, including sanitizer flags in Debug/Sanitize modes. Any target reusing
+  # this PCH must have matching compile options, otherwise the compiler rejects
+  # the PCH due to flag mismatch (e.g., -fsanitize=address).
+  target_link_libraries(wasmtime_bindings PRIVATE Seastar::seastar)
  target_precompile_headers(wasmtime_bindings REUSE_FROM scylla-precompiled-header)
 endif()

@@ -108,5 +113,6 @@ target_include_directories(inc
 target_link_libraries(inc
  INTERFACE Rust::rust_combined)
 if (Scylla_USE_PRECOMPILED_HEADER_USE)
+  target_link_libraries(inc PRIVATE Seastar::seastar)
  target_precompile_headers(inc REUSE_FROM scylla-precompiled-header)
 endif()
--- a/service/storage_service.cc
+++ b/service/storage_service.cc
@@ -910,7 +910,7 @@ future<> storage_service::merge_topology_snapshot(raft_snapshot snp) {
                    frozen_muts_to_apply.push_back(co_await freeze_gently(mut));
                } else {
                    co_await for_each_split_mutation(std::move(mut), max_size, [&] (mutation m) -> future<> {
-                        frozen_muts_to_apply.push_back(co_await freeze_gently(mut));
+                        frozen_muts_to_apply.push_back(co_await freeze_gently(m));
                    });
                }
            }
--- a/test.py
+++ b/test.py
@@ -181,7 +181,7 @@ def parse_cmd_line() -> argparse.Namespace:
                        help="Run only tests for given build mode(s)")
    parser.add_argument('--repeat', action="store", default="1", type=int,
                        help="number of times to repeat test execution")
-    parser.add_argument('--timeout', action="store", default="24000", type=int,
+    parser.add_argument('--timeout', action="store", default="3600", type=int,
                        help="timeout value for single test execution")
    parser.add_argument('--session-timeout', action="store", default="24000", type=int,
                        help="timeout value for test.py/pytest session execution")
--- a/test/boost/encrypted_file_test.cc
+++ b/test/boost/encrypted_file_test.cc
@@ -23,8 +23,11 @@
 #include "test/lib/tmpdir.hh"
 #include "test/lib/random_utils.hh"
 #include "test/lib/exception_utils.hh"
+#include "test/lib/limiting_data_source.hh"
 #include "utils/io-wrappers.hh"

+#include <seastar/util/memory-data-source.hh>
+
 using namespace encryption;

 static tmpdir dir;
@@ -595,6 +598,113 @@ SEASTAR_TEST_CASE(test_encrypted_data_source_simple) {
    co_await test_random_data_source(sizes);
 }

+// Reproduces the production deadlock where encrypted SSTable component downloads
+// got stuck during restore. The encrypted_data_source::get() caches a block in
+// _next, then on the next call bypasses input_stream::read()'s _eof check and
+// calls input_stream::read_exactly() — which does NOT check _eof when _buf is
+// empty. This causes a second get() on the underlying source after EOS.
+//
+// In production the underlying source was chunked_download_source whose get()
+// hung forever. Here we simulate it with a strict source that fails the test.
+//
+// The fix belongs in seastar's input_stream::read_exactly(): check _eof before
+// calling _fd.get(), consistent with read(), read_up_to(), and consume().
+static future<> test_encrypted_source_copy(size_t plaintext_size) {
+    testlog.info("test_encrypted_source_copy: plaintext_size={}", plaintext_size);
+
+    key_info info{"AES/CBC", 256};
+    auto k = ::make_shared<symmetric_key>(info);
+
+    // Step 1: Encrypt the plaintext into memory buffers
+    auto plaintext = generate_random<char>(plaintext_size);
+    std::vector<temporary_buffer<char>> encrypted_bufs;
+    {
+        data_sink sink(make_encrypted_sink(create_memory_sink(encrypted_bufs), k));
+        co_await sink.put(plaintext.clone());
+        co_await sink.close();
+    }
+
+    // Flatten encrypted buffers into a single contiguous buffer
+    size_t encrypted_total = 0;
+    for (const auto& b : encrypted_bufs) {
+        encrypted_total += b.size();
+    }
+    temporary_buffer<char> encrypted(encrypted_total);
+    size_t pos = 0;
+    for (const auto& b : encrypted_bufs) {
+        std::copy(b.begin(), b.end(), encrypted.get_write() + pos);
+        pos += b.size();
+    }
+
+    // Step 2: Create a data source from the encrypted data that fails on
+    // post-EOS get() — simulating a source like chunked_download_source
+    // that would hang forever in this situation.
+    class strict_memory_source final : public limiting_data_source_impl {
+        bool _eof = false;
+    public:
+        strict_memory_source(temporary_buffer<char> data, size_t chunk_size)
+            : limiting_data_source_impl(
+                data_source(std::make_unique<util::temporary_buffer_data_source>(std::move(data))),
+                chunk_size) {}
+
+        future<temporary_buffer<char>> get() override {
+            BOOST_REQUIRE_MESSAGE(!_eof,
+                "get() called on source after it already returned EOS — "
+                "this is the production deadlock: read_exactly() does not "
+                "check _eof before calling _fd.get()");
+            auto buf = co_await limiting_data_source_impl::get();
+            _eof = buf.empty();
+            co_return buf;
+        }
+    };
+
+    // Step 3: Wrap in encrypted_data_source and drain via consume() —
+    // the exact code path used by seastar::copy() which is what
+    // sstables_loader_helpers::download_sstable() calls.
+    // Try multiple chunk sizes to hit different alignment scenarios.
+    for (size_t chunk_size : {1ul, 7ul, 4096ul, 8192ul, encrypted_total, encrypted_total + 1}) {
+        if (chunk_size == 0) continue;
+        auto src = data_source(make_encrypted_source(
+            data_source(std::make_unique<strict_memory_source>(encrypted.clone(), chunk_size)), k));
+        auto in = input_stream<char>(std::move(src));
+
+        // consume() is what seastar::copy() uses internally. It calls
+        // encrypted_data_source::get() via _fd.get() until EOF.
+        size_t total_decrypted = 0;
+        co_await in.consume([&total_decrypted](temporary_buffer<char> buf) {
+            total_decrypted += buf.size();
+            return make_ready_future<consumption_result<char>>(continue_consuming{});
+        });
+        co_await in.close();
+
+        BOOST_REQUIRE_EQUAL(total_decrypted, plaintext_size);
+    }
+}
+
+SEASTAR_TEST_CASE(test_encrypted_source_copy_8k) {
+    co_await test_encrypted_source_copy(8192);
+}
+
+SEASTAR_TEST_CASE(test_encrypted_source_copy_4k) {
+    co_await test_encrypted_source_copy(4096);
+}
+
+SEASTAR_TEST_CASE(test_encrypted_source_copy_small) {
+    co_await test_encrypted_source_copy(100);
+}
+
+SEASTAR_TEST_CASE(test_encrypted_source_copy_12k) {
+    co_await test_encrypted_source_copy(12288);
+}
+
+SEASTAR_TEST_CASE(test_encrypted_source_copy_unaligned) {
+    co_await test_encrypted_source_copy(8193);
+}
+
+SEASTAR_TEST_CASE(test_encrypted_source_copy_1byte) {
+    co_await test_encrypted_source_copy(1);
+}
+

 SEASTAR_TEST_CASE(test_encrypted_data_source_fuzzy) {
    std::mt19937_64 rand_gen(std::random_device{}());
--- a/test/cluster/dtest/audit_test.py
+++ b/test/cluster/dtest/audit_test.py
--- a/test/cluster/test_config.yaml
+++ b/test/cluster/test_config.yaml
@@ -44,6 +44,7 @@ run_in_dev:
  - dtest/bypass_cache_test
  - dtest/auth_roles_test
  - dtest/audit_test
+  - audit/test_audit
  - dtest/commitlog_test
  - dtest/cfid_test
  - dtest/rebuild_test
--- a/test/cluster/test_hints.py
+++ b/test/cluster/test_hints.py
@@ -17,9 +17,9 @@ from test.pylib.manager_client import ManagerClient
 from test.pylib.rest_client import ScyllaMetricsClient, TCPRESTClient, inject_error
 from test.pylib.tablets import get_tablet_replicas
 from test.pylib.scylla_cluster import ReplaceConfig
-from test.pylib.util import wait_for
+from test.pylib.util import gather_safely, wait_for

-from test.cluster.util import get_topology_coordinator, find_server_by_host_id, new_test_keyspace
+from test.cluster.util import get_topology_coordinator, find_server_by_host_id, keyspace_has_tablets, new_test_keyspace, new_test_table


 logger = logging.getLogger(__name__)
@@ -51,28 +51,42 @@ async def await_sync_point(client: TCPRESTClient, server_ip: IPAddress, sync_poi
@pytest.mark.asyncio
 async def test_write_cl_any_to_dead_node_generates_hints(manager: ManagerClient):
    node_count = 2
-    servers = await manager.servers_add(node_count)
+    cmdline = ["--logger-log-level", "hints_manager=trace"]
+    servers = await manager.servers_add(node_count, cmdline=cmdline)
+
+    async def wait_for_hints_written(min_hint_count: int, timeout: int):
+        async def aux():
+            hints_written = await get_hint_metrics(manager.metrics, servers[0].ip_addr, "written")
+            if hints_written >= min_hint_count:
+                return True
+            return None
+        assert await wait_for(aux, time.time() + timeout)

    cql = manager.get_cql()
    async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1}") as ks:
-        table = f"{ks}.t"
-        await cql.run_async(f"CREATE TABLE {table} (pk int primary key, v int)")
+        uses_tablets = await keyspace_has_tablets(manager, ks)
+        # If the keyspace uses tablets, let's explicitly require the table to use multiple tablets.
+        # Otherwise, it could happen that all mutations would target servers[0] only, which would
+        # ultimately lead to a test failure here. We rely on the assumption that mutations will be
+        # distributed more or less uniformly!
+        extra_opts = "WITH tablets = {'min_tablet_count': 16}" if uses_tablets else ""
+        async with new_test_table(manager, ks, "pk int PRIMARY KEY, v int", extra_opts) as table:
+            await manager.server_stop_gracefully(servers[1].server_id)

-        await manager.server_stop_gracefully(servers[1].server_id)
+            hints_before = await get_hint_metrics(manager.metrics, servers[0].ip_addr, "written")

-        hints_before = await get_hint_metrics(manager.metrics, servers[0].ip_addr, "written")
+            stmt = cql.prepare(f"INSERT INTO {table} (pk, v) VALUES (?, ?)")
+            stmt.consistency_level = ConsistencyLevel.ANY

-        # Some of the inserts will be targeted to the dead node.
-        # The coordinator doesn't have live targets to send the write to, but it should write a hint.
-        for i in range(100):
-            await cql.run_async(SimpleStatement(f"INSERT INTO {table} (pk, v) VALUES ({i}, {i+1})", consistency_level=ConsistencyLevel.ANY))
+            # Some of the inserts will be targeted to the dead node.
+            # The coordinator doesn't have live targets to send the write to, but it should write a hint.
+            await gather_safely(*[cql.run_async(stmt, (i, i + 1)) for i in range(100)])

-        # Verify hints are written
-        hints_after = await get_hint_metrics(manager.metrics, servers[0].ip_addr, "written")
-        assert hints_after > hints_before
+            # Verify hints are written
+            await wait_for_hints_written(hints_before + 1, timeout=60)

-        # For dropping the keyspace
-        await manager.server_start(servers[1].server_id)
+            # For dropping the keyspace
+            await manager.server_start(servers[1].server_id)

@pytest.mark.asyncio
 async def test_limited_concurrency_of_writes(manager: ManagerClient):
--- a/test/cluster/test_incremental_repair.py
+++ b/test/cluster/test_incremental_repair.py
@@ -151,7 +151,7 @@ async def trigger_tablet_merge(manager, servers, logs):
    await s1_log.wait_for('Detected tablet merge for table', from_mark=s1_mark)
    await inject_error_off(manager, "tablet_force_tablet_count_decrease", servers)

-async def preapre_cluster_for_incremental_repair(manager, nr_keys = 100 , cmdline = []):
+async def prepare_cluster_for_incremental_repair(manager, nr_keys = 100 , cmdline = []):
    servers, cql, hosts, ks, table_id = await create_table_insert_data_for_repair(manager, nr_keys=nr_keys, cmdline=cmdline)
    repaired_keys = set(range(0, nr_keys))
    unrepaired_keys = set()
@@ -164,7 +164,7 @@ async def preapre_cluster_for_incremental_repair(manager, nr_keys = 100 , cmdlin

@pytest.mark.asyncio
 async def test_tablet_repair_sstable_skipped_read_metrics(manager: ManagerClient):
-    servers, cql, hosts, ks, table_id, logs, _, _, _, token = await preapre_cluster_for_incremental_repair(manager)
+    servers, cql, hosts, ks, table_id, logs, _, _, _, token = await prepare_cluster_for_incremental_repair(manager)

    await insert_keys(cql, ks, 0, 100)

@@ -274,7 +274,7 @@ async def test_tablet_incremental_repair_error(manager: ManagerClient):

 async def do_tablet_incremental_repair_and_ops(manager: ManagerClient, ops: str):
    nr_keys = 100
-    servers, cql, hosts, ks, table_id, logs, repaired_keys, unrepaired_keys, current_key, token = await preapre_cluster_for_incremental_repair(manager, nr_keys)
+    servers, cql, hosts, ks, table_id, logs, repaired_keys, unrepaired_keys, current_key, token = await prepare_cluster_for_incremental_repair(manager, nr_keys, cmdline=['--logger-log-level', 'compaction=debug'])
    token = -1

    await manager.api.tablet_repair(servers[0].ip_addr, ks, "test", token, incremental_mode='incremental')
@@ -335,7 +335,7 @@ async def test_tablet_incremental_repair_and_major(manager: ManagerClient):
@pytest.mark.asyncio
 async def test_tablet_incremental_repair_and_minor(manager: ManagerClient):
    nr_keys = 100
-    servers, cql, hosts, ks, table_id, logs, repaired_keys, unrepaired_keys, current_key, token = await preapre_cluster_for_incremental_repair(manager, nr_keys)
+    servers, cql, hosts, ks, table_id, logs, repaired_keys, unrepaired_keys, current_key, token = await prepare_cluster_for_incremental_repair(manager, nr_keys)

    # Disable autocompaction
    for server in servers:
@@ -381,7 +381,7 @@ async def test_tablet_incremental_repair_and_minor(manager: ManagerClient):

 async def do_test_tablet_incremental_repair_with_split_and_merge(manager, do_split, do_merge):
    nr_keys = 100
-    servers, cql, hosts, ks, table_id, logs, repaired_keys, unrepaired_keys, current_key, token = await preapre_cluster_for_incremental_repair(manager, nr_keys)
+    servers, cql, hosts, ks, table_id, logs, repaired_keys, unrepaired_keys, current_key, token = await prepare_cluster_for_incremental_repair(manager, nr_keys)

    # First repair
    await manager.api.tablet_repair(servers[0].ip_addr, ks, "test", token, incremental_mode='incremental') # sstables_repaired_at 1
@@ -442,7 +442,7 @@ async def test_tablet_incremental_repair_with_merge(manager: ManagerClient):
 async def test_tablet_incremental_repair_existing_and_repair_produced_sstable(manager: ManagerClient):
    nr_keys = 100
    cmdline = ["--hinted-handoff-enabled", "0"]
-    servers, cql, hosts, ks, table_id, logs, repaired_keys, unrepaired_keys, current_key, token = await preapre_cluster_for_incremental_repair(manager, nr_keys, cmdline)
+    servers, cql, hosts, ks, table_id, logs, repaired_keys, unrepaired_keys, current_key, token = await prepare_cluster_for_incremental_repair(manager, nr_keys, cmdline)

    await manager.server_stop_gracefully(servers[1].server_id)

@@ -466,7 +466,7 @@ async def test_tablet_incremental_repair_existing_and_repair_produced_sstable(ma
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
 async def test_tablet_incremental_repair_merge_higher_repaired_at_number(manager):
    nr_keys = 100
-    servers, cql, hosts, ks, table_id, logs, repaired_keys, unrepaired_keys, current_key, token = await preapre_cluster_for_incremental_repair(manager, nr_keys)
+    servers, cql, hosts, ks, table_id, logs, repaired_keys, unrepaired_keys, current_key, token = await prepare_cluster_for_incremental_repair(manager, nr_keys)

    # First repair
    await manager.api.tablet_repair(servers[0].ip_addr, ks, "test", token, incremental_mode='incremental') # sstables_repaired_at 1
@@ -507,7 +507,7 @@ async def test_tablet_incremental_repair_merge_higher_repaired_at_number(manager
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
 async def test_tablet_incremental_repair_merge_correct_repaired_at_number_after_merge(manager):
    nr_keys = 100
-    servers, cql, hosts, ks, table_id, logs, repaired_keys, unrepaired_keys, current_key, token = await preapre_cluster_for_incremental_repair(manager, nr_keys)
+    servers, cql, hosts, ks, table_id, logs, repaired_keys, unrepaired_keys, current_key, token = await prepare_cluster_for_incremental_repair(manager, nr_keys)

    # First repair
    await manager.api.tablet_repair(servers[0].ip_addr, ks, "test", token, incremental_mode='incremental') # sstables_repaired_at 1
@@ -541,7 +541,7 @@ async def do_test_tablet_incremental_repair_merge_error(manager, error):
    nr_keys = 100
    # Make sure no data commit log replay after force server stop
    cmdline = ['--enable-commitlog', '0']
-    servers, cql, hosts, ks, table_id, logs, repaired_keys, unrepaired_keys, current_key, token = await preapre_cluster_for_incremental_repair(manager, nr_keys, cmdline)
+    servers, cql, hosts, ks, table_id, logs, repaired_keys, unrepaired_keys, current_key, token = await prepare_cluster_for_incremental_repair(manager, nr_keys, cmdline)

    # First repair
    await manager.api.tablet_repair(servers[0].ip_addr, ks, "test", token, incremental_mode='incremental') # sstables_repaired_at 1
@@ -587,7 +587,7 @@ async def test_tablet_incremental_repair_merge_error_in_merge_completion_fiber(m

@pytest.mark.asyncio
 async def test_tablet_repair_with_incremental_option(manager: ManagerClient):
-    servers, cql, hosts, ks, table_id, logs, _, _, _, token = await preapre_cluster_for_incremental_repair(manager)
+    servers, cql, hosts, ks, table_id, logs, _, _, _, token = await prepare_cluster_for_incremental_repair(manager)
    token = -1

    sstables_repaired_at = 0
@@ -632,7 +632,7 @@ async def test_tablet_repair_with_incremental_option(manager: ManagerClient):

@pytest.mark.asyncio
 async def test_incremental_repair_tablet_time_metrics(manager: ManagerClient):
-    servers, _, _, ks, _, _, _, _, _, token = await preapre_cluster_for_incremental_repair(manager)
+    servers, _, _, ks, _, _, _, _, _, token = await prepare_cluster_for_incremental_repair(manager)
    time1 = 0
    time2 = 0

@@ -820,7 +820,7 @@ async def test_repair_sigsegv_with_diff_shard_count(manager: ManagerClient, use_
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
 async def test_tablet_incremental_repair_table_drop_compaction_group_gone(manager: ManagerClient):
    cmdline = ['--logger-log-level', 'repair=debug']
-    servers, cql, hosts, ks, table_id, logs, _, _, _, _ = await preapre_cluster_for_incremental_repair(manager, cmdline=cmdline)
+    servers, cql, hosts, ks, table_id, logs, _, _, _, _ = await prepare_cluster_for_incremental_repair(manager, cmdline=cmdline)

    coord = await get_topology_coordinator(manager)
    coord_serv = await find_server_by_host_id(manager, servers, coord)
--- a/test/cluster/test_multidc.py
+++ b/test/cluster/test_multidc.py
@@ -20,6 +20,7 @@ from cassandra.query import SimpleStatement
 from test.pylib.async_cql import _wrap_future
 from test.pylib.manager_client import ManagerClient
 from test.pylib.random_tables import RandomTables, TextType, Column
+from test.pylib.rest_client import read_barrier
 from test.pylib.util import unique_name
 from test.cluster.conftest import cluster_con

@@ -403,6 +404,7 @@ async def test_arbiter_dc_rf_rack_valid_keyspaces(manager: ManagerClient):
        for task in [*valid_keyspaces, *invalid_keyspaces]:
            _ = tg.create_task(task)

+@pytest.mark.asyncio
 async def test_startup_with_keyspaces_violating_rf_rack_valid_keyspaces(manager: ManagerClient):
    """
    This test verifies that starting a Scylla node fails when there's an RF-rack-invalid keyspace.
@@ -464,22 +466,50 @@ async def test_startup_with_keyspaces_violating_rf_rack_valid_keyspaces(manager:
        for rfs, tablets in valid_keyspaces:
            _ = tg.create_task(create_keyspace(rfs, tablets))

-    await manager.server_stop_gracefully(s1.server_id)
-    await manager.server_update_config(s1.server_id, "rf_rack_valid_keyspaces", "true")
-
+    # Precondition: s1 has rf_rack_valid_keyspaces set to false.
+    # Postcondition: s1 still has rf_rack_valid_keyspaces set to false.
    async def try_fail(rfs: List[int], dc: str, rf: int, rack_count: int):
+        running_servers = await manager.running_servers()
+        should_start = s1.server_id not in [server.server_id for server in running_servers]
+        if should_start:
+            await manager.server_start(s1.server_id)
+
        ks = await create_keyspace(rfs, True)
+        # We need to wait for the new schema to propagate.
+        # Otherwise, it's not clear when the mutation
+        # corresponding to the created keyspace will
+        # arrive at server 1.
+        # It could happen only after the node performs
+        # the check upon start-up, effectively leading
+        # to a successful start-up, which we don't want.
+        # For more context, see issue: SCYLLADB-1137.
+        await read_barrier(manager.api, s1.ip_addr)
+
+        await manager.server_stop_gracefully(s1.server_id)
+        await manager.server_update_config(s1.server_id, "rf_rack_valid_keyspaces", "true")
+
        err = f"The keyspace '{ks}' is required to be RF-rack-valid. " \
              f"That condition is violated for DC '{dc}': RF={rf} vs. rack count={rack_count}."
-        _ = await manager.server_start(s1.server_id, expected_error=err)
+        await manager.server_start(s1.server_id, expected_error=err)
        await cql.run_async(f"DROP KEYSPACE {ks}")

+        await manager.server_update_config(s1.server_id, "rf_rack_valid_keyspaces", "false")
+
    # Test RF-rack-invalid keyspaces.
    await try_fail([2, 0], "dc1", 2, 3)
    await try_fail([3, 2], "dc2", 2, 1)
    await try_fail([4, 1], "dc1", 4, 3)

-    _ = await manager.server_start(s1.server_id)
+    # We need to perform a read barrier on the node to make
+    # sure that it processes the last DROP KEYSPACE.
+    # Otherwise, the node could think the RF-rack-invalid
+    # keyspace still exists.
+    await manager.server_start(s1.server_id)
+    await read_barrier(manager.api, s1.ip_addr)
+    await manager.server_stop_gracefully(s1.server_id)
+
+    await manager.server_update_config(s1.server_id, "rf_rack_valid_keyspaces", "true")
+    await manager.server_start(s1.server_id)

@pytest.mark.asyncio
 async def test_startup_with_keyspaces_violating_rf_rack_valid_keyspaces_but_not_enforced(manager: ManagerClient):
--- a/test/cluster/test_refresh.py
+++ b/test/cluster/test_refresh.py
@@ -23,10 +23,25 @@ from test.cluster.object_store.conftest import format_tuples
 from test.cluster.object_store.test_backup import topo, take_snapshot, do_test_streaming_scopes
 from test.cluster.util import new_test_keyspace
 from test.pylib.rest_client import read_barrier
-from test.pylib.util import unique_name
+from test.pylib.util import unique_name, wait_for

 logger = logging.getLogger(__name__)

+
+async def wait_for_upload_dir_empty(upload_dir, timeout=30):
+    '''
+    Poll until os.listdir(upload_dir) returns no entries; wait_for() fails the test after `timeout` seconds.
+    SSTable unlinking is asynchronous and in rare situations, it can happen
+    that not all sstables are deleted from the upload dir immediately after refresh is done.
+    '''
+    deadline = time.time() + timeout
+    async def check_empty():
+        files = os.listdir(upload_dir)
+        if not files:
+            return True
+        return None  # None makes wait_for() sleep and poll again
+    await wait_for(check_empty, deadline, period=0.5)  # initial poll interval: 0.5s
+
 class SSTablesOnLocalStorage:
    def __init__(self):
        self.tmpdir = f'tmpbackup-{str(uuid.uuid4())}'
@@ -153,7 +168,8 @@ async def test_refresh_deletes_uploaded_sstables(manager: ManagerClient):

        for s in servers:
            cf_dir = dirs[s.server_id]["cf_dir"]
-            files = os.listdir(os.path.join(cf_dir, 'upload'))
-            assert files == [], f'Upload dir not empty on server {s.server_id}: {files}'
+            upload_dir = os.path.join(cf_dir, 'upload')
+            assert os.path.exists(upload_dir)
+            await wait_for_upload_dir_empty(upload_dir)

        shutil.rmtree(tmpbackup)
--- a/test/cluster/test_tombstone_gc.py
+++ b/test/cluster/test_tombstone_gc.py
@@ -196,7 +196,7 @@ async def test_group0_tombstone_gc(manager: ManagerClient):
            tombstone_mark = datetime.now(timezone.utc)

            # test #2: the tombstones are not cleaned up when one node is down
-            with pytest.raises(AssertionError, match="Deadline exceeded"):
+            with pytest.raises(AssertionError, match="timed out"):
                # waiting for shorter time (5s normally enough for a successful case, we expect the timeout here)
                await verify_tombstone_gc(tombstone_mark, timeout=5)

@@ -249,7 +249,7 @@ async def test_group0_tombstone_gc(manager: ManagerClient):
            await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60)

            # test #4a: the tombstones are not cleaned up after both live nodes join the new group0
-            with pytest.raises(AssertionError, match="Deadline exceeded"):
+            with pytest.raises(AssertionError, match="timed out"):
                await verify_tombstone_gc(tombstone_mark, timeout=5)

            await manager.remove_node(servers[0].server_id, down_server.server_id)
--- a/test/cluster/util.py
+++ b/test/cluster/util.py
@@ -165,7 +165,7 @@ async def wait_for_cdc_generations_publishing(cql: Session, hosts: list[Host], d
            unpublished_generations = topo_res[0].unpublished_cdc_generations
            return unpublished_generations is None or len(unpublished_generations) == 0 or None

-        await wait_for(all_generations_published, deadline=deadline, period=1.0)
+        await wait_for(all_generations_published, deadline=deadline)


 async def check_system_topology_and_cdc_generations_v3_consistency(manager: ManagerClient, live_hosts: list[Host], cqls: Optional[list[Session]] = None, ignored_hosts: list[Host] = []):
@@ -470,6 +470,17 @@ async def new_materialized_view(manager: ManagerClient, table, select, pk, where
        await manager.get_cql().run_async(f"DROP MATERIALIZED VIEW {mv}")


+async def keyspace_has_tablets(manager: ManagerClient, keyspace: str) -> bool:
+    """
+    Checks whether the given keyspace uses tablets (its system_schema.scylla_keyspaces row has initial_tablets set).
+    Adapted from its counterpart in the cqlpy test: cqlpy/util.py::keyspace_has_tablets.
+    """
+    cql = manager.get_cql()
+    rows_iter = await cql.run_async(f"SELECT * FROM system_schema.scylla_keyspaces WHERE keyspace_name='{keyspace}'")
+    rows = list(rows_iter)
+    return len(rows) > 0 and getattr(rows[0], "initial_tablets", None) is not None  # False if no row, or initial_tablets is absent/None
+
+
 async def get_raft_log_size(cql, host) -> int:
    query = "select count(\"index\") from system.raft"
    return (await cql.run_async(query, host=host))[0][0]
--- a/test/lib/limiting_data_source.cc
+++ b/test/lib/limiting_data_source.cc
@@ -7,54 +7,42 @@
 */

 #include "limiting_data_source.hh"
-#include <seastar/core/iostream.hh>
-#include <seastar/core/temporary_buffer.hh>
-#include <cstdint>

 using namespace seastar;

-class limiting_data_source_impl final : public data_source_impl {
-    data_source _src;
-    size_t _limit;
-    temporary_buffer<char> _buf;
-    future<temporary_buffer<char>> do_get() {
-        uint64_t size = std::min(_limit, _buf.size());
-        auto res = _buf.share(0, size);
-        _buf.trim_front(size);
-        return make_ready_future<temporary_buffer<char>>(std::move(res));
-    }
-public:
-    limiting_data_source_impl(data_source&& src, size_t limit)
-        : _src(std::move(src))
-        , _limit(limit)
-    {}
+future<temporary_buffer<char>> limiting_data_source_impl::do_get() {
+    uint64_t size = std::min(_limit, _buf.size()); // hand out at most _limit bytes per call
+    auto res = _buf.share(0, size);
+    _buf.trim_front(size); // keep the remainder buffered for the next call
+    return make_ready_future<temporary_buffer<char>>(std::move(res));
+}

-    limiting_data_source_impl(limiting_data_source_impl&&) noexcept = default;
-    limiting_data_source_impl& operator=(limiting_data_source_impl&&) noexcept = default;
+limiting_data_source_impl::limiting_data_source_impl(data_source&& src, size_t limit) : _src(std::move(src)), _limit(limit) {
+}

-    virtual future<temporary_buffer<char>> get() override {
-        if (_buf.empty()) {
-            _buf.release();
-            return _src.get().then([this] (auto&& buf) {
-                _buf = std::move(buf);
-                return do_get();
-            });
-        }
-        return do_get();
-    }
-    virtual future<temporary_buffer<char>> skip(uint64_t n) override {
-        if (n < _buf.size()) {
-            _buf.trim_front(n);
-            return do_get();
-        }
-        n -= _buf.size();
+future<temporary_buffer<char>> limiting_data_source_impl::get() {
+    if (_buf.empty()) {
        _buf.release();
-        return _src.skip(n).then([this] (auto&& buf) {
+        return _src.get().then([this](auto&& buf) {
            _buf = std::move(buf);
            return do_get();
        });
    }
-};
+    return do_get();
+}
+
+future<temporary_buffer<char>> limiting_data_source_impl::skip(uint64_t n) {
+    if (n < _buf.size()) { // the skip ends inside the data we already hold
+        _buf.trim_front(n);
+        return do_get();
+    }
+    n -= _buf.size(); // buffered bytes count towards the skip; the rest is skipped in _src
+    _buf.release();
+    return _src.skip(n).then([this](auto&& buf) {
+        _buf = std::move(buf); // whatever _src returns after skipping is still handed out in limited chunks
+        return do_get();
+    });
+}

 data_source make_limiting_data_source(data_source&& src, size_t limit) {
    return data_source{std::make_unique<limiting_data_source_impl>(std::move(src), limit)};
--- a/test/lib/limiting_data_source.hh
+++ b/test/lib/limiting_data_source.hh
@@ -8,13 +8,25 @@

 #pragma once

-#include <stddef.h>
+#include <seastar/core/iostream.hh>
+#include <seastar/core/temporary_buffer.hh>

-namespace seastar {

-class data_source;
+class limiting_data_source_impl : public seastar::data_source_impl {
+    seastar::data_source _src;
+    size_t _limit;
+    seastar::temporary_buffer<char> _buf;
+    seastar::future<seastar::temporary_buffer<char>> do_get();

-}
+public:
+    limiting_data_source_impl(seastar::data_source&& src, size_t limit);
+
+    limiting_data_source_impl(limiting_data_source_impl&&) noexcept = default;
+    limiting_data_source_impl& operator=(limiting_data_source_impl&&) noexcept = default;
+
+    seastar::future<seastar::temporary_buffer<char>> get() override;
+    seastar::future<seastar::temporary_buffer<char>> skip(uint64_t n) override;
+};

 /// \brief Creates an data_source from another data_source but returns its data in chunks not bigger than a given limit
 ///
--- a/test/lib/proc_utils.cc
+++ b/test/lib/proc_utils.cc
@@ -271,10 +271,21 @@ future<std::tuple<tests::proc::process_fixture, int>> tests::proc::start_docker_
            // arbitrary timeout of 120s for the server to make some output. Very generous.
            // but since we (maybe) run docker, and might need to pull image, this can take
            // some time if we're unlucky.
-            co_await with_timeout(std::chrono::steady_clock::now() + 120s, when_all(std::move(out_fut), std::move(err_fut)));
-        } catch (in_use&) {
-            retry = true;
-            p = std::current_exception();
+            auto [f1, f2] = co_await with_timeout(std::chrono::steady_clock::now() + 120s, when_all(std::move(out_fut), std::move(err_fut)));
+            for (auto* f : {&f1, &f2}) {
+                if (f->failed()) {
+                    try {
+                        f->get();
+                    } catch (in_use&) {
+                        retry = true;
+                        p = std::current_exception();
+                    } catch (...) {
+                        if (!p) {
+                            p = std::current_exception();
+                        }
+                    }
+                }
+            }
        } catch (...) {
            p = std::current_exception();
        }
--- a/test/pylib/manager_client.py
+++ b/test/pylib/manager_client.py
@@ -60,6 +60,7 @@ class ManagerClient:
        self.con_gen = con_gen
        self.ccluster: Optional[CassandraCluster] = None
        self.cql: Optional[CassandraSession] = None
+        self.exclusive_clusters: List[CassandraCluster] = []
        # A client for communicating with ScyllaClusterManager (server)
        self.sock_path = sock_path
        self.client_for_asyncio_loop = {asyncio.get_running_loop(): UnixRESTClient(sock_path)}
@@ -113,6 +114,9 @@ class ManagerClient:

    def driver_close(self) -> None:
        """Disconnect from cluster"""
+        for cluster in self.exclusive_clusters:
+            cluster.shutdown()
+        self.exclusive_clusters.clear()
        if self.ccluster is not None:
            logger.debug("shutting down driver")
            safe_driver_shutdown(self.ccluster)
@@ -134,9 +138,12 @@ class ManagerClient:
        hosts = await wait_for_cql_and_get_hosts(cql, servers, time() + 60)
        return cql, hosts

-    async def get_cql_exclusive(self, server: ServerInfo):
-        cql = self.con_gen([server.ip_addr], self.port, self.use_ssl, self.auth_provider,
-                                     WhiteListRoundRobinPolicy([server.ip_addr])).connect()
+    async def get_cql_exclusive(self, server: ServerInfo, auth_provider: Optional[AuthProvider] = None):
+        cluster = self.con_gen([server.ip_addr], self.port, self.use_ssl,
+                               auth_provider if auth_provider else self.auth_provider,
+                               WhiteListRoundRobinPolicy([server.ip_addr]))
+        self.exclusive_clusters.append(cluster)
+        cql = cluster.connect()
        await wait_for_cql_and_get_hosts(cql, [server], time() + 60)
        return cql

--- a/test/pylib/scylla_cluster.py
+++ b/test/pylib/scylla_cluster.py
@@ -1394,7 +1394,11 @@ class ScyllaCluster:
                                   f"the test must drop all keyspaces it creates.")
        for server in itertools.chain(self.running.values(), self.stopped.values()):
            server.write_log_marker(f"------ Ending test {name} ------\n")
-            if not server.log_file.closed:
+            # Only close log files when the cluster is dirty (will be destroyed).
+            # If the cluster is clean and will be reused, keep the log file open
+            # so that write_log_marker() and take_log_savepoint() work in the
+            # next test's before_test().
+            if self.is_dirty and not server.log_file.closed:
                server.log_file.close()

    async def server_stop(self, server_id: ServerNum, gracefully: bool) -> None:
--- a/test/pylib/util.py
+++ b/test/pylib/util.py
@@ -56,15 +56,25 @@ def unique_name(unique_name_prefix = 'test_'):
 async def wait_for(
        pred: Callable[[], Awaitable[Optional[T]]],
        deadline: float,
-        period: float = 1,
+        period: float = 0.1,
        before_retry: Optional[Callable[[], Any]] = None,
-        backoff_factor: float = 1,
-        max_period: float = None) -> T:
+        backoff_factor: float = 1.5,
+        max_period: float = 1.0,
+        label: Optional[str] = None) -> T:
+    tag = label or getattr(pred, '__name__', 'unlabeled')
+    start = time.time()
+    retries = 0
    while True:
-        assert(time.time() < deadline), "Deadline exceeded, failing test."
+        elapsed = time.time() - start
+        assert time.time() < deadline, \
+            f"wait_for({tag}) timed out after {elapsed:.2f}s ({retries} retries)"
        res = await pred()
        if res is not None:
+            if retries > 0:
+                logger.debug(f"wait_for({tag}) completed "
+                            f"in {elapsed:.2f}s ({retries} retries)")
            return res
+        retries += 1
        await asyncio.sleep(period)
        period *= backoff_factor
        if max_period is not None:
@@ -273,14 +283,14 @@ async def wait_for_view_v1(cql: Session, name: str, node_count: int, timeout: in
        done = await cql.run_async(f"SELECT COUNT(*) FROM system_distributed.view_build_status WHERE status = 'SUCCESS' AND view_name = '{name}' ALLOW FILTERING")
        return done[0][0] == node_count or None
    deadline = time.time() + timeout
-    await wait_for(view_is_built, deadline)
+    await wait_for(view_is_built, deadline, label=f"view_v1_{name}")

 async def wait_for_view(cql: Session, name: str, node_count: int, timeout: int = 120):
    async def view_is_built():
        done = await cql.run_async(f"SELECT COUNT(*) FROM system.view_build_status_v2 WHERE status = 'SUCCESS' AND view_name = '{name}' ALLOW FILTERING")
        return done[0][0] == node_count or None
    deadline = time.time() + timeout
-    await wait_for(view_is_built, deadline)
+    await wait_for(view_is_built, deadline, label=f"view_{name}")


 async def wait_for_first_completed(coros: list[Coroutine], timeout: int|None = None):
--- a/test/scylla_gdb/gdb_utils.py
+++ b/test/scylla_gdb/gdb_utils.py
@@ -4,7 +4,7 @@
 """
 GDB helper functions for `scylla_gdb` tests.
 They should be loaded to GDB by "-x {dir}/gdb_utils.py}",
-when loaded, they can be run in gdb e.g. `python get_sstables()`
+when loaded, they can be run in gdb e.g. `$get_sstables()`

 Depends on helper functions injected to GDB by `scylla-gdb.py` script.
 (sharded, for_each_table, seastar_lw_shared_ptr, find_sstables, find_vptrs, resolve,
@@ -15,39 +15,65 @@ import gdb
 import uuid


-def get_schema():
-    """Execute GDB commands to get schema information."""
-    db = sharded(gdb.parse_and_eval('::debug::the_database')).local()
-    table = next(for_each_table(db))
-    ptr = seastar_lw_shared_ptr(table['_schema']).get()
-    print('schema=', ptr)
+class get_schema(gdb.Function):
+    """GDB function `$get_schema()`: finds and returns the schema pointer of the first table found."""
+
+    def __init__(self):
+        super(get_schema, self).__init__('get_schema')  # name used to call it from GDB
+
+    def invoke(self):
+        db = sharded(gdb.parse_and_eval('::debug::the_database')).local()
+        table = next(for_each_table(db))  # any table will do; take the first one
+        return seastar_lw_shared_ptr(table['_schema']).get()


-def get_sstables():
-    """Execute GDB commands to get sstables information."""
-    sst = next(find_sstables())
-    print(f"sst=(sstables::sstable *)", sst)
+class get_sstable(gdb.Function):
+    """GDB function `$get_sstable()`: finds and returns the first sstable pointer yielded by find_sstables()."""
+
+    def __init__(self):
+        super(get_sstable, self).__init__('get_sstable')  # name used to call it from GDB
+
+    def invoke(self):
+        return next(find_sstables())


-def get_task():
+class get_task(gdb.Function):
    """
-    Some commands need a task to work on. The following fixture finds one.
+    Finds and returns a Scylla fiber task.
    Because we stopped Scylla while it was idle, we don't expect to find
    any ready task with get_local_tasks(), but we can find one with a
    find_vptrs() loop. I noticed that a nice one (with multiple tasks chained
    to it for "scylla fiber") is one from http_server::do_accept_one.
    """
-    for obj_addr, vtable_addr in find_vptrs():
-        name = resolve(vtable_addr, startswith='vtable for seastar::continuation')
-        if name and 'do_accept_one' in name:
-            print(f"task={obj_addr.cast(gdb.lookup_type('uintptr_t'))}")
-            break
+    def __init__(self):
+        super(get_task, self).__init__('get_task')
+
+    def invoke(self):
+        for obj_addr, vtable_addr in find_vptrs():
+            name = resolve(vtable_addr, startswith='vtable for seastar::continuation')
+            if name and 'do_accept_one' in name:
+                return obj_addr.cast(gdb.lookup_type('uintptr_t'))


-def get_coroutine():
-    """Similar to get_task(), but looks for a coroutine frame."""
-    target = 'service::topology_coordinator::run() [clone .resume]'
-    for obj_addr, vtable_addr in find_vptrs():
-        name = resolve(vtable_addr)
-        if name and name.strip() == target:
-            print(f"coroutine_config={obj_addr.cast(gdb.lookup_type('uintptr_t'))}")
+class get_coroutine(gdb.Function):
+    """
+    Finds and returns the address of the topology_coordinator::run() coroutine frame as uintptr_t.
+    Prints COROUTINE_NOT_FOUND if the coroutine is not present.
+    """
+    def __init__(self):
+        super(get_coroutine, self).__init__('get_coroutine')
+
+    def invoke(self):
+        target = 'service::topology_coordinator::run() [clone .resume]'
+        for obj_addr, vtable_addr in find_vptrs():
+            name = resolve(vtable_addr)
+            if name and name.strip() == target:
+                return obj_addr.cast(gdb.lookup_type('uintptr_t'))
+        print("COROUTINE_NOT_FOUND")  # marker that test_coroutine_frame looks for in stdout to skip
+
+
+# Register the functions in GDB
+get_schema()
+get_sstable()
+get_task()
+get_coroutine()
--- a/test/scylla_gdb/test_schema_commands.py
+++ b/test/scylla_gdb/test_schema_commands.py
@@ -7,7 +7,6 @@ Each only checks that the command does not fail - but not what it does or return
 """

 import pytest
-import re

 from test.scylla_gdb.conftest import execute_gdb_command

@@ -23,20 +22,6 @@ pytestmark = [
    ),
 ]

-
-@pytest.fixture(scope="module")
-def schema(gdb_cmd):
-    """
-    Returns pointer to schema of the first table it finds
-    Even without any user tables, we will always have system tables.
-    """
-    result = execute_gdb_command(gdb_cmd, full_command="python get_schema()").stdout
-    match = re.search(r"schema=\s*(0x[0-9a-fA-F]+)", result)
-    schema_pointer = match.group(1) if match else None
-
-    return schema_pointer
-
-
@pytest.mark.parametrize(
    "command",
    [
@@ -45,21 +30,17 @@ def schema(gdb_cmd):
        "schema (const schema *)",  # `schema` requires type-casted pointer
    ],
 )
-def test_schema(gdb_cmd, command, schema):
-    assert schema, "Failed to find schema of any table"
-
-    result = execute_gdb_command(gdb_cmd, f"{command} {schema}")
+def test_schema(gdb_cmd, command):
+    result = execute_gdb_command(gdb_cmd, f"{command} $get_schema()")
    assert result.returncode == 0, (
        f"GDB command {command} failed. stdout: {result.stdout} stderr: {result.stderr}"
    )


-def test_generate_object_graph(gdb_cmd, schema, request):
-    assert schema, "Failed to find schema of any table"
-
+def test_generate_object_graph(gdb_cmd, request):
    tmpdir = request.config.getoption("--tmpdir")
    result = execute_gdb_command(
-        gdb_cmd, f"generate-object-graph -o {tmpdir}/og.dot -d 2 -t 10 {schema}"
+        gdb_cmd, f"generate-object-graph -o {tmpdir}/og.dot -d 2 -t 10 $get_schema()"
    )
    assert result.returncode == 0, (
        f"GDB command `generate-object-graph` failed. stdout: {result.stdout} stderr: {result.stderr}"
--- a/test/scylla_gdb/test_sstable_commands.py
+++ b/test/scylla_gdb/test_sstable_commands.py
@@ -7,7 +7,6 @@ Each only checks that the command does not fail - but not what it does or return
 """

 import pytest
-import re

 from test.scylla_gdb.conftest import execute_gdb_command

@@ -24,16 +23,6 @@ pytestmark = [
 ]


-@pytest.fixture(scope="module")
-def sstable(gdb_cmd):
-    """Finds sstable"""
-    result = execute_gdb_command(gdb_cmd, full_command="python get_sstables()").stdout
-    match = re.search(r"(\(sstables::sstable \*\) 0x)([0-9a-f]+)", result)
-    sstable_pointer = match.group(0).strip() if match else None
-
-    return sstable_pointer
-
-
@pytest.mark.parametrize(
    "command",
    [
@@ -41,10 +30,8 @@ def sstable(gdb_cmd):
        "sstable-index-cache",
    ],
 )
-def test_sstable(gdb_cmd, command, sstable):
-    assert sstable, "No sstable was found"
-
-    result = execute_gdb_command(gdb_cmd, f"{command} {sstable}")
+def test_sstable(gdb_cmd, command):
+    result = execute_gdb_command(gdb_cmd, f"{command} $get_sstable()")
    assert result.returncode == 0, (
        f"GDB command {command} failed. stdout: {result.stdout} stderr: {result.stderr}"
    )
--- a/test/scylla_gdb/test_task_commands.py
+++ b/test/scylla_gdb/test_task_commands.py
@@ -6,8 +6,6 @@ Tests for commands, that need a some task to work on.
 Each only checks that the command does not fail - but not what it does or returns.
 """

-import re
-
 import pytest

 from test.scylla_gdb.conftest import execute_gdb_command
@@ -25,59 +23,25 @@ pytestmark = [
 ]


-@pytest.fixture(scope="module")
-def task(gdb_cmd):
-    """
-    Finds a Scylla fiber task using a `find_vptrs()` loop.
-
-    Since Scylla is fresh‑booted, `get_local_tasks()` returns nothing.
-    Nevertheless, a `find_vptrs()` scan can still discover the first task
-    skeleton created by `http_server::do_accept_one` (often the earliest
-    “Scylla fiber” to appear).
-    """
-    result = execute_gdb_command(gdb_cmd, full_command="python get_task()").stdout
-    match = re.search(r"task=(\d+)", result)
-    task = match.group(1) if match else None
-    return task
-
-
-@pytest.fixture(scope="module")
-def coroutine_task(gdb_cmd, scylla_server):
-    """
-    Finds a coroutine task, similar to the `task` fixture.
-
-    This fixture executes the `coroutine_config` script in GDB to locate a
-    specific coroutine task.
-    """
-    result = execute_gdb_command(gdb_cmd, full_command="python get_coroutine()").stdout
-    match = re.search(r"coroutine_config=\s*(.*)", result)
-    if not match:
-        # See https://github.com/scylladb/scylladb/issues/22501
-        pytest.skip("Failed to find coroutine task. Skipping test.")
-
-    return match.group(1).strip()
-
-
-def test_coroutine_frame(gdb_cmd, coroutine_task):
+def test_coroutine_frame(gdb_cmd):
    """
    Offsets the pointer by two words to shift from the outer coroutine frame
    to the inner `seastar::task`, as required by `$coro_frame`, which expects
    a `seastar::task*`.
    """
-    assert coroutine_task, "No coroutine task was found"
-
    result = execute_gdb_command(
-        gdb_cmd, full_command=f"p *$coro_frame({coroutine_task} + 16)"
+        gdb_cmd, full_command="p *$coro_frame($get_coroutine() + 16)"
    )
+    if "COROUTINE_NOT_FOUND" in result.stdout:
+        # See https://github.com/scylladb/scylladb/issues/22501
+        pytest.skip("Failed to find coroutine task. Skipping test.")
    assert result.returncode == 0, (
        f"GDB command `coro_frame` failed. stdout: {result.stdout} stderr: {result.stderr}"
    )


-def test_fiber(gdb_cmd, task):
-    assert task, f"No task was found using `find_vptrs()`"
-
-    result = execute_gdb_command(gdb_cmd, f"fiber {task}")
+def test_fiber(gdb_cmd):
+    result = execute_gdb_command(gdb_cmd, "fiber $get_task()")
    assert result.returncode == 0, (
        f"GDB command `fiber` failed. stdout: {result.stdout} stderr: {result.stderr}"
    )
--- a/utils/directories.cc
+++ b/utils/directories.cc
@@ -10,6 +10,7 @@
 #include <seastar/core/seastar.hh>
 #include <seastar/core/coroutine.hh>
 #include <seastar/coroutine/parallel_for_each.hh>
+#include <seastar/util/closeable.hh>
 #include "init.hh"
 #include "supervisor.hh"
 #include "directories.hh"