db: set_skip_when_empty() for rare error-path metrics

Add .set_skip_when_empty() to four metrics in the db module that are only incremented on very rare error paths and are almost always zero: - cache::pinned_dirty_memory_overload: described as 'should sit constantly at 0, nonzero is indicative of a bug' - corrupt_data::entries_reported: only fires on actual data corruption - hints::corrupted_files: only fires on on-disk hint file corruption - rate_limiter::failed_allocations: only fires when the rate limiter hash table is completely full and gives up allocating, requiring extreme cardinality pressure These metrics create unnecessary reporting overhead when they are perpetually zero. set_skip_when_empty() suppresses them from metrics output until they become non-zero. AI-Assisted: yes Signed-off-by: Yaniv Kaul <yaniv.kaul@scylladb.com>
test: fix flaky test_create_ks_auth by removing bad retry timeout
2026-04-20 08:30:35 +00:00 · 2026-04-06 14:41:40 +03:00 · 2026-04-05 19:13:15 +03:00 · 2026-04-05 16:58:02 +03:00 · 2026-04-05 16:00:15 +03:00 · 2026-04-05 15:07:50 +03:00
33 changed files with 469 additions and 204 deletions
--- a/alternator/server.cc
+++ b/alternator/server.cc
@@ -699,6 +699,17 @@ future<executor::request_return_type> server::handle_api_request(std::unique_ptr
        // for such a size.
        co_return api_error::payload_too_large(fmt::format("Request content length limit of {} bytes exceeded", request_content_length_limit));
    }
+    // Check the concurrency limit early, before acquiring memory and
+    // reading the request body, to avoid piling up memory from excess
+    // requests that will be rejected anyway. This mirrors the CQL
+    // transport which also checks concurrency before memory acquisition
+    // (transport/server.cc).
+    if (_pending_requests.get_count() >= _max_concurrent_requests) {
+        _executor._stats.requests_shed++;
+        co_return api_error::request_limit_exceeded(format("too many in-flight requests (configured via max_concurrent_requests_per_shard): {}", _pending_requests.get_count()));
+    }
+    _pending_requests.enter();
+    auto leave = defer([this] () noexcept { _pending_requests.leave(); });
    // JSON parsing can allocate up to roughly 2x the size of the raw
    // document, + a couple of bytes for maintenance.
    // If the Content-Length of the request is not available, we assume
@@ -760,12 +771,6 @@ future<executor::request_return_type> server::handle_api_request(std::unique_ptr
        _executor._stats.unsupported_operations++;
        co_return api_error::unknown_operation(fmt::format("Unsupported operation {}", op));
    }
-    if (_pending_requests.get_count() >= _max_concurrent_requests) {
-        _executor._stats.requests_shed++;
-        co_return api_error::request_limit_exceeded(format("too many in-flight requests (configured via max_concurrent_requests_per_shard): {}", _pending_requests.get_count()));
-    }
-    _pending_requests.enter();
-    auto leave = defer([this] () noexcept { _pending_requests.leave(); });
    executor::client_state client_state(service::client_state::external_tag(),
        _auth_service, &_sl_controller, _timeout_config.current_values(), req->get_client_address());
    if (!username.empty()) {
--- a/db/corrupt_data_handler.cc
+++ b/db/corrupt_data_handler.cc
@@ -22,7 +22,7 @@ corrupt_data_handler::corrupt_data_handler(register_metrics rm) {
        _metrics.add_group("corrupt_data", {
                sm::make_counter("entries_reported", _stats.corrupt_data_reported,
                               sm::description("Counts the number of corrupt data instances reported to the corrupt data handler. "
-                                               "A non-zero value indicates that the database suffered data corruption."))
+                                                "A non-zero value indicates that the database suffered data corruption.")).set_skip_when_empty()
                });
    }
 }
--- a/db/hints/manager.cc
+++ b/db/hints/manager.cc
@@ -186,7 +186,7 @@ void manager::register_metrics(const sstring& group_name) {
            sm::description("Number of unexpected errors during sending, sending will be retried later")),

        sm::make_counter("corrupted_files", _stats.corrupted_files,
-                        sm::description("Number of hints files that were discarded during sending because the file was corrupted.")),
+                        sm::description("Number of hints files that were discarded during sending because the file was corrupted.")).set_skip_when_empty(),

        sm::make_gauge("pending_drains",
                        sm::description("Number of tasks waiting in the queue for draining hints"),
--- a/db/rate_limiter.cc
+++ b/db/rate_limiter.cc
@@ -206,7 +206,7 @@ void rate_limiter_base::register_metrics() {
                sm::description("Number of times a lookup returned an already allocated entry.")),

        sm::make_counter("failed_allocations", _metrics.failed_allocations,
-                sm::description("Number of times the rate limiter gave up trying to allocate.")),
+                sm::description("Number of times the rate limiter gave up trying to allocate.")).set_skip_when_empty(),

        sm::make_counter("probe_count", _metrics.probe_count,
                sm::description("Number of probes made during lookups.")),
--- a/db/row_cache.cc
+++ b/db/row_cache.cc
@@ -174,7 +174,7 @@ cache_tracker::setup_metrics() {
        sm::make_counter("sstable_reader_recreations", sm::description("number of times sstable reader was recreated due to memtable flush"), _stats.underlying_recreations),
        sm::make_counter("sstable_partition_skips", sm::description("number of times sstable reader was fast forwarded across partitions"), _stats.underlying_partition_skips),
        sm::make_counter("sstable_row_skips", sm::description("number of times sstable reader was fast forwarded within a partition"), _stats.underlying_row_skips),
-        sm::make_counter("pinned_dirty_memory_overload", sm::description("amount of pinned bytes that we tried to unpin over the limit. This should sit constantly at 0, and any number different than 0 is indicative of a bug"), _stats.pinned_dirty_memory_overload),
+        sm::make_counter("pinned_dirty_memory_overload", sm::description("amount of pinned bytes that we tried to unpin over the limit. This should sit constantly at 0, and any number different than 0 is indicative of a bug"), _stats.pinned_dirty_memory_overload).set_skip_when_empty(),
        sm::make_counter("rows_processed_from_memtable", _stats.rows_processed_from_memtable,
            sm::description("total number of rows in memtables which were processed during cache update on memtable flush")),
        sm::make_counter("rows_dropped_from_memtable", _stats.rows_dropped_from_memtable,
--- a/dist/common/scripts/scylla_swap_setup
+++ b/dist/common/scripts/scylla_swap_setup
@@ -9,6 +9,7 @@

 import os
 import sys
+import shlex
 import argparse
 import psutil
 from pathlib import Path
@@ -103,16 +104,41 @@ if __name__ == '__main__':
        run('dd if=/dev/zero of={} bs=1M count={}'.format(swapfile, swapsize_mb), shell=True, check=True)
    swapfile.chmod(0o600)
    run('mkswap -f {}'.format(swapfile), shell=True, check=True)
+
+    mount_point = find_mount_point(swap_directory)
+    mount_unit = out(f'systemd-escape -p --suffix=mount {shlex.quote(str(mount_point))}')
+
+    # Add DefaultDependencies=no to the swap unit to avoid getting the default
+    # Before=swap.target dependency. We apply this to all clouds, but the
+    # requirement came from Azure:
+    #
+    # On Azure, the swap directory is on the Azure ephemeral disk (mounted on /mnt).
+    # However, cloud-init makes this mount (i.e., the mnt.mount unit) depend on
+    # the network (After=network-online.target). By extension, this means that
+    # the swap unit depends on the network. If we didn't use DefaultDependencies=no,
+    # then the swap unit would be part of the swap.target which other services
+    # assume to be a local boot target, so we would end up with dependency cycles
+    # such as:
+    #
+    # swap.target -> mnt-swapfile.swap -> mnt.mount -> network-online.target -> network.target -> systemd-resolved.service -> tmp.mount -> swap.target
+    #
+    # By removing the automatic Before=swap.target, the swap unit is no longer
+    # part of swap.target, avoiding such cycles. The swap will still be
+    # activated via WantedBy=multi-user.target.
    unit_data = '''
 [Unit]
 Description=swapfile
+DefaultDependencies=no
+After={}
+Conflicts=umount.target
+Before=umount.target

 [Swap]
 What={}

 [Install]
 WantedBy=multi-user.target
-'''[1:-1].format(swapfile)
+'''[1:-1].format(mount_unit, swapfile)
    with swapunit.open('w') as f:
        f.write(unit_data)
    systemd_unit.reload()
--- a/pgo/profiles/aarch64/profile.profdata.xz
+++ b/pgo/profiles/aarch64/profile.profdata.xz
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e59fe56eac435fd03c2f0d7dfc11c6998d7c0750e1851535575497dd13d96015
-size 6505524
+oid sha256:54662978b9ce4a6e25790b1b0a5099e6063173ffa95a399a6287cf474376ed09
+size 6595952
--- a/pgo/profiles/x86_64/profile.profdata.xz
+++ b/pgo/profiles/x86_64/profile.profdata.xz
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d424ce6cc7f65338c34dd35881d23f5ad3425651d66e47dc2c3a20dc798848d4
-size 6598648
+oid sha256:0cf44ea1fb2ae20de45d26fe8095054e60cb8700cddcb2fd79ef79705484b18a
+size 6603780
--- a/raft/fsm.cc
+++ b/raft/fsm.cc
@@ -1098,7 +1098,8 @@ std::optional<std::pair<read_id, index_t>> fsm::start_read_barrier(server_id req

    // Make sure that only a leader or a node that is part of the config can request read barrier
    // Nodes outside of the config may never get the data, so they will not be able to read it.
-    if (requester != _my_id && leader_state().tracker.find(requester) == nullptr) {
+    follower_progress* opt_progress = leader_state().tracker.find(requester);
+    if (requester != _my_id && opt_progress == nullptr) {
        throw std::runtime_error(fmt::format("Read barrier requested by a node outside of the configuration {}", requester));
    }

@@ -1109,19 +1110,23 @@ std::optional<std::pair<read_id, index_t>> fsm::start_read_barrier(server_id req
        return {};
    }

+    // Optimization for read barriers requested on non-voters. A non-voter doesn't receive the read_quorum message, so
+    // it might update its commit index only after another leader tick, which would slow down wait_for_apply() at the
+    // end of the read barrier. Prevent that by replicating to the non-voting requester here.
+    if (requester != _my_id && opt_progress->commit_idx < _commit_idx && opt_progress->match_idx == _log.last_idx()
+            && !opt_progress->can_vote) {
+        logger.trace("start_read_barrier[{}]: replicate to {} because follower commit_idx={} < commit_idx={}, "
+                     "follower match_idx={} == last_idx={}, and follower can_vote={}",
+                     _my_id, requester, opt_progress->commit_idx, _commit_idx, opt_progress->match_idx,
+                     _log.last_idx(), opt_progress->can_vote);
+        replicate_to(*opt_progress, true);
+    }
+
    read_id id = next_read_id();
    logger.trace("start_read_barrier[{}] starting read barrier with id {}", _my_id, id);
    return std::make_pair(id, _commit_idx);
 }

-void fsm::maybe_update_commit_idx_for_read(index_t read_idx) {
-    // read_idx from the leader might not be replicated to the local node yet.
-    const bool in_local_log = read_idx <= _log.last_idx();
-    if (in_local_log && log_term_for(read_idx) == get_current_term()) {
-        advance_commit_idx(read_idx);
-    }
-}
-
 void fsm::stop() {
    if (is_leader()) {
        // Become follower to stop accepting requests
--- a/raft/fsm.hh
+++ b/raft/fsm.hh
@@ -480,15 +480,6 @@ public:

    std::optional<std::pair<read_id, index_t>> start_read_barrier(server_id requester);

-    // Update the commit index to the read index (a read barrier result from the leader) if the local entry with the
-    // read index belongs to the current term.
-    //
-    // Satisfying the condition above guarantees that the local log matches the current leader's log up to the read
-    // index (the Log Matching Property), so the current leader won't drop the local entry with the read index.
-    // Moreover, this entry has been committed by the leader, so future leaders also won't drop it (the Leader
-    // Completeness Property). Hence, updating the commit index is safe.
-    void maybe_update_commit_idx_for_read(index_t read_idx);
-
    size_t in_memory_log_size() const {
        return _log.in_memory_size();
    }
--- a/raft/server.cc
+++ b/raft/server.cc
@@ -1571,7 +1571,6 @@ future<> server_impl::read_barrier(seastar::abort_source* as) {
            co_return stop_iteration::no;
        }
        read_idx = std::get<index_t>(res);
-        _fsm->maybe_update_commit_idx_for_read(read_idx);
        co_return stop_iteration::yes;
    });

--- a/service/raft/raft_group0.cc
+++ b/service/raft/raft_group0.cc
@@ -538,6 +538,7 @@ future<> raft_group0::join_group0(std::vector<gms::inet_address> seeds, shared_p
        group0_id = g0_info.group0_id;
        raft::server_address my_addr{my_id, {}};

+        bool starting_server_as_follower = false;
        if (server == nullptr) {
            // This is the first time discovery is run. Create and start a Raft server for group 0 on this node.
            raft::configuration initial_configuration;
@@ -565,6 +566,7 @@ future<> raft_group0::join_group0(std::vector<gms::inet_address> seeds, shared_p
                // trigger an empty snapshot transfer.
                nontrivial_snapshot = true;
            } else {
+                starting_server_as_follower = true;
                co_await handshaker->pre_server_start(g0_info);
            }

@@ -591,7 +593,9 @@ future<> raft_group0::join_group0(std::vector<gms::inet_address> seeds, shared_p
        }

        SCYLLA_ASSERT(server);
-        if (server->get_configuration().contains(my_id)) {
+        co_await utils::get_local_injector().inject("join_group0_pause_before_config_check",
+                utils::wait_for_message(std::chrono::minutes{5}));
+        if (!starting_server_as_follower && server->get_configuration().contains(my_id)) {
            // True if we started a new group or completed a configuration change initiated earlier.
            group0_log.info("server {} already in group 0 (id {}) as {}", my_id, group0_id,
                    server->get_configuration().can_vote(my_id)? "voter" : "non-voter");
--- a/sstables/index_entry.hh
+++ b/sstables/index_entry.hh
@@ -239,11 +239,9 @@ public:
    // The i-th element corresponds to the i-th entry in _entries.
    // Can be smaller than _entries. If _entries[i] doesn't have a matching element in _promoted_indexes then
    // that entry doesn't have a promoted index.
-    // It's not chunked, because promoted index is present only when there are large partitions in the page,
-    // which also means the page will have typically only 1 entry due to summary:data_file size ratio.
    // Kept separately to avoid paying for storage cost in pages where no entry has a promoted index,
    // which is typical in workloads with small partitions.
-    managed_vector<promoted_index> _promoted_indexes;
+    lsa::chunked_managed_vector<promoted_index> _promoted_indexes;
 public:
    partition_index_page() = default;
    partition_index_page(partition_index_page&&) noexcept = default;
--- a/test.py
+++ b/test.py
@@ -11,6 +11,7 @@ from __future__ import annotations

 import argparse
 import asyncio
+import math
 import shlex
 import textwrap
 from random import randint
@@ -73,6 +74,51 @@ PYTEST_RUNNER_DIRECTORIES = [

 launch_time = time.monotonic()

+class ThreadsCalculator:
+    """
+    The ThreadsCalculator class calculates the number of jobs that can be run concurrently based on system
+    memory and CPU constraints. It allows resource reservation and configurable parameters for
+    flexible job scheduling in various modes, such as `debug`.
+    """
+
+    def __init__(self,
+                 modes: list[str],
+                 min_system_memory_reserve: float = 5e9,
+                 max_system_memory_reserve: float = 8e9,
+                 system_memory_reserve_fraction = 16,
+                 max_test_memory: float = 5e9,
+                 test_memory_fraction: float = 8.0,
+                 debug_test_memory_multiplier: float = 1.5,
+                 debug_cpus_per_test_job=1.5,
+                 non_debug_cpus_per_test_job: float =1.0,
+                 non_debug_max_test_memory: float = 4e9
+                 ):
+        sys_mem = int(os.sysconf("SC_PAGE_SIZE") * os.sysconf("SC_PHYS_PAGES"))
+        test_mem = min(sys_mem / test_memory_fraction, max_test_memory)
+        if "debug" in modes:
+            test_mem *= debug_test_memory_multiplier
+        system_memory_reserve = int(min(
+            max(sys_mem / system_memory_reserve_fraction, min_system_memory_reserve),
+            max_system_memory_reserve,
+        ))
+        available_mem = max(0, sys_mem - system_memory_reserve)
+        is_debug = "debug" in modes
+        test_mem = min(
+            sys_mem / test_memory_fraction,
+            max_test_memory if is_debug else non_debug_max_test_memory,
+        )
+        if is_debug:
+            test_mem *= debug_test_memory_multiplier
+        self.cpus_per_test_job = (
+            debug_cpus_per_test_job if is_debug else non_debug_cpus_per_test_job
+        )
+        self.default_num_jobs_mem = max(1, int(available_mem // test_mem))
+
+    def get_number_of_threads(self, nr_cpus: int) -> int:
+        default_num_jobs_cpu = max(1, math.ceil(nr_cpus / self.cpus_per_test_job))
+        return min(self.default_num_jobs_mem, default_num_jobs_cpu)
+
+

 class TabularConsoleOutput:
    """Print test progress to the console"""
@@ -273,6 +319,13 @@ def parse_cmd_line() -> argparse.Namespace:
    if args.skip_patterns and args.k:
        parser.error(palette.fail('arguments --skip and -k are mutually exclusive, please use only one of them'))

+    if not args.modes:
+        try:
+            args.modes = get_configured_modes()
+        except Exception:
+            print(palette.fail("Failed to read output of `ninja mode_list`: please run ./configure.py first"))
+            raise
+
    if not args.jobs:
        if not args.cpus:
            nr_cpus = multiprocessing.cpu_count()
@@ -280,19 +333,7 @@ def parse_cmd_line() -> argparse.Namespace:
            nr_cpus = int(subprocess.check_output(
                ['taskset', '-c', args.cpus, 'python3', '-c',
                 'import os; print(len(os.sched_getaffinity(0)))']))
-
-        cpus_per_test_job = 1
-        sysmem = os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES')
-        testmem = 6e9 if os.sysconf('SC_PAGE_SIZE') > 4096 else 2e9
-        default_num_jobs_mem = ((sysmem - 4e9) // testmem)
-        args.jobs = min(default_num_jobs_mem, nr_cpus // cpus_per_test_job)
-
-    if not args.modes:
-        try:
-            args.modes = get_configured_modes()
-        except Exception:
-            print(palette.fail("Failed to read output of `ninja mode_list`: please run ./configure.py first"))
-            raise
+        args.jobs = ThreadsCalculator(args.modes).get_number_of_threads(nr_cpus)

    if not args.coverage_modes and args.coverage:
        args.coverage_modes = list(args.modes)
@@ -350,16 +391,12 @@ def run_pytest(options: argparse.Namespace) -> tuple[int, list[SimpleNamespace]]
    if options.list_tests:
        args.extend(['--collect-only', '--quiet', '--no-header'])
    else:
-        threads = int(options.jobs)
-        # debug mode is very CPU and memory hungry, so we need to lower the number of threads to be able to finish tests
-        if 'debug' in options.modes:
-            threads = int(threads * 0.5)
        args.extend([
            "--log-level=DEBUG",  # Capture logs
            f'--junit-xml={junit_output_file}',
            "-rf",
            '--test-py-init',
-            f'-n{threads}',
+            f'-n{options.jobs}',
            f'--tmpdir={temp_dir}',
            f'--maxfail={options.max_failures}',
            f'--alluredir={report_dir / f"allure_{HOST_ID}"}',
--- a/test/boost/address_map_test.cc
+++ b/test/boost/address_map_test.cc
@@ -18,7 +18,7 @@

 #include <seastar/core/coroutine.hh>
 #include <seastar/core/manual_clock.hh>
-#include <seastar/util/later.hh>
+#include <seastar/core/timer.hh>
 #include <seastar/util/defer.hh>
 #include <seastar/coroutine/maybe_yield.hh>
 #include <seastar/util/alloc_failure_injector.hh>
@@ -290,12 +290,17 @@ SEASTAR_THREAD_TEST_CASE(test_address_map_replication) {
        m.set_expiring(id1);
        BOOST_CHECK(m.find(id1) && *m.find(id1) == addr1);
        m.barrier().get();
+        promise<> shard0_timer_expired;
+        timer<manual_clock> shard0_timer([&shard0_timer_expired] {
+            shard0_timer_expired.set_value();
+        });
+        shard0_timer.arm(manual_clock::now() + expiration_time);
        m_svc.invoke_on(1, [] (address_map_t<manual_clock>& m) {
            BOOST_CHECK(m.find(id1) && *m.find(id1) == addr1);
            manual_clock::advance(expiration_time);
            BOOST_CHECK(!m.find(id1));
-            return smp::submit_to(0, []{}); // Ensure shard 0 notices timer is expired.
        }).get();
+        shard0_timer_expired.get_future().get();
        BOOST_CHECK(!m.find(id1));

        // Expiring entries are replicated
--- a/test/boost/multishard_query_test.cc
+++ b/test/boost/multishard_query_test.cc
@@ -1045,7 +1045,6 @@ validate_result_size(size_t i, schema_ptr schema, const utils::chunked_vector<mu

 struct fuzzy_test_config {
    uint32_t seed;
-    std::chrono::seconds timeout;
    unsigned concurrency;
    unsigned scans;
 };
@@ -1077,6 +1076,9 @@ run_fuzzy_test_scan(size_t i, fuzzy_test_config cfg, sharded<replica::database>&
    testlog.debug("[scan#{}]: seed={}, is_stateful={}, prange={}, ckranges={}", i, seed, is_stateful, partition_range,
            partition_slice.default_row_ranges());

+    // Use a small max_size to force many pages per scan, stressing the
+    // paging and result-merging logic.  With the large row limit here,
+    // the byte limit is typically the tighter bound.
    const auto [results, npages] = read_partitions_with_paged_scan(db, schema, 1000, 1024, is_stateful, partition_range, partition_slice);

    const auto expected_partitions = slice_partitions(*schema, mutations, partition_index_range, partition_slice);
@@ -1160,21 +1162,27 @@ SEASTAR_THREAD_TEST_CASE(fuzzy_test) {
                std::uniform_int_distribution<size_t>(0, 100), // clustering-rows
                std::uniform_int_distribution<size_t>(0, 100), // range-tombstones
 #else
+                // Keep these values moderate: with complex randomly-generated
+                // schemas (deeply nested frozen collections/UDTs), large row
+                // counts cause data generation and paged scanning to be very
+                // slow, leading to CI timeouts. The test's value comes from
+                // schema variety and paging correctness, not from sheer data
+                // volume.
                std::uniform_int_distribution<size_t>(32, 64), // partitions
-                std::uniform_int_distribution<size_t>(0, 1000), // clustering-rows
-                std::uniform_int_distribution<size_t>(0, 1000), // range-tombstones
+                std::uniform_int_distribution<size_t>(0, 200), // clustering-rows
+                std::uniform_int_distribution<size_t>(0, 200), // range-tombstones
 #endif
                tests::default_timestamp_generator());

 #if defined DEBUG
-        auto cfg = fuzzy_test_config{seed, std::chrono::seconds{8}, 1, 1};
+        auto cfg = fuzzy_test_config{seed, 1, 1};
 #elif defined DEVEL
-        auto cfg = fuzzy_test_config{seed, std::chrono::seconds{2}, 2, 4};
+        auto cfg = fuzzy_test_config{seed, 2, 4};
 #else
-        auto cfg = fuzzy_test_config{seed, std::chrono::seconds{2}, 4, 8};
+        auto cfg = fuzzy_test_config{seed, 4, 8};
 #endif

-        testlog.info("Running test workload with configuration: seed={}, timeout={}s, concurrency={}, scans={}", cfg.seed, cfg.timeout.count(),
+        testlog.info("Running test workload with configuration: seed={}, concurrency={}, scans={}", cfg.seed,
                cfg.concurrency, cfg.scans);

        smp::invoke_on_all([cfg, db = &env.db(), gs = global_schema_ptr(tbl.schema), &compacted_frozen_mutations = tbl.compacted_frozen_mutations] {
--- a/test/boost/querier_cache_test.cc
+++ b/test/boost/querier_cache_test.cc
@@ -906,9 +906,13 @@ SEASTAR_THREAD_TEST_CASE(test_timeout_is_applied_on_lookup) {
    BOOST_REQUIRE(entry.permit.timeout() == new_timeout);
    BOOST_REQUIRE(!entry.permit.get_abort_exception());

-    sleep(ttl_timeout_test_timeout * 2).get();
+    // Don't waste time retrying before the timeout is up
+    sleep(ttl_timeout_test_timeout).get();
+
+    eventually_true([&entry] {
+        return bool(entry.permit.get_abort_exception());
+    });

-    BOOST_REQUIRE(entry.permit.get_abort_exception());
    BOOST_REQUIRE_THROW(std::rethrow_exception(entry.permit.get_abort_exception()), seastar::named_semaphore_timed_out);
 }

--- a/test/boost/row_cache_test.cc
+++ b/test/boost/row_cache_test.cc
@@ -2644,7 +2644,10 @@ SEASTAR_TEST_CASE(test_exception_safety_of_update_from_memtable) {
                return rd;
            };

-            populate_range(cache, population_range);
+            {
+                memory::scoped_critical_alloc_section dfg;
+                populate_range(cache, population_range);
+            }
            auto rd1_v1 = assert_that(make_reader(population_range));
            mutation_reader_opt snap;
            auto close_snap = defer([&snap] {
--- a/test/cluster/conftest.py
+++ b/test/cluster/conftest.py
@@ -257,39 +257,44 @@ async def manager(request: pytest.FixtureRequest,
    yield manager_client
    # `request.node.stash` contains a report stored in `pytest_runtest_makereport` from where we can retrieve
    # test failure.
-    report = request.node.stash[PHASE_REPORT_KEY]
-    failed = report.when == "call" and report.failed
-
-    # Check if the test has the check_nodes_for_errors marker
-    found_errors = await manager_client.check_all_errors(check_all_errors=(request.node.get_closest_marker("check_nodes_for_errors") is not None))
-
+    cluster_status = None
+    found_errors = {}
+    failed = False
    failed_test_dir_path = None
-    if failed or found_errors:
-        # Save scylladb logs for failed tests in a separate directory and copy XML report to the same directory to have
-        # all related logs in one dir.
-        # Then add property to the XML report with the path to the directory, so it can be visible in Jenkins
-        failed_test_dir_path = testpy_test.suite.log_dir / "failed_test" / test_case_name.translate(
-            str.maketrans('[]', '()'))
-        failed_test_dir_path.mkdir(parents=True, exist_ok=True)
+    try:
+        report = request.node.stash[PHASE_REPORT_KEY]
+        failed = report.when == "call" and report.failed

-    if failed:
-        await manager_client.gather_related_logs(
-            failed_test_dir_path,
-            {'pytest.log': test_log, 'test_py.log': test_py_log_test}
-        )
-        with open(failed_test_dir_path / "stacktrace.txt", "w") as f:
-            f.write(report.longreprtext)
-        if request.config.getoption('artifacts_dir_url') is not None:
-            # get the relative path to the tmpdir for the failed directory
-            dir_path_relative = f"{failed_test_dir_path.as_posix()[failed_test_dir_path.as_posix().find('testlog'):]}"
-            full_url = urllib.parse.urljoin(request.config.getoption('artifacts_dir_url') + '/',
-                                            urllib.parse.quote(dir_path_relative))
-            record_property("TEST_LOGS", full_url)
+        # Check if the test has the check_nodes_for_errors marker
+        found_errors = await manager_client.check_all_errors(check_all_errors=(request.node.get_closest_marker("check_nodes_for_errors") is not None))

-    cluster_status = await manager_client.after_test(test_case_name, not failed)
-    await manager_client.stop()  # Stop client session and close driver after each test
+        if failed or found_errors:
+            # Save scylladb logs for failed tests in a separate directory and copy XML report to the same directory to have
+            # all related logs in one dir.
+            # Then add property to the XML report with the path to the directory, so it can be visible in Jenkins
+            failed_test_dir_path = testpy_test.suite.log_dir / "failed_test" / test_case_name.translate(
+                str.maketrans('[]', '()'))
+            failed_test_dir_path.mkdir(parents=True, exist_ok=True)

-    if cluster_status["server_broken"] and not failed:
+        if failed:
+            await manager_client.gather_related_logs(
+                failed_test_dir_path,
+                {'pytest.log': test_log, 'test_py.log': test_py_log_test}
+            )
+            with open(failed_test_dir_path / "stacktrace.txt", "w") as f:
+                f.write(report.longreprtext)
+            if request.config.getoption('artifacts_dir_url') is not None:
+                # get the relative path to the tmpdir for the failed directory
+                dir_path_relative = f"{failed_test_dir_path.as_posix()[failed_test_dir_path.as_posix().find('testlog'):]}"
+                full_url = urllib.parse.urljoin(request.config.getoption('artifacts_dir_url') + '/',
+                                                urllib.parse.quote(dir_path_relative))
+                record_property("TEST_LOGS", full_url)
+
+        cluster_status = await manager_client.after_test(test_case_name, not failed)
+    finally:
+        await manager_client.stop()  # Stop client session and close driver after each test
+
+    if cluster_status is not None and cluster_status["server_broken"] and not failed:
        failed = True
        pytest.fail(
            f"test case {test_case_name} left unfinished tasks on Scylla server. Server marked as broken,"
--- a/test/cluster/dtest/alternator_tests.py
+++ b/test/cluster/dtest/alternator_tests.py
@@ -10,6 +10,7 @@ import random
 import string
 import tempfile
 import time
+import threading
 from concurrent.futures.thread import ThreadPoolExecutor
 from pprint import pformat

@@ -273,6 +274,30 @@ class TesterAlternator(BaseAlternator):
        logger.info("Testing and validating an update query using key condition expression")
        logger.info(f"ConditionExpression update of short circuit is: {conditional_update_short_circuit}")
        dc2_table.update_item(**conditional_update_short_circuit)
+        # Wait for cross-DC replication to reach both live DC1 nodes
+        # before stopping dc2_node.  The LWT commit uses LOCAL_QUORUM,
+        # which only guarantees DC2 persistence; replication to DC1 is
+        # async background work.  Without this wait, stopping dc2_node
+        # can drop in-flight RPCs to DC1 while CAS mutations don't
+        # store hints.  We must confirm both live DC1 replicas have the
+        # data so that the later ConsistentRead=True (LOCAL_QUORUM)
+        # read on restarted node1 is guaranteed to succeed.
+        # See https://scylladb.atlassian.net/browse/SCYLLADB-1267
+        dc1_live_nodes = [
+            node for node in self.cluster.nodelist()
+            if node.data_center == node1.data_center and node.server_id != node1.server_id
+        ]
+        dc1_live_tables = [self.get_table(table_name=TABLE_NAME, node=node) for node in dc1_live_nodes]
+        wait_for(
+            lambda: all(
+                t.get_item(
+                    Key={self._table_primary_key: new_pk_val}, ConsistentRead=False
+                ).get("Item", {}).get("c") == 3
+                for t in dc1_live_tables
+            ),
+            timeout=60,
+            text="Waiting for cross-DC replication of conditional update to both live DC1 nodes",
+        )
        dc2_node.stop()
        node1.start()

@@ -481,28 +506,33 @@ class TesterAlternator(BaseAlternator):
        2) Issue Alternator 'heavy' requests concurrently (create-table)
        3) wait for RequestLimitExceeded error response.
        """
-        concurrent_requests_limit = 5
+        # Keep the limit low to avoid exhausting LSA memory on the 1GB test node
+        # when multiple CreateTable requests (Raft + schema + flush) run concurrently.
+        concurrent_requests_limit = 3
        extra_config = {"max_concurrent_requests_per_shard": concurrent_requests_limit, "num_tokens": 1}
        self.prepare_dynamodb_cluster(num_of_nodes=1, extra_config=extra_config)
        node1 = self.cluster.nodelist()[0]
-        create_tables_threads = []
-        for tables_num in range(concurrent_requests_limit * 5):
-            create_tables_threads.append(self.run_create_table_thread())
+        stop_workers = threading.Event()

-        @retrying(num_attempts=150, sleep_time=0.2, allowed_exceptions=ConcurrencyLimitNotExceededError, message="Running create-table request")
-        def wait_for_create_table_request_failure():
-            try:
-                self.create_table(table_name=random_string(length=10), node=node1, wait_until_table_exists=False)
-            except Exception as error:
-                if "RequestLimitExceeded" in error.args[0]:
-                    return
-                raise
-            raise ConcurrencyLimitNotExceededError
+        def run_create_table_until_limited() -> None:
+            while not stop_workers.is_set():
+                try:
+                    self.create_table(table_name=random_string(length=10), node=node1, wait_until_table_exists=False)
+                except Exception as error:  # noqa: BLE001
+                    if "RequestLimitExceeded" in str(error):
+                        stop_workers.set()
+                        return
+                    raise

-        wait_for_create_table_request_failure()
+        with ThreadPoolExecutor(max_workers=concurrent_requests_limit * 5) as executor:
+            create_table_futures = [executor.submit(run_create_table_until_limited) for _ in range(concurrent_requests_limit * 5)]

-        for thread in create_tables_threads:
-            thread.join()
+            if not stop_workers.wait(timeout=30):
+                raise ConcurrencyLimitNotExceededError
+
+            stop_workers.set()
+            for future in create_table_futures:
+                future.result(timeout=60)

    @staticmethod
    def _set_slow_query_logging_api(run_on_node: ScyllaNode, is_enable: bool = True, threshold: int | None = None):
--- a/test/cluster/dtest/auth_test.py
+++ b/test/cluster/dtest/auth_test.py
@@ -1182,9 +1182,9 @@ class TestAuth(Tester):
    def get_session(self, node_idx=0, user=None, password=None, exclusive=True):
        node = self.cluster.nodelist()[node_idx]
        if exclusive:
-            conn = self.patient_exclusive_cql_connection(node, user=user, password=password, timeout=0.1)
+            conn = self.patient_exclusive_cql_connection(node, user=user, password=password)
        else:
-            conn = self.patient_cql_connection(node, user=user, password=password, timeout=0.1)
+            conn = self.patient_cql_connection(node, user=user, password=password)
        return conn

    def assert_permissions_listed(self, expected, session, query, include_superuser=False):
--- a/test/cluster/object_store/conftest.py
+++ b/test/cluster/object_store/conftest.py
@@ -199,7 +199,7 @@ class GSServer(GSFront):

    def unpublish(self):
        for k in self.vars:
-            v = self.oldvars[k]
+            v = self.oldvars.get(k)
            if v:
                os.environ[k] = v
            elif os.environ.get(k):
--- a/test/cluster/tasks/test_node_ops_tasks.py
+++ b/test/cluster/tasks/test_node_ops_tasks.py
@@ -215,6 +215,11 @@ async def test_node_ops_tasks_tree(manager: ManagerClient):
        servers, vt_ids = await check_remove_node_tasks_tree(manager, tm, module_name, servers, vt_ids)
        servers, vt_ids = await check_decommission_tasks_tree(manager, tm, module_name, servers, vt_ids)

+        # Reconnect the driver after topology changes (replace, removenode,
+        # decommission) so that the new_test_keyspace cleanup can reach a
+        # live node for DROP KEYSPACE.
+        await manager.driver_connect()
+
@pytest.mark.asyncio
 async def test_node_ops_tasks_ttl(manager: ManagerClient):
    """Test node ops virtual tasks' ttl."""
--- a/test/cluster/test_audit.py
+++ b/test/cluster/test_audit.py
@@ -17,6 +17,7 @@ import socket
 import socketserver
 import tempfile
 import threading
+import time
 import uuid
 from collections import namedtuple
 from contextlib import contextmanager
@@ -33,9 +34,9 @@ from test.cluster.dtest.dtest_class import create_ks, wait_for
 from test.cluster.dtest.tools.assertions import assert_invalid
 from test.cluster.dtest.tools.data import rows_to_list, run_in_parallel

-from test.cluster.test_config import wait_for_config
 from test.pylib.manager_client import ManagerClient
 from test.pylib.rest_client import read_barrier
+from test.pylib.util import wait_for as wait_for_async

 logger = logging.getLogger(__name__)

@@ -113,11 +114,10 @@ class AuditTester:
                for k in AUTH_CONFIG:
                    await self.manager.server_remove_config_option(srv.server_id, k)

-            # Remove absent keys so the server reverts to compiled-in defaults.
-            for k in absent_keys:
-                await self.manager.server_remove_config_option(srv.server_id, k)
-
            if needs_restart:
+                # Remove absent keys so the server reverts to compiled-in defaults.
+                for k in absent_keys:
+                    await self.manager.server_remove_config_option(srv.server_id, k)
                await self.manager.server_stop_gracefully(srv.server_id)
                full_cfg = self._build_server_config(needed, enable_compact_storage, user)
                await self.manager.server_update_config(srv.server_id, config_options=full_cfg)
@@ -127,10 +127,17 @@ class AuditTester:
                # Server stays up — only push live-updatable keys.
                live_cfg = {k: v for k, v in needed.items() if k in LIVE_AUDIT_KEYS}
                live_cfg["enable_create_table_with_compact_storage"] = enable_compact_storage
+                log_file = await self.manager.server_open_log(srv.server_id)
+                # Each remove/update sends a SIGHUP.  Wait for each one's
+                # "completed re-reading configuration file" before the next
+                # so we never match a stale message.
+                for k in absent_keys:
+                    from_mark = await log_file.mark()
+                    await self.manager.server_remove_config_option(srv.server_id, k)
+                    await log_file.wait_for(r"completed re-reading configuration file", from_mark=from_mark, timeout=60)
+                from_mark = await log_file.mark()
                await self.manager.server_update_config(srv.server_id, config_options=live_cfg)
-                for key in LIVE_AUDIT_KEYS:
-                    if key in live_cfg:
-                        await wait_for_config(self.manager, srv, key, live_cfg[key])
+                await log_file.wait_for(r"completed re-reading configuration file", from_mark=from_mark, timeout=60)

    async def _start_fresh_servers(self, needed: dict[str, str],
                                   enable_compact_storage: bool,
@@ -345,7 +352,7 @@ class UnixSockerListener:
                elif data != "Initializing syslog audit backend.":
                    self.server.parent_instance.lines.append(data)

-    class UnixDatagramServer(socketserver.ThreadingUnixDatagramServer):
+    class UnixDatagramServer(socketserver.UnixDatagramServer):
        def __init__(self, socket_path, handler, parent_instance, lock):
            self.parent_instance = parent_instance
            self.mutex = lock
@@ -1342,7 +1349,13 @@ class CQLAuditTester(AuditTester):
            conn = await self.manager.get_cql_exclusive(srv)
            stmt = SimpleStatement("INSERT INTO ks.test1 (k, v1) VALUES (1000, 1000)", consistency_level=ConsistencyLevel.THREE)
            conn.execute(stmt)
-            audit_node_ips = await self.get_audit_partitions_for_operation(session, stmt.query_string)
+            # The audit log entry may not be visible immediately after the
+            # insert, so retry with exponential backoff until it appears.
+            audit_node_ips = await wait_for_async(
+                lambda: self.get_audit_partitions_for_operation(session, stmt.query_string),
+                deadline=time.time() + 10,
+                period=0.05,
+                label=f"audit entry for node {index}")
            node_to_audit_nodes[index] = set(audit_node_ips)

        all_addresses = set(srv.ip_addr for srv in servers)
--- a/test/cluster/test_bootstrap_with_quick_group0_join.py
+++ b/test/cluster/test_bootstrap_with_quick_group0_join.py
@@ -0,0 +1,70 @@
+#
+# Copyright (C) 2026-present ScyllaDB
+#
+# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
+#
+import logging
+import asyncio
+import time
+
+import pytest
+
+from test.cluster.util import get_current_group0_config
+from test.pylib.manager_client import ManagerClient
+from test.pylib.rest_client import read_barrier
+from test.pylib.util import wait_for
+
+
+logger = logging.getLogger(__name__)
+
+
+@pytest.mark.asyncio
+@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
+async def test_bootstrap_with_quick_group0_join(manager: ManagerClient):
+    """Regression test for https://scylladb.atlassian.net/browse/SCYLLADB-959.
+
+    The bug was that when the bootstrapping node joined group0 before reaching
+    post_server_start, it skipped post_server_start and thus hung forever.
+
+    The test simulates the scenario by starting the second node with the
+    join_group0_pause_before_config_check injection. Without the fix, the
+    startup times out.
+    """
+    logger.info("Adding first server")
+    s1 = await manager.server_add()
+
+    logger.info("Adding second server with join_group0_pause_before_config_check enabled")
+    s2 = await manager.server_add(start=False, config={
+        'error_injections_at_startup': ['join_group0_pause_before_config_check']
+    })
+
+    logger.info(f"Starting {s2}")
+    start_task = asyncio.create_task(manager.server_start(s2.server_id))
+
+    s2_log = await manager.server_open_log(s2.server_id)
+
+    await s2_log.wait_for("join_group0_pause_before_config_check: waiting for message", timeout=60)
+
+    s1_host_id = await manager.get_host_id(s1.server_id)
+    s2_host_id = await manager.get_host_id(s2.server_id)
+
+    async def s2_in_group0_config_on_s1():
+        config = await get_current_group0_config(manager, s1)
+        ids = {m[0] for m in config}
+        assert s1_host_id in ids  # sanity check
+        return True if s2_host_id in ids else None
+
+    # Note: we would like to wait for s2 to see itself in the group0 config, but we can't execute
+    # get_current_group0_config for s2, as s2 doesn't handle CQL requests at this point. As a workaround, we wait for s1
+    # to see s2 and then perform a read barrier on s2.
+    logger.info(f"Waiting for {s1} to see {s2} in the group0 config")
+    await wait_for(s2_in_group0_config_on_s1, deadline=time.time() + 60, period=0.1)
+
+    logger.info(f"Performing read barrier on {s2} to make sure it sees itself in the group0 config")
+    await read_barrier(manager.api, s2.ip_addr)
+
+    logger.info(f"Unblocking {s2}")
+    await manager.api.message_injection(s2.ip_addr, 'join_group0_pause_before_config_check')
+
+    logger.info(f"Waiting for {s2} to complete bootstrap")
+    await asyncio.wait_for(start_task, timeout=60)
--- a/test/cluster/test_hints.py
+++ b/test/cluster/test_hints.py
@@ -100,7 +100,7 @@ async def test_limited_concurrency_of_writes(manager: ManagerClient):
    })
    node2 = await manager.server_add()

-    cql = manager.get_cql()
+    cql = await manager.get_cql_exclusive(node1)
    async with new_test_keyspace(manager, "WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 2}") as ks:
        table = f"{ks}.t"
        await cql.run_async(f"CREATE TABLE {table} (pk int primary key, v int)")
--- a/test/cluster/test_sstable_cleanup_stop.py
+++ b/test/cluster/test_sstable_cleanup_stop.py
@@ -11,7 +11,6 @@ from test.cluster.util import check_token_ring_and_group0_consistency, new_test_
 import pytest
 import asyncio
 import logging
-import time

 logger = logging.getLogger(__name__)
@pytest.mark.asyncio
@@ -53,7 +52,7 @@ async def test_cleanup_stop(manager: ManagerClient):
        await s0_log.wait_for('sstable_cleanup_wait: waiting', from_mark=s0_mark)

        stop_cleanup = asyncio.create_task(manager.api.stop_compaction(servers[0].ip_addr, "CLEANUP"))
-        time.sleep(1)
+        await asyncio.sleep(1)

        await manager.api.message_injection(servers[0].ip_addr, "sstable_cleanup_wait")
        await stop_cleanup
--- a/test/cluster/test_tablets2.py
+++ b/test/cluster/test_tablets2.py
@@ -2279,7 +2279,7 @@ async def test_split_stopped_on_shutdown(manager: ManagerClient):

        shutdown_task = asyncio.create_task(manager.server_stop_gracefully(server.server_id))

-        await log.wait_for('Stopping.*ongoing compactions')
+        await log.wait_for('Stopping.*ongoing compactions', from_mark=log_mark)
        await manager.api.message_injection(server.ip_addr, "splitting_mutation_writer_switch_wait")

        await log.wait_for('storage_service_drain_wait: waiting', from_mark=log_mark)
--- a/test/nodetool/conftest.py
+++ b/test/nodetool/conftest.py
@@ -6,6 +6,7 @@

 import os
 import random
+import socket
 import subprocess
 import sys
 import time
@@ -59,7 +60,14 @@ async def server_address(request, testpy_test: None|Test):
            ip = await testpy_test.suite.hosts.lease_host()
        else:
            ip = f"127.{random.randint(0, 255)}.{random.randint(0, 255)}.{random.randint(0, 255)}"
-        port = random.randint(10000, 65535)
+        # Ask the OS to pick a free port by binding to port 0. This avoids
+        # collisions with ports still in TIME_WAIT from a previous test module
+        # that used the same IP. SO_REUSEADDR is set on the probe socket so it
+        # can reclaim a TIME_WAIT port itself
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+            s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+            s.bind((ip, 0))
+            port = s.getsockname()[1]
    yield ServerAddress(ip, port)
    if testpy_test is not None:
        await testpy_test.suite.hosts.release_host(ip)
--- a/test/nodetool/rest_api_mock.py
+++ b/test/nodetool/rest_api_mock.py
@@ -257,7 +257,7 @@ async def run_server(ip, port):

    runner = aiohttp.web.AppRunner(app)
    await runner.setup()
-    site = aiohttp.web.TCPSite(runner, ip, port)
+    site = aiohttp.web.TCPSite(runner, ip, port, reuse_address=True, reuse_port=True)
    await site.start()

    try:
--- a/test/pylib/dockerized_service.py
+++ b/test/pylib/dockerized_service.py
@@ -4,14 +4,18 @@
 # SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
 #

+import logging
 import shutil
 import itertools
 import asyncio
 import pathlib
 import re
 import os
+import subprocess
 from typing import Callable

+logger = logging.getLogger("DockerizedServer")
+
 class DockerizedServer:
    """class for running an external dockerized service image, typically mock server"""
    # pylint: disable=too-many-instance-attributes
@@ -37,6 +41,7 @@ class DockerizedServer:
        self.port = None
        self.proc = None
        self.service_port = port
+        self.echo_thread = None

    async def start(self):
        """Starts docker image on a random port"""
@@ -45,77 +50,107 @@ class DockerizedServer:
                                                if exe is not None)).resolve()
        sid = f"{os.getpid()}-{DockerizedServer.newid()}"
        name = f'{self.logfilenamebase}-{sid}'
+        logfilename = (pathlib.Path(self.tmpdir) / name).with_suffix(".log")
+        self.logfile = logfilename.open("wb")
+
+        docker_args = self.docker_args(self.host, self.service_port)
+        image_args = self.image_args(self.host, self.service_port)
+
+        args = [exe, "run", "--name", name, "--rm" ]
+        if self.service_port is None:
+            args = args + ["-P"]
+        else:
+            args = args + ["-p", str(self.service_port)]
+
+        args = args + docker_args + [self.image] + image_args
+
+        # This seems weird, using the blocking IO subprocess.
+        # However, we want to use a pipe reader so we can push the 
+        # output into the test log (because we are bad at propagating
+        # log files etc from CI)
+        # But the pipe reader needs to read until EOF, otherwise the
+        # docker process will eventually hang. So we can't await a 
+        # coroutine.
+        # We _can_, sort of, use pool.create_task(...) to send a coro
+        # to the background, and use a signal for waiting, like here,
+        # thus ensuring the coro runs forever, sort of... However, 
+        # this currently breaks, probably due to some part of the 
+        # machinery/tests that don't async fully, causing us to not
+        # process the log, and thus hand/fail, bla bla.
+        # The solution is to make the process synced, and use a 
+        # background thread (execution pool) for the processing.
+        # This way we know the pipe reader will not suddenly get
+        # blocked at inconvinient times.
+        proc = subprocess.Popen(args, stderr=subprocess.PIPE)
+        loop = asyncio.get_running_loop()
+        ready_fut = loop.create_future()
+
+        def process_io(): 
+            f = ready_fut
+            try:
+                while True:
+                    data = proc.stderr.readline()
+                    if not data:
+                        if f:
+                            loop.call_soon_threadsafe(f.set_exception, RuntimeError("Log EOF"))
+                        logger.debug("EOF received")
+                        break
+                    line = data.decode()
+                    self.logfile.write(data)
+                    logger.debug(line)
+                    if f and self.is_success_line(line, self.service_port):
+                        logger.info('Got start message: %s', line)
+                        loop.call_soon_threadsafe(f.set_result, True)
+                        f = None
+                    if f and self.is_failure_line(line, self.service_port):
+                        logger.info('Got fail message: %s', line)
+                        loop.call_soon_threadsafe(f.set_result, False)
+                        f = None
+            except Exception as e:
+                logger.error("Exception in log processing: %s", e)
+                if f:
+                    loop.call_soon_threadsafe(f.set_exception, e)
+
+        self.echo_thread = loop.run_in_executor(None, process_io)
+        ok = await ready_fut
+        if not ok:
+            self.logfile.close()
+            proc.kill()
+            proc.wait()
+            raise RuntimeError("Could not parse expected launch message from container")
+
+        check_proc = await asyncio.create_subprocess_exec(exe
+                                                          , *["container", "port", name]
+                                                          , stdout=asyncio.subprocess.PIPE
+        )
        while True:
-            logfilename = (pathlib.Path(self.tmpdir) / name).with_suffix(".log")
-            self.logfile = logfilename.open("wb")
+            data = await check_proc.stdout.readline()
+            if not data:
+                break
+            s = data.decode()
+            m = re.search(r"\d+\/\w+ -> [\w+\.\[\]\:]+:(\d+)", s)
+            if m:
+                self.port = int(m.group(1))

-            docker_args = self.docker_args(self.host, self.service_port)
-            image_args = self.image_args(self.host, self.service_port)
-
-            args = ["run", "--name", name, "--rm" ]
-            if self.service_port is None:
-                args = args + ["-P"]
-            else:
-                args = args + ["-p", str(self.service_port)]
-
-            args = args + docker_args + [self.image] + image_args
-
-            proc = await asyncio.create_subprocess_exec(exe, *args, stderr=self.logfile)
-            failed = False
-
-            # In any sane world we would just pipe stderr to a pipe and launch a background
-            # task to just readline from there to both check the start message as well as 
-            # add it to the log (preferrably via logger).
-            # This works fine when doing this in a standalone python script. 
-            # However, for some reason, when run in a pytest fixture, the pipe will fill up,
-            # without or reader waking up and doing anyhing, and for any test longer than very
-            # short, we will fill the stderr buffer and hang.
-            # I cannot figure out how to get around this, so we workaround it
-            # instead by directing stderr to a log file, and simply repeatedly
-            # try to read the info from this file until we are happy.
-            async with asyncio.timeout(120):
-                done = False
-                while not done and not failed:
-                    with logfilename.open("r") as f:
-                        for line in f:
-                            if self.is_success_line(line, self.service_port):
-                                print(f'Got start message: {line}')
-                                done = True
-                                break
-                            if self.is_failure_line(line, self.service_port):
-                                print(f'Got fail message: {line}')
-                                failed = True
-                                break
-
-            if failed:
-                self.logfile.close()
-                await proc.wait()
-                continue
-
-            check_proc = await asyncio.create_subprocess_exec(exe
-                                                              , *["container", "port", name]
-                                                              , stdout=asyncio.subprocess.PIPE
-            )
-            while True:
-                data = await check_proc.stdout.readline()
-                if not data:
-                    break
-                s = data.decode()
-                m = re.search(r"\d+\/\w+ -> [\w+\.\[\]\:]+:(\d+)", s)
-                if m:
-                    self.port = int(m.group(1))
-
-            await check_proc.wait()
-            if not self.port:
-                proc.kill()
-                raise RuntimeError("Could not query port from container")
-            self.proc = proc
-            break
+        await check_proc.wait()
+        if not self.port:
+            proc.kill()
+            proc.wait()
+            raise RuntimeError("Could not query port from container")
+        self.proc = proc

    async def stop(self):
        """Stops docker image"""
        if self.proc:
+            logger.debug("Stopping docker process")
            self.proc.terminate()
-            await self.proc.wait()
+            self.proc.wait()
+            self.proc = None
+        if self.echo_thread:
+            logger.debug("Waiting for IO thread")
+            await self.echo_thread
+            self.echo_thread = None
        if self.logfile:
+            logger.debug("Closing log file")
            self.logfile.close()
+            self.logfile = None
--- a/test/pylib/scylla_cluster.py
+++ b/test/pylib/scylla_cluster.py
@@ -747,6 +747,8 @@ class ScyllaServer:
        self.notify_socket = socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM | socket.SOCK_CLOEXEC)
        self.notify_socket.bind(str(self.notify_socket_path))
        self._received_serving = False
+        loop = asyncio.get_running_loop()
+
        def poll_status(s: socket.socket, f: asyncio.Future, logger: Union[logging.Logger, logging.LoggerAdapter]):
            # Try to read all available messages from the socket
            while True:
@@ -756,7 +758,7 @@ class ScyllaServer:
                    message = data.decode('utf-8', errors='replace')
                    if 'STATUS=serving' in message:
                        logger.debug("Received sd_notify 'serving' message")
-                        f.set_result(True)
+                        loop.call_soon_threadsafe(f.set_result, True)
                        return
                    if 'STATUS=entering maintenance mode' in message:
                        logger.debug("Receive sd_notify 'entering maintenance mode'")
@@ -766,9 +768,9 @@ class ScyllaServer:
                except Exception as e:
                    logger.debug("Error reading from notify socket: %s", e)
                    break
-            f.set_result(False)
+            loop.call_soon_threadsafe(f.set_result, False)

-        self.serving_signal = asyncio.get_running_loop().create_future()
+        self.serving_signal = loop.create_future()
        t = threading.Thread(target=poll_status, args=[self.notify_socket, self.serving_signal, self.logger], daemon=True)
        t.start()

@@ -892,7 +894,6 @@ class ScyllaServer:
                                return
                        await report_error("the node startup failed, but the log file doesn't contain the expected error")
                await report_error("failed to start the node")
-            self.logger.info("Wait me %s expect %s is %s", self.server_id, expected_server_up_state, server_up_state)
            if await self.try_get_host_id(api):
                if server_up_state == ServerUpState.PROCESS_STARTED:
                    server_up_state = ServerUpState.HOST_ID_QUERIED
--- a/test/pylib/util.py
+++ b/test/pylib/util.py
@@ -64,11 +64,25 @@ async def wait_for(
    tag = label or getattr(pred, '__name__', 'unlabeled')
    start = time.time()
    retries = 0
+    last_exception: Exception | None = None
    while True:
        elapsed = time.time() - start
-        assert time.time() < deadline, \
-            f"wait_for({tag}) timed out after {elapsed:.2f}s ({retries} retries)"
-        res = await pred()
+        if time.time() >= deadline:
+            timeout_msg = f"wait_for({tag}) timed out after {elapsed:.2f}s ({retries} retries)"
+            if last_exception is not None:
+                timeout_msg += (
+                    f"; last exception: {type(last_exception).__name__}: {last_exception}"
+                )
+                raise AssertionError(timeout_msg) from last_exception
+            raise AssertionError(timeout_msg)
+
+        try:
+            res = await pred()
+            last_exception = None
+        except Exception as exc:
+            res = None
+            last_exception = exc
+
        if res is not None:
            if retries > 0:
                logger.debug(f"wait_for({tag}) completed "