From cbf4e27137d008fbcfa80a10e6fa19bc59457877 Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Sun, 3 May 2026 13:56:54 +0300
Subject: [PATCH] raft_topology: log sub-step progress in local_topology_barrier

When a node processes a barrier_and_drain topology command, it performs
two potentially long-running operations inside local_topology_barrier():
waiting for stale token metadata versions to be released
(stale_versions_in_use) and draining closing sessions
(drain_closing_sessions). Either of these can hang indefinitely -- for
example, stale_versions_in_use blocks until all references to previous
token metadata versions are released, which depends on in-flight
requests completing.

Previously, the only logging was a single 'done' message at the end,
making it impossible to determine which sub-step was blocking when a
barrier_and_drain RPC appeared stuck on a node. In a recent CI failure,
a node never responded to barrier_and_drain during a removenode
operation, and the logs showed the RPC was received but nothing about
what it was waiting on internally.

Add info-level logging before each blocking sub-step, including the
topology version for correlation. This allows diagnosing hangs by
showing whether the node is stuck waiting for stale metadata versions,
stuck draining sessions, or never reached these steps at all.

(cherry picked from commit e88ce0937274953d5579a12fae9026f0a46a9497)
---
 service/storage_service.cc       | 4 +++-
 test/cluster/test_alternator.py  | 2 +-
 test/cluster/test_tablets_lwt.py | 2 +-
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/service/storage_service.cc b/service/storage_service.cc
index 7df55c85a9..d1177f6f3c 100644
--- a/service/storage_service.cc
+++ b/service/storage_service.cc
@@ -6035,10 +6035,12 @@ future storage_service::raft_topology_cmd_handler(raft
                         version, current_version)));
             }
 
+            rtlogger.info("raft_topology_cmd::barrier_and_drain version {}: waiting for stale token metadata versions to be released", version);
             co_await ss._shared_token_metadata.stale_versions_in_use();
+            rtlogger.info("raft_topology_cmd::barrier_and_drain version {}: stale versions released, draining closing sessions", version);
             co_await get_topology_session_manager().drain_closing_sessions();
 
-            rtlogger.info("raft_topology_cmd::barrier_and_drain done");
+            rtlogger.info("raft_topology_cmd::barrier_and_drain version {}: done", version);
         });
 
         co_await utils::get_local_injector().inject("raft_topology_barrier_and_drain_fail", [this] (auto& handler) -> future<> {
diff --git a/test/cluster/test_alternator.py b/test/cluster/test_alternator.py
index 1c9cc1d341..06622cd40c 100644
--- a/test/cluster/test_alternator.py
+++ b/test/cluster/test_alternator.py
@@ -1156,7 +1156,7 @@ async def test_alternator_invalid_shard_for_lwt(manager: ManagerClient):
     # The next barrier must be for the write_both_read_new, we need a guarantee
     # that the src_shard observed it
     logger.info("Waiting for the next barrier")
-    await log.wait_for(re.escape(f"[shard {src_shard}: gms] raft_topology - raft_topology_cmd::barrier_and_drain done"),
+    await log.wait_for(f"\\[shard {src_shard}: gms\\] raft_topology - raft_topology_cmd::barrier_and_drain.*done",
                        from_mark=m)
 
     # Now we have a guarantee that a new barrier succeeded on the src_shard,
diff --git a/test/cluster/test_tablets_lwt.py b/test/cluster/test_tablets_lwt.py
index 7d6e9a085b..aec7508f76 100644
--- a/test/cluster/test_tablets_lwt.py
+++ b/test/cluster/test_tablets_lwt.py
@@ -994,7 +994,7 @@ async def test_tablets_merge_waits_for_lwt(manager: ManagerClient):
     logger.info("Wait for the global barrier to start draining on shard0")
     await log0.wait_for("\\[shard 0: gms\\] raft_topology - Got raft_topology_cmd::barrier_and_drain", from_mark=m)
     # Just to confirm that the guard still holds the erm
-    matches = await log0.grep("\\[shard 0: gms\\] raft_topology - raft_topology_cmd::barrier_and_drain done", from_mark=m)
+    matches = await log0.grep("\\[shard 0: gms\\] raft_topology - raft_topology_cmd::barrier_and_drain.*done", from_mark=m)
     assert len(matches) == 0
 
     # Before the fix, the tablet migration global barrier did not wait for the LWT operation.