mirror of
https://github.com/scylladb/scylladb.git
synced 2026-05-13 03:12:13 +00:00
raft_topology: log sub-step progress in local_topology_barrier
When a node processes a barrier_and_drain topology command, it performs
two potentially long-running operations inside local_topology_barrier():
waiting for stale token metadata versions to be released
(stale_versions_in_use) and draining closing sessions
(drain_closing_sessions). Either of these can hang indefinitely -- for
example, stale_versions_in_use blocks until all references to previous
token metadata versions are released, which depends on in-flight
requests completing.
Previously, the only logging was a single 'done' message at the end,
making it impossible to determine which sub-step was blocking when a
barrier_and_drain RPC appeared stuck on a node. In a recent CI failure,
a node never responded to barrier_and_drain during a removenode
operation, and the logs showed the RPC was received but nothing about
what it was waiting on internally.
Add info-level logging before each blocking sub-step, including the
topology version for correlation. This allows diagnosing hangs by
showing whether the node is stuck waiting for stale metadata versions,
stuck draining sessions, or never reached these steps at all.
(cherry picked from commit e88ce09372)
This commit is contained in:
@@ -6035,10 +6035,12 @@ future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft
                             version, current_version)));
                 }

+                rtlogger.info("raft_topology_cmd::barrier_and_drain version {}: waiting for stale token metadata versions to be released", version);
                 co_await ss._shared_token_metadata.stale_versions_in_use();
+                rtlogger.info("raft_topology_cmd::barrier_and_drain version {}: stale versions released, draining closing sessions", version);
                 co_await get_topology_session_manager().drain_closing_sessions();

-                rtlogger.info("raft_topology_cmd::barrier_and_drain done");
+                rtlogger.info("raft_topology_cmd::barrier_and_drain version {}: done", version);
             });

             co_await utils::get_local_injector().inject("raft_topology_barrier_and_drain_fail", [this] (auto& handler) -> future<> {
@@ -1156,7 +1156,7 @@ async def test_alternator_invalid_shard_for_lwt(manager: ManagerClient):
     # The next barrier must be for the write_both_read_new, we need a guarantee
     # that the src_shard observed it
     logger.info("Waiting for the next barrier")
-    await log.wait_for(re.escape(f"[shard {src_shard}: gms] raft_topology - raft_topology_cmd::barrier_and_drain done"),
+    await log.wait_for(f"\\[shard {src_shard}: gms\\] raft_topology - raft_topology_cmd::barrier_and_drain.*done",
                        from_mark=m)

     # Now we have a guarantee that a new barrier succeeded on the src_shard,
@@ -994,7 +994,7 @@ async def test_tablets_merge_waits_for_lwt(manager: ManagerClient):
     logger.info("Wait for the global barrier to start draining on shard0")
     await log0.wait_for("\\[shard 0: gms\\] raft_topology - Got raft_topology_cmd::barrier_and_drain", from_mark=m)
     # Just to confirm that the guard still holds the erm
-    matches = await log0.grep("\\[shard 0: gms\\] raft_topology - raft_topology_cmd::barrier_and_drain done", from_mark=m)
+    matches = await log0.grep("\\[shard 0: gms\\] raft_topology - raft_topology_cmd::barrier_and_drain.*done", from_mark=m)
     assert len(matches) == 0

     # Before the fix, the tablet migration global barrier did not wait for the LWT operation.
Reference in New Issue
Block a user