From 269637ffa33639db932aee44e8b23d4c8fb7df2f Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@scylladb.com>
Date: Mon, 4 May 2026 11:28:32 +0300
Subject: [PATCH] session, raft_topology: add periodic warnings for hung
 drain and stale version waits

Add periodic warning timers (every 5 minutes) to help diagnose hangs in
barrier_and_drain:

- drain_closing_sessions(): warn if semaphore acquisition or session gate
  close is taking too long, reporting the gate count to show how many
  guards are still alive.
- local_topology_barrier(): warn if stale_versions_in_use() is taking too
  long, reporting the current stale version trackers.
- session::gate_count(): new public accessor for diagnostic purposes.

These warnings help distinguish between the two possible hang points in
barrier_and_drain (stale versions vs session drain) and provide ongoing
visibility into what's blocking progress.

(cherry picked from commit d2b695aa64e4862c267a087e4bf0d7462befef32)
---
 service/session.cc         | 16 +++++++++++++++-
 service/session.hh         |  4 ++++
 service/storage_service.cc | 10 +++++++++-
 3 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/service/session.cc b/service/session.cc
index b53d24c7f0..9220551938 100644
--- a/service/session.cc
+++ b/service/session.cc
@@ -9,6 +9,7 @@
 #include "service/session.hh"
 #include "utils/log.hh"
 #include <seastar/core/coroutine.hh>
+#include <seastar/core/timer.hh>
 
 namespace service {
 
@@ -59,7 +60,13 @@ void session_manager::initiate_close_of_sessions_except(const std::unordered_set
 
 future<> session_manager::drain_closing_sessions() {
     slogger.info("drain_closing_sessions: waiting for lock");
+    seastar::timer lock_timer([this] {
+        slogger.warn("drain_closing_sessions: still waiting for lock, available units {}",
+                _session_drain_sem.available_units());
+    });
+    lock_timer.arm_periodic(std::chrono::minutes(5));
     auto lock = co_await get_units(_session_drain_sem, 1);
+    lock_timer.cancel();
     auto n = std::distance(_closing_sessions.begin(), _closing_sessions.end());
     slogger.info("drain_closing_sessions: acquired lock, {} sessions to drain", n);
     auto i = _closing_sessions.begin();
@@ -67,8 +74,15 @@ future<> session_manager::drain_closing_sessions() {
         session& s = *i;
         ++i;
         auto id = s.id();
-        slogger.info("drain_closing_sessions: waiting for session {} to close", id);
+        slogger.info("drain_closing_sessions: waiting for session {} to close, gate count {}", id, s.gate_count());
+        std::optional<seastar::timer<>> warn_timer;
+        warn_timer.emplace([&s, id] {
+            slogger.warn("drain_closing_sessions: session {} still not closed, gate count {}",
+                    id, s.gate_count());
+        });
+        warn_timer->arm_periodic(std::chrono::minutes(5));
         co_await s.close();
+        warn_timer.reset();
         if (_sessions.erase(id)) {
             slogger.info("drain_closing_sessions: session {} closed", id);
         }
diff --git a/service/session.hh b/service/session.hh
index d1941dcd01..ac25ac7573 100644
--- a/service/session.hh
+++ b/service/session.hh
@@ -95,6 +95,10 @@ public:
         return _id;
     }
 
+    size_t gate_count() const {
+        return _gate.get_count();
+    }
+
     /// Post-condition of successfully resolved future: There are no guards alive for this session, and
     /// and it's impossible to create more such guards later.
     /// Can be called concurrently.
diff --git a/service/storage_service.cc b/service/storage_service.cc
index 37d86ebf5a..60c71a6684 100644
--- a/service/storage_service.cc
+++ b/service/storage_service.cc
@@ -6205,7 +6205,15 @@ future<> storage_service::local_topology_barrier() {
     }
 
     rtlogger.info("raft_topology_cmd::barrier_and_drain version {}: waiting for stale token metadata versions to be released", version);
-    co_await ss._shared_token_metadata.stale_versions_in_use();
+    {
+        seastar::timer warn_timer([&ss, version] {
+            rtlogger.warn("raft_topology_cmd::barrier_and_drain version {}: still waiting for stale versions, "
+                    "stale versions (version: use_count): {}",
+                    version, ss._shared_token_metadata.describe_stale_versions());
+        });
+        warn_timer.arm_periodic(std::chrono::minutes(5));
+        co_await ss._shared_token_metadata.stale_versions_in_use();
+    }
     rtlogger.info("raft_topology_cmd::barrier_and_drain version {}: stale versions released, draining closing sessions", version);
     co_await get_topology_session_manager().drain_closing_sessions();
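
For reviewers, a minimal standalone sketch of the arm-periodic/cancel pattern the patch applies around a potentially hung co_await, assuming only Seastar's public timer, sleep, logger, and app_template APIs; the file name, logger name, and the shortened one-second period are illustrative, not taken from the patch:

// demo.cc: arm a periodic warning timer before a potentially long co_await,
// then cancel it once the awaited operation completes -- the same shape as
// drain_closing_sessions() and local_topology_barrier() above.
#include <chrono>
#include <seastar/core/app-template.hh>
#include <seastar/core/coroutine.hh>
#include <seastar/core/sleep.hh>
#include <seastar/core/timer.hh>
#include <seastar/util/log.hh>

static seastar::logger dlog("timer-demo");

int main(int argc, char** argv) {
    seastar::app_template app;
    return app.run(argc, argv, [] () -> seastar::future<> {
        // Arm the timer before the wait; if the wait hangs, the callback
        // fires every period and reports that we are still blocked.
        seastar::timer<> warn_timer([] {
            dlog.warn("still waiting for the slow operation");
        });
        warn_timer.arm_periodic(std::chrono::seconds(1));
        co_await seastar::sleep(std::chrono::seconds(3)); // stand-in for the real wait
        warn_timer.cancel(); // stop warning once the wait completes
        dlog.info("slow operation finished");
    });
}

A plain shard-local timer leaves the awaited future's exception and cancellation semantics untouched: it only observes and logs, which is why the patch can wrap existing co_awaits without changing their behavior.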