mirror of
https://github.com/scylladb/scylladb.git
synced 2026-05-12 19:02:12 +00:00
session, raft_topology: add periodic warnings for hung drain and stale version waits
Add periodic warning timers (every 5 minutes) to help diagnose hangs in
barrier_and_drain:
- drain_closing_sessions(): warn if semaphore acquisition or session gate
close is taking too long, reporting the gate count to show how many
guards are still alive.
- local_topology_barrier(): warn if stale_versions_in_use() is taking
too long, reporting the current stale version trackers.
- session::gate_count(): new public accessor for diagnostic purposes.
These warnings help distinguish between the two possible hang points
in barrier_and_drain (stale versions vs session drain) and provide
ongoing visibility into what's blocking progress.
(cherry picked from commit d2b695aa64)
This commit is contained in:
@@ -9,6 +9,7 @@
|
||||
#include "service/session.hh"
|
||||
#include "utils/log.hh"
|
||||
#include <seastar/core/coroutine.hh>
|
||||
#include <seastar/core/timer.hh>
|
||||
|
||||
namespace service {
|
||||
|
||||
@@ -59,7 +60,13 @@ void session_manager::initiate_close_of_sessions_except(const std::unordered_set
|
||||
|
||||
future<> session_manager::drain_closing_sessions() {
|
||||
slogger.info("drain_closing_sessions: waiting for lock");
|
||||
seastar::timer<lowres_clock> lock_timer([this] {
|
||||
slogger.warn("drain_closing_sessions: still waiting for lock, available units {}",
|
||||
_session_drain_sem.available_units());
|
||||
});
|
||||
lock_timer.arm_periodic(std::chrono::minutes(5));
|
||||
auto lock = co_await get_units(_session_drain_sem, 1);
|
||||
lock_timer.cancel();
|
||||
auto n = std::distance(_closing_sessions.begin(), _closing_sessions.end());
|
||||
slogger.info("drain_closing_sessions: acquired lock, {} sessions to drain", n);
|
||||
auto i = _closing_sessions.begin();
|
||||
@@ -67,8 +74,15 @@ future<> session_manager::drain_closing_sessions() {
|
||||
session& s = *i;
|
||||
++i;
|
||||
auto id = s.id();
|
||||
slogger.info("drain_closing_sessions: waiting for session {} to close", id);
|
||||
slogger.info("drain_closing_sessions: waiting for session {} to close, gate count {}", id, s.gate_count());
|
||||
std::optional<seastar::timer<lowres_clock>> warn_timer;
|
||||
warn_timer.emplace([&s, id] {
|
||||
slogger.warn("drain_closing_sessions: session {} still not closed, gate count {}",
|
||||
id, s.gate_count());
|
||||
});
|
||||
warn_timer->arm_periodic(std::chrono::minutes(5));
|
||||
co_await s.close();
|
||||
warn_timer.reset();
|
||||
if (_sessions.erase(id)) {
|
||||
slogger.info("drain_closing_sessions: session {} closed", id);
|
||||
}
|
||||
|
||||
@@ -95,6 +95,10 @@ public:
|
||||
return _id;
|
||||
}
|
||||
|
||||
/// Returns the current count reported by this session's gate (`_gate.get_count()`).
/// Exposed for diagnostic purposes: the drain warnings log it to show how many
/// guards are still alive while a session is waiting to close.
size_t gate_count() const {
|
||||
return _gate.get_count();
|
||||
}
|
||||
|
||||
/// Post-condition of successfully resolved future: There are no guards alive for this session, and
|
||||
/// it's impossible to create more such guards later.
|
||||
/// Can be called concurrently.
|
||||
|
||||
@@ -6205,7 +6205,15 @@ future<> storage_service::local_topology_barrier() {
|
||||
}
|
||||
|
||||
rtlogger.info("raft_topology_cmd::barrier_and_drain version {}: waiting for stale token metadata versions to be released", version);
|
||||
co_await ss._shared_token_metadata.stale_versions_in_use();
|
||||
{
|
||||
seastar::timer<lowres_clock> warn_timer([&ss, version] {
|
||||
rtlogger.warn("raft_topology_cmd::barrier_and_drain version {}: still waiting for stale versions, "
|
||||
"stale versions (version: use_count): {}",
|
||||
version, ss._shared_token_metadata.describe_stale_versions());
|
||||
});
|
||||
warn_timer.arm_periodic(std::chrono::minutes(5));
|
||||
co_await ss._shared_token_metadata.stale_versions_in_use();
|
||||
}
|
||||
rtlogger.info("raft_topology_cmd::barrier_and_drain version {}: stale versions released, draining closing sessions", version);
|
||||
co_await get_topology_session_manager().drain_closing_sessions();
|
||||
|
||||
|
||||
Reference in New Issue
Block a user