gossiper: Improve the gossip timer callback lock handling (#5097)

- Update the outdated comments in do_stop_gossiping. It was
  storage_service not storage_proxy that used the lock. More
  importantly, storage_service does not use it any more.

- Drop the unused timer_callback_lock and timer_callback_unlock API

- Use with_semaphore to make sure the semaphore usage is balanced.

- Add log in gossiper::do_stop_gossiping when it tries to take the
  semaphore to help debug hang during the shutdown.

Refs: #4891
Refs: #4971
This commit is contained in:
Asias He
2019-09-25 15:46:38 +08:00
committed by Avi Kivity
parent 425cc0c104
commit 19e8c14ad1
2 changed files with 20 additions and 22 deletions

View File

@@ -610,9 +610,9 @@ future<gossiper::endpoint_permit> gossiper::lock_endpoint(inet_address ep) {
// - failure_detector
// - on_remove callbacks, e.g, storage_service -> access token_metadata
void gossiper::run() {
// Run it in the background.
(void)timer_callback_lock().then([this, g = this->shared_from_this()] {
return seastar::async([this, g] {
// Run it in the background.
(void)seastar::with_semaphore(_callback_running, 1, [this] {
return seastar::async([this, g = this->shared_from_this()] {
logger.trace("=== Gossip round START");
//wait on messaging service to start listening
@@ -734,14 +734,13 @@ void gossiper::run() {
}
}).get();
}
});
}).then_wrapped([this] (auto&& f) {
try {
f.get();
_nr_run++;
logger.trace("=== Gossip round OK");
} catch (...) {
logger.trace("=== Gossip round FAIL");
logger.warn("=== Gossip round FAIL: {}", std::current_exception());
}
if (logger.is_enabled(logging::log_level::trace)) {
@@ -751,9 +750,11 @@ void gossiper::run() {
}
if (_enabled) {
_scheduled_gossip_task.arm(INTERVAL);
} else {
logger.info("Gossip loop is not scheduled because it is disabled");
}
this->timer_callback_unlock();
});
});
}
void gossiper::check_seen_seeds() {
@@ -1848,23 +1849,22 @@ future<> gossiper::do_stop_gossiping() {
} else {
logger.warn("No local state or state is in silent shutdown, not announcing shutdown");
}
logger.info("Disable and wait for gossip loop started");
// Set disable flag and cancel the timer makes sure gossip loop will not be scheduled
_enabled = false;
_scheduled_gossip_task.cancel();
timer_callback_lock().get();
//
// Release the timer semaphore since storage_proxy may be waiting for
// it.
// Gossiper timer is promised to be neither running nor scheduled.
//
timer_callback_unlock();
get_gossiper().invoke_on_all([] (gossiper& g) {
if (engine().cpu_id() == 0) {
g.fd().unregister_failure_detection_event_listener(&g);
}
g.uninit_messaging_service_handler();
g._features_condvar.broken();
return make_ready_future<>();
// Take the semaphore makes sure existing gossip loop is finished
seastar::with_semaphore(_callback_running, 1, [] {
logger.info("Disable and wait for gossip loop finished");
return get_gossiper().invoke_on_all([] (gossiper& g) {
if (engine().cpu_id() == 0) {
g.fd().unregister_failure_detection_event_listener(&g);
}
g.uninit_messaging_service_handler();
g._features_condvar.broken();
});
}).get();
logger.info("Gossip is now stopped");
});
}

View File

@@ -119,8 +119,6 @@ private:
semaphore _callback_running{1};
semaphore _apply_state_locally_semaphore{100};
public:
future<> timer_callback_lock() { return _callback_running.wait(); }
void timer_callback_unlock() { _callback_running.signal(); }
sstring get_cluster_name();
sstring get_partitioner_name();
inet_address get_broadcast_address() const {