From c2cfdcd345f654ed76ea63ab934490d45b86debc Mon Sep 17 00:00:00 2001 From: Asias He Date: Tue, 25 May 2021 14:37:14 +0800 Subject: [PATCH] gossiper: Set minimum value for quarantine_delay When a new node bootstraps to join the cluster, it is put in the bootstrap gossip status. If the node goes away in the middle of bootstrapping, gossip removes it once it has failed to update gossip for fat_client_timeout, which undoes the new node's status as a pending node. However, if the new node is merely slow to update gossip and finishes bootstrapping after the existing nodes have already removed it due to fat_client_timeout, then, in the handle_state_normal handler, the existing nodes fail to find the host id for the new node and throw, which in turn terminates the scylla process. To mitigate the problem, we enforce a minimum value for fat_client_timeout, which is half of quarantine_delay, when users set a small ring_delay value. Refs #8702 Refs #8859 Closes #8860 --- gms/gossiper.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gms/gossiper.cc b/gms/gossiper.cc index 3557e9f3e8..50e212f8a1 100644 --- a/gms/gossiper.cc +++ b/gms/gossiper.cc @@ -106,7 +106,8 @@ void gossiper::set_seeds(std::set seeds) { } std::chrono::milliseconds gossiper::quarantine_delay() const noexcept { - auto ring_delay = std::chrono::milliseconds(_cfg.ring_delay_ms()); + auto delay = std::max(unsigned(30000), _cfg.ring_delay_ms()); + auto ring_delay = std::chrono::milliseconds(delay); return ring_delay * 2; }