From 7267376eac3ff7d9431a6283edd098e2c5572aef Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@scylladb.com>
Date: Mon, 6 Nov 2023 13:47:23 +0200
Subject: [PATCH]  storage_service: topology coordinator: do not retry the
 metadata barrier forever in write_both_read_new state

Handle the barrier failure by sleeping for a "ring delay" and
continuing. The purpose of the barrier is to wait for all reads to
old replica set to complete and fence the remaining requests.  If the
barrier fails we give the fence some time to propagate and continue with
the topology change. Of fence did not propagate we may have stale reads,
but this is not worse that we have with gossiper.
---
 service/storage_service.cc | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/service/storage_service.cc b/service/storage_service.cc
index 96ef5317e4..643899c656 100644
--- a/service/storage_service.cc
+++ b/service/storage_service.cc
@@ -2063,7 +2063,7 @@ class topology_coordinator {
                 break;
             case topology::transition_state::write_both_read_new: {
                 auto node = get_node_to_work_on(std::move(guard));
-
+                bool barrier_failed = false;
                 // In this state writes goes to old and new replicas but reads start to be done from new replicas
                 // Before we stop writing to old replicas we need to wait for all previous reads to complete
                 try {
@@ -2076,7 +2076,15 @@ class topology_coordinator {
                     slogger.error("raft topology: transition_state::write_both_read_new, "
                                     "global_token_metadata_barrier failed, error {}",
                                     std::current_exception());
-                    break;
+                    barrier_failed = true;
+                }
+                if (barrier_failed) {
+                    // If barrier above failed it means there may be unfenced reads from old replicas.
+                    // Lets wait for the ring delay for those writes to complete or fence to propagate
+                    // before continuing.
+                    // FIXME: nodes that cannot be reached need to be isolated either automatically or
+                    // by an administrator
+                    co_await sleep_abortable(_ring_delay, _as);
                 }
                 switch(node.rs->state) {
                 case node_state::bootstrapping: {