From 7267376eac3ff7d9431a6283edd098e2c5572aef Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 6 Nov 2023 13:47:23 +0200 Subject: [PATCH] storage_service: topology coordinator: do not retry the metadata barrier forever in write_both_read_new state Handle the barrier failure by sleeping for a "ring delay" and continuing. The purpose of the barrier is to wait for all reads to old replica set to complete and fence the remaining requests. If the barrier fails we give the fence some time to propagate and continue with the topology change. Of fence did not propagate we may have stale reads, but this is not worse that we have with gossiper. --- service/storage_service.cc | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/service/storage_service.cc b/service/storage_service.cc index 96ef5317e4..643899c656 100644 --- a/service/storage_service.cc +++ b/service/storage_service.cc @@ -2063,7 +2063,7 @@ class topology_coordinator { break; case topology::transition_state::write_both_read_new: { auto node = get_node_to_work_on(std::move(guard)); - + bool barrier_failed = false; // In this state writes goes to old and new replicas but reads start to be done from new replicas // Before we stop writing to old replicas we need to wait for all previous reads to complete try { @@ -2076,7 +2076,15 @@ class topology_coordinator { slogger.error("raft topology: transition_state::write_both_read_new, " "global_token_metadata_barrier failed, error {}", std::current_exception()); - break; + barrier_failed = true; + } + if (barrier_failed) { + // If barrier above failed it means there may be unfenced reads from old replicas. + // Lets wait for the ring delay for those writes to complete or fence to propagate + // before continuing. + // FIXME: nodes that cannot be reached need to be isolated either automatically or + // by an administrator + co_await sleep_abortable(_ring_delay, _as); } switch(node.rs->state) { case node_state::bootstrapping: {