From e31f6893af0f113d8fbd2bf1e365bbbec85fafb8 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 27 Dec 2023 17:36:59 +0200 Subject: [PATCH] storage_service: topology coordinator: fix accessing outdated node in case of barrier failure When a metadata barrier fails, the guard is released and the node becomes outdated. The failure handling path needs to re-take the guard and re-create the node before continuing. Fixes: #16568 Message-ID: --- service/storage_service.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/service/storage_service.cc b/service/storage_service.cc index e57a3ef92a..3319e2afa1 100644 --- a/service/storage_service.cc +++ b/service/storage_service.cc @@ -2156,6 +2156,7 @@ class topology_coordinator { // FIXME: nodes that cannot be reached need to be isolated either automatically or // by an administrator co_await sleep_abortable(_ring_delay, _as); + node = retake_node(co_await start_operation(), node.id); } switch(node.rs->state) { case node_state::bootstrapping: { @@ -2360,6 +2361,7 @@ class topology_coordinator { // Lets wait for the ring delay for those writes to complete and new topology to propagate // before continuing. co_await sleep_abortable(_ring_delay, _as); + node = retake_node(co_await start_operation(), node.id); } // Tell the node to shut down.