From a45e0765e43f60328d9c851409a0b84ae33dd2a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Grzebieluch?= Date: Fri, 26 May 2023 13:57:43 +0200 Subject: [PATCH 1/2] raft topology: `wait_for_peers_to_enter_synchronize_state` doesn't need to resolve all IPs Another node can stop after it joined the group0 but before it advertised itself in gossip. `get_inet_addrs` will try to resolve all IPs and `wait_for_peers_to_enter_synchronize_state` will loop indefinitely. But `wait_for_peers_to_enter_synchronize_state` can return early if one of the nodes confirms that the upgrade procedure has finished. For that, it doesn't need the IPs of all group 0 members - only the IP of some nodes which can do the confirmation. This commit restructures the code so that IPs of nodes are resolved inside the `max_concurrent_for_each` that `wait_for_peers_to_enter_synchronize_state` performs. Then, even if some IPs won't be resolved, but one of the nodes confirms a successful upgrade, we can continue. Fixes #13543 --- service/raft/raft_group0.cc | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/service/raft/raft_group0.cc b/service/raft/raft_group0.cc index c39849ad3c..db4d3d8821 100644 --- a/service/raft/raft_group0.cc +++ b/service/raft/raft_group0.cc @@ -469,6 +469,13 @@ struct group0_members { const raft::server& _group0_server; const raft_address_map& _address_map; + raft::config_member_set get_members() const { + return _group0_server.get_configuration().current; + } + + std::optional get_inet_addr(const raft::config_member& member) const { + return _address_map.find(member.addr.id); + } std::vector get_inet_addrs(seastar::compat::source_location l = seastar::compat::source_location::current()) const { @@ -1192,10 +1199,7 @@ static future wait_for_peers_to_enter_synchronize_state( for (sleep_with_exponential_backoff sleep;; co_await sleep(as)) { // We fetch the config again on every attempt to handle the possibility of removing failed nodes. - auto current_config = members0.get_inet_addrs(); - if (current_config.empty()) { - continue; - } + auto current_members_set = members0.get_members(); ::tracker tracker; auto retry = make_lw_shared(false); @@ -1209,10 +1213,20 @@ static future wait_for_peers_to_enter_synchronize_state( } (void) [] (netw::messaging_service& ms, abort_source& as, gate::holder pause_shutdown, - std::vector current_config, + raft::config_member_set current_members_set, group0_members members0, lw_shared_ptr> entered_synchronize, lw_shared_ptr retry, ::tracker tracker) -> future<> { - co_await max_concurrent_for_each(current_config, max_concurrency, [&] (const gms::inet_address& node) -> future<> { + co_await max_concurrent_for_each(current_members_set, max_concurrency, [&] (const raft::config_member& member) -> future<> { + auto node_opt = members0.get_inet_addr(member); + + if (!node_opt.has_value()) { + upgrade_log.warn("wait_for_peers_to_enter_synchronize_state: cannot resolve the IP of {}", member); + *retry = true; + co_return; + } + + auto node = *node_opt; + if (entered_synchronize->contains(node)) { co_return; } @@ -1249,7 +1263,7 @@ static future wait_for_peers_to_enter_synchronize_state( }); tracker.set_value(false); - }(ms, as, pause_shutdown, std::move(current_config), entered_synchronize, retry, tracker); + }(ms, as, pause_shutdown, std::move(current_members_set), members0, entered_synchronize, retry, tracker); auto finish_early = co_await tracker.get(); if (finish_early) { From fa76d6bd642b2bbd50def0378fe9ba3ece470801 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Grzebieluch?= Date: Thu, 15 Jun 2023 14:24:53 +0200 Subject: [PATCH 2/2] raft topology: test: check if aborted node replacing blocks bootstrap Scenario: 1. Start a cluster with nodes node1, node2, node3 2. Start node4 replacing node node2 3. Stop node node4 after it joined group0 but before it advertised itself in gossip 4. Start node5 replacing node node2 Test simulates the behavior described in #13543. Test passes only if `wait_for_peers_to_enter_synchronize_state` doesn't need to resolve all IPs to return early. If not, node5 will hang trying to resolve the IP of node4: ``` raft_group0_upgrade - : failed to resolve IP addresses of some of the cluster members ([node4's host ID]) ``` --- service/raft/raft_group0.cc | 5 ++ test/topology_experimental_raft/suite.yaml | 2 + .../test_blocked_bootstrap.py | 51 +++++++++++++++++++ 3 files changed, 58 insertions(+) create mode 100644 test/topology_experimental_raft/test_blocked_bootstrap.py diff --git a/service/raft/raft_group0.cc b/service/raft/raft_group0.cc index db4d3d8821..dc444ad215 100644 --- a/service/raft/raft_group0.cc +++ b/service/raft/raft_group0.cc @@ -33,6 +33,7 @@ #include #include #include +#include #include "idl/group0.dist.hh" @@ -583,6 +584,10 @@ future<> raft_group0::setup_group0( co_await join_group0(std::move(seeds), false /* non-voter */, ss, qp, mm, cdc_gen_service); group0_log.info("setup_group0: successfully joined group 0."); + utils::get_local_injector().inject("stop_after_joining_group0", [&] { + throw std::runtime_error{"injection: stop_after_joining_group0"}; + }); + if (replace_info) { // Insert the replaced node's (Raft ID, IP address) pair into `raft_address_map`. // In general, the mapping won't be obtained through the regular gossiping route: diff --git a/test/topology_experimental_raft/suite.yaml b/test/topology_experimental_raft/suite.yaml index 0097efd0be..7b828049e4 100644 --- a/test/topology_experimental_raft/suite.yaml +++ b/test/topology_experimental_raft/suite.yaml @@ -6,3 +6,5 @@ extra_scylla_config_options: authenticator: AllowAllAuthenticator authorizer: AllowAllAuthorizer experimental_features: ['raft'] +skip_in_release: + - test_blocked_bootstrap diff --git a/test/topology_experimental_raft/test_blocked_bootstrap.py b/test/topology_experimental_raft/test_blocked_bootstrap.py new file mode 100644 index 0000000000..ec10f04b0f --- /dev/null +++ b/test/topology_experimental_raft/test_blocked_bootstrap.py @@ -0,0 +1,51 @@ +# Copyright (C) 2023-present ScyllaDB +# +# SPDX-License-Identifier: AGPL-3.0-or-later +# +from test.pylib.scylla_cluster import ReplaceConfig +from test.pylib.manager_client import ManagerClient + +import pytest +import logging + +logger = logging.getLogger(__name__) + + +@pytest.mark.asyncio +async def test_blocked_bootstrap(manager: ManagerClient): + """ + Scenario: + 1. Start a cluster with nodes node1, node2, node3 + 2. Start node4 replacing node node2 + 3. Stop node node4 after it joined group0 but before it advertised itself in gossip + 4. Start node5 replacing node node2 + + Test simulates the behavior described in #13543. + + Test passes only if `wait_for_peers_to_enter_synchronize_state` doesn't need to + resolve all IPs to return early. If not, node5 will hang trying to resolve the + IP of node4: + ``` + raft_group0_upgrade - : failed to resolve IP addresses of some of the cluster members ([node4's host ID]) + ``` + """ + servers = [await manager.server_add() for _ in range(3)] + + logger.info(f"Stopping node {servers[0]}") + await manager.server_stop_gracefully(servers[0].server_id) + + logger.info(f"Replacing node {servers[0]}") + replace_cfg = ReplaceConfig(replaced_id = servers[0].server_id, reuse_ip_addr = False, use_host_id = False) + + try: + await manager.server_add(replace_cfg, config={ + 'error_injections_at_startup': ['stop_after_joining_group0'] + }) + except: + # Node stops before it advertised itself in gossip, so manager.server_add throws an exception + pass + else: + assert False, "Node should stop before it advertised itself in gossip" + + logger.info(f"Replacing node {servers[0]}") + await manager.server_add(replace_cfg)