From 2de3c079b28555f068fc792fe052485aad58c15f Mon Sep 17 00:00:00 2001
From: Tomasz Grabiec
Date: Mon, 16 Dec 2024 01:11:04 +0100
Subject: [PATCH 1/3] tablets: topology_coordinator: Keep tablet_draining
 transition if nodes are not drained

An empty plan which still had nodes to drain meant that we could exit the
tablet_draining transition and move to the next stage of
decommission/removenode. In case the tablet scheduler creates an empty plan
for some reason while there are still undrained tablets, that could put the
topology in an invalid state. For example, this can currently happen if there
are no non-draining nodes in a DC.

This patch adds a safety net in the topology coordinator which prevents
moving forward with undrained tablets.
---
 service/topology_coordinator.cc | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/service/topology_coordinator.cc b/service/topology_coordinator.cc
index d56b428ddb..cef1df0920 100644
--- a/service/topology_coordinator.cc
+++ b/service/topology_coordinator.cc
@@ -1615,8 +1615,11 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
                 on_internal_error(rtlogger, "should_preempt_balancing() retook the guard");
             }
         }
+
+        bool has_nodes_to_drain = false;
         if (!preempt) {
             auto plan = co_await _tablet_allocator.balance_tablets(get_token_metadata_ptr(), _tablet_load_stats, get_dead_nodes());
+            has_nodes_to_drain = plan.has_nodes_to_drain();
             if (!drain || plan.has_nodes_to_drain()) {
                 co_await generate_migration_updates(updates, guard, plan);
             }
@@ -1664,6 +1667,14 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
         }
 
         if (drain) {
+            if (has_nodes_to_drain) {
+                // Prevent jumping to write_both_read_old with un-drained tablets.
+                // This can happen when all candidate nodes are down.
+                rtlogger.warn("Tablet draining stalled: No tablets migrating but there are nodes to drain");
+                release_guard(std::move(guard));
+                co_await sleep(3s); // Throttle retries
+                co_return;
+            }
             updates.emplace_back(
                 topology_mutation_builder(guard.write_timestamp())
                     .set_transition_state(topology::transition_state::write_both_read_old)

From 8718450172e86e2f9505feba9c5df06bbbb512cc Mon Sep 17 00:00:00 2001
From: Tomasz Grabiec
Date: Mon, 16 Dec 2024 01:14:19 +0100
Subject: [PATCH 2/3] tablets: load_balancer: Ignore skip_list when draining

When doing normal load balancing, we can ignore DOWN nodes in the node set
and just balance the UP nodes among themselves, because it's ok to equalize
load within just that set; it improves the situation.

It's dangerous to do that when draining, because it can lead to overloading
of the UP nodes. In the worst case, we can have only one non-drained node in
the UP set, which would receive all the tablets of the drained node, doubling
its load.

It's safer to let the drain fail or stall. This is decided by the topology
coordinator; currently we will fail (on the barrier) and roll back.
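
To make the worst case concrete, here is a minimal standalone sketch
(illustrative only, not part of the patch; the node names and tablet counts
are hypothetical) of how honoring the skiplist during a drain would funnel
all of the drained node's tablets onto the single remaining UP node:

    #include <cstdio>
    #include <map>
    #include <set>
    #include <string>

    int main() {
        // Hypothetical DC: each node currently holds 100 tablets.
        std::map<std::string, int> tablets = {{"n1", 100}, {"n2", 100}, {"n3", 100}};
        std::set<std::string> skiplist = {"n2"};   // n2 is DOWN
        std::string draining = "n1";               // n1 is being decommissioned

        // If the skiplist were applied while draining, n3 would be the only
        // candidate target and would absorb all of n1's tablets.
        for (auto& [node, count] : tablets) {
            if (node != draining && !skiplist.contains(node)) {
                count += tablets[draining];
            }
        }
        tablets[draining] = 0;
        std::printf("n3 after drain: %d tablets (double its previous load)\n", tablets["n3"]);
        // With this patch the skiplist is ignored while draining, so the plan
        // targets the DOWN node instead and the drain fails/stalls at the
        // barrier rather than silently overloading n3.
    }
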
---
 service/tablet_allocator.cc | 15 ++++--
 test/boost/tablets_test.cc  | 71 +++++++++++++++++++++++++++++++++++++
 2 files changed, 84 insertions(+), 2 deletions(-)

diff --git a/service/tablet_allocator.cc b/service/tablet_allocator.cc
index 74d26d60a9..3aef064282 100644
--- a/service/tablet_allocator.cc
+++ b/service/tablet_allocator.cc
@@ -2443,15 +2443,26 @@ public:
                 lblogger.info("Will drain node {} ({}) from DC {}", node.host_id(), node.get_state(), dc);
                 nodes_to_drain.emplace(node.host_id());
                 nodes[node.host_id()].drained = true;
-            } else if (node.is_excluded() || _skiplist.contains(node.host_id())) {
+            } else if (node.is_excluded()) {
                 // Excluded nodes should not be chosen as targets for migration.
-                lblogger.debug("Ignoring excluded or dead node {}: state={}", node.host_id(), node.get_state());
+                lblogger.debug("Ignoring excluded node {}: state={}", node.host_id(), node.get_state());
             } else {
                 ensure_node(node.host_id());
             }
         }
     });
 
+    // Apply skiplist only when not draining.
+    // It's unsafe to move tablets to non-skip nodes as this can lead to node overload.
+    if (nodes_to_drain.empty()) {
+        for (auto host_to_skip : _skiplist) {
+            if (auto handle = nodes.extract(host_to_skip)) {
+                auto& node = handle.mapped();
+                lblogger.debug("Ignoring dead node {}: state={}", node.id, node.node->get_state());
+            }
+        }
+    }
+
     // Compute tablet load on nodes.
     for (auto&& [table, tmap_] : _tm->tablets().all_tables()) {

diff --git a/test/boost/tablets_test.cc b/test/boost/tablets_test.cc
index 430c6e78dd..b59b6b6497 100644
--- a/test/boost/tablets_test.cc
+++ b/test/boost/tablets_test.cc
@@ -2512,6 +2512,77 @@ SEASTAR_THREAD_TEST_CASE(test_drained_node_is_not_balanced_internally) {
     }).get();
 }
 
+SEASTAR_THREAD_TEST_CASE(test_skiplist_is_ignored_when_draining) {
+    // When doing normal load balancing, we can ignore DOWN nodes in the node set
+    // and just balance the UP nodes among themselves because it's ok to equalize
+    // load in that set.
+    // It's dangerous to do that when draining because that can lead to overloading of the UP nodes.
+    // In the worst case, we can have only one non-drained node in the UP set, which would receive
+    // all the tablets of the drained node, doubling its load.
+    // It's safer to let the drain fail/stall.
+    do_with_cql_env_thread([] (auto& e) {
+        inet_address ip1("192.168.0.1");
+        inet_address ip2("192.168.0.2");
+        inet_address ip3("192.168.0.3");
+
+        auto host1 = host_id(next_uuid());
+        auto host2 = host_id(next_uuid());
+        auto host3 = host_id(next_uuid());
+
+        auto table1 = table_id(next_uuid());
+
+        unsigned shard_count = 1;
+
+        semaphore sem(1);
+        shared_token_metadata stm([&sem] () noexcept { return get_units(sem, 1); }, locator::token_metadata::config{
+            locator::topology::config{
+                .this_endpoint = ip1,
+                .this_host_id = host1,
+                .local_dc_rack = locator::endpoint_dc_rack::default_location
+            }
+        });
+
+        stm.mutate_token_metadata([&] (locator::token_metadata& tm) -> future<> {
+            tm.update_host_id(host1, ip1);
+            tm.update_host_id(host2, ip2);
+            tm.update_host_id(host3, ip3);
+            tm.update_topology(host1, locator::endpoint_dc_rack::default_location, locator::node::state::being_removed, shard_count);
+            tm.update_topology(host2, locator::endpoint_dc_rack::default_location, locator::node::state::normal, shard_count);
+            tm.update_topology(host3, locator::endpoint_dc_rack::default_location, locator::node::state::normal, shard_count);
+            co_await tm.update_normal_tokens(std::unordered_set{token(tests::d2t(1. / 3))}, host1);
+            co_await tm.update_normal_tokens(std::unordered_set{token(tests::d2t(2. / 3))}, host2);
+            co_await tm.update_normal_tokens(std::unordered_set{token(tests::d2t(3. / 3))}, host3);
+
+            tablet_map tmap(2);
+            auto tid = tmap.first_tablet();
+            tmap.set_tablet(tid, tablet_info {
+                tablet_replica_set{tablet_replica{host1, 0}}
+            });
+            tid = *tmap.next_tablet(tid);
+            tmap.set_tablet(tid, tablet_info {
+                tablet_replica_set{tablet_replica{host1, 0}}
+            });
+            tablet_metadata tmeta;
+            tmeta.set_tablet_map(table1, std::move(tmap));
+            tm.set_tablets(std::move(tmeta));
+            co_return;
+        }).get();
+
+        std::unordered_set skiplist = {host2};
+        rebalance_tablets(e.get_tablet_allocator().local(), stm, {}, skiplist);
+
+        {
+            load_sketch load(stm.get());
+            load.populate().get();
+
+            for (auto h : {host2, host3}) {
+                testlog.debug("Checking host {}", h);
+                BOOST_REQUIRE_EQUAL(load.get_avg_shard_load(h), 1);
+            }
+        }
+    }).get();
+}
+
 static void check_tablet_invariants(const tablet_metadata& tmeta) {
     for (auto&& [table, tmap] : tmeta.all_tables()) {

From e732ff7cd89d8a78b38d889c86dc18a0d9742fa9 Mon Sep 17 00:00:00 2001
From: Tomasz Grabiec
Date: Mon, 16 Dec 2024 01:18:11 +0100
Subject: [PATCH 3/3] tablets: load_balancer: Fail when draining with no
 candidate nodes

If we're draining the last node in a DC, we won't have a chance to evaluate
candidates and notice that constraints cannot be satisfied (N < RF). Draining
will succeed and the node will be removed with replicas still present on it.
This will cause a later drain in the same DC to fail, when we will have 2
replicas of a given tablet which need relocation.

The expected behavior is for draining to fail, because we cannot keep the RF
in the DC. This is consistent, for example, with what happens when removing a
node in a 2-node cluster with RF=2.

Fixes #21826
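
As a back-of-the-envelope check (illustrative only, not part of the patch;
the helper name and numbers are hypothetical), a drain can only preserve the
replication factor while the number of remaining non-drained nodes in the DC
stays at or above RF (the N < RF constraint mentioned above):

    #include <cassert>

    // Hypothetical helper: can a DC that has `nodes` nodes still satisfy a
    // replication factor of `rf` once `draining` of them are drained away?
    constexpr bool can_keep_rf(int nodes, int draining, int rf) {
        return nodes - draining >= rf;
    }

    int main() {
        assert(can_keep_rf(3, 1, 2));   // 3-node DC, RF=2: removing one node is fine
        assert(!can_keep_rf(2, 1, 2));  // 2-node DC, RF=2: removenode must fail
        assert(!can_keep_rf(1, 1, 1));  // draining the last node in a DC: the case fixed here
    }
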
---
 service/tablet_allocator.cc          |  4 +++
 test/boost/tablets_test.cc           | 42 ++++++++
 test/topology_custom/test_tablets.py |  3 +-
 3 files changed, 47 insertions(+), 2 deletions(-)

diff --git a/service/tablet_allocator.cc b/service/tablet_allocator.cc
index 3aef064282..ec9b269cbf 100644
--- a/service/tablet_allocator.cc
+++ b/service/tablet_allocator.cc
@@ -2559,6 +2559,10 @@ public:
         }
 
         if (!min_load_node) {
+            if (!nodes_to_drain.empty()) {
+                throw std::runtime_error(format("There are nodes with tablets to drain but no candidate nodes in DC {}."
+                    " Consider adding new nodes or reducing replication factor.", dc));
+            }
             lblogger.debug("No candidate nodes");
             _stats.for_dc(dc).stop_no_candidates++;
             co_return plan;

diff --git a/test/boost/tablets_test.cc b/test/boost/tablets_test.cc
index b59b6b6497..5fb39cb920 100644
--- a/test/boost/tablets_test.cc
+++ b/test/boost/tablets_test.cc
@@ -2512,6 +2512,48 @@ SEASTAR_THREAD_TEST_CASE(test_drained_node_is_not_balanced_internally) {
     }).get();
 }
 
+SEASTAR_THREAD_TEST_CASE(test_plan_fails_when_removing_last_replica) {
+    do_with_cql_env_thread([] (auto& e) {
+        inet_address ip1("192.168.0.1");
+        inet_address ip2("192.168.0.2");
+
+        auto host1 = host_id(next_uuid());
+
+        auto table1 = table_id(next_uuid());
+
+        unsigned shard_count = 1;
+
+        semaphore sem(1);
+        shared_token_metadata stm([&sem] () noexcept { return get_units(sem, 1); }, locator::token_metadata::config{
+            locator::topology::config{
+                .this_endpoint = ip1,
+                .this_host_id = host1,
+                .local_dc_rack = locator::endpoint_dc_rack::default_location
+            }
+        });
+
+        stm.mutate_token_metadata([&] (locator::token_metadata& tm) -> future<> {
+            tm.update_host_id(host1, ip1);
+            tm.update_topology(host1, locator::endpoint_dc_rack::default_location, locator::node::state::being_removed, shard_count);
+            co_await tm.update_normal_tokens(std::unordered_set{token(tests::d2t(1. / 2))}, host1);
+
+            tablet_map tmap(1);
+            for (auto tid : tmap.tablet_ids()) {
+                tmap.set_tablet(tid, tablet_info {
+                    tablet_replica_set{tablet_replica{host1, 0}}
+                });
+            }
+            tablet_metadata tmeta;
+            tmeta.set_tablet_map(table1, std::move(tmap));
+            tm.set_tablets(std::move(tmeta));
+            co_return;
+        }).get();
+
+        std::unordered_set skiplist = {host1};
+        BOOST_REQUIRE_THROW(rebalance_tablets(e.get_tablet_allocator().local(), stm, {}, skiplist), std::runtime_error);
+    }).get();
+}
+
 SEASTAR_THREAD_TEST_CASE(test_skiplist_is_ignored_when_draining) {
     // When doing normal load balancing, we can ignore DOWN nodes in the node set
     // and just balance the UP nodes among themselves because it's ok to equalize

diff --git a/test/topology_custom/test_tablets.py b/test/topology_custom/test_tablets.py
index 79172caca1..30a4c8dabc 100644
--- a/test/topology_custom/test_tablets.py
+++ b/test/topology_custom/test_tablets.py
@@ -631,7 +631,6 @@ async def test_orphaned_sstables_on_startup(manager: ManagerClient):
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("with_zero_token_node", [False, True])
-@pytest.mark.xfail(reason="https://github.com/scylladb/scylladb/issues/21826")
 async def test_remove_failure_with_no_normal_token_owners_in_dc(manager: ManagerClient, with_zero_token_node: bool):
     """
     Reproducer for #21826
@@ -664,7 +663,7 @@ async def test_remove_failure_with_no_normal_token_owners_in_dc(manager: Manager
     logger.info("Attempting removenode - expected to fail")
     await manager.remove_node(initiator_node.server_id, server_id=node_to_remove.server_id,
                               ignore_dead=[replaced_host_id],
-                              expected_error="Removenode failed. See earlier errors (Rolled back: Failed to drain tablets: std::runtime_error (Unable to find new replica for tablet")
+                              expected_error="Removenode failed. See earlier errors (Rolled back: Failed to drain tablets: std::runtime_error (There are nodes with tablets to drain")
 
     logger.info(f"Replacing {node_to_replace} with a new node")
     replace_cfg = ReplaceConfig(replaced_id=node_to_remove.server_id, reuse_ip_addr = False, use_host_id=True, wait_replaced_dead=True)