diff --git a/service/tablet_allocator.cc b/service/tablet_allocator.cc index f212a1b440..a192484fed 100644 --- a/service/tablet_allocator.cc +++ b/service/tablet_allocator.cc @@ -2443,15 +2443,26 @@ public: lblogger.info("Will drain node {} ({}) from DC {}", node.host_id(), node.get_state(), dc); nodes_to_drain.emplace(node.host_id()); nodes[node.host_id()].drained = true; - } else if (node.is_excluded() || _skiplist.contains(node.host_id())) { + } else if (node.is_excluded()) { // Excluded nodes should not be chosen as targets for migration. - lblogger.debug("Ignoring excluded or dead node {}: state={}", node.host_id(), node.get_state()); + lblogger.debug("Ignoring excluded node {}: state={}", node.host_id(), node.get_state()); } else { ensure_node(node.host_id()); } } }); + // Apply skiplist only when not draining. + // It's unsafe to move tablets to non-skip nodes as this can lead to node overload. + if (nodes_to_drain.empty()) { + for (auto host_to_skip : _skiplist) { + if (auto handle = nodes.extract(host_to_skip)) { + auto& node = handle.mapped(); + lblogger.debug("Ignoring dead node {}: state={}", node.id, node.node->get_state()); + } + } + } + // Compute tablet load on nodes. for (auto&& [table, tmap_] : _tm->tablets().all_tables()) { @@ -2548,6 +2559,10 @@ public: } if (!min_load_node) { + if (!nodes_to_drain.empty()) { + throw std::runtime_error(format("There are nodes with tablets to drain but no candidate nodes in DC {}." + " Consider adding new nodes or reducing replication factor.", dc)); + } lblogger.debug("No candidate nodes"); _stats.for_dc(dc).stop_no_candidates++; co_return plan; diff --git a/service/topology_coordinator.cc b/service/topology_coordinator.cc index eb374d4b46..f9c2f73406 100644 --- a/service/topology_coordinator.cc +++ b/service/topology_coordinator.cc @@ -1617,8 +1617,11 @@ class topology_coordinator : public endpoint_lifecycle_subscriber { on_internal_error(rtlogger, "should_preempt_balancing() retook the guard"); } } + + bool has_nodes_to_drain = false; if (!preempt) { auto plan = co_await _tablet_allocator.balance_tablets(get_token_metadata_ptr(), _tablet_load_stats, get_dead_nodes()); + has_nodes_to_drain = plan.has_nodes_to_drain(); if (!drain || plan.has_nodes_to_drain()) { co_await generate_migration_updates(updates, guard, plan); } @@ -1666,6 +1669,14 @@ class topology_coordinator : public endpoint_lifecycle_subscriber { } if (drain) { + if (has_nodes_to_drain) { + // Prevent jumping to write_both_read_old with un-drained tablets. + // This can happen when all candidate nodes are down. + rtlogger.warn("Tablet draining stalled: No tablets migrating but there are nodes to drain"); + release_guard(std::move(guard)); + co_await sleep(3s); // Throttle retries + co_return; + } updates.emplace_back( topology_mutation_builder(guard.write_timestamp()) .set_transition_state(topology::transition_state::write_both_read_old) diff --git a/test/boost/tablets_test.cc b/test/boost/tablets_test.cc index 4eda80679a..07ceaf04ca 100644 --- a/test/boost/tablets_test.cc +++ b/test/boost/tablets_test.cc @@ -2515,6 +2515,119 @@ SEASTAR_THREAD_TEST_CASE(test_drained_node_is_not_balanced_internally) { }).get(); } +SEASTAR_THREAD_TEST_CASE(test_plan_fails_when_removing_last_replica) { + do_with_cql_env_thread([] (auto& e) { + inet_address ip1("192.168.0.1"); + inet_address ip2("192.168.0.2"); + + auto host1 = host_id(next_uuid()); + + auto table1 = table_id(next_uuid()); + + unsigned shard_count = 1; + + semaphore sem(1); + shared_token_metadata stm([&sem] () noexcept { return get_units(sem, 1); }, locator::token_metadata::config{ + locator::topology::config{ + .this_endpoint = ip1, + .this_host_id = host1, + .local_dc_rack = locator::endpoint_dc_rack::default_location + } + }); + + stm.mutate_token_metadata([&] (locator::token_metadata& tm) -> future<> { + tm.update_host_id(host1, ip1); + tm.update_topology(host1, locator::endpoint_dc_rack::default_location, locator::node::state::being_removed, shard_count); + co_await tm.update_normal_tokens(std::unordered_set{token(tests::d2t(1. / 2))}, host1); + + tablet_map tmap(1); + for (auto tid : tmap.tablet_ids()) { + tmap.set_tablet(tid, tablet_info { + tablet_replica_set{tablet_replica{host1, 0}} + }); + } + tablet_metadata tmeta; + tmeta.set_tablet_map(table1, std::move(tmap)); + tm.set_tablets(std::move(tmeta)); + co_return; + }).get(); + + std::unordered_set skiplist = {host1}; + BOOST_REQUIRE_THROW(rebalance_tablets(e.get_tablet_allocator().local(), stm, {}, skiplist), std::runtime_error); + }).get(); +} + +SEASTAR_THREAD_TEST_CASE(test_skiplist_is_ignored_when_draining) { + // When doing normal load balancing, we can ignore DOWN nodes in the node set + // and just balance the UP nodes among themselves because it's ok to equalize + // load in that set. + // It's dangerous to do that when draining because that can lead to overloading of the UP nodes. + // In the worst case, we can have only one non-drained node in the UP set, which would receive + // all the tablets of the drained node, doubling its load. + // It's safer to let the drain fail/stall. + do_with_cql_env_thread([] (auto& e) { + inet_address ip1("192.168.0.1"); + inet_address ip2("192.168.0.2"); + inet_address ip3("192.168.0.3"); + + auto host1 = host_id(next_uuid()); + auto host2 = host_id(next_uuid()); + auto host3 = host_id(next_uuid()); + + auto table1 = table_id(next_uuid()); + + unsigned shard_count = 1; + + semaphore sem(1); + shared_token_metadata stm([&sem] () noexcept { return get_units(sem, 1); }, locator::token_metadata::config{ + locator::topology::config{ + .this_endpoint = ip1, + .this_host_id = host1, + .local_dc_rack = locator::endpoint_dc_rack::default_location + } + }); + + stm.mutate_token_metadata([&] (locator::token_metadata& tm) -> future<> { + tm.update_host_id(host1, ip1); + tm.update_host_id(host2, ip2); + tm.update_host_id(host3, ip3); + tm.update_topology(host1, locator::endpoint_dc_rack::default_location, locator::node::state::being_removed, shard_count); + tm.update_topology(host2, locator::endpoint_dc_rack::default_location, locator::node::state::normal, shard_count); + tm.update_topology(host3, locator::endpoint_dc_rack::default_location, locator::node::state::normal, shard_count); + co_await tm.update_normal_tokens(std::unordered_set{token(tests::d2t(1. / 3))}, host1); + co_await tm.update_normal_tokens(std::unordered_set{token(tests::d2t(2. / 3))}, host2); + co_await tm.update_normal_tokens(std::unordered_set{token(tests::d2t(3. / 3))}, host3); + + tablet_map tmap(2); + auto tid = tmap.first_tablet(); + tmap.set_tablet(tid, tablet_info { + tablet_replica_set{tablet_replica{host1, 0}} + }); + tid = *tmap.next_tablet(tid); + tmap.set_tablet(tid, tablet_info { + tablet_replica_set{tablet_replica{host1, 0}} + }); + tablet_metadata tmeta; + tmeta.set_tablet_map(table1, std::move(tmap)); + tm.set_tablets(std::move(tmeta)); + co_return; + }).get(); + + std::unordered_set skiplist = {host2}; + rebalance_tablets(e.get_tablet_allocator().local(), stm, {}, skiplist); + + { + load_sketch load(stm.get()); + load.populate().get(); + + for (auto h : {host2, host3}) { + testlog.debug("Checking host {}", h); + BOOST_REQUIRE_EQUAL(load.get_avg_shard_load(h), 1); + } + } + }).get(); +} + static void check_tablet_invariants(const tablet_metadata& tmeta) { for (auto&& [table, tmap] : tmeta.all_tables()) { diff --git a/test/topology_custom/test_tablets.py b/test/topology_custom/test_tablets.py index 5e0b6d89b4..268cabb797 100644 --- a/test/topology_custom/test_tablets.py +++ b/test/topology_custom/test_tablets.py @@ -631,7 +631,6 @@ async def test_orphaned_sstables_on_startup(manager: ManagerClient): @pytest.mark.asyncio @pytest.mark.parametrize("with_zero_token_node", [False, True]) -@pytest.mark.xfail(reason="https://github.com/scylladb/scylladb/issues/21826") async def test_remove_failure_with_no_normal_token_owners_in_dc(manager: ManagerClient, with_zero_token_node: bool): """ Reproducer for #21826 @@ -664,7 +663,7 @@ async def test_remove_failure_with_no_normal_token_owners_in_dc(manager: Manager logger.info("Attempting removenode - expected to fail") await manager.remove_node(initiator_node.server_id, server_id=node_to_remove.server_id, ignore_dead=[replaced_host_id], - expected_error="Removenode failed. See earlier errors (Rolled back: Failed to drain tablets: std::runtime_error (Unable to find new replica for tablet") + expected_error="Removenode failed. See earlier errors (Rolled back: Failed to drain tablets: std::runtime_error (There are nodes with tablets to drain") logger.info(f"Replacing {node_to_replace} with a new node") replace_cfg = ReplaceConfig(replaced_id=node_to_remove.server_id, reuse_ip_addr = False, use_host_id=True, wait_replaced_dead=True)