Merge 'Fix edge case issues related to tablet draining ' from Tomasz Grabiec

Main problem:

If we're draining the last node in a DC, we won't have a chance to
evaluate candidates and notice that constraints cannot be satisfied (N
< RF). Draining will succeed and node will be removed with replicas
still present on that node. This will cause later draining in the same
DC to fail when we will have 2 replicas which need relocation for a
given tablet.

The expected behavior is for draining to fail, because we cannot keep
the RF in the DC. This is consistent, for example, with what happens
when removing a node in a 2-node cluster with RF=2.

Fixes #21826

Secondary problem:

We allowed tablet_draining transition to be exited with undrained nodes, leaving replicas on nodes in the "left" state.

Third problem:

We removed DOWN nodes from the candidate node set, even when draining. This is not safe because it may lead to overload. This also makes the "main problem" more likely by extending it to the scenario when the DC is DOWN.

The overload part is not a problem in practice currently, since migrations will block on the global topology barrier if there are DOWN nodes.

Closes scylladb/scylladb#21928

* github.com:scylladb/scylladb:
  tablets: load_balancer: Fail when draining with no candidate nodes
  tablets: load_balancer: Ignore skip_list when draining
  tablets: topology_coordinator: Keep tablet_draining transition if nodes are not drained
This commit is contained in:
Botond Dénes
2025-01-07 13:04:00 +02:00
4 changed files with 142 additions and 4 deletions

View File

@@ -2443,15 +2443,26 @@ public:
lblogger.info("Will drain node {} ({}) from DC {}", node.host_id(), node.get_state(), dc);
nodes_to_drain.emplace(node.host_id());
nodes[node.host_id()].drained = true;
} else if (node.is_excluded() || _skiplist.contains(node.host_id())) {
} else if (node.is_excluded()) {
// Excluded nodes should not be chosen as targets for migration.
lblogger.debug("Ignoring excluded or dead node {}: state={}", node.host_id(), node.get_state());
lblogger.debug("Ignoring excluded node {}: state={}", node.host_id(), node.get_state());
} else {
ensure_node(node.host_id());
}
}
});
// Apply skiplist only when not draining.
// It's unsafe to move tablets to non-skip nodes as this can lead to node overload.
if (nodes_to_drain.empty()) {
for (auto host_to_skip : _skiplist) {
if (auto handle = nodes.extract(host_to_skip)) {
auto& node = handle.mapped();
lblogger.debug("Ignoring dead node {}: state={}", node.id, node.node->get_state());
}
}
}
// Compute tablet load on nodes.
for (auto&& [table, tmap_] : _tm->tablets().all_tables()) {
@@ -2548,6 +2559,10 @@ public:
}
if (!min_load_node) {
if (!nodes_to_drain.empty()) {
throw std::runtime_error(format("There are nodes with tablets to drain but no candidate nodes in DC {}."
" Consider adding new nodes or reducing replication factor.", dc));
}
lblogger.debug("No candidate nodes");
_stats.for_dc(dc).stop_no_candidates++;
co_return plan;

View File

@@ -1617,8 +1617,11 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
on_internal_error(rtlogger, "should_preempt_balancing() retook the guard");
}
}
bool has_nodes_to_drain = false;
if (!preempt) {
auto plan = co_await _tablet_allocator.balance_tablets(get_token_metadata_ptr(), _tablet_load_stats, get_dead_nodes());
has_nodes_to_drain = plan.has_nodes_to_drain();
if (!drain || plan.has_nodes_to_drain()) {
co_await generate_migration_updates(updates, guard, plan);
}
@@ -1666,6 +1669,14 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
}
if (drain) {
if (has_nodes_to_drain) {
// Prevent jumping to write_both_read_old with un-drained tablets.
// This can happen when all candidate nodes are down.
rtlogger.warn("Tablet draining stalled: No tablets migrating but there are nodes to drain");
release_guard(std::move(guard));
co_await sleep(3s); // Throttle retries
co_return;
}
updates.emplace_back(
topology_mutation_builder(guard.write_timestamp())
.set_transition_state(topology::transition_state::write_both_read_old)

View File

@@ -2515,6 +2515,119 @@ SEASTAR_THREAD_TEST_CASE(test_drained_node_is_not_balanced_internally) {
}).get();
}
// Reproducer for the "main problem": draining the only node that holds a
// tablet replica must make the plan fail (throw), not silently succeed,
// because the replication factor cannot be kept with no candidate nodes.
SEASTAR_THREAD_TEST_CASE(test_plan_fails_when_removing_last_replica) {
do_with_cql_env_thread([] (auto& e) {
// ip2 is declared but never joins the ring — the cluster has a single
// node (host1) which is also the one being removed.
inet_address ip1("192.168.0.1");
inet_address ip2("192.168.0.2");
auto host1 = host_id(next_uuid());
auto table1 = table_id(next_uuid());
unsigned shard_count = 1;
semaphore sem(1);
shared_token_metadata stm([&sem] () noexcept { return get_units(sem, 1); }, locator::token_metadata::config{
locator::topology::config{
.this_endpoint = ip1,
.this_host_id = host1,
.local_dc_rack = locator::endpoint_dc_rack::default_location
}
});
stm.mutate_token_metadata([&] (locator::token_metadata& tm) -> future<> {
tm.update_host_id(host1, ip1);
// being_removed state forces the load balancer to drain host1.
tm.update_topology(host1, locator::endpoint_dc_rack::default_location, locator::node::state::being_removed, shard_count);
co_await tm.update_normal_tokens(std::unordered_set{token(tests::d2t(1. / 2))}, host1);
// One tablet whose only replica lives on the node being drained.
tablet_map tmap(1);
for (auto tid : tmap.tablet_ids()) {
tmap.set_tablet(tid, tablet_info {
tablet_replica_set{tablet_replica{host1, 0}}
});
}
tablet_metadata tmeta;
tmeta.set_tablet_map(table1, std::move(tmap));
tm.set_tablets(std::move(tmeta));
co_return;
}).get();
// With host1 being drained and no other node available as a migration
// target, rebalancing must throw rather than drop the last replica.
std::unordered_set<host_id> skiplist = {host1};
BOOST_REQUIRE_THROW(rebalance_tablets(e.get_tablet_allocator().local(), stm, {}, skiplist), std::runtime_error);
}).get();
}
// Verifies the "third problem" fix: the skip list (DOWN nodes) must be
// ignored when draining, so drained tablets spread over all remaining
// nodes instead of overloading the UP subset.
SEASTAR_THREAD_TEST_CASE(test_skiplist_is_ignored_when_draining) {
// When doing normal load balancing, we can ignore DOWN nodes in the node set
// and just balance the UP nodes among themselves because it's ok to equalize
// load in that set.
// It's dangerous to do that when draining because that can lead to overloading of the UP nodes.
// In the worst case, we can have only one non-drained node in the UP set, which would receive
// all the tablets of the drained node, doubling its load.
// It's safer to let the drain fail/stall.
do_with_cql_env_thread([] (auto& e) {
// Three-node cluster: host1 is drained (being_removed), host2 is on the
// skip list, host3 is a normal UP node.
inet_address ip1("192.168.0.1");
inet_address ip2("192.168.0.2");
inet_address ip3("192.168.0.3");
auto host1 = host_id(next_uuid());
auto host2 = host_id(next_uuid());
auto host3 = host_id(next_uuid());
auto table1 = table_id(next_uuid());
unsigned shard_count = 1;
semaphore sem(1);
shared_token_metadata stm([&sem] () noexcept { return get_units(sem, 1); }, locator::token_metadata::config{
locator::topology::config{
.this_endpoint = ip1,
.this_host_id = host1,
.local_dc_rack = locator::endpoint_dc_rack::default_location
}
});
stm.mutate_token_metadata([&] (locator::token_metadata& tm) -> future<> {
tm.update_host_id(host1, ip1);
tm.update_host_id(host2, ip2);
tm.update_host_id(host3, ip3);
tm.update_topology(host1, locator::endpoint_dc_rack::default_location, locator::node::state::being_removed, shard_count);
tm.update_topology(host2, locator::endpoint_dc_rack::default_location, locator::node::state::normal, shard_count);
tm.update_topology(host3, locator::endpoint_dc_rack::default_location, locator::node::state::normal, shard_count);
co_await tm.update_normal_tokens(std::unordered_set{token(tests::d2t(1. / 3))}, host1);
co_await tm.update_normal_tokens(std::unordered_set{token(tests::d2t(2. / 3))}, host2);
co_await tm.update_normal_tokens(std::unordered_set{token(tests::d2t(3. / 3))}, host3);
// Both tablet replicas start on the node being drained.
tablet_map tmap(2);
auto tid = tmap.first_tablet();
tmap.set_tablet(tid, tablet_info {
tablet_replica_set{tablet_replica{host1, 0}}
});
tid = *tmap.next_tablet(tid);
tmap.set_tablet(tid, tablet_info {
tablet_replica_set{tablet_replica{host1, 0}}
});
tablet_metadata tmeta;
tmeta.set_tablet_map(table1, std::move(tmap));
tm.set_tablets(std::move(tmeta));
co_return;
}).get();
// host2 is skipped, but while draining it must still be eligible as a
// migration target — otherwise host3 would receive both tablets.
std::unordered_set<host_id> skiplist = {host2};
rebalance_tablets(e.get_tablet_allocator().local(), stm, {}, skiplist);
{
// Expect one tablet per surviving host: proof that the skip list was
// ignored during draining and load was spread evenly.
load_sketch load(stm.get());
load.populate().get();
for (auto h : {host2, host3}) {
testlog.debug("Checking host {}", h);
BOOST_REQUIRE_EQUAL(load.get_avg_shard_load(h), 1);
}
}
}).get();
}
static
void check_tablet_invariants(const tablet_metadata& tmeta) {
for (auto&& [table, tmap] : tmeta.all_tables()) {

View File

@@ -631,7 +631,6 @@ async def test_orphaned_sstables_on_startup(manager: ManagerClient):
@pytest.mark.asyncio
@pytest.mark.parametrize("with_zero_token_node", [False, True])
@pytest.mark.xfail(reason="https://github.com/scylladb/scylladb/issues/21826")
async def test_remove_failure_with_no_normal_token_owners_in_dc(manager: ManagerClient, with_zero_token_node: bool):
"""
Reproducer for #21826
@@ -664,7 +663,7 @@ async def test_remove_failure_with_no_normal_token_owners_in_dc(manager: Manager
logger.info("Attempting removenode - expected to fail")
await manager.remove_node(initiator_node.server_id, server_id=node_to_remove.server_id, ignore_dead=[replaced_host_id],
expected_error="Removenode failed. See earlier errors (Rolled back: Failed to drain tablets: std::runtime_error (Unable to find new replica for tablet")
expected_error="Removenode failed. See earlier errors (Rolled back: Failed to drain tablets: std::runtime_error (There are nodes with tablets to drain")
logger.info(f"Replacing {node_to_replace} with a new node")
replace_cfg = ReplaceConfig(replaced_id=node_to_remove.server_id, reuse_ip_addr = False, use_host_id=True, wait_replaced_dead=True)