From 2de3c079b28555f068fc792fe052485aad58c15f Mon Sep 17 00:00:00 2001
From: Tomasz Grabiec
Date: Mon, 16 Dec 2024 01:11:04 +0100
Subject: [PATCH 1/3] tablets: topology_coordinator: Keep tablet_draining
 transition if nodes are not drained

An empty plan which still had nodes to drain meant that we could exit the
tablet_draining transition and move to the next stage of
decommission/removenode. In case the tablet scheduler creates an empty plan
for some reason while there are still undrained tablets, that could put the
topology in an invalid state. For example, this can currently happen if there
are no non-draining nodes in a DC.

This patch adds a safety net in the topology coordinator which prevents
moving forward with undrained tablets.
---
 service/topology_coordinator.cc | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/service/topology_coordinator.cc b/service/topology_coordinator.cc
index d56b428ddb..cef1df0920 100644
--- a/service/topology_coordinator.cc
+++ b/service/topology_coordinator.cc
@@ -1615,8 +1615,11 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
                 on_internal_error(rtlogger, "should_preempt_balancing() retook the guard");
             }
         }
+
+        bool has_nodes_to_drain = false;
         if (!preempt) {
             auto plan = co_await _tablet_allocator.balance_tablets(get_token_metadata_ptr(), _tablet_load_stats, get_dead_nodes());
+            has_nodes_to_drain = plan.has_nodes_to_drain();
             if (!drain || plan.has_nodes_to_drain()) {
                 co_await generate_migration_updates(updates, guard, plan);
             }
@@ -1664,6 +1667,14 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
         }
 
         if (drain) {
+            if (has_nodes_to_drain) {
+                // Prevent jumping to write_both_read_old with un-drained tablets.
+                // This can happen when all candidate nodes are down.
+                rtlogger.warn("Tablet draining stalled: No tablets migrating but there are nodes to drain");
+                release_guard(std::move(guard));
+                co_await sleep(3s); // Throttle retries
+                co_return;
+            }
             updates.emplace_back(
                 topology_mutation_builder(guard.write_timestamp())
                     .set_transition_state(topology::transition_state::write_both_read_old)

From 8718450172e86e2f9505feba9c5df06bbbb512cc Mon Sep 17 00:00:00 2001
From: Tomasz Grabiec
Date: Mon, 16 Dec 2024 01:14:19 +0100
Subject: [PATCH 2/3] tablets: load_balancer: Ignore skip_list when draining

When doing normal load balancing, we can ignore DOWN nodes in the node set
and just balance the UP nodes among themselves, because it's ok to equalize
load within just that set; it improves the situation.

It's dangerous to do that when draining, because it can lead to overloading
of the UP nodes. In the worst case, we can have only one non-drained node in
the UP set, which would receive all the tablets of the drained node, doubling
its load.

It's safer to let the drain fail or stall. This is decided by the topology
coordinator; currently we will fail (on the barrier) and roll back.
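
To make the worst case concrete, here is a minimal standalone sketch
(illustrative only, not part of the patch; the node names and tablet counts
are hypothetical) of how honoring the skiplist during a drain would funnel
all of the drained node's tablets onto the single remaining UP node:

    #include <cstdio>
    #include <map>
    #include <set>
    #include <string>

    int main() {
        // Hypothetical DC: each node currently holds 100 tablets.
        std::map<std::string, int> tablets = {{"n1", 100}, {"n2", 100}, {"n3", 100}};
        std::set<std::string> skiplist = {"n2"};   // n2 is DOWN
        std::string draining = "n1";               // n1 is being decommissioned

        // If the skiplist were applied while draining, n3 would be the only
        // candidate target and would absorb all of n1's tablets.
        for (auto& [node, count] : tablets) {
            if (node != draining && !skiplist.contains(node)) {
                count += tablets[draining];
            }
        }
        tablets[draining] = 0;
        std::printf("n3 after drain: %d tablets (double its previous load)\n", tablets["n3"]);
        // With this patch the skiplist is ignored while draining, so the plan
        // targets the DOWN node instead and the drain fails/stalls at the
        // barrier rather than silently overloading n3.
    }
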
---
 service/tablet_allocator.cc | 15 ++++--
 test/boost/tablets_test.cc  | 71 +++++++++++++++++++++++++++++++++++++
 2 files changed, 84 insertions(+), 2 deletions(-)

diff --git a/service/tablet_allocator.cc b/service/tablet_allocator.cc
index 74d26d60a9..3aef064282 100644
--- a/service/tablet_allocator.cc
+++ b/service/tablet_allocator.cc
@@ -2443,15 +2443,26 @@ public:
                 lblogger.info("Will drain node {} ({}) from DC {}", node.host_id(), node.get_state(), dc);
                 nodes_to_drain.emplace(node.host_id());
                 nodes[node.host_id()].drained = true;
-            } else if (node.is_excluded() || _skiplist.contains(node.host_id())) {
+            } else if (node.is_excluded()) {
                 // Excluded nodes should not be chosen as targets for migration.
-                lblogger.debug("Ignoring excluded or dead node {}: state={}", node.host_id(), node.get_state());
+                lblogger.debug("Ignoring excluded node {}: state={}", node.host_id(), node.get_state());
             } else {
                 ensure_node(node.host_id());
             }
         }
     });
 
+    // Apply skiplist only when not draining.
+    // It's unsafe to move tablets to non-skip nodes as this can lead to node overload.
+    if (nodes_to_drain.empty()) {
+        for (auto host_to_skip : _skiplist) {
+            if (auto handle = nodes.extract(host_to_skip)) {
+                auto& node = handle.mapped();
+                lblogger.debug("Ignoring dead node {}: state={}", node.id, node.node->get_state());
+            }
+        }
+    }
+
     // Compute tablet load on nodes.
     for (auto&& [table, tmap_] : _tm->tablets().all_tables()) {

diff --git a/test/boost/tablets_test.cc b/test/boost/tablets_test.cc
index 430c6e78dd..b59b6b6497 100644
--- a/test/boost/tablets_test.cc
+++ b/test/boost/tablets_test.cc
@@ -2512,6 +2512,77 @@ SEASTAR_THREAD_TEST_CASE(test_drained_node_is_not_balanced_internally) {
     }).get();
 }
 
+SEASTAR_THREAD_TEST_CASE(test_skiplist_is_ignored_when_draining) {
+    // When doing normal load balancing, we can ignore DOWN nodes in the node set
+    // and just balance the UP nodes among themselves because it's ok to equalize
+    // load in that set.
+    // It's dangerous to do that when draining because that can lead to overloading of the UP nodes.
+    // In the worst case, we can have only one non-drained node in the UP set, which would receive
+    // all the tablets of the drained node, doubling its load.
+    // It's safer to let the drain fail/stall.
+    do_with_cql_env_thread([] (auto& e) {
+        inet_address ip1("192.168.0.1");
+        inet_address ip2("192.168.0.2");
+        inet_address ip3("192.168.0.3");
+
+        auto host1 = host_id(next_uuid());
+        auto host2 = host_id(next_uuid());
+        auto host3 = host_id(next_uuid());
+
+        auto table1 = table_id(next_uuid());
+
+        unsigned shard_count = 1;
+
+        semaphore sem(1);
+        shared_token_metadata stm([&sem] () noexcept { return get_units(sem, 1); }, locator::token_metadata::config{
+            locator::topology::config{
+                .this_endpoint = ip1,
+                .this_host_id = host1,
+                .local_dc_rack = locator::endpoint_dc_rack::default_location
+            }
+        });
+
+        stm.mutate_token_metadata([&] (locator::token_metadata& tm) -> future<> {
+            tm.update_host_id(host1, ip1);
+            tm.update_host_id(host2, ip2);
+            tm.update_host_id(host3, ip3);
+            tm.update_topology(host1, locator::endpoint_dc_rack::default_location, locator::node::state::being_removed, shard_count);
+            tm.update_topology(host2, locator::endpoint_dc_rack::default_location, locator::node::state::normal, shard_count);
+            tm.update_topology(host3, locator::endpoint_dc_rack::default_location, locator::node::state::normal, shard_count);
+            co_await tm.update_normal_tokens(std::unordered_set{token(tests::d2t(1. / 3))}, host1);
+            co_await tm.update_normal_tokens(std::unordered_set{token(tests::d2t(2. / 3))}, host2);
+            co_await tm.update_normal_tokens(std::unordered_set{token(tests::d2t(3. / 3))}, host3);
+
+            tablet_map tmap(2);
+            auto tid = tmap.first_tablet();
+            tmap.set_tablet(tid, tablet_info {
+                tablet_replica_set{tablet_replica{host1, 0}}
+            });
+            tid = *tmap.next_tablet(tid);
+            tmap.set_tablet(tid, tablet_info {
+                tablet_replica_set{tablet_replica{host1, 0}}
+            });
+            tablet_metadata tmeta;
+            tmeta.set_tablet_map(table1, std::move(tmap));
+            tm.set_tablets(std::move(tmeta));
+            co_return;
+        }).get();
+
+        std::unordered_set skiplist = {host2};
+        rebalance_tablets(e.get_tablet_allocator().local(), stm, {}, skiplist);
+
+        {
+            load_sketch load(stm.get());
+            load.populate().get();
+
+            for (auto h : {host2, host3}) {
+                testlog.debug("Checking host {}", h);
+                BOOST_REQUIRE_EQUAL(load.get_avg_shard_load(h), 1);
+            }
+        }
+    }).get();
+}
+
 static void check_tablet_invariants(const tablet_metadata& tmeta) {
     for (auto&& [table, tmap] : tmeta.all_tables()) {

From e732ff7cd89d8a78b38d889c86dc18a0d9742fa9 Mon Sep 17 00:00:00 2001
From: Tomasz Grabiec
Date: Mon, 16 Dec 2024 01:18:11 +0100
Subject: [PATCH 3/3] tablets: load_balancer: Fail when draining with no
 candidate nodes

If we're draining the last node in a DC, we won't have a chance to evaluate
candidates and notice that constraints cannot be satisfied (N < RF). Draining
will succeed and the node will be removed with replicas still present on it.
This will cause a later drain in the same DC to fail, when we will have 2
replicas of a given tablet which need relocation.

The expected behavior is for draining to fail, because we cannot keep the RF
in the DC. This is consistent, for example, with what happens when removing a
node in a 2-node cluster with RF=2.

Fixes #21826
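
As a back-of-the-envelope check (illustrative only, not part of the patch;
the helper name and numbers are hypothetical), a drain can only preserve the
replication factor while the number of remaining non-drained nodes in the DC
stays at or above RF (the N < RF constraint mentioned above):

    #include <cassert>

    // Hypothetical helper: can a DC that has `nodes` nodes still satisfy a
    // replication factor of `rf` once `draining` of them are drained away?
    constexpr bool can_keep_rf(int nodes, int draining, int rf) {
        return nodes - draining >= rf;
    }

    int main() {
        assert(can_keep_rf(3, 1, 2));   // 3-node DC, RF=2: removing one node is fine
        assert(!can_keep_rf(2, 1, 2));  // 2-node DC, RF=2: removenode must fail
        assert(!can_keep_rf(1, 1, 1));  // draining the last node in a DC: the case fixed here
    }
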
---
 service/tablet_allocator.cc          |  4 +++
 test/boost/tablets_test.cc           | 42 ++++++++
 test/topology_custom/test_tablets.py |  3 +-
 3 files changed, 47 insertions(+), 2 deletions(-)

diff --git a/service/tablet_allocator.cc b/service/tablet_allocator.cc
index 3aef064282..ec9b269cbf 100644
--- a/service/tablet_allocator.cc
+++ b/service/tablet_allocator.cc
@@ -2559,6 +2559,10 @@ public:
         }
 
         if (!min_load_node) {
+            if (!nodes_to_drain.empty()) {
+                throw std::runtime_error(format("There are nodes with tablets to drain but no candidate nodes in DC {}."
+                    " Consider adding new nodes or reducing replication factor.", dc));
+            }
             lblogger.debug("No candidate nodes");
             _stats.for_dc(dc).stop_no_candidates++;
             co_return plan;

diff --git a/test/boost/tablets_test.cc b/test/boost/tablets_test.cc
index b59b6b6497..5fb39cb920 100644
--- a/test/boost/tablets_test.cc
+++ b/test/boost/tablets_test.cc
@@ -2512,6 +2512,48 @@ SEASTAR_THREAD_TEST_CASE(test_drained_node_is_not_balanced_internally) {
     }).get();
 }
 
+SEASTAR_THREAD_TEST_CASE(test_plan_fails_when_removing_last_replica) {
+    do_with_cql_env_thread([] (auto& e) {
+        inet_address ip1("192.168.0.1");
+        inet_address ip2("192.168.0.2");
+
+        auto host1 = host_id(next_uuid());
+
+        auto table1 = table_id(next_uuid());
+
+        unsigned shard_count = 1;
+
+        semaphore sem(1);
+        shared_token_metadata stm([&sem] () noexcept { return get_units(sem, 1); }, locator::token_metadata::config{
+            locator::topology::config{
+                .this_endpoint = ip1,
+                .this_host_id = host1,
+                .local_dc_rack = locator::endpoint_dc_rack::default_location
+            }
+        });
+
+        stm.mutate_token_metadata([&] (locator::token_metadata& tm) -> future<> {
+            tm.update_host_id(host1, ip1);
+            tm.update_topology(host1, locator::endpoint_dc_rack::default_location, locator::node::state::being_removed, shard_count);
+            co_await tm.update_normal_tokens(std::unordered_set{token(tests::d2t(1. / 2))}, host1);
+
+            tablet_map tmap(1);
+            for (auto tid : tmap.tablet_ids()) {
+                tmap.set_tablet(tid, tablet_info {
+                    tablet_replica_set{tablet_replica{host1, 0}}
+                });
+            }
+            tablet_metadata tmeta;
+            tmeta.set_tablet_map(table1, std::move(tmap));
+            tm.set_tablets(std::move(tmeta));
+            co_return;
+        }).get();
+
+        std::unordered_set skiplist = {host1};
+        BOOST_REQUIRE_THROW(rebalance_tablets(e.get_tablet_allocator().local(), stm, {}, skiplist), std::runtime_error);
+    }).get();
+}
+
 SEASTAR_THREAD_TEST_CASE(test_skiplist_is_ignored_when_draining) {
     // When doing normal load balancing, we can ignore DOWN nodes in the node set
     // and just balance the UP nodes among themselves because it's ok to equalize

diff --git a/test/topology_custom/test_tablets.py b/test/topology_custom/test_tablets.py
index 79172caca1..30a4c8dabc 100644
--- a/test/topology_custom/test_tablets.py
+++ b/test/topology_custom/test_tablets.py
@@ -631,7 +631,6 @@ async def test_orphaned_sstables_on_startup(manager: ManagerClient):
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("with_zero_token_node", [False, True])
-@pytest.mark.xfail(reason="https://github.com/scylladb/scylladb/issues/21826")
 async def test_remove_failure_with_no_normal_token_owners_in_dc(manager: ManagerClient, with_zero_token_node: bool):
     """
     Reproducer for #21826
@@ -664,7 +663,7 @@ async def test_remove_failure_with_no_normal_token_owners_in_dc(manager: Manager
     logger.info("Attempting removenode - expected to fail")
     await manager.remove_node(initiator_node.server_id, server_id=node_to_remove.server_id,
                               ignore_dead=[replaced_host_id],
-                              expected_error="Removenode failed. See earlier errors (Rolled back: Failed to drain tablets: std::runtime_error (Unable to find new replica for tablet")
+                              expected_error="Removenode failed. See earlier errors (Rolled back: Failed to drain tablets: std::runtime_error (There are nodes with tablets to drain")
 
     logger.info(f"Replacing {node_to_replace} with a new node")
     replace_cfg = ReplaceConfig(replaced_id=node_to_remove.server_id, reuse_ip_addr = False, use_host_id=True, wait_replaced_dead=True)