scylladb/service/topology_coordinator.cc

/*
 * Copyright (C) 2024-present ScyllaDB
 */

/*
 * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
 */

#include <algorithm>
#include <chrono>
#include <fmt/ranges.h>

#include <memory>
#include <seastar/core/abort_source.hh>
#include <seastar/core/coroutine.hh>
#include <seastar/coroutine/as_future.hh>
#include <seastar/coroutine/maybe_yield.hh>
#include <seastar/coroutine/parallel_for_each.hh>
#include <seastar/core/future.hh>
#include <seastar/core/sharded.hh>
#include <seastar/core/with_scheduling_group.hh>
#include <seastar/util/noncopyable_function.hh>

#include <variant>

#include "auth/service.hh"
#include "cdc/cdc_options.hh"
#include "cdc/generation.hh"
#include "cdc/generation_service.hh"
#include "cql3/statements/ks_prop_defs.hh"
#include "db/system_distributed_keyspace.hh"
#include "db/system_keyspace.hh"
#include "dht/boot_strapper.hh"
#include "gms/gossiper.hh"
#include "gms/feature_service.hh"
#include "locator/host_id.hh"
#include "locator/tablets.hh"
#include "locator/token_metadata.hh"
#include "locator/network_topology_strategy.hh"
#include "message/messaging_service.hh"
#include "mutation/async_utils.hh"
#include "raft/raft.hh"
#include "replica/database.hh"
#include "replica/tablet_mutation_builder.hh"
#include "replica/tablets.hh"
#include "db/config.hh"
#include "db/view/view_builder.hh"
#include "service/qos/service_level_controller.hh"
#include "service/migration_manager.hh"
#include "service/raft/group0_voter_handler.hh"
#include "service/raft/join_node.hh"
#include "service/raft/raft_group0.hh"
#include "service/raft/raft_group0_client.hh"
#include "service/tablet_allocator.hh"
#include "service/tablet_operation.hh"
#include "service/topology_state_machine.hh"
#include "db/view/view_building_coordinator.hh"
#include "topology_mutation.hh"
#include "utils/UUID.hh"
#include "utils/assert.hh"
#include "utils/error_injection.hh"
#include "utils/overloaded_functor.hh"
#include "utils/stall_free.hh"
#include "utils/to_string.hh"
#include "service/endpoint_lifecycle_subscriber.hh"
#include "sstables_loader.hh"

#include "idl/join_node.dist.hh"
#include "idl/storage_service.dist.hh"
#include "replica/exceptions.hh"
#include "service/paxos/prepare_response.hh"
#include "idl/storage_proxy.dist.hh"
#include "utils/updateable_value.hh"
#include "repair/repair.hh"
#include "idl/repair.dist.hh"
#include "idl/sstables_loader.dist.hh"

#include "service/topology_coordinator.hh"

#include <boost/range/join.hpp>
#include <seastar/core/metrics_registration.hh>
#include "utils/labels.hh"

using token = dht::token;
using inet_address = gms::inet_address;

namespace service {

logging::logger rtlogger("raft_topology");

locator::host_id to_host_id(raft::server_id id) {
    return locator::host_id{id.uuid()};
}

future<> wait_for_gossiper(raft::server_id id, const gms::gossiper& g, abort_source& as) {
    const auto timeout = std::chrono::seconds{30};
    const auto deadline = lowres_clock::now() + timeout;
    while (true) {
        auto hids = g.get_node_ip(to_host_id(id));
        if (hids) {
            co_return;
        }
        if (lowres_clock::now() > deadline) {
            co_await coroutine::exception(std::make_exception_ptr(
                wait_for_ip_timeout(id, std::chrono::duration_cast<std::chrono::seconds>(timeout).count())));
        }
        static thread_local logger::rate_limit rate_limit{std::chrono::seconds(1)};
        rtlogger.log(log_level::warn, rate_limit, "cannot map {} to ip, retrying.", id);
        co_await sleep_abortable(std::chrono::milliseconds(5), as);
    }
}

namespace {
// Doesn't throw error on absence of values.
sstring get_application_state_gently(const gms::application_state_map& epmap, gms::application_state app_state) {
    const auto it = epmap.find(app_state);
    if (it == epmap.end()) {
        return sstring{};
    }
    // it's versioned_value::value(), not std::optional::value() - it does not throw
    return it->second.value();
}
} // namespace

class topology_coordinator : public endpoint_lifecycle_subscriber
                           , public migration_listener::empty_listener {
    sharded<db::system_distributed_keyspace>& _sys_dist_ks;
    gms::gossiper& _gossiper;
    netw::messaging_service& _messaging;
    locator::shared_token_metadata& _shared_tm;
    db::system_keyspace& _sys_ks;
    replica::database& _db;
    utils::updateable_value<uint32_t> _tablet_load_stats_refresh_interval_in_seconds;
    service::raft_group0& _group0;
    service::topology_state_machine& _topo_sm;
    db::view::view_building_state_machine& _vb_sm;
    abort_source& _as;
    gms::feature_service& _feature_service;
    endpoint_lifecycle_notifier& _lifecycle_notifier;
    qos::service_level_controller& _sl_controller;

    raft::server& _raft;
    const raft::term_t _term;
    uint64_t _last_cmd_index = 0;

    raft_topology_cmd_handler_type _raft_topology_cmd_handler;

    tablet_allocator& _tablet_allocator;
    std::unique_ptr<db::view::view_building_coordinator> _vb_coordinator;

    cdc::generation_service& _cdc_gens;

    // The reason load_stats_ptr is a shared ptr is that load balancer can yield, and we don't want it
    // to suffer lifetime issues when stats refresh fiber overrides the current stats.
    std::unordered_map<locator::host_id, locator::load_stats> _load_stats_per_node;
    serialized_action _tablet_load_stats_refresh;

    static constexpr std::chrono::seconds cdc_streams_gc_refresh_interval = std::chrono::seconds(60);

    std::chrono::milliseconds _ring_delay;

    gate::holder _group0_holder;

    using drop_guard_and_retake = bool_class<class retake_guard_tag>;

    // Engaged if an ongoing topology change should be rolled back. The string inside
    // will indicate a reason for the rollback.
    std::optional<sstring> _rollback;

    group0_voter_handler _voter_handler;

    topology_coordinator_cmd_rpc_tracker& _topology_cmd_rpc_tracker;

    const locator::token_metadata& get_token_metadata() const noexcept {
        return *_shared_tm.get();
    }

    locator::token_metadata_ptr get_token_metadata_ptr() const noexcept {
        return _shared_tm.get();
    }

    // This is a topology snapshot for a given node. It contains pointers into the topology state machine
    // that may be outdated after guard is released so the structure is meant to be destroyed together
    // with the guard
    struct node_to_work_on {
        group0_guard guard;
        const topology_state_machine::topology_type* topology;
        raft::server_id id;
        const replica_state* rs;
        std::optional<topology_request> request;
        std::optional<request_param> req_param;
    };

    future<group0_guard> cleanup_group0_config_if_needed(group0_guard guard) {
        auto& topo = _topo_sm._topology;
        auto rconf = _group0.group0_server().get_configuration();
        if (!rconf.is_joint()) {
            // Find nodes that 'left' but still in the config and remove them
            auto to_remove = std::ranges::to<std::vector<raft::server_id>>(
                    rconf.current
                    | std::views::transform([&] (const raft::config_member& m) { return m.addr.id; })
                    | std::views::filter([&] (const raft::server_id& id) { return topo.left_nodes.contains(id); }));
            if (!to_remove.empty()) {
                // Remove from group 0 nodes that left. They may failed to do so by themselves
                release_guard(std::move(guard));
                try {
                    rtlogger.debug("topology coordinator fiber removing {}"
                                  " from raft since they are in `left` state", to_remove);
                    co_await _group0.group0_server().modify_config({}, to_remove, &_as);
                } catch (const raft::commit_status_unknown&) {
                    rtlogger.warn("topology coordinator fiber got commit_status_unknown status"
                                  " while removing {} from raft", to_remove);
                }
                guard = co_await start_operation();
            }
        }
        co_return std::move(guard);
    }

    struct cancel_requests {
        group0_guard guard;
        std::unordered_set<raft::server_id> dead_nodes;
    };

    // Request to start a cluster-wide cleanup of vnodes-based tables on nodes marked
    // as cleanup_need. This step is required before removenode or decommission
    // topology operations.
    struct start_vnodes_cleanup {
        group0_guard guard;
        topology_request request;
        raft::server_id request_server_id;
    };

    // Return dead nodes
    std::unordered_set<locator::host_id> get_dead_nodes() const {
        std::unordered_set<locator::host_id> dead_set;
        for (auto& n : _topo_sm._topology.normal_nodes) {
            bool alive = false;
            try {
                alive = _gossiper.is_alive(locator::host_id(n.first.uuid()));
            } catch (...) {}

            if (!alive) {
                dead_set.insert(locator::host_id(n.first.uuid()));
            }
        }
        return dead_set;
    }

    // Return dead nodes and while at it checking if there are live nodes that either need cleanup
    // or running one already
    std::unordered_set<raft::server_id> get_dead_node(bool& cleanup_running, bool& cleanup_needed) const {
        std::unordered_set<raft::server_id> dead_set;
        cleanup_needed = cleanup_running = false;
        for (auto& n : _topo_sm._topology.normal_nodes) {
            bool alive = false;
            try {
                alive = _gossiper.is_alive(locator::host_id(n.first.uuid()));
            } catch (...) {}

            if (!alive) {
                dead_set.insert(n.first);
            } else {
                cleanup_running |= (n.second.cleanup == cleanup_status::running);
                cleanup_needed |= (n.second.cleanup == cleanup_status::needed);
            }
        }
        return dead_set;
    }

    std::optional<request_param> get_request_param(raft::server_id id) const {
        return _topo_sm._topology.get_request_param(id);
    };

    // Returns:
    // guard - there is nothing to do.
    // cancel_requests - no request can be started so cancel the queue
    // start_vnodes_cleanup - cleanup needs to be started
    // node_to_work_on - the node the topology coordinator should work on
    std::variant<group0_guard, cancel_requests, start_vnodes_cleanup, node_to_work_on> get_next_task(group0_guard guard) {
        auto& topo = _topo_sm._topology;

        if (topo.transition_nodes.size() != 0) {
            // If there is a node that is the middle of topology operation continue with it
            return get_node_to_work_on(std::move(guard));
        }

        bool cleanup_running;
        bool cleanup_needed;
        const auto dead_nodes = get_dead_node(cleanup_running, cleanup_needed);

        if (cleanup_running || topo.requests.empty()) {
            // Ether there is no requests or there is a live node that runs cleanup. Wait for it to complete.
            return std::move(guard);
        }

        std::optional<std::pair<raft::server_id, topology_request>> next_req;
        bool at_least_one_can_proceed = false;

        for (auto& req : topo.requests) {
            auto enough_live_nodes = [&] {
                if (req.second == topology_request::rebuild) {
                    // For rebuild only the node itself should be alive to start it
                    // it may still fail if down node has data for the rebuild process
                    return !dead_nodes.contains(req.first);
                }
                auto exclude_nodes = get_excluded_nodes_for_topology_request(topo, req.first, req.second);
                for (auto id : dead_nodes) {
                    if (!exclude_nodes.contains(id)) {
                        return false;
                    }
                }
                return true;
            };
            if (enough_live_nodes()) {
                at_least_one_can_proceed = true;
                if (!topo.paused_requests.contains(req.first)) {
                    if (!next_req || next_req->second > req.second) {
                        next_req = req;
                    }
                }
            }
        }

        if (!next_req) {
            if (at_least_one_can_proceed) {
                return std::move(guard);
            }
            // We did not find a request that has enough live node to proceed
            // Cancel all requests to let admin know that no operation can succeed
            rtlogger.info("topology coordinator: no request can proceed. Dead nodes: {}", dead_nodes);
            return cancel_requests{std::move(guard), std::move(dead_nodes)};
        }

        auto [id, req] = *next_req;

        auto* server_rs = topo.find(id);
        if (!server_rs) {
            on_internal_error(rtlogger, format("Node {} has a pending {} request but is not found in topology", id, req));
        }

        if (cleanup_needed && (req == topology_request::remove || req == topology_request::leave)) {
            // If the highest prio request is removenode or decommission we need to start cleanup if one is needed
            return start_vnodes_cleanup(std::move(guard), req, id);
        }

        return node_to_work_on(std::move(guard), &topo, id, &server_rs->second, req, get_request_param(id));
    };

    node_to_work_on get_node_to_work_on(group0_guard guard) const {
        auto& topo = _topo_sm._topology;

        if (topo.transition_nodes.empty()) {
            on_internal_error(rtlogger, ::format(
                "could not find node to work on"
                " even though the state requires it (state: {})", topo.tstate));
        }

        auto e = &*topo.transition_nodes.begin();
        return node_to_work_on{std::move(guard), &topo, e->first, &e->second, std::nullopt, get_request_param(e->first)};
     };

    future<group0_guard> start_operation() {
        rtlogger.debug("obtaining group 0 guard...");
        auto guard = co_await _group0.client().start_operation(_as);
        rtlogger.debug("guard taken, prev_state_id: {}, new_state_id: {}, coordinator term: {}, current Raft term: {}",
                       guard.observed_group0_state_id(), guard.new_group0_state_id(), _term, _raft.get_current_term());

        if (_term != _raft.get_current_term()) {
            throw term_changed_error{};
        }

        co_return std::move(guard);
    }

    void release_node(std::optional<node_to_work_on> node) {
        // Leaving the scope destroys the object and releases the guard.
    }

    node_to_work_on retake_node(group0_guard guard, raft::server_id id) const {
        auto& topo = _topo_sm._topology;

        auto it = topo.find(id);
        if (!it) {
            on_internal_error(rtlogger, format("retake_node: node {} not found in topology", id));
        }

        std::optional<topology_request> req;
        auto rit = topo.requests.find(id);
        if (rit != topo.requests.end()) {
            req = rit->second;
        }
        std::optional<request_param> req_param;
        auto pit = topo.req_param.find(id);
        if (pit != topo.req_param.end()) {
            req_param = pit->second;
        }
        return node_to_work_on{std::move(guard), &topo, id, &it->second, std::move(req), std::move(req_param)};
    }

    group0_guard take_guard(node_to_work_on&& node) {
        return std::move(node.guard);
    }

    future<> update_topology_state(
            group0_guard guard, utils::chunked_vector<canonical_mutation>&& updates, const sstring& reason) {
        try {
            rtlogger.info("updating topology state: {}", reason);
            rtlogger.trace("update_topology_state mutations: {}", updates);
            topology_change change{std::move(updates)};
            group0_command g0_cmd = _group0.client().prepare_command(std::move(change), guard, reason);
            co_await _group0.client().add_entry(std::move(g0_cmd), std::move(guard), _as);
        } catch (group0_concurrent_modification&) {
            rtlogger.info("race while changing state: {}. Retrying", reason);
            throw;
        }
    };

    future<> update_topology_state_with_mixed_change(
            group0_guard guard, utils::chunked_vector<canonical_mutation>&& updates, const sstring& reason) {
        try {
            rtlogger.info("updating topology state with mixed change: {}", reason);
            rtlogger.trace("update_topology_state mutations: {}", updates);
            mixed_change change{std::move(updates)};
            group0_command g0_cmd = _group0.client().prepare_command(std::move(change), guard, reason);
            co_await _group0.client().add_entry(std::move(g0_cmd), std::move(guard), _as);
        } catch (group0_concurrent_modification&) {
            rtlogger.info("race while changing state: {}. Retrying", reason);
            throw;
        }
    }

    raft::server_id parse_replaced_node(const std::optional<request_param>& req_param) const {
        return service::topology::parse_replaced_node(req_param);
    }

    future<> exec_direct_command_helper(raft::server_id id, uint64_t cmd_index, raft_topology_cmd cmd) {
        rtlogger.debug("send {} command with term {} and index {} to {}",
            cmd.cmd, _term, cmd_index, id);
        _topology_cmd_rpc_tracker.active_dst.emplace(id);
        auto _ = seastar::defer([this, id] { _topology_cmd_rpc_tracker.active_dst.erase(id); });

        auto result = _db.get_token_metadata().get_topology().is_me(to_host_id(id)) ?
                    co_await _raft_topology_cmd_handler(_term, cmd_index, cmd) :
                    co_await ser::storage_service_rpc_verbs::send_raft_topology_cmd(
                            &_messaging, to_host_id(id), id, _term, cmd_index, cmd);
        if (result.status == raft_topology_cmd_result::command_status::fail) {
            auto msg = result.error_message.empty()
                ? ::format("failed status returned from {}", id)
                : ::format("failed status returned from {}: {}", id, result.error_message);
            co_await coroutine::exception(std::make_exception_ptr(
                    std::runtime_error(std::move(msg))));
        }
    };

    future<node_to_work_on> exec_direct_command(node_to_work_on&& node, raft_topology_cmd cmd) {
        auto id = node.id;
        release_node(std::move(node));
        const auto cmd_index = ++_last_cmd_index;
        _topology_cmd_rpc_tracker.current = cmd.cmd;
        _topology_cmd_rpc_tracker.index = cmd_index;
        co_await exec_direct_command_helper(id, cmd_index, cmd);
        co_return retake_node(co_await start_operation(), id);
    };

    future<> exec_global_command_helper(auto nodes, raft_topology_cmd cmd) {
        const auto cmd_index = ++_last_cmd_index;
        _topology_cmd_rpc_tracker.current = cmd.cmd;
        _topology_cmd_rpc_tracker.index = cmd_index;
        auto f = co_await coroutine::as_future(
                seastar::parallel_for_each(std::move(nodes), [this, &cmd, cmd_index] (raft::server_id id) {
            return exec_direct_command_helper(id, cmd_index, cmd);
        }));

        if (f.failed()) {
            co_await coroutine::return_exception(std::runtime_error(
                ::format("raft topology: exec_global_command({}) failed with {}",
                    cmd.cmd, f.get_exception())));
        }
    };

    future<group0_guard> exec_global_command(
            group0_guard guard, raft_topology_cmd cmd,
            const std::unordered_set<raft::server_id>& exclude_nodes,
            drop_guard_and_retake drop_and_retake = drop_guard_and_retake::yes) {
        rtlogger.info("executing global topology command {}, excluded nodes: {}", cmd.cmd, exclude_nodes);
        auto nodes = boost::range::join(_topo_sm._topology.normal_nodes, _topo_sm._topology.transition_nodes)
            | std::views::filter([&exclude_nodes] (const std::pair<const raft::server_id, replica_state>& n) {
                return !exclude_nodes.contains(n.first);
            })
            | std::views::keys;
        if (drop_and_retake) {
            release_guard(std::move(guard));
        }
        co_await exec_global_command_helper(std::move(nodes), cmd);
        if (drop_and_retake) {
            guard = co_await start_operation();
        }
        co_return guard;
    }

    std::unordered_set<raft::server_id> get_excluded_nodes_for_topology_request(const topology_state_machine::topology_type& topo,
                raft::server_id id, const std::optional<topology_request>& req) const {
        // ignored_nodes is not per request any longer, but for now consider ignored nodes only
        // for remove and replace operations since only those operations support it on streaming level.
        // Specifically, streaming for bootstrapping doesn't support ignored_nodes
        // (see raft_topology_cmd::command::stream_ranges handler in storage_proxy, neither
        // bootstrap_with_repair nor bs.bootstrap take an ignored_nodes parameter).
        // If we ignored a dead node for a join request here, this request wouldn't be cancelled by
        // get_next_task and the stream_ranges would fail with timeout after wasting some time
        // trying to access the dead node.
        const auto is_remove_or_replace = std::invoke([&]() {
            if (req) {
                return *req == topology_request::remove || *req == topology_request::replace;
            }
            const auto it = topo.transition_nodes.find(id);
            if (it == topo.transition_nodes.end()) {
                return false;
            }
            const auto s = it->second.state;
            return s == node_state::removing || s == node_state::replacing;
        });

        std::unordered_set<raft::server_id> exclude_nodes;
        if (is_remove_or_replace) {
            exclude_nodes = topo.ignored_nodes;
        }
        return exclude_nodes;
    }

    std::unordered_set<raft::server_id> get_excluded_nodes_for_topology_request(const node_to_work_on& node) const {
        return get_excluded_nodes_for_topology_request(*node.topology, node.id, node.request);
    }

    future<group0_guard> remove_from_group0(group0_guard guard, const raft::server_id& id) {
        rtlogger.info("removing node {} from group 0 configuration...", id);
        release_guard(std::move(guard));
        co_await _voter_handler.on_node_removed(id, _as);
        co_await _group0.remove_from_raft_config(id);
        rtlogger.info("node {} removed from group 0 configuration", id);
        co_return co_await start_operation();
    }

    future<> step_down_as_nonvoter() {
        // Become a nonvoter which triggers a leader stepdown.
        co_await _voter_handler.on_node_removed(_raft.id(), _as);
        if (_raft.is_leader()) {
            co_await _raft.wait_for_state_change(&_as);
        }

        // throw term_changed_error so we leave the coordinator loop instead of trying another
        // read_barrier which may fail with an (harmless, but unnecessary and annoying) error
        // telling us we're not in the configuration anymore (we'll get removed by the new
        // coordinator)
        throw term_changed_error{};
    }

    struct bootstrapping_info {
        const std::unordered_set<token>& bootstrap_tokens;
        const replica_state& rs;
    };

    // Returns data for a new CDC generation in the form of mutations for the CDC_GENERATIONS_V3 table
    // and the generation's UUID.
    //
    // If there's a bootstrapping node, its tokens should be included in the new generation.
    // Pass them and a reference to the bootstrapping node's replica_state through `binfo`.
    future<std::pair<utils::UUID, utils::chunked_vector<mutation>>> prepare_new_cdc_generation_data(
            locator::token_metadata_ptr tmptr, const group0_guard& guard, std::optional<bootstrapping_info> binfo,
            noncopyable_function<std::pair<size_t, uint8_t>(locator::host_id)> get_sharding_info_for_host_id) {
        auto get_sharding_info = [&] (dht::token end) -> std::pair<size_t, uint8_t> {
            if (binfo && binfo->bootstrap_tokens.contains(end)) {
                return {binfo->rs.shard_count, binfo->rs.ignore_msb};
            } else {
                auto ep = tmptr->get_endpoint(end);
                if (!ep) {
                    // get_sharding_info is only called for bootstrap tokens
                    // or for tokens present in token_metadata
                    on_internal_error(rtlogger, ::format(
                        "make_new_cdc_generation_data: get_sharding_info:"
                        " can't find endpoint for token {}", end));
                }

                try {
                    return get_sharding_info_for_host_id(*ep);
                } catch (...) {
                    on_internal_error(rtlogger, ::format(
                        "make_new_cdc_generation_data: get_sharding_info:"
                        " can't get sharding info for node {}, owner of token {}."
                        " Reason: {}", *ep, end, std::current_exception()));
                }
            }
        };

        auto gen_uuid = guard.new_group0_state_id();
        auto gen_desc = cdc::make_new_generation_description(
            binfo ? binfo->bootstrap_tokens : std::unordered_set<token>{}, get_sharding_info, tmptr);
        auto gen_table_schema = _db.find_schema(
            db::system_keyspace::NAME, db::system_keyspace::CDC_GENERATIONS_V3);

        const size_t max_command_size = _raft.max_command_size();
        const size_t mutation_size_threshold = max_command_size / 2;
        auto gen_mutations = co_await cdc::get_cdc_generation_mutations_v3(
            gen_table_schema, gen_uuid, gen_desc, mutation_size_threshold, guard.write_timestamp());

        co_return std::pair{gen_uuid, std::move(gen_mutations)};
    }

    // Broadcasts all mutations returned from `prepare_new_cdc_generation_data` except the last one.
    // Each mutation is sent in separate raft command. It takes `group0_guard`, and if the number of mutations
    // is greater than one, the guard is dropped, and a new one is created and returned, otherwise the old one
    // will be returned. Commands are sent in parallel and unguarded (the guard used for sending the last mutation
    // will guarantee that the term hasn't been changed). Returns the generation's UUID, guard and last mutation,
    // which will be sent with additional topology data by the caller.
    //
    // If we send the last mutation in the `write_mutation` command, we would use a total of `n + 1` commands
    // instead of `n-1 + 1` (where `n` is the number of mutations), so it's better to send it in `topology_change`
    // (we need to send it after all `write_mutations`) with some small metadata.
    //
    // With the default commitlog segment size, `mutation_size_threshold` will be 4 MB. In large clusters e.g.
    // 100 nodes, 64 shards per node, 256 vnodes cdc generation data can reach the size of 30 MB, thus
    // there will be no more than 8 commands.
    //
    // In a multi-DC cluster with 100ms latencies between DCs, this operation should take about 200ms since we
    // send the commands concurrently, but even if the commands were replicated sequentially by Raft,
    // it should take no more than 1.6s which is incomparably smaller than bootstrapping operation
    // (bootstrapping is quick if there is no data in the cluster, but usually if one has 100 nodes they
    // have tons of data, so indeed streaming/repair will take much longer (hours/days)).
    future<std::tuple<utils::UUID, group0_guard, canonical_mutation>> prepare_and_broadcast_cdc_generation_data(
            locator::token_metadata_ptr tmptr, group0_guard guard, std::optional<bootstrapping_info> binfo) {

        auto get_sharding_info_for_host_id = [&] (locator::host_id ep) -> std::pair<size_t, uint8_t> {
            auto ptr = _topo_sm._topology.find(raft::server_id{ep.uuid()});
            if (!ptr) {
                throw std::runtime_error(::format(
                        "raft topology: prepare_and_broadcast_cdc_generation_data: get_sharding_info_for_host_id:"
                        " couldn't find node {} in topology", ep));
            }

            auto& rs = ptr->second;
            return {rs.shard_count, rs.ignore_msb};
        };

        co_return co_await prepare_and_broadcast_cdc_generation_data(tmptr, std::move(guard), binfo, get_sharding_info_for_host_id);
    }

    future<std::tuple<utils::UUID, group0_guard, canonical_mutation>> prepare_and_broadcast_cdc_generation_data(
            locator::token_metadata_ptr tmptr, group0_guard guard, std::optional<bootstrapping_info> binfo,
            noncopyable_function<std::pair<size_t, uint8_t>(locator::host_id)> get_sharding_info_for_host_id) {

        auto [gen_uuid, gen_mutations] = co_await prepare_new_cdc_generation_data(tmptr, guard, binfo, std::move(get_sharding_info_for_host_id));

        if (gen_mutations.empty()) {
            on_internal_error(rtlogger, "cdc_generation_data: gen_mutations is empty");
        }

        utils::chunked_vector<canonical_mutation> updates{gen_mutations.begin(), gen_mutations.end()};

        if (updates.size() > 1) {
            release_guard(std::move(guard));

            co_await parallel_for_each(updates.begin(), std::prev(updates.end()), [this, gen_uuid = gen_uuid] (canonical_mutation& m) {
                auto const reason = format(
                    "insert CDC generation data (UUID: {}), part", gen_uuid);

                rtlogger.trace("do update {} reason {}", m, reason);
                write_mutations change{{std::move(m)}};
                group0_command g0_cmd = _group0.client().prepare_command(std::move(change), reason);
                return _group0.client().add_entry_unguarded(std::move(g0_cmd), &_as);
            });

            guard = co_await start_operation();
        }

        co_return std::tuple{gen_uuid, std::move(guard), std::move(updates.back())};
    }

    // Deletes obsolete CDC generations. These are the published generations that stopped operating
    // sufficiently long ago (see comment below).
    //
    // Appends necessary mutations to `updates` and updates the `reason` string.
    future<> clean_obsolete_cdc_generations(
            const group0_guard& guard,
            utils::chunked_vector<canonical_mutation>& updates,
            sstring& reason) {
        const auto& committed_gens = _topo_sm._topology.committed_cdc_generations;
        if (committed_gens.empty()) {
            co_return;
        }

        // If some node can still accept a write to a CDC generation, we cannot delete this generation.
        // A request coordinator accepts a write to one of the previous generations (the ones that stopped
        // operating before `now`) if the write's timestamp is higher than `now - 5s` where `now` is
        // provided by the coordinator's local clock. So, we can safely delete a generation if it
        // stopped operating more than 5s ago on all nodes. But since their clocks can be desynchronized,
        // we have to account for that. We assume that the max clock difference is within `ring_delay`.
        // So we delete only generations that stopped operating more than `5s + ring_delay` ago.
        auto ts_upper_bound = db_clock::now() - (cdc::get_generation_leeway() + _ring_delay);
        utils::get_local_injector().inject("clean_obsolete_cdc_generations_change_ts_ub", [&] {
            ts_upper_bound = db_clock::now();
        });

        // Theoretically, some generations might not be published within `5s + ring_delay` after creation.
        // If that happens, we reduce `ts_upper_bound` to ensure they aren't deleted.
        if (!_topo_sm._topology.unpublished_cdc_generations.empty()) {
            auto first_unpublished_ts = _topo_sm._topology.unpublished_cdc_generations.front().ts;
            if (first_unpublished_ts < ts_upper_bound) {
                ts_upper_bound = first_unpublished_ts;
            }
        }

        std::optional<std::vector<cdc::generation_id>::const_iterator> first_nonobsolete_gen_it;
        for (auto it = committed_gens.begin(); it != committed_gens.end() && it->ts <= ts_upper_bound; it++) {
            if (it + 1 == committed_gens.end() || (it + 1)->ts > ts_upper_bound) {
                first_nonobsolete_gen_it = it;
            }
        }
        if (!first_nonobsolete_gen_it || *first_nonobsolete_gen_it == committed_gens.begin()) {
            co_return;
        }

        auto mut_ts = guard.write_timestamp();

        // Insert a tombstone covering all obsolete generations.
        auto s = _db.find_schema(db::system_keyspace::NAME, db::system_keyspace::CDC_GENERATIONS_V3);
        auto id_upper_bound = (*first_nonobsolete_gen_it)->id;
        mutation m(s, partition_key::from_singular(*s, cdc::CDC_GENERATIONS_V3_KEY));
        auto range = query::clustering_range::make_ending_with({
                clustering_key_prefix::from_single_value(*s, timeuuid_type->decompose(id_upper_bound)), false});
        auto bv = bound_view::from_range(range);
        m.partition().apply_delete(*s, range_tombstone{bv.first, bv.second, tombstone{mut_ts, gc_clock::now()}});
        updates.push_back(canonical_mutation(m));

        std::vector<cdc::generation_id> new_committed_gens(*first_nonobsolete_gen_it, committed_gens.end());
        topology_mutation_builder builder(guard.write_timestamp());
        builder.set_committed_cdc_generations(std::move(new_committed_gens));
        updates.push_back(builder.build());

        reason += ::format("deleted data of CDC generations with time UUID lower than {}", id_upper_bound);
    }

    // If there are some unpublished CDC generations, publishes the one with the oldest timestamp
    // to user-facing description tables.
    //
    // Appends necessary mutations to `updates` and updates the `reason` string.
    future<> publish_oldest_cdc_generation(
            const group0_guard& guard,
            utils::chunked_vector<canonical_mutation>& updates,
            sstring& reason) {
        const auto& unpublished_gens = _topo_sm._topology.unpublished_cdc_generations;
        if (unpublished_gens.empty()) {
            co_return;
        }

        // The generation under index 0 is the oldest because unpublished_cdc_generations are sorted by timestamp.
        auto gen_id = unpublished_gens[0];

        auto gen_data = co_await _sys_ks.read_cdc_generation(gen_id.id);

        co_await _sys_dist_ks.local().create_cdc_desc(
                gen_id.ts, gen_data, { get_token_metadata().count_normal_token_owners() });

        std::vector<cdc::generation_id> new_unpublished_gens(unpublished_gens.begin() + 1, unpublished_gens.end());
        topology_mutation_builder builder(guard.write_timestamp());
        builder.set_unpublished_cdc_generations(std::move(new_unpublished_gens));
        updates.push_back(builder.build());

        reason += ::format("published CDC generation with ID {}, ", gen_id);
    }

    // The background fiber of the topology coordinator that continually publishes committed yet unpublished
    // CDC generations. Every generation is published in a separate group 0 operation.
    //
    // It also continually cleans the obsolete CDC generation data.
    future<> cdc_generation_publisher_fiber() {
        rtlogger.debug("start CDC generation publisher fiber");

        while (!_as.abort_requested()) {
            co_await utils::get_local_injector().inject("cdc_generation_publisher_fiber", [] (auto& handler) -> future<> {
                rtlogger.info("CDC generation publisher fiber sleeps after injection");
                co_await handler.wait_for_message(std::chrono::steady_clock::now() + std::chrono::minutes{5});
                rtlogger.info("CDC generation publisher fiber finishes sleeping after injection");
            }, false);

            bool sleep = false;
            try {
                auto guard = co_await start_operation();
                utils::chunked_vector<canonical_mutation> updates;
                sstring reason;

                co_await publish_oldest_cdc_generation(guard, updates, reason);

                co_await clean_obsolete_cdc_generations(guard, updates, reason);

                if (!updates.empty()) {
                    co_await update_topology_state(std::move(guard), std::move(updates), std::move(reason));
                } else {
                    release_guard(std::move(guard));
                }

                if (_topo_sm._topology.unpublished_cdc_generations.empty()) {
                    // No CDC generations to publish. Wait until one appears or the topology coordinator aborts.
                    rtlogger.debug("CDC generation publisher fiber has nothing to do. Sleeping.");
                    co_await _topo_sm.event.when([&] () {
                        return !_topo_sm._topology.unpublished_cdc_generations.empty() || _as.abort_requested();
                    });
                    rtlogger.debug("CDC generation publisher fiber wakes up");
                }
            } catch (raft::request_aborted&) {
                rtlogger.debug("CDC generation publisher fiber aborted");
            } catch (seastar::abort_requested_exception&) {
                rtlogger.debug("CDC generation publisher fiber aborted");
            } catch (group0_concurrent_modification&) {
            } catch (term_changed_error&) {
                rtlogger.debug("CDC generation publisher fiber notices term change {} -> {}", _term, _raft.get_current_term());
            } catch (...) {
                rtlogger.error("CDC generation publisher fiber got error {}", std::current_exception());
                sleep = true;
            }
            if (sleep) {
                try {
                    co_await seastar::sleep_abortable(std::chrono::seconds(1), _as);
                } catch (...) {
                    rtlogger.debug("CDC generation publisher: sleep failed: {}", std::current_exception());
                }
            }
            co_await coroutine::maybe_yield();
        }
    }

    future<> cdc_streams_gc_fiber() {
        auto can_proceed = [this] { return !_async_gate.is_closed() && !_as.abort_requested(); };
        while (can_proceed()) {
            bool sleep = true;
            try {
                auto guard = co_await start_operation();
                utils::chunked_vector<canonical_mutation> updates;

                co_await _cdc_gens.garbage_collect_cdc_streams(updates, guard.write_timestamp());

                if (!updates.empty()) {
                    co_await update_topology_state(std::move(guard), std::move(updates), "CDC streams GC");
                } else {
                    release_guard(std::move(guard));
                }
            } catch (raft::request_aborted&) {
                rtlogger.debug("CDC streams GC fiber aborted");
                sleep = false;
            } catch (seastar::abort_requested_exception&) {
                rtlogger.debug("CDC streams GC fiber aborted");
                sleep = false;
            } catch (...) {
                rtlogger.warn("CDC streams GC fiber got error {}", std::current_exception());
            }
            auto refresh_interval = utils::get_local_injector().is_enabled("short_cdc_streams_gc_refresh_interval") ?
                    std::chrono::seconds(1) : cdc_streams_gc_refresh_interval;
            if (sleep && can_proceed()) {
                try {
                    co_await seastar::sleep_abortable(refresh_interval, _as);
                } catch (...) {
                    rtlogger.debug("CDC streams GC: sleep failed: {}", std::current_exception());
                }
            }
        }
    }

    // If a node crashes after initiating gossip and before joining group0, it becomes orphan and
    // remains in gossip. This background fiber periodically checks for such nodes and purges those
    // entries from gossiper by committing the node as left in raft.
    future<> gossiper_orphan_remover_fiber() {
        rtlogger.debug("start gossiper orphan remover fiber");
        bool do_speedup_fiber = false;
        co_await utils::get_local_injector().inject("fast_orphan_removal_fiber", [&](auto& handler) -> future<> {
            do_speedup_fiber = true;
            // While testing, orphan ip is introduced, and its presence is captured. Wait is added before remover thread so that the presence of orphan ip is
            // confirmed and the difference of gossiper ips before and after this thread application can be easily asserted.
            return handler.wait_for_message(db::timeout_clock::now() + std::chrono::minutes(5));
        });
        while (!_as.abort_requested()) {
            try {
                auto guard = co_await start_operation();
                utils::chunked_vector<canonical_mutation> updates;
                int32_t timeout = 60;
                co_await utils::get_local_injector().inject("speedup_orphan_removal", [&](auto& handler) -> future<> {
                    // Removes all unjoined nodes. Just for testing purposes.
                    timeout = 0;
                    co_return;
                });
                std::string reason = ::format("Ban the orphan nodes in group0. Orphan nodes HostId/IP:");
                _gossiper.for_each_endpoint_state([&](const gms::endpoint_state& eps) -> void {
                    // Since generation is in seconds unit, converting current time to seconds eases comparison computations.
                    auto current_timestamp =
                            std::chrono::duration_cast<std::chrono::seconds>(std::chrono::high_resolution_clock::now().time_since_epoch()).count();
                    auto generation = eps.get_heart_beat_state().get_generation().value();
                    auto host_id = eps.get_host_id();
                    if (current_timestamp - generation > timeout && !_topo_sm._topology.contains(raft::server_id{host_id.id}) && !_gossiper.is_alive(host_id)) {
                        topology_mutation_builder builder(guard.write_timestamp());
                        // This topology mutation moves a node to left state and bans it. Hence, the value of below fields are not useful.
                        // The dummy_value used for few fields indicates the trivialness of this row entry, and is used to detect this special case.
                        static constexpr uint32_t dummy_value = 0;
                        builder.with_node(raft::server_id{host_id.id})
                                .set("datacenter", get_application_state_gently(eps.get_application_state_map(), gms::application_state::DC))
                                .set("rack", get_application_state_gently(eps.get_application_state_map(), gms::application_state::RACK))
                                .set("release_version", get_application_state_gently(eps.get_application_state_map(), gms::application_state::RELEASE_VERSION))
                                .set("num_tokens", dummy_value)
                                .set("tokens_string", sstring{})
                                .set("shard_count", dummy_value)
                                .set("ignore_msb", dummy_value)
                                .set("request_id", dummy_value)
                                .set("cleanup_status", cleanup_status::clean)
                                .set("node_state", node_state::left);
                        reason.append(::format(" {}/{},", host_id, eps.get_ip()));
                        updates.push_back({builder.build()});
                    }
                });
                if (!updates.empty()) {
                    co_await update_topology_state(std::move(guard), std::move(updates), reason);
                }

            } catch (raft::request_aborted&) {
                rtlogger.debug("gossiper orphan remover fiber aborted");
            } catch (seastar::abort_requested_exception&) {
                rtlogger.debug("gossiper orphan remover fiber aborted");
            } catch (group0_concurrent_modification&) {
            } catch (term_changed_error&) {
                rtlogger.debug("gossiper orphan remover fiber notices term change {} -> {}", _term, _raft.get_current_term());
            } catch (...) {
                rtlogger.error("gossiper orphan remover fiber got error {}", std::current_exception());
            }
            try {
                co_await seastar::sleep_abortable(do_speedup_fiber ? std::chrono::milliseconds(1) : std::chrono::seconds(10), _as);
            } catch (...) {
                rtlogger.debug("gossiper orphan remover: sleep failed: {}", std::current_exception());
            }
            co_await coroutine::maybe_yield();
        }
    }

    future<> group0_voter_refresher_fiber() {
        rtlogger.debug("start group0 members refresh fiber");

        // Skip waiting for the event in the first iteration to start the first voter refresh immediately.
        //
        // This is necessary to ensure we update the voters e.g. in case of a topology coordinator change
        // (the previous topology coordinator might have died in which case we'll need to remove it from
        // the voters).
        bool skip_wait = true;

        while (!_as.abort_requested()) {
            try {
                // only wait for the event in case we are not retrying the operation
                if (!skip_wait) {
                    co_await await_event();
                    // Introduce a brief delay to potentially batch multiple changes that occur in close succession
                    co_await seastar::sleep_abortable(std::chrono::milliseconds{100}, _as);
                }
                skip_wait = false;

                if (!_feature_service.group0_limited_voters) {
                    rtlogger.debug("group0 voters refresh fiber iteration skipped because the feature is disabled");
                    continue;
                }

                co_await _voter_handler.refresh(_as);
            } catch (raft::request_aborted&) {
                rtlogger.debug("group0 voters refresh fiber aborted");
                break; // exit the loop immediately (not waiting for the abort source to be set)
            } catch (seastar::abort_requested_exception&) {
                rtlogger.debug("group0 voters refresh fiber aborted");
                break; // exit the loop immediately (not waiting for the abort source to be set)
            } catch (group0_concurrent_modification&) {
                rtlogger.debug("group0 voters refresh fiber notices concurrent modification, retrying...");
                skip_wait = true;
            } catch (term_changed_error&) {
                rtlogger.debug("group0 voters refresh fiber notices term change {} -> {}", _term, _raft.get_current_term());
                break; // exit the loop immediately (term change means we're not the coordinator anymore)
            } catch (...) {
                rtlogger.error("group0 voters refresh fiber got error {}", std::current_exception());
            }
        }
    }

    enum class keyspace_rf_change_kind {
        default_rf_change,
        conversion_to_rack_list,
        multi_rf_change
    };

    future<keyspace_rf_change_kind> choose_keyspace_rf_change_kind(utils::UUID req_id,
            lw_shared_ptr<keyspace_metadata> old_ks_md,
            lw_shared_ptr<keyspace_metadata> new_ks_md,
            const std::vector<schema_ptr>& tables_with_mvs) {
        const auto& new_replication_strategy_config = new_ks_md->strategy_options();
        const auto& old_replication_strategy_config = old_ks_md->strategy_options();
        auto check_needs_colocation = [&] () -> future<bool> {
            bool rack_list_conversion = false;
            for (const auto& [dc, rf_value] : new_replication_strategy_config) {
                if (std::holds_alternative<locator::rack_list>(rf_value)) {
                    auto it = old_replication_strategy_config.find(dc);
                    if (it != old_replication_strategy_config.end() && std::holds_alternative<sstring>(it->second)) {
                        rack_list_conversion = true;
                        break;
                    }
                }
            }
            co_return rack_list_conversion ? co_await requires_rack_list_colocation(_db, get_token_metadata_ptr(), &_sys_ks, req_id) : false;
        };
        auto all_changes_are_0_N = [&] {
            auto all_dcs = old_replication_strategy_config | std::views::keys;
            auto new_dcs = new_replication_strategy_config | std::views::keys;
            std::set<sstring> dcs(all_dcs.begin(), all_dcs.end());
            dcs.insert(new_dcs.begin(), new_dcs.end());
            for (const auto& dc : dcs) {
                auto old_it = old_replication_strategy_config.find(dc);
                auto new_it = new_replication_strategy_config.find(dc);
                size_t old_rf = (old_it != old_replication_strategy_config.end()) ? locator::get_replication_factor(old_it->second) : 0;
                size_t new_rf = (new_it != new_replication_strategy_config.end()) ? locator::get_replication_factor(new_it->second) : 0;
                if (old_rf == new_rf) {
                    continue;
                }
                if (old_rf != 0 && new_rf != 0) {
                    return false;
                }
            }
            return true;
        };

        if (tables_with_mvs.empty()) {
            co_return keyspace_rf_change_kind::default_rf_change;
        }
        if (co_await check_needs_colocation()) {
            co_return keyspace_rf_change_kind::conversion_to_rack_list;
        }
        if (_feature_service.keyspace_multi_rf_change && locator::uses_rack_list_exclusively(old_replication_strategy_config) && locator::uses_rack_list_exclusively(new_replication_strategy_config) && !rf_count_per_dc_equals(old_replication_strategy_config, new_replication_strategy_config) && all_changes_are_0_N()) {
            co_return keyspace_rf_change_kind::multi_rf_change;
        }
        co_return keyspace_rf_change_kind::default_rf_change;
    }

    // Precondition: there is no node request and no ongoing topology transition
    // (checked under the guard we're holding).
    future<> handle_global_request(group0_guard guard) {
        global_topology_request req;
        utils::UUID req_id;
        db::system_keyspace::topology_requests_entry req_entry;

        if (_topo_sm._topology.global_request) {
            req = *_topo_sm._topology.global_request;
            req_id = _topo_sm._topology.global_request_id.value();
        } else {
            assert(_feature_service.topology_global_request_queue);
            req_id = _topo_sm._topology.global_requests_queue[0];
            req_entry = co_await _sys_ks.get_topology_request_entry(req_id);
            req = std::get<global_topology_request>(req_entry.request_type);
        }
        switch (req) {
        case global_topology_request::new_cdc_generation: {
            rtlogger.info("new CDC generation requested");

            auto tmptr = get_token_metadata_ptr();
            auto [gen_uuid, guard_, mutation] = co_await prepare_and_broadcast_cdc_generation_data(tmptr, std::move(guard), std::nullopt);
            guard = std::move(guard_);

            topology_mutation_builder builder(guard.write_timestamp());
            // We don't delete the request now, but only after the generation is committed. If we deleted
            // the request now and received another new_cdc_generation request later, but before committing
            // the new generation, the second request would also create a new generation. Deleting requests
            // after the generation is committed prevents this from happening. The second request would have
            // no effect - it would just overwrite the first request.
            builder.set_transition_state(topology::transition_state::commit_cdc_generation)
                   .set_new_cdc_generation_data_uuid(gen_uuid)
                   .set_global_topology_request(req)
                   .set_global_topology_request_id(req_id)
                   .drop_first_global_topology_request_id(_topo_sm._topology.global_requests_queue, req_id);
            auto reason = ::format(
                "insert CDC generation data (UUID: {})", gen_uuid);
            co_await update_topology_state(std::move(guard), {std::move(mutation), builder.build()}, reason);
        }
        break;
        case global_topology_request::cleanup:
            co_await start_vnodes_cleanup_on_dirty_nodes(std::move(guard), req_id);
            break;
        case global_topology_request::keyspace_rf_change: {
            rtlogger.info("keyspace_rf_change requested");
            sstring ks_name;
            std::unordered_map<sstring, sstring> saved_ks_props;
            if (_topo_sm._topology.new_keyspace_rf_change_ks_name) {
                ks_name = *_topo_sm._topology.new_keyspace_rf_change_ks_name;
                saved_ks_props = *_topo_sm._topology.new_keyspace_rf_change_data;
            } else {
                ks_name = *req_entry.new_keyspace_rf_change_ks_name;
                saved_ks_props = *req_entry.new_keyspace_rf_change_data;
            }

            auto tbuilder_with_request_drop = [&] () {
                topology_mutation_builder tbuilder(guard.write_timestamp());
                tbuilder.set_transition_state(topology::transition_state::tablet_migration)
                        .set_version(_topo_sm._topology.version + 1)
                        .del_global_topology_request()
                        .del_global_topology_request_id()
                        .drop_first_global_topology_request_id(_topo_sm._topology.global_requests_queue, req_id);
                return tbuilder;
            };

            utils::chunked_vector<canonical_mutation> updates;
            sstring error;
            if (_db.has_keyspace(ks_name)) {
                try {
                    auto& ks = _db.find_keyspace(ks_name);
                    auto tmptr = get_token_metadata_ptr();
                    cql3::statements::ks_prop_defs new_ks_props{std::map<sstring, sstring>{saved_ks_props.begin(), saved_ks_props.end()}};
                    new_ks_props.validate();
                    auto ks_md = new_ks_props.as_ks_metadata_update(ks.metadata(), *tmptr, _db.features(), _db.get_config());
                    _db.validate_keyspace_update(*ks_md);
                    size_t unimportant_init_tablet_count = 2; // must be a power of 2
                    locator::tablet_map new_tablet_map{unimportant_init_tablet_count};

                    auto tables_with_mvs = ks.metadata()->tables();
                    auto views = ks.metadata()->views();
                    tables_with_mvs.insert(tables_with_mvs.end(), views.begin(), views.end());
                    auto rf_change_kind = co_await choose_keyspace_rf_change_kind(req_id, ks.metadata(), ks_md, tables_with_mvs);
                    switch (rf_change_kind) {
                        case keyspace_rf_change_kind::default_rf_change: {
                            if (!tables_with_mvs.empty()) {
                                auto table = tables_with_mvs.front();
                                auto tablet_count = tmptr->tablets().get_tablet_map(table->id()).tablet_count();
                                locator::replication_strategy_params params{ks_md->strategy_options(), tablet_count, ks.metadata()->consistency_option()};
                                auto new_strategy = locator::abstract_replication_strategy::create_replication_strategy("NetworkTopologyStrategy", params, tmptr->get_topology());

                                for (const auto& table_or_mv : tables_with_mvs) {
                                    if (!tmptr->tablets().is_base_table(table_or_mv->id())) {
                                        // Apply the transition only on base tables.
                                        // If this table has a base table then the transition will be applied on the base table, and
                                        // the base table will coordinate the transition for the entire group.
                                        continue;
                                    }
                                    auto old_tablets = co_await tmptr->tablets().get_tablet_map(table_or_mv->id()).clone_gently();
                                    new_tablet_map = co_await new_strategy->maybe_as_tablet_aware()->reallocate_tablets(table_or_mv, tmptr, co_await old_tablets.clone_gently());

                                    replica::tablet_mutation_builder tablet_mutation_builder(guard.write_timestamp(), table_or_mv->id());
                                    co_await new_tablet_map.for_each_tablet([&](locator::tablet_id tablet_id, const locator::tablet_info& tablet_info) -> future<> {
                                        auto last_token = new_tablet_map.get_last_token(tablet_id);
                                        auto old_tablet_info = old_tablets.get_tablet_info(last_token);
                                        auto abandoning_replicas = locator::substract_sets(old_tablet_info.replicas, tablet_info.replicas);
                                        auto new_replicas = locator::substract_sets(tablet_info.replicas, old_tablet_info.replicas);
                                        if (abandoning_replicas.size() + new_replicas.size() > 1) {
                                            throw std::runtime_error(fmt::format("Invalid state of a tablet {} of a table {}.{}. Expected replication factor: {}, but the tablet has replicas only on {}. "
                                                "Try again later or use the \"Fixing invalid replica state with RF change\" procedure to fix the problem.", tablet_id, ks_name, table_or_mv->cf_name(),
                                                ks.get_replication_strategy().get_replication_factor(*tmptr), old_tablet_info.replicas));
                                        }

                                        updates.emplace_back(co_await make_canonical_mutation_gently(
                                                replica::tablet_mutation_builder(guard.write_timestamp(), table_or_mv->id())
                                                        .set_new_replicas(last_token, tablet_info.replicas)
                                                        .set_stage(last_token, locator::tablet_transition_stage::allow_write_both_read_old)
                                                        .set_transition(last_token, locator::choose_rebuild_transition_kind(_feature_service))
                                                        .build()
                                        ));

                                        // Calculate abandoning replica and abort view building tasks on them
                                        if (!abandoning_replicas.empty()) {
                                            if (abandoning_replicas.size() != 1) {
                                                on_internal_error(rtlogger, fmt::format("Keyspace RF abandons {} replicas for table {} and tablet id {}", abandoning_replicas.size(), table_or_mv->id(), tablet_id));
                                            }
                                            _vb_coordinator->abort_tasks(updates, guard, table_or_mv->id(), *abandoning_replicas.begin(), last_token);
                                        }

                                        co_await coroutine::maybe_yield();
                                    });
                                }
                            }
                            auto schema_muts = prepare_keyspace_update_announcement(_db, ks_md, guard.write_timestamp());
                            for (auto& m: schema_muts) {
                                updates.emplace_back(m);
                            }

                            updates.push_back(canonical_mutation(tbuilder_with_request_drop().build()));
                            updates.push_back(canonical_mutation(topology_request_tracking_mutation_builder(req_id)
                                                        .done()
                                                        .build()));
                            break;
                        }
                        case keyspace_rf_change_kind::conversion_to_rack_list: {
                            rtlogger.info("keyspace_rf_change for keyspace {} postponed for colocation", ks_name);
                            topology_mutation_builder tbuilder = tbuilder_with_request_drop();
                            tbuilder.pause_rf_change_request(req_id);
                            updates.push_back(canonical_mutation(tbuilder.build()));
                            break;
                        }
                        case keyspace_rf_change_kind::multi_rf_change: {
                            rtlogger.info("keyspace_rf_change for keyspace {} will use multi-rf change procedure", ks_name);
                            ks_md->set_next_strategy_options(ks_md->strategy_options());
                            ks_md->set_strategy_options(ks.metadata()->strategy_options()); // start from the old strategy
                            auto schema_muts = prepare_keyspace_update_announcement(_db, ks_md, guard.write_timestamp());
                            for (auto& m: schema_muts) {
                                updates.emplace_back(m);
                            }

                            topology_mutation_builder tbuilder = tbuilder_with_request_drop();
                            tbuilder.start_rf_change_migrations(req_id);
                            updates.push_back(canonical_mutation(tbuilder.build()));
                            break;
                        }
                    }
                } catch (const std::exception& e) {
                    error = e.what();
                    rtlogger.error("Couldn't process global_topology_request::keyspace_rf_change, desired new ks opts: {}, error: {}",
                                   saved_ks_props, std::current_exception());
                    updates.clear(); // remove all tablets mutations and only create mutations deleting the global req
                }
            } else {
                error = "Can't ALTER keyspace " + ks_name + ", keyspace doesn't exist";
            }

            if (error != "") {
                updates.push_back(canonical_mutation(tbuilder_with_request_drop().build()));
                updates.push_back(canonical_mutation(topology_request_tracking_mutation_builder(req_id)
                                                         .done(error)
                                                         .build()));
            }

            sstring reason = seastar::format("ALTER tablets KEYSPACE called with options: {}", saved_ks_props);
            rtlogger.trace("do update {} reason {}", updates, reason);
            mixed_change change{std::move(updates)};
            group0_command g0_cmd = _group0.client().prepare_command(std::move(change), guard, reason);
            co_await utils::get_local_injector().inject("wait-before-committing-rf-change-event", utils::wait_for_message(30s));
            co_await _group0.client().add_entry(std::move(g0_cmd), std::move(guard), _as);
        }
        break;
        case global_topology_request::truncate_table: {
            rtlogger.info("TRUNCATE TABLE requested");
            topology_mutation_builder builder(guard.write_timestamp());
            builder.set_transition_state(topology::transition_state::truncate_table)
                   .set_global_topology_request(req)
                   .set_global_topology_request_id(req_id)
                   .drop_first_global_topology_request_id(_topo_sm._topology.global_requests_queue, req_id)
                   .set_session(session_id(req_id));
            co_await update_topology_state(std::move(guard), {builder.build()}, "TRUNCATE TABLE requested");
        }
        break;
        case global_topology_request::noop_request: {
            utils::chunked_vector<canonical_mutation> updates;
            updates.push_back(canonical_mutation(
                    topology_mutation_builder(guard.write_timestamp())
                         .drop_first_global_topology_request_id(_topo_sm._topology.global_requests_queue, req_id)
                         .build()));
            updates.push_back(canonical_mutation(
                    topology_request_tracking_mutation_builder(req_id)
                         .set("start_time", db_clock::now())
                         .done()
                         .build()));
            co_await update_topology_state(std::move(guard), std::move(updates), "no-op request completed");
        }
        break;
        case global_topology_request::snapshot_tables: {
            rtlogger.info("SNAPSHOT TABLES requested");
            topology_mutation_builder builder(guard.write_timestamp());
            builder.set_transition_state(topology::transition_state::snapshot_tables)
                   .set_global_topology_request(req)
                   .set_global_topology_request_id(req_id)
                   .drop_first_global_topology_request_id(_topo_sm._topology.global_requests_queue, req_id)
                   .set_session(session_id(req_id));
            co_await update_topology_state(std::move(guard), {builder.build()}, "SNAPSHOT TABLES requested");
        }
        break;
        case global_topology_request::finalize_migration: {
            rtlogger.info("finalize_migration requested");

            auto ks_name = *req_entry.finalize_migration_ks_name;
            utils::chunked_vector<canonical_mutation> updates;
            sstring error;

            if (_db.has_keyspace(ks_name)) {
                try {
                    auto& ks = _db.find_keyspace(ks_name);
                    if (ks.uses_tablets()) {
                        throw std::runtime_error(fmt::format("Keyspace '{}' already uses tablets", ks_name));
                    }

                    auto tmptr = get_token_metadata_ptr();
                    const auto& tablet_metadata = tmptr->tablets();
                    auto tables = ks.metadata()->tables();

                    // Verify all tables have tablet maps.
                    for (const auto& schema : tables) {
                        if (!tablet_metadata.has_tablet_map(schema->id())) {
                            throw std::runtime_error(fmt::format(
                                "Table {}.{} does not have a tablet map", ks_name, schema->cf_name()));
                        }
                    }

                    // Find the migration direction (tablets or rollback to vnodes).
                    // Nodes that haven't set their intended mode are treated as vnodes (the default).
                    std::optional<intended_storage_mode> global_intended_mode;
                    for (const auto& [server_id, replica_state] : _topo_sm._topology.normal_nodes) {
                        auto replica_intended_mode = replica_state.storage_mode ? *replica_state.storage_mode : intended_storage_mode::vnodes;
                        if (!global_intended_mode) {
                            global_intended_mode = replica_intended_mode;
                        } else if (replica_intended_mode != *global_intended_mode) {
                            throw std::runtime_error(fmt::format(
                                "Cannot finalize migration for keyspace '{}': node {} has intended storage mode '{}', expected '{}'",
                                ks_name, server_id, replica_intended_mode, *global_intended_mode));
                        }
                    }
                    if (!global_intended_mode) {
                        on_internal_error(rtlogger, fmt::format(
                            "finalize_migration: no normal nodes found while finalizing migration for keyspace '{}'", ks_name));
                    }
                    bool rollback = *global_intended_mode == intended_storage_mode::vnodes;

                    rtlogger.info("Finalizing migration for keyspace '{}': direction={}",
                        ks_name, rollback ? "rollback to vnodes" : "forward to tablets");

                    co_await _tablet_load_stats_refresh.trigger();

                    // Verify that the actual storage mode matches the intended mode for all normal nodes.
                    // A node that has migrated a table's storage to tablets will report it in its load_stats.
                    for (const auto& [node_id, _] : _topo_sm._topology.normal_nodes) {
                        auto host_id = to_host_id(node_id);
                        auto it = _load_stats_per_node.find(host_id);
                        if (!rollback) { // forward path (vnodes to tablets)
                            if (it == _load_stats_per_node.end()) {
                                throw std::runtime_error(fmt::format(
                                    "No load stats available for node {}", host_id));
                            }
                            const auto& node_stats = it->second;
                            for (const auto& schema : tables) {
                                if (!node_stats.tables.contains(schema->id())) {
                                    throw std::runtime_error(fmt::format(
                                        "Node {} has not yet migrated table {}.{} to tablets",
                                        host_id, ks_name, schema->cf_name()));
                                }
                            }
                        } else { // rollback path (tablets to vnodes)
                            if (it != _load_stats_per_node.end()) {
                                const auto& node_stats = it->second;
                                for (const auto& schema : tables) {
                                    if (node_stats.tables.contains(schema->id())) {
                                        throw std::runtime_error(fmt::format(
                                            "Node {} still reports table {}.{} as tablet-based, rollback not complete",
                                            host_id, ks_name, schema->cf_name()));
                                    }
                                }
                            }
                        }
                    }

                    if (!rollback) {
                        // All nodes have been migrated. ALTER the keyspace to use tablets.
                        auto old_md = ks.metadata();
                        auto new_md = data_dictionary::keyspace_metadata::new_keyspace(
                            old_md->name(),
                            old_md->strategy_name(),
                            old_md->strategy_options(),
                            std::optional<unsigned>(0), // initial_tablets=0 means auto
                            old_md->consistency_option(),
                            old_md->durable_writes(),
                            old_md->get_storage_options());
                        auto schema_muts = prepare_keyspace_update_announcement(_db, new_md, guard.write_timestamp());
                        for (auto& m : schema_muts) {
                            updates.emplace_back(m);
                        }
                    } else {
                        // Rollback: delete tablet maps for all tables in the keyspace.
                        for (const auto& schema : tables) {
                            updates.emplace_back(replica::make_drop_tablet_map_mutation(schema->id(), guard.write_timestamp()));
                        }
                    }
                } catch (const std::exception& e) {
                    error = e.what();
                    rtlogger.error("Couldn't process global_topology_request::finalize_migration for keyspace '{}': {}",
                                   ks_name, std::current_exception());
                    updates.clear();
                }
            } else {
                error = fmt::format("Keyspace '{}' does not exist", ks_name);
            }

            topology_mutation_builder tbuilder(guard.write_timestamp());
            tbuilder.del_global_topology_request()
                    .del_global_topology_request_id()
                    .drop_first_global_topology_request_id(_topo_sm._topology.global_requests_queue, req_id);

            if (error.empty()) {
                // Only clear intended_storage_mode if no other keyspace is still under migration.
                auto tmptr = get_token_metadata_ptr();
                const auto& tmd = tmptr->tablets();
                bool has_other_migrating_ks = false;
                for (const auto& other_ks_name : _db.get_non_system_keyspaces()) {
                    if (other_ks_name == ks_name) {
                        continue;
                    }
                    auto& other_ks = _db.find_keyspace(other_ks_name);
                    if (other_ks.uses_tablets()) {
                        continue;
                    }
                    bool other_ks_has_tablet_map = std::ranges::any_of(other_ks.metadata()->tables(), [&](const auto& s) {
                        return tmd.has_tablet_map(s->id());
                    });
                    if (other_ks_has_tablet_map) {
                        has_other_migrating_ks = true;
                        break;
                    }
                }
                if (!has_other_migrating_ks) {
                    for (const auto& [node_id, _] : _topo_sm._topology.normal_nodes) {
                        tbuilder.with_node(node_id).del("intended_storage_mode");
                    }
                }
            }

            updates.push_back(canonical_mutation(
                    topology_request_tracking_mutation_builder(req_id)
                         .done(error)
                         .build()));
            updates.push_back(canonical_mutation(tbuilder.build()));

            sstring reason = fmt::format("finalize vnode-to-tablet migration for keyspace '{}'", ks_name);
            mixed_change change{std::move(updates)};
            group0_command g0_cmd = _group0.client().prepare_command(std::move(change), guard, reason);
            co_await _group0.client().add_entry(std::move(g0_cmd), std::move(guard), _as);
        }
        break;
        }
    }

    // Preconditions:
    // - There are no topology operations in progress
    // - `features_to_enable` represents a set of features that are currently
    //   marked as supported by all normal nodes and it is not empty
    future<> enable_features(group0_guard guard, std::set<sstring> features_to_enable) {
        if (!_topo_sm._topology.transition_nodes.empty()) {
            on_internal_error(rtlogger,
                    "topology coordinator attempted to enable features even though there is"
                    " a topology operations in progress");
        }

        if (utils::get_local_injector().enter("raft_topology_suppress_enabling_features")) {
            // Prevent enabling features while the injection is enabled.
            // The topology coordinator will detect in the next iteration
            // that there are still some cluster features to enable and will
            // reach this place again. In order not to spin in a loop, sleep
            // for a short while.
            co_await sleep(std::chrono::milliseconds(100));
            co_return;
        }

        // If we are here, then we noticed that all normal nodes support some
        // features that are not enabled yet. Perform a global barrier to make
        // sure that:
        //
        // 1. All normal nodes saw (and persisted) a view of the system.topology
        //    table that is equal to what the topology coordinator sees (or newer,
        //    but in that case updating the topology state will fail),
        // 2. None of the normal nodes is restarting at the moment and trying to
        //    downgrade (this is done by a special check in the barrier handler).
        //
        // It's sufficient to only include normal nodes because:
        //
        // - There are no transitioning nodes due to the precondition,
        // - New and left nodes are not part of group 0.
        //
        // After we get a successful confirmation from each normal node, we have
        // a guarantee that they won't attempt to revoke support for those
        // features. That's because we do not allow nodes to boot without
        // a feature that is supported by all nodes in the cluster, even if
        // the feature is not enabled yet.
        guard = co_await exec_global_command(std::move(guard),
                raft_topology_cmd{raft_topology_cmd::command::barrier},
                {_raft.id()},
                drop_guard_and_retake::no);

        topology_mutation_builder builder(guard.write_timestamp());
        builder.add_enabled_features(features_to_enable);
        auto reason = ::format("enabling features: {}", features_to_enable);
        co_await update_topology_state(std::move(guard), {builder.build()}, reason);

        rtlogger.info("enabled features: {}", features_to_enable);
    }

    future<group0_guard> global_token_metadata_barrier(group0_guard&& guard, std::unordered_set<raft::server_id> exclude_nodes = {}, bool* fenced = nullptr, bool drain_all_nodes = false) {
        auto version = _topo_sm._topology.version;
        bool drain_failed = false;
        try {
            guard = co_await exec_global_command(std::move(guard), raft_topology_cmd::command::barrier_and_drain, exclude_nodes, drop_guard_and_retake::yes);
        } catch (...) {
            rtlogger.warn("drain rpc failed, proceed to fence old writes: {}", std::current_exception());
            if (drain_all_nodes) {
                throw;
            }
            drain_failed = true;
        }
        if (drain_failed) {
            guard = co_await start_operation();
        }
        topology_mutation_builder builder(guard.write_timestamp());
        // Other nodes are guaranteed to be drained on success only up to the version which was current
        // prior to barrier_and_drain RPCs were sent.
        builder.set_fence_version(version);
        auto reason = ::format("advance fence version to {}", version);
        co_await update_topology_state(std::move(guard), {builder.build()}, reason);
        if (fenced) {
            *fenced = true;
        }
        guard = co_await start_operation();
        if (drain_failed) {
            // if drain failed need to wait for fence to be active on all nodes
            co_return co_await exec_global_command(std::move(guard), raft_topology_cmd::command::barrier, exclude_nodes, drop_guard_and_retake::yes);
        } else {
            co_return std::move(guard);
        }
    }

    future<group0_guard> global_tablet_token_metadata_barrier(group0_guard guard) {
        // FIXME: Don't require all nodes to be up, only tablet replicas.

        // Let x be the current topology version, post-conditions of this barrier:
        // * there are no coordinators with versions < x and no such coordinators
        //   are possible in the future
        // * no replicas are currently executing requests with versions < x - 1
        //   and no new such requests are possible in the future
        //     Why? The barrier_and_drain handler runs group0.read_barrier() first,
        //     which guarantees that the new version and the previous fence_version are
        //     published on all shards before we drain them. After that we drain all
        //     requests with versions < x ==> no current and future requests are possible
        //     with versions < x - 1 since the fence for x - 1 is set. Future stale
        //     requests with version x - 1 are sill possible until the next
        //     global barrier.
        // * a quorum of replicas doesn't allow new requests with versions < x,
        //   but there could be arbitrary number of already running read or mutation
        //   requests with version x - 1 on those replicas
        // * some replicas could still be accepting new requests with versions == x - 1

        bool* const fenced = nullptr;
        const auto drain_all_nodes = true;
        return global_token_metadata_barrier(std::move(guard),
            _topo_sm._topology.ignored_nodes,
            fenced,
            drain_all_nodes);
    }

    // Represents a two-state state machine which changes monotonically
    // from "not executed" to "executed successfully". This state
    // machine is transient, lives only on this coordinator.
    // The transition is achieved by execution of an idempotent async
    // operation which is tracked by a future. Even though the async
    // action is idempotent, it is costly, so we want to avoid
    // re-executing it if it was already started by this coordinator,
    // that's why we track it.
    using background_action_holder = std::optional<future<>>;

    // Transient state of tablet migration which lives on this coordinator.
    // It is guaranteed to die when migration is finished.
    // Next migration of the same tablet is guaranteed to use a different instance.
    struct tablet_migration_state {
        background_action_holder streaming;
        background_action_holder rebuild_repair;
        background_action_holder cleanup;
        background_action_holder repair;
        background_action_holder repair_update_compaction_ctrl;
        background_action_holder restore;
        std::unordered_map<locator::tablet_transition_stage, background_action_holder> barriers;
        // Record the repair_time returned by the repair_tablet rpc call
        db_clock::time_point repair_time;
        // Record the repair task update muations
        utils::chunked_vector<canonical_mutation> repair_task_updates;
        service::session_id session_id;
        uint32_t repair_update_compaction_ctrl_retried = 0;
    };

    std::unordered_map<locator::global_tablet_id, tablet_migration_state> _tablets;

    // Set to true when any action started on behalf of a background_action_holder
    // for any tablet finishes, or fails and needs to be restarted.
    bool _tablets_ready = false;

    seastar::named_gate _async_gate;

    bool action_failed(background_action_holder& holder) const {
        return holder && holder->failed();
    }

    // This function drives background_action_holder towards "executed successfully"
    // by starting the action if it is not already running or if the previous instance
    // of the action failed. If the action is already running, it does nothing.
    // Returns true iff background_action_holder reached the "executed successfully" state.
    bool advance_in_background(locator::global_tablet_id gid, background_action_holder& holder, const char* name,
                               std::function<future<>()> action) {
        if (!holder || holder->failed()) {
            if (action_failed(holder)) {
                // Prevent warnings about abandoned failed future. Logged below.
                holder->ignore_ready_future();
            }
            holder = futurize_invoke(action).then_wrapped([this, gid, name] (future<> f) {
                if (f.failed()) {
                    auto ep = f.get_exception();
                    rtlogger.warn("{} for tablet {} failed: {}", name, gid, ep);
                    return seastar::sleep_abortable(std::chrono::seconds(1), _as).then([ep] () mutable {
                        std::rethrow_exception(ep);
                    });
                }
                return f;
            }).finally([this, g = _async_gate.hold(), gid, name] () noexcept {
                rtlogger.debug("{} for tablet {} resolved.", name, gid);
                _tablets_ready = true;
                _topo_sm.event.broadcast();
            });
            return false;
        }

        if (!holder->available()) {
            rtlogger.debug("Tablet {} still doing {}", gid, name);
            return false;
        }

        return true;
    }

    future<> for_each_tablet_group_transition(std::function<void(const locator::tablet_map&,
                                                           table_id,
                                                           const locator::table_group_set&,
                                                           locator::tablet_id,
                                                           const locator::tablet_transition_info&)> func) {
        auto tm = get_token_metadata_ptr();
        for (auto&& [base_table, tables] : tm->tablets().all_table_groups()) {
            co_await coroutine::maybe_yield();
            const auto& tmap = tm->tablets().get_tablet_map(base_table);
            for (auto&& [tablet, trinfo]: tmap.transitions()) {
                co_await coroutine::maybe_yield();
                func(tmap, base_table, tables, tablet, trinfo);
            }
        }
    }

    bool is_excluded(raft::server_id server_id) const {
        return _topo_sm._topology.excluded_tablet_nodes.contains(server_id);
    }

    void generate_migration_update(utils::chunked_vector<canonical_mutation>& out, const group0_guard& guard, const tablet_migration_info& mig) {
        const auto& tmap = get_token_metadata_ptr()->tablets().get_tablet_map(mig.tablet.table);
        auto last_token = tmap.get_last_token(mig.tablet.tablet);
        if (tmap.get_tablet_transition_info(mig.tablet.tablet)) {
            rtlogger.warn("Tablet already in transition, ignoring migration: {}", mig);
            return;
        }
        auto migration_task_info = mig.kind == locator::tablet_transition_kind::migration ? locator::tablet_task_info::make_migration_request()
            : locator::tablet_task_info::make_intranode_migration_request();
        migration_task_info.sched_nr++;
        migration_task_info.sched_time = db_clock::now();
        out.emplace_back(
            replica::tablet_mutation_builder(guard.write_timestamp(), mig.tablet.table)
                .set_new_replicas(last_token, locator::get_new_replicas(tmap.get_tablet_info(mig.tablet.tablet), mig))
                .set_stage(last_token, locator::tablet_transition_stage::allow_write_both_read_old)
                .set_transition(last_token, mig.kind)
                .set_migration_task_info(last_token, std::move(migration_task_info), _feature_service)
                .build());
    }

    void generate_repair_update(utils::chunked_vector<canonical_mutation>& out, const group0_guard& guard, const locator::global_tablet_id& gid, db_clock::time_point sched_time) {
        auto& tmap = get_token_metadata_ptr()->tablets().get_tablet_map(gid.table);
        auto last_token = tmap.get_last_token(gid.tablet);
        if (tmap.get_tablet_transition_info(gid.tablet)) {
            rtlogger.warn("Tablet already in transition, ignoring repair: {}", gid);
            return;
        }
        auto& info = tmap.get_tablet_info(gid.tablet);
        auto repair_task_info = info.repair_task_info;
        if (!repair_task_info.is_user_repair_request()) {
            repair_task_info = locator::tablet_task_info::make_auto_repair_request();
        }
        repair_task_info.sched_nr++;
        repair_task_info.sched_time = db_clock::now();
        out.emplace_back(
            replica::tablet_mutation_builder(guard.write_timestamp(), gid.table)
                .set_new_replicas(last_token, tmap.get_tablet_info(gid.tablet).replicas)
                .set_stage(last_token, locator::tablet_transition_stage::repair)
                .set_transition(last_token, locator::tablet_transition_kind::repair)
                .set_repair_task_info(last_token, repair_task_info, _feature_service)
                .set_session(last_token, session_id(utils::UUID_gen::get_time_UUID()))
                .build());
    }

    void generate_resize_update(utils::chunked_vector<canonical_mutation>& out, const group0_guard& guard, table_id table_id, locator::resize_decision resize_decision) {
            // FIXME: indent.
            auto s = _db.find_schema(table_id);
            const auto& tmap = get_token_metadata_ptr()->tablets().get_tablet_map(table_id);
            // Sequence number is monotonically increasing, globally. Therefore, it can be used to identify a decision.
            resize_decision.sequence_number = tmap.resize_decision().next_sequence_number();
            rtlogger.debug("Generating resize decision for table {} of type {} and sequence number {}",
                           table_id, resize_decision.type_name(), resize_decision.sequence_number);
            out.emplace_back(
                replica::tablet_mutation_builder(guard.write_timestamp(), table_id)
                    .set_resize_decision(std::move(resize_decision), _feature_service)
                    .build());
    }

    void generate_rf_change_resume_update(utils::chunked_vector<canonical_mutation>& out, const group0_guard& guard, utils::UUID request_to_resume) {
        rtlogger.debug("Generating RF change resume for request id {}", request_to_resume);
        out.emplace_back(topology_mutation_builder(guard.write_timestamp())
                .queue_global_topology_request_id(request_to_resume)
                .resume_rf_change_request(_topo_sm._topology.paused_rf_change_requests, request_to_resume)
                .build());
    }

    // Updates keyspace properties; removes system_schema.keyspaces::next_replication;
    // finishes RF change request; Removes request from system.topology::ongoing_rf_changes.
    void generate_rf_change_completion_update(utils::chunked_vector<canonical_mutation>& out, const group0_guard& guard, const rf_change_completion_info& completion) {
        if (rtlogger.is_enabled(seastar::log_level::debug)) {
            sstring props_str;
            for (const auto& [key, value] : completion.saved_ks_props) {
                props_str += fmt::format(" {}={};", key, value);
            }
            rtlogger.debug("generate_rf_change_completion_update: request_id={}, ks_name={}, error='{}', saved_ks_props:{}",
                           completion.request_id, completion.ks_name, completion.error, props_str);
        }
        sstring error = completion.error;
        if (_db.has_keyspace(completion.ks_name)) {
            auto& ks = _db.find_keyspace(completion.ks_name);
            if (error.empty()) {
                cql3::statements::ks_prop_defs new_ks_props{std::map<sstring, sstring>{completion.saved_ks_props.begin(), completion.saved_ks_props.end()}};
                new_ks_props.validate();
                auto ks_md = new_ks_props.as_ks_metadata_update(ks.metadata(), *get_token_metadata_ptr(), _db.features(), _db.get_config());
                ks_md->clear_next_strategy_options();

                auto schema_muts = prepare_keyspace_update_announcement(_db, ks_md, guard.write_timestamp());
                for (auto& m: schema_muts) {
                    out.emplace_back(m);
                }
            } else {
                auto ks_md = make_lw_shared<data_dictionary::keyspace_metadata>(*ks.metadata());
                ks_md->clear_next_strategy_options();

                auto schema_muts = prepare_keyspace_update_announcement(_db, ks_md, guard.write_timestamp());
                for (auto& m: schema_muts) {
                    out.emplace_back(m);
                }
            }
        }

        out.emplace_back(topology_mutation_builder(guard.write_timestamp())
                .finish_rf_change_migrations(_topo_sm._topology.ongoing_rf_changes, completion.request_id)
                .build());

        out.push_back(canonical_mutation(topology_request_tracking_mutation_builder(completion.request_id)
                .done(error)
                .build()));
    }

    // Sets next_replication to current_replication and sets error on the topology request.
    // Similar to storage_service::abort_rf_change for the ongoing_rf_changes case.
    void generate_rf_change_abort_update(utils::chunked_vector<canonical_mutation>& out, const group0_guard& guard, const rf_change_abort_info& abort_info) {
        rtlogger.debug("generate_rf_change_abort_update: request_id={}, ks_name={}, error='{}'", abort_info.request_id, abort_info.ks_name, abort_info.error);

        if (!_db.has_keyspace(abort_info.ks_name)) {
            return;
        }

        auto& ks = _db.find_keyspace(abort_info.ks_name);
        auto ks_md = make_lw_shared<data_dictionary::keyspace_metadata>(*ks.metadata());
        ks_md->set_next_strategy_options(abort_info.current_replication);

        auto schema_muts = prepare_keyspace_update_announcement(_db, ks_md, guard.write_timestamp());
        for (auto& m : schema_muts) {
            out.emplace_back(m);
        }

        out.push_back(canonical_mutation(topology_request_tracking_mutation_builder(abort_info.request_id)
                .abort(abort_info.error)
                .build()));
    }

    future<> generate_rf_change_updates(utils::chunked_vector<canonical_mutation>& out, const group0_guard& guard, const keyspace_rf_change_plan& rf_change_plan) {
        for (const auto& abort_info : rf_change_plan.aborts) {
            co_await coroutine::maybe_yield();
            generate_rf_change_abort_update(out, guard, abort_info);
        }
        if (rf_change_plan.completion.has_value()) {
            generate_rf_change_completion_update(out, guard, *rf_change_plan.completion);
        }
    }

    future<> generate_migration_updates(utils::chunked_vector<canonical_mutation>& out, const group0_guard& guard, const migration_plan& plan) {
        if (plan.resize_plan().finalize_resize.empty() || plan.has_nodes_to_drain()) {
            // schedule tablet migration only if there are no pending resize finalisations or if the node is draining.

            // Do not schedule migrations if there are drain failures because the plan is invalid.
            // Load allocation will change after drain is lifted.
            if (!plan.drain_failures().empty()) {
                for (auto&& drain_fail : plan.drain_failures()) {
                    co_await coroutine::maybe_yield();
                    auto server_id = raft::server_id(drain_fail.node().uuid());
                    _topo_sm.generate_cancel_request_update(out, _feature_service, guard, server_id, drain_fail.reason());
                }
            } else {
                for (const tablet_migration_info& mig: plan.migrations()) {
                    co_await coroutine::maybe_yield();
                    generate_migration_update(out, guard, mig);
                }
            }

            if (auto request_to_resume = plan.rack_list_colocation_plan().request_to_resume(); request_to_resume) {
                generate_rf_change_resume_update(out, guard, request_to_resume);
            }

            co_await generate_rf_change_updates(out, guard, plan.rf_change_plan());
        }

        auto sched_time = db_clock::now();
        for (const auto& gid : plan.repair_plan().repairs()) {
            co_await coroutine::maybe_yield();
            generate_repair_update(out, guard, gid, sched_time);
        }

        for (auto [table_id, resize_decision] : plan.resize_plan().resize) {
            co_await coroutine::maybe_yield();
            generate_resize_update(out, guard, table_id, resize_decision);
        }
    }

    void log_active_transitions(size_t max_count) {
        auto tm = get_token_metadata_ptr();
        size_t logged_count = 0;
        size_t total_count = 0;

        for (auto&& e : tm->tablets().all_table_groups()) {
            auto& base_table = e.first;
            const auto& tmap = tm->tablets().get_tablet_map(base_table);
            total_count += tmap.transitions().size();

            for (auto&& [tablet, trinfo]: tmap.transitions()) {
                if (logged_count >= max_count) {
                    break;
                }
                locator::global_tablet_id gid { base_table, tablet };
                const auto& tinfo = tmap.get_tablet_info(tablet);
                // Log only the replicas involved in the transition (leaving/pending)
                // rather than all replicas, to focus on what's actually changing
                auto leaving = locator::get_leaving_replica(tinfo, trinfo);
                auto pending = trinfo.pending_replica;
                rtlogger.info("Active tablet {}: tablet={}, stage={}{}{}",
                    trinfo.transition, gid, trinfo.stage,
                    leaving ? fmt::format(", leaving={}", *leaving) : "",
                    pending ? fmt::format(", pending={}", *pending) : "");
                logged_count++;
            }
        }

        if (total_count > logged_count) {
            rtlogger.info("(and {} more active tablet transitions)", total_count - logged_count);
        }
    }

    // When "drain" is true, we migrate tablets only as long as there are nodes to drain
    // and then change the transition state to write_both_read_old. Also, while draining,
    // we ignore pending topology requests which normally interrupt load balancing.
    // When "drain" is false, we do regular load balancing.
    future<> handle_tablet_migration(group0_guard guard, bool drain) {
        // This step acts like a pump which advances state machines of individual tablets,
        // batching barriers and group0 updates.
        // If progress cannot be made, e.g. because all transitions are streaming, we block
        // and wait for notification.

        rtlogger.debug("handle_tablet_migration()");
        utils::chunked_vector<canonical_mutation> updates;
        bool needs_barrier = false;
        bool has_transitions = false;

        shared_promise barrier;
        auto fail_barrier = seastar::defer([&] {
            if (needs_barrier) {
                barrier.set_exception(seastar::broken_promise());
            }
        });

        _tablets_ready = false;
        // We operate here on groups of co-located tablets.
        // The tablets of several tables may be co-located and share the same tablet map, and in
        // particular their transitions are shared. Therefore, each transition must be handled for
        // all tablets in a co-location group at once.
        co_await for_each_tablet_group_transition([&] (const locator::tablet_map& tmap,
                                                 table_id base_table,
                                                 const locator::table_group_set& tables,
                                                 locator::tablet_id tid,
                                                 const locator::tablet_transition_info& trinfo) {
            // we have `gid` which is the tablet id of the base table of the tablet group, which we use as a
            // representative of the group for some of the operations - such as logging, and as the key in the _tablets
            // map where we store the migration state.
            // `gids` is all the tablet ids of tablets in the co-location group. When we execute some operation such as
            // streaming, cleanup, etc, we do it for each tablet in `gids`.
            locator::global_tablet_id gid { base_table, tid };
            auto gids = tables | std::views::transform([tid] (table_id table) { return locator::global_tablet_id{table, tid}; }) | std::ranges::to<std::vector>();

            has_transitions = true;
            auto last_token = tmap.get_last_token(gid.tablet);
            auto& tablet_state = _tablets[gid];

            auto get_mutation_builder = [&] () {
                return replica::tablet_mutation_builder(guard.write_timestamp(), base_table);
            };

            auto transition_to = [&] (locator::tablet_transition_stage stage) {
                rtlogger.debug("Will set tablet {} stage to {}", gid, stage);
                updates.emplace_back(get_mutation_builder()
                        .set_stage(last_token, stage)
                        .build());
            };

            auto do_barrier = [&] {
                return advance_in_background(gid, tablet_state.barriers[trinfo.stage], "barrier", [&] {
                    needs_barrier = true;
                    return barrier.get_shared_future();
                });
            };

            auto transition_to_with_barrier = [&] (locator::tablet_transition_stage stage) {
                if (do_barrier()) {
                    transition_to(stage);
                }
            };

            auto check_excluded_replicas = [&] -> std::optional<sstring> {
                    auto tsi = get_migration_streaming_info(get_token_metadata().get_topology(), tmap.get_tablet_info(gid.tablet), trinfo);
                    for (auto r : tsi.read_from) {
                        if (is_excluded(raft::server_id(r.host.uuid()))) {
                            rtlogger.debug("Aborting streaming of {} because read-from {} is marked as ignored", gid, r);
                            return fmt::format("{} is excluded", r);
                        }
                    }
                    for (auto r : tsi.written_to) {
                        if (is_excluded(raft::server_id(r.host.uuid()))) {
                            rtlogger.debug("Aborting streaming of {} because written-to {} is marked as ignored", gid, r);
                            return fmt::format("{} is excluded", r);
                        }
                    }
                    return std::nullopt;
            };

            auto maybe_cancel_drain = [&] (locator::tablet_replica replica, const sstring& reason) {
                auto raft_server = raft::server_id(replica.host.uuid());
                if (!_topo_sm._topology.paused_requests.contains(raft_server)) {
                    return;
                }
                _topo_sm.generate_cancel_request_update(updates, _feature_service, guard, raft_server,
                    fmt::format("tablet draining failed: {}, moving {} to {}, due to {}", gid, replica, trinfo.pending_replica, reason));
            };

            switch (trinfo.stage) {
                case locator::tablet_transition_stage::allow_write_both_read_old:
                    if (action_failed(tablet_state.barriers[trinfo.stage])) {
                        if (check_excluded_replicas()) {
                            transition_to(locator::tablet_transition_stage::cleanup_target);
                            break;
                        }
                    }
                    if (do_barrier()) {
                        rtlogger.debug("Will set tablet {} stage to {}", gid, locator::tablet_transition_stage::write_both_read_old);
                        auto leaving_replica = get_leaving_replica(tmap.get_tablet_info(gid.tablet), trinfo);
                        if (leaving_replica) {
                            _vb_coordinator->abort_tasks(updates, guard, gid.table, *leaving_replica, last_token);
                        }
                        updates.emplace_back(get_mutation_builder()
                            .set_stage(last_token, locator::tablet_transition_stage::write_both_read_old)
                            // Create session a bit earlier to avoid adding barrier
                            // to the streaming stage to create sessions on replicas.
                            .set_session(last_token, session_id(utils::UUID_gen::get_time_UUID()))
                            .build());
                    }
                    break;
                case locator::tablet_transition_stage::write_both_read_old:
                    if (action_failed(tablet_state.barriers[trinfo.stage])) {
                        if (check_excluded_replicas()) {
                            transition_to(locator::tablet_transition_stage::cleanup_target);
                            break;
                        }
                    }
                    if (trinfo.transition == locator::tablet_transition_kind::rebuild_v2) {
                        transition_to_with_barrier(locator::tablet_transition_stage::rebuild_repair);
                    } else {
                        transition_to_with_barrier(locator::tablet_transition_stage::streaming);
                    }
                    break;
                case locator::tablet_transition_stage::write_both_read_old_fallback_cleanup:
                    transition_to_with_barrier(locator::tablet_transition_stage::cleanup_target);
                    break;
                case locator::tablet_transition_stage::rebuild_repair: {
                    if (action_failed(tablet_state.rebuild_repair)) {
                        bool fail = utils::get_local_injector().enter("rebuild_repair_stage_fail");
                        if (fail || check_excluded_replicas()) {
                            rtlogger.debug("Will set tablet {} stage to {}", gid, locator::tablet_transition_stage::cleanup_target);
                            updates.emplace_back(get_mutation_builder()
                                    .set_stage(last_token, locator::tablet_transition_stage::cleanup_target)
                                    .del_session(last_token)
                                    .build());
                            break;
                        }
                    }

                    if (advance_in_background(gid, tablet_state.rebuild_repair, "rebuild_repair", [&] {
                        utils::get_local_injector().inject("rebuild_repair_stage_fail",
                            [] { throw std::runtime_error("rebuild_repair failed due to error injection"); });
                        if (!trinfo.pending_replica) {
                            rtlogger.info("Skipped tablet rebuild repair of {} as no pending replica found", gid);
                            return make_ready_future<>();
                        }
                        auto tsi = get_migration_streaming_info(get_token_metadata().get_topology(), tmap.get_tablet_info(gid.tablet), trinfo);
                        if (tsi.read_from.empty()) {
                            rtlogger.info("Skipped tablet rebuild repair of {} as no tablet replica was found", gid);
                            return make_ready_future<>();
                        }
                        auto dst = locator::maybe_get_primary_replica(gid.tablet, {tsi.read_from.begin(), tsi.read_from.end()},
                                                                      _db.get_token_metadata().get_topology(), [] (const auto& tr) { return true; }).value().host;
                        rtlogger.info("Initiating repair phase of tablet rebuild host={} tablet={}", dst, gid);
                        return do_with(gids, [this, dst, session_id = trinfo.session_id] (const auto& gids) {
                            return do_for_each(gids, [this, dst, session_id] (locator::global_tablet_id gid) {
                                return ser::storage_service_rpc_verbs::send_tablet_repair(&_messaging,
                                        dst, _as, raft::server_id(dst.uuid()), gid, session_id).discard_result();
                            });
                        });
                    })) {
                        rtlogger.debug("Will set tablet {} stage to {}", gid, locator::tablet_transition_stage::streaming);
                        updates.emplace_back(get_mutation_builder()
                            .set_stage(last_token, locator::tablet_transition_stage::streaming)
                            .build());
                    }
                }
                    break;
                // The state "streaming" is needed to ensure that stale stream_tablet() RPC doesn't
                // get admitted before global_tablet_token_metadata_barrier() is finished for earlier
                // stage in case of coordinator failover.
                case locator::tablet_transition_stage::streaming: {
                    if (action_failed(tablet_state.streaming) || utils::get_local_injector().enter("stream_tablet_fail")) {
                        std::optional<sstring> rollback;

                        if (utils::get_local_injector().enter("stream_tablet_move_to_cleanup")) {
                            rollback = "error injection";
                        }

                        bool critical_disk_utilization = false;
                        if (auto stats = _tablet_allocator.get_load_stats()) {
                            const auto& util = stats->critical_disk_utilization;
                            auto it = util.find(trinfo.pending_replica->host);
                            critical_disk_utilization = (it != util.end() && it->second);
                            if (critical_disk_utilization) {
                                rollback = fmt::format("critical disk utilization on {}", trinfo.pending_replica->host);
                            }
                        }

                        auto maybe_leaving = locator::get_leaving_replica(tmap.get_tablet_info(gid.tablet), trinfo);
                        if (maybe_leaving) {
                            auto req = _topo_sm._topology.get_request(raft::server_id(maybe_leaving->host.uuid()));
                            // We only cancel remove request on critical utilization to avoid failing remove operation
                            // once migrations started before the node was marked as excluded are unblocked and roll back.
                            if (req == topology_request::leave || critical_disk_utilization) {
                                maybe_cancel_drain(*maybe_leaving, rollback ? *rollback : "streaming failed");
                                if (!rollback) {
                                    rollback = fmt::format("{} request for {} failed", req, maybe_leaving->host);
                                }
                            }
                        }

                        if (!rollback) {
                            rollback = check_excluded_replicas();
                        }

                        if (rollback) {
                            rtlogger.debug("Will set tablet {} stage to {}: {}", gid, locator::tablet_transition_stage::cleanup_target, *rollback);
                            updates.emplace_back(get_mutation_builder()
                                .set_stage(last_token, locator::tablet_transition_stage::cleanup_target)
                                .del_session(last_token)
                                .build());
                            break;
                        }
                    }

                    bool wait = utils::get_local_injector().enter("stream_tablet_wait");
                    if (!wait && advance_in_background(gid, tablet_state.streaming, "streaming", [&] {
                        utils::get_local_injector().inject("stream_tablet_move_to_cleanup",
                                        [] { throw std::runtime_error("stream_tablet failed due to error injection"); });

                        if (!trinfo.pending_replica) {
                            rtlogger.info("Skipped tablet streaming ({}) of {} as no pending replica found", trinfo.transition, gid);
                            return make_ready_future<>();
                        }
                        rtlogger.info("Initiating tablet streaming ({}) of {} to {}", trinfo.transition, gid, *trinfo.pending_replica);
                        auto dst = trinfo.pending_replica->host;
                        return do_with(gids, [this, dst] (const auto& gids) {
                            return do_for_each(gids, [this, dst] (locator::global_tablet_id gid) {
                                return ser::storage_service_rpc_verbs::send_tablet_stream_data(&_messaging,
                                           dst, _as, raft::server_id(dst.uuid()), gid);
                            });
                        });
                    })) {
                        rtlogger.debug("Will set tablet {} stage to {}", gid, locator::tablet_transition_stage::write_both_read_new);
                        updates.emplace_back(get_mutation_builder()
                            .set_stage(last_token, locator::tablet_transition_stage::write_both_read_new)
                            .del_session(last_token)
                            .build());
                    }
                }
                    break;
                case locator::tablet_transition_stage::write_both_read_new: {
                    utils::get_local_injector().inject("crash-in-tablet-write-both-read-new", [] {
                        rtlogger.info("crash-in-tablet-write-both-read-new hit, killing the node");
                        _exit(1);
                    });

                    if (action_failed(tablet_state.barriers[trinfo.stage])) {
                        auto& tinfo = tmap.get_tablet_info(gid.tablet);
                        unsigned excluded_old = 0;
                        for (auto r : tinfo.replicas) {
                            if (is_excluded(raft::server_id(r.host.uuid()))) {
                                excluded_old++;
                            }
                        }
                        unsigned excluded_new = 0;
                        for (auto r : trinfo.next) {
                            if (is_excluded(raft::server_id(r.host.uuid()))) {
                                excluded_new++;
                            }
                        }
                        // Cannot revert if this is intra-node migration.
                        // We will lose data if the migrating node restarted, in which case the leaving replica
                        // doesn't contain writes anymore as sstables are attached only on the pending replica.
                        // Luckily, this we never reach this condition since excluded_new cannot be larger
                        // than excluded_old for intra-node migration.
                        if (excluded_new > excluded_old && trinfo.transition != locator::tablet_transition_kind::intranode_migration) {
                            rtlogger.debug("During {} stage of {} {} new nodes and {} old nodes were excluded", trinfo.stage, gid, excluded_new, excluded_old);
                            if (_feature_service.tablets_intermediate_fallback_cleanup) {
                                transition_to(locator::tablet_transition_stage::write_both_read_old_fallback_cleanup);
                            } else {
                                transition_to_with_barrier(locator::tablet_transition_stage::cleanup_target);
                            }
                            break;
                        }
                    }
                    transition_to_with_barrier(locator::tablet_transition_stage::use_new);
                }
                    break;
                case locator::tablet_transition_stage::use_new:
                    transition_to_with_barrier(locator::tablet_transition_stage::cleanup);
                    break;
                case locator::tablet_transition_stage::cleanup: {
                    bool wait = utils::get_local_injector().enter("cleanup_tablet_wait");
                    if (!wait && advance_in_background(gid, tablet_state.cleanup, "cleanup", [&] {
                        auto maybe_dst = locator::get_leaving_replica(tmap.get_tablet_info(gid.tablet), trinfo);
                        if (!maybe_dst) {
                            rtlogger.info("Tablet cleanup of {} skipped because no replicas leaving", gid);
                            return make_ready_future<>();
                        }
                        locator::tablet_replica& dst = *maybe_dst;
                        if (is_excluded(raft::server_id(dst.host.uuid()))) {
                            rtlogger.info("Tablet cleanup of {} on {} skipped because node is excluded", gid, dst);
                            return make_ready_future<>();
                        }
                        rtlogger.info("Initiating tablet cleanup of {} on {}", gid, dst);
                        return do_with(gids, [this, dst] (const auto& gids) {
                            return do_for_each(gids, [this, dst] (locator::global_tablet_id gid) {
                                return ser::storage_service_rpc_verbs::send_tablet_cleanup(&_messaging,
                                                                                           dst.host, _as, raft::server_id(dst.host.uuid()), gid);
                            });
                        }).then([] {
                            return utils::get_local_injector().inject("wait_after_tablet_cleanup", [] (auto& handler) -> future<> {
                                rtlogger.info("Waiting after tablet cleanup");
                                return handler.wait_for_message(std::chrono::steady_clock::now() + std::chrono::seconds{60});
                            });
                        });
                    })) {
                        transition_to(locator::tablet_transition_stage::end_migration);
                    }
                }
                    break;
                case locator::tablet_transition_stage::cleanup_target:
                    if (do_barrier()) {
                        if (advance_in_background(gid, tablet_state.cleanup, "cleanup_target", [&] {
                            if (!trinfo.pending_replica) {
                                rtlogger.info("Tablet cleanup of {} skipped because no replicas pending", gid);
                                return make_ready_future<>();
                            }
                            locator::tablet_replica dst = *trinfo.pending_replica;
                            if (is_excluded(raft::server_id(dst.host.uuid()))) {
                                rtlogger.info("Tablet cleanup of {} on {} skipped because node is excluded and doesn't need to revert migration", gid, dst);
                                return make_ready_future<>();
                            }
                            rtlogger.info("Initiating tablet cleanup of {} on {} to revert migration", gid, dst);
                            return do_with(gids, [this, dst] (const auto& gids) {
                                return do_for_each(gids, [this, dst] (locator::global_tablet_id gid) {
                                    return ser::storage_service_rpc_verbs::send_tablet_cleanup(&_messaging,
                                                                                            dst.host, _as, raft::server_id(dst.host.uuid()), gid);
                                });
                            });
                        })) {
                            transition_to(locator::tablet_transition_stage::revert_migration);
                        }
                    }
                    break;
                case locator::tablet_transition_stage::revert_migration:
                    // Need a separate stage and a barrier after cleanup RPC to cut off stale RPCs.
                    // See do_tablet_operation() doc.
                    if (do_barrier()) {
                        _tablets.erase(gid);
                        updates.emplace_back(get_mutation_builder()
                                .del_transition(last_token)
                                .del_migration_task_info(last_token, _feature_service)
                                .build());
                        auto leaving_replica = get_leaving_replica(tmap.get_tablet_info(gid.tablet), trinfo);
                        if (leaving_replica) {
                            _vb_coordinator->rollback_aborted_tasks(updates, guard, gid.table, *leaving_replica, last_token);
                        }
                    }
                    break;
                case locator::tablet_transition_stage::end_migration: {
                    // Update load_stats after a migration or rebuild
                    update_load_stats_on_end_migration(gid, tmap, trinfo);

                    // Need a separate stage and a barrier after cleanup RPC to cut off stale RPCs.
                    // See do_tablet_operation() doc.
                    bool defer_transition = utils::get_local_injector().enter("handle_tablet_migration_end_migration");
                    if (!defer_transition && do_barrier()) {
                        _tablets.erase(gid);
                        updates.emplace_back(get_mutation_builder()
                                .del_transition(last_token)
                                .set_replicas(last_token, trinfo.next)
                                .del_migration_task_info(last_token, _feature_service)
                                .build());
                        _vb_coordinator->generate_tablet_migration_updates(updates, guard, tmap, gid, trinfo);
                    }
                }
                    break;
                case locator::tablet_transition_stage::repair: {
                    bool fail_repair = utils::get_local_injector().enter("handle_tablet_migration_repair_fail");
                    if (fail_repair || action_failed(tablet_state.repair)) {
                        if (do_barrier()) {
                            auto& tinfo = tmap.get_tablet_info(gid.tablet);
                            _tablet_ops_metrics.inc_failed(tinfo.repair_task_info.request_type);
                            updates.emplace_back(get_mutation_builder()
                                    .set_stage(last_token, locator::tablet_transition_stage::end_repair)
                                    .del_session(last_token)
                                    .build());
                        }
                        break;
                    }
                    if (advance_in_background(gid, tablet_state.repair, "repair", [&] () -> future<> {
                        auto& tinfo = tmap.get_tablet_info(gid.tablet);
                        bool valid = tinfo.repair_task_info.is_valid();
                        if (!valid) {
                            rtlogger.info("Skipping tablet repair for tablet={} which is cancelled by user", gid);
                            co_return;
                        }
                        auto* trinfo = tmap.get_tablet_transition_info(gid.tablet);
                        if (trinfo) {
                            tablet_state.session_id = trinfo->session_id;
                        }
                        auto sched_time = tinfo.repair_task_info.sched_time;
                        auto tablet = gid;
                        auto hosts_filter = tinfo.repair_task_info.repair_hosts_filter;
                        auto dcs_filter = tinfo.repair_task_info.repair_dcs_filter;
                        const auto& topo = _db.get_token_metadata().get_topology();
                        locator::host_id dst;
                        if (hosts_filter.empty() && dcs_filter.empty()) {
                            auto primary = tmap.get_primary_replica(gid.tablet, topo);
                            dst = primary.host;
                        } else {
                            auto dst_opt = tmap.maybe_get_selected_replica(gid.tablet, topo, tinfo.repair_task_info);
                            if (!dst_opt) {
                                co_return;
                            }
                            dst = dst_opt.value().host;
                        }
                        // Update repair task
                        db::system_keyspace::repair_task_entry entry{
                            .task_uuid   = tasks::task_id(tinfo.repair_task_info.tablet_task_id.uuid()),
                            .operation   = db::system_keyspace::repair_task_operation::finished,
                            .first_token = dht::token::to_int64(tmap.get_first_token(gid.tablet)),
                            .last_token  = dht::token::to_int64(tmap.get_last_token(gid.tablet)),
                            .table_uuid  = gid.table,
                        };
                        auto request_type = tinfo.repair_task_info.request_type;
                        rtlogger.info("Initiating tablet repair host={} tablet={} request_type={}", dst, gid, request_type);
                        auto session_id = utils::get_local_injector().enter("handle_tablet_migration_repair_random_session") ?
                            service::session_id::create_random_id() : trinfo->session_id;
                        auto res = co_await ser::storage_service_rpc_verbs::send_tablet_repair(&_messaging,
                                dst, _as, raft::server_id(dst.uuid()), gid, session_id);
                        auto duration = std::chrono::duration<float>(db_clock::now() - sched_time);
                        auto& tablet_state = _tablets[tablet];
                        tablet_state.repair_time = db_clock::from_time_t(gc_clock::to_time_t(res.repair_time));
                        if (_feature_service.tablet_repair_tasks_table) {
                            entry.timestamp = db_clock::now();
                            tablet_state.repair_task_updates = co_await _sys_ks.get_update_repair_task_mutations(entry, api::new_timestamp());
                        }
                        rtlogger.info("Finished tablet repair host={} tablet={} duration={} repair_time={} request_type={}",
                                dst, tablet, duration, res.repair_time, request_type);
                    })) {
                        if (utils::get_local_injector().enter("delay_end_repair_update")) {
                            break;
                        }

                        auto& tinfo = tmap.get_tablet_info(gid.tablet);
                        bool valid = tinfo.repair_task_info.is_valid();
                        auto hosts_filter = tinfo.repair_task_info.repair_hosts_filter;
                        auto dcs_filter = tinfo.repair_task_info.repair_dcs_filter;
                        auto incremental = tinfo.repair_task_info.repair_incremental_mode != locator::tablet_repair_incremental_mode::disabled;
                        bool is_filter_off = hosts_filter.empty() && dcs_filter.empty();
                        rtlogger.debug("Will set tablet {} stage to {}", gid, locator::tablet_transition_stage::end_repair);
                        auto update = get_mutation_builder()
                                        .set_stage(last_token, locator::tablet_transition_stage::end_repair)
                                        .del_repair_task_info(last_token, _feature_service)
                                        .del_session(last_token);
                        for (auto& m : tablet_state.repair_task_updates) {
                            updates.push_back(m);
                        }
                        // Skip update repair time in case hosts filter or dcs filter is set.
                        if (valid && is_filter_off) {
                            auto sched_time = tinfo.repair_task_info.sched_time;
                            auto time = tablet_state.repair_time;
                            update.set_repair_time(last_token, time);
                            auto repaired_at = sstring("None");
                            if (_feature_service.tablet_incremental_repair && incremental) {
                                auto sstables_repaired_at = tinfo.sstables_repaired_at + 1;
                                if (utils::get_local_injector().enter("repair_tablet_no_update_sstables_repair_at")) {
                                    rtlogger.info("Skip update system.tablet ssstables_repaired_at={}", sstables_repaired_at);
                                } else {
                                    update.set_sstables_repair_at(last_token, sstables_repaired_at);
                                    repaired_at = seastar::format("{}", sstables_repaired_at);
                                }
                            }
                            rtlogger.debug("Set tablet repair time sched_time={} repair_time={} sstables_repaired_at={} last_token={}",
                                    sched_time, time, repaired_at, last_token);
                        }
                        updates.emplace_back(update.build());
                        _tablet_ops_metrics.inc_succeeded(tinfo.repair_task_info.request_type);
                    }
                }
                    break;
                case locator::tablet_transition_stage::restore: {
                    if (!trinfo.restore_cfg.has_value()) {
                        on_internal_error(rtlogger, format("Cannot handle restore transition without config for tablet {}", gid));
                    }
                    if (action_failed(tablet_state.restore)) {
                        rtlogger.debug("Clearing restore transition for {} due to error", gid);
                        updates.emplace_back(get_mutation_builder().del_transition(last_token).del_restore_config(last_token).build());
                        break;
                    }
                    if (advance_in_background(gid, tablet_state.restore, "restore", [this, gid, &tmap] () -> future<> {
                        auto& tinfo = tmap.get_tablet_info(gid.tablet);
                        auto replicas = tinfo.replicas;

                        rtlogger.info("Restoring tablet={} on {}", gid, replicas);
                        co_await coroutine::parallel_for_each(replicas, [this, gid] (locator::tablet_replica r) -> future<> {
                            auto dst = raft::server_id(r.host.uuid());
                            if (!is_excluded(dst)) {
                                co_await ser::sstables_loader_rpc_verbs::send_restore_tablet(&_messaging, r.host, dst, gid);
                                rtlogger.debug("Tablet {} restored on {}", gid, r.host);
                            }
                        });
                    })) {
                        rtlogger.debug("Clearing restore transition for {}", gid);
                        updates.emplace_back(get_mutation_builder().del_transition(last_token).del_restore_config(last_token).build());
                    }
                }
                    break;
                case locator::tablet_transition_stage::end_repair: {
                    if (do_barrier()) {
                        if (tablet_state.session_id.uuid().is_null()) {
                            tablet_state.session_id = trinfo.session_id;
                        }
                        if (action_failed(tablet_state.repair_update_compaction_ctrl)) {
                            rtlogger.warn("Failed to perform repair_update_compaction_ctrl for tablet repair tablet_id={} session_id={} nr_retried={}",
                                    gid, tablet_state.session_id, tablet_state.repair_update_compaction_ctrl_retried);
                            // Do not erase the tablet from _tablets or delete
                            // the transitions yet so we can retry the
                            // repair_update_compaction_ctrl verb
                            tablet_state.repair_update_compaction_ctrl_retried++;
                        }
                        bool feature = _feature_service.tablet_incremental_repair;
                        if (advance_in_background(gid, tablet_state.repair_update_compaction_ctrl, "repair_update_compaction_ctrl", [this, ms = &_messaging,
                                    gid = gid, sid = tablet_state.session_id, feature, &tmap] () -> future<> {
                            if (feature) {
                                if (utils::get_local_injector().enter("fail_rpc_repair_update_compaction_ctrl")) {
                                    auto msg = fmt::format("Failed repair_update_compaction_ctrl for tablet repair tablet_id={} session_id={} due to error injection", gid, sid);
                                    rtlogger.info("{}", msg);
                                    throw std::runtime_error(msg);
                                }
                                auto& replicas = tmap.get_tablet_info(gid.tablet).replicas;
                                co_await coroutine::parallel_for_each(replicas, [this, ms, gid, sid] (locator::tablet_replica r) -> future<> {
                                    if (!is_excluded(raft::server_id(r.host.uuid()))) {
                                        co_await ser::repair_rpc_verbs::send_repair_update_compaction_ctrl(ms, r.host, gid, sid);
                                    }
                                });
                            }
                        })) {
                            if (utils::get_local_injector().enter("log_tablet_transition_stage_end_repair")) {
                                rtlogger.info("The end_repair stage finished for tablet repair tablet_id={} session_id={}", gid, tablet_state.session_id);
                            }
                            _tablets.erase(gid);
                            updates.emplace_back(get_mutation_builder().del_transition(last_token).build());
                        }
                    }
                }
                    break;
            }
        });

        // In order to keep the cluster saturated, ask the load balancer for more transitions.
        // Unless there is a pending topology change operation.
        bool preempt = false;
        if (!drain) {
            // When draining, this method is invoked with an active node transition, which
            // would normally cause preemption, which we don't want here.
            auto ts = guard.write_timestamp();
            auto [new_preempt, new_guard] = should_preempt_balancing(std::move(guard));
            preempt = new_preempt;
            guard = std::move(new_guard);
            if (ts != guard.write_timestamp()) {
                // We rely on the fact that should_preempt_balancing() does not release the guard
                // so that tablet metadata reading and updates are atomic.
                on_internal_error(rtlogger, "should_preempt_balancing() retook the guard");
            }
        }

        bool has_nodes_to_drain = false;
        bool requires_schema_changes = false;
        if (!preempt) {
            auto plan = co_await _tablet_allocator.balance_tablets(get_token_metadata_ptr(), &_topo_sm._topology, &_sys_ks, {}, get_dead_nodes());
            has_nodes_to_drain = plan.has_nodes_to_drain();
            requires_schema_changes = plan.requires_schema_changes();
            if (!drain || plan.has_nodes_to_drain()) {
                co_await generate_migration_updates(updates, guard, plan);
            }
        }

        // The updates have to be executed under the same guard which was used to read tablet metadata
        // to ensure that we don't reinsert tablet rows which were concurrently deleted by schema change
        // which happens outside the topology coordinator.
        bool has_updates = !updates.empty();
        if (has_updates) {
            co_await utils::get_local_injector().inject("tablet_transition_updates", utils::wait_for_message(2min));
            updates.emplace_back(
                topology_mutation_builder(guard.write_timestamp())
                    .set_version(_topo_sm._topology.version + 1)
                    .build());
            if (requires_schema_changes) {
                co_await update_topology_state_with_mixed_change(std::move(guard), std::move(updates), format("Tablet migration"));
            } else {
                co_await update_topology_state(std::move(guard), std::move(updates), format("Tablet migration"));
            }
        }

        if (needs_barrier) {
            // If has_updates is true then we have dropped the guard and need to re-obtain it.
            // It's fine to start an independent operation here. The barrier doesn't have to be executed
            // atomically with the read which set needs_barrier, because it's fine if the global barrier
            // works with a more recent set of nodes and it's fine if it propagates a more recent topology.
            if (!guard) {
                guard = co_await start_operation();
            }
            guard = co_await global_tablet_token_metadata_barrier(std::move(guard));
            barrier.set_value();
            fail_barrier.cancel();
            co_return;
        }

        if (has_updates) {
            co_return;
        }

        if (has_transitions) {
            // Streaming may have finished after we checked. To avoid missed notification, we need
            // to check atomically with event.wait()
            if (!_tablets_ready) {
                rtlogger.debug("Going to sleep with active tablet transitions");
                log_active_transitions(5);
                release_guard(std::move(guard));
                co_await await_event();
            }
            co_return;
        }

        if (drain) {
            if (has_nodes_to_drain) {
                // Prevent jumping to write_both_read_old with un-drained tablets.
                // This can happen when all candidate nodes are down.
                rtlogger.warn("Tablet draining stalled: No tablets migrating but there are nodes to drain");
                release_guard(std::move(guard));
                co_await sleep(3s); // Throttle retries
                co_return;
            }
            updates.emplace_back(
                topology_mutation_builder(guard.write_timestamp())
                    .set_transition_state(topology::transition_state::write_both_read_old)
                    .set_session(session_id(guard.new_group0_state_id()))
                    .set_version(_topo_sm._topology.version + 1)
                    .build());
        } else {
            updates.emplace_back(
                topology_mutation_builder(guard.write_timestamp())
                    .del_transition_state()
                    .set_version(_topo_sm._topology.version + 1)
                    .build());
        }
        co_await update_topology_state(std::move(guard), std::move(updates), "Finished tablet migration");
    }

    // Migrates tablet size from leaving to pending host after migration,
    // or creates a new tablet size on pending host after a rebuild
    void update_load_stats_on_end_migration(locator::global_tablet_id gid, const locator::tablet_map& tmap, const locator::tablet_transition_info& trinfo) {
        if (auto old_load_stats = _tablet_allocator.get_load_stats()) {
            lw_shared_ptr<locator::load_stats> new_load_stats;
            auto& tinfo = tmap.get_tablet_info(gid.tablet);
            auto leaving = locator::get_leaving_replica(tinfo, trinfo);
            auto pending = trinfo.pending_replica;
            const dht::token_range trange {tmap.get_token_range(gid.tablet)};
            switch (trinfo.transition) {
            case locator::tablet_transition_kind::migration:
                // Handle tablet migration
                new_load_stats = old_load_stats->migrate_tablet_size(leaving->host, pending->host, gid, trange);
                break;
            case locator::tablet_transition_kind::rebuild:
                [[fallthrough]];
            case locator::tablet_transition_kind::rebuild_v2:
                // Handle rebuild
                if (pending && old_load_stats->tablet_stats.contains(pending->host)) {
                    // Compute the average tablet size of existing replicas
                    uint64_t tablet_size_sum = 0;
                    size_t replica_count = 0;
                    bool incomplete = false;
                    const locator::range_based_tablet_id rb_tid {gid.table, trange};
                    auto tsi = get_migration_streaming_info(get_token_metadata().get_topology(), tinfo, trinfo);
                    for (auto& r : tsi.read_from) {
                        auto tablet_size_opt = old_load_stats->get_tablet_size(r.host, rb_tid);
                        if (tablet_size_opt) {
                            tablet_size_sum += *tablet_size_opt;
                            replica_count++;
                        } else {
                            incomplete = true;
                        }
                    }

                    if (!incomplete) {
                        new_load_stats = make_lw_shared<locator::load_stats>(*old_load_stats);
                        auto size = replica_count ? tablet_size_sum / replica_count : 0;
                        new_load_stats->tablet_stats.at(pending->host).tablet_sizes[gid.table][trange] = size;
                    }
                }
                break;
            case locator::tablet_transition_kind::repair:
                [[fallthrough]];
            case locator::tablet_transition_kind::restore:
                [[fallthrough]];
            case locator::tablet_transition_kind::intranode_migration:
                break;
            }
            if (new_load_stats) {
                _tablet_allocator.set_load_stats(std::move(new_load_stats));
            }
        }
    }

    future<> handle_tablet_resize_finalization(group0_guard g) {
        // Executes a global barrier to guarantee that any process (e.g. repair) holding stale version
        // of token metadata will complete before we update topology.
        auto guard = co_await global_tablet_token_metadata_barrier(std::move(g));

        co_await utils::get_local_injector().inject("tablet_resize_finalization_post_barrier", utils::wait_for_message(std::chrono::minutes(2)));

        auto tm = get_token_metadata_ptr();
        auto plan = co_await _tablet_allocator.balance_tablets(tm, &_topo_sm._topology, &_sys_ks, {}, get_dead_nodes());

        utils::chunked_vector<canonical_mutation> updates;
        updates.reserve(plan.resize_plan().finalize_resize.size() * 2 + 1);

        for (auto& table_id : plan.resize_plan().finalize_resize) {
            auto s = _db.find_schema(table_id);
            auto new_tablet_map = co_await _tablet_allocator.resize_tablets(tm, table_id);
            co_await replica::tablet_map_to_mutations(
                new_tablet_map,
                table_id,
                s->ks_name(),
                s->cf_name(),
                guard.write_timestamp(),
                _feature_service,
                [&] (mutation m) -> future<> {
                    updates.emplace_back(co_await make_canonical_mutation_gently(m));
                });

            // Clears the resize decision for a table.
            generate_resize_update(updates, guard, table_id, locator::resize_decision{});
            _vb_coordinator->generate_tablet_resize_updates(updates, guard, table_id, tm->tablets().get_tablet_map(table_id), new_tablet_map);

            for (auto table_id : tm->tablets().all_table_groups().at(table_id)) {
                co_await _cdc_gens.generate_tablet_resize_update(updates, table_id, new_tablet_map, guard.write_timestamp());
            }

            const auto& tmap = get_token_metadata_ptr()->tablets().get_tablet_map(table_id);
            auto old_cnt = tmap.tablet_count();
            auto new_cnt = new_tablet_map.tablet_count();
            if (old_cnt > new_cnt) {
                std::unordered_set<locator::host_id> replicas;
                for (auto& tinfo : tmap.tablets()) {
                    for (auto& r : tinfo.replicas) {
                        if (!is_excluded(raft::server_id(r.host.uuid()))) {
                            replicas.insert(r.host);
                        }
                    }
                }
                if (_feature_service.tablet_incremental_repair) {
                    rtlogger.info("Send rpc verb repair_update_repaired_at_for_merge table={} replicas={} old_cnt={} new_cnt={}", table_id, replicas, old_cnt, new_cnt);
                    if (utils::get_local_injector().enter("handle_tablet_resize_finalization_for_merge_error")) {
                        rtlogger.info("Got handle_tablet_resize_finalization_for_merge_error old_cnt={} new_cnt={}", old_cnt, new_cnt);
                        co_await sleep_abortable(std::chrono::minutes(1), _as);
                    }
                    co_await coroutine::parallel_for_each(replicas, [ms = &_messaging, table_id] (const locator::host_id& h) -> future<> {
                        co_await ser::repair_rpc_verbs::send_repair_update_repaired_at_for_merge(ms, h, table_id);
                    });
                }
            }
        }

        updates.emplace_back(
            topology_mutation_builder(guard.write_timestamp())
                .del_transition_state()
                .set_version(_topo_sm._topology.version + 1)
                .build());
        co_await update_topology_state(std::move(guard), std::move(updates), format("Finished tablet resize finalization"));

        if (auto old_load_stats = _tablet_allocator.get_load_stats()) {
            guard = co_await start_operation();
            auto new_tm = get_token_metadata_ptr();
            auto reconciled_stats = old_load_stats->reconcile_tablets_resize(plan.resize_plan().finalize_resize, *tm, *new_tm);
            if (reconciled_stats) {
                _tablet_allocator.set_load_stats(reconciled_stats);
            }
        }

        // Wait for the background storage group merge to finish before releasing the state machine.
        // Background merge holds the old erm, so a successful barrier joins with it.
        // This guarantees that the background merge doesn't run concurrently with the next merge.
        // Replica-side storage group merge takes compaction locks on the tablet's main compaction group, released
        // by the background merge. If the next merge starts before the background merge finishes, it can cause a deadlock.
        // The background merge fiber will try to stop a compaction group which is locked, and the lock is held
        // by the background merge fiber.
        tm = nullptr;
        if (!guard) {
            guard = co_await start_operation();
        }
        co_await global_tablet_token_metadata_barrier(std::move(guard));
    }

    using get_table_ids_func = std::function<std::unordered_set<table_id>(const db::system_keyspace::topology_requests_entry&)>;
    using send_rpc_func = std::function<future<>(locator::host_id, const service::frozen_topology_guard&)>;
    using desc_func = std::function<std::string()>;

    future<> handle_topology_ordered_op(group0_guard guard, get_table_ids_func get_table_ids, send_rpc_func send_rpc, desc_func desc, std::string_view what) {
        // Execute a barrier to make sure the nodes we are performing truncate on see the session
        // and are able to create a topology_guard using the frozen_guard we are sending over RPC
        // TODO: Exclude nodes which don't contain replicas of the table we are truncating
        guard = co_await global_tablet_token_metadata_barrier(std::move(guard));

        const utils::UUID& global_request_id = _topo_sm._topology.global_request_id.value();
        std::optional<sstring> error;
        // We should perform TRUNCATE only if the session is still valid. It could be cleared if a previous truncate
        // handler performed the truncate and cleared the session, but crashed before finalizing the request
        if (_topo_sm._topology.session) {
            const auto topology_requests_entry = co_await _sys_ks.get_topology_request_entry(global_request_id);
            std::unordered_set<table_id> tables;
            try {
                tables = get_table_ids(topology_requests_entry);
            } catch (std::exception& e) {
                error = e.what();
            }
            if (!tables.empty()) {
                // Collect the IDs of the hosts with replicas, but ignore excluded nodes
                std::unordered_set<locator::host_id> replica_hosts;
                for (auto table_id : tables) {
                    const locator::tablet_map& tmap = get_token_metadata_ptr()->tablets().get_tablet_map(table_id);
                    co_await tmap.for_each_tablet([&] (locator::tablet_id tid, const locator::tablet_info& tinfo) {
                        for (const locator::tablet_replica& replica: tinfo.replicas) {
                            if (!_topo_sm._topology.excluded_tablet_nodes.contains(raft::server_id(replica.host.uuid()))) {
                                replica_hosts.insert(replica.host);
                            }
                        }
                        return make_ready_future<>();
                    });
                }

                // Release the guard to avoid blocking group0 for long periods of time while invoking RPCs
                release_guard(std::move(guard));

                co_await utils::get_local_injector().inject(fmt::format("{}_table_wait", what), utils::wait_for_message(std::chrono::minutes(2)));

                // Check if all the nodes with replicas are alive
                for (const locator::host_id& replica_host: replica_hosts) {
                    if (!_gossiper.is_alive(replica_host)) {
                        throw std::runtime_error(::format("Cannot perform {} because host {} is down", desc(), replica_host));
                    }
                }

                // Send the RPC to all replicas
                const service::frozen_topology_guard frozen_guard { _topo_sm._topology.session };
                co_await coroutine::parallel_for_each(replica_hosts, [&] (const locator::host_id& host_id) -> future<> {
                    co_await send_rpc(host_id, frozen_guard);
                });
            }

            // Clear the session and save the error message
            while (true) {
                if (!guard) {
                    guard = co_await start_operation();
                }

                utils::chunked_vector<canonical_mutation> updates;
                updates.push_back(topology_mutation_builder(guard.write_timestamp())
                                    .del_session()
                                    .build());
                if (error) {
                    updates.push_back(topology_request_tracking_mutation_builder(global_request_id)
                                        .set("error", *error)
                                        .build());
                }

                try {
                    co_await update_topology_state(std::move(guard), std::move(updates), fmt::format("Clear {} session", what));
                    break;
                } catch (group0_concurrent_modification&) {
                }
            }
        }

        utils::get_local_injector().inject(fmt::format("{}_crash_after_session_clear", what), [what] {
            rtlogger.info("{}_crash_after_session_clear hit, killing the node", what);
            _exit(1);
        });

        // Execute a barrier to ensure the TRUNCATE RPC can't run on any nodes after this point
        if (!guard) {
            guard = co_await start_operation();
        }
        guard = co_await global_tablet_token_metadata_barrier(std::move(guard));

        // Finalize the request
        while (true) {
            if (!guard) {
                guard = co_await start_operation();
            }
            utils::chunked_vector<canonical_mutation> updates;
            updates.push_back(topology_mutation_builder(guard.write_timestamp())
                                .del_transition_state()
                                .del_global_topology_request()
                                .del_global_topology_request_id()
                                .build());
            updates.push_back(topology_request_tracking_mutation_builder(global_request_id)
                                .done()
                                .build());

            try {
                co_await update_topology_state(std::move(guard), std::move(updates), fmt::format("{}{} has completed", ::toupper(what[0]), what.substr(1)));
                break;
            } catch (group0_concurrent_modification&) {
            }
        }
    }

    future<> handle_truncate_table(group0_guard guard) {
        std::string ks_name, cf_name;

        co_await handle_topology_ordered_op(std::move(guard)
            , [&](const db::system_keyspace::topology_requests_entry& topology_requests_entry) {
                const table_id& id = topology_requests_entry.truncate_table_id;
                lw_shared_ptr<replica::table> table = _db.get_tables_metadata().get_table_if_exists(id);

                if (table) {
                    ks_name = table->schema()->ks_name();
                    cf_name = table->schema()->cf_name();

                    rtlogger.info("Performing TRUNCATE TABLE for {}.{}", ks_name, cf_name);

                    return std::unordered_set<table_id>({ id });
                }
                throw std::invalid_argument(fmt::format("Cannot TRUNCATE table with UUID {} because it does not exist.", id));
            }
            , [&](locator::host_id host_id, const service::frozen_topology_guard& frozen_guard) {
                return ser::storage_proxy_rpc_verbs::send_truncate_with_tablets(&_messaging, host_id, ks_name, cf_name, frozen_guard);
            }
            , [&] {
                return fmt::format("TRUNCATE on table {}.{}", ks_name, cf_name);
            }
            , "truncate"
        );
    }

    future<> handle_snapshot_tables(group0_guard guard) {
        utils::chunked_vector<table_id> ids;
        sstring tag;
        bool skip_flush;
        gc_clock::time_point t;
        std::optional<gc_clock::time_point> expiry;

        co_await handle_topology_ordered_op(std::move(guard)
            , [&](const db::system_keyspace::topology_requests_entry& topology_requests_entry) {
                tag = *topology_requests_entry.snapshot_tag;
                skip_flush = topology_requests_entry.snapshot_skip_flush;
                t = gc_clock::from_time_t(db_clock::to_time_t(topology_requests_entry.start_time));
                if (topology_requests_entry.snapshot_expiry) {
                    expiry = gc_clock::from_time_t(db_clock::to_time_t(*topology_requests_entry.snapshot_expiry));
                }
                for (auto& id : *topology_requests_entry.snapshot_table_ids) {
                    lw_shared_ptr<replica::table> table = _db.get_tables_metadata().get_table_if_exists(id);
                    if (!table) {
                        throw std::invalid_argument(fmt::format("Cannot SNAPSHOT table with UUID {} because it does not exist.", id));
                    }
                    ids.emplace_back(id);
                }
                rtlogger.info("Performing SNAPSHOT TABLES for {}", ids);
                return *topology_requests_entry.snapshot_table_ids;
            }
            , [&](locator::host_id host_id, const service::frozen_topology_guard& frozen_guard) {
                return ser::storage_proxy_rpc_verbs::send_snapshot_with_tablets(&_messaging, host_id, ids, tag, t, skip_flush, expiry, frozen_guard);
            }
            , [&] {
                return fmt::format("SNAPSHOT on tables {}", ids);
            }
            , "snapshot"
        );
    }

    // This function must not release and reacquire the guard, callers rely
    // on the fact that the block which calls this is atomic.
    // FIXME: Don't take the ownership of the guard to make the above guarantee explicit.
    std::pair<bool, group0_guard> should_preempt_balancing(group0_guard guard) {
        auto work = get_next_task(std::move(guard));
        if (auto* node = std::get_if<node_to_work_on>(&work)) {
            return std::make_pair(true, std::move(node->guard));
        }

        if (auto* cancel = std::get_if<cancel_requests>(&work)) {
            // request queue needs to be canceled, so preempt balancing
            return std::make_pair(true, std::move(cancel->guard));
        }

        if (auto* cleanup = std::get_if<start_vnodes_cleanup>(&work)) {
            // cleanup has to be started
            return std::make_pair(true, std::move(cleanup->guard));
        }

        guard = std::get<group0_guard>(std::move(work));

        if (_topo_sm._topology.global_request || !_topo_sm._topology.global_requests_queue.empty()) {
            return std::make_pair(true, std::move(guard));
        }

        if (!_topo_sm._topology.calculate_not_yet_enabled_features().empty()) {
            return std::make_pair(true, std::move(guard));
        }

        return std::make_pair(false, std::move(guard));
    }

    void cleanup_ignored_nodes_on_left(topology_mutation_builder& builder, raft::server_id id) {
        if (_topo_sm._topology.ignored_nodes.contains(id)) {
            auto l = _topo_sm._topology.ignored_nodes;
            l.erase(id);
            builder.set_ignored_nodes(l);
        }
    }

    void trigger_load_stats_refresh() {
        (void)_tablet_load_stats_refresh.trigger().handle_exception([] (auto ep) {
            rtlogger.warn("Error during tablet load stats refresh: {}", ep);
        });
    }

    virtual void on_create_column_family(const sstring& ks_name, const sstring& cf_name) override {
        // New tablets were allocated, we need per-tablet stats for them for tablet balancer to make progress.
        trigger_load_stats_refresh();
    }

    virtual void on_create_view(const sstring& ks_name, const sstring& view_name) override {
        trigger_load_stats_refresh();
    }

    virtual void on_update_column_family(const sstring& ks_name, const sstring& cf_name, bool columns_changed) override {
        // Tablet hints may have changed. Wake up so that load balancer re-evaluates tablet distribution.
        _topo_sm.event.broadcast();
    }

    virtual void on_drop_column_family(const sstring& ks_name, const sstring& cf_name) override {
        // Tablet distribution has changed. Wake up the load balancer.
        _topo_sm.event.broadcast();
    }

    virtual void on_drop_view(const sstring& ks_name, const sstring& view_name) override {
        // Tablet distribution has changed. Wake up the load balancer.
        _topo_sm.event.broadcast();
    }

    future<> cancel_all_requests(group0_guard guard, std::unordered_set<raft::server_id> dead_nodes) {
        utils::chunked_vector<canonical_mutation> muts;
        std::vector<raft::server_id> reject_join;
        if (_topo_sm._topology.requests.empty()) {
            co_return;
        }
        rtlogger.warn("topology coordintator: cancel all requests because non can proceed");
        for (auto& [id, req] : _topo_sm._topology.requests) {
            auto done_msg = fmt::format("Canceled. Dead nodes: {}", dead_nodes);
            _topo_sm.generate_cancel_request_update(muts, _feature_service, guard, id, done_msg);
            switch (req) {
                case topology_request::replace:
                [[fallthrough]];
                case topology_request::join: {
                    reject_join.emplace_back(id);
                    try {
                        co_await wait_for_gossiper(id, _gossiper, _as);
                    } catch (...) {
                        rtlogger.warn("wait_for_ip failed during cancellation: {}", std::current_exception());
                    }
                }
                break;
                case topology_request::leave:
                [[fallthrough]];
                case topology_request::rebuild:
                [[fallthrough]];
                case topology_request::remove: {
                }
                break;
            }
        }

        co_await update_topology_state(std::move(guard), std::move(muts), "cancel all topology requests");

        for (auto id : reject_join) {
            try {
                co_await respond_to_joining_node(id, join_node_response_params{
                    .response = join_node_response_params::rejected{
                        .reason = "request canceled because some required nodes are dead"
                    },
                });
            } catch (...) {
                rtlogger.warn("attempt to send rejection response to {} failed: {}. "
                                "The node may hang. It's safe to shut it down manually now.",
                                id, std::current_exception());
            }
        }

    }

    // Returns `true` iff there was work to do.
    future<bool> handle_topology_transition(group0_guard guard) {
        auto tstate = _topo_sm._topology.tstate;
        if (!tstate) {
            // When adding a new source of work, make sure to update should_preempt_balancing() as well.

            auto work = get_next_task(std::move(guard));
            if (auto* node = std::get_if<node_to_work_on>(&work)) {
                co_await handle_node_transition(std::move(*node));
                co_return true;
            }

            if (auto* cancel = std::get_if<cancel_requests>(&work)) {
                co_await cancel_all_requests(std::move(cancel->guard), std::move(cancel->dead_nodes));
                co_return true;
            }

            if (auto* cleanup = std::get_if<start_vnodes_cleanup>(&work)) {
                co_await start_vnodes_cleanup_on_dirty_nodes(std::move(cleanup->guard),
                    std::pair(cleanup->request, cleanup->request_server_id));
                co_return true;
            }

            guard = std::get<group0_guard>(std::move(work));

            if (_topo_sm._topology.global_request || !_topo_sm._topology.global_requests_queue.empty()) {
                co_await handle_global_request(std::move(guard));
                co_return true;
            }

            if (auto feats = _topo_sm._topology.calculate_not_yet_enabled_features(); !feats.empty()) {
                co_await enable_features(std::move(guard), std::move(feats));
                co_return true;
            }

            if (auto guard_opt = co_await maybe_migrate_system_tables(std::move(guard)); !guard_opt) {
                // The guard is consumed, it means we did migration
                co_return true;
            } else {
                guard = std::move(*guard_opt);
            }

            // Check if any Alternator tables have deferred stream enablement
            // that can now be finalized (no in-progress tablet merges).
            {
                auto tm = get_token_metadata_ptr();
                auto cdc_muts = co_await _cdc_gens.maybe_finalize_pending_stream_enables(*tm, guard.write_timestamp());
                if (!cdc_muts.empty()) {
                    rtlogger.info("Finalizing deferred Alternator stream enablement for {} table(s)", cdc_muts.size());
                    mixed_change change{std::move(cdc_muts)};
                    group0_command g0_cmd = _group0.client().prepare_command(std::move(change), guard,
                        "Finalize deferred Alternator stream enablement");
                    co_await _group0.client().add_entry(std::move(g0_cmd), std::move(guard), _as);
                    co_return true;
                }
            }

            // If there is no other work, evaluate load and start tablet migration if there is imbalance.
            if (auto guard_opt = co_await maybe_start_tablet_migration(std::move(guard)); !guard_opt) {
                co_return true;
            } else {
                guard = std::move(*guard_opt);
            }

            if (co_await maybe_retry_failed_rf_change_tablet_rebuilds(std::move(guard))) {
                co_return true;
            }
            co_return false;
        }

        rtlogger.info("entered `{}` transition state", *tstate);
        switch (*tstate) {
            case topology::transition_state::join_group0: {
                auto [node, accepted] = co_await finish_accepting_node(get_node_to_work_on(std::move(guard)));

                // If responding to the joining node failed, move the node to the left state and
                // stop the topology transition.
                if (!accepted) {
                    topology_mutation_builder builder(node.guard.write_timestamp());
                    topology_request_tracking_mutation_builder rtbuilder(node.rs->request_id);
                    builder.del_transition_state()
                           .with_node(node.id)
                           .set("node_state", node_state::left);
                    rtbuilder.done("join is not accepted");
                    auto reason = ::format("bootstrap: failed to accept {}", node.id);
                    co_await update_topology_state(std::move(node.guard), {builder.build(), rtbuilder.build()}, reason);

                    rtlogger.info("node {} moved to left state", node.id);

                    break;
                }

                switch (node.rs->state) {
                    case node_state::bootstrapping: {
                        if (node.rs->ring) {
                            on_internal_error(rtlogger, format("Bootstrapping node {} owns tokens", node.id));
                        }
                        auto num_tokens = std::get<join_param>(node.req_param.value()).num_tokens;
                        auto tokens_string = std::get<join_param>(node.req_param.value()).tokens_string;

                        auto guard = take_guard(std::move(node));
                        topology_mutation_builder builder(guard.write_timestamp());

                        // A node has just been accepted. If it is a zero-token node, we can
                        // instantly move its state to normal. Otherwise, we need to assign
                        // random tokens and prepare a new CDC generation.
                        if (num_tokens == 0 && tokens_string.empty()) {
                            topology_request_tracking_mutation_builder rtbuilder(node.rs->request_id);
                            builder.del_transition_state()
                                   .with_node(node.id)
                                   .set("node_state", node_state::normal);
                            rtbuilder.done();
                            auto reason = ::format("bootstrap: joined a zero-token node {}", node.id);
                            co_await _voter_handler.on_node_added(node.id, _as);
                            utils::chunked_vector<canonical_mutation> updates{builder.build(), rtbuilder.build()};
                            co_await mark_view_build_statuses_on_node_join(updates, guard, node.id);
                            co_await update_topology_state(std::move(guard), std::move(updates), reason);
                            break;
                        }

                        if (!_db.check_rf_rack_validity_with_topology_change(get_token_metadata_ptr(),
                                locator::rf_rack_topology_operation{locator::rf_rack_topology_operation::type::add,
                                    to_host_id(node.id), node.rs->datacenter, node.rs->rack})) {
                            _rollback = fmt::format("Cannot add the node because its addition would make some existing keyspace RF-rack-invalid");
                            break;
                        }

                        auto tmptr = get_token_metadata_ptr();
                        std::unordered_set<token> bootstrap_tokens;
                        try {
                            bootstrap_tokens = dht::boot_strapper::get_bootstrap_tokens(tmptr, tokens_string, num_tokens, dht::check_token_endpoint::yes);
                        } catch (...) {
                            _rollback = fmt::format("Failed to assign tokens: {}", std::current_exception());
                            break;
                        }

                        auto [gen_uuid, guard_, mutation] = co_await prepare_and_broadcast_cdc_generation_data(
                                tmptr, std::move(guard), bootstrapping_info{bootstrap_tokens, *node.rs});

                        // Write the new CDC generation data through raft.
                        builder.set_transition_state(topology::transition_state::commit_cdc_generation)
                               .set_new_cdc_generation_data_uuid(gen_uuid)
                               .with_node(node.id)
                               .set("tokens", bootstrap_tokens);
                        auto reason = ::format(
                            "bootstrap: insert tokens and CDC generation data (UUID: {})", gen_uuid);
                        co_await update_topology_state(std::move(guard_), {std::move(mutation), builder.build()}, reason);

                        co_await utils::get_local_injector().inject("topology_coordinator_pause_after_updating_cdc_generation", utils::wait_for_message(5min));
                    }
                        break;
                    case node_state::replacing: {
                        if (node.rs->ring) {
                            on_internal_error(rtlogger, format("Replacing node {} owns tokens", node.id));
                        }
                        auto replaced_id = std::get<replace_param>(node.req_param.value()).replaced_id;
                        auto it = _topo_sm._topology.normal_nodes.find(replaced_id);
                        if (it == _topo_sm._topology.normal_nodes.end()) {
                            on_internal_error(rtlogger,
                                    format("Node {} being replaced by {} not found in normal nodes", replaced_id, node.id));
                        }
                        if (!it->second.ring) {
                            on_internal_error(rtlogger,
                                    format("Node {} being replaced by {} is missing tokens", replaced_id, node.id));
                        }
                        if (it->second.state != node_state::normal) {
                            on_internal_error(rtlogger,
                                    format("Node {} being replaced by {} is not in normal state", replaced_id, node.id));
                        }

                        topology_mutation_builder builder(node.guard.write_timestamp());

                        // If a zero-token node is replacing another zero-token node,
                        // we can instantly move the new node's state to normal.
                        if (it->second.ring->tokens.empty()) {
                            node = retake_node(co_await remove_from_group0(std::move(node.guard), replaced_id), node.id);

                            topology_request_tracking_mutation_builder rtbuilder(node.rs->request_id);
                            builder.del_transition_state()
                                   .with_node(node.id)
                                   .set("node_state", node_state::normal);

                            // Move the old node to the left state.
                            topology_mutation_builder builder2(node.guard.write_timestamp());
                            cleanup_ignored_nodes_on_left(builder2, replaced_id);
                            builder2.with_node(replaced_id)
                                    .set("node_state", node_state::left);
                            rtbuilder.done();
                            co_await _voter_handler.on_node_added(node.id, _as);
                            utils::chunked_vector<canonical_mutation> updates{builder.build(), builder2.build(), rtbuilder.build()};
                            co_await mark_view_build_statuses_on_node_join(updates, node.guard, node.id);
                            co_await remove_view_build_statuses_on_left_node(updates, node.guard, replaced_id);
                            co_await update_topology_state(take_guard(std::move(node)), std::move(updates),
                                    fmt::format("replace: replaced node {} with the new zero-token node {}", replaced_id, node.id));
                            break;
                        }

                        guard = take_guard(std::move(node));
                        builder.set_transition_state(topology::transition_state::write_both_read_old)
                               .set_version(_topo_sm._topology.version + 1)
                               .set_session(session_id(guard.new_group0_state_id()))
                               .with_node(node.id)
                               .set("tokens", it->second.ring->tokens);
                        co_await update_topology_state(std::move(guard), {builder.build()},
                                "replace: transition to write_both_read_old and take ownership of the replaced node's tokens");
                    }
                        break;
                    default:
                        on_internal_error(rtlogger,
                                format("topology is in join_group0 state, but the node"
                                       " being worked on ({}) is in unexpected state '{}'; should be"
                                       " either 'bootstrapping' or 'replacing'", node.id, node.rs->state));
                }
            }
                break;
            case topology::transition_state::commit_cdc_generation: {
                // make sure all nodes know about new topology and have the new CDC generation data
                // (we require all nodes to be alive for topo change for now)
                // Note: if there was a replace or removenode going on, we'd need to put the replaced/removed
                // node into `exclude_nodes` parameter in `exec_global_command`, but CDC generations are never
                // introduced during replace/remove.
                topology_mutation_builder builder(guard.write_timestamp());
                if (_topo_sm._topology.global_request == global_topology_request::new_cdc_generation) {
                    builder.del_global_topology_request();
                    builder.del_global_topology_request_id();
                    builder.del_transition_state();
                }

                try {
                    guard = co_await exec_global_command(std::move(guard), raft_topology_cmd::command::barrier, {_raft.id()});
                } catch (term_changed_error&) {
                    throw;
                } catch (group0_concurrent_modification&) {
                    throw;
                } catch (raft::request_aborted&) {
                    throw;
                } catch (seastar::abort_requested_exception&) {
                    throw;
                } catch (...) {
                    rtlogger.error("transition_state::commit_cdc_generation, "
                                    "raft_topology_cmd::command::barrier failed, error {}", std::current_exception());
                    _rollback = fmt::format("Failed to commit cdc generation: {}", std::current_exception());
                }

                if (_rollback) {
                    if (_topo_sm._topology.global_request == global_topology_request::new_cdc_generation && _feature_service.topology_global_request_queue) {
                        topology_request_tracking_mutation_builder rtbuilder(*_topo_sm._topology.global_request_id);
                        // if this is global command fail it since there is nothing to rollback
                        rtbuilder.done(*_rollback);
                        _rollback.reset();
                        co_await update_topology_state(std::move(guard), {builder.build(), rtbuilder.build()}, "committed new CDC generation command failed");
                    }
                    break;
                }

                // We don't need to add delay to the generation timestamp if this is the first generation.
                bool add_ts_delay = !_topo_sm._topology.committed_cdc_generations.empty();

                // Begin the race.
                // See the large FIXME below.
                auto cdc_gen_ts = cdc::new_generation_timestamp(add_ts_delay, _ring_delay);
                auto cdc_gen_uuid = _topo_sm._topology.new_cdc_generation_data_uuid;
                if (!cdc_gen_uuid) {
                    on_internal_error(rtlogger,
                        "new CDC generation data UUID missing in `commit_cdc_generation` state");
                }

                cdc::generation_id cdc_gen_id {
                    .ts = cdc_gen_ts,
                    .id = *cdc_gen_uuid,
                };

                if (!_topo_sm._topology.committed_cdc_generations.empty()) {
                    // Sanity check.
                    // This could happen if the topology coordinator's clock is broken.
                    auto last_committed_gen_id = _topo_sm._topology.committed_cdc_generations.back();
                    if (last_committed_gen_id.ts >= cdc_gen_ts) {
                        on_internal_error(rtlogger, ::format(
                            "new CDC generation has smaller timestamp than the previous one."
                            " Old generation ID: {}, new generation ID: {}", last_committed_gen_id, cdc_gen_id));
                    }
                }

                // Tell all nodes to start using the new CDC generation by updating the topology
                // with the generation's ID and timestamp.
                // At the same time move the topology change procedure to the next step.
                //
                // FIXME: as in previous implementation with gossiper and ring_delay, this assumes that all nodes
                // will learn about the new CDC generation before their clocks reach the generation's timestamp.
                // With this group 0 based implementation, it means that the command must be committed,
                // replicated and applied on all nodes before their clocks reach the generation's timestamp
                // (i.e. within 2 * ring_delay = 60 seconds by default if clocks are synchronized). If this
                // doesn't hold some coordinators might use the wrong CDC streams for some time and CDC stream
                // readers will miss some data. It's likely that Raft replication doesn't converge as quickly
                // as gossiping does.
                //
                // We could use a two-phase algorithm instead: first tell all nodes to prepare for using
                // the new generation, then tell all nodes to commit. If some nodes don't manage to prepare
                // in time, we abort the generation switch. If all nodes prepare, we commit. If a node prepares
                // but doesn't receive a commit in time, it stops coordinating CDC-enabled writes until it
                // receives a commit or abort. This solution does not have a safety problem like the one
                // above, but it has an availability problem when nodes get disconnected from group 0 majority
                // in the middle of a CDC generation switch (when they are prepared to switch but not
                // committed) - they won't coordinate CDC-enabled writes until they reconnect to the
                // majority and commit.
                utils::chunked_vector<canonical_mutation> updates;
                builder.add_new_committed_cdc_generation(cdc_gen_id);
                if (_topo_sm._topology.global_request == global_topology_request::new_cdc_generation) {
                    if (_feature_service.topology_global_request_queue) {
                        topology_request_tracking_mutation_builder rtbuilder(*_topo_sm._topology.global_request_id);
                        rtbuilder.done();
                        updates.push_back(rtbuilder.build());
                    }
                } else {
                    builder.set_transition_state(topology::transition_state::write_both_read_old);
                    builder.set_session(session_id(guard.new_group0_state_id()));
                    builder.set_version(_topo_sm._topology.version + 1);
                }
                updates.push_back(builder.build());
                auto str = ::format("committed new CDC generation, ID: {}", cdc_gen_id);
                co_await update_topology_state(std::move(guard), std::move(updates), std::move(str));
            }
                break;
            case topology::transition_state::tablet_draining:
                try {
                    co_await handle_tablet_migration(std::move(guard), true);
                } catch (term_changed_error&) {
                    throw;
                } catch (group0_concurrent_modification&) {
                    throw;
                } catch (raft::request_aborted&) {
                    throw;
                } catch (seastar::abort_requested_exception&) {
                    throw;
                } catch (...) {
                    rtlogger.error("tablets draining failed with {}. Aborting the topology operation", std::current_exception());
                    _rollback = fmt::format("Failed to drain tablets: {}", std::current_exception());
                }
                break;
            case topology::transition_state::write_both_read_old: {
                co_await utils::get_local_injector().inject(
                        "topology_coordinator/write_both_read_old/before_global_token_metadata_barrier",
                        utils::wait_for_message(std::chrono::minutes(5), &_as));

                auto node = get_node_to_work_on(std::move(guard));

                // make sure all nodes know about new topology (we require all nodes to be alive for topo change for now)
                try {
                    node = retake_node(co_await global_token_metadata_barrier(std::move(node.guard), get_excluded_nodes_for_topology_request(node)), node.id);
                } catch (term_changed_error&) {
                    throw;
                } catch (group0_concurrent_modification&) {
                    throw;
                } catch (raft::request_aborted&) {
                    throw;
                } catch (seastar::abort_requested_exception&) {
                    throw;
                } catch (...) {
                    rtlogger.error("transition_state::write_both_read_old, "
                                    "global_token_metadata_barrier failed, error {}",
                                    std::current_exception());
                    _rollback = fmt::format("global_token_metadata_barrier failed in write_both_read_old state {}", std::current_exception());
                    break;
                }

                if (node.rs->state == node_state::decommissioning || node.rs->state == node_state::removing) {
                    // Tablets should have been drained earlier, in tablet_draining transition state
                    // or in tablet migration transition state when request was in paused state.
                    if (get_token_metadata_ptr()->tablets().has_replica_on(locator::host_id(node.id.uuid()))) {
                        rtlogger.error("node {} still has tablets in stage write_both_read_old", node.id);
                        _rollback = "Node still has tablets in stage write_both_read_old";
                        break;
                    }
                }

                if (_group0.is_member(node.id, true)) {
                    // If we remove a node, we make it a non-voter early to improve availability in some situations.
                    // There is no downside to it because the removed node is already considered dead by us.
                    //
                    // FIXME: removenode may be aborted and the already dead node can be resurrected. We should consider
                    // restoring its voter state on the recovery path.
                    if (node.rs->state == node_state::removing && !_feature_service.group0_limited_voters) {
                        co_await _voter_handler.on_node_removed(node.id, _as);
                    }
                }
                if (node.rs->state == node_state::replacing && !_feature_service.group0_limited_voters) {
                    // We make a replaced node a non-voter early, just like a removed node.
                    auto replaced_node_id = parse_replaced_node(node.req_param);
                    if (_group0.is_member(replaced_node_id, true)) {
                        co_await _voter_handler.on_node_removed(replaced_node_id, _as);
                    }
                }
                utils::get_local_injector().inject("crash_coordinator_before_stream", [] {
                    rtlogger.info("crash_coordinator_before_stream: aborting");
                    abort();
                });
                raft_topology_cmd cmd{raft_topology_cmd::command::stream_ranges};
                auto state = node.rs->state;
                try {
                    // No need to stream data when the node is zero-token.
                    if (!node.rs->ring->tokens.empty()) {
                        if (node.rs->state == node_state::removing) {
                            // tell all token owners to stream data of the removed node to new range owners
                            auto exclude_nodes = get_excluded_nodes_for_topology_request(node);
                            auto normal_zero_token_nodes = _topo_sm._topology.get_normal_zero_token_nodes();
                            std::move(normal_zero_token_nodes.begin(), normal_zero_token_nodes.end(),
                                    std::inserter(exclude_nodes, exclude_nodes.begin()));
                            node = retake_node(
                                    co_await exec_global_command(take_guard(std::move(node)), cmd, exclude_nodes), node.id);
                        } else {
                            // Tell joining/leaving/replacing node to stream its ranges
                            node = co_await exec_direct_command(std::move(node), cmd);
                        }
                    }
                } catch (term_changed_error&) {
                    throw;
                } catch (raft::request_aborted&) {
                    throw;
                } catch (seastar::abort_requested_exception&) {
                    throw;
                } catch (...) {
                    rtlogger.error("send_raft_topology_cmd(stream_ranges) failed with exception"
                                    " (node state is {}): {}", state, std::current_exception());
                    _rollback = fmt::format("Failed stream ranges: {}", std::current_exception());
                    break;
                }

                co_await utils::get_local_injector().inject("topology_coordinator/write_both_read_old/before_version_increment",
                    utils::wait_for_message(std::chrono::minutes(5)));

                // Streaming completed. We can now move tokens state to topology::transition_state::write_both_read_new
                topology_mutation_builder builder(node.guard.write_timestamp());
                builder
                    .set_transition_state(topology::transition_state::write_both_read_new)
                    .del_session()
                    .set_version(_topo_sm._topology.version + 1);
                auto str = ::format("{}: streaming completed for node {}", node.rs->state, node.id);
                co_await update_topology_state(take_guard(std::move(node)), {builder.build()}, std::move(str));
            }
                break;
            case topology::transition_state::write_both_read_new: {
                while (utils::get_local_injector().enter("topology_coordinator_pause_after_streaming")) {
                    co_await sleep_abortable(std::chrono::milliseconds(10), _as);
                }
                const bool removenode_with_left_token_ring = _feature_service.removenode_with_left_token_ring;
                auto node = get_node_to_work_on(std::move(guard));
                bool barrier_failed = false;
                // In this state writes goes to old and new replicas but reads start to be done from new replicas
                // Before we stop writing to old replicas we need to wait for all previous reads to complete
                try {
                    node = retake_node(co_await global_token_metadata_barrier(std::move(node.guard), get_excluded_nodes_for_topology_request(node)), node.id);
                } catch (term_changed_error&) {
                    throw;
                } catch (group0_concurrent_modification&) {
                    throw;
                } catch (raft::request_aborted&) {
                    throw;
                } catch (seastar::abort_requested_exception&) {
                    throw;
                } catch (...) {
                    rtlogger.error("transition_state::write_both_read_new, "
                                    "global_token_metadata_barrier failed, error {}",
                                    std::current_exception());
                    barrier_failed = true;
                }
                if (barrier_failed) {
                    // If barrier above failed it means there may be unfenced reads from old replicas.
                    // Lets wait for the ring delay for those writes to complete or fence to propagate
                    // before continuing.
                    // FIXME: nodes that cannot be reached need to be isolated either automatically or
                    // by an administrator
                    co_await sleep_abortable(_ring_delay, _as);
                    node = retake_node(co_await start_operation(), node.id);
                }
                co_await utils::get_local_injector().inject("topology_coordinator/write_both_read_new/after_barrier",
                    utils::wait_for_message(std::chrono::minutes(5)));
                topology_request_tracking_mutation_builder rtbuilder(node.rs->request_id);
                rtbuilder.done();
                switch(node.rs->state) {
                case node_state::bootstrapping: {
                    co_await utils::get_local_injector().inject("delay_node_bootstrap", utils::wait_for_message(std::chrono::minutes(5)));
                    utils::chunked_vector<canonical_mutation> muts;
                    // Since after bootstrapping a new node some nodes lost some ranges they need to cleanup
                    muts = mark_nodes_as_cleanup_needed(node, false);
                    topology_mutation_builder builder(node.guard.write_timestamp());
                    builder.del_transition_state()
                           .set_version(_topo_sm._topology.version + 1)
                           .with_node(node.id)
                           .set("node_state", node_state::normal);
                    muts.emplace_back(builder.build());
                    muts.emplace_back(rtbuilder.build());
                    co_await _voter_handler.on_node_added(node.id, _as);
                    co_await mark_view_build_statuses_on_node_join(muts, node.guard, node.id);
                    co_await update_topology_state(take_guard(std::move(node)), std::move(muts), "bootstrap: read fence completed");
                    trigger_load_stats_refresh();
                    }
                    break;
                case node_state::removing: {
                    co_await utils::get_local_injector().inject("delay_node_removal", utils::wait_for_message(std::chrono::minutes(5)));
                    if (!removenode_with_left_token_ring) {
                        node = retake_node(co_await remove_from_group0(std::move(node.guard), node.id), node.id);
                    }
                }
                    [[fallthrough]];
                case node_state::decommissioning: {
                    topology_mutation_builder builder(node.guard.write_timestamp());
                    node_state next_state;
                    utils::chunked_vector<canonical_mutation> muts;
                    muts.reserve(2);
                    if (removenode_with_left_token_ring || node.rs->state == node_state::decommissioning) {
                        // Both decommission and removenode go through left_token_ring state
                        // to ensure a global barrier is executed before the request is marked as done.
                        // This ensures all nodes have observed the topology change.
                        next_state = node.rs->state;
                        builder.set_transition_state(topology::transition_state::left_token_ring);
                    } else {
                        next_state = node_state::left;
                        builder.del_transition_state();
                        cleanup_ignored_nodes_on_left(builder, node.id);
                        muts.push_back(rtbuilder.build());
                        co_await remove_view_build_statuses_on_left_node(muts, node.guard, node.id);
                        co_await db::view::view_builder::generate_mutations_on_node_left(_db, _sys_ks, node.guard.write_timestamp(), locator::host_id(node.id.uuid()), muts);
                    }
                    builder.set_version(_topo_sm._topology.version + 1)
                           .with_node(node.id)
                           .del("tokens")
                           .set("node_state", next_state);
                    auto str = ::format("{}: read fence completed", node.rs->state);
                    muts.push_back(builder.build());
                    co_await update_topology_state(take_guard(std::move(node)), std::move(muts), std::move(str));
                    co_await utils::get_local_injector().inject("in_left_token_ring_transition", utils::wait_for_message(std::chrono::minutes(5)));
                }
                    break;
                case node_state::replacing: {
                    auto replaced_node_id = parse_replaced_node(node.req_param);
                    node = retake_node(co_await remove_from_group0(std::move(node.guard), replaced_node_id), node.id);

                    utils::chunked_vector<canonical_mutation> muts;

                    topology_mutation_builder builder1(node.guard.write_timestamp());
                    // Move new node to 'normal'
                    builder1.del_transition_state()
                            .set_version(_topo_sm._topology.version + 1)
                            .with_node(node.id)
                            .set("node_state", node_state::normal);
                    muts.push_back(builder1.build());

                    // Move old node to 'left'
                    topology_mutation_builder builder2(node.guard.write_timestamp());
                    cleanup_ignored_nodes_on_left(builder2, replaced_node_id);
                    builder2.with_node(replaced_node_id)
                            .del("tokens")
                            .set("node_state", node_state::left);
                    muts.push_back(builder2.build());

                    muts.push_back(rtbuilder.build());
                    co_await mark_view_build_statuses_on_node_join(muts, node.guard, node.id);
                    co_await remove_view_build_statuses_on_left_node(muts, node.guard, replaced_node_id);
                    co_await db::view::view_builder::generate_mutations_on_node_left(_db, _sys_ks, node.guard.write_timestamp(), locator::host_id(replaced_node_id.uuid()), muts);
                    co_await _voter_handler.on_node_added(node.id, _as);
                    co_await update_topology_state(take_guard(std::move(node)), std::move(muts),
                                                  "replace: read fence completed");
                    trigger_load_stats_refresh();
                    }
                    break;
                default:
                    on_internal_error(rtlogger, ::format(
                            "Ring state on node {} is write_both_read_new while the node is in state {}",
                            node.id, node.rs->state));
                }
                // Reads are fenced. We can now remove topology::transition_state and move node state to normal
            }
                break;
            case topology::transition_state::tablet_migration:
                co_await handle_tablet_migration(std::move(guard), false);
                break;
            case topology::transition_state::tablet_split_finalization:
                [[fallthrough]];
            case topology::transition_state::tablet_resize_finalization:
                co_await handle_tablet_resize_finalization(std::move(guard));
                break;
            case topology::transition_state::lock:
                release_guard(std::move(guard));
                co_await await_event();
                break;
            case topology::transition_state::left_token_ring: {
                auto node = get_node_to_work_on(std::move(guard));

                // Need to be captured as the node variable might become invalid (e.g. moved out) at particular points.
                const auto node_rs_state = node.rs->state;

                const bool is_removenode = node_rs_state == node_state::removing;

                if (is_removenode && !_feature_service.removenode_with_left_token_ring) {
                    on_internal_error(
                            rtlogger, "removenode operation can only enter the left_token_ring state when REMOVENODE_WITH_LEFT_TOKEN_RING feature is enabled");
                }

                auto finish_left_token_ring_transition = [&](node_to_work_on& node) -> future<> {
                    // Remove the node from group0 here - in general, it won't be able to leave on its own
                    // because we'll ban it as soon as we tell it to shut down.
                    node = retake_node(co_await remove_from_group0(std::move(node.guard), node.id), node.id);

                    utils::get_local_injector().inject("finish_left_token_ring_transition_throw", [] {
                        throw std::runtime_error("finish_left_token_ring_transition failed due to error injection");
                    });

                    utils::chunked_vector<canonical_mutation> muts;

                    topology_mutation_builder builder(node.guard.write_timestamp());
                    cleanup_ignored_nodes_on_left(builder, node.id);
                    builder.del_transition_state()
                            .with_node(node.id)
                            .set("node_state", node_state::left);
                    muts.push_back(builder.build());
                    co_await remove_view_build_statuses_on_left_node(muts, node.guard, node.id);
                    co_await db::view::view_builder::generate_mutations_on_node_left(_db, _sys_ks, node.guard.write_timestamp(), locator::host_id(node.id.uuid()), muts);
                    auto str = std::invoke([&]() {
                        switch (node_rs_state) {
                        case node_state::decommissioning:
                            return ::format("finished decommissioning node {}", node.id);
                        case node_state::removing:
                            return ::format("finished removing node {}", node.id);
                        default:
                            return ::format("finished rollback of {} after {} failure", node.id, node.rs->state);
                        }
                    });
                    co_await update_topology_state(take_guard(std::move(node)), std::move(muts), std::move(str));
                };

                // Denotes the case when this path is already executed before and first topology state update was successful.
                // So we can skip all steps and perform the second topology state update operation (which modifies node state to left)
                // and remove node conditionally from group0.
                if (auto [done, error] = co_await _sys_ks.get_topology_request_state(node.rs->request_id, false); done) {
                    co_await finish_left_token_ring_transition(node);
                    break;
                }

                if (node.id == _raft.id()) {
                    // Removed node must be dead, so it shouldn't enter here (it can't coordinate its own removal).
                    if (is_removenode) {
                        on_internal_error(rtlogger, "removenode operation cannot be coordinated by the removed node itself");
                    }

                    // Someone else needs to coordinate the rest of the decommission process,
                    // because the decommissioning node is going to shut down in the middle of this state.
                    rtlogger.info("coordinator is decommissioning; giving up leadership");
                    co_await step_down_as_nonvoter();

                    // Note: if we restart after this point and become a voter
                    // and then a coordinator again, it's fine - we'll just repeat this step.
                    // (If we're in `left` state when we try to restart we won't
                    // be able to become a voter - we'll be banned from the cluster.)
                }

                bool barrier_failed = false;
                // Wait until other nodes observe the new token ring and stop sending writes to this node.
                auto excluded_nodes = get_excluded_nodes_for_topology_request(node);
                try {
                    // Removed node is added to ignored nodes, so it should be automatically excluded.
                    if (is_removenode && !excluded_nodes.contains(node.id)) {
                        on_internal_error(rtlogger, "removenode operation must have the removed node in excluded_nodes");
                    }
                    node = retake_node(co_await global_token_metadata_barrier(std::move(node.guard), std::move(excluded_nodes)), node.id);
                } catch (term_changed_error&) {
                    throw;
                } catch (group0_concurrent_modification&) {
                    throw;
                } catch (raft::request_aborted&) {
                    throw;
                } catch (seastar::abort_requested_exception&) {
                    throw;
                } catch (...) {
                    rtlogger.error("transition_state::left_token_ring, "
                                    "raft_topology_cmd::command::barrier failed, error {}",
                                    std::current_exception());
                    barrier_failed = true;
                }

                if (barrier_failed) {
                    // If barrier above failed it means there may be unfinished writes to a decommissioned node,
                    // or some nodes might not have observed the new topology yet (one purpose of the barrier
                    // is to make sure all nodes observed the new topology before completing the request).
                    // Lets wait for the ring delay for those writes to complete and new topology to propagate
                    // before continuing.
                    co_await sleep_abortable(_ring_delay, _as);
                    node = retake_node(co_await start_operation(), node.id);
                }

                // Make decommissioning/removed node a non voter before reporting operation completion below.
                // Otherwise the node may see the completion and exit before it is removed from
                // the config at which point the removal from the config will hang if the cluster had only two
                // nodes before the decommission.
                co_await _voter_handler.on_node_removed(node.id, _as);

                topology_request_tracking_mutation_builder rtbuilder(node.rs->request_id);

                rtbuilder.done();

                co_await update_topology_state(take_guard(std::move(node)), {rtbuilder.build()}, "report request completion in left_token_ring state");

                // For decommission/rollback: Tell the node to shut down.
                // This is done to improve user experience when there are no failures.
                // In the next state (`node_state::left`), the node will be banned by the rest of the cluster,
                // so there's no guarantee that it would learn about entering that state even if it was still
                // a member of group0, hence we use a separate direct RPC in this state to shut it down.
                //
                // There is the possibility that the node will never get the message
                // and decommission will hang on that node.
                // This is fine for the rest of the cluster - we will still remove, ban the node and continue.
                //
                // For removenode: The node is already dead, no need to send shutdown command.
                auto node_id = node.id;
                bool shutdown_failed = false;
                if (!is_removenode) {
                    try {
                        node = co_await exec_direct_command(std::move(node), raft_topology_cmd::command::barrier);
                    } catch (...) {
                        rtlogger.warn("failed to tell node {} to shut down - it may hang."
                                      " It's safe to shut it down manually now. (Exception: {})",
                                node.id, std::current_exception());
                        shutdown_failed = true;
                    }
                }
                if (shutdown_failed) {
                    node = retake_node(co_await start_operation(), node_id);
                }

                co_await finish_left_token_ring_transition(node);
            }
                break;
            case topology::transition_state::rollback_to_normal: {
                auto node = get_node_to_work_on(std::move(guard));

                // The barrier waits for all double writes started during the operation to complete. It allowed to fail
                // since we will fence the requests later.
                bool barrier_failed = false;
                auto state = node.rs->state;
                try {
                    node.guard = co_await exec_global_command(std::move(node.guard),raft_topology_cmd::command::barrier_and_drain, get_excluded_nodes_for_topology_request(node), drop_guard_and_retake::yes);
                } catch (term_changed_error&) {
                    throw;
                } catch (raft::request_aborted&) {
                    throw;
                } catch (seastar::abort_requested_exception&) {
                    throw;
                } catch (...) {
                    rtlogger.warn("failed to run barrier_and_drain during rollback of {} after {} failure: {}",
                            node.id, state, std::current_exception());
                    barrier_failed = true;
                }

                if (barrier_failed) {
                    node.guard = co_await start_operation();
                }

                node = retake_node(std::move(node.guard), node.id);

                topology_mutation_builder builder(node.guard.write_timestamp());
                topology_request_tracking_mutation_builder rtbuilder(node.rs->request_id);
                builder.set_fence_version(_topo_sm._topology.version) // fence requests in case the drain above failed
                       .set_transition_state(topology::transition_state::tablet_migration) // in case tablet drain failed we need to complete tablet transitions
                       .with_node(node.id)
                       .set("node_state", node_state::normal);
                rtbuilder.done();

                auto str = fmt::format("complete rollback of {} to state normal after {} failure", node.id, node.rs->state);

                co_await _voter_handler.on_node_added(node.id, _as);

                rtlogger.info("{}", str);
                co_await update_topology_state(std::move(node.guard), {builder.build(), rtbuilder.build()}, str);
            }
                break;
            case topology::transition_state::truncate_table:
                co_await handle_truncate_table(std::move(guard));
                break;
            case topology::transition_state::snapshot_tables:
                co_await handle_snapshot_tables(std::move(guard));
                break;
        }
        co_return true;
    };

    // Called when there is no ongoing topology transition.
    // Used to start new topology transitions using node requests or perform node operations
    // that don't change the topology (like rebuild).
    future<> handle_node_transition(node_to_work_on&& node) {
        rtlogger.info("coordinator fiber found a node to work on id={} state={}", node.id, node.rs->state);

        switch (node.rs->state) {
            case node_state::none: {
                if (_topo_sm._topology.normal_nodes.empty()) {
                    rtlogger.info("skipping join node handshake for the first node in the cluster");
                } else {
                    auto validation_result = validate_joining_node(node);

                    if (utils::get_local_injector().enter("handle_node_transition_drop_expiring")) {
                        _gossiper.get_mutable_address_map().force_drop_expiring_entries();
                    }

                    // When the validation succeeded, it's important that all nodes in the
                    // cluster are aware of the IP address of the new node before we proceed to
                    // the topology::transition_state::join_group0 state, since in this state
                    // node IPs are already used to populate pending nodes in erm.
                    // This applies both to new and replacing nodes.
                    // If the wait_for_ip is unsuccessful, we should inform the new
                    // node about this failure.
                    // If the validation doesn't pass, we only need to call wait_for_ip on the current node,
                    // so that we can communicate the failure of the join request directly to
                    // the joining node.

                    {
                        std::exception_ptr wait_for_ip_error;
                        try {
                            if (holds_alternative<node_validation_failure>(validation_result)) {
                                release_guard(std::move(node.guard));
                                co_await wait_for_gossiper(node.id, _gossiper, _as);
                                node.guard = co_await start_operation();
                            } else {
                                auto exclude_nodes = get_excluded_nodes_for_topology_request(node);
                                exclude_nodes.insert(node.id);
                                node.guard = co_await exec_global_command(std::move(node.guard),
                                    raft_topology_cmd::command::wait_for_ip,
                                    exclude_nodes);
                            }
                        } catch (term_changed_error&) {
                            throw;
                        } catch (raft::request_aborted&) {
                            throw;
                        } catch (seastar::abort_requested_exception&) {
                            throw;
                        } catch (...) {
                            wait_for_ip_error = std::current_exception();
                            rtlogger.warn("raft_topology_cmd::command::wait_for_ip failed, error {}",
                                wait_for_ip_error);
                        }
                        if (wait_for_ip_error) {
                            node.guard = co_await start_operation();
                        }
                        node = retake_node(std::move(node.guard), node.id);

                        if (wait_for_ip_error && holds_alternative<node_validation_success>(validation_result)) {
                            validation_result = node_validation_failure {
                                .reason = ::format("wait_for_ip failed, error {}", wait_for_ip_error)
                            };
                        }
                    }

                    if (std::holds_alternative<node_validation_failure>(validation_result)) {
                        // Transition to left
                        topology_mutation_builder builder(node.guard.write_timestamp());
                        topology_request_tracking_mutation_builder rtbuilder(node.rs->request_id);
                        builder.with_node(node.id)
                               .del("topology_request")
                               .set("node_state", node_state::left);
                        rtbuilder.done("Join is rejected during validation");
                        auto reason = ::format("bootstrap: node rejected");

                        co_await update_topology_state(std::move(node.guard), {builder.build(), rtbuilder.build()}, reason);

                        rtlogger.info("rejected node moved to left state {}", node.id);

                        try {
                            co_await respond_to_joining_node(node.id, join_node_response_params{
                                .response = validate_joining_node_response(std::move(validation_result)),
                            });
                        } catch (const std::runtime_error& e) {
                            rtlogger.warn("attempt to send rejection response to {} failed. "
                                         "The node may hang. It's safe to shut it down manually now. Error: {}",
                                         node.id, e.what());
                        }

                        break;
                    }
                }
            }
            [[fallthrough]];
            case node_state::normal: {
                // if the state is none there have to be either 'join' or 'replace' request
                // if the state is normal there have to be either 'leave', 'remove' or 'rebuild' request
                topology_mutation_builder builder(node.guard.write_timestamp());
                topology_request_tracking_mutation_builder rtbuilder(node.rs->request_id);
                rtbuilder.set("start_time", db_clock::now());
                switch (node.request.value()) {
                    case topology_request::join: {
                        if (node.rs->ring) {
                            on_internal_error(rtlogger, ::format("Joining node {} owns tokens", node.id));
                        }
                        // Write chosen tokens through raft.
                        builder.set_transition_state(topology::transition_state::join_group0)
                               .with_node(node.id)
                               .set("node_state", node_state::bootstrapping)
                               .del("topology_request");
                        auto reason = ::format("bootstrap: accept node");
                        co_await update_topology_state(std::move(node.guard), {builder.build(), rtbuilder.build()}, reason);
                        co_await utils::get_local_injector().inject("topology_coordinator_pause_after_accept_node", utils::wait_for_message(5min));
                        break;
                        }
                    case topology_request::leave: {
                        if (!node.rs->ring) {
                            on_internal_error(rtlogger, ::format("Leaving node {} doesn't own tokens", node.id));
                        }

                        // Leave break point. For testing decommission
                        co_await utils::get_local_injector().inject("topology_coordinator_before_leave", utils::wait_for_message(std::chrono::minutes(2)));

                        auto validation_result = validate_removing_node(_db, to_host_id(node.id));
                        if (std::holds_alternative<node_validation_failure>(validation_result)) {
                            builder.with_node(node.id)
                                   .del("topology_request");
                            rtbuilder.done(format("node decommission rejected: {}",
                                    std::get<node_validation_failure>(validation_result).reason));
                            co_await update_topology_state(std::move(node.guard), {builder.build(), rtbuilder.build()},
                                                           "decommission rejected");
                            break;
                        }

                        // start decommission and put tokens of decommissioning nodes into write_both_read_old state
                        // meaning that reads will go to the replica being decommissioned
                        // but writes will go to new owner as well
                        auto session = session_id(node.guard.new_group0_state_id());
                        builder.set_transition_state(_feature_service.parallel_tablet_draining
                                    ? topology::transition_state::write_both_read_old
                                    : topology::transition_state::tablet_draining)
                               .set_version(_topo_sm._topology.version + 1)
                               .set_session(session)
                               .with_node(node.id)
                               .set("node_state", node_state::decommissioning)
                               .del("topology_request");
                        co_await update_topology_state(take_guard(std::move(node)), {builder.build(), rtbuilder.build()},
                                                       "start decommission");
                        co_await utils::get_local_injector().inject("topology_coordinator_pause_after_start_decommission", utils::wait_for_message(5min));
                        break;
                        }
                    case topology_request::remove: {
                        if (!node.rs->ring) {
                            on_internal_error(rtlogger, ::format("Node {} being removed doesn't own tokens", node.id));
                        }

                        auto validation_result = validate_removing_node(_db, to_host_id(node.id));
                        if (std::holds_alternative<node_validation_failure>(validation_result)) {
                            builder.with_node(node.id)
                                   .del("topology_request");
                            rtbuilder.done(format("node remove rejected: {}",
                                    std::get<node_validation_failure>(validation_result).reason));
                            co_await update_topology_state(std::move(node.guard), {builder.build(), rtbuilder.build()},
                                                           "removenode rejected");
                            break;
                        }

                        auto session = session_id(node.guard.new_group0_state_id());
                        builder.set_transition_state(_feature_service.parallel_tablet_draining
                                    ? topology::transition_state::write_both_read_old
                                    : topology::transition_state::tablet_draining)
                               .set_version(_topo_sm._topology.version + 1)
                               .set_session(session)
                               .with_node(node.id)
                               .set("node_state", node_state::removing)
                               .del("topology_request");
                        co_await update_topology_state(take_guard(std::move(node)), {builder.build(), rtbuilder.build()},
                                                       "start removenode");
                        co_await utils::get_local_injector().inject("topology_coordinator_pause_after_start_removenode", utils::wait_for_message(5min));
                        break;
                        }
                    case topology_request::replace: {
                        if (node.rs->ring) {
                            on_internal_error(rtlogger, ::format("Replacing node {} owns tokens", node.id));
                        }

                        builder.set_transition_state(topology::transition_state::join_group0)
                               .with_node(node.id)
                               .set("node_state", node_state::replacing)
                               .del("topology_request");
                        co_await update_topology_state(take_guard(std::move(node)), {builder.build(), rtbuilder.build()},
                                                       "replace: accept node");
                        break;
                        }
                    case topology_request::rebuild: {
                        topology_mutation_builder builder(node.guard.write_timestamp());
                        builder.set_session(session_id(node.guard.new_group0_state_id()))
                               .with_node(node.id)
                               .set("node_state", node_state::rebuilding)
                               .del("topology_request");
                        co_await update_topology_state(take_guard(std::move(node)), {builder.build(), rtbuilder.build()},
                                                       "start rebuilding");
                        break;
                    }
                }
                break;
            }
            case node_state::rebuilding: {
                bool retake = false;
                auto id = node.id;
                topology_mutation_builder builder(node.guard.write_timestamp());
                topology_request_tracking_mutation_builder rtbuilder(node.rs->request_id);
                try {
                    node = co_await exec_direct_command(std::move(node), raft_topology_cmd::command::stream_ranges);
                    rtbuilder.done();
                } catch (term_changed_error&) {
                    throw;
                } catch (raft::request_aborted&) {
                    throw;
                } catch (seastar::abort_requested_exception&) {
                    throw;
                } catch (const std::exception& e) {
                    rtlogger.error("send_raft_topology_cmd(stream_ranges) failed with exception"
                                    " (node state is rebuilding): {}", e);
                    rtbuilder.done(e.what());
                    retake = true;
                } catch (...) {
                    rtlogger.error("send_raft_topology_cmd(stream_ranges) failed with exception"
                                    " (node state is rebuilding): {}", std::current_exception());
                    rtbuilder.done("unknown error");
                    retake = true;
                }
                if (retake) {
                    node = retake_node(co_await start_operation(), id);
                }
                builder.del_session().with_node(node.id)
                       .set("node_state", node_state::normal)
                       .del("rebuild_option");
                co_await update_topology_state(take_guard(std::move(node)), {builder.build(), rtbuilder.build()}, "rebuilding completed");
            }
                break;
            case node_state::bootstrapping:
            case node_state::decommissioning:
            case node_state::removing:
            case node_state::replacing:
                // Should not get here
                on_internal_error(rtlogger, ::format(
                    "Found node {} in state {} but there is no ongoing topology transition",
                    node.id, node.rs->state));
            case node_state::left:
                // Should not get here
                on_internal_error(rtlogger, ::format(
                        "Topology coordinator is called for node {} in state 'left'", node.id));
                break;
        }

        co_await utils::get_local_injector().inject("topology_coordinator_pause_after_node_transition", utils::wait_for_message(5min));
    }

    node_validation_result
    validate_joining_node(const node_to_work_on& node) {
        if (*node.request == topology_request::replace) {
            auto replaced_id = std::get<replace_param>(node.req_param.value()).replaced_id;
            if (!_topo_sm._topology.normal_nodes.contains(replaced_id)) {
                return node_validation_failure {
                    .reason = ::format("Cannot replace node {} because it is not in the 'normal' state", replaced_id),
                };
            }
        }

        std::vector<sstring> unsupported_features;
        const auto& supported_features = node.rs->supported_features;
        std::ranges::set_difference(node.topology->enabled_features, supported_features, std::back_inserter(unsupported_features));
        if (!unsupported_features.empty()) {
            rtlogger.warn("node {} does not understand some features: {}", node.id, unsupported_features);
            return node_validation_failure {
                .reason = seastar::format("Feature check failed. The node does not support some features that are enabled by the cluster: {}",
                        unsupported_features),
            };
        }

        return node_validation_success {};
    }

    std::variant<join_node_response_params::accepted, join_node_response_params::rejected>
    validate_joining_node_response(node_validation_result&& validation_result) {
        using ret_type = std::variant<join_node_response_params::accepted, join_node_response_params::rejected>;

        return std::visit(overloaded_functor {
            [&] (node_validation_success&& p) -> ret_type {
                return join_node_response_params::accepted {};
            },
            [&] (node_validation_failure&& p) -> ret_type {
                return join_node_response_params::rejected {
                    .reason = std::move(p.reason)
                };
            }
        }, std::move(validation_result));
    }

    // Tries to finish accepting the joining node by updating the cluster
    // configuration and sending the acceptance response.
    //
    // Returns the retaken node and information on whether responding to the
    // join request succeeded.
    future<std::tuple<node_to_work_on, bool>> finish_accepting_node(node_to_work_on&& node) {
        if (_topo_sm._topology.normal_nodes.empty()) {
            // This is the first node, it joins without the handshake.
            co_return std::tuple{std::move(node), true};
        }

        auto id = node.id;

        if (_topo_sm._topology.transition_nodes.empty()) {
            on_internal_error(rtlogger, format("transition nodes are empty while accepting node {}", node.id));
        }

        release_node(std::move(node));

        if (!_raft.get_configuration().contains(id)) {
            co_await _raft.modify_config({raft::config_member({id, {}}, {})}, {}, &_as);
        }

        auto responded = false;
        try {
            co_await respond_to_joining_node(id, join_node_response_params{
                .response = join_node_response_params::accepted{},
            });
            responded = true;
        } catch (const std::runtime_error& e) {
            rtlogger.warn("attempt to send acceptance response to {} failed. "
                         "The node may hang. It's safe to shut it down manually now. Error: {}",
                         node.id, e.what());
        }

        co_return std::tuple{retake_node(co_await start_operation(), id), responded};
    }

    future<> respond_to_joining_node(raft::server_id id, join_node_response_params&& params) {
        co_await ser::join_node_rpc_verbs::send_join_node_response(
            &_messaging, to_host_id(id), id,
            std::move(params)
        );
    }

    utils::chunked_vector<canonical_mutation> mark_nodes_as_cleanup_needed(node_to_work_on& node, bool rollback) {
        auto& topo = _topo_sm._topology;
        utils::chunked_vector<canonical_mutation> muts;
        muts.reserve(topo.normal_nodes.size());
        std::unordered_set<locator::host_id> dirty_nodes;

        for (auto& [_, ermp] : _db.get_non_local_strategy_keyspaces_erms()) {
            auto* erm = ermp->maybe_as_vnode_effective_replication_map();
            const std::unordered_set<locator::host_id>& nodes = rollback ? erm->get_all_pending_nodes() : erm->get_dirty_endpoints();
            dirty_nodes.insert(nodes.begin(), nodes.end());
        }

        for (auto& n : dirty_nodes) {
            auto id = raft::server_id(n.uuid());
            // mark all nodes (except self) as cleanup needed
            if (node.id != id) {
                topology_mutation_builder builder(node.guard.write_timestamp());
                builder.with_node(id).set("cleanup_status", cleanup_status::needed);
                muts.emplace_back(builder.build());
                rtlogger.debug("mark node {} as needed for cleanup", id);
            }
        }
        return muts;
    }

    future<> start_vnodes_cleanup_on_dirty_nodes(group0_guard guard, std::variant<std::pair<topology_request, raft::server_id>, utils::UUID> cmd) {
        sstring cleanup_reason;
        if (const auto* global_request_id = std::get_if<utils::UUID>(&cmd)) {
            cleanup_reason = ::format("by global request {}", *global_request_id);
        } else {
            const auto& req = std::get<std::pair<topology_request, raft::server_id>>(cmd);
            cleanup_reason = ::format("required by '{}' of the node {}", req.first, req.second);
        }
        rtlogger.info("starting vnodes cleanup {}", cleanup_reason);

        // We need a barrier to make sure that no request coordinators
        // are sending requests to old replicas/ranges that we're going to cleanup.
        if (_topo_sm._topology.fence_version < _topo_sm._topology.version) {
            bool fenced = false;
            bool failed = false;
            try {
                rtlogger.info("vnodes cleanup {}: running global_token_metadata_barrier", cleanup_reason);
                guard = co_await global_token_metadata_barrier(std::move(guard), _topo_sm._topology.ignored_nodes, &fenced);
            } catch (term_changed_error&) {
                throw;
            } catch (group0_concurrent_modification&) {
                throw;
            } catch (raft::request_aborted&) {
                throw;
            } catch (seastar::abort_requested_exception&) {
                throw;
            } catch (...) {
                // The barrier does its best to ensure no data plane requests are affected
                // by fencing. It runs barrier_and_drain command on all nodes and then
                // commits fence_version := version to the raft group0. If the first barrier
                // failed it runs another barrier with the new fence_version.
                // Both of these barriers might fail, but if we managed to commit the
                // new fence_version to group0 then we can continue with the cleanup. Why?
                // Cleanup on a replica first calls `read_barrier()`. This ensures that
                // new requests from old coordinators are fenced out before they start
                // executing, preventing them from corrupting the cleaned-up replica state.
                // There may still be active requests on the replica from the old topology
                // if `barrier_and_drain` failed on this replica. We wait for those requests
                // to finish using `await_pending_writes()` before starting the cleanup.

                if (!fenced) {
                    throw;
                }
                failed = true;
                rtlogger.warn("vnodes cleanup {}: global_token_metadata_barrier threw an error {}; "
                    "this is not fatal, continuing cleanup.",
                    cleanup_reason, std::current_exception());
            }
            if (failed) {
                guard = co_await start_operation();
            }
        }

        auto& topo = _topo_sm._topology;
        utils::chunked_vector<canonical_mutation> muts;

        if (const auto* global_request_id = std::get_if<utils::UUID>(&cmd)) {
            muts.reserve(topo.normal_nodes.size() + 2);
            topology_mutation_builder builder(guard.write_timestamp());
            builder.del_global_topology_request();
            if (_feature_service.topology_global_request_queue) {
                topology_request_tracking_mutation_builder rtbuilder(*global_request_id);
                builder.del_global_topology_request_id()
                       .drop_first_global_topology_request_id(_topo_sm._topology.global_requests_queue, *global_request_id);
                rtbuilder.done();
                muts.emplace_back(rtbuilder.build());
            }
            muts.emplace_back(builder.build());
        } else {
            muts.reserve(topo.normal_nodes.size());
        }

        size_t nodes_to_cleanup = 0;
        for (auto& [id, rs] : topo.normal_nodes) {
            if (rs.cleanup == cleanup_status::needed) {
                topology_mutation_builder builder(guard.write_timestamp());
                builder.with_node(id).set("cleanup_status", cleanup_status::running);
                muts.emplace_back(builder.build());
                rtlogger.debug("mark node {} as cleanup running", id);
                ++nodes_to_cleanup;
            }
        }

        if (!muts.empty()) {
          co_await update_topology_state(std::move(guard), std::move(muts), "Starting cleanup");
        }

        rtlogger.info("vnodes cleanup {} started, nodes to cleanup {}", cleanup_reason, nodes_to_cleanup);
    }

    future<> run_view_building_coordinator() {
        while (!_feature_service.view_building_coordinator && !_as.abort_requested()) {
            condition_variable feature_cv;
            auto listener = _feature_service.view_building_coordinator.when_enabled([&feature_cv] {
                feature_cv.broadcast();
            });
            auto abort = _as.subscribe([&feature_cv] () noexcept {
                feature_cv.broadcast();
            });
            co_await feature_cv.wait();
        }
        if (_as.abort_requested()) {
            co_return;
        }

        _lifecycle_notifier.register_subscriber(_vb_coordinator.get());
        try {
            co_await _vb_coordinator->run();
        } catch (...) {
            on_fatal_internal_error(rtlogger, format("unhandled exception in view_building_coordinator::run(): {}", std::current_exception()));
        }
        co_await _lifecycle_notifier.unregister_subscriber(_vb_coordinator.get());
    }

    future<> mark_view_build_statuses_on_node_join(utils::chunked_vector<canonical_mutation>& out, const group0_guard& guard, raft::server_id node_id) {
        co_await _vb_coordinator->mark_view_build_statuses_on_node_join(out, guard, locator::host_id{node_id.uuid()});
    }

    future<> remove_view_build_statuses_on_left_node(utils::chunked_vector<canonical_mutation>& out, const group0_guard& guard, raft::server_id node_id) {
        co_await _vb_coordinator->remove_view_build_statuses_on_left_node(out, guard, locator::host_id{node_id.uuid()});
    }

    // Returns the guard if no work done. Otherwise, performs a table migration and consumes the guard.
    future<std::optional<group0_guard>> maybe_migrate_system_tables(group0_guard guard);

    // Returns the guard if no work done. Otherwise, transitions the state machine into tablet migration path.
    future<std::optional<group0_guard>> maybe_start_tablet_migration(group0_guard);

    // Returns the guard if no work done. Otherwise, transitions the state machine into tablet resize finalization path.
    future<std::optional<group0_guard>> maybe_start_tablet_resize_finalization(group0_guard, const table_resize_plan& plan);

    // Returns true if the state machine was transitioned into tablet migration path.
    future<bool> maybe_retry_failed_rf_change_tablet_rebuilds(group0_guard guard);

    future<> refresh_tablet_load_stats();
    future<> start_tablet_load_stats_refresher();

    future<> await_event() {
        _as.check();
        co_await _topo_sm.event.when();
    }

    future<> fence_previous_coordinator();
    future<> rollback_current_topology_op(group0_guard&& guard);

    // Returns true if the coordinator should sleep after the exception.
    bool handle_topology_coordinator_error(std::exception_ptr eptr) noexcept;

public:
    topology_coordinator(
            sharded<db::system_distributed_keyspace>& sys_dist_ks, gms::gossiper& gossiper,
            netw::messaging_service& messaging, locator::shared_token_metadata& shared_tm,
            db::system_keyspace& sys_ks, replica::database& db, service::raft_group0& group0,
            service::topology_state_machine& topo_sm, db::view::view_building_state_machine& vb_sm, abort_source& as, raft::server& raft_server,
            raft_topology_cmd_handler_type raft_topology_cmd_handler,
            tablet_allocator& tablet_allocator,
            cdc::generation_service& cdc_gens,
            std::chrono::milliseconds ring_delay,
            gms::feature_service& feature_service,
            endpoint_lifecycle_notifier& lifecycle_notifier,
            qos::service_level_controller& sl_controller,
            topology_coordinator_cmd_rpc_tracker& topology_cmd_rpc_tracker)
        : _sys_dist_ks(sys_dist_ks), _gossiper(gossiper), _messaging(messaging)
        , _shared_tm(shared_tm), _sys_ks(sys_ks), _db(db)
        , _tablet_load_stats_refresh_interval_in_seconds(db.get_config().tablet_load_stats_refresh_interval_in_seconds)
        , _group0(group0), _topo_sm(topo_sm), _vb_sm(vb_sm), _as(as)
        , _feature_service(feature_service), _lifecycle_notifier(lifecycle_notifier)
        , _sl_controller(sl_controller)
        , _raft(raft_server), _term(raft_server.get_current_term())
        , _raft_topology_cmd_handler(std::move(raft_topology_cmd_handler))
        , _tablet_allocator(tablet_allocator)
        , _vb_coordinator(std::make_unique<db::view::view_building_coordinator>(_db, _raft, _group0, _sys_ks, _gossiper, _messaging, _vb_sm, _topo_sm, _term, _as))
        , _cdc_gens(cdc_gens)
        , _tablet_load_stats_refresh([this] {
            return with_scheduling_group(_gossiper.get_scheduling_group(), [this] {
                return refresh_tablet_load_stats();
            });
        })
        , _ring_delay(ring_delay)
        , _group0_holder(_group0.hold_group0_gate())
        , _voter_handler(group0, topo_sm._topology, gossiper, feature_service)
        , _topology_cmd_rpc_tracker(topology_cmd_rpc_tracker)
        , _async_gate("topology_coordinator")
    {
        _lifecycle_notifier.register_subscriber(this);
        _db.get_notifier().register_listener(this);
        // When the delay_cdc_stream_finalization error injection is disabled
        // (test releases it), wake the topology coordinator so it retries
        // maybe_finalize_pending_stream_enables promptly.
        utils::get_local_injector().register_on_disable("delay_cdc_stream_finalization", [this] {
            _topo_sm.event.broadcast();
        });
        _topo_sm.on_tablet_split_ready = [this] {
            trigger_load_stats_refresh();
        };
    }

    future<> run();
    future<> stop();

    virtual void on_up(const gms::inet_address& endpoint, locator::host_id hid) override { _topo_sm.event.broadcast(); };
    virtual void on_down(const gms::inet_address& endpoint, locator::host_id hid) override { _topo_sm.event.broadcast(); };

private:
    tablet_ops_metrics _tablet_ops_metrics;
};

future<std::optional<group0_guard>> topology_coordinator::maybe_migrate_system_tables(group0_guard guard) {
    // Some of the upgrades are guarded by _feature_service. If the feature gets enabled,
    // it's in `topology_coordinator::enable_features` ,so  topology_coordinator will re-run its loop
    // and `maybe_migrate_system_tables` will be called.

    if (_feature_service.driver_service_level && !utils::get_local_injector().enter("skip_service_levels_v2_initialization")) {
        const auto sl_driver_created = co_await _sys_ks.get_service_level_driver_created();
        if (!sl_driver_created.value_or(false)) {
            co_return co_await _sl_controller.migrate_to_driver_service_level(std::move(guard), _sys_ks);
        }
    }

    co_return std::move(guard);
}

future<std::optional<group0_guard>> topology_coordinator::maybe_start_tablet_migration(group0_guard guard) {
    rtlogger.debug("Evaluating tablet balance");

    auto tm = get_token_metadata_ptr();
    auto plan = co_await _tablet_allocator.balance_tablets(tm, &_topo_sm._topology, &_sys_ks, {}, get_dead_nodes());
    if (plan.empty()) {
        rtlogger.debug("Tablet load balancer did not make any plan");
        co_return std::move(guard);
    }

    utils::chunked_vector<canonical_mutation> updates;

    co_await generate_migration_updates(updates, guard, plan);

    // We only want to consider transitioning into tablet resize finalization path, if there's no other work
    // to be done (e.g. start migration or/and emit split decision).
    if (updates.empty()) {
        co_return co_await maybe_start_tablet_resize_finalization(std::move(guard), plan.resize_plan());
    }

    updates.emplace_back(
        topology_mutation_builder(guard.write_timestamp())
            .set_transition_state(topology::transition_state::tablet_migration)
            .set_version(_topo_sm._topology.version + 1)
            .build());

    if (plan.requires_schema_changes()) {
        co_await update_topology_state_with_mixed_change(std::move(guard), std::move(updates), "Starting tablet migration");
    } else {
        co_await update_topology_state(std::move(guard), std::move(updates), "Starting tablet migration");
    }
    co_return std::nullopt;
}

future<std::optional<group0_guard>> topology_coordinator::maybe_start_tablet_resize_finalization(group0_guard guard, const table_resize_plan& plan) {
    if (plan.finalize_resize.empty()) {
        co_return std::move(guard);
    }
    if (utils::get_local_injector().enter("tablet_split_finalization_postpone")) {
        co_return std::move(guard);
    }

    auto resize_finalization_transition_state = [this] {
        return _feature_service.tablet_merge ? topology::transition_state::tablet_resize_finalization : topology::transition_state::tablet_split_finalization;
    };

    utils::chunked_vector<canonical_mutation> updates;

    updates.emplace_back(
        topology_mutation_builder(guard.write_timestamp())
            .set_transition_state(resize_finalization_transition_state())
            .set_version(_topo_sm._topology.version + 1)
            .build());

    co_await update_topology_state(std::move(guard), std::move(updates), "Started tablet resize finalization");
    co_return std::nullopt;
}

future<bool> topology_coordinator::maybe_retry_failed_rf_change_tablet_rebuilds(group0_guard guard) {
    rtlogger.debug("Retrying failed rebuilds");

    if (utils::get_local_injector().enter("maybe_retry_failed_rf_change_tablet_rebuilds_skip")) {
        rtlogger.debug("Skipping retrying failed rebuilds due to error injection");
        co_return false;
    }

    auto tmptr = get_token_metadata_ptr();
    utils::chunked_vector<canonical_mutation> updates;
    for (auto& ks_name : _db.get_tablets_keyspaces()) {
        auto& ks = _db.find_keyspace(ks_name);
        auto& strategy = ks.get_replication_strategy();
        auto tables_with_mvs = ks.metadata()->tables();
        auto views = ks.metadata()->views();
        tables_with_mvs.insert(tables_with_mvs.end(), views.begin(), views.end());
        for (const auto& table_or_mv : tables_with_mvs) {
            if (!tmptr->tablets().is_base_table(table_or_mv->id())) {
                continue;
            }

            auto& tablet_map = tmptr->tablets().get_tablet_map(table_or_mv->id());
            auto new_tablet_map = co_await strategy.maybe_as_tablet_aware()->reallocate_tablets(table_or_mv, tmptr, co_await tablet_map.clone_gently());

            replica::tablet_mutation_builder tablet_mutation_builder(guard.write_timestamp(), table_or_mv->id());
            co_await new_tablet_map.for_each_tablet([&](locator::tablet_id tablet_id, const locator::tablet_info& tablet_info) -> future<> {
                auto& replicas = tablet_map.get_tablet_info(tablet_id).replicas;
                auto it = std::find_if(tablet_info.replicas.begin(), tablet_info.replicas.end(), [&](const auto& replica) {
                    return std::find(replicas.begin(), replicas.end(), replica) == replicas.end();
                });
                if (it == tablet_info.replicas.end()) {
                    co_return;
                }
                auto new_replicas = replicas;
                new_replicas.push_back(*it);
                auto last_token = new_tablet_map.get_last_token(tablet_id);
                updates.emplace_back(co_await make_canonical_mutation_gently(
                        replica::tablet_mutation_builder(guard.write_timestamp(), table_or_mv->id())
                                .set_new_replicas(last_token, new_replicas)
                                .set_stage(last_token, locator::tablet_transition_stage::allow_write_both_read_old)
                                .set_transition(last_token, locator::choose_rebuild_transition_kind(_feature_service))
                                .build()
                ));
            });
        }

        if (!updates.empty()) {
            break;
        }
    }

    if (updates.empty()) {
        rtlogger.debug("No failed RF change rebuilds to retry");
        co_return false;
    }

    updates.emplace_back(
        topology_mutation_builder(guard.write_timestamp())
            .set_transition_state(topology::transition_state::tablet_migration)
            .set_version(_topo_sm._topology.version + 1)
            .build());

    sstring reason = "Retry failed tablet rebuilds";
    co_await update_topology_state(std::move(guard), std::move(updates), reason);
    co_return true;
}

future<> topology_coordinator::refresh_tablet_load_stats() {
    co_await utils::get_local_injector().inject("refresh_tablet_load_stats_pause", utils::wait_for_message(5min));
    auto tm = get_token_metadata_ptr();

    locator::load_stats stats;
    static constexpr std::chrono::seconds wait_for_live_nodes_timeout{30};

    std::unordered_map<table_id, size_t> total_replicas;
    bool table_load_stats_invalid = false;

    for (auto& [dc, nodes] : tm->get_datacenter_token_owners_nodes()) {
        locator::load_stats dc_stats;
        rtlogger.debug("raft topology: Refreshing table load stats for DC {} that has {} token owners", dc, nodes.size());
        co_await coroutine::parallel_for_each(nodes, [&] (const auto& node) -> future<> {
            auto dst = node.get().host_id();
            auto dst_server = raft::server_id(dst.uuid());

            _as.check();

            // FIXME: if a node is down, completion status cannot be relied upon, but the average tablet size
            //  could still be inferred from the replicas available.

            auto timeout = netw::messaging_service::clock_type::now() + wait_for_live_nodes_timeout;

            abort_source as;
            auto request_abort = [&as] () mutable noexcept {
                as.request_abort();
            };
            auto t = timer<lowres_clock>(request_abort);
            t.arm(timeout);
            auto sub = _as.subscribe(request_abort);

            locator::load_stats node_stats;
            if (!_gossiper.is_alive(dst)) {
                if (_load_stats_per_node.contains(dst) &&
                        !utils::get_local_injector().enter("force_down_node_load_stats_invalid")) {
                    node_stats = _load_stats_per_node[dst];
                } else {
                    rtlogger.debug("raft topology: Unable to refresh table load on {} because it's down.", dst);
                    table_load_stats_invalid = true;
                    co_return;
                }
            } else if (_feature_service.tablet_load_stats_v2) {
                node_stats = co_await ser::storage_service_rpc_verbs::send_table_load_stats(&_messaging,
                                                                                            dst,
                                                                                            as,
                                                                                            dst_server);
            } else {
                node_stats = locator::load_stats::from_v1(
                    co_await ser::storage_service_rpc_verbs::send_table_load_stats_v1(&_messaging,
                                                                                      dst,
                                                                                      as,
                                                                                      dst_server));
            }

            _load_stats_per_node[dst] = node_stats;
            dc_stats += node_stats;
        });

        for (auto& [table_id, table_stats] : dc_stats.tables) {
            co_await coroutine::maybe_yield();

            if (!_db.column_family_exists(table_id)) {
                continue;
            }
            auto& t = _db.find_column_family(table_id);
            auto& rs = t.get_effective_replication_map()->get_replication_strategy();
            if (!rs.uses_tablets()) {
                continue;
            }
            const auto* nts_ptr = dynamic_cast<const locator::network_topology_strategy*>(&rs);
            if (!nts_ptr) {
                on_internal_error(rtlogger, "Cannot convert replication_strategy that uses tablets into network_topology_strategy");
            }

            auto rf_for_this_dc = nts_ptr->get_replication_factor(dc);
            if (rf_for_this_dc <= 0) {
                continue;
            }
            total_replicas[table_id] += rf_for_this_dc;
            rtlogger.debug("raft topology: Refreshed table load stats for DC {}, table={}, RF={}, size_in_bytes={}, split_ready_seq_number={}",
                          dc, table_id, rf_for_this_dc, table_stats.size_in_bytes, table_stats.split_ready_seq_number);
        }

        stats += dc_stats;
    }

    for (auto& [table_id, table_load_stats] : stats.tables) {
        if (!total_replicas.contains(table_id)) {
            continue;
        }
        auto table_total_replicas = total_replicas.at(table_id);
        if (table_total_replicas == 0) {
            continue;
        }
        // Takes into account the RF of each DC, so we can compute the average total size
        // for a single table replica. This allows the load balancer to compute, in turn,
        // the average tablet size by dividing total size by tablet count.
        table_load_stats.size_in_bytes /= table_total_replicas;
    }

    if (table_load_stats_invalid) {
        stats.tables.clear();
    }

    rtlogger.debug("raft topology: Refreshed table load stats for all DC(s).");

    _tablet_allocator.set_load_stats(make_lw_shared<const locator::load_stats>(std::move(stats)));
    _topo_sm.event.broadcast(); // wake up load balancer.
}

future<> topology_coordinator::start_tablet_load_stats_refresher() {
    auto can_proceed = [this] { return !_async_gate.is_closed() && !_as.abort_requested(); };
    while (can_proceed()) {
        bool sleep = true;
        try {
            co_await _tablet_load_stats_refresh.trigger();
        } catch (raft::request_aborted&) {
            rtlogger.debug("raft topology: Tablet load stats refresher aborted");
            sleep = false;
        } catch (seastar::abort_requested_exception&) {
            rtlogger.debug("raft topology: Tablet load stats refresher aborted");
            sleep = false;
        } catch (...) {
            rtlogger.warn("Found error while refreshing load stats for tablets: {}, retrying...", std::current_exception());
        }
        auto refresh_interval = std::chrono::seconds(_tablet_load_stats_refresh_interval_in_seconds.get());
        if (sleep && can_proceed()) {
            try {
                co_await seastar::sleep_abortable(refresh_interval, _as);
            } catch (...) {
                rtlogger.debug("raft topology: Tablet load stats refresher: sleep failed: {}", std::current_exception());
            }
        }
    }
}

future<> topology_coordinator::fence_previous_coordinator() {
    // Write empty change to make sure that a guard taken by any previous coordinator cannot
    // be used to do a successful write any more. Otherwise the following can theoretically happen
    // while a coordinator tries to execute RPC R and move to state S.
    // 1. Leader A executes topology RPC R
    // 2. Leader A takes guard G
    // 3. Leader A calls update_topology_state(S)
    // 4. Leadership moves to B (while update_topology_state is still running)
    // 5. B executed topology RPC R again
    // 6. while the RPC is running leadership moves to A again
    // 7. A completes update_topology_state(S)
    // Topology state machine moves to state S while RPC R is still running.
    // If RPC is idempotent that should not be a problem since second one executed by B will do nothing,
    // but better to be safe and cut off previous write attempt
    while (!_as.abort_requested()) {
        try {
            auto guard = co_await start_operation();
            topology_mutation_builder builder(guard.write_timestamp());
            co_await update_topology_state(std::move(guard), {builder.build()}, fmt::format("Starting new topology coordinator {}", _group0.group0_server().id()));
            break;
        } catch (group0_concurrent_modification&) {
            // If we failed to write because of concurrent modification lets retry
            continue;
        } catch (raft::request_aborted&) {
            // Abort was requested. Break the loop
            rtlogger.debug("request to fence previous coordinator was aborted");
            break;
        } catch (seastar::abort_requested_exception&) {
            // Abort was requested. Break the loop
            rtlogger.debug("request to fence previous coordinator was aborted");
            break;
        } catch (...) {
            rtlogger.error("failed to fence previous coordinator {}", std::current_exception());
        }
        try {
            co_await seastar::sleep_abortable(std::chrono::seconds(1), _as);
        } catch (abort_requested_exception&) {
            // Abort was requested. Break the loop
            break;
        } catch (...) {
            rtlogger.debug("sleep failed while fencing previous coordinator: {}", std::current_exception());
        }
    }
}

future<> topology_coordinator::rollback_current_topology_op(group0_guard&& guard) {
    rtlogger.info("start rolling back topology change");
    // Look for a node which operation should be aborted
    // (there should be one since we are in the rollback)
    node_to_work_on node = get_node_to_work_on(std::move(guard));
    topology::transition_state transition_state;

    switch (node.rs->state) {
        case node_state::bootstrapping:
            [[fallthrough]];
        case node_state::replacing:
            // To rollback bootstrap of replace just move the topology to left_token_ring. The node we tried to
            // add will be removed from the group0 by the transition handler. It will also be notified to shutdown.
            transition_state = topology::transition_state::left_token_ring;
            break;
        case node_state::removing: {
            auto id = node.id;
            // The node was removed already. We need to add it back. Lets do it as non voter.
            // If it ever boots again it will make itself a voter.
            release_node(std::move(node));
            co_await _group0.group0_server().modify_config({raft::config_member{{id, {}}, raft::is_voter::no}}, {}, &_as);
            node = retake_node(co_await start_operation(), id);
        }
            [[fallthrough]];
        case node_state::decommissioning:
            // To rollback decommission or remove just move the topology to rollback_to_normal.
            transition_state = topology::transition_state::rollback_to_normal;
            break;
        default:
            on_internal_error(rtlogger, fmt::format("tried to rollback in unsupported state {}", node.rs->state));
    }

    topology_mutation_builder builder(node.guard.write_timestamp());
    topology_request_tracking_mutation_builder rtbuilder(node.rs->request_id);
    builder.set_transition_state(transition_state)
           .set_version(_topo_sm._topology.version + 1);
    rtbuilder.set("error", fmt::format("Rolled back: {}", *_rollback));

    utils::chunked_vector<canonical_mutation> muts;
    // We are in the process of aborting remove or decommission which may have streamed some
    // ranges to other nodes. Cleanup is needed.
    muts = mark_nodes_as_cleanup_needed(node, true);
    muts.emplace_back(builder.build());
    muts.emplace_back(rtbuilder.build());

    std::string str = fmt::format("rollback {} after {} failure, moving transition state to {} and setting cleanup flag",
            node.id, node.rs->state, transition_state);
    rtlogger.info("{}", str);
    co_await update_topology_state(std::move(node.guard), std::move(muts), str);
}

bool topology_coordinator::handle_topology_coordinator_error(std::exception_ptr eptr) noexcept {
    try {
        std::rethrow_exception(std::move(eptr));
    } catch (raft::request_aborted&) {
        rtlogger.debug("topology change coordinator fiber aborted");
    } catch (seastar::abort_requested_exception&) {
        rtlogger.debug("topology change coordinator fiber aborted");
    } catch (raft::commit_status_unknown&) {
        rtlogger.warn("topology change coordinator fiber got commit_status_unknown");
    } catch (group0_concurrent_modification&) {
        rtlogger.info("topology change coordinator fiber got group0_concurrent_modification");
    } catch (term_changed_error&) {
        // Term changed. We may no longer be a leader
        rtlogger.debug("topology change coordinator fiber notices term change {} -> {}", _term, _raft.get_current_term());
    } catch (seastar::rpc::remote_verb_error&) {
        rtlogger.warn("topology change coordinator fiber got rpc::remote_verb_error {}", std::current_exception());
        return true;
    } catch (...) {
        rtlogger.error("topology change coordinator fiber got error {}", std::current_exception());
        return true;
    }
    return false;
}

future<> topology_coordinator::run() {
    auto abort = _as.subscribe([this] () noexcept {
        _topo_sm.event.broadcast();
    });

    co_await fence_previous_coordinator();
    auto cdc_generation_publisher = cdc_generation_publisher_fiber();
    auto cdc_streams_gc = cdc_streams_gc_fiber();
    auto tablet_load_stats_refresher = start_tablet_load_stats_refresher();
    auto gossiper_orphan_remover = gossiper_orphan_remover_fiber();
    auto group0_voter_refresher = group0_voter_refresher_fiber();
    auto vb_coordinator_fiber = run_view_building_coordinator();

    std::optional<future<>> event_wait;

    while (!_as.abort_requested()) {
        bool sleep = false;
        try {
            co_await utils::get_local_injector().inject("topology_coordinator_pause_before_processing_backlog", utils::wait_for_message(5min));

            if (!_topo_sm._topology.tstate && utils::get_local_injector().enter("tablet_load_stats_refresh_before_rebalancing")) {
                co_await _tablet_load_stats_refresh.trigger();
            }

            if (!event_wait) {
                event_wait = _topo_sm.event.wait();
            }

            auto guard = co_await cleanup_group0_config_if_needed(co_await start_operation());

            if (_rollback) {
                co_await rollback_current_topology_op(std::move(guard));
                _rollback = std::nullopt;
                continue;
            }

            bool had_work = co_await handle_topology_transition(std::move(guard));

            if (!had_work) {
                co_await utils::get_local_injector().inject("wait-before-topology-coordinator-goes-to-sleep", utils::wait_for_message(30s));

                // Nothing to work on. Wait for topology change event.
                rtlogger.debug("topology coordinator fiber has nothing to do. Sleeping.");
                _as.check();
                auto f = std::move(*event_wait);
                event_wait.reset();
                co_await std::move(f);
                rtlogger.debug("topology coordinator fiber got an event");
            }
            co_await utils::get_local_injector().inject("wait-after-topology-coordinator-gets-event", utils::wait_for_message(30s));
        } catch (...) {
            sleep = handle_topology_coordinator_error(std::current_exception());
        }
        if (sleep) {
            try {
                co_await seastar::sleep_abortable(std::chrono::seconds(1), _as);
            } catch (...) {
                rtlogger.debug("sleep failed: {}", std::current_exception());
            }
        }
        co_await coroutine::maybe_yield();
    }

    if (event_wait && event_wait->available()) {
        event_wait->ignore_ready_future();
    }

    co_await _async_gate.close();
    co_await std::move(tablet_load_stats_refresher);
    co_await std::move(cdc_generation_publisher);
    co_await std::move(cdc_streams_gc);
    co_await std::move(gossiper_orphan_remover);
    co_await std::move(group0_voter_refresher);
    co_await std::move(vb_coordinator_fiber);
    co_await _vb_coordinator->stop();
}

future<> topology_coordinator::stop() {
    co_await _db.get_notifier().unregister_listener(this);
    utils::get_local_injector().unregister_on_disable("delay_cdc_stream_finalization");
    _topo_sm.on_tablet_split_ready = nullptr;
    co_await _lifecycle_notifier.unregister_subscriber(this);
    co_await _tablet_load_stats_refresh.join();

    // if topology_coordinator::run() is aborted either because we are not a
    // leader anymore, or we are shutting down as a leader, we have to handle
    // futures in _tablets in case any of them failed, before these failures
    // are checked by future's destructor.
    co_await coroutine::parallel_for_each(_tablets, [] (auto& tablet) -> future<> {
        auto& [gid, tablet_state] = tablet;
        rtlogger.debug("Checking tablet migration state for {}", gid);

        auto stop_background_action = [] (background_action_holder& holder, locator::global_tablet_id gid, std::function<std::string()> fmt_desc) -> future<> {
            if (holder) {
                try {
                    co_await std::move(*holder);
                } catch (...) {
                    rtlogger.warn("Tablet '{}' migration failed {}: {}",
                                    gid, fmt_desc(), std::current_exception());
                }
            }
        };

        // we should have at most only a single active barrier for each tablet,
        // but let's check all of them because we never reset these holders
        // once they are added as barriers
        for (auto& [stage, barrier]: tablet_state.barriers) {
            SCYLLA_ASSERT(barrier.has_value());
            co_await stop_background_action(barrier, gid, [stage] { return format("at stage {}", tablet_transition_stage_to_string(stage)); });
        }

        co_await stop_background_action(tablet_state.streaming, gid, [] { return "during streaming"; });
        co_await stop_background_action(tablet_state.cleanup, gid, [] { return "during cleanup"; });
        co_await stop_background_action(tablet_state.rebuild_repair, gid, [] { return "during rebuild_repair"; });
        co_await stop_background_action(tablet_state.repair, gid, [] { return "during repair"; });
    });
}

future<> run_topology_coordinator(
        seastar::sharded<db::system_distributed_keyspace>& sys_dist_ks, gms::gossiper& gossiper,
        netw::messaging_service& messaging, locator::shared_token_metadata& shared_tm,
        db::system_keyspace& sys_ks, replica::database& db, service::raft_group0& group0,
        service::topology_state_machine& topo_sm, db::view::view_building_state_machine& vb_sm, seastar::abort_source& as, raft::server& raft,
        raft_topology_cmd_handler_type raft_topology_cmd_handler,
        tablet_allocator& tablet_allocator,
        cdc::generation_service& cdc_gens,
        std::chrono::milliseconds ring_delay,
        endpoint_lifecycle_notifier& lifecycle_notifier,
        gms::feature_service& feature_service,
        qos::service_level_controller& sl_controller,
        topology_coordinator_cmd_rpc_tracker& topology_cmd_rpc_tracker) {

    topology_coordinator coordinator{
            sys_dist_ks, gossiper, messaging, shared_tm,
            sys_ks, db, group0, topo_sm, vb_sm, as, raft,
            std::move(raft_topology_cmd_handler),
            tablet_allocator,
            cdc_gens,
            ring_delay,
            feature_service, lifecycle_notifier,
            sl_controller,
            topology_cmd_rpc_tracker};

    std::exception_ptr ex;
    try {
        rtlogger.info("start topology coordinator fiber");
        co_await with_scheduling_group(group0.get_scheduling_group(), [&] {
            return coordinator.run();
        });
    } catch (...) {
        ex = std::current_exception();
    }
    if (ex) {
        try {
            if (raft.is_leader()) {
                rtlogger.warn("unhandled exception in topology_coordinator::run: {}; stepping down as a leader", ex);
                const auto stepdown_timeout_ticks = std::chrono::seconds(5) / raft_tick_interval;
                co_await raft.stepdown(raft::logical_clock::duration(stepdown_timeout_ticks));
            }
        } catch (...) {
            rtlogger.error("failed to step down before aborting: {}", std::current_exception());
        }
        on_fatal_internal_error(rtlogger, format("unhandled exception in topology_coordinator::run: {}", ex));
    }
    co_await utils::get_local_injector().inject("topology_coordinator_pause_before_stop", utils::wait_for_message(5min));
    co_await coordinator.stop();
}

tablet_ops_metrics::tablet_ops_metrics() {
    namespace sm = seastar::metrics;
    auto ops_label_type = sm::label("kind");
    _metrics.add_group("tablet_ops", {
        sm::make_gauge("failed", [this] { return stats[locator::tablet_task_type::auto_repair].failed; },
                sm::description("Number of failed tablet auto repair"), {ops_label_type("auto_repair"), basic_level}),
        sm::make_gauge("succeeded", [this] { return stats[locator::tablet_task_type::auto_repair].succeeded ; },
                sm::description("Number of succeeded tablet auto repair"), {ops_label_type("auto_repair"), basic_level}),
        sm::make_gauge("failed", [this] { return stats[locator::tablet_task_type::user_repair].failed; },
                sm::description("Number of failed tablet user repair"), {ops_label_type("user_repair"), basic_level}),
        sm::make_gauge("succeeded", [this] { return stats[locator::tablet_task_type::user_repair].succeeded; },
                sm::description("Number of succeeded tablet user repair"), {ops_label_type("user_repair"), basic_level}),
    });
}

} // namespace service