scylladb/service/strong_consistency/groups_manager.hh

/*
 * Copyright (C) 2025-present ScyllaDB
 */

/*
 * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
 */

#pragma once

#include "locator/abstract_replication_strategy.hh"
#include "message/messaging_service.hh"
#include "service/raft/raft_group_registry.hh"
#include "cql3/query_processor.hh"

namespace db {
class system_keyspace;
}

namespace gms {
class gossiper;
}

namespace service {
class migration_manager;
}

namespace service::strong_consistency {

class raft_server;

/// Remembers, per raft group, which host was last reported as the group's leader.
/// Meant for raft groups where this node is not a replica; entries are populated
/// by the CQL transport layer after a redirect reveals the actual leader.
///
/// Stale entries are dropped by a sweep tied to token_metadata updates:
///   1. begin_sweep() clears the "seen" mark on every entry,
///   2. mark_seen() is called for each group that still exists,
///   3. end_sweep() evicts every entry that was left unmarked.
class tablet_group_leader_cache {
    struct entry {
        locator::host_id leader;
        // Set by mark_seen(); entries still unmarked at end_sweep() are evicted.
        bool seen = false;
    };
    std::unordered_map<raft::group_id, entry> _entries;

public:
    /// Records `leader` as the leader of `group`.
    /// An existing entry only has its leader replaced (its sweep mark is kept);
    /// a new entry starts unmarked.
    void put(raft::group_id group, locator::host_id leader) {
        const auto result = _entries.try_emplace(group, entry{leader});
        if (result.second) {
            // Freshly inserted with the requested leader, nothing left to do.
            return;
        }
        result.first->second.leader = leader;
    }

    /// Returns the cached leader of `group`, or std::nullopt if nothing is cached.
    std::optional<locator::host_id> get(raft::group_id group) const {
        const auto found = _entries.find(group);
        if (found == _entries.end()) {
            return std::nullopt;
        }
        return found->second.leader;
    }

    /// Forgets the cached leader of `group`; a no-op if there is no such entry.
    void erase(raft::group_id group) {
        _entries.erase(group);
    }

    /// Starts a sweep: every entry becomes a candidate for eviction.
    void begin_sweep() {
        for (auto& kv : _entries) {
            kv.second.seen = false;
        }
    }

    /// Marks `group` as still existing so that end_sweep() keeps its entry.
    /// Groups without a cached entry are ignored (no entry is created).
    void mark_seen(raft::group_id group) {
        if (auto found = _entries.find(group); found != _entries.end()) {
            found->second.seen = true;
        }
    }

    /// Finishes a sweep: removes every entry that was not marked via mark_seen().
    void end_sweep() {
        std::erase_if(_entries, [](const auto& kv) {
            const bool still_exists = kv.second.seen;
            return !still_exists;
        });
    }
};

/// A sharded service responsible for the lifecycle and access
/// management of all Raft groups for strongly consistent tablets hosted on this node.
///
/// Listens for token_metadata updates to automatically start Raft servers for tablets newly
/// assigned to this node and schedule the deletion of Raft servers for tablets that have moved away.
///
/// It serves as the entry point for read and write requests via the acquire_server() method.
/// It is guaranteed that the raft::server instance and its associated state managed by
/// groups_manager cannot be stopped or destroyed while the returned raft_server object is alive.
///
/// Runs a background fiber (leader_info_updater) per group that monitors the raft::server state
/// and computes the next write timestamp as soon as the server becomes leader.
/// This allows write requests to proceed without waiting for read_barrier(),
/// which would otherwise be needed to compute the timestamp.
class groups_manager : public peering_sharded_service<groups_manager> {
    class state_machine_impl;
    class rpc_impl;

    // raft_server wraps a raft_group_state, which is a private nested type.
    friend class raft_server;

    struct leader_info {
        // The Raft term this structure describes.
        raft::term_t term;

        // The last timestamp used for mutations in this term.
        api::timestamp_type last_timestamp;
    };

    // Per-group bookkeeping. The auto_unlink base hook lets a state be linked into
    // _starting_groups and remove itself from that list when it is destroyed.
    struct raft_group_state : public boost::intrusive::list_base_hook<boost::intrusive::link_mode<boost::intrusive::auto_unlink>> {
        // NOTE(review): presumably true while the latest token_metadata places the
        // tablet on this shard — confirm against update().
        bool has_tablet = false;
        // NOTE(review): presumably the gate whose holders are handed out through
        // raft_server to keep the server alive — confirm against acquire_server().
        lw_shared_ptr<gate> gate = nullptr;
        // Non-owning pointer to the group's raft::server; null until one is set.
        raft::server* server = nullptr;

        // Serialized chain of raft::server control operations (start/stop).
        // This serialization handles (rare) cases where a tablet is migrated out
        // before the raft::server has finished initializing, or conversely,
        // when a tablet is migrated back to this node before deinitialization completes.
        // Subsequent operations wait for the previous one to complete.
        shared_future<> server_control_op = make_ready_future<>();

        // Populated only when this node thinks it's a tablet raft group leader.
        // NOTE(review): the member reuses the name of its own type; clang accepts this,
        // but GCC diagnoses it as "changes meaning" — keep in mind if GCC builds matter.
        std::optional<leader_info> leader_info = std::nullopt;
        // NOTE(review): presumably signalled when leader_info changes — confirm.
        condition_variable leader_info_cond = condition_variable();
        // Future of the per-group background fiber described in the class comment.
        future<> leader_info_updater = make_ready_future<>();
    };

    // External services, held by reference: they must outlive this object.
    netw::messaging_service& _ms;
    raft_group_registry& _raft_gr;
    cql3::query_processor& _qp;
    replica::database& _db;
    service::migration_manager& _mm;
    db::system_keyspace& _sys_ks;
    gms::feature_service& _features;
    gms::gossiper& _gossiper;
    // State of every raft group known to this instance, keyed by group id.
    std::unordered_map<raft::group_id, raft_group_state> _raft_groups = {};
    // Non-owning list of group states; constant_time_size<false> is required by auto_unlink hooks.
    // NOTE(review): presumably holds the groups whose servers are still starting — confirm.
    boost::intrusive::list<raft_group_state, boost::intrusive::constant_time_size<false>> _starting_groups;
    // NOTE(review): presumably the token_metadata received before start() was called — confirm.
    locator::token_metadata_ptr _pending_tm = nullptr;
    bool _started = false;

    tablet_group_leader_cache _leader_cache;

    // Should be called on the shard that hosts the Raft group
    future<> start_raft_group(locator::global_tablet_id tablet,
        raft::group_id group_id,
        locator::token_metadata_ptr tm);

    void schedule_raft_group_deletion(raft::group_id group_id, raft_group_state& group_state);

    // NOTE(review): `all` presumably selects every group rather than only the groups
    // whose tablets have moved away — confirm against the definition.
    void schedule_raft_groups_deletion(bool all);

    // Body of the per-group background fiber described in the class comment.
    future<> leader_info_updater(raft_group_state& state, locator::global_tablet_id tablet, raft::group_id gid);

    void init_messaging_service();
    future<> uninit_messaging_service();

public:
    // All arguments are services kept by reference for the lifetime of this object.
    groups_manager(netw::messaging_service& ms, raft_group_registry& raft_gr,
        cql3::query_processor& qp, replica::database& db, service::migration_manager& mm, db::system_keyspace& sys_ks,
        gms::feature_service& features, gms::gossiper& gossiper);

    // Called whenever a new token_metadata is published on this shard.
    // Starts raft::server instances for all strongly consistent tablets now
    // residing on this shard, and schedules removal of servers for tablets
    // that have moved away.
    //
    // Note that the method is synchronous: it only initiates these operations
    // and does not wait for their completion.
    void update(locator::token_metadata_ptr new_tm);

    // The raft_server instance is used to submit write commands and perform read_barrier() before reads.
    // While the returned handle is alive, the underlying raft::server cannot be stopped or destroyed.
    future<raft_server> acquire_server(table_id table_id, raft::group_id group_id, abort_source& as);

    // Called during node boot. Starts all raft::server instances corresponding
    // to the latest group0 state in the background.
    void start();

    // Called during node shutdown. Waits for all raft::server instances to stop.
    future<> stop();

    // NOTE(review): presumably resolves once the groups that are currently starting
    // have finished, and fails once `timeout` passes — confirm against the definition.
    future<> wait_for_groups_to_start(lowres_clock::time_point timeout);

    // Sends an RPC to every host that holds a tablet replica of the given table, asking it to wait
    // until the raft groups for those tablets are started and ready to serve queries.
    // For the local node, waits directly without an RPC.
    future<> wait_for_table_raft_groups_on_all_hosts(table_id table, lowres_clock::time_point timeout);

    // Mutable access to this instance's leader location cache.
    tablet_group_leader_cache& leader_cache() { return _leader_cache; }
};

/// A temporary, RAII-style handle to an active Raft group server instance,
/// used to safely submit commands or perform consistency barriers.
///
/// The holder guarantees that the underlying raft::server and its associated state
/// managed by groups_manager cannot be stopped or destroyed while this raft_server object is alive.
/// It ensures that even if a topology change triggers the deletion of the Raft group,
/// the shutdown sequence will wait until this handle is destroyed, preventing use-after-free
/// errors during ongoing operations.
class raft_server {
private:
    // State of the group this handle refers to; it is referenced, not owned.
    groups_manager::raft_group_state& _state;
    // Gate holder kept for the whole lifetime of the handle.
    gate::holder _holder;

public:
    raft_server(groups_manager::raft_group_state& state, gate::holder holder);

    // Returns the group's raft::server.
    // The pointer is dereferenced without a null check; this assumes the server is
    // set for as long as the handle exists — TODO confirm against acquire_server().
    raft::server& server() {
        return *_state.server;
    }

    // Possible results of begin_mutate():
    //   timestamp_with_term - timestamp to use for a new mutation request
    //   raft::not_a_leader - this node is not a leader
    //   need_wait_for_leader - the caller needs to wait on the specified future and then retry `begin_mutate`
    struct timestamp_with_term {
        api::timestamp_type timestamp;
        raft::term_t term;
    };
    // Shared by begin_mutate() and begin_read(): wait on `future`, then retry the call.
    struct need_wait_for_leader {
        future<> future;
    };
    using begin_mutate_result = std::variant<timestamp_with_term, raft::not_a_leader, need_wait_for_leader>;
    begin_mutate_result begin_mutate(abort_source&);

    // Possible results of begin_read():
    //   ok - this node is the leader, proceed with read_barrier() locally
    //   raft::not_a_leader - this node is not a leader, redirect to the leader
    //   need_wait_for_leader - the leader is unknown, the caller needs to wait and retry
    struct ok {};
    using begin_read_result = std::variant<ok, raft::not_a_leader, need_wait_for_leader>;
    begin_read_result begin_read(abort_source&);
};

}