mirror of
https://github.com/scylladb/scylladb.git
synced 2026-06-04 05:53:13 +00:00
Add a begin_read() method to raft_server that checks leadership for read operations.
Unlike begin_mutate(), it does not need to compute a timestamp or interact with leader_info.
It simply checks current_leader() and returns one of three dispositions:
- ok: this node is the leader, proceed with read_barrier() locally
- raft::not_a_leader: redirect to the indicated leader
- need_wait_for_leader: leader unknown, caller must wait and retry
This will be used by the read forwarding logic in subsequent commits.
236 lines
8.7 KiB
C++
236 lines
8.7 KiB
C++
/*
|
|
* Copyright (C) 2025-present ScyllaDB
|
|
*/
|
|
|
|
/*
|
|
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
|
|
*/
|
|
|
|
#pragma once
|
|
|
|
#include "locator/abstract_replication_strategy.hh"
|
|
#include "message/messaging_service.hh"
|
|
#include "service/raft/raft_group_registry.hh"
|
|
#include "cql3/query_processor.hh"
|
|
|
|
// Forward declarations: this header only holds references to these types,
// so their full definitions are not required here.
namespace db { class system_keyspace; }

namespace gms { class gossiper; }

namespace service { class migration_manager; }
|
|
|
|
namespace service::strong_consistency {
|
|
|
|
class raft_server;
|
|
|
|
/// A cache of leader locations for raft groups where this node is not a replica.
|
|
/// Populated by the CQL transport layer after a redirect reveals the actual leader.
|
|
///
|
|
/// Uses a sweep-based eviction strategy tied to token_metadata updates:
|
|
/// begin_sweep() before iterating tablets, mark_seen() for each existing group,
|
|
/// end_sweep() to evict entries whose groups no longer exist.
|
|
class tablet_group_leader_cache {
|
|
struct entry {
|
|
locator::host_id leader;
|
|
bool seen = false;
|
|
};
|
|
std::unordered_map<raft::group_id, entry> _entries;
|
|
|
|
public:
|
|
void put(raft::group_id group, locator::host_id leader) {
|
|
auto [it, inserted] = _entries.try_emplace(group, entry{leader});
|
|
if (!inserted) {
|
|
it->second.leader = leader;
|
|
}
|
|
}
|
|
|
|
std::optional<locator::host_id> get(raft::group_id group) const {
|
|
auto it = _entries.find(group);
|
|
if (it != _entries.end()) {
|
|
return it->second.leader;
|
|
}
|
|
return std::nullopt;
|
|
}
|
|
|
|
void erase(raft::group_id group) {
|
|
_entries.erase(group);
|
|
}
|
|
|
|
void begin_sweep() {
|
|
for (auto& [_, e] : _entries) {
|
|
e.seen = false;
|
|
}
|
|
}
|
|
|
|
void mark_seen(raft::group_id group) {
|
|
auto it = _entries.find(group);
|
|
if (it != _entries.end()) {
|
|
it->second.seen = true;
|
|
}
|
|
}
|
|
|
|
void end_sweep() {
|
|
std::erase_if(_entries, [](const auto& p) { return !p.second.seen; });
|
|
}
|
|
};
|
|
|
|
/// A sharded service responsible for the lifecycle and access
|
|
/// management of all Raft groups for strongly consistent tablets hosted on this node.
|
|
///
|
|
/// Listens for token_metadata updates to automatically start Raft servers for tablets newly
|
|
/// assigned to this node and schedule the deletion of Raft servers for tablets that have moved away.
|
|
///
|
|
/// It serves as the entry point for read and write requests via acquire_server() method. It is guaranteed
|
|
/// that the raft::server instance and its associated state managed by groups_manager cannot be
|
|
/// stopped or destroyed while the returned raft_server object is alive.
|
|
///
|
|
/// Runs a background fiber (leader_info_updater) per group that monitors the raft::server state
|
|
/// and computes the next write timestamp as soon as the server becomes leader.
|
|
/// This allows write requests to proceed without waiting for read_barrier(),
|
|
/// which would otherwise be needed to compute the timestamp.
|
|
class groups_manager : public peering_sharded_service<groups_manager> {
|
|
class state_machine_impl;
|
|
class rpc_impl;
|
|
|
|
friend class raft_server;
|
|
|
|
struct leader_info {
|
|
// The Raft term this structure describes.
|
|
raft::term_t term;
|
|
|
|
// The last timestamp used for mutations in this term.
|
|
api::timestamp_type last_timestamp;
|
|
};
|
|
|
|
struct raft_group_state : public boost::intrusive::list_base_hook<boost::intrusive::link_mode<boost::intrusive::auto_unlink>> {
|
|
bool has_tablet = false;
|
|
lw_shared_ptr<gate> gate = nullptr;
|
|
raft::server* server = nullptr;
|
|
|
|
// Serialized chain of raft::server control operations (start/stop).
|
|
// This serialization handles (rare) cases where a tablet is migrated out
|
|
// before the raft::server has finished initializing, or conversely,
|
|
// when a tablet is migrated back to this node before deinitialization completes.
|
|
// Subsequent operations wait for the previous one to complete.
|
|
shared_future<> server_control_op = make_ready_future<>();
|
|
|
|
// Populated only when this node thinks it's a tablet raft group leader.
|
|
std::optional<leader_info> leader_info = std::nullopt;
|
|
condition_variable leader_info_cond = condition_variable();
|
|
future<> leader_info_updater = make_ready_future<>();
|
|
};
|
|
|
|
netw::messaging_service& _ms;
|
|
raft_group_registry& _raft_gr;
|
|
cql3::query_processor& _qp;
|
|
replica::database& _db;
|
|
service::migration_manager& _mm;
|
|
db::system_keyspace& _sys_ks;
|
|
gms::feature_service& _features;
|
|
gms::gossiper& _gossiper;
|
|
std::unordered_map<raft::group_id, raft_group_state> _raft_groups = {};
|
|
boost::intrusive::list<raft_group_state, boost::intrusive::constant_time_size<false>> _starting_groups;
|
|
locator::token_metadata_ptr _pending_tm = nullptr;
|
|
bool _started = false;
|
|
|
|
tablet_group_leader_cache _leader_cache;
|
|
|
|
// Should be called on the shard that hosts the Raft group
|
|
future<> start_raft_group(locator::global_tablet_id tablet,
|
|
raft::group_id group_id,
|
|
locator::token_metadata_ptr tm);
|
|
|
|
void schedule_raft_group_deletion(raft::group_id group_id, raft_group_state& group_state);
|
|
|
|
void schedule_raft_groups_deletion(bool all);
|
|
|
|
future<> leader_info_updater(raft_group_state& state, locator::global_tablet_id tablet, raft::group_id gid);
|
|
|
|
void init_messaging_service();
|
|
future<> uninit_messaging_service();
|
|
|
|
public:
|
|
groups_manager(netw::messaging_service& ms, raft_group_registry& raft_gr,
|
|
cql3::query_processor& qp, replica::database& _db, service::migration_manager& mm, db::system_keyspace& sys_ks,
|
|
gms::feature_service& features, gms::gossiper& gossiper);
|
|
|
|
// Called whenever a new token_metadata is published on this shard.
|
|
// Starts raft::server instances for all strongly consistent tablets now
|
|
// residing on this shard, and schedules removal of servers for tablets
|
|
// that have moved away.
|
|
//
|
|
// Note that the method is synchronous: it only initiates these operations
|
|
// and does not wait for their completion.
|
|
void update(locator::token_metadata_ptr new_tm);
|
|
|
|
// The raft_server instance is used to submit write commands and perform read_barrier() before reads.
|
|
future<raft_server> acquire_server(table_id table_id, raft::group_id group_id, abort_source& as);
|
|
|
|
// Called during node boot. Starts all raft::server instances corresponding
|
|
// to the latest group0 state in the background.
|
|
void start();
|
|
|
|
// Called during node shutdown. Waits for all raft::server instances to stop.
|
|
future<> stop();
|
|
|
|
future<> wait_for_groups_to_start(lowres_clock::time_point timeout);
|
|
|
|
// Sends an RPC to every host that holds a tablet replica of the given table, asking it to wait
|
|
// until the raft groups for those tablets are started and ready to serve queries.
|
|
// For the local node, waits directly without an RPC.
|
|
future<> wait_for_table_raft_groups_on_all_hosts(table_id table, lowres_clock::time_point timeout);
|
|
|
|
tablet_group_leader_cache& leader_cache() { return _leader_cache; }
|
|
};
|
|
|
|
/// A temporary, RAII-style handle to an active Raft group server instance,
|
|
/// used to safely submit commands or perform consistency barriers.
|
|
///
|
|
/// The holder guarantees that the underlying raft::server and its associated state
|
|
/// managed by groups_manager cannot be stopped or destroyed while this raft_server object is alive.
|
|
/// It ensures that even if a topology change triggers the deletion of the Raft group,
|
|
/// the shutdown sequence will wait until this handle is destroyed, preventing use-after-free
|
|
/// errors during ongoing operations.
|
|
class raft_server {
|
|
private:
|
|
groups_manager::raft_group_state& _state;
|
|
gate::holder _holder;
|
|
|
|
public:
|
|
raft_server(groups_manager::raft_group_state& state, gate::holder holder);
|
|
|
|
raft::server& server() {
|
|
return *_state.server;
|
|
}
|
|
|
|
// Possible results:
|
|
// timestamp_with_term - timestamp to use for a new mutation request
|
|
// raft::not_a_leader - this node is not a leader
|
|
// need_wait_for_leader - the caller needs to wait on the specified future and then retry `begin_mutate`
|
|
struct timestamp_with_term {
|
|
api::timestamp_type timestamp;
|
|
raft::term_t term;
|
|
};
|
|
struct need_wait_for_leader {
|
|
future<> future;
|
|
};
|
|
using begin_mutate_result = std::variant<timestamp_with_term, raft::not_a_leader, need_wait_for_leader>;
|
|
begin_mutate_result begin_mutate(abort_source&);
|
|
|
|
// Possible results:
|
|
// ok - this node is the leader, proceed with read_barrier() locally
|
|
// raft::not_a_leader - this node is not a leader, redirect to the leader
|
|
// need_wait_for_leader - the leader is unknown, the caller needs to wait and retry
|
|
struct ok {};
|
|
using begin_read_result = std::variant<ok, raft::not_a_leader, need_wait_for_leader>;
|
|
begin_read_result begin_read(abort_source&);
|
|
};
|
|
|
|
}
|