Files
scylladb/service/raft/raft_group0_client.cc
Avi Kivity 0ae22a09d4 LICENSE: Update to version 1.1
Updated terms of non-commercial use (must be a never-customer).
2026-04-12 19:46:33 +03:00

527 lines
23 KiB
C++

/*
* Copyright (C) 2022-present ScyllaDB
*
* Modified by ScyllaDB
*/
/*
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
*/
#include <optional>
#include <seastar/core/coroutine.hh>
#include <seastar/core/when_any.hh>
#include <seastar/coroutine/parallel_for_each.hh>
#include "raft_group0_client.hh"
#include "raft_group_registry.hh"
#include "schema/frozen_schema.hh"
#include "schema/schema_mutations.hh"
#include "service/broadcast_tables/experimental/lang.hh"
#include "idl/experimental/broadcast_tables_lang.dist.hh"
#include "idl/experimental/broadcast_tables_lang.dist.impl.hh"
#include "idl/group0_state_machine.dist.hh"
#include "idl/group0_state_machine.dist.impl.hh"
#include "service/raft/group0_state_machine.hh"
#include "replica/database.hh"
#include "utils/assert.hh"
#include "utils/to_string.hh"
#include "db/system_keyspace.hh"
#include "replica/tablets.hh"
#include "gms/gossiper.hh"
namespace service {
static logging::logger logger("group0_client");
/* *** Linearizing group 0 operations ***
*
* Group 0 changes (e.g. schema changes) are performed through Raft commands, which are executing in the same order
* on every node, according to the order they appear in the Raft log
* (executing a command happens in `group0_state_machine::apply`).
* The commands contain mutations which modify tables that store group 0 state.
*
* However, constructing these mutations often requires reading the current state and validating the change against it.
* This happens outside the code which applies the commands in order and may race with it. At the moment of applying
* a command, the mutations stored within may be 'invalid' because a different command managed to be concurrently applied,
* changing the state.
*
* For example, consider the sequence of commands:
*
* C1, C2, C3.
*
* Suppose that mutations inside C2 were constructed on a node which already applied C1. Thus, when applying C2,
* the state of group 0 is the same as when the change was validated and its mutations were constructed.
*
* On the other hand, suppose that mutations inside C3 were also constructed on a node which applied C1, but didn't
* apply C2 yet. This could easily happen e.g. when C2 and C3 were constructed concurrently on two different nodes.
* Thus, when applying C3, the state of group 0 is different than it was when validating the change and constructing
* its mutations: the state consists of the changes from C1 and C2, but when C3 was created, it used the state consisting
* of changes from C1 (but not C2). Thus the mutations in C3 are not valid and we must not apply them.
*
* To protect ourselves from applying such 'obsolete' changes, we detect such commands during `group0_state_machine:apply`
* and skip their mutations.
*
* For this, group 0 state was extended with a 'history table' (system.group0_history), which stores a sequence of
* 'group 0 state IDs' (which are timeuuids). Each group 0 command also holds a unique state ID; if the command is successful,
* the ID is appended to the history table. Each command also stores a 'previous state ID'; the change described by the command
* is only applied when this 'previous state ID' is equal to the last state ID in the history table. If it's different,
* we skip the change.
*
* To perform a group 0 change the user must first read the last state ID from the history table. This happens by obtaining
* a `group0_guard` through `migration_manager::start_group0_operation`; the observed last state ID is stored in
* `_observed_group0_state_id`. `start_group0_operation` also generates a new state ID for this change and stores it in
* `_new_group0_state_id`. We ensure that the new state ID is greater than the observed state ID (in timeuuid order).
*
* The user then reads group 0 state, validates the change against the observed state, and constructs the mutations
* which modify group 0 state. Finally, the user calls `announce`, passing the mutations and the guard.
*
* `announce` constructs a command for the group 0 state machine. The command stores the mutations and the state IDs.
*
* When the command is applied, we compare the stored observed state ID against the last state ID in the history table.
* If it's the same, that means no change happened in between - no other command managed to 'sneak in' between the moment
* the user started the operation and the moment the command was applied.
*
* The user must use `group0_guard::write_timestamp()` when constructing the mutations. The timestamp is extracted
* from the new state ID. This ensures that mutations applied by successful commands have monotonic timestamps.
* Indeed: the state IDs of successful commands are increasing (the previous state ID of a command that is successful
* is equal to the new state ID of the previous successful command, and we ensure that the new state ID of a command
* is greater than the previous state ID of this command).
*
* To perform a linearized group 0 read the user must also obtain a `group0_guard`. This ensures that all previously
* completed changes are visible on this node, as obtaining the guard requires performing a Raft read barrier.
*
* Furthermore, obtaining the guard ensures that we don't read partial state, since it holds a lock that is also taken
* during command application (`_read_apply_mutex_holder`). The lock is released just before sending the command to Raft.
*
* Obtaining the guard also ensures that there is no concurrent group 0 operation running on this node using another lock
* (`_operation_mutex_holder`); if we allowed multiple concurrent operations to run, some of them could fail
* due to the state ID protection. Concurrent operations may still run on different nodes. This lock is thus used
* for improving liveness of operations running on the same node by serializing them.
*/
struct group0_guard::impl {
semaphore_units<> _operation_mutex_holder;
semaphore_units<> _read_apply_mutex_holder;
utils::UUID _observed_group0_state_id;
utils::UUID _new_group0_state_id;
impl(const impl&) = delete;
impl& operator=(const impl&) = delete;
impl(semaphore_units<> operation_mutex_holder, semaphore_units<> read_apply_mutex_holder, utils::UUID observed_group0_state_id, utils::UUID new_group0_state_id)
: _operation_mutex_holder(std::move(operation_mutex_holder)), _read_apply_mutex_holder(std::move(read_apply_mutex_holder))
, _observed_group0_state_id(observed_group0_state_id), _new_group0_state_id(new_group0_state_id)
{}
void release_read_apply_mutex() {
SCYLLA_ASSERT(_read_apply_mutex_holder.count() == 1);
_read_apply_mutex_holder.return_units(1);
}
};
group0_guard::group0_guard(std::unique_ptr<impl> p) : _impl(std::move(p)) {}
group0_guard::~group0_guard() = default;
group0_guard::group0_guard(group0_guard&&) noexcept = default;
group0_guard& group0_guard::operator=(group0_guard&&) noexcept = default;
utils::UUID group0_guard::observed_group0_state_id() const {
return _impl->_observed_group0_state_id;
}
utils::UUID group0_guard::new_group0_state_id() const {
return _impl->_new_group0_state_id;
}
api::timestamp_type group0_guard::write_timestamp() const {
return utils::UUID_gen::micros_timestamp(_impl->_new_group0_state_id);
}
void release_guard(group0_guard guard) {}
gc_clock::duration raft_group0_client::get_history_gc_duration() const {
return _history_gc_duration;
}
void raft_group0_client::set_history_gc_duration(gc_clock::duration d) {
_history_gc_duration = d;
}
semaphore& raft_group0_client::operation_mutex() {
return _operation_mutex;
}
future<> raft_group0_client::add_entry(group0_command group0_cmd, group0_guard guard, seastar::abort_source& as,
std::optional<raft_timeout> timeout)
{
if (this_shard_id() != 0) {
// This should not happen since all places which construct `group0_guard` also check that they are on shard 0.
// Note: `group0_guard::impl` is private to this module, making this easy to verify.
on_internal_error(logger, "add_entry: must run on shard 0");
}
auto new_group0_state_id = guard.new_group0_state_id();
co_await [&, guard = std::move(guard)] () -> future<> { // lambda is needed to limit guard's lifetime
raft::command cmd;
ser::serialize(cmd, group0_cmd);
// Release the read_apply mutex so `group0_state_machine::apply` can take it.
guard._impl->release_read_apply_mutex();
bool retry;
do {
retry = false;
try {
co_await _raft_gr.group0_with_timeouts().add_entry(cmd, raft::wait_type::applied, as, timeout);
} catch (const raft::dropped_entry& e) {
logger.warn("add_entry: returned \"{}\". Retrying the command (prev_state_id: {}, new_state_id: {})",
e, group0_cmd.prev_state_id, group0_cmd.new_state_id);
retry = true;
} catch (const raft::commit_status_unknown& e) {
logger.warn("add_entry: returned \"{}\". Retrying the command (prev_state_id: {}, new_state_id: {})",
e, group0_cmd.prev_state_id, group0_cmd.new_state_id);
retry = true;
} catch (const raft::not_a_leader& e) {
// This should not happen since follower-to-leader entry forwarding is enabled in group 0.
// Just fail the operation by propagating the error.
logger.error("add_entry: unexpected `not_a_leader` error: \"{}\". Please file an issue.", e);
throw;
}
// Thanks to the `prev_state_id` check in `group0_state_machine::apply`, the command is idempotent.
// It's safe to retry it, even if it means it will be applied multiple times; only the first time
// can have an effect.
} while (retry);
// dropping the guard releases `_group0_operation_mutex`, allowing other operations
// on this node to proceed
} ();
if (!(co_await _sys_ks.group0_history_contains(new_group0_state_id))) {
// The command was applied but the history table does not contain the new group 0 state ID.
// This means `apply` skipped the change due to previous state ID mismatch.
throw group0_concurrent_modification{};
}
}
future<> raft_group0_client::add_entry_unguarded(group0_command group0_cmd, seastar::abort_source* as) {
if (this_shard_id() != 0) {
on_internal_error(logger, "add_entry_unguarded: must run on shard 0");
}
raft::command cmd;
ser::serialize(cmd, group0_cmd);
// Command is not retried, because for now it's not idempotent.
try {
co_await _raft_gr.group0().add_entry(cmd, raft::wait_type::applied, as);
} catch (const raft::not_a_leader& e) {
// This should not happen since follower-to-leader entry forwarding is enabled in group 0.
// Just fail the operation by propagating the error.
logger.error("add_entry_unguarded: unexpected `not_a_leader` error: \"{}\". Please file an issue.", e);
throw;
}
}
utils::UUID raft_group0_client::generate_group0_state_id(utils::UUID prev_state_id) {
auto ts = api::new_timestamp();
if (prev_state_id != utils::UUID{}) {
auto lower_bound = utils::UUID_gen::micros_timestamp(prev_state_id);
if (ts <= lower_bound) {
ts = lower_bound + 1;
}
}
return utils::UUID_gen::get_random_time_UUID_from_micros(std::chrono::microseconds{ts});
}
future<utils::UUID> raft_group0_client::get_last_group0_state_id() {
return _sys_ks.get_last_group0_state_id();
}
future<group0_guard> raft_group0_client::start_operation(seastar::abort_source& as, std::optional<raft_timeout> timeout) {
if (this_shard_id() != 0) {
on_internal_error(logger, "start_group0_operation: must run on shard 0");
}
if (_maintenance_mode) {
throw exceptions::configuration_exception{"cannot start group0 operation in the maintenance mode"};
}
auto operation_holder = co_await get_units(_operation_mutex, 1, as);
co_await _raft_gr.group0_with_timeouts().read_barrier(&as, timeout);
// Take `_group0_read_apply_mutex` *after* read barrier.
// Read barrier may wait for `group0_state_machine::apply` which also takes this mutex.
auto read_apply_holder = co_await hold_read_apply_mutex(as);
auto observed_group0_state_id = co_await get_last_group0_state_id();
auto new_group0_state_id = generate_group0_state_id(observed_group0_state_id);
co_return group0_guard {
std::make_unique<group0_guard::impl>(
std::move(operation_holder),
std::move(read_apply_holder),
observed_group0_state_id,
new_group0_state_id
)
};
}
template<typename Command>
requires std::same_as<Command, topology_change> || std::same_as<Command, mixed_change>
void raft_group0_client::validate_change(const Command& change) {
replica::validate_tablet_metadata_change(_token_metadata.get()->tablets(), change.mutations);
}
template<typename Command>
requires std::same_as<Command, schema_change> || std::same_as<Command, topology_change> || std::same_as<Command, write_mutations> || std::same_as<Command, mixed_change>
group0_command raft_group0_client::prepare_command(Command change, group0_guard& guard, std::string_view description) {
validate_change(change);
group0_command group0_cmd {
.change{std::move(change)},
.history_append{db::system_keyspace::make_group0_history_state_id_mutation(
guard.new_group0_state_id(), _history_gc_duration, description)},
// IMPORTANT: the retry mechanism below assumes that `prev_state_id` is engaged (not nullopt).
// Here it is: the return type of `guard.observerd_group0_state_id()` is `utils::UUID`.
.prev_state_id{guard.observed_group0_state_id()},
.new_state_id{guard.new_group0_state_id()},
.creator_addr{_sys_ks.local_db().get_token_metadata().get_topology().my_address()},
.creator_id{_raft_gr.group0().id()}
};
return group0_cmd;
}
template<typename Command>
requires std::same_as<Command, broadcast_table_query> || std::same_as<Command, write_mutations>
group0_command raft_group0_client::prepare_command(Command change, std::string_view description) {
validate_change(change);
const auto new_group0_state_id = generate_group0_state_id(utils::UUID{});
group0_command group0_cmd {
.change{std::move(change)},
.history_append{db::system_keyspace::make_group0_history_state_id_mutation(
new_group0_state_id, _history_gc_duration, description)},
.prev_state_id{std::nullopt},
.new_state_id{new_group0_state_id},
.creator_addr{_sys_ks.local_db().get_token_metadata().get_topology().my_address()},
.creator_id{_raft_gr.group0().id()}
};
return group0_cmd;
}
raft_group0_client::raft_group0_client(service::raft_group_registry& raft_gr, gms::gossiper& gossiper,
db::system_keyspace& sys_ks, locator::shared_token_metadata& tm, maintenance_mode_enabled maintenance_mode)
: _raft_gr(raft_gr), _gossiper(gossiper), _sys_ks(sys_ks), _token_metadata(tm), _maintenance_mode(maintenance_mode) {
}
size_t raft_group0_client::max_command_size() const {
return _raft_gr.group0().max_command_size();
}
bool raft_group0_client::maintenance_mode() const {
return _maintenance_mode == maintenance_mode_enabled::yes;
}
future<semaphore_units<>> raft_group0_client::hold_read_apply_mutex(abort_source& as) {
if (this_shard_id() != 0) {
on_internal_error(logger, "hold_read_apply_mutex: must run on shard 0");
}
return get_units(_read_apply_mutex, 1, as);
}
raft_group0_client::query_result_guard::query_result_guard(utils::UUID query_id, raft_group0_client& client)
: _query_id{query_id}, _client{&client} {
auto [_, emplaced] = _client->_results.emplace(_query_id, std::nullopt);
if (!emplaced) {
on_internal_error(logger, "query_result_guard::query_result_guard: there is another query_result_guard alive with the same query_id");
}
}
raft_group0_client::query_result_guard::query_result_guard(raft_group0_client::query_result_guard&& other)
: _query_id{other._query_id}, _client{other._client} {
other._client = nullptr;
}
raft_group0_client::query_result_guard::~query_result_guard() {
if (_client != nullptr) {
_client->_results.erase(_query_id);
}
}
service::broadcast_tables::query_result raft_group0_client::query_result_guard::get() {
auto it = _client->_results.find(_query_id);
if (it == _client->_results.end() || !it->second.has_value()) {
on_internal_error(logger, "query_result_guard::get: no result");
}
return std::move(*it->second);
}
raft_group0_client::query_result_guard raft_group0_client::create_result_guard(utils::UUID query_id) {
return query_result_guard{query_id, *this};
}
void raft_group0_client::set_query_result(utils::UUID query_id, service::broadcast_tables::query_result qr) {
auto it = _results.find(query_id);
if (it != _results.end()) {
it->second = std::move(qr);
}
}
template void raft_group0_client::validate_change(const topology_change& change);
template void raft_group0_client::validate_change(const mixed_change& change);
template group0_command raft_group0_client::prepare_command(schema_change change, group0_guard& guard, std::string_view description);
template group0_command raft_group0_client::prepare_command(topology_change change, group0_guard& guard, std::string_view description);
template group0_command raft_group0_client::prepare_command(write_mutations change, group0_guard& guard, std::string_view description);
template group0_command raft_group0_client::prepare_command(broadcast_table_query change, std::string_view description);
template group0_command raft_group0_client::prepare_command(write_mutations change, std::string_view description);
template group0_command raft_group0_client::prepare_command(mixed_change change, group0_guard& guard, std::string_view description);
future<> raft_group0_client::send_group0_read_barrier_to_live_members() {
auto my_id = _raft_gr.get_my_raft_id();
auto live_members = _gossiper.get_live_members();
logger.debug("broadcast_group0_read_barrier: sending read barrier to {} live node(s)", live_members.size());
auto gid = _raft_gr.group0_id();
co_await coroutine::parallel_for_each(live_members, [&] (locator::host_id host) -> future<> {
if (host.uuid() == my_id.uuid()) {
co_return; // skip self, already applied locally
}
try {
auto dst = raft::server_id{host.uuid()};
co_await _raft_gr.send_raft_read_barrier(gid, dst);
} catch (...) {
static thread_local logger::rate_limit rate_limit{std::chrono::seconds(5)};
logger.log(log_level::warn, rate_limit,
"broadcast_group0_read_barrier: failed to complete read barrier on node {}: {}",
host, std::current_exception());
}
});
}
group0_batch::group0_batch(::service::group0_guard&& g)
: _guard(std::move(g)) {
}
group0_batch::group0_batch(std::optional<::service::group0_guard> g)
: _guard(std::move(g)) {
}
group0_batch::~group0_batch() = default;
api::timestamp_type group0_batch::write_timestamp() const {
if (!_guard) {
on_internal_error(logger, "group0_batch: write_timestamp without guard taken");
}
return _guard->write_timestamp();
}
utils::UUID group0_batch::new_group0_state_id() const {
if (!_guard) {
on_internal_error(logger, "group0_batch: new_group0_state_id without guard taken");
}
return _guard->new_group0_state_id();
}
void group0_batch::add_mutation(mutation m, std::string_view description) {
_muts.push_back(std::move(m));
if (!description.empty()) {
_descriptions.emplace_back(description);
}
}
void group0_batch::add_mutations(utils::chunked_vector<mutation> ms, std::string_view description) {
_muts.insert(_muts.end(),
std::make_move_iterator(ms.begin()),
std::make_move_iterator(ms.end()));
if (!description.empty()) {
_descriptions.emplace_back(description);
}
}
void group0_batch::add_generator(generator_func f, std::string_view description) {
_generators.push_back(std::move(f));
if (!description.empty()) {
_descriptions.emplace_back(description);
}
}
static future<> add_write_mutations_entry(
::service::raft_group0_client& group0_client,
std::string_view description,
utils::chunked_vector<canonical_mutation> muts,
::service::group0_guard group0_guard,
seastar::abort_source& as,
std::optional<::service::raft_timeout> timeout) {
logger.trace("add_write_mutations_entry: {} mutations with description {}",
muts.size(), description);
auto group0_cmd = group0_client.prepare_command(
::service::write_mutations{
.mutations{std::move(muts)},
},
group0_guard,
description
);
return group0_client.add_entry(std::move(group0_cmd), std::move(group0_guard), as, timeout);
}
future<> group0_batch::materialize_mutations() {
auto t = _guard->write_timestamp();
for (auto& generator : _generators) {
auto g = generator(t);
while (auto mut = co_await g()) {
_muts.push_back(std::move(*mut));
}
}
}
future<> group0_batch::commit(::service::raft_group0_client& group0_client, seastar::abort_source& as, std::optional<::service::raft_timeout> timeout) && {
if (_muts.size() == 0 && _generators.size() == 0) {
co_return;
}
if (!_guard) {
on_internal_error(logger, "group0_batch: trying to announce without guard");
}
auto description = fmt::to_string(fmt::join(_descriptions, "; "));
// common case, don't bother with generators as we would have only 1-2 mutations,
// when producer expects substantial number or size of mutations it should use generator
if (_generators.size() == 0) {
utils::chunked_vector<canonical_mutation> cmuts = {_muts.begin(), _muts.end()};
co_return co_await add_write_mutations_entry(group0_client, description, std::move(cmuts), std::move(*_guard), as, timeout);
}
// raft doesn't support streaming so we need to materialize all mutations in memory
co_await materialize_mutations();
if (_muts.empty()) {
co_return;
}
utils::chunked_vector<canonical_mutation> cmuts = {_muts.begin(), _muts.end()};
_muts.clear();
co_await add_write_mutations_entry(group0_client, description, std::move(cmuts), std::move(*_guard), as, timeout);
}
future<std::pair<utils::chunked_vector<mutation>, ::service::group0_guard>> group0_batch::extract() && {
co_await materialize_mutations();
co_return std::make_pair(std::move(_muts), std::move(*_guard));
}
bool group0_batch::empty() const {
return _muts.empty() && _generators.empty();
}
}