mirror of
https://github.com/scylladb/scylladb.git
synced 2026-05-23 00:02:37 +00:00
228 lines
10 KiB
C++
228 lines
10 KiB
C++
/*
|
|
* Copyright (C) 2025-present ScyllaDB
|
|
*/
|
|
|
|
/*
|
|
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
|
|
*/
|
|
|
|
#include <seastar/core/abort_source.hh>
|
|
#include <seastar/core/shard_id.hh>
|
|
#include <seastar/core/on_internal_error.hh>
|
|
#include "state_machine.hh"
|
|
#include "db/schema_tables.hh"
|
|
#include "mutation/frozen_mutation.hh"
|
|
#include "schema/schema_registry.hh"
|
|
#include "serializer_impl.hh"
|
|
#include "idl/strong_consistency/state_machine.dist.hh"
|
|
#include "idl/strong_consistency/state_machine.dist.impl.hh"
|
|
#include "replica/database.hh"
|
|
#include "service/migration_manager.hh"
|
|
#include "db/system_keyspace.hh"
|
|
#include "utils/loading_cache.hh"
|
|
#include "utils/error_injection.hh"
|
|
|
|
using namespace std::chrono_literals;
|
|
|
|
namespace service::strong_consistency {
|
|
|
|
static logging::logger logger("sc_state_machine");
|
|
|
|
class state_machine : public raft_state_machine {
|
|
locator::global_tablet_id _tablet;
|
|
raft::group_id _group_id;
|
|
replica::database& _db;
|
|
service::migration_manager& _mm;
|
|
db::system_keyspace& _sys_ks;
|
|
|
|
abort_source _as;
|
|
|
|
public:
|
|
state_machine(locator::global_tablet_id tablet,
|
|
raft::group_id gid,
|
|
replica::database& db,
|
|
service::migration_manager& mm,
|
|
db::system_keyspace& sys_ks)
|
|
: _tablet(tablet)
|
|
, _group_id(gid)
|
|
, _db(db)
|
|
, _mm(mm)
|
|
, _sys_ks(sys_ks)
|
|
{
|
|
}
|
|
|
|
future<> apply(std::vector<raft::command_cref> command) override {
|
|
static thread_local logging::logger::rate_limit rate_limit(std::chrono::seconds(10));
|
|
|
|
try {
|
|
co_await utils::get_local_injector().inject("strong_consistency_state_machine_wait_before_apply", utils::wait_for_message(20min));
|
|
utils::chunked_vector<frozen_mutation> muts;
|
|
muts.reserve(command.size());
|
|
for (const auto& c: command) {
|
|
auto is = ser::as_input_stream(c);
|
|
auto cmd = ser::deserialize(is, std::type_identity<raft_command>{});
|
|
muts.push_back(std::move(cmd.mutation));
|
|
}
|
|
// Hold pointers to schemas until `_db.apply()` is finished
|
|
auto schemas = co_await get_schema_and_upgrade_mutations(muts);
|
|
co_await _db.apply(std::move(muts), db::no_timeout);
|
|
} catch (replica::no_such_column_family&) {
|
|
// If the table doesn't exist, it means it was already dropped.
|
|
// This cannot happen if the table wasn't created yet on the node
|
|
// because the state machine is created only after the table is created
|
|
// (see `schema_applier::commit_on_shard()` and `storage_service::commit_token_metadata_change()`).
|
|
// In this case, we should just ignore mutations without throwing an error.
|
|
logger.log(log_level::warn, rate_limit, "apply(): table {} was already dropped, ignoring mutations", _tablet.table);
|
|
} catch (const abort_requested_exception& ex) {
|
|
// The exception can be thrown by get_schema_and_upgrade_mutations.
|
|
// It means that the Raft group is being removed.
|
|
//
|
|
// Technically, throwing an exception from a state machine
|
|
// may result in killing the corresponding Raft instance:
|
|
// cf. the description of raft::state_machine:
|
|
//
|
|
// "Any of the functions may return an error, but it will kill the
|
|
// raft instance that uses it. Depending on what state the failure
|
|
// leaves the state is the raft instance will either have to be recreated
|
|
// with the same state machine and rejoined the cluster with the same server_id
|
|
// or it new raft instance will have to be created with empty state machine and
|
|
// it will have to rejoin to the cluster with different server_id through
|
|
// configuration change."
|
|
//
|
|
// Fortunately, in strong consistency, we use the default Raft server
|
|
// implementation, which handles abort_requested_exception thrown by
|
|
// raft::state_machine::apply -- it will simply end the applier fiber.
|
|
logger.debug("apply(): execution for tablet {}, group_id={} aborted due to: {}",
|
|
_tablet, _group_id, ex);
|
|
throw;
|
|
}
|
|
catch (...) {
|
|
throw std::runtime_error(::format(
|
|
"tablet {}, group id {}: error while applying mutations {}",
|
|
_tablet, _group_id, std::current_exception()));
|
|
}
|
|
}
|
|
|
|
future<raft::snapshot_id> take_snapshot() override {
|
|
// Until snapshot transfer is fully implemented, return a fake ID
|
|
// and don't actually do anything. As long as we don't do snapshot
|
|
// transfers (attempting to do that throws an exception), we should
|
|
// be safe.
|
|
return make_ready_future<raft::snapshot_id>(raft::snapshot_id(utils::make_random_uuid()));
|
|
}
|
|
|
|
void drop_snapshot(raft::snapshot_id id) override {
|
|
// Taking a snapshot is a no-op, so dropping a snapshot is also a no-op.
|
|
(void) id;
|
|
}
|
|
|
|
future<> load_snapshot(raft::snapshot_id id) override {
|
|
return make_ready_future<>();
|
|
}
|
|
|
|
future<> abort() override {
|
|
logger.debug("abort(): Aborting state machine for group {}", _group_id);
|
|
_as.request_abort();
|
|
return make_ready_future<>();
|
|
}
|
|
|
|
future<> transfer_snapshot(raft::server_id from_id, raft::snapshot_descriptor snp) override {
|
|
throw std::runtime_error("transfer_snapshot() not implemented");
|
|
}
|
|
|
|
private:
|
|
using column_mappings_cache = utils::loading_cache<table_schema_version, column_mapping>;
|
|
using schema_store = std::unordered_map<table_schema_version, std::pair<schema_ptr, column_mappings_cache::value_ptr>>;
|
|
future<schema_store> get_schema_and_upgrade_mutations(utils::chunked_vector<frozen_mutation>& muts) {
|
|
// Cache column mappings to avoid querying `system.scylla_table_schema_history` multiple times.
|
|
static thread_local column_mappings_cache column_mapping_cache(std::numeric_limits<size_t>::max(), 1h, logger);
|
|
// Stores schema pointer and optional column mapping for each schema version present in the mutations
|
|
schema_store schema_mappings;
|
|
bool barrier_executed = false;
|
|
|
|
auto get_schema = [&] (table_schema_version schema_version) -> future<std::pair<schema_ptr, column_mappings_cache::value_ptr>> {
|
|
if (utils::get_local_injector().enter("sc_state_machine_return_empty_schema")) {
|
|
co_return std::pair{nullptr, nullptr};
|
|
}
|
|
|
|
auto schema = local_schema_registry().get_or_null(schema_version);
|
|
if (schema) {
|
|
co_return std::pair{std::move(schema), nullptr};
|
|
}
|
|
|
|
// `_db.find_schema()` may throw `replica::no_such_column_family` if the table was already dropped.
|
|
schema = _db.find_schema(_tablet.table);
|
|
// The column mapping may be already present in the cache from another `apply()` call
|
|
auto cm_ptr = column_mapping_cache.find(schema_version);
|
|
if (cm_ptr) {
|
|
co_return std::pair{std::move(schema), std::move(cm_ptr)};
|
|
}
|
|
|
|
// We may not find the column mapping if the mutation schema is newer than the present schema.
|
|
// In this case, we should trigger the barrier to wait for the schema to be updated and then try again.
|
|
auto cm_opt = co_await db::schema_tables::get_column_mapping_if_exists(_sys_ks, _tablet.table, schema_version);
|
|
if (!cm_opt) {
|
|
co_return std::pair{nullptr, nullptr};
|
|
}
|
|
|
|
cm_ptr = co_await column_mapping_cache.get_ptr(schema_version, [cm = std::move(*cm_opt)] (auto schema_version) -> future<column_mapping> {
|
|
co_return std::move(cm);
|
|
});
|
|
co_return std::pair{std::move(schema), std::move(cm_ptr)};
|
|
};
|
|
|
|
auto resolve_schema = [&] (const frozen_mutation& mut) -> future<const schema_store::mapped_type*> {
|
|
auto schema_version = mut.schema_version();
|
|
auto it = schema_mappings.find(schema_version);
|
|
if (it != schema_mappings.end()) {
|
|
co_return &it->second;
|
|
}
|
|
|
|
auto schema_cm = co_await get_schema(schema_version);
|
|
if (!schema_cm.first && !barrier_executed) {
|
|
if (utils::get_local_injector().enter("disable_raft_drop_append_entries_for_specified_group")) {
|
|
utils::get_local_injector().disable("raft_drop_incoming_append_entries_for_specified_group");
|
|
}
|
|
co_await _mm.get_group0_barrier().trigger(false, &_as);
|
|
barrier_executed = true;
|
|
schema_cm = co_await get_schema(schema_version);
|
|
}
|
|
|
|
if (schema_cm.first) {
|
|
const auto [it, _] = schema_mappings.insert({schema_version, std::move(schema_cm)});
|
|
co_return &it->second;
|
|
}
|
|
co_return nullptr;
|
|
};
|
|
|
|
for (auto& m: muts) {
|
|
auto schema_entry = co_await resolve_schema(m);
|
|
if (!schema_entry) {
|
|
// Old schema are TTLed after 10 days (see comment in `schema_applier::finalize_tables_and_views()`),
|
|
// so this error theoretically may be triggered if a node is stuck longer than this.
|
|
// But in practice we should do a snapshot much earlier, that's why `on_internal_error()` here.
|
|
// And if the table was already dropped, `no_such_column_family` will be dropped earlier.
|
|
on_internal_error(logger, fmt::format("couldn't find schema for table {} and mutation schema version {}", _tablet.table, m.schema_version()));
|
|
}
|
|
if (schema_entry->second) {
|
|
m = freeze(m.unfreeze_upgrading(schema_entry->first, *schema_entry->second));
|
|
}
|
|
}
|
|
|
|
// We only need vector of schema pointers but we're returning the whole map
|
|
// to avoid another allocation
|
|
co_return std::move(schema_mappings);
|
|
}
|
|
};
|
|
|
|
/// Factory for the strongly consistent tablet state machine.
///
/// Constructs a `state_machine` from the given tablet id, Raft group id and
/// database / migration manager / system keyspace references, and returns it
/// owned through a pointer to its `raft_state_machine` base.
std::unique_ptr<raft_state_machine> make_state_machine(locator::global_tablet_id tablet,
        raft::group_id gid,
        replica::database& db,
        service::migration_manager& mm,
        db::system_keyspace& sys_ks)
{
    std::unique_ptr<raft_state_machine> sm = std::make_unique<state_machine>(tablet, gid, db, mm, sys_ks);
    return sm;
}
|
|
|
|
};
|