scylladb/service/strong_consistency/state_machine.cc

/*
 * Copyright (C) 2025-present ScyllaDB
 */

/*
 * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
 */

#include <seastar/core/abort_source.hh>
#include <seastar/core/shard_id.hh>
#include <seastar/core/on_internal_error.hh>
#include "state_machine.hh"
#include "db/schema_tables.hh"
#include "mutation/frozen_mutation.hh"
#include "schema/schema_registry.hh"
#include "serializer_impl.hh"
#include "idl/strong_consistency/state_machine.dist.hh"
#include "idl/strong_consistency/state_machine.dist.impl.hh"
#include "replica/database.hh"
#include "service/migration_manager.hh"
#include "db/system_keyspace.hh"
#include "utils/loading_cache.hh"
#include "utils/error_injection.hh"

using namespace std::chrono_literals;

namespace service::strong_consistency {

// File-local logger used by the state machine below; messages are emitted under the "sc_state_machine" name.
static logging::logger logger("sc_state_machine");

// Raft state machine used by the strong consistency service for a single tablet.
//
// `apply()` deserializes each Raft command into a `raft_command`, takes its frozen
// mutation, upgrades the mutation to a locally known schema when necessary and
// applies the whole batch to the local database.
//
// Snapshots are not implemented yet: `take_snapshot()` returns a random ID,
// `drop_snapshot()` and `load_snapshot()` do nothing and `transfer_snapshot()` throws.
class state_machine : public raft_state_machine {
    // Tablet served by this state machine; `_tablet.table` is used for schema lookups.
    locator::global_tablet_id _tablet;
    // Raft group ID; in this class it is only used in log and error messages.
    raft::group_id _group_id;
    // Database the mutations are applied to and the table schema is looked up in.
    replica::database& _db;
    // Provides the group0 barrier triggered when a mutation's schema version is unknown.
    service::migration_manager& _mm;
    // Passed to `db::schema_tables::get_column_mapping_if_exists()` to look up column mappings.
    db::system_keyspace& _sys_ks;

    // Aborted by `abort()`; passed to the group0 barrier trigger in
    // `get_schema_and_upgrade_mutations()`.
    abort_source _as;

public:
    // Stores the given identifiers and references; no other work is done here.
    // The referenced `db`, `mm` and `sys_ks` objects are kept by reference.
    state_machine(locator::global_tablet_id tablet,
        raft::group_id gid,
        replica::database& db,
        service::migration_manager& mm,
        db::system_keyspace& sys_ks)
        : _tablet(tablet)
        , _group_id(gid)
        , _db(db)
        , _mm(mm)
        , _sys_ks(sys_ks)
    {
    }

    // Applies a batch of Raft commands to the local database.
    //
    // Each command is deserialized as a `raft_command` and its `mutation` is collected.
    // The mutations are then matched with a schema (and upgraded in place where a
    // column mapping is needed) and applied to `_db` in a single call without a timeout.
    //
    // Error handling:
    //  - `replica::no_such_column_family` is logged as a rate-limited warning and swallowed;
    //  - `abort_requested_exception` is logged at debug level and rethrown unchanged;
    //  - any other exception is replaced by a `std::runtime_error` whose message contains
    //    the tablet, the group ID and the original exception.
    future<> apply(std::vector<raft::command_cref> command) override {
        // `static thread_local`: one limiter per thread, shared by all instances of this class.
        // NOTE(review): presumably limits the warning below to one message per 10 seconds -- confirm
        // against `logging::logger::rate_limit`.
        static thread_local logging::logger::rate_limit rate_limit(std::chrono::seconds(10));

        try {
            // Error injection point used by tests.
            // NOTE(review): presumably waits for a message for up to 20 minutes when enabled -- confirm.
            co_await utils::get_local_injector().inject("strong_consistency_state_machine_wait_before_apply", utils::wait_for_message(20min));
            utils::chunked_vector<frozen_mutation> muts;
            muts.reserve(command.size());
            for (const auto& c: command) {
                auto is = ser::as_input_stream(c);
                auto cmd = ser::deserialize(is, std::type_identity<raft_command>{});
                muts.push_back(std::move(cmd.mutation));
            }
            // Hold pointers to schemas until `_db.apply()` is finished
            auto schemas = co_await get_schema_and_upgrade_mutations(muts);
            co_await _db.apply(std::move(muts), db::no_timeout);
        } catch (replica::no_such_column_family&) {
            // If the table doesn't exist, it means it was already dropped.
            // This cannot happen if the table wasn't created yet on the node
            // because the state machine is created only after the table is created
            // (see `schema_applier::commit_on_shard()` and `storage_service::commit_token_metadata_change()`).
            // In this case, we should just ignore mutations without throwing an error.
            logger.log(log_level::warn, rate_limit, "apply(): table {} was already dropped, ignoring mutations", _tablet.table);
        } catch (const abort_requested_exception& ex) {
            // The exception can be thrown by get_schema_and_upgrade_mutations.
            // It means that the Raft group is being removed.
            //
            // Technically, throwing an exception from a state machine
            // may result in killing the corresponding Raft instance:
            // cf. the description of raft::state_machine:
            //
            //  "Any of the functions may return an error, but it will kill the
            //   raft instance that uses it. Depending on what state the failure
            //   leaves the state is the raft instance will either have to be recreated
            //   with the same state machine and rejoined the cluster with the same server_id
            //   or it new raft instance will have to be created with empty state machine and
            //   it will have to rejoin to the cluster with different server_id through
            //   configuration change."
            //
            // Fortunately, in strong consistency, we use the default Raft server
            // implementation, which handles abort_requested_exception thrown by
            // raft::state_machine::apply -- it will simply end the applier fiber.
            logger.debug("apply(): execution for tablet {}, group_id={} aborted due to: {}",
                _tablet, _group_id, ex);
            throw;
        }
         catch (...) {
            // Wrap every other failure so that the message identifies the tablet and the Raft group.
            throw std::runtime_error(::format(
                "tablet {}, group id {}: error while applying mutations {}",
                _tablet, _group_id, std::current_exception()));
        }
    }

    // Returns a freshly generated random snapshot ID without capturing any state.
    future<raft::snapshot_id> take_snapshot() override {
        // Until snapshot transfer is fully implemented, return a fake ID
        // and don't actually do anything. As long as we don't do snapshot
        // transfers (attempting to do that throws an exception), we should
        // be safe.
        return make_ready_future<raft::snapshot_id>(raft::snapshot_id(utils::make_random_uuid()));
    }

    // Does nothing; `id` is ignored.
    void drop_snapshot(raft::snapshot_id id) override {
        // Taking a snapshot is a no-op, so dropping a snapshot is also a no-op.
        (void) id;
    }

    // Does nothing: returns a ready future and ignores `id`.
    future<> load_snapshot(raft::snapshot_id id) override {
        return make_ready_future<>();
    }

    // Requests abort on `_as` and returns a ready future.
    // `_as` is the abort source handed to the group0 barrier trigger in
    // `get_schema_and_upgrade_mutations()`.
    future<> abort() override {
        logger.debug("abort(): Aborting state machine for group {}", _group_id);
        _as.request_abort();
        return make_ready_future<>();
    }

    // Not implemented. Note that the `std::runtime_error` is thrown directly from
    // the call rather than being returned as a failed future.
    future<> transfer_snapshot(raft::server_id from_id, raft::snapshot_descriptor snp) override {
        throw std::runtime_error("transfer_snapshot() not implemented");
    }

private:
    // Cache of column mappings keyed by table schema version.
    using column_mappings_cache = utils::loading_cache<table_schema_version, column_mapping>;
    // For each schema version: the schema to use and, if the mutation has to be upgraded,
    // the column mapping of that version (null when no upgrade is needed).
    using schema_store = std::unordered_map<table_schema_version, std::pair<schema_ptr, column_mappings_cache::value_ptr>>;

    // Finds a schema for every mutation in `muts` and upgrades mutations in place where needed.
    //
    // For each distinct schema version found in `muts`:
    //  - if the version is in the local schema registry, the mutation is left untouched;
    //  - otherwise the schema returned by `_db.find_schema(_tablet.table)` is used together
    //    with the column mapping of the mutation's version (taken from the cache or read via
    //    `db::schema_tables::get_column_mapping_if_exists()`), and the mutation is replaced
    //    by `freeze(m.unfreeze_upgrading(schema, column_mapping))`;
    //  - if neither works, the group0 barrier is triggered (at most once per call) and the
    //    lookup is retried; if it still fails, `on_internal_error()` is called.
    //
    // Returns the map of resolved schema versions, so that the caller can keep the schema
    // pointers alive. Exceptions from `_db.find_schema()` and from the barrier trigger
    // propagate to the caller.
    future<schema_store> get_schema_and_upgrade_mutations(utils::chunked_vector<frozen_mutation>& muts) {
        // Cache column mappings to avoid querying `system.scylla_table_schema_history` multiple times.
        // `static thread_local`: one cache per thread, shared by all state machine instances.
        // NOTE(review): the `1h` argument is presumably the entry expiry -- confirm against utils/loading_cache.hh.
        static thread_local column_mappings_cache column_mapping_cache(std::numeric_limits<size_t>::max(), 1h, logger);
        // Stores schema pointer and optional column mapping for each schema version present in the mutations
        schema_store schema_mappings;
        // Set once the group0 barrier has been triggered, so that it runs at most once per call.
        bool barrier_executed = false;

        // Looks up the schema (and, if needed, the column mapping) for `schema_version`.
        // Returns:
        //  - {schema, nullptr}        if the version is present in the local schema registry;
        //  - {schema, column mapping} if the mapping was found in the cache or via `_sys_ks`;
        //  - {nullptr, nullptr}       if the mapping was not found, or if the
        //                             "sc_state_machine_return_empty_schema" injection is enabled.
        auto get_schema = [&] (table_schema_version schema_version) -> future<std::pair<schema_ptr, column_mappings_cache::value_ptr>> {
            if (utils::get_local_injector().enter("sc_state_machine_return_empty_schema")) {
                co_return std::pair{nullptr, nullptr};
            }

            auto schema = local_schema_registry().get_or_null(schema_version);
            if (schema) {
                co_return std::pair{std::move(schema), nullptr};
            }

            // `_db.find_schema()` may throw `replica::no_such_column_family` if the table was already dropped.
            schema = _db.find_schema(_tablet.table);
            // The column mapping may be already present in the cache from another `apply()` call
            auto cm_ptr = column_mapping_cache.find(schema_version);
            if (cm_ptr) {
                co_return std::pair{std::move(schema), std::move(cm_ptr)};
            }

            // We may not find the column mapping if the mutation schema is newer than the present schema.
            // In this case, we should trigger the barrier to wait for the schema to be updated and then try again.
            auto cm_opt = co_await db::schema_tables::get_column_mapping_if_exists(_sys_ks, _tablet.table, schema_version);
            if (!cm_opt) {
                co_return std::pair{nullptr, nullptr};
            }

            // Go through the cache with a loader that simply yields the mapping we have just read,
            // so that the result is a `value_ptr` obtained from the cache.
            cm_ptr = co_await column_mapping_cache.get_ptr(schema_version, [cm = std::move(*cm_opt)] (auto schema_version) -> future<column_mapping> {
                co_return std::move(cm);
            });
            co_return std::pair{std::move(schema), std::move(cm_ptr)};
        };

        // Returns a pointer to the `schema_mappings` entry for the schema version of `mut`,
        // resolving and inserting it on first use. Returns nullptr if the schema cannot be
        // found even after the group0 barrier (which is triggered at most once per
        // `get_schema_and_upgrade_mutations()` call).
        auto resolve_schema = [&] (const frozen_mutation& mut) -> future<const schema_store::mapped_type*> {
            auto schema_version = mut.schema_version();
            auto it = schema_mappings.find(schema_version);
            if (it != schema_mappings.end()) {
                co_return &it->second;
            }

            auto schema_cm = co_await get_schema(schema_version);
            if (!schema_cm.first && !barrier_executed) {
                // Error injection used by tests: when the first injection is enabled,
                // the second one is disabled before waiting on the barrier.
                if (utils::get_local_injector().enter("disable_raft_drop_append_entries_for_specified_group")) {
                    utils::get_local_injector().disable("raft_drop_incoming_append_entries_for_specified_group");
                }
                // `_as` is passed along; presumably this lets `abort()` interrupt the wait.
                co_await _mm.get_group0_barrier().trigger(false, &_as);
                barrier_executed = true;
                schema_cm = co_await get_schema(schema_version);
            }

            if (schema_cm.first) {
                const auto [it, _] = schema_mappings.insert({schema_version, std::move(schema_cm)});
                co_return &it->second;
            }
            co_return nullptr;
        };

        for (auto& m: muts) {
            auto schema_entry = co_await resolve_schema(m);
            if (!schema_entry) {
                // Old schemas are TTLed after 10 days (see comment in `schema_applier::finalize_tables_and_views()`),
                // so this error theoretically may be triggered if a node is stuck longer than this.
                // But in practice we should do a snapshot much earlier, that's why `on_internal_error()` here.
                // And if the table was already dropped, `no_such_column_family` will be thrown earlier.
                on_internal_error(logger, fmt::format("couldn't find schema for table {} and mutation schema version {}", _tablet.table, m.schema_version()));
            }
            // A non-null column mapping means the mutation was written with a schema version
            // that is not in the local registry: rewrite it against the resolved schema.
            if (schema_entry->second) {
                m = freeze(m.unfreeze_upgrading(schema_entry->first, *schema_entry->second));
            }
        }

        // We only need vector of schema pointers but we're returning the whole map
        // to avoid another allocation
        co_return std::move(schema_mappings);
    }
};

// Factory for the strong consistency Raft state machine.
//
// Builds a `state_machine` for the given tablet and Raft group and hands it out
// through the `raft_state_machine` interface. `db`, `mm` and `sys_ks` are stored
// by reference inside the created object, so they have to outlive it.
std::unique_ptr<raft_state_machine> make_state_machine(locator::global_tablet_id tablet,
    raft::group_id gid,
    replica::database& db,
    service::migration_manager& mm,
    db::system_keyspace& sys_ks)
{
    std::unique_ptr<raft_state_machine> machine = std::make_unique<state_machine>(tablet, gid, db, mm, sys_ks);
    return machine;
}

};