scylladb/service/migration_manager.cc

/*
 */

/*
 * Copyright (C) 2015-present ScyllaDB
 *
 * Modified by ScyllaDB
 */

/*
 * SPDX-License-Identifier: (AGPL-3.0-or-later and Apache-2.0)
 */

#include <seastar/core/sleep.hh>
#include <seastar/core/coroutine.hh>
#include <seastar/coroutine/maybe_yield.hh>
#include "schema_registry.hh"
#include "service/migration_manager.hh"
#include "service/storage_proxy.hh"
#include "service/raft/group0_state_machine.hh"

#include "service/migration_listener.hh"
#include "message/messaging_service.hh"
#include "gms/feature_service.hh"
#include "utils/runtime.hh"
#include "gms/gossiper.hh"
#include "view_info.hh"
#include "schema_builder.hh"
#include "replica/database.hh"
#include "db/schema_tables.hh"
#include "types/user.hh"
#include "db/system_keyspace.hh"
#include "cql3/functions/user_aggregate.hh"
#include "cql3/functions/user_function.hh"

#include "serialization_visitors.hh"
#include "serializer.hh"
#include "idl/frozen_schema.dist.hh"
#include "idl/uuid.dist.hh"
#include "serializer_impl.hh"
#include "idl/frozen_schema.dist.impl.hh"
#include "idl/uuid.dist.impl.hh"
#include "idl/raft_storage.dist.hh"
#include "idl/raft_storage.dist.impl.hh"
#include "idl/group0_state_machine.dist.hh"
#include "idl/group0_state_machine.dist.impl.hh"


namespace service {

static logging::logger mlogger("migration_manager");

using namespace std::chrono_literals;

const std::chrono::milliseconds migration_manager::migration_delay = 60000ms;
static future<schema_ptr> get_schema_definition(table_schema_version v, netw::messaging_service::msg_addr dst, netw::messaging_service& ms, service::storage_proxy& sp);

migration_manager::migration_manager(migration_notifier& notifier, gms::feature_service& feat, netw::messaging_service& ms,
            service::storage_proxy& storage_proxy, gms::gossiper& gossiper, service::raft_group_registry& raft_gr, sharded<db::system_keyspace>& sysks) :
        _notifier(notifier), _feat(feat), _messaging(ms), _storage_proxy(storage_proxy), _gossiper(gossiper), _raft_gr(raft_gr)
        , _sys_ks(sysks)
        , _schema_push([this] { return passive_announce(); })
        , _group0_read_apply_mutex{1}, _group0_operation_mutex{1}
        , _group0_history_gc_duration{std::chrono::duration_cast<gc_clock::duration>(std::chrono::weeks{1})}
        , _concurrent_ddl_retries{10}
{
}

future<> migration_manager::stop() {
    if (!_as.abort_requested()) {
        co_await drain();
    }
    try {
        co_await _schema_push.join();
    } catch (...) {
        mlogger.error("schema_push failed: {}", std::current_exception());
    }
}

future<> migration_manager::drain()
{
    mlogger.info("stopping migration service");
    _as.request_abort();

    co_await uninit_messaging_service();
    try {
        co_await parallel_for_each(_schema_pulls, [] (auto&& e) {
            return e.second.join();
        });
    } catch (...) {
        mlogger.error("schema_pull failed: {}", std::current_exception());
    }
    co_await _background_tasks.close();
}

void migration_manager::init_messaging_service()
{
    auto update_schema = [this] {
        //FIXME: future discarded.
        (void)with_gate(_background_tasks, [this] {
            mlogger.debug("features changed, recalculating schema version");
            return db::schema_tables::recalculate_schema_version(_sys_ks, _storage_proxy.container(), _feat);
        });
    };

    if (this_shard_id() == 0) {
        _feature_listeners.push_back(_feat.cluster_supports_view_virtual_columns().when_enabled(update_schema));
        _feature_listeners.push_back(_feat.cluster_supports_digest_insensitive_to_expiry().when_enabled(update_schema));
        _feature_listeners.push_back(_feat.cluster_supports_cdc().when_enabled(update_schema));
        _feature_listeners.push_back(_feat.cluster_supports_per_table_partitioners().when_enabled(update_schema));
        _feature_listeners.push_back(_feat.cluster_supports_computed_columns().when_enabled(update_schema));
    }

    _messaging.register_definitions_update([this] (const rpc::client_info& cinfo, std::vector<frozen_mutation> fm, rpc::optional<std::vector<canonical_mutation>> cm) {
        auto src = netw::messaging_service::get_source(cinfo);
        auto f = make_ready_future<>();
        if (cm) {
            f = do_with(std::move(*cm), [this, src] (const std::vector<canonical_mutation>& mutations) {
                return merge_schema_in_background(src, mutations);
            });
        } else {
            f = do_with(std::move(fm), [this, src] (const std::vector<frozen_mutation>& mutations) {
                return merge_schema_in_background(src, mutations);
            });
        }
        // Start a new fiber.
        (void)f.then_wrapped([src] (auto&& f) {
            if (f.failed()) {
                mlogger.error("Failed to update definitions from {}: {}", src, f.get_exception());
            } else {
                mlogger.debug("Applied definitions update from {}.", src);
            }
        });
        return netw::messaging_service::no_wait();
    });
    _messaging.register_migration_request(std::bind_front(
            [] (migration_manager& self, const rpc::client_info& cinfo, rpc::optional<netw::schema_pull_options> options)
                -> future<rpc::tuple<std::vector<frozen_mutation>, std::vector<canonical_mutation>>> {
        const auto cm_retval_supported = options && options->remote_supports_canonical_mutation_retval;

        auto features = self._feat.cluster_schema_features();
        auto& proxy = self._storage_proxy.container();
        auto cm = co_await db::schema_tables::convert_schema_to_mutations(proxy, features);
        if (self._raft_gr.is_enabled() && options->group0_snapshot_transfer) {
            // if `group0_snapshot_transfer` is `true`, the sender must also understand canonical mutations
            // (`group0_snapshot_transfer` was added more recently).
            if (!cm_retval_supported) {
                on_internal_error(mlogger,
                    "migration request handler: group0 snapshot transfer requested, but canonical mutations not supported");
            }
            cm.emplace_back(co_await db::system_keyspace::get_group0_history(proxy));
        }
        if (cm_retval_supported) {
            co_return rpc::tuple(std::vector<frozen_mutation>{}, std::move(cm));
        }
        auto fm = boost::copy_range<std::vector<frozen_mutation>>(cm | boost::adaptors::transformed([&db = proxy.local().get_db().local()] (const canonical_mutation& cm) {
            return cm.to_mutation(db.find_column_family(cm.column_family_id()).schema());
        }));
        co_return rpc::tuple(std::move(fm), std::move(cm));
    }, std::ref(*this)));
    _messaging.register_schema_check([this] {
        return make_ready_future<utils::UUID>(_storage_proxy.get_db().local().get_version());
    });
    _messaging.register_get_schema_version([this] (unsigned shard, table_schema_version v) {
        // FIXME: should this get an smp_service_group? Probably one separate from reads and writes.
        return container().invoke_on(shard, [v] (auto&& sp) {
            mlogger.debug("Schema version request for {}", v);
            return local_schema_registry().get_frozen(v);
        });
    });
}

future<> migration_manager::uninit_messaging_service()
{
    return when_all_succeed(
        _messaging.unregister_migration_request(),
        _messaging.unregister_definitions_update(),
        _messaging.unregister_schema_check(),
        _messaging.unregister_get_schema_version()
    ).discard_result();
}

void migration_notifier::register_listener(migration_listener* listener)
{
    _listeners.add(listener);
}

future<> migration_notifier::unregister_listener(migration_listener* listener)
{
    return _listeners.remove(listener);
}

void migration_manager::schedule_schema_pull(const gms::inet_address& endpoint, const gms::endpoint_state& state)
{
    const auto* value = state.get_application_state_ptr(gms::application_state::SCHEMA);

    if (endpoint != utils::fb_utilities::get_broadcast_address() && value) {
        // FIXME: discarded future
        (void)maybe_schedule_schema_pull(utils::UUID{value->value}, endpoint).handle_exception([endpoint] (auto ep) {
            mlogger.warn("Fail to pull schema from {}: {}", endpoint, ep);
        });
    }
}

bool migration_manager::have_schema_agreement() {
    const auto known_endpoints = _gossiper.endpoint_state_map;
    if (known_endpoints.size() == 1) {
        // Us.
        return true;
    }
    auto our_version = _storage_proxy.get_db().local().get_version();
    bool match = false;
    for (auto& x : known_endpoints) {
        auto& endpoint = x.first;
        auto& eps = x.second;
        if (endpoint == utils::fb_utilities::get_broadcast_address() || !eps.is_alive()) {
            continue;
        }
        mlogger.debug("Checking schema state for {}.", endpoint);
        auto* schema = eps.get_application_state_ptr(gms::application_state::SCHEMA);
        if (!schema) {
            mlogger.debug("Schema state not yet available for {}.", endpoint);
            return false;
        }
        utils::UUID remote_version{schema->value};
        if (our_version != remote_version) {
            mlogger.debug("Schema mismatch for {} ({} != {}).", endpoint, our_version, remote_version);
            return false;
        } else {
            match = true;
        }
    }
    return match;
}

/**
 * If versions differ this node sends request with local migration list to the endpoint
 * and expecting to receive a list of migrations to apply locally.
 */
future<> migration_manager::maybe_schedule_schema_pull(const utils::UUID& their_version, const gms::inet_address& endpoint)
{
    auto& proxy = _storage_proxy;
    auto& db = proxy.get_db().local();

    if (db.get_version() == their_version || !should_pull_schema_from(endpoint)) {
        mlogger.debug("Not pulling schema because versions match or shouldPullSchemaFrom returned false");
        return make_ready_future<>();
    }

    if (db.get_version() == replica::database::empty_version || runtime::get_uptime() < migration_delay) {
        // If we think we may be bootstrapping or have recently started, submit MigrationTask immediately
        mlogger.debug("Submitting migration task for {}", endpoint);
        return submit_migration_task(endpoint);
    }

    return with_gate(_background_tasks, [this, &db, endpoint] {
        // Include a delay to make sure we have a chance to apply any changes being
        // pushed out simultaneously. See CASSANDRA-5025
        return sleep_abortable(migration_delay, _as).then([this, &db, endpoint] {
            // grab the latest version of the schema since it may have changed again since the initial scheduling
            auto* ep_state = _gossiper.get_endpoint_state_for_endpoint_ptr(endpoint);
            if (!ep_state) {
                mlogger.debug("epState vanished for {}, not submitting migration task", endpoint);
                return make_ready_future<>();
            }
            const auto* value = ep_state->get_application_state_ptr(gms::application_state::SCHEMA);
            if (!value) {
                mlogger.debug("application_state::SCHEMA does not exist for {}, not submitting migration task", endpoint);
                return make_ready_future<>();
            }
            utils::UUID current_version{value->value};
            if (db.get_version() == current_version) {
                mlogger.debug("not submitting migration task for {} because our versions match", endpoint);
                return make_ready_future<>();
            }
            mlogger.debug("submitting migration task for {}", endpoint);
            return submit_migration_task(endpoint);
        });
    }).finally([me = shared_from_this()] {});
}

future<> migration_manager::submit_migration_task(const gms::inet_address& endpoint, bool can_ignore_down_node)
{
    if (!_gossiper.is_alive(endpoint)) {
        auto msg = format("Can't send migration request: node {} is down.", endpoint);
        mlogger.warn("{}", msg);
        return can_ignore_down_node ? make_ready_future<>() : make_exception_future<>(std::runtime_error(msg));
    }
    netw::messaging_service::msg_addr id{endpoint, 0};
    return merge_schema_from(id).handle_exception([](std::exception_ptr e) {
        try {
            std::rethrow_exception(e);
        } catch (const exceptions::configuration_exception& e) {
            mlogger.error("Configuration exception merging remote schema: {}", e.what());
            return make_exception_future<>(e);
        }
    });
}

future<> migration_manager::do_merge_schema_from(netw::messaging_service::msg_addr id)
{
    mlogger.info("Pulling schema from {}", id);
    return _messaging.send_migration_request(std::move(id), netw::schema_pull_options{}).then([this, id] (
            rpc::tuple<std::vector<frozen_mutation>, rpc::optional<std::vector<canonical_mutation>>> frozen_and_canonical_mutations) {
        auto&& [mutations, canonical_mutations] = frozen_and_canonical_mutations;
        if (canonical_mutations) {
            return do_with(std::move(*canonical_mutations), [this, id] (std::vector<canonical_mutation>& mutations) {
                return this->merge_schema_from(id, mutations);
            });
        }
        return do_with(std::move(mutations), [this, id] (auto&& mutations) {
            return this->merge_schema_from(id, mutations);
        });
    }).then([id] {
        mlogger.info("Schema merge with {} completed", id);
    });
}

future<> migration_manager::merge_schema_from(netw::messaging_service::msg_addr id)
{
    if (_as.abort_requested()) {
        return make_exception_future<>(abort_requested_exception());
    }

    mlogger.info("Requesting schema pull from {}", id);
    auto i = _schema_pulls.find(id);
    if (i == _schema_pulls.end()) {
        // FIXME: Drop entries for removed nodes (or earlier).
        i = _schema_pulls.emplace(std::piecewise_construct,
                std::tuple<netw::messaging_service::msg_addr>(id),
                std::tuple<std::function<future<>()>>([id, this] {
                    return do_merge_schema_from(id);
                })).first;
    }
    return i->second.trigger();
}

future<> migration_manager::merge_schema_from(netw::messaging_service::msg_addr src, const std::vector<canonical_mutation>& canonical_mutations) {
    mlogger.debug("Applying schema mutations from {}", src);
    auto& proxy = _storage_proxy;
    const auto& db = proxy.get_db().local();

    if (_as.abort_requested()) {
        return make_exception_future<>(abort_requested_exception());
    }

    std::vector<mutation> mutations;
    mutations.reserve(canonical_mutations.size());
    try {
        for (const auto& cm : canonical_mutations) {
            auto& tbl = db.find_column_family(cm.column_family_id());
            mutations.emplace_back(cm.to_mutation(
                    tbl.schema()));
        }
    } catch (replica::no_such_column_family& e) {
        mlogger.error("Error while applying schema mutations from {}: {}", src, e);
        return make_exception_future<>(std::make_exception_ptr<std::runtime_error>(
                    std::runtime_error(fmt::format("Error while applying schema mutations: {}", e))));
    }
    return db::schema_tables::merge_schema(_sys_ks, proxy.container(), _feat, std::move(mutations));
}

future<> migration_manager::merge_schema_from(netw::messaging_service::msg_addr src, const std::vector<frozen_mutation>& mutations)
{
    if (_as.abort_requested()) {
        return make_exception_future<>(abort_requested_exception());
    }

    mlogger.debug("Applying schema mutations from {}", src);
    return map_reduce(mutations, [this, src](const frozen_mutation& fm) {
        // schema table's schema is not syncable so just use get_schema_definition()
        return get_schema_definition(fm.schema_version(), src, _messaging, _storage_proxy).then([&fm](schema_ptr s) {
            s->registry_entry()->mark_synced();
            return fm.unfreeze(std::move(s));
        });
    }, std::vector<mutation>(), [](std::vector<mutation>&& all, mutation&& m) {
        all.emplace_back(std::move(m));
        return std::move(all);
    }).then([this](std::vector<mutation> schema) {
        return db::schema_tables::merge_schema(_sys_ks, _storage_proxy.container(), _feat, std::move(schema));
    });
}

bool migration_manager::has_compatible_schema_tables_version(const gms::inet_address& endpoint) {
    auto* version = _gossiper.get_application_state_ptr(endpoint, gms::application_state::SCHEMA_TABLES_VERSION);
    return version && version->value == db::schema_tables::version;
}

bool migration_manager::should_pull_schema_from(const gms::inet_address& endpoint) {
    return has_compatible_schema_tables_version(endpoint)
            && !_gossiper.is_gossip_only_member(endpoint);
}

future<> migration_notifier::create_keyspace(const lw_shared_ptr<keyspace_metadata>& ksm) {
    return seastar::async([this, ksm] {
        const auto& name = ksm->name();
        _listeners.thread_for_each([&name] (migration_listener* listener) {
            try {
                listener->on_create_keyspace(name);
            } catch (...) {
                mlogger.warn("Create keyspace notification failed {}: {}", name, std::current_exception());
            }
        });
    });
}

future<> migration_notifier::create_column_family(const schema_ptr& cfm) {
    return seastar::async([this, cfm] {
        const auto& ks_name = cfm->ks_name();
        const auto& cf_name = cfm->cf_name();
        _listeners.thread_for_each([&ks_name, &cf_name] (migration_listener* listener) {
            try {
                listener->on_create_column_family(ks_name, cf_name);
            } catch (...) {
                mlogger.warn("Create column family notification failed {}.{}: {}", ks_name, cf_name, std::current_exception());
            }
        });
    });
}

future<> migration_notifier::create_user_type(const user_type& type) {
    return seastar::async([this, type] {
        const auto& ks_name = type->_keyspace;
        const auto& type_name = type->get_name_as_string();
        _listeners.thread_for_each([&ks_name, &type_name] (migration_listener* listener) {
            try {
                listener->on_create_user_type(ks_name, type_name);
            } catch (...) {
                mlogger.warn("Create user type notification failed {}.{}: {}", ks_name, type_name, std::current_exception());
            }
        });
    });
}

future<> migration_notifier::create_view(const view_ptr& view) {
    return seastar::async([this, view] {
        const auto& ks_name = view->ks_name();
        const auto& view_name = view->cf_name();
        _listeners.thread_for_each([&ks_name, &view_name] (migration_listener* listener) {
            try {
                listener->on_create_view(ks_name, view_name);
            } catch (...) {
                mlogger.warn("Create view notification failed {}.{}: {}", ks_name, view_name, std::current_exception());
            }
        });
    });
}

#if 0
public void notifyCreateFunction(UDFunction udf)
{
    for (IMigrationListener listener : listeners)
        listener.onCreateFunction(udf.name().keyspace, udf.name().name);
}

public void notifyCreateAggregate(UDAggregate udf)
{
    for (IMigrationListener listener : listeners)
        listener.onCreateAggregate(udf.name().keyspace, udf.name().name);
}
#endif

future<> migration_notifier::update_keyspace(const lw_shared_ptr<keyspace_metadata>& ksm) {
    return seastar::async([this, ksm] {
        const auto& name = ksm->name();
        _listeners.thread_for_each([&name] (migration_listener* listener) {
            try {
                listener->on_update_keyspace(name);
            } catch (...) {
                mlogger.warn("Update keyspace notification failed {}: {}", name, std::current_exception());
            }
        });
    });
}

future<> migration_notifier::update_column_family(const schema_ptr& cfm, bool columns_changed) {
    return seastar::async([this, cfm, columns_changed] {
        const auto& ks_name = cfm->ks_name();
        const auto& cf_name = cfm->cf_name();
        _listeners.thread_for_each([&ks_name, &cf_name, columns_changed] (migration_listener* listener) {
            try {
                listener->on_update_column_family(ks_name, cf_name, columns_changed);
            } catch (...) {
                mlogger.warn("Update column family notification failed {}.{}: {}", ks_name, cf_name, std::current_exception());
            }
        });
    });
}

future<> migration_notifier::update_user_type(const user_type& type) {
    return seastar::async([this, type] {
        const auto& ks_name = type->_keyspace;
        const auto& type_name = type->get_name_as_string();
        _listeners.thread_for_each([&ks_name, &type_name] (migration_listener* listener) {
            try {
                listener->on_update_user_type(ks_name, type_name);
            } catch (...) {
                mlogger.warn("Update user type notification failed {}.{}: {}", ks_name, type_name, std::current_exception());
            }
        });
    });
}

future<> migration_notifier::update_view(const view_ptr& view, bool columns_changed) {
    return seastar::async([this, view, columns_changed] {
        const auto& ks_name = view->ks_name();
        const auto& view_name = view->cf_name();
        _listeners.thread_for_each([&ks_name, &view_name, columns_changed] (migration_listener* listener) {
            try {
                listener->on_update_view(ks_name, view_name, columns_changed);
            } catch (...) {
                mlogger.warn("Update view notification failed {}.{}: {}", ks_name, view_name, std::current_exception());
            }
        });
    });
}

#if 0
public void notifyUpdateFunction(UDFunction udf)
{
    for (IMigrationListener listener : listeners)
        listener.onUpdateFunction(udf.name().keyspace, udf.name().name);
}

public void notifyUpdateAggregate(UDAggregate udf)
{
    for (IMigrationListener listener : listeners)
        listener.onUpdateAggregate(udf.name().keyspace, udf.name().name);
}
#endif

future<> migration_notifier::drop_keyspace(const sstring& ks_name) {
    return seastar::async([this, ks_name] {
        _listeners.thread_for_each([&ks_name] (migration_listener* listener) {
            try {
                listener->on_drop_keyspace(ks_name);
            } catch (...) {
                mlogger.warn("Drop keyspace notification failed {}: {}", ks_name, std::current_exception());
            }
        });
    });
}

future<> migration_notifier::drop_column_family(const schema_ptr& cfm) {
    return seastar::async([this, cfm] {
        const auto& cf_name = cfm->cf_name();
        const auto& ks_name = cfm->ks_name();
        _listeners.thread_for_each([&ks_name, &cf_name] (migration_listener* listener) {
            try {
                listener->on_drop_column_family(ks_name, cf_name);
            } catch (...) {
                mlogger.warn("Drop column family notification failed {}.{}: {}", ks_name, cf_name, std::current_exception());
            }
        });
    });
}

future<> migration_notifier::drop_user_type(const user_type& type) {
    return seastar::async([this, type] {
        auto&& ks_name = type->_keyspace;
        auto&& type_name = type->get_name_as_string();
        _listeners.thread_for_each([&ks_name, &type_name] (migration_listener* listener) {
            try {
                listener->on_drop_user_type(ks_name, type_name);
            } catch (...) {
                mlogger.warn("Drop user type notification failed {}.{}: {}", ks_name, type_name, std::current_exception());
            }
        });
    });
}

future<> migration_notifier::drop_view(const view_ptr& view) {
    return seastar::async([this, view] {
        auto&& ks_name = view->ks_name();
        auto&& view_name = view->cf_name();
        _listeners.thread_for_each([&ks_name, &view_name] (migration_listener* listener) {
            try {
                listener->on_drop_view(ks_name, view_name);
            } catch (...) {
                mlogger.warn("Drop view notification failed {}.{}: {}", ks_name, view_name, std::current_exception());
            }
        });
    });
}

void migration_notifier::before_create_column_family(const schema& schema,
        std::vector<mutation>& mutations, api::timestamp_type timestamp) {
    _listeners.thread_for_each([&mutations, &schema, timestamp] (migration_listener* listener) {
        // allow exceptions. so a listener can effectively kill a create-table
        listener->on_before_create_column_family(schema, mutations, timestamp);
    });
}

void migration_notifier::before_update_column_family(const schema& new_schema,
        const schema& old_schema, std::vector<mutation>& mutations, api::timestamp_type ts) {
    _listeners.thread_for_each([&mutations, &new_schema, &old_schema, ts] (migration_listener* listener) {
        // allow exceptions. so a listener can effectively kill an update-column
        listener->on_before_update_column_family(new_schema, old_schema, mutations, ts);
    });
}

void migration_notifier::before_drop_column_family(const schema& schema,
        std::vector<mutation>& mutations, api::timestamp_type ts) {
    _listeners.thread_for_each([&mutations, &schema, ts] (migration_listener* listener) {
        // allow exceptions. so a listener can effectively kill a drop-column
        listener->on_before_drop_column_family(schema, mutations, ts);
    });
}


#if 0
public void notifyDropFunction(UDFunction udf)
{
    for (IMigrationListener listener : listeners)
        listener.onDropFunction(udf.name().keyspace, udf.name().name);
}

public void notifyDropAggregate(UDAggregate udf)
{
    for (IMigrationListener listener : listeners)
        listener.onDropAggregate(udf.name().keyspace, udf.name().name);
}
#endif

std::vector<mutation> migration_manager::prepare_keyspace_update_announcement(lw_shared_ptr<keyspace_metadata> ksm, api::timestamp_type ts) {
    auto& proxy = _storage_proxy;
    auto& db = proxy.get_db().local();

    db.validate_keyspace_update(*ksm);
    mlogger.info("Update Keyspace: {}", ksm);
    return db::schema_tables::make_create_keyspace_mutations(db.features().cluster_schema_features(), ksm, ts);
}

std::vector<mutation> migration_manager::prepare_new_keyspace_announcement(lw_shared_ptr<keyspace_metadata> ksm, api::timestamp_type timestamp) {
    auto& proxy = _storage_proxy;
    auto& db = proxy.get_db().local();

    db.validate_new_keyspace(*ksm);
    mlogger.info("Create new Keyspace: {}", ksm);
    return db::schema_tables::make_create_keyspace_mutations(db.features().cluster_schema_features(), ksm, timestamp);
}

future<std::vector<mutation>> migration_manager::include_keyspace(
        const keyspace_metadata& keyspace, std::vector<mutation> mutations) {
    // Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
    mutation m = co_await db::schema_tables::read_keyspace_mutation(_storage_proxy.container(), keyspace.name());
    mutations.push_back(std::move(m));
    co_return std::move(mutations);
}

future<std::vector<mutation>> migration_manager::prepare_new_column_family_announcement(schema_ptr cfm, api::timestamp_type timestamp) {
#if 0
    cfm.validate();
#endif
    try {
        auto& db = _storage_proxy.get_db().local();
        auto&& keyspace = db.find_keyspace(cfm->ks_name());
        if (db.has_schema(cfm->ks_name(), cfm->cf_name())) {
            throw exceptions::already_exists_exception(cfm->ks_name(), cfm->cf_name());
        }
        if (db.column_family_exists(cfm->id())) {
            throw exceptions::invalid_request_exception(format("Table with ID {} already exists: {}", cfm->id(), db.find_schema(cfm->id())));
        }

        mlogger.info("Create new ColumnFamily: {}", cfm);

        auto ksm = keyspace.metadata();
        return seastar::async([this, cfm, timestamp, ksm] {
            auto mutations = db::schema_tables::make_create_table_mutations(cfm, timestamp);
            get_notifier().before_create_column_family(*cfm, mutations, timestamp);
            return mutations;
        }).then([this, ksm](std::vector<mutation> mutations) {
            return include_keyspace(*ksm, std::move(mutations));
        });
    } catch (const replica::no_such_keyspace& e) {
        throw exceptions::configuration_exception(format("Cannot add table '{}' to non existing keyspace '{}'.", cfm->cf_name(), cfm->ks_name()));
    }
}

future<std::vector<mutation>> migration_manager::prepare_column_family_update_announcement(schema_ptr cfm, bool from_thrift, std::vector<view_ptr> view_updates, api::timestamp_type ts) {
    warn(unimplemented::cause::VALIDATION);
#if 0
    cfm.validate();
#endif
    try {
        auto& db = _storage_proxy.get_db().local();
        auto&& old_schema = db.find_column_family(cfm->ks_name(), cfm->cf_name()).schema(); // FIXME: Should we lookup by id?
#if 0
        oldCfm.validateCompatility(cfm);
#endif
        mlogger.info("Update table '{}.{}' From {} To {}", cfm->ks_name(), cfm->cf_name(), *old_schema, *cfm);
        auto&& keyspace = db.find_keyspace(cfm->ks_name()).metadata();

        auto mutations = db::schema_tables::make_update_table_mutations(db, keyspace, old_schema, cfm, ts, from_thrift);
        for (auto&& view : view_updates) {
            auto& old_view = keyspace->cf_meta_data().at(view->cf_name());
            mlogger.info("Update view '{}.{}' From {} To {}", view->ks_name(), view->cf_name(), *old_view, *view);
            auto view_mutations = db::schema_tables::make_update_view_mutations(keyspace, view_ptr(old_view), std::move(view), ts, false);
            std::move(view_mutations.begin(), view_mutations.end(), std::back_inserter(mutations));
            co_await coroutine::maybe_yield();
        }
        co_await seastar::async([&] {
            get_notifier().before_update_column_family(*cfm, *old_schema, mutations, ts);
        });
        co_return co_await include_keyspace(*keyspace, std::move(mutations));
    } catch (const replica::no_such_column_family& e) {
        co_return coroutine::make_exception(exceptions::configuration_exception(format("Cannot update non existing table '{}' in keyspace '{}'.",
                                                         cfm->cf_name(), cfm->ks_name())));
    }
}

future<std::vector<mutation>> migration_manager::do_prepare_new_type_announcement(user_type new_type, api::timestamp_type ts) {
    auto& db = _storage_proxy.get_db().local();
    auto&& keyspace = db.find_keyspace(new_type->_keyspace);
    auto mutations = db::schema_tables::make_create_type_mutations(keyspace.metadata(), new_type, ts);
    return include_keyspace(*keyspace.metadata(), std::move(mutations));
}

future<std::vector<mutation>> migration_manager::prepare_new_type_announcement(user_type new_type, api::timestamp_type ts) {
    mlogger.info("Prepare Create new User Type: {}", new_type->get_name_as_string());
    return do_prepare_new_type_announcement(std::move(new_type), ts);
}

future<std::vector<mutation>> migration_manager::prepare_update_type_announcement(user_type updated_type, api::timestamp_type ts) {
    mlogger.info("Prepare Update User Type: {}", updated_type->get_name_as_string());
    return do_prepare_new_type_announcement(updated_type, ts);
}

future<std::vector<mutation>> migration_manager::prepare_new_function_announcement(shared_ptr<cql3::functions::user_function> func, api::timestamp_type ts) {
    auto& db = _storage_proxy.get_db().local();
    auto&& keyspace = db.find_keyspace(func->name().keyspace);
    auto mutations = db::schema_tables::make_create_function_mutations(func, ts);
    return include_keyspace(*keyspace.metadata(), std::move(mutations));
}

future<std::vector<mutation>> migration_manager::prepare_function_drop_announcement(shared_ptr<cql3::functions::user_function> func, api::timestamp_type ts) {
    auto& db = _storage_proxy.get_db().local();
    auto&& keyspace = db.find_keyspace(func->name().keyspace);
    auto mutations = db::schema_tables::make_drop_function_mutations(func, ts);
    return include_keyspace(*keyspace.metadata(), std::move(mutations));
}

future<std::vector<mutation>> migration_manager::prepare_new_aggregate_announcement(shared_ptr<cql3::functions::user_aggregate> aggregate, api::timestamp_type ts) {
    auto& db = _storage_proxy.get_db().local();
    auto&& keyspace = db.find_keyspace(aggregate->name().keyspace);
    auto mutations = db::schema_tables::make_create_aggregate_mutations(aggregate, ts);
    return include_keyspace(*keyspace.metadata(), std::move(mutations));
}

future<std::vector<mutation>> migration_manager::prepare_aggregate_drop_announcement(shared_ptr<cql3::functions::user_aggregate> aggregate, api::timestamp_type ts) {
    auto& db = _storage_proxy.get_db().local();
    auto&& keyspace = db.find_keyspace(aggregate->name().keyspace);
    auto mutations = db::schema_tables::make_drop_aggregate_mutations(aggregate, ts);
    return include_keyspace(*keyspace.metadata(), std::move(mutations));
}

std::vector<mutation> migration_manager::prepare_keyspace_drop_announcement(const sstring& ks_name, api::timestamp_type ts) {
    auto& db = _storage_proxy.get_db().local();
    if (!db.has_keyspace(ks_name)) {
        throw exceptions::configuration_exception(format("Cannot drop non existing keyspace '{}'.", ks_name));
    }
    auto& keyspace = db.find_keyspace(ks_name);
    mlogger.info("Drop Keyspace '{}'", ks_name);
    return db::schema_tables::make_drop_keyspace_mutations(db.features().cluster_schema_features(), keyspace.metadata(), ts);
}

future<std::vector<mutation>> migration_manager::prepare_column_family_drop_announcement(const sstring& ks_name,
                    const sstring& cf_name, api::timestamp_type ts, drop_views drop_views) {
    try {
        auto& db = _storage_proxy.get_db().local();
        auto& old_cfm = db.find_column_family(ks_name, cf_name);
        auto& schema = old_cfm.schema();
        if (schema->is_view()) {
            co_return coroutine::make_exception(exceptions::invalid_request_exception("Cannot use DROP TABLE on Materialized View"));
        }
        auto keyspace = db.find_keyspace(ks_name).metadata();

        // If drop_views is false (the default), we don't allow to delete a
        // table which has views which aren't part of an index. If drop_views
        // is true, we delete those views as well.
        auto&& views = old_cfm.views();
        if (!drop_views && views.size() > schema->all_indices().size()) {
            auto explicit_view_names = views
                                    | boost::adaptors::filtered([&old_cfm](const view_ptr& v) { return !old_cfm.get_index_manager().is_index(v); })
                                    | boost::adaptors::transformed([](const view_ptr& v) { return v->cf_name(); });
            co_return coroutine::make_exception(exceptions::invalid_request_exception(format("Cannot drop table when materialized views still depend on it ({}.{{{}}})",
                        schema->ks_name(), ::join(", ", explicit_view_names))));
        }
        mlogger.info("Drop table '{}.{}'", schema->ks_name(), schema->cf_name());

        std::vector<mutation> drop_si_mutations;
        if (!schema->all_indices().empty()) {
            auto builder = schema_builder(schema).without_indexes();
            drop_si_mutations = db::schema_tables::make_update_table_mutations(db, keyspace, schema, builder.build(), ts, false);
        }
        auto mutations = db::schema_tables::make_drop_table_mutations(keyspace, schema, ts);
        mutations.insert(mutations.end(), std::make_move_iterator(drop_si_mutations.begin()), std::make_move_iterator(drop_si_mutations.end()));
        for (auto& v : views) {
            if (!old_cfm.get_index_manager().is_index(v)) {
                mlogger.info("Drop view '{}.{}' of table '{}'", v->ks_name(), v->cf_name(), schema->cf_name());
                auto m = db::schema_tables::make_drop_view_mutations(keyspace, v, ts);
                mutations.insert(mutations.end(), std::make_move_iterator(m.begin()), std::make_move_iterator(m.end()));
            }
        }

        // notifiers must run in seastar thread
        co_await seastar::async([&] {
            get_notifier().before_drop_column_family(*schema, mutations, ts);
        });
        co_return co_await include_keyspace(*keyspace, std::move(mutations));
    } catch (const replica::no_such_column_family& e) {
        co_return coroutine::make_exception(exceptions::configuration_exception(format("Cannot drop non existing table '{}' in keyspace '{}'.", cf_name, ks_name)));
    }
}

future<std::vector<mutation>> migration_manager::prepare_type_drop_announcement(user_type dropped_type, api::timestamp_type ts) {
    auto& db = _storage_proxy.get_db().local();
    auto&& keyspace = db.find_keyspace(dropped_type->_keyspace);
    mlogger.info("Drop User Type: {}", dropped_type->get_name_as_string());
    auto mutations =
            db::schema_tables::make_drop_type_mutations(keyspace.metadata(), dropped_type, ts);
    return include_keyspace(*keyspace.metadata(), std::move(mutations));
}

future<std::vector<mutation>> migration_manager::prepare_new_view_announcement(view_ptr view, api::timestamp_type ts) {
#if 0
    view.metadata.validate();
#endif
    auto& db = _storage_proxy.get_db().local();
    try {
        auto&& keyspace = db.find_keyspace(view->ks_name()).metadata();
        if (keyspace->cf_meta_data().contains(view->cf_name())) {
            throw exceptions::already_exists_exception(view->ks_name(), view->cf_name());
        }
        mlogger.info("Create new view: {}", view);
        auto mutations = db::schema_tables::make_create_view_mutations(keyspace, std::move(view), ts);
        co_return co_await include_keyspace(*keyspace, std::move(mutations));
    } catch (const replica::no_such_keyspace& e) {
        co_return coroutine::make_exception(exceptions::configuration_exception(format("Cannot add view '{}' to non existing keyspace '{}'.", view->cf_name(), view->ks_name())));
    }
}

future<std::vector<mutation>> migration_manager::prepare_view_update_announcement(view_ptr view, api::timestamp_type ts) {
#if 0
    view.metadata.validate();
#endif
    auto db = _storage_proxy.data_dictionary();
    try {
        auto&& keyspace = db.find_keyspace(view->ks_name()).metadata();
        auto& old_view = keyspace->cf_meta_data().at(view->cf_name());
        if (!old_view->is_view()) {
            co_return coroutine::make_exception(exceptions::invalid_request_exception("Cannot use ALTER MATERIALIZED VIEW on Table"));
        }
#if 0
        oldCfm.validateCompatility(cfm);
#endif
        mlogger.info("Update view '{}.{}' From {} To {}", view->ks_name(), view->cf_name(), *old_view, *view);
        auto mutations = db::schema_tables::make_update_view_mutations(keyspace, view_ptr(old_view), std::move(view), ts, true);
        co_return co_await include_keyspace(*keyspace, std::move(mutations));
    } catch (const std::out_of_range& e) {
        co_return coroutine::make_exception(exceptions::configuration_exception(format("Cannot update non existing materialized view '{}' in keyspace '{}'.",
                                                         view->cf_name(), view->ks_name())));
    }
}

future<std::vector<mutation>> migration_manager::prepare_view_drop_announcement(const sstring& ks_name, const sstring& cf_name, api::timestamp_type ts) {
    auto& db = _storage_proxy.get_db().local();
    try {
        auto& view = db.find_column_family(ks_name, cf_name).schema();
        if (!view->is_view()) {
            throw exceptions::invalid_request_exception("Cannot use DROP MATERIALIZED VIEW on Table");
        }
        if (db.find_column_family(view->view_info()->base_id()).get_index_manager().is_index(view_ptr(view))) {
            throw exceptions::invalid_request_exception("Cannot use DROP MATERIALIZED VIEW on Index");
        }
        auto keyspace = db.find_keyspace(ks_name).metadata();
        mlogger.info("Drop view '{}.{}'", view->ks_name(), view->cf_name());
        auto mutations = db::schema_tables::make_drop_view_mutations(keyspace, view_ptr(std::move(view)), ts);
        return include_keyspace(*keyspace, std::move(mutations));
    } catch (const replica::no_such_column_family& e) {
        throw exceptions::configuration_exception(format("Cannot drop non existing materialized view '{}' in keyspace '{}'.",
                                                         cf_name, ks_name));
    }
}

future<> migration_manager::push_schema_mutation(const gms::inet_address& endpoint, const std::vector<mutation>& schema)
{
    netw::messaging_service::msg_addr id{endpoint, 0};
    auto schema_features = _feat.cluster_schema_features();
    auto adjusted_schema = db::schema_tables::adjust_schema_for_schema_features(schema, schema_features);
    auto fm = std::vector<frozen_mutation>(adjusted_schema.begin(), adjusted_schema.end());
    auto cm = std::vector<canonical_mutation>(adjusted_schema.begin(), adjusted_schema.end());
    return _messaging.send_definitions_update(id, std::move(fm), std::move(cm));
}

/* *** Linearizing group 0 operations ***
 *
 * Group 0 changes (e.g. schema changes) are performed through Raft commands, which are executing in the same order
 * on every node, according to the order they appear in the Raft log
 * (executing a command happens in `group0_state_machine::apply`).
 * The commands contain mutations which modify tables that store group 0 state.
 *
 * However, constructing these mutations often requires reading the current state and validating the change against it.
 * This happens outside the code which applies the commands in order and may race with it. At the moment of applying
 * a command, the mutations stored within may be 'invalid' because a different command managed to be concurrently applied,
 * changing the state.
 *
 * For example, consider the sequence of commands:
 *
 * C1, C2, C3.
 *
 * Suppose that mutations inside C2 were constructed on a node which already applied C1. Thus, when applying C2,
 * the state of group 0 is the same as when the change was validated and its mutations were constructed.
 *
 * On the other hand, suppose that mutations inside C3 were also constructed on a node which applied C1, but didn't
 * apply C2 yet. This could easily happen e.g. when C2 and C3 were constructed concurrently on two different nodes.
 * Thus, when applying C3, the state of group 0 is different than it was when validating the change and constructing
 * its mutations: the state consists of the changes from C1 and C2, but when C3 was created, it used the state consisting
 * of changes from C1 (but not C2). Thus the mutations in C3 are not valid and we must not apply them.
 *
 * To protect ourselves from applying such 'obsolete' changes, we detect such commands during `group0_state_machine:apply`
 * and skip their mutations.
 *
 * For this, group 0 state was extended with a 'history table' (system.group0_history), which stores a sequence of
 * 'group 0 state IDs' (which are timeuuids). Each group 0 command also holds a unique state ID; if the command is successful,
 * the ID is appended to the history table. Each command also stores a 'previous state ID'; the change described by the command
 * is only applied when this 'previous state ID' is equal to the last state ID in the history table. If it's different,
 * we skip the change.
 *
 * To perform a group 0 change the user must first read the last state ID from the history table. This happens by obtaining
 * a `group0_guard` through `migration_manager::start_group0_operation`; the observed last state ID is stored in
 * `_observed_group0_state_id`. `start_group0_operation` also generates a new state ID for this change and stores it in
 * `_new_group0_state_id`. We ensure that the new state ID is greater than the observed state ID (in timeuuid order).
 *
 * The user then reads group 0 state, validates the change against the observed state, and constructs the mutations
 * which modify group 0 state. Finally, the user calls `announce`, passing the mutations and the guard.
 *
 * `announce` constructs a command for the group 0 state machine. The command stores the mutations and the state IDs.
 *
 * When the command is applied, we compare the stored observed state ID against the last state ID in the history table.
 * If it's the same, that means no change happened in between - no other command managed to 'sneak in' between the moment
 * the user started the operation and the moment the command was applied.
 *
 * The user must use `group0_guard::write_timestamp()` when constructing the mutations. The timestamp is extracted
 * from the new state ID. This ensures that mutations applied by successful commands have monotonic timestamps.
 * Indeed: the state IDs of successful commands are increasing (the previous state ID of a command that is successful
 * is equal to the new state ID of the previous successful command, and we ensure that the new state ID of a command
 * is greater than the previous state ID of this command).
 *
 * To perform a linearized group 0 read the user must also obtain a `group0_guard`. This ensures that all previously
 * completed changes are visible on this node, as obtaining the guard requires performing a Raft read barrier.
 *
 * Furthermore, obtaining the guard ensures that we don't read partial state, since it holds a lock that is also taken
 * during command application (`_read_apply_mutex_holder`). The lock is released just before sending the command to Raft.
 * TODO: we may still read partial state if we crash in the middle of command application.
 * See `group0_state_machine::apply` for a proposed fix.
 *
 * Obtaining the guard also ensures that there is no concurrent group 0 operation running on this node using another lock
 * (`_operation_mutex_holder`); if we allowed multiple concurrent operations to run, some of them could fail
 * due to the state ID protection. Concurrent operations may still run on different nodes. This lock is thus used
 * for improving liveness of operations running on the same node by serializing them.
 */
struct group0_guard::impl {
    semaphore_units<> _operation_mutex_holder;
    semaphore_units<> _read_apply_mutex_holder;

    utils::UUID _observed_group0_state_id;
    utils::UUID _new_group0_state_id;

    impl(const impl&) = delete;
    impl& operator=(const impl&) = delete;

    impl(semaphore_units<> operation_mutex_holder, semaphore_units<> read_apply_mutex_holder, utils::UUID observed_group0_state_id, utils::UUID new_group0_state_id)
        : _operation_mutex_holder(std::move(operation_mutex_holder)), _read_apply_mutex_holder(std::move(read_apply_mutex_holder)),
          _observed_group0_state_id(observed_group0_state_id), _new_group0_state_id(new_group0_state_id)
    {}

    void release_read_apply_mutex() {
        assert(_read_apply_mutex_holder.count() == 1);
        _read_apply_mutex_holder.return_units(1);
    }
};

group0_guard::group0_guard(std::unique_ptr<impl> p) : _impl(std::move(p)) {}

group0_guard::~group0_guard() = default;

group0_guard::group0_guard(group0_guard&&) noexcept = default;

utils::UUID group0_guard::observed_group0_state_id() const {
    return _impl->_observed_group0_state_id;
}

utils::UUID group0_guard::new_group0_state_id() const {
    return _impl->_new_group0_state_id;
}

api::timestamp_type group0_guard::write_timestamp() const {
    return utils::UUID_gen::micros_timestamp(_impl->_new_group0_state_id);
}

future<> migration_manager::announce_with_raft(std::vector<mutation> schema, group0_guard guard, std::string_view description) {
    assert(this_shard_id() == 0);
    auto schema_features = _feat.cluster_schema_features();
    auto adjusted_schema = db::schema_tables::adjust_schema_for_schema_features(schema, schema_features);

    group0_command group0_cmd {
        .change{schema_change{
            .mutations{adjusted_schema.begin(), adjusted_schema.end()},
        }},

        .history_append{db::system_keyspace::make_group0_history_state_id_mutation(
                guard.new_group0_state_id(), _group0_history_gc_duration, description)},

        // IMPORTANT: the retry mechanism below assumes that `prev_state_id` is engaged (not nullopt).
        // Here it is: the return type of `guard.observerd_group0_state_id()` is `utils::UUID`.
        .prev_state_id{guard.observed_group0_state_id()},
        .new_state_id{guard.new_group0_state_id()},

        .creator_addr{utils::fb_utilities::get_broadcast_address()},
        .creator_id{_raft_gr.group0().id()},
    };
    raft::command cmd;
    ser::serialize(cmd, group0_cmd);

    // Release the read_apply mutex so `group0_state_machine::apply` can take it.
    guard._impl->release_read_apply_mutex();

    bool retry;
    do {
        retry = false;
        try {
            co_await _raft_gr.group0().add_entry(cmd, raft::wait_type::applied, &_as);
        } catch (const raft::dropped_entry& e) {
            mlogger.warn("`announce_with_raft`: `add_entry` returned \"{}\". Retrying the command (prev_state_id: {}, new_state_id: {})",
                    e, group0_cmd.prev_state_id, group0_cmd.new_state_id);
            retry = true;
        } catch (const raft::commit_status_unknown& e) {
            mlogger.warn("`announce_with_raft`: `add_entry` returned \"{}\". Retrying the command (prev_state_id: {}, new_state_id: {})",
                    e, group0_cmd.prev_state_id, group0_cmd.new_state_id);
            retry = true;
        } catch (const raft::not_a_leader& e) {
            // This should not happen since follower-to-leader entry forwarding is enabled in group 0.
            // Just fail the operation by propagating the error.
            mlogger.error("`announce_with_raft`: unexpected `not_a_leader` error: \"{}\". Please file an issue.", e);
            throw;
        }

        // Thanks to the `prev_state_id` check in `group0_state_machine::apply`, the command is idempotent.
        // It's safe to retry it, even if it means it will be applied multiple times; only the first time
        // can have an effect.
    } while (retry);

    // dropping the guard releases `_group0_operation_mutex`, allowing other operations
    // on this node to proceed
}

future<> migration_manager::announce_without_raft(std::vector<mutation> schema) {
    auto f = db::schema_tables::merge_schema(_sys_ks, _storage_proxy.container(), _feat, schema);

    try {
        using namespace std::placeholders;
        auto all_live = _gossiper.get_live_members();
        auto live_members = all_live | boost::adaptors::filtered([this] (const gms::inet_address& endpoint) {
            // only push schema to nodes with known and equal versions
            return endpoint != utils::fb_utilities::get_broadcast_address() &&
                _messaging.knows_version(endpoint) &&
                _messaging.get_raw_version(endpoint) == netw::messaging_service::current_version;
        });
        co_await parallel_for_each(live_members.begin(), live_members.end(),
            std::bind(std::mem_fn(&migration_manager::push_schema_mutation), this, std::placeholders::_1, schema));
    } catch (...) {
        mlogger.error("failed to announce migration to all nodes: {}", std::current_exception());
    }

    co_return co_await std::move(f);
}

// Returns a future on the local application of the schema
future<> migration_manager::announce(std::vector<mutation> schema, group0_guard guard, std::string_view description) {
    if (_raft_gr.is_enabled()) {
        if (this_shard_id() != 0) {
            // This should not happen since all places which construct `group0_guard` also check that they are on shard 0.
            // Note: `group0_guard::impl` is private to this module, making this easy to verify.
            on_internal_error(mlogger, "announce: must run on shard 0");
        }

        auto new_group0_state_id = guard.new_group0_state_id();
        co_await announce_with_raft(std::move(schema), std::move(guard), std::move(description));

        if (!(co_await db::system_keyspace::group0_history_contains(new_group0_state_id))) {
            // The command was applied but the history table does not contain the new group 0 state ID.
            // This means `apply` skipped the change due to previous state ID mismatch.
            throw group0_concurrent_modification{};
        }
    } else {
        co_await announce_without_raft(std::move(schema));
    }
}

static utils::UUID generate_group0_state_id(utils::UUID prev_state_id) {
    auto ts = api::new_timestamp();
    if (prev_state_id != utils::UUID{}) {
        auto lower_bound = utils::UUID_gen::micros_timestamp(prev_state_id);
        if (ts <= lower_bound) {
            ts = lower_bound + 1;
        }
    }
    return utils::UUID_gen::get_random_time_UUID_from_micros(std::chrono::microseconds{ts});
}

future<group0_guard> migration_manager::start_group0_operation() {
    if (_raft_gr.is_enabled()) {
        if (this_shard_id() != 0) {
            on_internal_error(mlogger, "start_group0_operation: must run on shard 0");
        }

        auto operation_holder = co_await get_units(_group0_operation_mutex, 1);
        co_await _raft_gr.group0().read_barrier(&_as);

        // Take `_group0_read_apply_mutex` *after* read barrier.
        // Read barrier may wait for `group0_state_machine::apply` which also takes this mutex.
        auto read_apply_holder = co_await get_units(_group0_read_apply_mutex, 1);

        auto observed_group0_state_id = co_await db::system_keyspace::get_last_group0_state_id();
        auto new_group0_state_id = generate_group0_state_id(observed_group0_state_id);

        co_return group0_guard {
            std::make_unique<group0_guard::impl>(
                std::move(operation_holder),
                std::move(read_apply_holder),
                observed_group0_state_id,
                new_group0_state_id
            )
        };
    }

    co_return group0_guard {
        std::make_unique<group0_guard::impl>(
            semaphore_units<>{},
            semaphore_units<>{},
            utils::UUID{},
            generate_group0_state_id(utils::UUID{})
        )
    };
}

/**
 * Announce my version passively over gossip.
 * Used to notify nodes as they arrive in the cluster.
 *
 * @param version The schema version to announce
 */
void migration_manager::passive_announce(utils::UUID version) {
    _schema_version_to_publish = version;
    (void)_schema_push.trigger().handle_exception([version = std::move(version)] (std::exception_ptr ex) {
        mlogger.warn("Passive announcing of version {} failed: {}. Ignored.", version);
    });
}

future<> migration_manager::passive_announce() {
    assert(this_shard_id() == 0);
    mlogger.debug("Gossiping my schema version {}", _schema_version_to_publish);
    return _gossiper.add_local_application_state(gms::application_state::SCHEMA, gms::versioned_value::schema(_schema_version_to_publish));
}

#if 0
/**
 * Clear all locally stored schema information and reset schema to initial state.
 * Called by user (via JMX) who wants to get rid of schema disagreement.
 *
 * @throws IOException if schema tables truncation fails
 */
public static void resetLocalSchema() throws IOException
{
    mlogger.info("Starting local schema reset...");

    mlogger.debug("Truncating schema tables...");

    LegacySchemaTables.truncateSchemaTables();

    mlogger.debug("Clearing local schema keyspace definitions...");

    Schema.instance.clear();

    Set<InetAddress> liveEndpoints = Gossiper.instance.getLiveMembers();
    liveEndpoints.remove(FBUtilities.getBroadcastAddress());

    // force migration if there are nodes around
    for (InetAddress node : liveEndpoints)
    {
        if (shouldPullSchemaFrom(node))
        {
            mlogger.debug("Requesting schema from {}", node);
            FBUtilities.waitOnFuture(submitMigrationTask(node));
            break;
        }
    }

    mlogger.info("Local schema reset is complete.");
}

public static class MigrationsSerializer implements IVersionedSerializer<Collection<Mutation>>
{
    public static MigrationsSerializer instance = new MigrationsSerializer();

    public void serialize(Collection<Mutation> schema, DataOutputPlus out, int version) throws IOException
    {
        out.writeInt(schema.size());
        for (Mutation mutation : schema)
            Mutation.serializer.serialize(mutation, out, version);
    }

    public Collection<Mutation> deserialize(DataInput in, int version) throws IOException
    {
        int count = in.readInt();
        Collection<Mutation> schema = new ArrayList<>(count);

        for (int i = 0; i < count; i++)
            schema.add(Mutation.serializer.deserialize(in, version));

        return schema;
    }

    public long serializedSize(Collection<Mutation> schema, int version)
    {
        int size = TypeSizes.NATIVE.sizeof(schema.size());
        for (Mutation mutation : schema)
            size += Mutation.serializer.serializedSize(mutation, version);
        return size;
    }
}
#endif


// Ensure that given schema version 's' was synced with on current node. See schema::is_synced().
//
// The endpoint is the node from which 's' originated.
//
future<> migration_manager::maybe_sync(const schema_ptr& s, netw::messaging_service::msg_addr endpoint) {
    if (s->is_synced()) {
        return make_ready_future<>();
    }

    return s->registry_entry()->maybe_sync([this, s, endpoint] {
        // Serialize schema sync by always doing it on shard 0.
        if (this_shard_id() == 0) {
            mlogger.debug("Syncing schema of {}.{} (v={}) with {}", s->ks_name(), s->cf_name(), s->version(), endpoint);
            return merge_schema_from(endpoint);
        } else {
            return container().invoke_on(0, [gs = global_schema_ptr(s), endpoint] (migration_manager& local_mm) {
                schema_ptr s = gs.get();
                schema_registry_entry& e = *s->registry_entry();
                mlogger.debug("Syncing schema of {}.{} (v={}) with {}", s->ks_name(), s->cf_name(), s->version(), endpoint);
                return local_mm.merge_schema_from(endpoint);
            });
        }
    });
}

// Returns schema of given version, either from cache or from remote node identified by 'from'.
// Doesn't affect current node's schema in any way.
static future<schema_ptr> get_schema_definition(table_schema_version v, netw::messaging_service::msg_addr dst, netw::messaging_service& ms, service::storage_proxy& storage_proxy) {
    return local_schema_registry().get_or_load(v, [&ms, &storage_proxy, dst] (table_schema_version v) {
        mlogger.debug("Requesting schema {} from {}", v, dst);
        return ms.send_get_schema_version(dst, v).then([&storage_proxy] (frozen_schema s) {
            auto& proxy = storage_proxy.container();
            // Since the latest schema version is always present in the schema registry
            // we only happen to query already outdated schema version, which is
            // referenced by the incoming request.
            // That means the column mapping for the schema should always be inserted
            // with TTL (refresh TTL in case column mapping already existed prior to that).
            auto us = s.unfreeze(db::schema_ctxt(proxy));
            // if this is a view - we might need to fix it's schema before registering it.
            if (us->is_view()) {
                auto& db = proxy.local().local_db();
                schema_ptr base_schema = db.find_schema(us->view_info()->base_id());
                auto fixed_view = db::schema_tables::maybe_fix_legacy_secondary_index_mv_schema(db, view_ptr(us), base_schema,
                        db::schema_tables::preserve_version::yes);
                if (fixed_view) {
                    us = fixed_view;
                }
            }
            return db::schema_tables::store_column_mapping(proxy, us, true).then([us] {
                return frozen_schema{us};
            });
        });
    }).then([&storage_proxy] (schema_ptr s) {
        // If this is a view so this schema also needs a reference to the base
        // table.
        if (s->is_view()) {
            if (!s->view_info()->base_info()) {
                auto& db = storage_proxy.local_db();
                // This line might throw a no_such_column_family
                // It should be fine since if we tried to register a view for which
                // we don't know the base table, our registry is broken.
                schema_ptr base_schema = db.find_schema(s->view_info()->base_id());
                s->view_info()->set_base_info(s->view_info()->make_base_dependent_view_info(*base_schema));
            }
        }
        return s;
    });
}

future<schema_ptr> migration_manager::get_schema_for_read(table_schema_version v, netw::messaging_service::msg_addr dst, netw::messaging_service& ms) {
    return get_schema_for_write(v, dst, ms);
}

future<schema_ptr> migration_manager::get_schema_for_write(table_schema_version v, netw::messaging_service::msg_addr dst, netw::messaging_service& ms) {
    if (_as.abort_requested()) {
        return make_exception_future<schema_ptr>(abort_requested_exception());
    }

    return get_schema_definition(v, dst, ms, _storage_proxy).then([this, dst] (schema_ptr s) {
        return maybe_sync(s, dst).then([s] {
            return s;
        });
    });
}

future<> migration_manager::sync_schema(const replica::database& db, const std::vector<gms::inet_address>& nodes) {
    using schema_and_hosts = std::unordered_map<utils::UUID, std::vector<gms::inet_address>>;
    return do_with(schema_and_hosts(), db.get_version(), [this, &nodes] (schema_and_hosts& schema_map, utils::UUID& my_version) {
        return parallel_for_each(nodes, [this, &schema_map, &my_version] (const gms::inet_address& node) {
            return _messaging.send_schema_check(netw::msg_addr(node)).then([node, &schema_map, &my_version] (utils::UUID remote_version) {
                if (my_version != remote_version) {
                    schema_map[remote_version].emplace_back(node);
                }
            });
        }).then([this, &schema_map] {
            return parallel_for_each(schema_map, [this] (auto& x) {
                mlogger.debug("Pulling schema {} from {}", x.first, x.second.front());
                bool can_ignore_down_node = false;
                return submit_migration_task(x.second.front(), can_ignore_down_node);
            });
        });
    });
}

future<column_mapping> get_column_mapping(utils::UUID table_id, table_schema_version v) {
    schema_ptr s = local_schema_registry().get_or_null(v);
    if (s) {
        return make_ready_future<column_mapping>(s->get_column_mapping());
    }
    return db::schema_tables::get_column_mapping(table_id, v);
}

future<> migration_manager::on_join(gms::inet_address endpoint, gms::endpoint_state ep_state) {
    schedule_schema_pull(endpoint, ep_state);
    return make_ready_future();
}

future<> migration_manager::on_change(gms::inet_address endpoint, gms::application_state state, const gms::versioned_value& value) {
    if (state == gms::application_state::SCHEMA) {
        auto* ep_state = _gossiper.get_endpoint_state_for_endpoint_ptr(endpoint);
        if (!ep_state || _gossiper.is_dead_state(*ep_state)) {
            mlogger.debug("Ignoring state change for dead or unknown endpoint: {}", endpoint);
            return make_ready_future();
        }
        if (_storage_proxy.get_token_metadata_ptr()->is_member(endpoint)) {
            schedule_schema_pull(endpoint, *ep_state);
        }
    }
    return make_ready_future();
}

future<> migration_manager::on_alive(gms::inet_address endpoint, gms::endpoint_state state) {
    schedule_schema_pull(endpoint, state);
    return make_ready_future();
}

void migration_manager::set_group0_history_gc_duration(gc_clock::duration d) {
    _group0_history_gc_duration = d;
}

void migration_manager::set_concurrent_ddl_retries(size_t n) {
    _concurrent_ddl_retries = n;
}

semaphore& migration_manager::group0_operation_mutex() {
    return _group0_operation_mutex;
}

}