/* * Modified by ScyllaDB * Copyright (C) 2015-present ScyllaDB */ /* * SPDX-License-Identifier: (AGPL-3.0-or-later and Apache-2.0) */ #include "db/schema_tables.hh" #include "service/migration_manager.hh" #include "service/storage_proxy.hh" #include "gms/feature_service.hh" #include "partition_slice_builder.hh" #include "dht/i_partitioner.hh" #include "system_keyspace.hh" #include "query-result-set.hh" #include "query-result-writer.hh" #include "schema/schema_builder.hh" #include "map_difference.hh" #include "utils/UUID_gen.hh" #include "utils/to_string.hh" #include #include "log.hh" #include "frozen_schema.hh" #include "schema/schema_registry.hh" #include "mutation_query.hh" #include "system_keyspace.hh" #include "system_distributed_keyspace.hh" #include "cql3/query_processor.hh" #include "cql3/cql3_type.hh" #include "cql3/functions/functions.hh" #include "cql3/functions/user_function.hh" #include "cql3/functions/user_aggregate.hh" #include "cql3/expr/evaluate.hh" #include "cql3/expr/expr-utils.hh" #include "cql3/util.hh" #include "types/list.hh" #include "types/set.hh" #include "db/marshal/type_parser.hh" #include "db/config.hh" #include "db/extensions.hh" #include "utils/hashers.hh" #include #include #include #include #include #include #include #include #include #include #include #include #include #include "compaction/compaction_strategy.hh" #include "view_info.hh" #include "cql_type_parser.hh" #include "db/timeout_clock.hh" #include "replica/database.hh" #include "data_dictionary/user_types_metadata.hh" #include "index/target_parser.hh" #include "lang/lua.hh" #include "idl/mutation.dist.hh" #include "idl/mutation.dist.impl.hh" #include "db/system_keyspace.hh" #include "cql3/untyped_result_set.hh" #include "cql3/functions/user_aggregate.hh" #include "cql3/CqlParser.hpp" #include "cql3/expr/expression.hh" #include "cql3/column_identifier.hh" #include "cql3/column_specification.hh" #include "types/types.hh" #include "mutation/async_utils.hh" using namespace db; using namespace std::chrono_literals; static logging::logger diff_logger("schema_diff"); /** system.schema_* tables used to store keyspace/table/type attributes prior to C* 3.0 */ namespace db { namespace { const auto set_use_schema_commitlog = schema_builder::register_static_configurator([](const sstring& ks_name, const sstring& cf_name, schema_static_props& props) { if (ks_name == schema_tables::NAME) { props.enable_schema_commitlog(); } }); } schema_ctxt::schema_ctxt(const db::config& cfg, std::shared_ptr uts, const gms::feature_service& features, replica::database* db) : _db(db) , _features(features) , _extensions(cfg.extensions()) , _murmur3_partitioner_ignore_msb_bits(cfg.murmur3_partitioner_ignore_msb_bits()) , _schema_registry_grace_period(cfg.schema_registry_grace_period()) , _user_types(std::move(uts)) {} schema_ctxt::schema_ctxt(replica::database& db) : schema_ctxt(db.get_config(), db.as_user_types_storage(), db.features(), &db) {} schema_ctxt::schema_ctxt(distributed& db) : schema_ctxt(db.local()) {} schema_ctxt::schema_ctxt(distributed& proxy) : schema_ctxt(proxy.local().get_db()) {} namespace schema_tables { logging::logger slogger("schema_tables"); const sstring version = "3"; struct qualified_name { sstring keyspace_name; sstring table_name; qualified_name(sstring keyspace_name, sstring table_name) : keyspace_name(std::move(keyspace_name)) , table_name(std::move(table_name)) { } qualified_name(const schema_ptr& s) : keyspace_name(s->ks_name()) , table_name(s->cf_name()) { } auto operator<=>(const qualified_name&) const = default; }; static future read_table_mutations(distributed& proxy, const qualified_name& table, schema_ptr s); static future<> merge_tables_and_views(distributed& proxy, sharded& sys_ks, std::map&& tables_before, std::map&& tables_after, std::map&& views_before, std::map&& views_after, bool reload, bool has_tablet_mutations); struct [[nodiscard]] user_types_to_drop final { seastar::noncopyable_function ()> drop; }; static future merge_types(distributed& proxy, schema_result before, schema_result after); static future<> merge_functions(distributed& proxy, schema_result before, schema_result after); static future<> merge_aggregates(distributed& proxy, schema_result before, schema_result after, schema_result scylla_before, schema_result scylla_after); static future<> do_merge_schema(distributed&, sharded& sys_ks, std::vector, bool do_flush, bool reload); using computed_columns_map = std::unordered_map; static computed_columns_map get_computed_columns(const schema_mutations& sm); static std::vector create_columns_from_column_rows( const schema_ctxt& ctxt, const query::result_set& rows, const sstring& keyspace, const sstring& table, bool is_super, column_view_virtual is_view_virtual, const computed_columns_map& computed_columns); static std::vector create_indices_from_index_rows(const query::result_set& rows, const sstring& keyspace, const sstring& table); static index_metadata create_index_from_index_row(const query::result_set_row& row, sstring keyspace, sstring table); static void add_column_to_schema_mutation(schema_ptr, const column_definition&, api::timestamp_type, mutation&); static void add_computed_column_to_schema_mutation(schema_ptr, const column_definition&, api::timestamp_type, mutation&); static void add_index_to_schema_mutation(schema_ptr table, const index_metadata& index, api::timestamp_type timestamp, mutation& mutation); static void drop_column_from_schema_mutation(schema_ptr schema_table, schema_ptr table, const sstring& column_name, long timestamp, std::vector&); static void drop_index_from_schema_mutation(schema_ptr table, const index_metadata& column, long timestamp, std::vector& mutations); static future create_table_from_table_row( distributed&, const query::result_set_row&); static void prepare_builder_from_table_row(const schema_ctxt&, schema_builder&, const query::result_set_row&); using namespace v3; using days = std::chrono::duration>; static future<> save_system_schema_to_keyspace(cql3::query_processor& qp, const sstring & ksname) { auto ks = qp.db().find_keyspace(ksname); auto ksm = ks.metadata(); // delete old, possibly obsolete entries in schema tables co_await coroutine::parallel_for_each(all_table_names(schema_features::full()), [&qp, ksm] (sstring cf) -> future<> { auto deletion_timestamp = system_keyspace::schema_creation_timestamp() - 1; co_await qp.execute_internal(format("DELETE FROM {}.{} USING TIMESTAMP {} WHERE keyspace_name = ?", NAME, cf, deletion_timestamp), { ksm->name() }, cql3::query_processor::cache_internal::yes).discard_result(); }); { auto mvec = make_create_keyspace_mutations(qp.db().features().cluster_schema_features(), ksm, system_keyspace::schema_creation_timestamp(), true); co_await qp.proxy().mutate_locally(std::move(mvec), tracing::trace_state_ptr()); } } future<> save_system_schema(cql3::query_processor& qp) { co_await save_system_schema_to_keyspace(qp, schema_tables::NAME); // #2514 - make sure "system" is written to system_schema.keyspaces. co_await save_system_schema_to_keyspace(qp, system_keyspace::NAME); } namespace v3 { static constexpr auto schema_gc_grace = std::chrono::duration_cast(days(7)).count(); schema_ptr keyspaces() { static thread_local auto schema = [] { schema_builder builder(generate_legacy_id(NAME, KEYSPACES), NAME, KEYSPACES, // partition key {{"keyspace_name", utf8_type}}, // clustering key {}, // regular columns { {"durable_writes", boolean_type}, {"replication", map_type_impl::get_instance(utf8_type, utf8_type, false)}, }, // static columns {}, // regular column name type utf8_type, // comment "keyspace definitions" ); builder.set_gc_grace_seconds(schema_gc_grace); builder.with_version(system_keyspace::generate_schema_version(builder.uuid())); return builder.build(); }(); return schema; } schema_ptr scylla_keyspaces() { static thread_local auto schema = [] { schema_builder builder(generate_legacy_id(NAME, SCYLLA_KEYSPACES), NAME, SCYLLA_KEYSPACES, // partition key {{"keyspace_name", utf8_type}}, // clustering key {}, // regular columns { {"storage_type", utf8_type}, {"storage_options", map_type_impl::get_instance(utf8_type, utf8_type, false)}, {"initial_tablets", int32_type}, }, // static columns {}, // regular column name type utf8_type, // comment "scylla-specific information for keyspaces" ); builder.set_gc_grace_seconds(schema_gc_grace); builder.with_version(system_keyspace::generate_schema_version(builder.uuid())); return builder.build(); }(); return schema; } schema_ptr tables() { static thread_local auto schema = [] { schema_builder builder(generate_legacy_id(NAME, TABLES), NAME, TABLES, // partition key {{"keyspace_name", utf8_type}}, // clustering key {{"table_name", utf8_type}}, // regular columns { {"bloom_filter_fp_chance", double_type}, {"caching", map_type_impl::get_instance(utf8_type, utf8_type, false)}, {"comment", utf8_type}, {"compaction", map_type_impl::get_instance(utf8_type, utf8_type, false)}, {"compression", map_type_impl::get_instance(utf8_type, utf8_type, false)}, {"crc_check_chance", double_type}, // dclocal_read_repair_chance has been deprecated, preserved to be // backward compatible {"dclocal_read_repair_chance", double_type}, {"default_time_to_live", int32_type}, {"extensions", map_type_impl::get_instance(utf8_type, bytes_type, false)}, {"flags", set_type_impl::get_instance(utf8_type, false)}, // SUPER, COUNTER, DENSE, COMPOUND {"gc_grace_seconds", int32_type}, {"id", uuid_type}, {"max_index_interval", int32_type}, {"memtable_flush_period_in_ms", int32_type}, {"min_index_interval", int32_type}, // read_repair_chance has been deprecated, preserved to be backward // compatible {"read_repair_chance", double_type}, {"speculative_retry", utf8_type}, }, // static columns {}, // regular column name type utf8_type, // comment "table definitions" ); builder.set_gc_grace_seconds(schema_gc_grace); builder.with_version(system_keyspace::generate_schema_version(builder.uuid())); return builder.build(); }(); return schema; } // Holds Scylla-specific table metadata. schema_ptr scylla_tables(schema_features features) { static thread_local schema_ptr schemas[2]{}; bool has_group0_schema_versioning = features.contains(schema_feature::GROUP0_SCHEMA_VERSIONING); schema_ptr& s = schemas[has_group0_schema_versioning]; if (!s) { auto id = generate_legacy_id(NAME, SCYLLA_TABLES); auto sb = schema_builder(NAME, SCYLLA_TABLES, std::make_optional(id)) .with_column("keyspace_name", utf8_type, column_kind::partition_key) .with_column("table_name", utf8_type, column_kind::clustering_key) .with_column("version", uuid_type) .set_gc_grace_seconds(schema_gc_grace); // Each bit in `offset` denotes a different schema feature, // so different values of `offset` are used for different combinations of features. uint16_t offset = 0; // CDC_OPTIONS sb.with_column("cdc", map_type_impl::get_instance(utf8_type, utf8_type, false)); offset |= 0b1; // PER_TABLE_PARTITIONERS sb.with_column("partitioner", utf8_type); offset |= 0b10; // 0b100 reserved for Scylla Enterprise if (has_group0_schema_versioning) { // If true, this table's latest schema was committed by group 0. // In this case `version` column is non-null and will be used for `schema::version()` instead of calculating a hash. // // If false, this table's latest schema was committed outside group 0 (e.g. during RECOVERY mode). // In this case `version` is null and `schema::version()` will be a hash. // // If null, this is either a system table, or the latest schema was committed // before the GROUP0_SCHEMA_VERSIONING feature was enabled (either inside or outside group 0). // In this case, for non-system tables, `version` is null and `schema::version()` will be a hash. sb.with_column("committed_by_group0", boolean_type); offset |= 0b1000; } sb.with_version(system_keyspace::generate_schema_version(id, offset)); s = sb.build(); } return s; } // The "columns" table lists the definitions of all columns in all tables // and views. Its schema needs to be identical to the one in Cassandra because // it is the API through which drivers inspect the list of columns in a table // (e.g., cqlsh's "DESCRIBE TABLE" and "DESCRIBE MATERIALIZED VIEW" get their // information from the columns table). // The "view_virtual_columns" table is an additional table with exactly the // same schema (both are created by columns_schema()), but has a separate // list of "virtual" columns. Those are used in materialized views for keeping // rows without data alive (see issue #3362). These virtual columns cannot be // listed in the regular "columns" table, otherwise the "DESCRIBE MATERIALIZED // VIEW" would list them - while it should only list real, selected, columns. static schema_ptr columns_schema(const char* columns_table_name) { schema_builder builder(generate_legacy_id(NAME, columns_table_name), NAME, columns_table_name, // partition key {{"keyspace_name", utf8_type}}, // clustering key {{"table_name", utf8_type},{"column_name", utf8_type}}, // regular columns { {"clustering_order", utf8_type}, {"column_name_bytes", bytes_type}, {"kind", utf8_type}, {"position", int32_type}, {"type", utf8_type}, }, // static columns {}, // regular column name type utf8_type, // comment "column definitions" ); builder.set_gc_grace_seconds(schema_gc_grace); builder.with_version(system_keyspace::generate_schema_version(builder.uuid())); return builder.build(); } schema_ptr columns() { static thread_local auto schema = columns_schema(COLUMNS); return schema; } schema_ptr view_virtual_columns() { static thread_local auto schema = columns_schema(VIEW_VIRTUAL_COLUMNS); return schema; } // Returns table ids of all schema tables which contribute to schema_mutations, // i.e. those which are used to define schema of a table or a view. // All such tables have a clustering key whose first column is the table name. static const std::unordered_set& schema_tables_holding_schema_mutations() { static const std::unordered_set table_ids = [] { std::unordered_set ids; for (auto&& s : { tables(), views(), columns(), view_virtual_columns(), computed_columns(), dropped_columns(), indexes(), scylla_tables(), db::system_keyspace::legacy::column_families(), db::system_keyspace::legacy::columns(), db::system_keyspace::legacy::triggers()}) { assert(s->clustering_key_size() > 0); auto&& first_column_name = s->clustering_column_at(0).name_as_text(); assert(first_column_name == "table_name" || first_column_name == "view_name" || first_column_name == "columnfamily_name"); ids.emplace(s->id()); } return ids; }(); return table_ids; }; // Computed columns are a special kind of columns. Rather than having their value provided directly // by the user, they are computed - possibly from other column values. This table stores which columns // for a given table are computed, and a serialized computation itself. Full column information is stored // in the `columns` table, this one stores only entries for computed columns, so it will be empty for tables // without any computed columns defined in the schema. `computation` is a serialized blob and its format // is defined in column_computation.hh and system_schema docs. // static schema_ptr computed_columns_schema(const char* columns_table_name) { schema_builder builder(generate_legacy_id(NAME, columns_table_name), NAME, columns_table_name, // partition key {{"keyspace_name", utf8_type}}, // clustering key {{"table_name", utf8_type}, {"column_name", utf8_type}}, // regular columns {{"computation", bytes_type}}, // static columns {}, // regular column name type utf8_type, // comment "computed columns" ); builder.set_gc_grace_seconds(schema_gc_grace); builder.with_version(system_keyspace::generate_schema_version(builder.uuid())); return builder.build(); } schema_ptr computed_columns() { static thread_local auto schema = computed_columns_schema(COMPUTED_COLUMNS); return schema; } schema_ptr dropped_columns() { static thread_local auto schema = [] { schema_builder builder(generate_legacy_id(NAME, DROPPED_COLUMNS), NAME, DROPPED_COLUMNS, // partition key {{"keyspace_name", utf8_type}}, // clustering key {{"table_name", utf8_type},{"column_name", utf8_type}}, // regular columns { {"dropped_time", timestamp_type}, {"type", utf8_type}, }, // static columns {}, // regular column name type utf8_type, // comment "dropped column registry" ); builder.set_gc_grace_seconds(schema_gc_grace); builder.with_version(system_keyspace::generate_schema_version(builder.uuid())); return builder.build(); }(); return schema; } schema_ptr triggers() { static thread_local auto schema = [] { schema_builder builder(generate_legacy_id(NAME, TRIGGERS), NAME, TRIGGERS, // partition key {{"keyspace_name", utf8_type}}, // clustering key {{"table_name", utf8_type},{"trigger_name", utf8_type}}, // regular columns { {"options", map_type_impl::get_instance(utf8_type, utf8_type, false)}, }, // static columns {}, // regular column name type utf8_type, // comment "trigger definitions" ); builder.set_gc_grace_seconds(schema_gc_grace); builder.with_version(system_keyspace::generate_schema_version(builder.uuid())); return builder.build(); }(); return schema; } schema_ptr views() { static thread_local auto schema = [] { schema_builder builder(generate_legacy_id(NAME, VIEWS), NAME, VIEWS, // partition key {{"keyspace_name", utf8_type}}, // clustering key {{"view_name", utf8_type}}, // regular columns { {"base_table_id", uuid_type}, {"base_table_name", utf8_type}, {"where_clause", utf8_type}, {"bloom_filter_fp_chance", double_type}, {"caching", map_type_impl::get_instance(utf8_type, utf8_type, false)}, {"comment", utf8_type}, {"compaction", map_type_impl::get_instance(utf8_type, utf8_type, false)}, {"compression", map_type_impl::get_instance(utf8_type, utf8_type, false)}, {"crc_check_chance", double_type}, // dclocal_read_repair_chance has been deprecated, preserved to be // backward compatible {"dclocal_read_repair_chance", double_type}, {"default_time_to_live", int32_type}, {"extensions", map_type_impl::get_instance(utf8_type, bytes_type, false)}, {"gc_grace_seconds", int32_type}, {"id", uuid_type}, {"include_all_columns", boolean_type}, {"max_index_interval", int32_type}, {"memtable_flush_period_in_ms", int32_type}, {"min_index_interval", int32_type}, // read_repair_chance has been deprecated, preserved to be backward // compatible {"read_repair_chance", double_type}, {"speculative_retry", utf8_type}, }, // static columns {}, // regular column name type utf8_type, // comment "view definitions" ); builder.set_gc_grace_seconds(schema_gc_grace); builder.with_version(system_keyspace::generate_schema_version(builder.uuid())); return builder.build(); }(); return schema; } schema_ptr indexes() { static thread_local auto schema = [] { schema_builder builder(generate_legacy_id(NAME, INDEXES), NAME, INDEXES, // partition key {{"keyspace_name", utf8_type}}, // clustering key {{"table_name", utf8_type},{"index_name", utf8_type}}, // regular columns { {"kind", utf8_type}, {"options", map_type_impl::get_instance(utf8_type, utf8_type, false)}, }, // static columns {}, // regular column name type utf8_type, // comment "secondary index definitions" ); builder.set_gc_grace_seconds(schema_gc_grace); builder.with_version(system_keyspace::generate_schema_version(builder.uuid())); return builder.build(); }(); return schema; } schema_ptr types() { static thread_local auto schema = [] { schema_builder builder(generate_legacy_id(NAME, TYPES), NAME, TYPES, // partition key {{"keyspace_name", utf8_type}}, // clustering key {{"type_name", utf8_type}}, // regular columns { {"field_names", list_type_impl::get_instance(utf8_type, false)}, {"field_types", list_type_impl::get_instance(utf8_type, false)}, }, // static columns {}, // regular column name type utf8_type, // comment "user defined type definitions" ); builder.set_gc_grace_seconds(schema_gc_grace); builder.with_version(system_keyspace::generate_schema_version(builder.uuid())); return builder.build(); }(); return schema; } schema_ptr functions() { static thread_local auto schema = [] { schema_builder builder(generate_legacy_id(NAME, FUNCTIONS), NAME, FUNCTIONS, // partition key {{"keyspace_name", utf8_type}}, // clustering key {{"function_name", utf8_type}, {"argument_types", list_type_impl::get_instance(utf8_type, false)}}, // regular columns { {"argument_names", list_type_impl::get_instance(utf8_type, false)}, {"body", utf8_type}, {"language", utf8_type}, {"return_type", utf8_type}, {"called_on_null_input", boolean_type}, }, // static columns {}, // regular column name type utf8_type, // comment "user defined function definitions" ); builder.set_gc_grace_seconds(schema_gc_grace); builder.with_version(system_keyspace::generate_schema_version(builder.uuid())); return builder.build(); }(); return schema; } schema_ptr aggregates() { static thread_local auto schema = [] { schema_builder builder(generate_legacy_id(NAME, AGGREGATES), NAME, AGGREGATES, // partition key {{"keyspace_name", utf8_type}}, // clustering key {{"aggregate_name", utf8_type}, {"argument_types", list_type_impl::get_instance(utf8_type, false)}}, // regular columns { {"final_func", utf8_type}, {"initcond", utf8_type}, {"return_type", utf8_type}, {"state_func", utf8_type}, {"state_type", utf8_type}, }, // static columns {}, // regular column name type utf8_type, // comment "user defined aggregate definitions" ); builder.set_gc_grace_seconds(schema_gc_grace); builder.with_version(system_keyspace::generate_schema_version(builder.uuid())); return builder.build(); }(); return schema; } schema_ptr scylla_aggregates() { static thread_local auto schema = [] { schema_builder builder(generate_legacy_id(NAME, SCYLLA_AGGREGATES), NAME, SCYLLA_AGGREGATES, // partition key {{"keyspace_name", utf8_type}}, // clustering key { {"aggregate_name", utf8_type}, {"argument_types", list_type_impl::get_instance(utf8_type, false)} }, //regular columns { {"reduce_func", utf8_type}, {"state_type", utf8_type}, }, //static columns, {}, // regular column name type utf8_type, // comment "scylla-specific information for user defined aggregates" ); builder.set_gc_grace_seconds(schema_gc_grace); builder.with_version(system_keyspace::generate_schema_version(builder.uuid())); return builder.build(); }(); return schema; } schema_ptr scylla_table_schema_history() { static thread_local auto s = [] { schema_builder builder(db::system_keyspace::NAME, SCYLLA_TABLE_SCHEMA_HISTORY, generate_legacy_id(db::system_keyspace::NAME, SCYLLA_TABLE_SCHEMA_HISTORY)); builder.with_column("cf_id", uuid_type, column_kind::partition_key); builder.with_column("schema_version", uuid_type, column_kind::clustering_key); builder.with_column("column_name", utf8_type, column_kind::clustering_key); builder.with_column("clustering_order", utf8_type); builder.with_column("column_name_bytes", bytes_type); builder.with_column("kind", utf8_type); builder.with_column("position", int32_type); builder.with_column("type", utf8_type); builder.set_comment("Scylla specific table to store a history of column mappings " "for each table schema version upon an CREATE TABLE/ALTER TABLE operations"); builder.with_version(system_keyspace::generate_schema_version(builder.uuid())); return builder.build(schema_builder::compact_storage::no); }(); return s; } } #if 0 public static void truncateSchemaTables() { for (String table : ALL) getSchemaCFS(table).truncateBlocking(); } private static void flushSchemaTables() { for (String table : ALL) SystemKeyspace.forceBlockingFlush(table); } #endif static mutation redact_columns_for_missing_features(mutation&& m, schema_features features) { return std::move(m); // The following code is needed if there are new schema features that require redaction. #if 0 if (m.schema()->cf_name() != SCYLLA_TABLES) { return std::move(m); } slogger.debug("adjusting schema_tables mutation due to possible in-progress cluster upgrade"); // The global schema ptr make sure it will be registered in the schema registry. global_schema_ptr redacted_schema{scylla_tables(features)}; m.upgrade(redacted_schema); return std::move(m); #endif } /** * Read schema from system keyspace and calculate MD5 digest of every row, resulting digest * will be converted into UUID which would act as content-based version of the schema. */ future calculate_schema_digest(distributed& proxy, schema_features features, noncopyable_function accept_keyspace) { auto map = [&proxy, features, accept_keyspace = std::move(accept_keyspace)] (sstring table) mutable -> future> { auto& db = proxy.local().get_db(); auto rs = co_await db::system_keyspace::query_mutations(db, NAME, table); auto s = db.local().find_schema(NAME, table); std::vector mutations; for (auto&& p : rs->partitions()) { auto mut = co_await unfreeze_gently(p.mut(), s); auto partition_key = value_cast(utf8_type->deserialize(mut.key().get_component(*s, 0))); if (!accept_keyspace(partition_key)) { continue; } mut = redact_columns_for_missing_features(std::move(mut), features); mutations.emplace_back(std::move(mut)); } co_return mutations; }; auto reduce = [features] (auto& hash, auto&& mutations) { for (const mutation& m : mutations) { feed_hash_for_schema_digest(hash, m, features); } }; auto hash = md5_hasher(); auto tables = all_table_names(features); { for (auto& table: tables) { auto mutations = co_await map(table); if (diff_logger.is_enabled(logging::log_level::trace)) { for (const mutation& m : mutations) { md5_hasher h; feed_hash_for_schema_digest(h, m, features); diff_logger.trace("Digest {} for {}, compacted={}", h.finalize(), m, compact_for_schema_digest(m)); } } reduce(hash, mutations); } co_return utils::UUID_gen::get_name_UUID(hash.finalize()); } } future calculate_schema_digest(distributed& proxy, schema_features features) { return calculate_schema_digest(proxy, features, std::not_fn(&is_system_keyspace)); } future> convert_schema_to_mutations(distributed& proxy, schema_features features) { auto map = [&proxy, features] (sstring table) -> future> { auto& db = proxy.local().get_db(); auto rs = co_await db::system_keyspace::query_mutations(db, NAME, table); auto s = db.local().find_schema(NAME, table); std::vector results; results.reserve(rs->partitions().size()); for (auto&& p : rs->partitions()) { auto mut = co_await unfreeze_gently(p.mut(), s); auto partition_key = value_cast(utf8_type->deserialize(mut.key().get_component(*s, 0))); if (is_system_keyspace(partition_key)) { continue; } mut = redact_columns_for_missing_features(std::move(mut), features); results.emplace_back(co_await make_canonical_mutation_gently(mut)); } co_return results; }; auto reduce = [] (auto&& result, auto&& mutations) { std::move(mutations.begin(), mutations.end(), std::back_inserter(result)); return std::move(result); }; co_return co_await map_reduce(all_table_names(features), map, std::vector{}, reduce); } std::vector adjust_schema_for_schema_features(std::vector schema, schema_features features) { for (auto& m : schema) { m = redact_columns_for_missing_features(std::move(m), features); } return schema; } future read_schema_for_keyspaces(distributed& proxy, const sstring& schema_table_name, const std::set& keyspace_names) { auto map = [&proxy, schema_table_name] (const sstring& keyspace_name) { return read_schema_partition_for_keyspace(proxy, schema_table_name, keyspace_name); }; auto insert = [] (schema_result&& result, auto&& schema_entity) { if (!schema_entity.second->empty()) { result.insert(std::move(schema_entity)); } return std::move(result); }; co_return co_await map_reduce(keyspace_names.begin(), keyspace_names.end(), map, schema_result{}, insert); } static future query_partition_mutation(service::storage_proxy& proxy, schema_ptr s, lw_shared_ptr cmd, partition_key pkey) { auto dk = dht::decorate_key(*s, pkey); auto range = dht::partition_range::make_singular(dk); auto res_hit_rate = co_await proxy.query_mutations_locally(s, std::move(cmd), range, db::no_timeout, tracing::trace_state_ptr{}); auto&& [res, hit_rate] = res_hit_rate; auto&& partitions = res->partitions(); if (partitions.size() == 0) { co_return mutation(s, std::move(dk)); } else if (partitions.size() == 1) { co_return co_await unfreeze_gently(partitions[0].mut(), s); } else { auto&& ex = std::make_exception_ptr(std::invalid_argument("Results must have at most one partition")); co_return coroutine::exception(std::move(ex)); } } future read_schema_partition_for_keyspace(distributed& proxy, sstring schema_table_name, sstring keyspace_name) { auto schema = proxy.local().get_db().local().find_schema(NAME, schema_table_name); auto keyspace_key = dht::decorate_key(*schema, partition_key::from_singular(*schema, keyspace_name)); auto rs = co_await db::system_keyspace::query(proxy.local().get_db(), NAME, schema_table_name, keyspace_key); co_return schema_result_value_type{keyspace_name, std::move(rs)}; } future read_schema_partition_for_table(distributed& proxy, schema_ptr schema, const sstring& keyspace_name, const sstring& table_name) { assert(schema_tables_holding_schema_mutations().contains(schema->id())); auto keyspace_key = partition_key::from_singular(*schema, keyspace_name); auto clustering_range = query::clustering_range(clustering_key_prefix::from_clustering_prefix( *schema, exploded_clustering_prefix({utf8_type->decompose(table_name)}))); auto slice = partition_slice_builder(*schema) .with_range(std::move(clustering_range)) .build(); auto cmd = make_lw_shared(schema->id(), schema->version(), std::move(slice), proxy.local().get_max_result_size(slice), query::tombstone_limit::max, query::row_limit(query::max_rows)); co_return co_await query_partition_mutation(proxy.local(), std::move(schema), std::move(cmd), std::move(keyspace_key)); } future read_keyspace_mutation(distributed& proxy, const sstring& keyspace_name) { schema_ptr s = keyspaces(); auto key = partition_key::from_singular(*s, keyspace_name); auto slice = s->full_slice(); auto cmd = make_lw_shared(s->id(), s->version(), std::move(slice), proxy.local().get_max_result_size(slice), query::tombstone_limit::max); co_return co_await query_partition_mutation(proxy.local(), std::move(s), std::move(cmd), std::move(key)); } static thread_local semaphore the_merge_lock {1}; future<> merge_lock() { if (slogger.is_enabled(log_level::trace)) { slogger.trace("merge_lock at {}", current_backtrace()); } return smp::submit_to(0, [] { return the_merge_lock.wait(); }); } future<> merge_unlock() { if (slogger.is_enabled(log_level::trace)) { slogger.trace("merge_unlock at {}", current_backtrace()); } return smp::submit_to(0, [] { the_merge_lock.signal(); }); } future> hold_merge_lock() noexcept { assert(this_shard_id() == 0); if (slogger.is_enabled(log_level::trace)) { slogger.trace("hold_merge_lock at {}", current_backtrace()); } return get_units(the_merge_lock, 1); } static future<> with_merge_lock(noncopyable_function ()> func) { co_await merge_lock(); std::exception_ptr ep; try { co_await func(); } catch (...) { ep = std::current_exception(); } co_await merge_unlock(); if (ep) { std::rethrow_exception(std::move(ep)); } } static future<> update_schema_version_and_announce(sharded& sys_ks, distributed& proxy, schema_features features, std::optional version_from_group0) { auto uuid = version_from_group0 ? *version_from_group0 : co_await calculate_schema_digest(proxy, features); co_await sys_ks.local().update_schema_version(uuid); co_await proxy.local().get_db().invoke_on_all([uuid] (replica::database& db) { db.update_version(uuid); }); slogger.info("Schema version changed to {}", uuid); } static future> get_group0_schema_version(db::system_keyspace& sys_ks) { auto version = co_await sys_ks.get_scylla_local_param_as("group0_schema_version"); if (!version) { co_return std::nullopt; } co_return table_schema_version{*version}; } /** * Merge remote schema in form of mutations with local and mutate ks/cf metadata objects * (which also involves fs operations on add/drop ks/cf) * * @param mutations the schema changes to apply * * @throws ConfigurationException If one of metadata attributes has invalid value * @throws IOException If data was corrupted during transportation or failed to apply fs operations */ future<> merge_schema(sharded& sys_ks, distributed& proxy, gms::feature_service& feat, std::vector mutations, bool reload) { if (this_shard_id() != 0) { // mutations must be applied on the owning shard (0). co_await smp::submit_to(0, coroutine::lambda([&, fmuts = freeze(mutations)] () mutable -> future<> { co_await merge_schema(sys_ks, proxy, feat, co_await unfreeze_gently(fmuts), reload); })); co_return; } co_await with_merge_lock([&] () mutable -> future<> { bool flush_schema = proxy.local().get_db().local().get_config().flush_schema_tables_after_modification(); co_await do_merge_schema(proxy, sys_ks, std::move(mutations), flush_schema, reload); auto version_from_group0 = co_await get_group0_schema_version(sys_ks.local()); co_await update_schema_version_and_announce(sys_ks, proxy, feat.cluster_schema_features(), version_from_group0); }); } future<> recalculate_schema_version(sharded& sys_ks, distributed& proxy, gms::feature_service& feat) { co_await with_merge_lock([&] () -> future<> { auto version_from_group0 = co_await get_group0_schema_version(sys_ks.local()); co_await update_schema_version_and_announce(sys_ks, proxy, feat.cluster_schema_features(), version_from_group0); }); } // Returns names of live table definitions of given keyspace future> static read_table_names_of_keyspace(distributed& proxy, const sstring& keyspace_name, schema_ptr schema_table) { auto pkey = dht::decorate_key(*schema_table, partition_key::from_singular(*schema_table, keyspace_name)); auto&& rs = co_await db::system_keyspace::query(proxy.local().get_db(), schema_table->ks_name(), schema_table->cf_name(), pkey); co_return boost::copy_range>(rs->rows() | boost::adaptors::transformed([schema_table] (const query::result_set_row& row) { const sstring name = schema_table->clustering_key_columns().begin()->name_as_text(); return row.get_nonnull(name); })); } static std::optional table_id_from_mutations(const schema_mutations& sm) { auto table_rs = query::result_set(sm.columnfamilies_mutation()); if (table_rs.empty()) { return std::nullopt; } query::result_set_row table_row = table_rs.row(0); return table_id(table_row.get_nonnull("id")); } using table_name = sstring; using keyspace_name = sstring; enum class table_kind { table, view }; } } template <> struct fmt::formatter { constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); } auto format(db::schema_tables::table_kind k, fmt::format_context& ctx) const { switch (k) { using enum db::schema_tables::table_kind; case table: return fmt::format_to(ctx.out(), "table"); case view: return fmt::format_to(ctx.out(), "view"); } abort(); } }; namespace db { namespace schema_tables { static std::ostream& operator<<(std::ostream& os, table_kind k) { fmt::print(os, "{}", k); return os; } static constexpr std::initializer_list all_table_kinds = { table_kind::table, table_kind::view }; static schema_ptr get_table_holder(table_kind k) { switch (k) { case table_kind::table: return tables(); case table_kind::view: return views(); } abort(); } struct table_selector { bool all_in_keyspace = false; // If true, selects all existing tables in a keyspace plus what's in "tables"; std::unordered_map> tables; table_selector& operator+=(table_selector&& o) { all_in_keyspace |= o.all_in_keyspace; for (auto t : all_table_kinds) { tables[t].merge(std::move(o.tables[t])); } return *this; } void add(table_kind t, sstring name) { tables[t].emplace(std::move(name)); } void add(sstring name) { for (auto t : all_table_kinds) { add(t, name); } } }; static future> read_tables_for_keyspaces(distributed& proxy, const std::set& keyspace_names, table_kind kind, const std::unordered_map& tables_per_keyspace) { std::map result; for (auto&& [keyspace_name, sel] : tables_per_keyspace) { if (!sel.tables.contains(kind)) { continue; } for (auto&& table_name : sel.tables.find(kind)->second) { auto qn = qualified_name(keyspace_name, table_name); auto muts = co_await read_table_mutations(proxy, qn, get_table_holder(kind)); auto id = table_id_from_mutations(muts); if (id) { result.emplace(std::move(*id), std::move(muts)); } } } co_return result; } mutation compact_for_schema_digest(const mutation& m) { // Cassandra is skipping tombstones from digest calculation // to avoid disagreements due to tombstone GC. // See https://issues.apache.org/jira/browse/CASSANDRA-6862. // We achieve similar effect with compact_for_compaction(). mutation m_compacted(m); m_compacted.partition().compact_for_compaction_drop_tombstones_unconditionally(*m.schema(), m.decorated_key()); return m_compacted; } void feed_hash_for_schema_digest(hasher& h, const mutation& m, schema_features features) { auto compacted = compact_for_schema_digest(m); if (!features.contains() || !compacted.partition().empty()) { feed_hash(h, compacted); } } // Applies deletion of the "version" column to system_schema.scylla_tables mutation rows // which weren't committed by group 0. static void maybe_delete_schema_version(mutation& m) { if (m.column_family_id() != scylla_tables()->id()) { return; } const column_definition& origin_col = *m.schema()->get_column_definition(to_bytes("committed_by_group0")); const column_definition& version_col = *m.schema()->get_column_definition(to_bytes("version")); for (auto&& row : m.partition().clustered_rows()) { auto&& cells = row.row().cells(); if (auto&& origin_cell = cells.find_cell(origin_col.id); origin_cell) { auto&& ac = origin_cell->as_atomic_cell(origin_col); if (ac.is_live()) { auto dv = origin_col.type->deserialize(managed_bytes_view(ac.value())); auto committed_by_group0 = value_cast(dv); if (committed_by_group0) { // Don't delete "version" for this entry. continue; } } } auto&& cell = cells.find_cell(version_col.id); api::timestamp_type t = api::new_timestamp(); if (cell) { t = std::max(t, cell->as_atomic_cell(version_col).timestamp()); } cells.apply(version_col, atomic_cell::make_dead(t, gc_clock::now())); } } /// Helper function which fills a given mutation with column information /// provided the corresponding column_definition object. static void fill_column_info(const schema& table, const clustering_key& ckey, const column_definition& column, api::timestamp_type timestamp, ttl_opt ttl, mutation& m) { auto order = "NONE"; if (column.is_clustering_key()) { order = "ASC"; } auto type = column.type; if (type->is_reversed()) { type = type->underlying_type(); if (column.is_clustering_key()) { order = "DESC"; } } int32_t pos = -1; if (column.is_primary_key()) { pos = table.position(column); } m.set_clustered_cell(ckey, "column_name_bytes", data_value(column.name()), timestamp, ttl); m.set_clustered_cell(ckey, "kind", serialize_kind(column.kind), timestamp, ttl); m.set_clustered_cell(ckey, "position", pos, timestamp, ttl); m.set_clustered_cell(ckey, "clustering_order", sstring(order), timestamp, ttl); m.set_clustered_cell(ckey, "type", type->as_cql3_type().to_string(), timestamp, ttl); } future<> store_column_mapping(distributed& proxy, schema_ptr s, bool with_ttl) { // Skip "system*" tables -- only user-related tables are relevant if (static_cast(s->ks_name()).starts_with(db::system_keyspace::NAME)) { co_return; } schema_ptr history_tbl = scylla_table_schema_history(); // Insert the new column mapping for a given schema version (without TTL) std::vector muts; partition_key pk = partition_key::from_exploded(*history_tbl, {uuid_type->decompose(s->id().uuid())}); ttl_opt ttl; if (with_ttl) { ttl = gc_clock::duration(DEFAULT_GC_GRACE_SECONDS); } // Use one timestamp for all mutations for the ease of debugging const auto ts = api::new_timestamp(); for (const auto& cdef : boost::range::join(s->static_columns(), s->regular_columns())) { mutation m(history_tbl, pk); auto ckey = clustering_key::from_exploded(*history_tbl, {uuid_type->decompose(s->version().uuid()), utf8_type->decompose(cdef.name_as_text())}); fill_column_info(*s, ckey, cdef, ts, ttl, m); muts.emplace_back(std::move(m)); } co_await proxy.local().mutate_locally(std::move(muts), tracing::trace_state_ptr()); } // Limit concurrency of user tables to prevent stalls. // See https://github.com/scylladb/scylladb/issues/11574 // Note: we aim at providing enough concurrency to utilize // the cpu while operations are blocked on disk I/O // and or filesystem calls, e.g. fsync. constexpr size_t max_concurrent = 8; // Extracts the names of tables affected by a schema mutation. // The mutation must target one of the tables in schema_tables_holding_schema_mutations(). static table_selector get_affected_tables(const sstring& keyspace_name, const mutation& m) { const schema& s = *m.schema(); auto get_table_name = [&] (const clustering_key& ck) { // The first component of the clustering key in each table listed in // schema_tables_holding_schema_mutations contains the table name. return value_cast(utf8_type->deserialize(ck.get_component(s, 0))); }; table_selector result; if (m.partition().partition_tombstone()) { slogger.trace("Mutation of {}.{} for keyspace {} contains a partition tombstone", m.schema()->ks_name(), m.schema()->cf_name(), keyspace_name); result.all_in_keyspace = true; } for (auto&& e : m.partition().row_tombstones()) { const range_tombstone& rt = e.tombstone(); if (rt.start.size(s) == 0 || rt.end.size(s) == 0) { slogger.trace("Mutation of {}.{} for keyspace {} contains a multi-table range tombstone", m.schema()->ks_name(), m.schema()->cf_name(), keyspace_name); result.all_in_keyspace = true; break; } auto table_name = get_table_name(rt.start); if (table_name != get_table_name(rt.end)) { slogger.trace("Mutation of {}.{} for keyspace {} contains a multi-table range tombstone", m.schema()->ks_name(), m.schema()->cf_name(), keyspace_name); result.all_in_keyspace = true; break; } result.add(table_name); } for (auto&& row : m.partition().clustered_rows()) { result.add(get_table_name(row.key())); } slogger.trace("Mutation of {}.{} for keyspace {} affects tables: {}, all_in_keyspace: {}", m.schema()->ks_name(), m.schema()->cf_name(), keyspace_name, result.tables, result.all_in_keyspace); return result; } static future<> do_merge_schema(distributed& proxy, sharded& sys_ks, std::vector mutations, bool do_flush, bool reload) { slogger.trace("do_merge_schema: {}", mutations); schema_ptr s = keyspaces(); // compare before/after schemas of the affected keyspaces only std::set keyspaces; std::unordered_map affected_tables; bool has_tablet_mutations = false; for (auto&& mutation : mutations) { sstring keyspace_name = value_cast(utf8_type->deserialize(mutation.key().get_component(*s, 0))); if (schema_tables_holding_schema_mutations().contains(mutation.schema()->id())) { affected_tables[keyspace_name] += get_affected_tables(keyspace_name, mutation); } if (mutation.schema()->id() == system_keyspace::tablets()->id()) { has_tablet_mutations = true; } keyspaces.emplace(std::move(keyspace_name)); // We must force recalculation of schema version after the merge, since the resulting // schema may be a mix of the old and new schemas, with the exception of entries // that originate from group 0. maybe_delete_schema_version(mutation); } if (reload) { for (auto&& ks : proxy.local().get_db().local().get_non_system_keyspaces()) { keyspaces.emplace(ks); table_selector sel; sel.all_in_keyspace = true; affected_tables[ks] = sel; } } // Resolve sel.all_in_keyspace == true to the actual list of tables and views. for (auto&& [keyspace_name, sel] : affected_tables) { if (sel.all_in_keyspace) { // FIXME: Obtain from the database object slogger.trace("Reading table list for keyspace {}", keyspace_name); for (auto k : all_table_kinds) { for (auto&& n : co_await read_table_names_of_keyspace(proxy, keyspace_name, get_table_holder(k))) { sel.add(k, std::move(n)); } } } slogger.debug("Affected tables for keyspace {}: {}", keyspace_name, sel.tables); } // current state of the schema auto&& old_keyspaces = co_await read_schema_for_keyspaces(proxy, KEYSPACES, keyspaces); auto&& old_column_families = co_await read_tables_for_keyspaces(proxy, keyspaces, table_kind::table, affected_tables); auto&& old_types = co_await read_schema_for_keyspaces(proxy, TYPES, keyspaces); auto&& old_views = co_await read_tables_for_keyspaces(proxy, keyspaces, table_kind::view, affected_tables); auto old_functions = co_await read_schema_for_keyspaces(proxy, FUNCTIONS, keyspaces); auto old_aggregates = co_await read_schema_for_keyspaces(proxy, AGGREGATES, keyspaces); auto old_scylla_aggregates = co_await read_schema_for_keyspaces(proxy, SCYLLA_AGGREGATES, keyspaces); co_await proxy.local().get_db().local().apply(freeze(mutations), db::no_timeout); // with new data applied auto&& new_keyspaces = co_await read_schema_for_keyspaces(proxy, KEYSPACES, keyspaces); auto&& new_column_families = co_await read_tables_for_keyspaces(proxy, keyspaces, table_kind::table, affected_tables); auto&& new_types = co_await read_schema_for_keyspaces(proxy, TYPES, keyspaces); auto&& new_views = co_await read_tables_for_keyspaces(proxy, keyspaces, table_kind::view, affected_tables); auto new_functions = co_await read_schema_for_keyspaces(proxy, FUNCTIONS, keyspaces); auto new_aggregates = co_await read_schema_for_keyspaces(proxy, AGGREGATES, keyspaces); auto new_scylla_aggregates = co_await read_schema_for_keyspaces(proxy, SCYLLA_AGGREGATES, keyspaces); std::set keyspaces_to_drop = co_await merge_keyspaces(proxy, std::move(old_keyspaces), std::move(new_keyspaces)); auto types_to_drop = co_await merge_types(proxy, std::move(old_types), std::move(new_types)); co_await merge_tables_and_views(proxy, sys_ks, std::move(old_column_families), std::move(new_column_families), std::move(old_views), std::move(new_views), reload, has_tablet_mutations); co_await merge_functions(proxy, std::move(old_functions), std::move(new_functions)); co_await merge_aggregates(proxy, std::move(old_aggregates), std::move(new_aggregates), std::move(old_scylla_aggregates), std::move(new_scylla_aggregates)); co_await types_to_drop.drop(); auto& sharded_db = proxy.local().get_db(); // it is safe to drop a keyspace only when all nested ColumnFamilies where deleted for (auto keyspace_to_drop : keyspaces_to_drop) { co_await replica::database::drop_keyspace_on_all_shards(sharded_db, keyspace_to_drop); } } future> extract_scylla_specific_keyspace_info(distributed& proxy, const schema_result_value_type& partition) { lw_shared_ptr scylla_specific_rs; if (proxy.local().local_db().has_schema(NAME, SCYLLA_KEYSPACES)) { auto&& rs = partition.second; if (rs->empty()) { co_await coroutine::return_exception(std::runtime_error("query result has no rows")); } auto&& row = rs->row(0); auto keyspace_name = row.get_nonnull("keyspace_name"); auto keyspace_key = dht::decorate_key(*scylla_keyspaces(), partition_key::from_singular(*scylla_keyspaces(), keyspace_name)); scylla_specific_rs = co_await db::system_keyspace::query(proxy.local().get_db(), NAME, SCYLLA_KEYSPACES, keyspace_key); } co_return scylla_specific_rs; } future> merge_keyspaces(distributed& proxy, schema_result&& before, schema_result&& after) { std::vector created; std::vector altered; std::set dropped; /* * - we don't care about entriesOnlyOnLeft() or entriesInCommon(), because only the changes are of interest to us * - of all entriesOnlyOnRight(), we only care about ones that have live columns; it's possible to have a ColumnFamily * there that only has the top-level deletion, if: * a) a pushed DROP KEYSPACE change for a keyspace hadn't ever made it to this node in the first place * b) a pulled dropped keyspace that got dropped before it could find a way to this node * - of entriesDiffering(), we don't care about the scenario where both pre and post-values have zero live columns: * that means that a keyspace had been recreated and dropped, and the recreated keyspace had never found a way * to this node */ auto diff = difference(before, after, indirect_equal_to>()); for (auto&& key : diff.entries_only_on_left) { slogger.info("Dropping keyspace {}", key); dropped.emplace(key); } for (auto&& key : diff.entries_only_on_right) { auto&& value = after[key]; slogger.info("Creating keyspace {}", key); created.emplace_back(schema_result_value_type{key, std::move(value)}); } for (auto&& key : diff.entries_differing) { slogger.info("Altering keyspace {}", key); altered.emplace_back(key); } auto& sharded_db = proxy.local().get_db(); for (auto&& val : created) { auto scylla_specific_rs = co_await extract_scylla_specific_keyspace_info(proxy, val); auto ksm = create_keyspace_from_schema_partition(val, std::move(scylla_specific_rs)); co_await replica::database::create_keyspace_on_all_shards(sharded_db, proxy, *ksm); } for (auto& name : altered) { auto v = co_await db::schema_tables::read_schema_partition_for_keyspace(proxy, db::schema_tables::KEYSPACES, name); auto scylla_specific_rs = co_await db::schema_tables::extract_scylla_specific_keyspace_info(proxy, v); auto tmp_ksm = db::schema_tables::create_keyspace_from_schema_partition(v, scylla_specific_rs); co_await replica::database::update_keyspace_on_all_shards(sharded_db, *tmp_ksm); } co_return dropped; } struct schema_diff { struct dropped_schema { global_schema_ptr schema; }; struct altered_schema { global_schema_ptr old_schema; global_schema_ptr new_schema; }; std::vector created; std::vector altered; std::vector dropped; size_t size() const { return created.size() + altered.size() + dropped.size(); } }; // Which side of the diff this schema is on? // Helps ensuring that when creating schema for altered views, we match "before" // version of view to "before" version of base table and "after" to "after" // respectively. enum class schema_diff_side { left, // old, before right, // new, after }; static schema_diff diff_table_or_view(distributed& proxy, std::map&& before, std::map&& after, bool reload, noncopyable_function create_schema) { schema_diff d; auto diff = difference(before, after); for (auto&& key : diff.entries_only_on_left) { auto&& s = proxy.local().get_db().local().find_schema(key); slogger.info("Dropping {}.{} id={} version={}", s->ks_name(), s->cf_name(), s->id(), s->version()); d.dropped.emplace_back(schema_diff::dropped_schema{s}); } for (auto&& key : diff.entries_only_on_right) { auto s = create_schema(std::move(after.at(key)), schema_diff_side::right); slogger.info("Creating {}.{} id={} version={}", s->ks_name(), s->cf_name(), s->id(), s->version()); d.created.emplace_back(s); } for (auto&& key : diff.entries_differing) { auto s_before = create_schema(std::move(before.at(key)), schema_diff_side::left); auto s = create_schema(std::move(after.at(key)), schema_diff_side::right); slogger.info("Altering {}.{} id={} version={}", s->ks_name(), s->cf_name(), s->id(), s->version()); d.altered.emplace_back(schema_diff::altered_schema{s_before, s}); } if (reload) { for (auto&& key: diff.entries_in_common) { auto s = create_schema(std::move(after.at(key)), schema_diff_side::right); slogger.info("Reloading {}.{} id={} version={}", s->ks_name(), s->cf_name(), s->id(), s->version()); d.altered.emplace_back(schema_diff::altered_schema {s, s}); } } return d; } // see the comments for merge_keyspaces() // Atomically publishes schema changes. In particular, this function ensures // that when a base schema and a subset of its views are modified together (i.e., // upon an alter table or alter type statement), then they are published together // as well, without any deferring in-between. static future<> merge_tables_and_views(distributed& proxy, sharded& sys_ks, std::map&& tables_before, std::map&& tables_after, std::map&& views_before, std::map&& views_after, bool reload, bool has_tablet_mutations) { auto tables_diff = diff_table_or_view(proxy, std::move(tables_before), std::move(tables_after), reload, [&] (schema_mutations sm, schema_diff_side) { return create_table_from_mutations(proxy, std::move(sm)); }); auto views_diff = diff_table_or_view(proxy, std::move(views_before), std::move(views_after), reload, [&] (schema_mutations sm, schema_diff_side side) { // The view schema mutation should be created with reference to the base table schema because we definitely know it by now. // If we don't do it we are leaving a window where write commands to this schema are illegal. // There are 3 possibilities: // 1. The table was altered - in this case we want the view to correspond to this new table schema. // 2. The table was just created - the table is guaranteed to be published with the view in that case. // 3. The view itself was altered - in that case we already know the base table so we can take it from // the database object. view_ptr vp = create_view_from_mutations(proxy, std::move(sm)); schema_ptr base_schema; for (auto&& altered : tables_diff.altered) { // Chose the appropriate version of the base table schema: old -> old, new -> new. schema_ptr s = side == schema_diff_side::left ? altered.old_schema : altered.new_schema; if (s->ks_name() == vp->ks_name() && s->cf_name() == vp->view_info()->base_name() ) { base_schema = s; break; } } if (!base_schema) { for (auto&& s : tables_diff.created) { if (s.get()->ks_name() == vp->ks_name() && s.get()->cf_name() == vp->view_info()->base_name() ) { base_schema = s; break; } } } if (!base_schema) { base_schema = proxy.local().local_db().find_schema(vp->ks_name(), vp->view_info()->base_name()); } // Now when we have a referenced base - sanity check that we're not registering an old view // (this could happen when we skip multiple major versions in upgrade, which is unsupported.) check_no_legacy_secondary_index_mv_schema(proxy.local().get_db().local(), vp, base_schema); vp->view_info()->set_base_info(vp->view_info()->make_base_dependent_view_info(*base_schema)); return vp; }); // First drop views and *only then* the tables, if interleaved it can lead // to a mv not finding its schema when snapshoting since the main table // was already dropped (see https://github.com/scylladb/scylla/issues/5614) auto& db = proxy.local().get_db(); co_await max_concurrent_for_each(views_diff.dropped, max_concurrent, [&db, &sys_ks] (schema_diff::dropped_schema& dt) { auto& s = *dt.schema.get(); return replica::database::drop_table_on_all_shards(db, sys_ks, s.ks_name(), s.cf_name()); }); co_await max_concurrent_for_each(tables_diff.dropped, max_concurrent, [&db, &sys_ks] (schema_diff::dropped_schema& dt) -> future<> { auto& s = *dt.schema.get(); return replica::database::drop_table_on_all_shards(db, sys_ks, s.ks_name(), s.cf_name()); }); if (has_tablet_mutations) { slogger.info("Tablet metadata changed"); // We must do it after tables are dropped so that table snapshot doesn't experience missing tablet map, // and so that compaction groups are not destroyed altogether. // We must also do it before tables are created so that new tables see the tablet map. co_await db.invoke_on_all([&] (replica::database& db) -> future<> { co_await db.get_notifier().update_tablet_metadata(); }); } co_await db.invoke_on_all([&] (replica::database& db) -> future<> { // In order to avoid possible races we first create the tables and only then the views. // That way if a view seeks information about its base table it's guaranteed to find it. co_await max_concurrent_for_each(tables_diff.created, max_concurrent, [&] (global_schema_ptr& gs) -> future<> { co_await db.add_column_family_and_make_directory(gs, replica::database::is_new_cf::yes); }); co_await max_concurrent_for_each(views_diff.created, max_concurrent, [&] (global_schema_ptr& gs) -> future<> { co_await db.add_column_family_and_make_directory(gs, replica::database::is_new_cf::yes); }); }); co_await db.invoke_on_all([&](replica::database& db) -> future<> { std::vector columns_changed; columns_changed.reserve(tables_diff.altered.size() + views_diff.altered.size()); for (auto&& altered : boost::range::join(tables_diff.altered, views_diff.altered)) { columns_changed.push_back(db.update_column_family(altered.new_schema)); co_await coroutine::maybe_yield(); } auto it = columns_changed.begin(); auto notify = [&] (auto& r, auto&& f) -> future<> { co_await max_concurrent_for_each(r, max_concurrent, std::move(f)); }; // View drops are notified first, because a table can only be dropped if its views are already deleted co_await notify(views_diff.dropped, [&] (auto&& dt) { return db.get_notifier().drop_view(view_ptr(dt.schema)); }); co_await notify(tables_diff.dropped, [&] (auto&& dt) { return db.get_notifier().drop_column_family(dt.schema); }); // Table creations are notified first, in case a view is created right after the table co_await notify(tables_diff.created, [&] (auto&& gs) { return db.get_notifier().create_column_family(gs); }); co_await notify(views_diff.created, [&] (auto&& gs) { return db.get_notifier().create_view(view_ptr(gs)); }); // Table altering is notified first, in case new base columns appear co_await notify(tables_diff.altered, [&] (auto&& altered) { return db.get_notifier().update_column_family(altered.new_schema, *it++); }); co_await notify(views_diff.altered, [&] (auto&& altered) { return db.get_notifier().update_view(view_ptr(altered.new_schema), *it++); }); }); // Insert column_mapping into history table for altered and created tables. // // Entries for new tables are inserted without TTL, which means that the most // recent schema version should always be available. // // For altered tables we both insert a new column mapping without TTL and // overwrite the previous version entries with TTL to expire them eventually. // // Drop column mapping entries for dropped tables since these will not be TTLed automatically // and will stay there forever if we don't clean them up manually co_await max_concurrent_for_each(tables_diff.created, max_concurrent, [&proxy] (global_schema_ptr& gs) -> future<> { co_await store_column_mapping(proxy, gs.get(), false); }); co_await max_concurrent_for_each(tables_diff.altered, max_concurrent, [&proxy] (schema_diff::altered_schema& altered) -> future<> { co_await when_all_succeed( store_column_mapping(proxy, altered.old_schema.get(), true), store_column_mapping(proxy, altered.new_schema.get(), false)); }); co_await max_concurrent_for_each(tables_diff.dropped, max_concurrent, [&sys_ks] (schema_diff::dropped_schema& dropped) -> future<> { schema_ptr s = dropped.schema.get(); co_await drop_column_mapping(sys_ks.local(), s->id(), s->version()); }); } static std::vector collect_rows(const std::set& keys, const schema_result& result) { std::vector ret; for (const auto& key : keys) { for (const auto& row : result.find(key)->second->rows()) { ret.push_back(&row); } } return ret; } static std::vector get_primary_key_definition(const schema_ptr& schema) { std::vector primary_key; for (const auto& column : schema->partition_key_columns()) { primary_key.push_back(column); } for (const auto& column : schema->clustering_key_columns()) { primary_key.push_back(column); } return primary_key; } static std::vector get_primary_key(const std::vector& primary_key, const query::result_set_row* row) { std::vector key; for (const auto& column : primary_key) { const data_value *val = row->get_data_value(column.name_as_text()); key.push_back(val->serialize_nonnull()); } return key; } // Build a map from primary keys to rows. static std::map, const query::result_set_row*> build_row_map(const query::result_set& result) { const std::vector& rows = result.rows(); auto primary_key = get_primary_key_definition(result.schema()); std::map, const query::result_set_row*> ret; for (const auto& row: rows) { auto key = get_primary_key(primary_key, &row); ret.insert(std::pair(std::move(key), &row)); } return ret; } struct row_diff { std::vector altered; std::vector created; std::vector dropped; }; // Compute which rows have been created, dropped or altered. // A row is identified by its primary key. // In the output, all entries of a given keyspace are together. static row_diff diff_rows(const schema_result& before, const schema_result& after) { auto diff = difference(before, after, indirect_equal_to>()); // For new or empty keyspaces, just record each row. auto dropped = collect_rows(diff.entries_only_on_left, before); // Keyspaces now without rows auto created = collect_rows(diff.entries_only_on_right, after); // New keyspaces with rows std::vector altered; for (const auto& key : diff.entries_differing) { // For each keyspace that changed, compute the difference of the corresponding result_set to find which rows // have changed. auto before_rows = build_row_map(*before.find(key)->second); auto after_rows = build_row_map(*after.find(key)->second); auto diff_row = difference(before_rows, after_rows, indirect_equal_to()); for (const auto& key : diff_row.entries_only_on_left) { dropped.push_back(before_rows.find(key)->second); } for (const auto& key : diff_row.entries_only_on_right) { created.push_back(after_rows.find(key)->second); } for (const auto& key : diff_row.entries_differing) { altered.push_back(after_rows.find(key)->second); } } return {std::move(altered), std::move(created), std::move(dropped)}; } // User-defined aggregate stores its information in two tables: aggregates and scylla_aggregates // The difference has to be joined to properly create an UDA. // // FIXME: Since UDA cannot be altered now, set of differing rows should be empty and those rows are // ignored in calculating the diff. struct aggregate_diff { std::vector> created; std::vector> dropped; }; static aggregate_diff diff_aggregates_rows(const schema_result& aggr_before, const schema_result& aggr_after, const schema_result& scylla_aggr_before, const schema_result& scylla_aggr_after) { using map = std::map, const query::result_set_row*>; auto aggr_diff = difference(aggr_before, aggr_after, indirect_equal_to>()); std::vector> created; std::vector> dropped; // Primary key for `aggregates` and `scylla_aggregates` tables auto primary_key = get_primary_key_definition(aggregates()); // DROPPED for (const auto& key : aggr_diff.entries_only_on_left) { auto scylla_entry = scylla_aggr_before.find(key); auto scylla_aggr_rows = (scylla_entry != scylla_aggr_before.end()) ? build_row_map(*scylla_entry->second) : map(); for (const auto& row : aggr_before.find(key)->second->rows()) { auto pk = get_primary_key(primary_key, &row); auto entry = scylla_aggr_rows.find(pk); dropped.push_back({&row, (entry != scylla_aggr_rows.end()) ? entry->second : nullptr}); } } // CREATED for (const auto& key : aggr_diff.entries_only_on_right) { auto scylla_entry = scylla_aggr_after.find(key); auto scylla_aggr_rows = (scylla_entry != scylla_aggr_after.end()) ? build_row_map(*scylla_entry->second) : map(); for (const auto& row : aggr_after.find(key)->second->rows()) { auto pk = get_primary_key(primary_key, &row); auto entry = scylla_aggr_rows.find(pk); created.push_back({&row, (entry != scylla_aggr_rows.end()) ? entry->second : nullptr}); } } for (const auto& key : aggr_diff.entries_differing) { auto aggr_before_rows = build_row_map(*aggr_before.find(key)->second); auto aggr_after_rows = build_row_map(*aggr_after.find(key)->second); auto diff = difference(aggr_before_rows, aggr_after_rows, indirect_equal_to()); auto scylla_entry_before = scylla_aggr_before.find(key); auto scylla_aggr_rows_before = (scylla_entry_before != scylla_aggr_before.end()) ? build_row_map(*scylla_entry_before->second) : map(); auto scylla_entry_after = scylla_aggr_after.find(key); auto scylla_aggr_rows_after = (scylla_entry_after != scylla_aggr_after.end()) ? build_row_map(*scylla_entry_after->second) : map(); for (const auto& k : diff.entries_only_on_left) { auto entry = scylla_aggr_rows_before.find(k); dropped.push_back({ aggr_before_rows.find(k)->second, (entry != scylla_aggr_rows_before.end()) ? entry->second : nullptr }); } for (const auto& k : diff.entries_only_on_right) { auto entry = scylla_aggr_rows_after.find(k); created.push_back({ aggr_after_rows.find(k)->second, (entry != scylla_aggr_rows_after.end()) ? entry->second : nullptr }); } } return {std::move(created), std::move(dropped)}; } template static std::vector get_list(const query::result_set_row& row, const sstring& name); // Create types for a given keyspace. This takes care of topologically sorting user defined types. template static future> create_types(keyspace_metadata& ks, T&& range) { cql_type_parser::raw_builder builder(ks); std::unordered_set names; for (const query::result_set_row& row : range) { auto name = row.get_nonnull("type_name"); names.insert(to_bytes(name)); builder.add(std::move(name), get_list(row, "field_names"), get_list(row, "field_types")); } // Add user types that use any of the above types. From the // database point of view they haven't changed since the content // of system.types is the same for them. The runtime objects in // the other hand now point to out of date types, so we need to // recreate them. for (const auto& p : ks.user_types().get_all_types()) { const user_type& t = p.second; if (names.contains(t->_name)) { continue; } for (const auto& name : names) { if (t->references_user_type(t->_keyspace, name)) { std::vector field_types; for (const data_type& f : t->field_types()) { field_types.push_back(f->as_cql3_type().to_string()); } builder.add(t->get_name_as_string(), t->string_field_names(), std::move(field_types)); } } } co_return co_await builder.build(); } // Given a set of rows that is sorted by keyspace, create types for each keyspace. // The topological sort in each keyspace is necessary when creating types, since we can only create a type when the // types it reference have already been created. static future> create_types(replica::database& db, const std::vector& rows) { std::vector ret; for (auto i = rows.begin(), e = rows.end(); i != e;) { const auto &row = *i; auto keyspace = row->get_nonnull("keyspace_name"); auto next = std::find_if(i, e, [&keyspace](const query::result_set_row* r) { return r->get_nonnull("keyspace_name") != keyspace; }); auto ks = db.find_keyspace(keyspace).metadata(); auto v = co_await create_types(*ks, boost::make_iterator_range(i, next) | boost::adaptors::indirected); ret.insert(ret.end(), std::make_move_iterator(v.begin()), std::make_move_iterator(v.end())); i = next; } co_return ret; } // see the comments for merge_keyspaces() static future merge_types(distributed& proxy, schema_result before, schema_result after) { auto diff = diff_rows(before, after); // Create and update user types before any tables/views are created that potentially // use those types. Similarly, defer dropping until after tables/views that may use // some of these user types are dropped. co_await proxy.local().get_db().invoke_on_all([&] (replica::database& db) -> future<> { auto created_types = co_await create_types(db, diff.created); for (auto&& user_type : created_types) { db.find_keyspace(user_type->_keyspace).add_user_type(user_type); co_await db.get_notifier().create_user_type(user_type); } auto altered_types = co_await create_types(db, diff.altered); for (auto&& user_type : altered_types) { db.find_keyspace(user_type->_keyspace).add_user_type(user_type); co_await db.get_notifier().update_user_type(user_type); } }); co_return user_types_to_drop{[&proxy, before = std::move(before), rows = std::move(diff.dropped)] () mutable -> future<> { co_await proxy.local().get_db().invoke_on_all([&] (replica::database& db) -> future<> { auto dropped = co_await create_types(db, rows); for (auto& user_type : dropped) { db.find_keyspace(user_type->_keyspace).remove_user_type(user_type); co_await db.get_notifier().drop_user_type(user_type); } }); }}; } static std::vector read_arg_types(replica::database& db, const query::result_set_row& row, const sstring& keyspace) { std::vector arg_types; for (const auto& arg : get_list(row, "argument_types")) { arg_types.push_back(db::cql_type_parser::parse(keyspace, arg, db.user_types())); } return arg_types; } static std::vector read_arg_values(const query::result_set_row& row) { auto args = get_list(row, "argument_types"); return std::vector(args.begin(), args.end()); } #if 0 // see the comments for mergeKeyspaces() private static void mergeAggregates(Map before, Map after) { List created = new ArrayList<>(); List altered = new ArrayList<>(); List dropped = new ArrayList<>(); MapDifference diff = Maps.difference(before, after); // New keyspace with functions for (Map.Entry entry : diff.entriesOnlyOnRight().entrySet()) if (entry.getValue().hasColumns()) created.addAll(createAggregatesFromAggregatesPartition(new Row(entry.getKey(), entry.getValue())).values()); for (Map.Entry> entry : diff.entriesDiffering().entrySet()) { ColumnFamily pre = entry.getValue().leftValue(); ColumnFamily post = entry.getValue().rightValue(); if (pre.hasColumns() && post.hasColumns()) { MapDifference delta = Maps.difference(createAggregatesFromAggregatesPartition(new Row(entry.getKey(), pre)), createAggregatesFromAggregatesPartition(new Row(entry.getKey(), post))); dropped.addAll(delta.entriesOnlyOnLeft().values()); created.addAll(delta.entriesOnlyOnRight().values()); Iterables.addAll(altered, Iterables.transform(delta.entriesDiffering().values(), new Function, UDAggregate>() { public UDAggregate apply(MapDifference.ValueDifference pair) { return pair.rightValue(); } })); } else if (pre.hasColumns()) { dropped.addAll(createAggregatesFromAggregatesPartition(new Row(entry.getKey(), pre)).values()); } else if (post.hasColumns()) { created.addAll(createAggregatesFromAggregatesPartition(new Row(entry.getKey(), post)).values()); } } for (UDAggregate udf : created) Schema.instance.addAggregate(udf); for (UDAggregate udf : altered) Schema.instance.updateAggregate(udf); for (UDAggregate udf : dropped) Schema.instance.dropAggregate(udf); } #endif static seastar::future> create_func(replica::database& db, const query::result_set_row& row) { cql3::functions::function_name name{ row.get_nonnull("keyspace_name"), row.get_nonnull("function_name")}; auto arg_types = read_arg_types(db, row, name.keyspace); data_type return_type = db::cql_type_parser::parse(name.keyspace, row.get_nonnull("return_type"), db.user_types()); // FIXME: We already computed the bitcode in // create_function_statement, but it is not clear how to get it // here. In this point in the code we only get what was saved in // system_schema.functions, and we don't want to store the bitcode // If this was not the replica that the client connected to we do // have to produce bitcode in at least one shard. Right now this // gets run in each shard. auto arg_names = get_list(row, "argument_names"); auto body = row.get_nonnull("body"); auto language = row.get_nonnull("language"); if (language == "lua") { lua::runtime_config cfg = lua::make_runtime_config(db.get_config()); cql3::functions::user_function::context ctx = cql3::functions::user_function::lua_context { .bitcode = lua::compile(cfg, arg_names, body), .cfg = cfg, }; co_return ::make_shared(std::move(name), std::move(arg_types), std::move(arg_names), std::move(body), language, std::move(return_type), row.get_nonnull("called_on_null_input"), std::move(ctx)); } else if (language == "wasm") { wasm::context ctx(db.wasm(), name.name, db.get_config().wasm_udf_yield_fuel(), db.get_config().wasm_udf_total_fuel()); co_await db.wasm().precompile(ctx, arg_names, body); co_return ::make_shared(std::move(name), std::move(arg_types), std::move(arg_names), std::move(body), language, std::move(return_type), row.get_nonnull("called_on_null_input"), std::move(ctx)); } else { throw std::runtime_error(format("Unsupported language for UDF: {}", language)); } } static shared_ptr create_aggregate(replica::database& db, const query::result_set_row& row, const query::result_set_row* scylla_row) { cql3::functions::function_name name{ row.get_nonnull("keyspace_name"), row.get_nonnull("aggregate_name")}; auto arg_types = read_arg_types(db, row, name.keyspace); data_type state_type = db::cql_type_parser::parse(name.keyspace, row.get_nonnull("state_type"), db.user_types()); sstring sfunc = row.get_nonnull("state_func"); auto ffunc = row.get("final_func"); auto initcond_str = row.get("initcond"); std::vector acc_types{state_type}; acc_types.insert(acc_types.end(), arg_types.begin(), arg_types.end()); auto state_func = dynamic_pointer_cast( cql3::functions::functions::find(cql3::functions::function_name{name.keyspace, sfunc}, acc_types)); if (!state_func) { throw std::runtime_error(format("State function {} needed by aggregate {} not found", sfunc, name.name)); } if (state_func->return_type() != state_type) { throw std::runtime_error(format("State function {} needed by aggregate {} doesn't return state", sfunc, name.name)); } ::shared_ptr reduce_func = nullptr; if (scylla_row) { auto rfunc_name = scylla_row->get("reduce_func"); auto rfunc = cql3::functions::functions::find(cql3::functions::function_name{name.keyspace, rfunc_name.value()}, {state_type, state_type}); if (!rfunc) { throw std::runtime_error(format("Reduce function {} needed by aggregate {} not found", rfunc_name.value(), name.name)); } reduce_func = dynamic_pointer_cast(rfunc); if (!reduce_func) { throw std::runtime_error(format("Reduce function {} needed by aggregate {} is not a scalar function", rfunc_name.value(), name.name)); } } ::shared_ptr final_func = nullptr; if (ffunc) { final_func = dynamic_pointer_cast( cql3::functions::functions::find(cql3::functions::function_name{name.keyspace, ffunc.value()}, {state_type})); if (!final_func) { throw std::runtime_error(format("Final function {} needed by aggregate {} not found", ffunc.value(), name.name)); } } bytes_opt initcond = std::nullopt; if (initcond_str) { auto expr = cql3::util::do_with_parser(*initcond_str, std::mem_fn(&cql3_parser::CqlParser::term)); auto dummy_ident = ::make_shared("", true); auto column_spec = make_lw_shared("", "", dummy_ident, state_type); auto raw = cql3::expr::evaluate(prepare_expression(expr, db.as_data_dictionary(), "", nullptr, {column_spec}), cql3::query_options::DEFAULT); initcond = std::move(raw).to_bytes_opt(); } return ::make_shared(name, initcond, std::move(state_func), std::move(reduce_func), std::move(final_func)); } static void drop_cached_func(replica::database& db, const query::result_set_row& row) { auto language = row.get_nonnull("language"); if (language == "wasm") { cql3::functions::function_name name{ row.get_nonnull("keyspace_name"), row.get_nonnull("function_name")}; auto arg_types = read_arg_types(db, row, name.keyspace); db.wasm().remove(name, arg_types); } } static future<> merge_functions(distributed& proxy, schema_result before, schema_result after) { auto diff = diff_rows(before, after); co_await proxy.local().get_db().invoke_on_all(coroutine::lambda([&] (replica::database& db) -> future<> { for (const auto& val : diff.created) { cql3::functions::functions::add_function(co_await create_func(db, *val)); } for (const auto& val : diff.dropped) { cql3::functions::function_name name{ val->get_nonnull("keyspace_name"), val->get_nonnull("function_name")}; auto arg_types = read_arg_types(db, *val, name.keyspace); drop_cached_func(db, *val); cql3::functions::functions::remove_function(name, arg_types); co_await db.get_notifier().drop_function(name, arg_types); } for (const auto& val : diff.altered) { drop_cached_func(db, *val); cql3::functions::functions::replace_function(co_await create_func(db, *val)); } })); } static future<> merge_aggregates(distributed& proxy, schema_result before, schema_result after, schema_result scylla_before, schema_result scylla_after) { auto diff = diff_aggregates_rows(before, after, scylla_before, scylla_after); co_await proxy.local().get_db().invoke_on_all([&] (replica::database& db)-> future<> { for (const auto& val : diff.created) { cql3::functions::functions::add_function(create_aggregate(db, *val.first, val.second)); } for (const auto& val : diff.dropped) { cql3::functions::function_name name{ val.first->get_nonnull("keyspace_name"), val.first->get_nonnull("aggregate_name")}; auto arg_types = read_arg_types(db, *val.first, name.keyspace); cql3::functions::functions::remove_function(name, arg_types); co_await db.get_notifier().drop_aggregate(name, arg_types); } }); } template void set_cell_or_clustered(mutation& m, const clustering_key & ckey, Args && ...args) { m.set_clustered_cell(ckey, std::forward(args)...); } template void set_cell_or_clustered(mutation& m, const exploded_clustering_prefix & ckey, Args && ...args) { m.set_cell(ckey, std::forward(args)...); } template static atomic_cell_or_collection make_map_mutation(const Map& map, const column_definition& column, api::timestamp_type timestamp, noncopyable_function f) { auto column_type = static_pointer_cast(column.type); auto ktyp = column_type->get_keys_type(); auto vtyp = column_type->get_values_type(); if (column_type->is_multi_cell()) { collection_mutation_description mut; for (auto&& entry : map) { auto te = f(entry); mut.cells.emplace_back(ktyp->decompose(data_value(te.first)), atomic_cell::make_live(*vtyp, timestamp, vtyp->decompose(data_value(te.second)), atomic_cell::collection_member::yes)); } return mut.serialize(*column_type); } else { map_type_impl::native_type tmp; tmp.reserve(map.size()); std::transform(map.begin(), map.end(), std::inserter(tmp, tmp.end()), std::move(f)); return atomic_cell::make_live(*column.type, timestamp, column_type->decompose(make_map_value(column_type, std::move(tmp)))); } } template static atomic_cell_or_collection make_map_mutation(const Map& map, const column_definition& column, api::timestamp_type timestamp) { return make_map_mutation(map, column, timestamp, [](auto&& p) { return std::make_pair(data_value(p.first), data_value(p.second)); }); } template static void store_map(mutation& m, const K& ckey, const bytes& name, api::timestamp_type timestamp, const Map& map) { auto s = m.schema(); auto column = s->get_column_definition(name); assert(column); set_cell_or_clustered(m, ckey, *column, make_map_mutation(map, *column, timestamp)); } /* * Keyspace metadata serialization/deserialization. */ std::vector make_create_keyspace_mutations(schema_features features, lw_shared_ptr keyspace, api::timestamp_type timestamp, bool with_tables_and_types_and_functions) { std::vector mutations; schema_ptr s = keyspaces(); auto pkey = partition_key::from_singular(*s, keyspace->name()); mutation m(s, pkey); auto ckey = clustering_key_prefix::make_empty(); m.set_cell(ckey, "durable_writes", keyspace->durable_writes(), timestamp); auto map = keyspace->strategy_options(); map["class"] = keyspace->strategy_name(); store_map(m, ckey, "replication", timestamp, map); if (features.contains()) { schema_ptr scylla_keyspaces_s = scylla_keyspaces(); mutation scylla_m(scylla_keyspaces_s, pkey); // pkey can be reused, it's identical in both tables auto& storage_options = keyspace->get_storage_options(); sstring storage_type(storage_options.type_string()); auto storage_map = storage_options.to_map(); if (!storage_map.empty()) { scylla_m.set_cell(ckey, "storage_type", storage_type, timestamp); store_map(scylla_m, ckey, "storage_options", timestamp, storage_map); } auto initial_tablets = keyspace->initial_tablets(); if (initial_tablets.has_value()) { scylla_m.set_cell(ckey, "initial_tablets", int32_t(*initial_tablets), timestamp); } mutations.emplace_back(std::move(scylla_m)); } mutations.emplace_back(std::move(m)); if (with_tables_and_types_and_functions) { for (const auto& kv : keyspace->user_types().get_all_types()) { add_type_to_schema_mutation(kv.second, timestamp, mutations); } for (auto&& s : keyspace->cf_meta_data() | boost::adaptors::map_values) { add_table_or_view_to_schema_mutation(s, timestamp, true, mutations); } } return mutations; } std::vector make_drop_keyspace_mutations(schema_features features, lw_shared_ptr keyspace, api::timestamp_type timestamp) { std::vector mutations; for (auto&& schema_table : all_tables(schema_features::full())) { auto pkey = partition_key::from_exploded(*schema_table, {utf8_type->decompose(keyspace->name())}); mutation m{schema_table, pkey}; m.partition().apply(tombstone{timestamp, gc_clock::now()}); mutations.emplace_back(std::move(m)); } if (features.contains()) { auto pkey = partition_key::from_exploded(*scylla_keyspaces(), {utf8_type->decompose(keyspace->name())}); mutation km{scylla_keyspaces(), pkey}; km.partition().apply(tombstone{timestamp, gc_clock::now()}); mutations.emplace_back(std::move(km)); } return mutations; } /** * Deserialize only Keyspace attributes without nested tables or types * * @param partition Keyspace attributes in serialized form */ lw_shared_ptr create_keyspace_from_schema_partition(const schema_result_value_type& result, lw_shared_ptr scylla_specific_rs) { auto&& rs = result.second; if (rs->empty()) { throw std::runtime_error("query result has no rows"); } auto&& row = rs->row(0); auto keyspace_name = row.get_nonnull("keyspace_name"); // We get called from multiple shards with result set originating on only one of them. // Cannot use copying accessors for "deep" types like map, because we will hit shared_ptr asserts // (or screw up shared pointers) const auto& replication = row.get_nonnull("replication"); std::map strategy_options; for (auto& p : replication) { strategy_options.emplace(value_cast(p.first), value_cast(p.second)); } auto strategy_name = strategy_options["class"]; strategy_options.erase("class"); bool durable_writes = row.get_nonnull("durable_writes"); data_dictionary::storage_options storage_opts; std::optional initial_tablets; // Scylla-specific row will only be present if SCYLLA_KEYSPACES schema feature is available in the cluster if (scylla_specific_rs) { if (!scylla_specific_rs->empty()) { auto row = scylla_specific_rs->row(0); auto storage_type = row.get("storage_type"); auto options = row.get("storage_options"); if (storage_type && options) { std::map values; for (const auto& entry : *options) { values.emplace(value_cast(entry.first), value_cast(entry.second)); } storage_opts.value = data_dictionary::storage_options::from_map(std::string_view(*storage_type), values); } initial_tablets = row.get("initial_tablets"); } } return keyspace_metadata::new_keyspace(keyspace_name, strategy_name, strategy_options, initial_tablets, durable_writes, storage_opts); } template static std::vector get_list(const query::result_set_row& row, const sstring& name) { std::vector list; const auto& values = row.get_nonnull(name); for (auto&& v : values) { list.emplace_back(value_cast(v)); }; return list; } future> create_types_from_schema_partition( keyspace_metadata& ks, lw_shared_ptr result) { co_return co_await create_types(ks, result->rows()); } seastar::future>> create_functions_from_schema_partition( replica::database& db, lw_shared_ptr result) { std::vector> ret; for (const auto& row : result->rows()) { ret.emplace_back(co_await create_func(db, row)); } co_return ret; } std::vector> create_aggregates_from_schema_partition( replica::database& db, lw_shared_ptr result, lw_shared_ptr scylla_result) { std::unordered_multimap scylla_aggs; if (scylla_result) { for (const auto& scylla_row : scylla_result->rows()) { auto scylla_agg_name = scylla_row.get_nonnull("aggregate_name"); scylla_aggs.emplace(scylla_agg_name, &scylla_row); } } std::vector> ret; for (const auto& row : result->rows()) { auto agg_name = row.get_nonnull("aggregate_name"); auto agg_args = read_arg_types(db, row, row.get_nonnull("keyspace_name")); const query::result_set_row *scylla_row_ptr = nullptr; for (auto [it, end] = scylla_aggs.equal_range(agg_name); it != end; ++it) { auto scylla_agg_args = read_arg_types(db, *it->second, it->second->get_nonnull("keyspace_name")); if (agg_args == scylla_agg_args) { scylla_row_ptr = it->second; break; } } ret.emplace_back(create_aggregate(db, row, scylla_row_ptr)); } return ret; } /* * User type metadata serialization/deserialization */ template static atomic_cell_or_collection make_list_mutation(const std::vector& values, const column_definition& column, api::timestamp_type timestamp, Func&& f) { auto column_type = static_pointer_cast(column.type); auto vtyp = column_type->get_elements_type(); if (column_type->is_multi_cell()) { collection_mutation_description m; m.cells.reserve(values.size()); m.tomb.timestamp = timestamp - 1; m.tomb.deletion_time = gc_clock::now(); for (auto&& value : values) { auto dv = f(value); auto uuid = utils::UUID_gen::get_time_UUID_bytes(); m.cells.emplace_back( bytes(reinterpret_cast(uuid.data()), uuid.size()), atomic_cell::make_live(*vtyp, timestamp, vtyp->decompose(std::move(dv)), atomic_cell::collection_member::yes)); } return m.serialize(*column_type); } else { list_type_impl::native_type tmp; tmp.reserve(values.size()); std::transform(values.begin(), values.end(), std::back_inserter(tmp), f); return atomic_cell::make_live(*column.type, timestamp, column_type->decompose(make_list_value(column_type, std::move(tmp)))); } } void add_type_to_schema_mutation(user_type type, api::timestamp_type timestamp, std::vector& mutations) { schema_ptr s = types(); auto pkey = partition_key::from_singular(*s, type->_keyspace); auto ckey = clustering_key::from_singular(*s, type->get_name_as_string()); mutation m{s, pkey}; auto field_names_column = s->get_column_definition("field_names"); auto field_names = make_list_mutation(type->field_names(), *field_names_column, timestamp, [](auto&& name) { return utf8_type->deserialize(name); }); m.set_clustered_cell(ckey, *field_names_column, std::move(field_names)); auto field_types_column = s->get_column_definition("field_types"); auto field_types = make_list_mutation(type->field_types(), *field_types_column, timestamp, [](auto&& type) { return data_value(type->as_cql3_type().to_string()); }); m.set_clustered_cell(ckey, *field_types_column, std::move(field_types)); mutations.emplace_back(std::move(m)); } std::vector make_create_type_mutations(lw_shared_ptr keyspace, user_type type, api::timestamp_type timestamp) { std::vector mutations; add_type_to_schema_mutation(type, timestamp, mutations); return mutations; } std::vector make_drop_type_mutations(lw_shared_ptr keyspace, user_type type, api::timestamp_type timestamp) { std::vector mutations; schema_ptr s = types(); auto pkey = partition_key::from_singular(*s, type->_keyspace); auto ckey = clustering_key::from_singular(*s, type->get_name_as_string()); mutation m{s, pkey}; m.partition().apply_delete(*s, ckey, tombstone(timestamp, gc_clock::now())); mutations.emplace_back(std::move(m)); return mutations; } /* * UDF metadata serialization/deserialization. */ static std::pair get_mutation(schema_ptr s, const cql3::functions::function& func) { auto name = func.name(); auto pkey = partition_key::from_singular(*s, name.keyspace); list_type_impl::native_type arg_types; for (const auto& arg_type : func.arg_types()) { arg_types.emplace_back(arg_type->as_cql3_type().to_string()); } auto arg_list_type = list_type_impl::get_instance(utf8_type, false); data_value arg_types_val = make_list_value(arg_list_type, std::move(arg_types)); auto ckey = clustering_key::from_exploded( *s, {utf8_type->decompose(name.name), arg_list_type->decompose(arg_types_val)}); mutation m{s, pkey}; return {std::move(m), std::move(ckey)}; } std::vector make_create_function_mutations(shared_ptr func, api::timestamp_type timestamp) { schema_ptr s = functions(); auto p = get_mutation(s, *func); mutation& m = p.first; clustering_key& ckey = p.second; auto argument_names_column = s->get_column_definition("argument_names"); auto argument_names = make_list_mutation(func->arg_names(), *argument_names_column, timestamp, [] (auto&& name) { return name; }); m.set_clustered_cell(ckey, *argument_names_column, std::move(argument_names)); m.set_clustered_cell(ckey, "body", func->body(), timestamp); m.set_clustered_cell(ckey, "language", func->language(), timestamp); m.set_clustered_cell(ckey, "return_type", func->return_type()->as_cql3_type().to_string(), timestamp); m.set_clustered_cell(ckey, "called_on_null_input", func->called_on_null_input(), timestamp); return make_mutation_vector(std::move(m)); } std::vector make_drop_function_mutations(schema_ptr s, const cql3::functions::function& func, api::timestamp_type timestamp) { auto p = get_mutation(s, func); mutation& m = p.first; clustering_key& ckey = p.second; m.partition().apply_delete(*s, ckey, tombstone(timestamp, gc_clock::now())); return make_mutation_vector(std::move(m)); } std::vector make_drop_function_mutations(shared_ptr func, api::timestamp_type timestamp) { return make_drop_function_mutations(functions(), *func, timestamp); } /* * UDA metadata serialization/deserialization */ static std::pair get_mutation(schema_ptr s, const cql3::functions::user_aggregate& aggregate) { auto name = aggregate.name(); auto pkey = partition_key::from_singular(*s, name.keyspace); list_type_impl::native_type arg_types; for (const auto& arg_type : aggregate.arg_types()) { arg_types.emplace_back(arg_type->as_cql3_type().to_string()); } auto arg_list_type = list_type_impl::get_instance(utf8_type, false); data_value arg_types_val = make_list_value(arg_list_type, std::move(arg_types)); auto ckey = clustering_key::from_exploded( *s, {utf8_type->decompose(name.name), arg_list_type->decompose(arg_types_val)}); mutation m{s, pkey}; return {std::move(m), std::move(ckey)}; } std::vector make_create_aggregate_mutations(schema_features features, shared_ptr aggregate, api::timestamp_type timestamp) { schema_ptr s = aggregates(); auto p = get_mutation(s, *aggregate); mutation& m = p.first; clustering_key& ckey = p.second; std::vector muts; data_type state_type = aggregate->sfunc()->arg_types()[0]; if (aggregate->has_finalfunc()) { m.set_clustered_cell(ckey, "final_func", aggregate->finalfunc()->name().name, timestamp); } if (aggregate->initcond()) { m.set_clustered_cell(ckey, "initcond", state_type->deserialize(*aggregate->initcond()).to_parsable_string(), timestamp); } m.set_clustered_cell(ckey, "return_type", aggregate->return_type()->as_cql3_type().to_string(), timestamp); m.set_clustered_cell(ckey, "state_func", aggregate->sfunc()->name().name, timestamp); m.set_clustered_cell(ckey, "state_type", state_type->as_cql3_type().to_string(), timestamp); muts.emplace_back(std::move(m)); if (features.contains() && aggregate->is_reducible()) { schema_ptr sa_schema = scylla_aggregates(); auto sa_p = get_mutation(sa_schema, *aggregate); mutation& sa_mut = sa_p.first; clustering_key& sa_ckey = sa_p.second; sa_mut.set_clustered_cell(sa_ckey, "reduce_func", aggregate->reducefunc()->name().name, timestamp); sa_mut.set_clustered_cell(sa_ckey, "state_type", state_type->as_cql3_type().to_string(), timestamp); muts.emplace_back(std::move(sa_mut)); } return muts; } std::vector make_drop_aggregate_mutations(schema_features features, shared_ptr aggregate, api::timestamp_type timestamp) { auto muts = make_drop_function_mutations(aggregates(), *aggregate, timestamp); if (features.contains() && aggregate->is_reducible()) { auto scylla_muts = make_drop_function_mutations(scylla_aggregates(), *aggregate, timestamp); std::move(scylla_muts.begin(), scylla_muts.end(), std::back_inserter(muts)); } return muts; } /* * Table metadata serialization/deserialization. */ /// Returns mutations which when applied to the database will cause all schema changes /// which create or alter the table with a given name, made with timestamps smaller than t, /// have no effect. Used when overriding schema to shadow concurrent conflicting schema changes. /// Shouldn't be needed if schema changes are serialized with RAFT. static schema_mutations make_table_deleting_mutations(const sstring& ks, const sstring& table, bool is_view, api::timestamp_type t) { tombstone tomb; // Generate neutral mutations if t == api::min_timestamp if (t > api::min_timestamp) { tomb = tombstone(t - 1, gc_clock::now()); } auto tables_m_s = is_view ? views() : tables(); mutation tables_m{tables_m_s, partition_key::from_singular(*tables_m_s, ks)}; { auto ckey = clustering_key::from_singular(*tables_m_s, table); tables_m.partition().apply_delete(*tables_m_s, ckey, tomb); } mutation scylla_tables_m{scylla_tables(), partition_key::from_singular(*scylla_tables(), ks)}; { auto ckey = clustering_key::from_singular(*scylla_tables(), table); scylla_tables_m.partition().apply_delete(*scylla_tables(), ckey, tomb); } auto make_drop_columns = [&] (const schema_ptr& s) { mutation m{s, partition_key::from_singular(*s, ks)}; auto ckey = clustering_key::from_exploded(*s, {utf8_type->decompose(table)}); m.partition().apply_delete(*s, ckey, tomb); return m; }; return schema_mutations(std::move(tables_m), make_drop_columns(columns()), make_drop_columns(view_virtual_columns()), make_drop_columns(computed_columns()), mutation(indexes(), partition_key::from_singular(*indexes(), ks)), make_drop_columns(dropped_columns()), std::move(scylla_tables_m) ); } std::vector make_create_table_mutations(schema_ptr table, api::timestamp_type timestamp) { std::vector mutations; add_table_or_view_to_schema_mutation(table, timestamp, true, mutations); make_table_deleting_mutations(table->ks_name(), table->cf_name(), table->is_view(), timestamp) .copy_to(mutations); return mutations; } static void add_table_params_to_mutations(mutation& m, const clustering_key& ckey, schema_ptr table, api::timestamp_type timestamp) { m.set_clustered_cell(ckey, "bloom_filter_fp_chance", table->bloom_filter_fp_chance(), timestamp); m.set_clustered_cell(ckey, "comment", table->comment(), timestamp); m.set_clustered_cell(ckey, "default_time_to_live", gc_clock::as_int32(table->default_time_to_live()), timestamp); m.set_clustered_cell(ckey, "gc_grace_seconds", gc_clock::as_int32(table->gc_grace_seconds()), timestamp); m.set_clustered_cell(ckey, "max_index_interval", table->max_index_interval(), timestamp); m.set_clustered_cell(ckey, "memtable_flush_period_in_ms", table->memtable_flush_period(), timestamp); m.set_clustered_cell(ckey, "min_index_interval", table->min_index_interval(), timestamp); m.set_clustered_cell(ckey, "speculative_retry", table->speculative_retry().to_sstring(), timestamp); m.set_clustered_cell(ckey, "crc_check_chance", table->crc_check_chance(), timestamp); store_map(m, ckey, "caching", timestamp, table->caching_options().to_map()); { auto map = table->compaction_strategy_options(); map["class"] = sstables::compaction_strategy::name(table->configured_compaction_strategy()); store_map(m, ckey, "compaction", timestamp, map); } store_map(m, ckey, "compression", timestamp, table->get_compressor_params().get_options()); std::map map; if (!table->extensions().empty()) { for (auto& p : table->extensions()) { map.emplace(p.first, p.second->serialize()); } } store_map(m, ckey, "extensions", timestamp, map); } static data_type expand_user_type(data_type); static std::vector expand_user_types(const std::vector& types) { std::vector result; result.reserve(types.size()); std::transform(types.begin(), types.end(), std::back_inserter(result), &expand_user_type); return result; } static data_type expand_user_type(data_type original) { if (original->is_user_type()) { return tuple_type_impl::get_instance( expand_user_types( static_pointer_cast( original)->field_types())); } if (original->is_tuple()) { return tuple_type_impl::get_instance( expand_user_types( static_pointer_cast< const tuple_type_impl>( original)->all_types())); } if (original->is_reversed()) { return reversed_type_impl::get_instance( expand_user_type(original->underlying_type())); } if (original->is_collection()) { auto ct = static_pointer_cast(original); if (ct->is_list()) { return list_type_impl::get_instance( expand_user_type(ct->value_comparator()), ct->is_multi_cell()); } if (ct->is_map()) { return map_type_impl::get_instance( expand_user_type(ct->name_comparator()), expand_user_type(ct->value_comparator()), ct->is_multi_cell()); } if (ct->is_set()) { return set_type_impl::get_instance( expand_user_type(ct->name_comparator()), ct->is_multi_cell()); } } return original; } static void add_dropped_column_to_schema_mutation(schema_ptr table, const sstring& name, const schema::dropped_column& column, api::timestamp_type timestamp, mutation& m) { auto ckey = clustering_key::from_exploded(*dropped_columns(), {utf8_type->decompose(table->cf_name()), utf8_type->decompose(name)}); db_clock::time_point tp(db_clock::duration(column.timestamp)); m.set_clustered_cell(ckey, "dropped_time", tp, timestamp); /* * From origin: * we never store actual UDT names in dropped column types (so that we can safely drop types if nothing refers to * them anymore), so before storing dropped columns in schema we expand UDTs to tuples. See expandUserTypes method. * Because of that, we can safely pass Types.none() to parse() */ m.set_clustered_cell(ckey, "type", expand_user_type(column.type)->as_cql3_type().to_string(), timestamp); } mutation make_scylla_tables_mutation(schema_ptr table, api::timestamp_type timestamp) { schema_ptr s = tables(); auto pkey = partition_key::from_singular(*s, table->ks_name()); auto ckey = clustering_key::from_singular(*s, table->cf_name()); mutation m(scylla_tables(), pkey); m.set_clustered_cell(ckey, "version", table->version().uuid(), timestamp); // Since 4.0, we stopped using cdc column in scylla tables. Extensions are // used instead. Since we stopped reading this column in commit 861c7b5, we // can now keep it always empty. auto& cdc_cdef = *scylla_tables()->get_column_definition("cdc"); m.set_clustered_cell(ckey, cdc_cdef, atomic_cell::make_dead(timestamp, gc_clock::now())); if (table->has_custom_partitioner()) { m.set_clustered_cell(ckey, "partitioner", table->get_partitioner().name(), timestamp); } else { // Avoid storing anything for default partitioner, so we don't end up with // different digests on different nodes due to the other node redacting // the partitioner column when the per_table_partitioners cluster feature is disabled. // // Tombstones are not considered for schema digest, so this is okay (and // needed in order for disabling of per_table_partitioners to have effect). auto& cdef = *scylla_tables()->get_column_definition("partitioner"); m.set_clustered_cell(ckey, cdef, atomic_cell::make_dead(timestamp, gc_clock::now())); } return m; } static schema_mutations make_table_mutations(schema_ptr table, api::timestamp_type timestamp, bool with_columns_and_triggers) { // When adding new schema properties, don't set cells for default values so that // both old and new nodes will see the same version during rolling upgrades. // For property that can be null (and can be changed), we insert tombstones, to make sure // we don't keep a property the user has removed schema_ptr s = tables(); auto pkey = partition_key::from_singular(*s, table->ks_name()); mutation m{s, pkey}; auto ckey = clustering_key::from_singular(*s, table->cf_name()); m.set_clustered_cell(ckey, "id", table->id().uuid(), timestamp); auto scylla_tables_mutation = make_scylla_tables_mutation(table, timestamp); list_type_impl::native_type flags; if (table->is_super()) { flags.emplace_back("super"); } if (table->is_dense()) { flags.emplace_back("dense"); } if (table->is_compound()) { flags.emplace_back("compound"); } if (table->is_counter()) { flags.emplace_back("counter"); } m.set_clustered_cell(ckey, "flags", make_list_value(s->get_column_definition("flags")->type, flags), timestamp); add_table_params_to_mutations(m, ckey, table, timestamp); mutation columns_mutation(columns(), pkey); mutation computed_columns_mutation(computed_columns(), pkey); mutation dropped_columns_mutation(dropped_columns(), pkey); mutation indices_mutation(indexes(), pkey); if (with_columns_and_triggers) { for (auto&& column : table->v3().all_columns()) { if (column.is_view_virtual()) { throw std::logic_error("view_virtual column found in non-view table"); } add_column_to_schema_mutation(table, column, timestamp, columns_mutation); if (column.is_computed()) { add_computed_column_to_schema_mutation(table, column, timestamp, computed_columns_mutation); } } for (auto&& index : table->indices()) { add_index_to_schema_mutation(table, index, timestamp, indices_mutation); } // TODO: triggers for (auto&& e : table->dropped_columns()) { add_dropped_column_to_schema_mutation(table, e.first, e.second, timestamp, dropped_columns_mutation); } } return schema_mutations{std::move(m), std::move(columns_mutation), std::nullopt, std::move(computed_columns_mutation), std::move(indices_mutation), std::move(dropped_columns_mutation), std::move(scylla_tables_mutation)}; } void add_table_or_view_to_schema_mutation(schema_ptr s, api::timestamp_type timestamp, bool with_columns, std::vector& mutations) { make_schema_mutations(s, timestamp, with_columns).copy_to(mutations); } static schema_mutations make_view_mutations(view_ptr view, api::timestamp_type timestamp, bool with_columns); static void make_drop_table_or_view_mutations(schema_ptr schema_table, schema_ptr table_or_view, api::timestamp_type timestamp, std::vector& mutations); static void make_update_indices_mutations( replica::database& db, schema_ptr old_table, schema_ptr new_table, api::timestamp_type timestamp, std::vector& mutations) { mutation indices_mutation(indexes(), partition_key::from_singular(*indexes(), old_table->ks_name())); auto diff = difference(old_table->all_indices(), new_table->all_indices()); // indices that are no longer needed for (auto&& name : diff.entries_only_on_left) { const index_metadata& index = old_table->all_indices().at(name); drop_index_from_schema_mutation(old_table, index, timestamp, mutations); schema_ptr view; try { view = db.find_schema(old_table->ks_name(), secondary_index::index_table_name(name)); db.get_notifier().before_drop_column_family(*view, mutations, timestamp); } catch (const replica::no_such_column_family&) { on_internal_error(slogger, format("Could not find schema for dropped index {}.{}", old_table->ks_name(), secondary_index::index_table_name(name))); } make_drop_table_or_view_mutations(views(), view, timestamp, mutations); } auto add_index = [&](const sstring& name) -> view_ptr { const index_metadata& index = new_table->all_indices().at(name); add_index_to_schema_mutation(new_table, index, timestamp, indices_mutation); auto& cf = db.find_column_family(new_table); auto view = cf.get_index_manager().create_view_for_index(index); auto view_mutations = make_view_mutations(view, timestamp, true); view_mutations.copy_to(mutations); return view; }; // old indices with updated attributes for (auto&& name : diff.entries_differing) { add_index(name); } // Newly added indices. Because these are newly created tables (views), // we need to call the before_create_column_family callback for them. // If we don't, among other things *tablets* will not be created for // these new views. // The callbacks must be called in a Seastar thread, which means that // *this* function must be called in a Seastar thread when creating an // index. for (auto&& name : diff.entries_only_on_right) { auto view = add_index(name); auto ksm = db.find_keyspace(new_table->ks_name()).metadata(); db.get_notifier().before_create_column_family(*ksm, *view, mutations, timestamp); } mutations.emplace_back(std::move(indices_mutation)); } static void add_drop_column_to_mutations(schema_ptr table, const sstring& name, const schema::dropped_column& dc, api::timestamp_type timestamp, std::vector& mutations) { schema_ptr s = dropped_columns(); auto pkey = partition_key::from_singular(*s, table->ks_name()); auto ckey = clustering_key::from_exploded(*s, {utf8_type->decompose(table->cf_name()), utf8_type->decompose(name)}); mutation m(s, pkey); add_dropped_column_to_schema_mutation(table, name, dc, timestamp, m); mutations.emplace_back(std::move(m)); } static void make_update_columns_mutations(schema_ptr old_table, schema_ptr new_table, api::timestamp_type timestamp, bool from_thrift, std::vector& mutations) { mutation columns_mutation(columns(), partition_key::from_singular(*columns(), old_table->ks_name())); mutation view_virtual_columns_mutation(view_virtual_columns(), partition_key::from_singular(*columns(), old_table->ks_name())); mutation computed_columns_mutation(computed_columns(), partition_key::from_singular(*columns(), old_table->ks_name())); auto diff = difference(old_table->v3().columns_by_name(), new_table->v3().columns_by_name()); // columns that are no longer needed for (auto&& name : diff.entries_only_on_left) { // Thrift only knows about the REGULAR ColumnDefinition type, so don't consider other type // are being deleted just because they are not here. const column_definition& column = *old_table->v3().columns_by_name().at(name); if (from_thrift && !column.is_regular()) { continue; } if (column.is_view_virtual()) { drop_column_from_schema_mutation(view_virtual_columns(), old_table, column.name_as_text(), timestamp, mutations); } else { drop_column_from_schema_mutation(columns(), old_table, column.name_as_text(), timestamp, mutations); } if (column.is_computed()) { drop_column_from_schema_mutation(computed_columns(), old_table, column.name_as_text(), timestamp, mutations); } } // newly added columns and old columns with updated attributes for (auto&& name : boost::range::join(diff.entries_differing, diff.entries_only_on_right)) { const column_definition& column = *new_table->v3().columns_by_name().at(name); if (column.is_view_virtual()) { add_column_to_schema_mutation(new_table, column, timestamp, view_virtual_columns_mutation); } else { add_column_to_schema_mutation(new_table, column, timestamp, columns_mutation); } if (column.is_computed()) { add_computed_column_to_schema_mutation(new_table, column, timestamp, computed_columns_mutation); } } mutations.emplace_back(std::move(columns_mutation)); mutations.emplace_back(std::move(view_virtual_columns_mutation)); mutations.emplace_back(std::move(computed_columns_mutation)); // dropped columns auto dc_diff = difference(old_table->dropped_columns(), new_table->dropped_columns()); // newly dropped columns // columns added then dropped again for (auto& name : boost::range::join(dc_diff.entries_differing, dc_diff.entries_only_on_right)) { add_drop_column_to_mutations(new_table, name, new_table->dropped_columns().at(name), timestamp, mutations); } } std::vector make_update_table_mutations(replica::database& db, lw_shared_ptr keyspace, schema_ptr old_table, schema_ptr new_table, api::timestamp_type timestamp, bool from_thrift) { std::vector mutations; add_table_or_view_to_schema_mutation(new_table, timestamp, false, mutations); make_update_indices_mutations(db, old_table, new_table, timestamp, mutations); make_update_columns_mutations(std::move(old_table), std::move(new_table), timestamp, from_thrift, mutations); warn(unimplemented::cause::TRIGGERS); #if 0 MapDifference triggerDiff = Maps.difference(oldTable.getTriggers(), newTable.getTriggers()); // dropped triggers for (TriggerDefinition trigger : triggerDiff.entriesOnlyOnLeft().values()) dropTriggerFromSchemaMutation(oldTable, trigger, timestamp, mutation); // newly created triggers for (TriggerDefinition trigger : triggerDiff.entriesOnlyOnRight().values()) addTriggerToSchemaMutation(newTable, trigger, timestamp, mutation); #endif return mutations; } static void make_drop_table_or_view_mutations(schema_ptr schema_table, schema_ptr table_or_view, api::timestamp_type timestamp, std::vector& mutations) { auto pkey = partition_key::from_singular(*schema_table, table_or_view->ks_name()); mutation m{schema_table, pkey}; auto ckey = clustering_key::from_singular(*schema_table, table_or_view->cf_name()); m.partition().apply_delete(*schema_table, ckey, tombstone(timestamp, gc_clock::now())); mutations.emplace_back(m); for (auto& column : table_or_view->v3().all_columns()) { if (column.is_view_virtual()) { drop_column_from_schema_mutation(view_virtual_columns(), table_or_view, column.name_as_text(), timestamp, mutations); } else { drop_column_from_schema_mutation(columns(), table_or_view, column.name_as_text(), timestamp, mutations); } if (column.is_computed()) { drop_column_from_schema_mutation(computed_columns(), table_or_view, column.name_as_text(), timestamp, mutations); } } for (auto& column : table_or_view->dropped_columns() | boost::adaptors::map_keys) { drop_column_from_schema_mutation(dropped_columns(), table_or_view, column, timestamp, mutations); } mutation m1{scylla_tables(), pkey}; m1.partition().apply_delete(*scylla_tables(), ckey, tombstone(timestamp, gc_clock::now())); mutations.emplace_back(m1); } std::vector make_drop_table_mutations(lw_shared_ptr keyspace, schema_ptr table, api::timestamp_type timestamp) { std::vector mutations; make_drop_table_or_view_mutations(tables(), std::move(table), timestamp, mutations); #if 0 for (TriggerDefinition trigger : table.getTriggers().values()) dropTriggerFromSchemaMutation(table, trigger, timestamp, mutation); // TODO: get rid of in #6717 ColumnFamily indexCells = mutation.addOrGet(SystemKeyspace.BuiltIndexes); for (String indexName : Keyspace.open(keyspace.name).getColumnFamilyStore(table.cfName).getBuiltIndexes()) indexCells.addTombstone(indexCells.getComparator().makeCellName(indexName), ldt, timestamp); #endif return mutations; } static future read_table_mutations(distributed& proxy, const qualified_name& table, schema_ptr s) { auto&& [cf_m, col_m, vv_col_m, c_col_m, dropped_m, idx_m, st_m] = co_await coroutine::all( [&] { return read_schema_partition_for_table(proxy, s, table.keyspace_name, table.table_name); }, [&] { return read_schema_partition_for_table(proxy, columns(), table.keyspace_name, table.table_name); }, [&] { return read_schema_partition_for_table(proxy, view_virtual_columns(), table.keyspace_name, table.table_name); }, [&] { return read_schema_partition_for_table(proxy, computed_columns(), table.keyspace_name, table.table_name); }, [&] { return read_schema_partition_for_table(proxy, dropped_columns(), table.keyspace_name, table.table_name); }, [&] { return read_schema_partition_for_table(proxy, indexes(), table.keyspace_name, table.table_name); }, [&] { return read_schema_partition_for_table(proxy, scylla_tables(), table.keyspace_name, table.table_name); } ); co_return schema_mutations{std::move(cf_m), std::move(col_m), std::move(vv_col_m), std::move(c_col_m), std::move(idx_m), std::move(dropped_m), std::move(st_m)}; #if 0 // FIXME: Row serializedTriggers = readSchemaPartitionForTable(TRIGGERS, ksName, cfName); try { for (TriggerDefinition trigger : createTriggersFromTriggersPartition(serializedTriggers)) cfm.addTriggerDefinition(trigger); } catch (InvalidRequestException e) { throw new RuntimeException(e); } #endif } future create_table_from_name(distributed& proxy, const sstring& keyspace, const sstring& table) { auto qn = qualified_name(keyspace, table); auto sm = co_await read_table_mutations(proxy, qn, tables()); if (!sm.live()) { co_await coroutine::return_exception(std::runtime_error(format("{}:{} not found in the schema definitions keyspace.", qn.keyspace_name, qn.table_name))); } co_return create_table_from_mutations(proxy, std::move(sm)); } /** * Deserialize tables from low-level schema representation, all of them belong to the same keyspace * * @return map containing name of the table and its metadata for faster lookup */ future> create_tables_from_tables_partition(distributed& proxy, const schema_result::mapped_type& result) { auto tables = std::map(); co_await max_concurrent_for_each(result->rows().begin(), result->rows().end(), max_concurrent, [&] (const query::result_set_row& row) -> future<> { schema_ptr cfm = co_await create_table_from_table_row(proxy, row); tables.emplace(cfm->cf_name(), std::move(cfm)); }); co_return std::move(tables); } #if 0 public static CFMetaData createTableFromTablePartitionAndColumnsPartition(Row serializedTable, Row serializedColumns) { String query = String.format("SELECT * FROM %s.%s", SystemKeyspace.NAME, COLUMNFAMILIES); return createTableFromTableRowAndColumnsPartition(QueryProcessor.resultify(query, serializedTable).one(), serializedColumns); } #endif /** * Deserialize table metadata from low-level representation * * @return Metadata deserialized from schema */ static future create_table_from_table_row(distributed& proxy, const query::result_set_row& row) { auto ks_name = row.get_nonnull("keyspace_name"); auto cf_name = row.get_nonnull("table_name"); return create_table_from_name(proxy, ks_name, cf_name); } static void prepare_builder_from_table_row(const schema_ctxt& ctxt, schema_builder& builder, const query::result_set_row& table_row) { // These row reads have been purposefully reordered to match the origin counterpart. For easier matching. if (auto val = table_row.get("bloom_filter_fp_chance")) { builder.set_bloom_filter_fp_chance(*val); } else { builder.set_bloom_filter_fp_chance(builder.get_bloom_filter_fp_chance()); } if (auto map = get_map(table_row, "caching")) { builder.set_caching_options(caching_options::from_map(*map)); } if (auto val = table_row.get("comment")) { builder.set_comment(*val); } if (auto opt_map = get_map(table_row, "compaction")) { auto &map = *opt_map; auto i = map.find("class"); if (i != map.end()) { try { builder.set_compaction_strategy(sstables::compaction_strategy::type(i->second)); map.erase(i); } catch (const exceptions::configuration_exception& e) { // If compaction strategy class isn't supported, fallback to size tiered. slogger.warn("Falling back to size-tiered compaction strategy after the problem: {}", e.what()); builder.set_compaction_strategy(sstables::compaction_strategy_type::size_tiered); } } if (map.contains("max_threshold")) { builder.set_max_compaction_threshold(std::stoi(map["max_threshold"])); } if (map.contains("min_threshold")) { builder.set_min_compaction_threshold(std::stoi(map["min_threshold"])); } if (map.contains("enabled")) { builder.set_compaction_enabled(boost::algorithm::iequals(map["enabled"], "true")); } builder.set_compaction_strategy_options(std::move(map)); } if (auto map = get_map(table_row, "compression")) { compression_parameters cp(*map); builder.set_compressor_params(cp); } if (auto val = table_row.get("default_time_to_live")) { builder.set_default_time_to_live(gc_clock::duration(*val)); } if (auto val = get_map(table_row, "extensions")) { auto &map = *val; schema::extensions_map result; auto& exts = ctxt.extensions().schema_extensions(); for (auto&p : map) { auto i = exts.find(p.first); if (i != exts.end()) { try { auto ep = i->second(p.second); if (ep) { result.emplace(p.first, std::move(ep)); } continue; } catch (...) { slogger.warn("Error parsing extension {}: {}", p.first, std::current_exception()); } } // unknown. we should still preserve it. class placeholder : public schema_extension { bytes _bytes; public: placeholder(bytes bytes) : _bytes(std::move(bytes)) { } bytes serialize() const override { return _bytes; } virtual bool is_placeholder() const override { return true; } }; result.emplace(p.first, ::make_shared(p.second)); } builder.set_extensions(std::move(result)); } if (auto val = table_row.get("gc_grace_seconds")) { builder.set_gc_grace_seconds(*val); } if (auto val = table_row.get("min_index_interval")) { builder.set_min_index_interval(*val); } if (auto val = table_row.get("memtable_flush_period_in_ms")) { builder.set_memtable_flush_period(*val); } if (auto val = table_row.get("max_index_interval")) { builder.set_max_index_interval(*val); } if (auto val = table_row.get("crc_check_chance")) { builder.set_crc_check_chance(*val); } if (auto val = table_row.get("speculative_retry")) { builder.set_speculative_retry(*val); } } schema_ptr create_table_from_mutations(const schema_ctxt& ctxt, schema_mutations sm, std::optional version) { slogger.trace("create_table_from_mutations: version={}, {}", version, sm); auto table_rs = query::result_set(sm.columnfamilies_mutation()); query::result_set_row table_row = table_rs.row(0); auto ks_name = table_row.get_nonnull("keyspace_name"); auto cf_name = table_row.get_nonnull("table_name"); auto id = table_id(table_row.get_nonnull("id")); schema_builder builder{ks_name, cf_name, id}; auto cf = cf_type::standard; auto is_dense = false; auto is_counter = false; auto is_compound = false; auto flags = table_row.get("flags"); if (flags) { for (auto& s : *flags) { if (s == "super") { // cf = cf_type::super; fail(unimplemented::cause::SUPER); } else if (s == "dense") { is_dense = true; } else if (s == "compound") { is_compound = true; } else if (s == "counter") { is_counter = true; } } } auto computed_columns = get_computed_columns(sm); std::vector column_defs = create_columns_from_column_rows( ctxt, query::result_set(sm.columns_mutation()), ks_name, cf_name,/*, fullRawComparator, */ cf == cf_type::super, column_view_virtual::no, computed_columns); builder.set_is_dense(is_dense); builder.set_is_compound(is_compound); builder.set_is_counter(is_counter); prepare_builder_from_table_row(ctxt, builder, table_row); v3_columns columns(std::move(column_defs), is_dense, is_compound); columns.apply_to(builder); std::vector index_defs; if (sm.indices_mutation()) { index_defs = create_indices_from_index_rows(query::result_set(*sm.indices_mutation()), ks_name, cf_name); } for (auto&& index : index_defs) { builder.with_index(index); } if (sm.dropped_columns_mutation()) { query::result_set dcr(*sm.dropped_columns_mutation()); for (auto& row : dcr.rows()) { auto name = row.get_nonnull("column_name"); auto type = cql_type_parser::parse(ks_name, row.get_nonnull("type"), ctxt.user_types()); auto time = row.get_nonnull("dropped_time"); builder.without_column(name, type, time.time_since_epoch().count()); } } if (version) { builder.with_version(*version); } else { builder.with_version(sm.digest(ctxt.features().cluster_schema_features())); } if (auto partitioner = sm.partitioner()) { builder.with_partitioner(*partitioner); builder.with_sharder(smp::count, ctxt.murmur3_partitioner_ignore_msb_bits()); } return builder.build(); } /* * Column metadata serialization/deserialization. */ static void add_column_to_schema_mutation(schema_ptr table, const column_definition& column, api::timestamp_type timestamp, mutation& m) { auto ckey = clustering_key::from_exploded(*m.schema(), {utf8_type->decompose(table->cf_name()), utf8_type->decompose(column.name_as_text())}); fill_column_info(*table, ckey, column, timestamp, std::nullopt, m); } static void add_computed_column_to_schema_mutation(schema_ptr table, const column_definition& column, api::timestamp_type timestamp, mutation& m) { auto ckey = clustering_key::from_exploded(*m.schema(), {utf8_type->decompose(table->cf_name()), utf8_type->decompose(column.name_as_text())}); m.set_clustered_cell(ckey, "computation", data_value(column.get_computation().serialize()), timestamp); } sstring serialize_kind(column_kind kind) { switch (kind) { case column_kind::partition_key: return "partition_key"; case column_kind::clustering_key: return "clustering"; case column_kind::static_column: return "static"; case column_kind::regular_column: return "regular"; default: throw std::invalid_argument("unknown column kind"); } } column_kind deserialize_kind(sstring kind) { if (kind == "partition_key") { return column_kind::partition_key; } else if (kind == "clustering_key" || kind == "clustering") { return column_kind::clustering_key; } else if (kind == "static") { return column_kind::static_column; } else if (kind == "regular") { return column_kind::regular_column; } else if (kind == "compact_value") { // backward compatibility return column_kind::regular_column; } else { throw std::invalid_argument("unknown column kind: " + kind); } } sstring serialize_index_kind(index_metadata_kind kind) { switch (kind) { case index_metadata_kind::keys: return "KEYS"; case index_metadata_kind::composites: return "COMPOSITES"; case index_metadata_kind::custom: return "CUSTOM"; } throw std::invalid_argument("unknown index kind"); } index_metadata_kind deserialize_index_kind(sstring kind) { if (kind == "KEYS") { return index_metadata_kind::keys; } else if (kind == "COMPOSITES") { return index_metadata_kind::composites; } else if (kind == "CUSTOM") { return index_metadata_kind::custom; } else { throw std::invalid_argument("unknown column kind: " + kind); } } static void add_index_to_schema_mutation(schema_ptr table, const index_metadata& index, api::timestamp_type timestamp, mutation& m) { auto ckey = clustering_key::from_exploded(*m.schema(), {utf8_type->decompose(table->cf_name()), utf8_type->decompose(index.name())}); m.set_clustered_cell(ckey, "kind", serialize_index_kind(index.kind()), timestamp); store_map(m, ckey, "options", timestamp, index.options()); } static void drop_index_from_schema_mutation(schema_ptr table, const index_metadata& index, long timestamp, std::vector& mutations) { schema_ptr s = indexes(); auto pkey = partition_key::from_singular(*s, table->ks_name()); auto ckey = clustering_key::from_exploded(*s, {utf8_type->decompose(table->cf_name()), utf8_type->decompose(index.name())}); mutation m{s, pkey}; m.partition().apply_delete(*s, ckey, tombstone(timestamp, gc_clock::now())); mutations.push_back(std::move(m)); } static void drop_column_from_schema_mutation( schema_ptr schema_table, schema_ptr table, const sstring& column_name, long timestamp, std::vector& mutations) { auto pkey = partition_key::from_singular(*schema_table, table->ks_name()); auto ckey = clustering_key::from_exploded(*schema_table, {utf8_type->decompose(table->cf_name()), utf8_type->decompose(column_name)}); mutation m{schema_table, pkey}; m.partition().apply_delete(*schema_table, ckey, tombstone(timestamp, gc_clock::now())); mutations.emplace_back(m); } static computed_columns_map get_computed_columns(const schema_mutations& sm) { if (!sm.computed_columns_mutation()) { return {}; } query::result_set computed_result(*sm.computed_columns_mutation()); return boost::copy_range( computed_result.rows() | boost::adaptors::transformed([] (const query::result_set_row& row) { return computed_columns_map::value_type{to_bytes(row.get_nonnull("column_name")), column_computation::deserialize(row.get_nonnull("computation"))}; })); } static std::vector create_columns_from_column_rows(const schema_ctxt& ctxt, const query::result_set& rows, const sstring& keyspace, const sstring& table, /*, AbstractType rawComparator, */ bool is_super, column_view_virtual is_view_virtual, const computed_columns_map& computed_columns) { std::vector columns; for (auto&& row : rows.rows()) { auto kind = deserialize_kind(row.get_nonnull("kind")); auto type = cql_type_parser::parse(keyspace, row.get_nonnull("type"), ctxt.user_types()); auto name_bytes = row.get_nonnull("column_name_bytes"); column_id position = row.get_nonnull("position"); if (auto val = row.get("clustering_order")) { auto order = *val; std::transform(order.begin(), order.end(), order.begin(), ::toupper); if (order == "DESC") { type = reversed_type_impl::get_instance(type); } } column_computation_ptr computation; auto computed_it = computed_columns.find(name_bytes); if (computed_it != computed_columns.end()) { computation = computed_it->second->clone(); } columns.emplace_back(name_bytes, type, kind, position, is_view_virtual, std::move(computation)); } return columns; } static std::vector create_indices_from_index_rows(const query::result_set& rows, const sstring& keyspace, const sstring& table) { return boost::copy_range>(rows.rows() | boost::adaptors::transformed([&keyspace, &table] (auto&& row) { return create_index_from_index_row(row, keyspace, table); })); } static index_metadata create_index_from_index_row(const query::result_set_row& row, sstring keyspace, sstring table) { auto index_name = row.get_nonnull("index_name"); index_options_map options; auto map = row.get_nonnull("options"); for (auto&& entry : map) { options.emplace(value_cast(entry.first), value_cast(entry.second)); } index_metadata_kind kind = deserialize_index_kind(row.get_nonnull("kind")); sstring target_string = options.at(cql3::statements::index_target::target_option_name); const index_metadata::is_local_index is_local(secondary_index::target_parser::is_local(target_string)); return index_metadata{index_name, options, kind, is_local}; } /* * View metadata serialization/deserialization. */ view_ptr create_view_from_mutations(const schema_ctxt& ctxt, schema_mutations sm, std::optional version) { auto table_rs = query::result_set(sm.columnfamilies_mutation()); query::result_set_row row = table_rs.row(0); auto ks_name = row.get_nonnull("keyspace_name"); auto cf_name = row.get_nonnull("view_name"); auto id = table_id(row.get_nonnull("id")); schema_builder builder{ks_name, cf_name, id}; prepare_builder_from_table_row(ctxt, builder, row); auto computed_columns = get_computed_columns(sm); auto column_defs = create_columns_from_column_rows(ctxt, query::result_set(sm.columns_mutation()), ks_name, cf_name, false, column_view_virtual::no, computed_columns); for (auto&& cdef : column_defs) { builder.with_column_ordered(cdef); } if (sm.view_virtual_columns_mutation()) { column_defs = create_columns_from_column_rows(ctxt, query::result_set(*sm.view_virtual_columns_mutation()), ks_name, cf_name, false, column_view_virtual::yes, computed_columns); for (auto&& cdef : column_defs) { builder.with_column_ordered(cdef); } } if (version) { builder.with_version(*version); } else { builder.with_version(sm.digest(ctxt.features().cluster_schema_features())); } auto base_id = table_id(row.get_nonnull("base_table_id")); auto base_name = row.get_nonnull("base_table_name"); auto include_all_columns = row.get_nonnull("include_all_columns"); auto where_clause = row.get_nonnull("where_clause"); builder.with_view_info(std::move(base_id), std::move(base_name), include_all_columns, std::move(where_clause)); return view_ptr(builder.build()); } static future create_view_from_table_row(distributed& proxy, const query::result_set_row& row) { qualified_name qn(row.get_nonnull("keyspace_name"), row.get_nonnull("view_name")); schema_mutations sm = co_await read_table_mutations(proxy, qn, views()); if (!sm.live()) { co_await coroutine::return_exception(std::runtime_error(format("{}:{} not found in the view definitions keyspace.", qn.keyspace_name, qn.table_name))); } co_return create_view_from_mutations(proxy, std::move(sm)); } /** * Deserialize views from low-level schema representation, all of them belong to the same keyspace * * @return vector containing the view definitions */ future> create_views_from_schema_partition(distributed& proxy, const schema_result::mapped_type& result) { std::vector views; co_await max_concurrent_for_each(result->rows().begin(), result->rows().end(), max_concurrent, [&] (auto&& row) -> future<> { auto v = co_await create_view_from_table_row(proxy, row); views.push_back(std::move(v)); }); co_return std::move(views); } static schema_mutations make_view_mutations(view_ptr view, api::timestamp_type timestamp, bool with_columns) { // When adding new schema properties, don't set cells for default values so that // both old and new nodes will see the same version during rolling upgrades. // For properties that can be null (and can be changed), we insert tombstones, to make sure // we don't keep a property the user has removed schema_ptr s = views(); auto pkey = partition_key::from_singular(*s, view->ks_name()); mutation m{s, pkey}; auto ckey = clustering_key::from_singular(*s, view->cf_name()); m.set_clustered_cell(ckey, "base_table_id", view->view_info()->base_id().uuid(), timestamp); m.set_clustered_cell(ckey, "base_table_name", view->view_info()->base_name(), timestamp); m.set_clustered_cell(ckey, "where_clause", view->view_info()->where_clause(), timestamp); m.set_clustered_cell(ckey, "bloom_filter_fp_chance", view->bloom_filter_fp_chance(), timestamp); m.set_clustered_cell(ckey, "include_all_columns", view->view_info()->include_all_columns(), timestamp); m.set_clustered_cell(ckey, "id", view->id().uuid(), timestamp); add_table_params_to_mutations(m, ckey, view, timestamp); mutation columns_mutation(columns(), pkey); mutation view_virtual_columns_mutation(view_virtual_columns(), pkey); mutation computed_columns_mutation(computed_columns(), pkey); mutation dropped_columns_mutation(dropped_columns(), pkey); mutation indices_mutation(indexes(), pkey); if (with_columns) { for (auto&& column : view->v3().all_columns()) { if (column.is_view_virtual()) { add_column_to_schema_mutation(view, column, timestamp, view_virtual_columns_mutation); } else { add_column_to_schema_mutation(view, column, timestamp, columns_mutation); } if (column.is_computed()) { add_computed_column_to_schema_mutation(view, column, timestamp, computed_columns_mutation); } } for (auto&& e : view->dropped_columns()) { add_dropped_column_to_schema_mutation(view, e.first, e.second, timestamp, dropped_columns_mutation); } for (auto&& index : view->indices()) { add_index_to_schema_mutation(view, index, timestamp, indices_mutation); } } auto scylla_tables_mutation = make_scylla_tables_mutation(view, timestamp); return schema_mutations{std::move(m), std::move(columns_mutation), std::move(view_virtual_columns_mutation), std::move(computed_columns_mutation), std::move(indices_mutation), std::move(dropped_columns_mutation), std::move(scylla_tables_mutation)}; } schema_mutations make_schema_mutations(schema_ptr s, api::timestamp_type timestamp, bool with_columns) { return s->is_view() ? make_view_mutations(view_ptr(s), timestamp, with_columns) : make_table_mutations(s, timestamp, with_columns); } std::vector make_create_view_mutations(lw_shared_ptr keyspace, view_ptr view, api::timestamp_type timestamp) { std::vector mutations; // Include the serialized base table mutations in case the target node is missing them. auto base = keyspace->cf_meta_data().at(view->view_info()->base_name()); // Use a smaller timestamp for the included base mutations. // If the constructed schema change command also contains an update for the base table, // these mutations would conflict with the base mutations we're returning here; using a smaller // timestamp makes sure that the update mutations take precedence. Although there is no known // scenario involving creation of new view where this might happen, there is one with updating // a view (see `make_update_view_mutations`); we use similarly modified timestamp here for consistency. add_table_or_view_to_schema_mutation(base, timestamp - 1, true, mutations); add_table_or_view_to_schema_mutation(view, timestamp, true, mutations); make_table_deleting_mutations(view->ks_name(), view->cf_name(), view->is_view(), timestamp) .copy_to(mutations); return mutations; } /** * Note: new_view can be generated due to an ALTER on its base table; in that * case, the new base schema isn't yet loaded, thus can't be accessed from this * function. */ std::vector make_update_view_mutations(lw_shared_ptr keyspace, view_ptr old_view, view_ptr new_view, api::timestamp_type timestamp, bool include_base) { std::vector mutations; if (include_base) { // Include the serialized base table mutations in case the target node is missing them. auto base = keyspace->cf_meta_data().at(new_view->view_info()->base_name()); // Use a smaller timestamp for the included base mutations. // If the constructed schema change command also contains an update for the base table, // these mutations would conflict with the base mutations we're returning here; using a smaller // timestamp makes sure that the update mutations take precedence. Such conflicting mutations // may appear, for example, when we modify a user defined type that is referenced by both base table // and its attached view. See #15530. add_table_or_view_to_schema_mutation(base, timestamp - 1, true, mutations); } add_table_or_view_to_schema_mutation(new_view, timestamp, false, mutations); make_update_columns_mutations(old_view, new_view, timestamp, false, mutations); return mutations; } std::vector make_drop_view_mutations(lw_shared_ptr keyspace, view_ptr view, api::timestamp_type timestamp) { std::vector mutations; make_drop_table_or_view_mutations(views(), view, timestamp, mutations); return mutations; } #if 0 private static AbstractType getComponentComparator(AbstractType rawComparator, Integer componentIndex) { return (componentIndex == null || (componentIndex == 0 && !(rawComparator instanceof CompositeType))) ? rawComparator : ((CompositeType)rawComparator).types.get(componentIndex); } /* * Trigger metadata serialization/deserialization. */ private static void addTriggerToSchemaMutation(CFMetaData table, TriggerDefinition trigger, long timestamp, Mutation mutation) { ColumnFamily cells = mutation.addOrGet(Triggers); Composite prefix = Triggers.comparator.make(table.cfName, trigger.name); CFRowAdder adder = new CFRowAdder(cells, prefix, timestamp); adder.addMapEntry("trigger_options", "class", trigger.classOption); } private static void dropTriggerFromSchemaMutation(CFMetaData table, TriggerDefinition trigger, long timestamp, Mutation mutation) { ColumnFamily cells = mutation.addOrGet(Triggers); int ldt = (int) (System.currentTimeMillis() / 1000); Composite prefix = Triggers.comparator.make(table.cfName, trigger.name); cells.addAtom(new RangeTombstone(prefix, prefix.end(), timestamp, ldt)); } /** * Deserialize triggers from storage-level representation. * * @param partition storage-level partition containing the trigger definitions * @return the list of processed TriggerDefinitions */ private static List createTriggersFromTriggersPartition(Row partition) { List triggers = new ArrayList<>(); String query = String.format("SELECT * FROM %s.%s", SystemKeyspace.NAME, TRIGGERS); for (UntypedResultSet.Row row : QueryProcessor.resultify(query, partition)) { String name = row.getString("trigger_name"); String classOption = row.getMap("trigger_options", UTF8Type.instance, UTF8Type.instance).get("class"); triggers.add(new TriggerDefinition(name, classOption)); } return triggers; } /* * Aggregate UDF metadata serialization/deserialization. */ public static Mutation makeCreateAggregateMutation(KSMetaData keyspace, UDAggregate aggregate, long timestamp) { // Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631). Mutation mutation = makeCreateKeyspaceMutation(keyspace, timestamp, false); addAggregateToSchemaMutation(aggregate, timestamp, mutation); return mutation; } private static void addAggregateToSchemaMutation(UDAggregate aggregate, long timestamp, Mutation mutation) { ColumnFamily cells = mutation.addOrGet(Aggregates); Composite prefix = Aggregates.comparator.make(aggregate.name().name, UDHelper.calculateSignature(aggregate)); CFRowAdder adder = new CFRowAdder(cells, prefix, timestamp); adder.resetCollection("argument_types"); adder.add("return_type", aggregate.returnType().toString()); adder.add("state_func", aggregate.stateFunction().name().name); if (aggregate.stateType() != null) adder.add("state_type", aggregate.stateType().toString()); if (aggregate.finalFunction() != null) adder.add("final_func", aggregate.finalFunction().name().name); if (aggregate.initialCondition() != null) adder.add("initcond", aggregate.initialCondition()); for (AbstractType argType : aggregate.argTypes()) adder.addListEntry("argument_types", argType.toString()); } private static Map createAggregatesFromAggregatesPartition(Row partition) { Map aggregates = new HashMap<>(); String query = String.format("SELECT * FROM %s.%s", SystemKeyspace.NAME, AGGREGATES); for (UntypedResultSet.Row row : QueryProcessor.resultify(query, partition)) { UDAggregate aggregate = createAggregateFromAggregateRow(row); aggregates.put(UDHelper.calculateSignature(aggregate), aggregate); } return aggregates; } private static UDAggregate createAggregateFromAggregateRow(UntypedResultSet.Row row) { String ksName = row.getString("keyspace_name"); String functionName = row.getString("aggregate_name"); FunctionName name = new FunctionName(ksName, functionName); List types = row.getList("argument_types", UTF8Type.instance); List> argTypes; if (types == null) { argTypes = Collections.emptyList(); } else { argTypes = new ArrayList<>(types.size()); for (String type : types) argTypes.add(parseType(type)); } AbstractType returnType = parseType(row.getString("return_type")); FunctionName stateFunc = new FunctionName(ksName, row.getString("state_func")); FunctionName finalFunc = row.has("final_func") ? new FunctionName(ksName, row.getString("final_func")) : null; AbstractType stateType = row.has("state_type") ? parseType(row.getString("state_type")) : null; ByteBuffer initcond = row.has("initcond") ? row.getBytes("initcond") : null; try { return UDAggregate.create(name, argTypes, returnType, stateFunc, finalFunc, stateType, initcond); } catch (InvalidRequestException reason) { return UDAggregate.createBroken(name, argTypes, returnType, initcond, reason); } } public static Mutation makeDropAggregateMutation(KSMetaData keyspace, UDAggregate aggregate, long timestamp) { // Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631). Mutation mutation = makeCreateKeyspaceMutation(keyspace, timestamp, false); ColumnFamily cells = mutation.addOrGet(Aggregates); int ldt = (int) (System.currentTimeMillis() / 1000); Composite prefix = Aggregates.comparator.make(aggregate.name().name, UDHelper.calculateSignature(aggregate)); cells.addAtom(new RangeTombstone(prefix, prefix.end(), timestamp, ldt)); return mutation; } #endif data_type parse_type(sstring str) { return db::marshal::type_parser::parse(str); } std::vector all_tables(schema_features features) { // Don't forget to update this list when new schema tables are added. // The listed schema tables are the ones synchronized between nodes, // and forgetting one of them in this list can cause bugs like #4339. // // This list must be kept backwards-compatible because it's used // for schema digest calculation. Refs #4457. std::vector result = { keyspaces(), tables(), scylla_tables(), columns(), dropped_columns(), triggers(), views(), types(), functions(), aggregates(), indexes() }; result.emplace_back(view_virtual_columns()); if (features.contains()) { result.emplace_back(computed_columns()); } if (features.contains()) { result.emplace_back(scylla_keyspaces()); } if (features.contains()) { result.emplace_back(scylla_aggregates()); } return result; } std::vector all_table_names(schema_features features) { return boost::copy_range>(all_tables(features) | boost::adaptors::transformed([] (auto schema) { return schema->cf_name(); })); } void check_no_legacy_secondary_index_mv_schema(replica::database& db, const view_ptr& v, schema_ptr base_schema) { // Legacy format for a secondary index used a hardcoded "token" column, which ensured a proper // order for indexed queries. This "token" column is has been implemented as a computed column // for a long time now, and migration code has been / will be executed on all reasonable Scylla // deployments (which don't do unsupported upgrades). // // This function is now used as a sanity check that we're not dealing with the legacy format anymore. if (v->clustering_key_size() == 0) { return; } const auto ck_cols = v->clustering_key_columns(); const column_definition& first_view_ck = ck_cols.front(); if (first_view_ck.is_computed()) { return; } if (!base_schema) { base_schema = db.find_schema(v->view_info()->base_id()); } // If the first clustering key part of a view is a column with name not found in base schema, // and the column is not computed (which we checked above), then it must be backing an index // created before computed columns were introduced. if (!base_schema->columns_by_name().contains(first_view_ck.name())) { on_fatal_internal_error(slogger, format( "Materialized view {}.{}: first clustering key column ({}) is not computed and does not have a corresponding" " column in the base table. This materialized view must therefore be a secondary index created" " using legacy method (without computed columns) that wasn't migrated properly to new method." " Make sure that you perform rolling upgrade according to documented procedure without skipping" " major Scylla versions.", v->ks_name(), v->cf_name(), first_view_ck.name_as_text())); } } namespace legacy { table_schema_version schema_mutations::digest() const { md5_hasher h; const db::schema_features no_features; db::schema_tables::feed_hash_for_schema_digest(h, _columnfamilies, no_features); db::schema_tables::feed_hash_for_schema_digest(h, _columns, no_features); return table_schema_version(utils::UUID_gen::get_name_UUID(h.finalize())); } future read_table_mutations(distributed& proxy, sstring keyspace_name, sstring table_name, schema_ptr s) { mutation cf_m = co_await read_schema_partition_for_table(proxy, s, keyspace_name, table_name); mutation col_m = co_await read_schema_partition_for_table(proxy, db::system_keyspace::legacy::columns(), keyspace_name, table_name); co_return schema_mutations{std::move(cf_m), std::move(col_m)}; } } // namespace legacy static auto GET_COLUMN_MAPPING_QUERY = format("SELECT column_name, clustering_order, column_name_bytes, kind, position, type FROM system.{} WHERE cf_id = ? AND schema_version = ?", db::schema_tables::SCYLLA_TABLE_SCHEMA_HISTORY); future get_column_mapping(db::system_keyspace& sys_ks, ::table_id table_id, table_schema_version version) { shared_ptr results = co_await sys_ks._qp.execute_internal( GET_COLUMN_MAPPING_QUERY, db::consistency_level::LOCAL_ONE, {table_id.uuid(), version.uuid()}, cql3::query_processor::cache_internal::no ); if (results->empty()) { // If we don't have a stored column_mapping for an obsolete schema version // then it means it's way too old and been cleaned up already. // Fail the whole learn stage in this case. co_await coroutine::return_exception(std::runtime_error( format("Failed to look up column mapping for schema version {}", version))); } std::vector static_columns, regular_columns; for (const auto& row : *results) { auto kind = deserialize_kind(row.get_as("kind")); auto type = cql_type_parser::parse("" /*unused*/, row.get_as("type"), data_dictionary::dummy_user_types_storage()); auto name_bytes = row.get_blob("column_name_bytes"); column_id position = row.get_as("position"); auto order = row.get_as("clustering_order"); std::transform(order.begin(), order.end(), order.begin(), ::toupper); if (order == "DESC") { type = reversed_type_impl::get_instance(type); } if (kind == column_kind::static_column) { static_columns.emplace_back(name_bytes, type, kind, position); } else if (kind == column_kind::regular_column) { regular_columns.emplace_back(name_bytes, type, kind, position); } } std::vector cm_columns; for (const column_definition& def : boost::range::join(static_columns, regular_columns)) { cm_columns.emplace_back(column_mapping_entry{def.name(), def.type}); } column_mapping cm(std::move(cm_columns), static_columns.size()); co_return std::move(cm); } future column_mapping_exists(db::system_keyspace& sys_ks, table_id table_id, table_schema_version version) { shared_ptr results = co_await sys_ks._qp.execute_internal( GET_COLUMN_MAPPING_QUERY, db::consistency_level::LOCAL_ONE, {table_id.uuid(), version.uuid()}, cql3::query_processor::cache_internal::yes ); co_return !results->empty(); } future<> drop_column_mapping(db::system_keyspace& sys_ks, table_id table_id, table_schema_version version) { const static sstring DEL_COLUMN_MAPPING_QUERY = format("DELETE FROM system.{} WHERE cf_id = ? and schema_version = ?", db::schema_tables::SCYLLA_TABLE_SCHEMA_HISTORY); co_await sys_ks._qp.execute_internal( DEL_COLUMN_MAPPING_QUERY, db::consistency_level::LOCAL_ONE, {table_id.uuid(), version.uuid()}, cql3::query_processor::cache_internal::no); } } // namespace schema_tables } // namespace schema