/*
 * Modified by ScyllaDB
 * Copyright (C) 2015-present ScyllaDB
 */

/*
 * SPDX-License-Identifier: (AGPL-3.0-or-later and Apache-2.0)
 */

// NOTE(review): every angle-bracket span in this file appears to have been
// stripped during extraction: the bare `#include` directives below lost their
// `<...>` targets, and template argument lists (e.g. std::unique_ptr<...>,
// std::chrono::duration<...>, std::optional<...>, duration_cast<...>) lost
// their contents throughout. Restore from upstream before compiling — TODO confirm.
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "system_keyspace.hh"
#include "types.hh"
#include "service/storage_proxy.hh"
#include "service/client_state.hh"
#include "service/query_state.hh"
#include "cql3/query_options.hh"
#include "cql3/query_processor.hh"
#include "cql3/untyped_result_set.hh"
#include "utils/fb_utilities.hh"
#include "utils/hash.hh"
#include "version.hh"
#include "thrift/server.hh"
#include "exceptions/exceptions.hh"
// NOTE(review): duplicate include — "cql3/query_processor.hh" is already
// included above; harmless (header guards), but could be dropped.
#include "cql3/query_processor.hh"
#include "query_context.hh"
#include "partition_slice_builder.hh"
#include "db/config.hh"
#include "gms/feature_service.hh"
#include "system_keyspace_view_types.hh"
#include "schema_builder.hh"
#include "hashers.hh"
#include "release.hh"
#include "log.hh"
#include "serializer.hh"
#include
#include
#include "index/secondary_index.hh"
// NOTE(review): duplicate include — "service/storage_proxy.hh" also appears above.
#include "service/storage_proxy.hh"
#include "message/messaging_service.hh"
#include "mutation_query.hh"
#include "db/size_estimates_virtual_reader.hh"
#include "db/timeout_clock.hh"
#include "sstables/sstables.hh"
#include "db/view/build_progress_virtual_reader.hh"
#include "db/schema_tables.hh"
#include "index/built_indexes_virtual_reader.hh"
#include "utils/generation-number.hh"
#include "db/virtual_table.hh"
#include "service/storage_service.hh"
#include "gms/gossiper.hh"
#include "service/paxos/paxos_state.hh"
#include "utils/build_id.hh"
#include "query-result-set.hh"
#include "idl/frozen_mutation.dist.hh"
#include "serializer_impl.hh"
#include "idl/frozen_mutation.dist.impl.hh"
#include
#include "client_data.hh"

// Convenience duration unit for per-table TTL / gc-grace settings below.
// NOTE(review): the template arguments were lost in extraction — presumably
// std::chrono::duration<int, std::ratio<24 * 3600>> — TODO confirm against upstream.
using days = std::chrono::duration>;

namespace db {

// Process-wide query context used by the qctx-based helpers in this file.
// NOTE(review): element type stripped in extraction — presumably
// std::unique_ptr<query_context> — TODO confirm.
std::unique_ptr qctx = {};

static logging::logger slogger("system_keyspace");

// Timestamp captured once at process start; used as the creation timestamp
// for all system-keyspace schema mutations generated by this node.
static const api::timestamp_type creation_timestamp = api::new_timestamp();

// Returns true iff the named system table requires extra durability
// guarantees, i.e. its name appears in the extra_durable_tables list.
bool system_keyspace::is_extra_durable(const sstring& name) {
    return boost::algorithm::any_of(extra_durable_tables, [name] (const char* table) { return name == table; });
}

// Timestamp to use for schema mutations of the built-in system tables.
api::timestamp_type system_keyspace::schema_creation_timestamp() {
    return creation_timestamp;
}

// Increase whenever changing schema of any system table.
// FIXME: Make automatic by calculating from schema structure.
static const uint16_t version_sequence_number = 1;

// Derives a deterministic schema version for a system table: a name-UUID of
// the MD5 over the table id plus (version_sequence_number + offset), so the
// version changes whenever the hard-coded schema (or the sequence number) does.
table_schema_version system_keyspace::generate_schema_version(utils::UUID table_id, uint16_t offset) {
    md5_hasher h;
    feed_hash(h, table_id);
    feed_hash(h, version_sequence_number + offset);
    return utils::UUID_gen::get_name_UUID(h.finalize());
}

// Currently, the type variables (uuid_type, etc.) are thread-local reference-
// counted shared pointers. This forces us to also make the built in schemas
// below thread-local as well.
// We return schema_ptr, not schema&, because that's the "tradition" in our
// other code.
// We hide the thread_local variable inside a function, because if we later
// we remove the thread_local, we'll start having initialization order
// problems (we need the type variables to be constructed first), and using
// functions will solve this problem. So we use functions right now.
// Schema for system.hints: mutations queued for delivery to unreachable replicas.
schema_ptr system_keyspace::hints() {
    static thread_local auto hints = [] {
        schema_builder builder(generate_legacy_id(NAME, HINTS), NAME, HINTS,
        // partition key
        {{"target_id", uuid_type}},
        // clustering key
        {{"hint_id", timeuuid_type}, {"message_version", int32_type}},
        // regular columns
        {{"mutation", bytes_type}},
        // static columns
        {},
        // regular column name type
        utf8_type,
        // comment
        "hints awaiting delivery"
        );
        builder.set_gc_grace_seconds(0);
        // Compaction is deliberately disabled for this table.
        builder.set_compaction_strategy_options({{ "enabled", "false" }});
        builder.with_version(generate_schema_version(builder.uuid()));
        return builder.build(schema_builder::compact_storage::yes);
    }();
    return hints;
}

// Schema for system.batchlog (legacy, pre-v3 layout): batches awaiting replay.
schema_ptr system_keyspace::batchlog() {
    static thread_local auto batchlog = [] {
        schema_builder builder(generate_legacy_id(NAME, BATCHLOG), NAME, BATCHLOG,
        // partition key
        {{"id", uuid_type}},
        // clustering key
        {},
        // regular columns
        {{"data", bytes_type}, {"version", int32_type}, {"written_at", timestamp_type}},
        // static columns
        {},
        // regular column name type
        utf8_type,
        // comment
        "batches awaiting replay"
        // FIXME: the original Java code also had:
        // operations on resulting CFMetaData:
        //    .compactionStrategyOptions(Collections.singletonMap("min_threshold", "2"))
        );
        builder.set_gc_grace_seconds(0);
        builder.with_version(generate_schema_version(builder.uuid()));
        return builder.build(schema_builder::compact_storage::no);
    }();
    return batchlog;
}

// Schema for system.paxos: per-key Paxos (LWT) round state — promises,
// accepted proposals and the most recently committed value.
/*static*/ schema_ptr system_keyspace::paxos() {
    static thread_local auto paxos = [] {
        // FIXME: switch to the new schema_builder interface (with_column(...), etc)
        schema_builder builder(generate_legacy_id(NAME, PAXOS), NAME, PAXOS,
        // partition key
        {{"row_key", bytes_type}}, // byte representation of a row key that hashes to the same token as original
        // clustering key
        {{"cf_id", uuid_type}},
        // regular columns
        {
            {"promise", timeuuid_type},
            {"most_recent_commit", bytes_type}, // serialization format is defined by frozen_mutation idl
            {"most_recent_commit_at", timeuuid_type},
            {"proposal", bytes_type}, // serialization format is defined by frozen_mutation idl
            {"proposal_ballot", timeuuid_type},
        },
        // static columns
        {},
        // regular column name type
        utf8_type,
        // comment
        "in-progress paxos proposals"
        // FIXME: the original Java code also had:
        // operations on resulting CFMetaData:
        //    .compactionStrategyClass(LeveledCompactionStrategy.class);
        );
        builder.set_gc_grace_seconds(0);
        builder.with_version(generate_schema_version(builder.uuid()));
        // Paxos state must hit the commitlog before we ack — required for LWT correctness.
        builder.set_wait_for_sync_to_commitlog(true);
        return builder.build(schema_builder::compact_storage::no);
    }();
    return paxos;
}

// Schema for the Raft log table: persisted log entries plus (as statics)
// the persisted term/vote and the id of the latest snapshot.
schema_ptr system_keyspace::raft() {
    static thread_local auto schema = [] {
        auto id = generate_legacy_id(NAME, RAFT);
        return schema_builder(NAME, RAFT, std::optional(id))
            .with_column("group_id", timeuuid_type, column_kind::partition_key)
            // raft log part
            .with_column("index", long_type, column_kind::clustering_key)
            .with_column("term", long_type)
            .with_column("data", bytes_type) // decltype(raft::log_entry::data) - serialized variant
            // persisted term and vote
            .with_column("vote_term", long_type, column_kind::static_column)
            .with_column("vote", uuid_type, column_kind::static_column)
            // id of the most recent persisted snapshot
            .with_column("snapshot_id", uuid_type, column_kind::static_column)
            .with_column("commit_idx", long_type, column_kind::static_column)
            .set_comment("Persisted RAFT log, votes and snapshot info")
            .with_version(generate_schema_version(id))
            // Raft safety depends on the log being durable before we reply.
            .set_wait_for_sync_to_commitlog(true)
            .with_null_sharder()
            .build();
    }();
    return schema;
}

// Note that this table does not include actual user snapshot data since it's dependent
// on user-provided state machine and could be stored anywhere else in any other form.
// This should be seen as a snapshot descriptor, instead.
// Schema for Raft snapshot descriptors (index/term of the snapshot tail);
// the snapshot payload itself lives with the user state machine.
schema_ptr system_keyspace::raft_snapshots() {
    static thread_local auto schema = [] {
        auto id = generate_legacy_id(NAME, RAFT_SNAPSHOTS);
        return schema_builder(NAME, RAFT_SNAPSHOTS, std::optional(id))
            .with_column("group_id", timeuuid_type, column_kind::partition_key)
            // To be able to start multiple raft servers inside one raft group
            // on the same node, we need to include the server_id in the
            // partition key, as well.
            .with_column("server_id", uuid_type, column_kind::partition_key)
            .with_column("snapshot_id", uuid_type)
            // Index and term of last entry in the snapshot
            .with_column("idx", long_type)
            .with_column("term", long_type)
            .set_comment("Persisted RAFT snapshot descriptors info")
            .with_version(generate_schema_version(id))
            .set_wait_for_sync_to_commitlog(true)
            .with_null_sharder()
            .build();
    }();
    return schema;
}

// Schema for the persisted Raft group configuration (membership) attached to
// the latest snapshot descriptor.
schema_ptr system_keyspace::raft_config() {
    static thread_local auto schema = [] {
        auto id = generate_legacy_id(system_keyspace::NAME, RAFT_CONFIG);
        return schema_builder(system_keyspace::NAME, RAFT_CONFIG, std::optional(id))
            .with_column("group_id", timeuuid_type, column_kind::partition_key)
            .with_column("my_server_id", uuid_type, column_kind::partition_key)
            .with_column("server_id", uuid_type, column_kind::clustering_key)
            .with_column("disposition", ascii_type, column_kind::clustering_key) // can be 'CURRENT` or `PREVIOUS'
            .with_column("can_vote", boolean_type)
            .with_column("ip_addr", inet_addr_type)
            .set_comment("RAFT configuration for the latest snapshot descriptor")
            .with_version(generate_schema_version(id))
            .set_wait_for_sync_to_commitlog(true)
            .with_null_sharder()
            .build();
    }();
    return schema;
}

// Schema recording past repairs per table (start time, repair id, token range).
schema_ptr system_keyspace::repair_history() {
    static thread_local auto schema = [] {
        auto id = generate_legacy_id(NAME, REPAIR_HISTORY);
        return schema_builder(NAME, REPAIR_HISTORY, std::optional(id))
            .with_column("table_uuid", uuid_type, column_kind::partition_key)
            // The time is repair start time
            .with_column("repair_time", timestamp_type, column_kind::clustering_key)
            .with_column("repair_uuid", uuid_type, column_kind::clustering_key)
            // The token range is (range_start, range_end]
            .with_column("range_start", long_type, column_kind::clustering_key)
            .with_column("range_end", long_type, column_kind::clustering_key)
            .with_column("keyspace_name", utf8_type, column_kind::static_column)
            .with_column("table_name", utf8_type, column_kind::static_column)
            .set_comment("Record repair history")
            .with_version(generate_schema_version(id))
            .build();
    }();
    return schema;
}

// Schema for system."IndexInfo": which secondary indexes have finished building.
schema_ptr system_keyspace::built_indexes() {
    static thread_local auto built_indexes = [] {
        schema_builder builder(generate_legacy_id(NAME, BUILT_INDEXES), NAME, BUILT_INDEXES,
        // partition key
        {{"table_name", utf8_type}}, // table_name here is the name of the keyspace - don't be fooled
        // clustering key
        {{"index_name", utf8_type}},
        // regular columns
        {},
        // static columns
        {},
        // regular column name type
        utf8_type,
        // comment
        "built column indexes"
        );
        builder.set_gc_grace_seconds(0);
        builder.with_version(generate_schema_version(builder.uuid()));
        return builder.build(schema_builder::compact_storage::yes);
    }();
    return built_indexes;
}

// Schema for system.local: single-row table describing this node
// (bootstrap state, tokens, addresses, versions, advertised features).
/*static*/ schema_ptr system_keyspace::local() {
    static thread_local auto local = [] {
        schema_builder builder(generate_legacy_id(NAME, LOCAL), NAME, LOCAL,
        // partition key
        {{"key", utf8_type}},
        // clustering key
        {},
        // regular columns
        {
                {"bootstrapped", utf8_type},
                {"cluster_name", utf8_type},
                {"cql_version", utf8_type},
                {"data_center", utf8_type},
                {"gossip_generation", int32_type},
                {"host_id", uuid_type},
                {"native_protocol_version", utf8_type},
                {"partitioner", utf8_type},
                {"rack", utf8_type},
                {"release_version", utf8_type},
                {"schema_version", uuid_type},
                {"thrift_version", utf8_type},
                {"tokens", set_type_impl::get_instance(utf8_type, true)},
                {"truncated_at", map_type_impl::get_instance(uuid_type, bytes_type, true)},
                // The following 3 columns are only present up until 2.1.8 tables
                {"rpc_address", inet_addr_type},
                {"broadcast_address", inet_addr_type},
                {"listen_address", inet_addr_type},
                // This column represents advertised local features (i.e. the features
                // advertised by the node via gossip after passing the feature check
                // against remote features in the cluster)
                {"supported_features", utf8_type},
                {"scylla_cpu_sharding_algorithm", utf8_type},
                {"scylla_nr_shards", int32_type},
                {"scylla_msb_ignore", int32_type},
        },
        // static columns
        {},
        // regular column name type
        utf8_type,
        // comment
        "information about the local node"
        );
        builder.set_gc_grace_seconds(0);
        builder.with_version(generate_schema_version(builder.uuid()));
        // The scylla_* sharding columns are declared above (so they count as
        // dropped columns in the schema history) and then removed here.
        builder.remove_column("scylla_cpu_sharding_algorithm");
        builder.remove_column("scylla_nr_shards");
        builder.remove_column("scylla_msb_ignore");
        return builder.build(schema_builder::compact_storage::no);
    }();
    return local;
}

// Schema for system.peers: per-peer gossip-derived metadata.
/*static*/ schema_ptr system_keyspace::peers() {
    static thread_local auto peers = [] {
        schema_builder builder(generate_legacy_id(NAME, PEERS), NAME, PEERS,
        // partition key
        {{"peer", inet_addr_type}},
        // clustering key
        {},
        // regular columns
        {
                {"data_center", utf8_type},
                {"host_id", uuid_type},
                {"preferred_ip", inet_addr_type},
                {"rack", utf8_type},
                {"release_version", utf8_type},
                {"rpc_address", inet_addr_type},
                {"schema_version", uuid_type},
                {"tokens", set_type_impl::get_instance(utf8_type, true)},
                {"supported_features", utf8_type},
        },
        // static columns
        {},
        // regular column name type
        utf8_type,
        // comment
        "information about known peers in the cluster"
        );
        builder.set_gc_grace_seconds(0);
        builder.with_version(generate_schema_version(builder.uuid()));
        return builder.build(schema_builder::compact_storage::no);
    }();
    return peers;
}

// Schema for system.peer_events: per-peer event counters (dropped hints).
/*static*/ schema_ptr system_keyspace::peer_events() {
    static thread_local auto peer_events = [] {
        schema_builder builder(generate_legacy_id(NAME, PEER_EVENTS), NAME, PEER_EVENTS,
        // partition key
        {{"peer", inet_addr_type}},
        // clustering key
        {},
        // regular columns
        {
            {"hints_dropped", map_type_impl::get_instance(uuid_type, int32_type, true)},
        },
        // static columns
        {},
        // regular column name type
        utf8_type,
        // comment
        "events related to peers"
        );
        builder.set_gc_grace_seconds(0);
        builder.with_version(generate_schema_version(builder.uuid()));
        return builder.build(schema_builder::compact_storage::no);
    }();
    return peer_events;
}

// Schema for system.range_xfers: token ranges requested for transfer.
/*static*/ schema_ptr system_keyspace::range_xfers() {
    static thread_local auto range_xfers = [] {
        schema_builder builder(generate_legacy_id(NAME, RANGE_XFERS), NAME, RANGE_XFERS,
        // partition key
        {{"token_bytes", bytes_type}},
        // clustering key
        {},
        // regular columns
        {{"requested_at", timestamp_type}},
        // static columns
        {},
        // regular column name type
        utf8_type,
        // comment
        "ranges requested for transfer"
        );
        builder.set_gc_grace_seconds(0);
        builder.with_version(generate_schema_version(builder.uuid()));
        return builder.build(schema_builder::compact_storage::no);
    }();
    return range_xfers;
}

// Schema for system.compactions_in_progress: compactions not yet finished.
/*static*/ schema_ptr system_keyspace::compactions_in_progress() {
    static thread_local auto compactions_in_progress = [] {
        schema_builder builder(generate_legacy_id(NAME, COMPACTIONS_IN_PROGRESS), NAME, COMPACTIONS_IN_PROGRESS,
        // partition key
        {{"id", uuid_type}},
        // clustering key
        {},
        // regular columns
        {
            {"columnfamily_name", utf8_type},
            {"inputs", set_type_impl::get_instance(int32_type, true)},
            {"keyspace_name", utf8_type},
        },
        // static columns
        {},
        // regular column name type
        utf8_type,
        // comment
        "unfinished compactions"
        );
        builder.with_version(generate_schema_version(builder.uuid()));
        return builder.build(schema_builder::compact_storage::no);
    }();
    return compactions_in_progress;
}

// Schema for system.compaction_history: completed compactions, kept one week via TTL.
/*static*/ schema_ptr system_keyspace::compaction_history() {
    static thread_local auto compaction_history = [] {
        schema_builder builder(generate_legacy_id(NAME, COMPACTION_HISTORY), NAME, COMPACTION_HISTORY,
        // partition key
        {{"id", uuid_type}},
        // clustering key
        {},
        // regular columns
        {
            {"bytes_in", long_type},
            {"bytes_out", long_type},
            {"columnfamily_name", utf8_type},
            {"compacted_at", timestamp_type},
            {"keyspace_name", utf8_type},
            {"rows_merged", map_type_impl::get_instance(int32_type, long_type, true)},
        },
        // static columns
        {},
        // regular column name type
        utf8_type,
        // comment
        "week-long compaction history"
        );
        // NOTE(review): duration_cast's template argument was stripped in
        // extraction (presumably gc_clock::duration) — TODO confirm.
        builder.set_default_time_to_live(std::chrono::duration_cast(days(7)));
        builder.with_version(generate_schema_version(builder.uuid()));
        return builder.build(schema_builder::compact_storage::no);
    }();
    return compaction_history;
}

// Schema for system.sstable_activity: per-sstable historic read-rate estimates.
/*static*/ schema_ptr system_keyspace::sstable_activity() {
    static thread_local auto sstable_activity = [] {
        schema_builder builder(generate_legacy_id(NAME, SSTABLE_ACTIVITY), NAME, SSTABLE_ACTIVITY,
        // partition key
        {
            {"keyspace_name", utf8_type},
            {"columnfamily_name", utf8_type},
            {"generation", int32_type},
        },
        // clustering key
        {},
        // regular columns
        {
            {"rate_120m", double_type},
            {"rate_15m", double_type},
        },
        // static columns
        {},
        // regular column name type
        utf8_type,
        // comment
        "historic sstable read rates"
        );
        builder.with_version(generate_schema_version(builder.uuid()));
        return builder.build(schema_builder::compact_storage::no);
    }();
    return sstable_activity;
}

// Schema for system.size_estimates: per-table primary-range size estimates.
schema_ptr system_keyspace::size_estimates() {
    static thread_local auto size_estimates = [] {
        schema_builder builder(generate_legacy_id(NAME, SIZE_ESTIMATES), NAME, SIZE_ESTIMATES,
        // partition key
        {{"keyspace_name", utf8_type}},
        // clustering key
        {{"table_name", utf8_type}, {"range_start", utf8_type}, {"range_end", utf8_type}},
        // regular columns
        {
            {"mean_partition_size", long_type},
            {"partitions_count", long_type},
        },
        // static columns
        {},
        // regular column name type
        utf8_type,
        // comment
        "per-table primary range size estimates"
        );
        builder.set_gc_grace_seconds(0);
        builder.with_version(generate_schema_version(builder.uuid()));
        return builder.build(schema_builder::compact_storage::no);
    }();
    return size_estimates;
}

// Schema for system.large_partitions: partitions over the configured size
// threshold, largest first (reversed clustering on partition_size).
/*static*/ schema_ptr system_keyspace::large_partitions() {
    static thread_local auto large_partitions = [] {
        schema_builder builder(generate_legacy_id(NAME, LARGE_PARTITIONS), NAME, LARGE_PARTITIONS,
        // partition key
        {{"keyspace_name", utf8_type}, {"table_name", utf8_type}},
        // clustering key
        {
            {"sstable_name", utf8_type},
            {"partition_size", reversed_type_impl::get_instance(long_type)},
            {"partition_key", utf8_type}
        }, // CLUSTERING ORDER BY (partition_size DESC)
        // regular columns
        {
            {"rows", long_type},
            {"compaction_time", timestamp_type}
        },
        // static columns
        {},
        // regular column name type
        utf8_type,
        // comment
        "partitions larger than specified threshold"
        );
        builder.set_gc_grace_seconds(0);
        builder.with_version(generate_schema_version(builder.uuid()));
        // FIXME re-enable caching for this and the other two
        // system.large_* tables once
        // https://github.com/scylladb/scylla/issues/3288 is fixed
        builder.set_caching_options(caching_options::get_disabled_caching_options());
        return builder.build(schema_builder::compact_storage::no);
    }();
    return large_partitions;
}

// Schema for system.large_rows: rows over the configured size threshold.
schema_ptr system_keyspace::large_rows() {
    static thread_local auto large_rows = [] {
        auto id = generate_legacy_id(NAME, LARGE_ROWS);
        return schema_builder(NAME, LARGE_ROWS, std::optional(id))
            .with_column("keyspace_name", utf8_type, column_kind::partition_key)
            .with_column("table_name", utf8_type, column_kind::partition_key)
            .with_column("sstable_name", utf8_type, column_kind::clustering_key)
            // We want the large rows first, so use reversed_type_impl
            .with_column("row_size", reversed_type_impl::get_instance(long_type), column_kind::clustering_key)
            .with_column("partition_key", utf8_type, column_kind::clustering_key)
            .with_column("clustering_key", utf8_type, column_kind::clustering_key)
            .with_column("compaction_time", timestamp_type)
            .set_comment("rows larger than specified threshold")
            .with_version(generate_schema_version(id))
            .set_gc_grace_seconds(0)
            .set_caching_options(caching_options::get_disabled_caching_options())
            .build();
    }();
    return large_rows;
}

// Schema for system.large_cells: cells over the configured size threshold.
schema_ptr system_keyspace::large_cells() {
    static thread_local auto large_cells = [] {
        auto id = generate_legacy_id(NAME, LARGE_CELLS);
        return schema_builder(NAME, LARGE_CELLS, id)
            .with_column("keyspace_name", utf8_type, column_kind::partition_key)
            .with_column("table_name", utf8_type, column_kind::partition_key)
            .with_column("sstable_name", utf8_type, column_kind::clustering_key)
            // We want the larger cells first, so use reversed_type_impl
            .with_column("cell_size", reversed_type_impl::get_instance(long_type), column_kind::clustering_key)
            .with_column("partition_key", utf8_type, column_kind::clustering_key)
            .with_column("clustering_key", utf8_type, column_kind::clustering_key)
            .with_column("column_name", utf8_type, column_kind::clustering_key)
            .with_column("compaction_time", timestamp_type)
            .set_comment("cells larger than specified threshold")
            .with_version(generate_schema_version(id))
            .set_gc_grace_seconds(0)
            .set_caching_options(caching_options::get_disabled_caching_options())
            .build();
    }();
    return large_cells;
}

// Schema for system.scylla_local: Scylla-specific key/value state for this node.
/*static*/ schema_ptr system_keyspace::scylla_local() {
    static thread_local auto scylla_local = [] {
        schema_builder builder(generate_legacy_id(NAME, SCYLLA_LOCAL), NAME, SCYLLA_LOCAL,
        // partition key
        {{"key", utf8_type}},
        // clustering key
        {},
        // regular columns
        {
                {"value", utf8_type},
        },
        // static columns
        {},
        // regular column name type
        utf8_type,
        // comment
        "Scylla specific information about the local node"
        );
        builder.set_gc_grace_seconds(0);
        // Raft Group id and server id updates must be sync
        builder.set_wait_for_sync_to_commitlog(true);
        builder.with_version(generate_schema_version(builder.uuid()));
        return builder.build(schema_builder::compact_storage::no);
    }();
    return scylla_local;
}

// Schema for system.batches (v3 layout): batches awaiting replay.
schema_ptr system_keyspace::v3::batches() {
    static thread_local auto schema = [] {
        schema_builder builder(generate_legacy_id(NAME, BATCHES), NAME, BATCHES,
        // partition key
        {{"id", timeuuid_type}},
        // clustering key
        {},
        // regular columns
        {{"mutations", list_type_impl::get_instance(bytes_type, true)}, {"version", int32_type}},
        // static columns
        {},
        // regular column name type
        utf8_type,
        // comment
        "batches awaiting replay"
        );
        builder.set_gc_grace_seconds(0);
        // FIXME: the original Java code also had:
        //.copy(new LocalPartitioner(TimeUUIDType.instance))
        // NOTE(review): the next call duplicates the set_gc_grace_seconds(0)
        // just above — harmless, but one of the two could be dropped.
        builder.set_gc_grace_seconds(0);
        builder.set_compaction_strategy(sstables::compaction_strategy_type::size_tiered);
        builder.set_compaction_strategy_options({{"min_threshold", "2"}});
        builder.with_version(generate_schema_version(builder.uuid()));
        return builder.build(schema_builder::compact_storage::no);
    }();
    return schema;
}

schema_ptr system_keyspace::v3::built_indexes() {
    // identical to ours, but ours otoh is a mix-in of the 3.x series cassandra one
    return db::system_keyspace::built_indexes();
}

// Schema for system.local in the v3 (Cassandra 3.x) layout — same idea as
// system_keyspace::local() but with the 3.x column set.
schema_ptr system_keyspace::v3::local() {
    static thread_local auto schema = [] {
        schema_builder builder(generate_legacy_id(NAME, LOCAL), NAME, LOCAL,
        // partition key
        {{"key", utf8_type}},
        // clustering key
        {},
        // regular columns
        {
                {"bootstrapped", utf8_type},
                {"broadcast_address", inet_addr_type},
                {"cluster_name", utf8_type},
                {"cql_version", utf8_type},
                {"data_center", utf8_type},
                {"gossip_generation", int32_type},
                {"host_id", uuid_type},
                {"listen_address", inet_addr_type},
                {"native_protocol_version", utf8_type},
                {"partitioner", utf8_type},
                {"rack", utf8_type},
                {"release_version", utf8_type},
                {"rpc_address", inet_addr_type},
                {"schema_version", uuid_type},
                {"thrift_version", utf8_type},
                {"tokens", set_type_impl::get_instance(utf8_type, true)},
                {"truncated_at", map_type_impl::get_instance(uuid_type, bytes_type, true)},
        },
        // static columns
        {},
        // regular column name type
        utf8_type,
        // comment
        "information about the local node"
        );
        builder.set_gc_grace_seconds(0);
        builder.with_version(generate_schema_version(builder.uuid()));
        return builder.build(schema_builder::compact_storage::no);
    }();
    return schema;
}

// Schema for system.truncated: per-shard commitlog replay positions recorded
// at table truncation, with the truncation timestamp as a static column.
schema_ptr system_keyspace::v3::truncated() {
    static thread_local auto local = [] {
        schema_builder builder(generate_legacy_id(NAME, TRUNCATED), NAME, TRUNCATED,
        // partition key
        {{"table_uuid", uuid_type}},
        // clustering key
        {{"shard", int32_type}},
        // regular columns
        {
                {"position", int32_type},
                {"segment_id", long_type}
        },
        // static columns
        {
                {"truncated_at", timestamp_type},
        },
        // regular column name type
        utf8_type,
        // comment
        "information about table truncation"
        );
        builder.set_gc_grace_seconds(0);
        builder.with_version(generate_schema_version(builder.uuid()));
        return builder.build(schema_builder::compact_storage::no);
    }();
    return local;
}

// The v3 variants below are identical to the main definitions; forward to them.

schema_ptr system_keyspace::v3::peers() {
    // identical
    return db::system_keyspace::peers();
}

schema_ptr system_keyspace::v3::peer_events() {
    // identical
    return db::system_keyspace::peer_events();
}

schema_ptr system_keyspace::v3::range_xfers() {
    // identical
    return db::system_keyspace::range_xfers();
}

schema_ptr system_keyspace::v3::compaction_history() {
    // identical
    return db::system_keyspace::compaction_history();
}

schema_ptr system_keyspace::v3::sstable_activity() {
    // identical
    return db::system_keyspace::sstable_activity();
}

schema_ptr system_keyspace::v3::size_estimates() {
    // identical
    return db::system_keyspace::size_estimates();
}

schema_ptr system_keyspace::v3::large_partitions() {
    // identical
    return db::system_keyspace::large_partitions();
}

schema_ptr system_keyspace::v3::scylla_local() {
    // identical
    return db::system_keyspace::scylla_local();
}

// Schema for system.available_ranges: ranges ready to serve during bootstrap/replace.
schema_ptr system_keyspace::v3::available_ranges() {
    static thread_local auto schema = [] {
        schema_builder builder(generate_legacy_id(NAME, AVAILABLE_RANGES), NAME, AVAILABLE_RANGES,
        // partition key
        {{"keyspace_name", utf8_type}},
        // clustering key
        {},
        // regular columns
        {{"ranges", set_type_impl::get_instance(bytes_type, true)}},
        // static columns
        {},
        // regular column name type
        utf8_type,
        // comment
        "available keyspace/ranges during bootstrap/replace that are ready to be served"
        );
        builder.set_gc_grace_seconds(0);
        builder.with_version(generate_schema_version(builder.uuid()));
        return builder.build();
    }();
    return schema;
}

// Schema for system.views_builds_in_progress: per-view build progress (3.x layout).
schema_ptr system_keyspace::v3::views_builds_in_progress() {
    static thread_local auto schema = [] {
        schema_builder builder(generate_legacy_id(NAME, VIEWS_BUILDS_IN_PROGRESS), NAME, VIEWS_BUILDS_IN_PROGRESS,
        // partition key
        {{"keyspace_name", utf8_type}},
        // clustering key
        {{"view_name", utf8_type}},
        // regular columns
        {{"last_token", utf8_type}, {"generation_number", int32_type}},
        // static columns
        {},
        // regular column name type
        utf8_type,
        // comment
        "views builds current progress"
        );
        builder.set_gc_grace_seconds(0);
        builder.with_version(generate_schema_version(builder.uuid()));
        return builder.build();
    }();
    return schema;
}

// Schema for system.built_views: views whose build has completed.
schema_ptr system_keyspace::v3::built_views() {
    static thread_local auto schema = [] {
        schema_builder builder(generate_legacy_id(NAME, BUILT_VIEWS), NAME, BUILT_VIEWS,
        // partition key
        {{"keyspace_name", utf8_type}},
        // clustering key
        {{"view_name", utf8_type}},
        // regular columns
        {},
        // static columns
        {},
        // regular column name type
        utf8_type,
        // comment
        "built views"
        );
        builder.set_gc_grace_seconds(0);
        builder.with_version(generate_schema_version(builder.uuid()));
        return builder.build();
    }();
    return schema;
}

// Schema for scylla_views_builds_in_progress: Scylla's own per-shard (cpu_id)
// view-build progress table.
schema_ptr system_keyspace::v3::scylla_views_builds_in_progress() {
    static thread_local auto schema = [] {
        auto id = generate_legacy_id(NAME, SCYLLA_VIEWS_BUILDS_IN_PROGRESS);
        return schema_builder(NAME, SCYLLA_VIEWS_BUILDS_IN_PROGRESS, std::make_optional(id))
            .with_column("keyspace_name", utf8_type, column_kind::partition_key)
            .with_column("view_name", utf8_type, column_kind::clustering_key)
            .with_column("cpu_id", int32_type, column_kind::clustering_key)
            .with_column("next_token", utf8_type)
            .with_column("generation_number", int32_type)
            .with_column("first_token", utf8_type)
            .with_version(generate_schema_version(id))
            .build();
    }();
    return schema;
}

// Schema for system.cdc_local: the newest CDC generation id known to this node.
/*static*/ schema_ptr system_keyspace::v3::cdc_local() {
    static thread_local auto cdc_local = [] {
        schema_builder builder(generate_legacy_id(NAME, CDC_LOCAL), NAME, CDC_LOCAL,
        // partition key
        {{"key", utf8_type}},
        // clustering key
        {},
        // regular columns
        {
                /* Every node announces the identifier of the newest known CDC generation to other nodes.
                 * The identifier consists of two things: a timestamp (which is the generation's timestamp,
                 * denoting the time point from which it starts operating) and an UUID (randomly generated
                 * when the generation is created).
                 * This identifier is persisted here and restored on node restart.
                 *
                 * Some identifiers - identifying generations created in older clusters - have only the timestamp.
                 * For these the uuid column is empty.
                 */
                {"streams_timestamp", timestamp_type},
                {"uuid", uuid_type},
        },
        // static columns
        {},
        // regular column name type
        utf8_type,
        // comment
        "CDC-specific information that the local node stores"
        );
        builder.set_gc_grace_seconds(0);
        builder.with_version(generate_schema_version(builder.uuid()));
        return builder.build(schema_builder::compact_storage::no);
    }();
    return cdc_local;
}

// Schema for system.group0_history: human-readable log of Raft group 0 state
// changes, newest first (reversed timeuuid clustering).
schema_ptr system_keyspace::group0_history() {
    static thread_local auto schema = [] {
        auto id = generate_legacy_id(NAME, GROUP0_HISTORY);
        return schema_builder(NAME, GROUP0_HISTORY, id)
            // this is a single-partition table with key 'history'
            .with_column("key", utf8_type, column_kind::partition_key)
            // group0 state timeuuid, descending order
            .with_column("state_id", reversed_type_impl::get_instance(timeuuid_type), column_kind::clustering_key)
            // human-readable description of the change
            .with_column("description", utf8_type)
            .set_comment("History of Raft group 0 state changes")
            .with_version(generate_schema_version(id))
            .with_null_sharder()
            .build();
    }();
    return schema;
}

// Schema for system.discovery: persisted state of the group 0 discovery algorithm.
schema_ptr system_keyspace::discovery() {
    static thread_local auto schema = [] {
        auto id = generate_legacy_id(NAME, DISCOVERY);
        return schema_builder(NAME, DISCOVERY, id)
            // This is a single-partition table with key 'peers'
            .with_column("key", utf8_type, column_kind::partition_key)
            // Opaque connection properties. See `raft::server_info`.
            .with_column("server_info", bytes_type, column_kind::clustering_key)
            // The ID of the group 0 server on that peer.
            // May be unknown during discovery, then it's set to UUID 0.
            .with_column("raft_id", uuid_type)
            .set_comment("State of cluster discovery algorithm: the set of discovered peers")
            .with_version(generate_schema_version(id))
            .set_wait_for_sync_to_commitlog(true)
            .with_null_sharder()
            .build();
    }();
    return schema;
}

// Deprecated pre-v3 hints table, kept only for upgrade/migration purposes.
schema_ptr system_keyspace::legacy::hints() {
    static thread_local auto schema = [] {
        schema_builder builder(generate_legacy_id(NAME, HINTS), NAME, HINTS,
        // partition key
        {{"target_id", uuid_type}},
        // clustering key
        {{"hint_id", timeuuid_type}, {"message_version", int32_type}},
        // regular columns
        {{"mutation", bytes_type}},
        // static columns
        {},
        // regular column name type
        utf8_type,
        // comment
        "*DEPRECATED* hints awaiting delivery"
        );
        builder.set_gc_grace_seconds(0);
        builder.set_compaction_strategy(sstables::compaction_strategy_type::size_tiered);
        builder.set_compaction_strategy_options({{"enabled", "false"}});
        builder.with_version(generate_schema_version(builder.uuid()));
        builder.with(schema_builder::compact_storage::yes);
        return builder.build();
    }();
    return schema;
}

// Deprecated pre-v3 batchlog table, kept only for upgrade/migration purposes.
schema_ptr system_keyspace::legacy::batchlog() {
    static thread_local auto schema = [] {
        schema_builder builder(generate_legacy_id(NAME, BATCHLOG), NAME, BATCHLOG,
        // partition key
        {{"id", uuid_type}},
        // clustering key
        {},
        // regular columns
        {{"data", bytes_type}, {"version", int32_type}, {"written_at", timestamp_type}},
        // static columns
        {},
        // regular column name type
        utf8_type,
        // comment
        "*DEPRECATED* batchlog entries"
        );
        builder.set_gc_grace_seconds(0);
        builder.set_compaction_strategy(sstables::compaction_strategy_type::size_tiered);
        builder.set_compaction_strategy_options({{"min_threshold", "2"}});
        builder.with(schema_builder::compact_storage::no);
        builder.with_version(generate_schema_version(builder.uuid()));
        return builder.build();
    }();
    return schema;
}

// gc_grace (in seconds) shared by all the deprecated legacy schema tables below.
// NOTE(review): duration_cast's template argument was stripped in extraction
// (presumably std::chrono::seconds) — TODO confirm.
static constexpr auto schema_gc_grace = std::chrono::duration_cast(days(7)).count();

// Deprecated pre-v3 schema table: keyspace definitions.
schema_ptr system_keyspace::legacy::keyspaces() {
    static thread_local auto schema = [] {
        schema_builder builder(generate_legacy_id(NAME, KEYSPACES), NAME, KEYSPACES,
        // partition key
        {{"keyspace_name", utf8_type}},
        // clustering key
        {},
        // regular columns
        {
         {"durable_writes", boolean_type},
         {"strategy_class", utf8_type},
         {"strategy_options", utf8_type}
        },
        // static columns
        {},
        // regular column name type
        utf8_type,
        // comment
        "*DEPRECATED* keyspace definitions"
        );
        builder.set_gc_grace_seconds(schema_gc_grace);
        builder.with(schema_builder::compact_storage::yes);
        builder.with_version(generate_schema_version(builder.uuid()));
        return builder.build();
    }();
    return schema;
}

// Deprecated pre-v3 schema table: table (column family) definitions.
schema_ptr system_keyspace::legacy::column_families() {
    static thread_local auto schema = [] {
        schema_builder builder(generate_legacy_id(NAME, COLUMNFAMILIES), NAME, COLUMNFAMILIES,
        // partition key
        {{"keyspace_name", utf8_type}},
        // clustering key
        {{"columnfamily_name", utf8_type}},
        // regular columns
        {
         {"bloom_filter_fp_chance", double_type},
         {"caching", utf8_type},
         {"cf_id", uuid_type},
         {"comment", utf8_type},
         {"compaction_strategy_class", utf8_type},
         {"compaction_strategy_options", utf8_type},
         {"comparator", utf8_type},
         {"compression_parameters", utf8_type},
         {"default_time_to_live", int32_type},
         {"default_validator", utf8_type},
         {"dropped_columns", map_type_impl::get_instance(utf8_type, long_type, true)},
         {"gc_grace_seconds", int32_type},
         {"is_dense", boolean_type},
         {"key_validator", utf8_type},
         {"local_read_repair_chance", double_type},
         {"max_compaction_threshold", int32_type},
         {"max_index_interval", int32_type},
         {"memtable_flush_period_in_ms", int32_type},
         {"min_compaction_threshold", int32_type},
         {"min_index_interval", int32_type},
         {"read_repair_chance", double_type},
         {"speculative_retry", utf8_type},
         {"subcomparator", utf8_type},
         {"type", utf8_type},
         // The following 4 columns are only present up until 2.1.8 tables
         {"key_aliases", utf8_type},
         {"value_alias", utf8_type},
         {"column_aliases", utf8_type},
         {"index_interval", int32_type},},
        // static columns
        {},
        // regular column name type
        utf8_type,
        // comment
        "*DEPRECATED* table definitions"
        );
        builder.set_gc_grace_seconds(schema_gc_grace);
        builder.with(schema_builder::compact_storage::no);
        builder.with_version(generate_schema_version(builder.uuid()));
        return builder.build();
    }();
    return schema;
}

// Deprecated pre-v3 schema table: column definitions.
schema_ptr system_keyspace::legacy::columns() {
    static thread_local auto schema = [] {
        schema_builder builder(generate_legacy_id(NAME, COLUMNS), NAME, COLUMNS,
        // partition key
        {{"keyspace_name", utf8_type}},
        // clustering key
        {{"columnfamily_name", utf8_type}, {"column_name", utf8_type}},
        // regular columns
        {
            {"component_index", int32_type},
            {"index_name", utf8_type},
            {"index_options", utf8_type},
            {"index_type", utf8_type},
            {"type", utf8_type},
            {"validator", utf8_type},
        },
        // static columns
        {},
        // regular column name type
        utf8_type,
        // comment
        "column definitions"
        );
        builder.set_gc_grace_seconds(schema_gc_grace);
        builder.with(schema_builder::compact_storage::no);
        builder.with_version(generate_schema_version(builder.uuid()));
        return builder.build();
    }();
    return schema;
}

// Deprecated pre-v3 schema table: trigger definitions.
schema_ptr system_keyspace::legacy::triggers() {
    static thread_local auto schema = [] {
        schema_builder builder(generate_legacy_id(NAME, TRIGGERS), NAME, TRIGGERS,
        // partition key
        {{"keyspace_name", utf8_type}},
        // clustering key
        {{"columnfamily_name", utf8_type}, {"trigger_name", utf8_type}},
        // regular columns
        {
            {"trigger_options", map_type_impl::get_instance(utf8_type, utf8_type, true)},
        },
        // static columns
        {},
        // regular column name type
        utf8_type,
        // comment
        "trigger definitions"
        );
        builder.set_gc_grace_seconds(schema_gc_grace);
        builder.with(schema_builder::compact_storage::no);
        builder.with_version(generate_schema_version(builder.uuid()));
        return builder.build();
    }();
    return schema;
}

// Deprecated pre-v3 schema table: user defined type definitions.
schema_ptr system_keyspace::legacy::usertypes() {
    static thread_local auto schema = [] {
        schema_builder builder(generate_legacy_id(NAME, USERTYPES), NAME, USERTYPES,
        // partition key
        {{"keyspace_name", utf8_type}},
        // clustering key
        {{"type_name", utf8_type}},
        // regular columns
        {
            {"field_names", list_type_impl::get_instance(utf8_type, true)},
            {"field_types", list_type_impl::get_instance(utf8_type, true)},
        },
        // static columns
        {},
        // regular column name type
        utf8_type,
        // comment
        "user defined type definitions"
        );
        builder.set_gc_grace_seconds(schema_gc_grace);
        builder.with(schema_builder::compact_storage::no);
        builder.with_version(generate_schema_version(builder.uuid()));
        return builder.build();
    }();
    return schema;
}

schema_ptr system_keyspace::legacy::functions() {
    /**
     * Note: we have our own "legacy" version of this table (in schema_tables),
     * but it is (afaik) not used, and differs slightly from the origin one.
     * This is based on the origin schema, since we're more likely to encounter
     * installations of that to migrate, rather than our own (if we dont use the table).
*/ static thread_local auto schema = [] { schema_builder builder(generate_legacy_id(NAME, FUNCTIONS), NAME, FUNCTIONS, // partition key {{"keyspace_name", utf8_type}}, // clustering key {{"function_name", utf8_type},{"signature", list_type_impl::get_instance(utf8_type, false)}}, // regular columns { {"argument_names", list_type_impl::get_instance(utf8_type, true)}, {"argument_types", list_type_impl::get_instance(utf8_type, true)}, {"body", utf8_type}, {"language", utf8_type}, {"return_type", utf8_type}, {"called_on_null_input", boolean_type}, }, // static columns {}, // regular column name type utf8_type, // comment "*DEPRECATED* user defined type definitions" ); builder.set_gc_grace_seconds(schema_gc_grace); builder.with(schema_builder::compact_storage::no); builder.with_version(generate_schema_version(builder.uuid())); return builder.build(); }(); return schema; } schema_ptr system_keyspace::legacy::aggregates() { static thread_local auto schema = [] { schema_builder builder(generate_legacy_id(NAME, AGGREGATES), NAME, AGGREGATES, // partition key {{"keyspace_name", utf8_type}}, // clustering key {{"aggregate_name", utf8_type},{"signature", list_type_impl::get_instance(utf8_type, false)}}, // regular columns { {"argument_types", list_type_impl::get_instance(utf8_type, true)}, {"final_func", utf8_type}, {"initcond", bytes_type}, {"return_type", utf8_type}, {"state_func", utf8_type}, {"state_type", utf8_type}, }, // static columns {}, // regular column name type utf8_type, // comment "*DEPRECATED* user defined aggregate definition" ); builder.set_gc_grace_seconds(schema_gc_grace); builder.with(schema_builder::compact_storage::no); builder.with_version(generate_schema_version(builder.uuid())); return builder.build(); }(); return schema; } future<> system_keyspace::setup_version(sharded& ms) { auto& cfg = _db.local().get_config(); return utils::resolve(cfg.rpc_address).then([this, &cfg, &ms](gms::inet_address a) { sstring req = fmt::format("INSERT INTO system.{} (key, 
release_version, cql_version, thrift_version, native_protocol_version, data_center, rack, partitioner, rpc_address, broadcast_address, listen_address) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)" , db::system_keyspace::LOCAL); auto& snitch = locator::i_endpoint_snitch::get_local_snitch_ptr(); return execute_cql(req, sstring(db::system_keyspace::LOCAL), version::release(), cql3::query_processor::CQL_VERSION, ::cassandra::thrift_version, to_sstring(cql_serialization_format::latest_version), snitch->get_datacenter(utils::fb_utilities::get_broadcast_address()), snitch->get_rack(utils::fb_utilities::get_broadcast_address()), sstring(cfg.partitioner()), a.addr(), utils::fb_utilities::get_broadcast_address().addr(), ms.local().listen_address().addr() ).discard_result(); }); } future<> system_keyspace::save_local_supported_features(const std::set& feats) { static const auto req = format("INSERT INTO system.{} (key, supported_features) VALUES (?, ?)", LOCAL); return qctx->execute_cql(req, sstring(db::system_keyspace::LOCAL), ::join(",", feats)).discard_result(); } // Changing the real load_dc_rack_info into a future would trigger a tidal wave of futurization that would spread // even into simple string operations like get_rack() / get_dc(). We will cache those at startup, and then change // our view of it every time we do updates on those values. // // The cache must be distributed, because the values themselves may not update atomically, so a shard reading that // is different than the one that wrote, may see a corrupted value. invoke_on_all will be used to guarantee that all // updates are propagated correctly. 
struct local_cache { std::unordered_map _cached_dc_rack_info; system_keyspace::bootstrap_state _state; }; future<> system_keyspace::build_dc_rack_info() { return execute_cql(format("SELECT peer, data_center, rack from system.{}", PEERS)).then([this] (::shared_ptr msg) { return do_for_each(*msg, [this] (auto& row) { net::inet_address peer = row.template get_as("peer"); if (!row.has("data_center") || !row.has("rack")) { return make_ready_future<>(); } gms::inet_address gms_addr(std::move(peer)); sstring dc = row.template get_as("data_center"); sstring rack = row.template get_as("rack"); locator::endpoint_dc_rack element = { dc, rack }; return container().invoke_on_all([gms_addr = std::move(gms_addr), element = std::move(element)] (auto& sys_ks) { sys_ks._cache->_cached_dc_rack_info.emplace(gms_addr, element); }); }).then([msg] { // Keep msg alive. }); }); } future<> system_keyspace::build_bootstrap_info() { sstring req = format("SELECT bootstrapped FROM system.{} WHERE key = ? ", LOCAL); return execute_cql(req, sstring(LOCAL)).then([this] (auto msg) { static auto state_map = std::unordered_map({ { "NEEDS_BOOTSTRAP", bootstrap_state::NEEDS_BOOTSTRAP }, { "COMPLETED", bootstrap_state::COMPLETED }, { "IN_PROGRESS", bootstrap_state::IN_PROGRESS }, { "DECOMMISSIONED", bootstrap_state::DECOMMISSIONED } }); bootstrap_state state = bootstrap_state::NEEDS_BOOTSTRAP; if (!msg->empty() && msg->one().has("bootstrapped")) { state = state_map.at(msg->one().template get_as("bootstrapped")); } return container().invoke_on_all([state] (auto& sys_ks) { sys_ks._cache->_state = state; }); }); } future<> system_keyspace::setup(sharded& ms) { assert(this_shard_id() == 0); co_await setup_version(ms); co_await update_schema_version(_db.local().get_version()); co_await build_dc_rack_info(); co_await build_bootstrap_info(); co_await check_health(); co_await db::schema_tables::save_system_keyspace_schema(_qp.local()); // #2514 - make sure "system" is written to system_schema.keyspaces. 
co_await db::schema_tables::save_system_schema(_qp.local(), NAME); co_await cache_truncation_record(); auto preferred_ips = co_await get_preferred_ips(); co_await ms.invoke_on_all([&preferred_ips] (auto& ms) { return ms.init_local_preferred_ip_cache(preferred_ips); }); } struct truncation_record { static constexpr uint32_t current_magic = 0x53435452; // 'S' 'C' 'T' 'R' uint32_t magic; std::vector positions; db_clock::time_point time_stamp; }; } #include "idl/replay_position.dist.hh" #include "idl/truncation_record.dist.hh" #include "serializer_impl.hh" #include "idl/replay_position.dist.impl.hh" #include "idl/truncation_record.dist.impl.hh" namespace db { typedef utils::UUID truncation_key; typedef std::unordered_map truncation_map; static constexpr uint8_t current_version = 1; future system_keyspace::get_truncation_record(utils::UUID cf_id) { sstring req = format("SELECT * from system.{} WHERE table_uuid = ?", TRUNCATED); return qctx->qp().execute_internal(req, {cf_id}, cql3::query_processor::cache_internal::yes).then([cf_id](::shared_ptr rs) { truncation_record r{truncation_record::current_magic}; for (const cql3::untyped_result_set_row& row : *rs) { auto shard = row.get_as("shard"); auto ts = row.get_as("truncated_at"); auto pos = row.get_as("position"); auto id = row.get_as("segment_id"); r.time_stamp = ts; r.positions.emplace_back(replay_position(shard, id, pos)); } return make_ready_future(std::move(r)); }); } // Read system.truncate table and cache last truncation time in `table` object for each table on every shard future<> system_keyspace::cache_truncation_record() { sstring req = format("SELECT DISTINCT table_uuid, truncated_at from system.{}", TRUNCATED); return execute_cql(req).then([this] (::shared_ptr rs) { return parallel_for_each(rs->begin(), rs->end(), [this] (const cql3::untyped_result_set_row& row) { auto table_uuid = row.get_as("table_uuid"); auto ts = row.get_as("truncated_at"); return _db.invoke_on_all([table_uuid, ts] (replica::database& db) 
mutable { try { replica::table& cf = db.find_column_family(table_uuid); cf.cache_truncation_record(ts); } catch (replica::no_such_column_family&) { slogger.debug("Skip caching truncation time for {} since the table is no longer present", table_uuid); } }); }); }); } future<> system_keyspace::save_truncation_record(utils::UUID id, db_clock::time_point truncated_at, db::replay_position rp) { sstring req = format("INSERT INTO system.{} (table_uuid, shard, position, segment_id, truncated_at) VALUES(?,?,?,?,?)", TRUNCATED); return qctx->qp().execute_internal(req, {id, int32_t(rp.shard_id()), int32_t(rp.pos), int64_t(rp.base_id()), truncated_at}, cql3::query_processor::cache_internal::yes).discard_result().then([] { return force_blocking_flush(TRUNCATED); }); } future<> system_keyspace::save_truncation_record(const replica::column_family& cf, db_clock::time_point truncated_at, db::replay_position rp) { return save_truncation_record(cf.schema()->id(), truncated_at, rp); } future system_keyspace::get_truncated_position(utils::UUID cf_id, uint32_t shard) { return get_truncated_position(std::move(cf_id)).then([shard](replay_positions positions) { for (auto& rp : positions) { if (shard == rp.shard_id()) { return make_ready_future(rp); } } return make_ready_future(); }); } future system_keyspace::get_truncated_position(utils::UUID cf_id) { return get_truncation_record(cf_id).then([](truncation_record e) { return make_ready_future(e.positions); }); } future system_keyspace::get_truncated_at(utils::UUID cf_id) { return get_truncation_record(cf_id).then([](truncation_record e) { return make_ready_future(e.time_stamp); }); } static set_type_impl::native_type prepare_tokens(const std::unordered_set& tokens) { set_type_impl::native_type tset; for (auto& t: tokens) { tset.push_back(t.to_sstring()); } return tset; } std::unordered_set decode_tokens(set_type_impl::native_type& tokens) { std::unordered_set tset; for (auto& t: tokens) { auto str = value_cast(t); assert(str == 
dht::token::from_sstring(str).to_sstring()); tset.insert(dht::token::from_sstring(str)); } return tset; } future<> system_keyspace::update_tokens(gms::inet_address ep, const std::unordered_set& tokens) { if (ep == utils::fb_utilities::get_broadcast_address()) { co_return co_await remove_endpoint(ep); } sstring req = format("INSERT INTO system.{} (peer, tokens) VALUES (?, ?)", PEERS); auto set_type = set_type_impl::get_instance(utf8_type, true); co_await execute_cql(req, ep.addr(), make_set_value(set_type, prepare_tokens(tokens))).discard_result(); co_await force_blocking_flush(PEERS); } future>> system_keyspace::load_tokens() { sstring req = format("SELECT peer, tokens FROM system.{}", PEERS); return execute_cql(req).then([] (::shared_ptr cql_result) { std::unordered_map> ret; for (auto& row : *cql_result) { auto peer = gms::inet_address(row.get_as("peer")); if (row.has("tokens")) { auto blob = row.get_blob("tokens"); auto cdef = peers()->get_column_definition("tokens"); auto deserialized = cdef->type->deserialize(blob); auto tokens = value_cast(deserialized); ret.emplace(peer, decode_tokens(tokens)); } } return ret; }); } future> system_keyspace::load_host_ids() { sstring req = format("SELECT peer, host_id FROM system.{}", PEERS); return execute_cql(req).then([] (::shared_ptr cql_result) { std::unordered_map ret; for (auto& row : *cql_result) { auto peer = gms::inet_address(row.get_as("peer")); if (row.has("host_id")) { ret.emplace(peer, row.get_as("host_id")); } } return ret; }); } future> system_keyspace::load_peer_features() { sstring req = format("SELECT peer, supported_features FROM system.{}", PEERS); return qctx->execute_cql(req).then([] (::shared_ptr cql_result) { std::unordered_map ret; for (auto& row : *cql_result) { if (row.has("supported_features")) { ret.emplace(row.get_as("peer"), row.get_as("supported_features")); } } return ret; }); } future<> system_keyspace::update_preferred_ip(gms::inet_address ep, gms::inet_address preferred_ip) { sstring req = 
format("INSERT INTO system.{} (peer, preferred_ip) VALUES (?, ?)", PEERS); return qctx->execute_cql(req, ep.addr(), preferred_ip.addr()).discard_result().then([] { return force_blocking_flush(PEERS); }); } future> system_keyspace::get_preferred_ips() { sstring req = format("SELECT peer, preferred_ip FROM system.{}", PEERS); return execute_cql(req).then([] (::shared_ptr cql_res_set) { std::unordered_map res; for (auto& r : *cql_res_set) { if (r.has("preferred_ip")) { res.emplace(gms::inet_address(r.get_as("peer")), gms::inet_address(r.get_as("preferred_ip"))); } } return res; }); } template future<> system_keyspace::update_cached_values(gms::inet_address ep, sstring column_name, Value value) { return make_ready_future<>(); } template <> future<> system_keyspace::update_cached_values(gms::inet_address ep, sstring column_name, sstring value) { return container().invoke_on_all([ep = std::move(ep), column_name = std::move(column_name), value = std::move(value)] (auto& sys_ks) { if (column_name == "data_center") { sys_ks._cache->_cached_dc_rack_info[ep].dc = value; } else if (column_name == "rack") { sys_ks._cache->_cached_dc_rack_info[ep].rack = value; } return make_ready_future<>(); }); } template future<> system_keyspace::update_peer_info(gms::inet_address ep, sstring column_name, Value value) { if (ep == utils::fb_utilities::get_broadcast_address()) { co_return; } co_await update_cached_values(ep, column_name, value); sstring req = format("INSERT INTO system.{} (peer, {}) VALUES (?, ?)", PEERS, column_name); co_await execute_cql(req, ep.addr(), value).discard_result(); } // sets are not needed, since tokens are updated by another method template future<> system_keyspace::update_peer_info(gms::inet_address ep, sstring column_name, sstring); template future<> system_keyspace::update_peer_info(gms::inet_address ep, sstring column_name, utils::UUID); template future<> system_keyspace::update_peer_info(gms::inet_address ep, sstring column_name, net::inet_address); 
template future<> set_scylla_local_param_as(const sstring& key, const T& value) { sstring req = format("UPDATE system.{} SET value = ? WHERE key = ?", system_keyspace::SCYLLA_LOCAL); auto type = data_type_for(); return qctx->execute_cql(req, type->to_string_impl(data_value(value)), key).discard_result(); } template future> get_scylla_local_param_as(const sstring& key) { sstring req = format("SELECT value FROM system.{} WHERE key = ?", system_keyspace::SCYLLA_LOCAL); return qctx->execute_cql(req, key).then([] (::shared_ptr res) -> future> { if (res->empty() || !res->one().has("value")) { return make_ready_future>(std::optional()); } auto type = data_type_for(); return make_ready_future>(value_cast(type->deserialize( type->from_string(res->one().get_as("value"))))); }); } future<> system_keyspace::set_scylla_local_param(const sstring& key, const sstring& value) { return set_scylla_local_param_as(key, value); } future> system_keyspace::get_scylla_local_param(const sstring& key){ return get_scylla_local_param_as(key); } future<> system_keyspace::update_schema_version(utils::UUID version) { sstring req = format("INSERT INTO system.{} (key, schema_version) VALUES (?, ?)", LOCAL); return execute_cql(req, sstring(LOCAL), version).discard_result(); } /** * Remove stored tokens being used by another node */ future<> system_keyspace::remove_endpoint(gms::inet_address ep) { co_await container().invoke_on_all([ep] (auto& sys_ks) { sys_ks._cache->_cached_dc_rack_info.erase(ep); }); sstring req = format("DELETE FROM system.{} WHERE peer = ?", PEERS); co_await execute_cql(req, ep.addr()).discard_result(); co_await force_blocking_flush(PEERS); } future<> system_keyspace::update_tokens(const std::unordered_set& tokens) { if (tokens.empty()) { return make_exception_future<>(std::invalid_argument("remove_endpoint should be used instead")); } sstring req = format("INSERT INTO system.{} (key, tokens) VALUES (?, ?)", LOCAL); auto set_type = set_type_impl::get_instance(utf8_type, true); 
return execute_cql(req, sstring(LOCAL), make_set_value(set_type, prepare_tokens(tokens))).discard_result().then([] { return force_blocking_flush(LOCAL); }); } future<> system_keyspace::force_blocking_flush(sstring cfname) { assert(qctx); return qctx->_qp.invoke_on_all([cfname = std::move(cfname)] (cql3::query_processor& qp) { // if (!Boolean.getBoolean("cassandra.unsafesystem")) return qp.db().real_database().flush(NAME, cfname); // FIXME: get real database in another way }); } /** * One of three things will happen if you try to read the system keyspace: * 1. files are present and you can read them: great * 2. no files are there: great (new node is assumed) * 3. files are present but you can't read them: bad */ future<> system_keyspace::check_health() { using namespace cql_transport::messages; sstring req = format("SELECT cluster_name FROM system.{} WHERE key=?", LOCAL); return execute_cql(req, sstring(LOCAL)).then([this] (::shared_ptr msg) { if (msg->empty() || !msg->one().has("cluster_name")) { // this is a brand new node sstring ins_req = format("INSERT INTO system.{} (key, cluster_name) VALUES (?, ?)", LOCAL); auto cluster_name = _db.local().get_config().cluster_name(); return execute_cql(ins_req, sstring(LOCAL), cluster_name).discard_result(); } else { auto cluster_name = _db.local().get_config().cluster_name(); auto saved_cluster_name = msg->one().get_as("cluster_name"); if (cluster_name != saved_cluster_name) { throw exceptions::configuration_exception("Saved cluster name " + saved_cluster_name + " != configured name " + cluster_name); } return make_ready_future<>(); } }); } future> system_keyspace::get_saved_tokens() { sstring req = format("SELECT tokens FROM system.{} WHERE key = ?", LOCAL); return qctx->execute_cql(req, sstring(LOCAL)).then([] (auto msg) { if (msg->empty() || !msg->one().has("tokens")) { return make_ready_future>(); } auto blob = msg->one().get_blob("tokens"); auto cdef = local()->get_column_definition("tokens"); auto deserialized = 
cdef->type->deserialize(blob); auto tokens = value_cast(deserialized); return make_ready_future>(decode_tokens(tokens)); }); } future> system_keyspace::get_local_tokens() { return get_saved_tokens().then([] (auto&& tokens) { if (tokens.empty()) { auto err = format("get_local_tokens: tokens is empty"); slogger.error("{}", err); throw std::runtime_error(err); } return std::move(tokens); }); } future<> system_keyspace::update_cdc_generation_id(cdc::generation_id gen_id) { co_await std::visit(make_visitor( [] (cdc::generation_id_v1 id) -> future<> { co_await qctx->execute_cql( format("INSERT INTO system.{} (key, streams_timestamp) VALUES (?, ?)", v3::CDC_LOCAL), sstring(v3::CDC_LOCAL), id.ts); }, [] (cdc::generation_id_v2 id) -> future<> { co_await qctx->execute_cql( format("INSERT INTO system.{} (key, streams_timestamp, uuid) VALUES (?, ?, ?)", v3::CDC_LOCAL), sstring(v3::CDC_LOCAL), id.ts, id.id); } ), gen_id); co_await force_blocking_flush(v3::CDC_LOCAL); } future> system_keyspace::get_cdc_generation_id() { auto msg = co_await qctx->execute_cql( format("SELECT streams_timestamp, uuid FROM system.{} WHERE key = ?", v3::CDC_LOCAL), sstring(v3::CDC_LOCAL)); if (msg->empty()) { co_return std::nullopt; } auto& row = msg->one(); if (!row.has("streams_timestamp")) { // should not happen but whatever co_return std::nullopt; } auto ts = row.get_as("streams_timestamp"); if (!row.has("uuid")) { co_return cdc::generation_id_v1{ts}; } auto id = row.get_as("uuid"); co_return cdc::generation_id_v2{ts, id}; } static const sstring CDC_REWRITTEN_KEY = "rewritten"; future<> system_keyspace::cdc_set_rewritten(std::optional gen_id) { if (gen_id) { return qctx->execute_cql( format("INSERT INTO system.{} (key, streams_timestamp) VALUES (?, ?)", v3::CDC_LOCAL), CDC_REWRITTEN_KEY, gen_id->ts).discard_result(); } else { // Insert just the row marker. 
return qctx->execute_cql( format("INSERT INTO system.{} (key) VALUES (?)", v3::CDC_LOCAL), CDC_REWRITTEN_KEY).discard_result(); } } future system_keyspace::cdc_is_rewritten() { // We don't care about the actual timestamp; it's additional information for debugging purposes. return qctx->execute_cql(format("SELECT key FROM system.{} WHERE key = ?", v3::CDC_LOCAL), CDC_REWRITTEN_KEY) .then([] (::shared_ptr msg) { return !msg->empty(); }); } bool system_keyspace::bootstrap_needed() const { return get_bootstrap_state() == bootstrap_state::NEEDS_BOOTSTRAP; } bool system_keyspace::bootstrap_complete() const { return get_bootstrap_state() == bootstrap_state::COMPLETED; } bool system_keyspace::bootstrap_in_progress() const { return get_bootstrap_state() == bootstrap_state::IN_PROGRESS; } bool system_keyspace::was_decommissioned() const { return get_bootstrap_state() == bootstrap_state::DECOMMISSIONED; } system_keyspace::bootstrap_state system_keyspace::get_bootstrap_state() const { return _cache->_state; } future<> system_keyspace::set_bootstrap_state(bootstrap_state state) { static std::unordered_map> state_to_name({ { bootstrap_state::NEEDS_BOOTSTRAP, "NEEDS_BOOTSTRAP" }, { bootstrap_state::COMPLETED, "COMPLETED" }, { bootstrap_state::IN_PROGRESS, "IN_PROGRESS" }, { bootstrap_state::DECOMMISSIONED, "DECOMMISSIONED" } }); sstring state_name = state_to_name.at(state); sstring req = format("INSERT INTO system.{} (key, bootstrapped) VALUES (?, ?)", LOCAL); co_await execute_cql(req, sstring(LOCAL), state_name).discard_result(); co_await force_blocking_flush(LOCAL); co_await container().invoke_on_all([state] (auto& sys_ks) { sys_ks._cache->_state = state; }); } class cluster_status_table : public memtable_filling_virtual_table { private: service::storage_service& _ss; gms::gossiper& _gossiper; public: cluster_status_table(service::storage_service& ss, gms::gossiper& g) : memtable_filling_virtual_table(build_schema()) , _ss(ss), _gossiper(g) {} static schema_ptr build_schema() { 
auto id = generate_legacy_id(system_keyspace::NAME, "cluster_status"); return schema_builder(system_keyspace::NAME, "cluster_status", std::make_optional(id)) .with_column("peer", inet_addr_type, column_kind::partition_key) .with_column("dc", utf8_type) .with_column("up", boolean_type) .with_column("status", utf8_type) .with_column("load", utf8_type) .with_column("tokens", int32_type) .with_column("owns", float_type) .with_column("host_id", uuid_type) .with_version(system_keyspace::generate_schema_version(id)) .build(); } future<> execute(std::function mutation_sink) override { return _ss.get_ownership().then([&, mutation_sink] (std::map ownership) { const locator::token_metadata& tm = _ss.get_token_metadata(); for (auto&& e : _gossiper.endpoint_state_map) { auto endpoint = e.first; mutation m(schema(), partition_key::from_single_value(*schema(), data_value(endpoint).serialize_nonnull())); row& cr = m.partition().clustered_row(*schema(), clustering_key::make_empty()).cells(); set_cell(cr, "up", _gossiper.is_alive(endpoint)); set_cell(cr, "status", _gossiper.get_gossip_status(endpoint)); set_cell(cr, "load", _gossiper.get_application_state_value(endpoint, gms::application_state::LOAD)); std::optional hostid = tm.get_host_id_if_known(endpoint); if (hostid) { set_cell(cr, "host_id", hostid); } if (tm.get_topology().has_endpoint(endpoint)) { sstring dc = tm.get_topology().get_location(endpoint).dc; set_cell(cr, "dc", dc); } if (ownership.contains(endpoint)) { set_cell(cr, "owns", ownership[endpoint]); } set_cell(cr, "tokens", int32_t(tm.get_tokens(endpoint).size())); mutation_sink(std::move(m)); } }); } }; class token_ring_table : public streaming_virtual_table { private: replica::database& _db; service::storage_service& _ss; public: token_ring_table(replica::database& db, service::storage_service& ss) : streaming_virtual_table(build_schema()) , _db(db) , _ss(ss) { _shard_aware = true; } static schema_ptr build_schema() { auto id = 
generate_legacy_id(system_keyspace::NAME, "token_ring"); return schema_builder(system_keyspace::NAME, "token_ring", std::make_optional(id)) .with_column("keyspace_name", utf8_type, column_kind::partition_key) .with_column("start_token", utf8_type, column_kind::clustering_key) .with_column("endpoint", inet_addr_type, column_kind::clustering_key) .with_column("end_token", utf8_type) .with_column("dc", utf8_type) .with_column("rack", utf8_type) .with_version(system_keyspace::generate_schema_version(id)) .build(); } dht::decorated_key make_partition_key(const sstring& name) { return dht::decorate_key(*_s, partition_key::from_single_value(*_s, data_value(name).serialize_nonnull())); } clustering_key make_clustering_key(sstring start_token, gms::inet_address host) { return clustering_key::from_exploded(*_s, { data_value(start_token).serialize_nonnull(), data_value(host).serialize_nonnull() }); } struct endpoint_details_cmp { bool operator()(const dht::endpoint_details& l, const dht::endpoint_details& r) const { return inet_addr_type->less( data_value(l._host).serialize_nonnull(), data_value(r._host).serialize_nonnull()); } }; future<> execute(reader_permit permit, result_collector& result, const query_restrictions& qr) override { struct decorated_keyspace_name { sstring name; dht::decorated_key key; }; auto keyspace_names = boost::copy_range>( _db.get_keyspaces() | boost::adaptors::transformed([this] (auto&& e) { return decorated_keyspace_name{e.first, make_partition_key(e.first)}; })); boost::sort(keyspace_names, [less = dht::ring_position_less_comparator(*_s)] (const decorated_keyspace_name& l, const decorated_keyspace_name& r) { return less(l.key, r.key); }); for (const decorated_keyspace_name& e : keyspace_names) { auto&& dk = e.key; if (!this_shard_owns(dk) || !contains_key(qr.partition_range(), dk) || !_db.has_keyspace(e.name)) { continue; } std::vector ranges = co_await _ss.describe_ring(e.name); co_await result.emit_partition_start(dk); boost::sort(ranges, [] 
(const dht::token_range_endpoints& l, const dht::token_range_endpoints& r) { return l._start_token < r._start_token; }); for (dht::token_range_endpoints& range : ranges) { boost::sort(range._endpoint_details, endpoint_details_cmp()); for (const dht::endpoint_details& detail : range._endpoint_details) { clustering_row cr(make_clustering_key(range._start_token, detail._host)); set_cell(cr.cells(), "end_token", sstring(range._end_token)); set_cell(cr.cells(), "dc", sstring(detail._datacenter)); set_cell(cr.cells(), "rack", sstring(detail._rack)); co_await result.emit_row(std::move(cr)); } } co_await result.emit_partition_end(); } } }; class snapshots_table : public streaming_virtual_table { distributed& _db; public: explicit snapshots_table(distributed& db) : streaming_virtual_table(build_schema()) , _db(db) { _shard_aware = true; } static schema_ptr build_schema() { auto id = generate_legacy_id(system_keyspace::NAME, "snapshots"); return schema_builder(system_keyspace::NAME, "snapshots", std::make_optional(id)) .with_column("keyspace_name", utf8_type, column_kind::partition_key) .with_column("table_name", utf8_type, column_kind::clustering_key) .with_column("snapshot_name", utf8_type, column_kind::clustering_key) .with_column("live", long_type) .with_column("total", long_type) .set_comment("Lists all the snapshots along with their size, dropped tables are not part of the listing.") .with_version(system_keyspace::generate_schema_version(id)) .build(); } dht::decorated_key make_partition_key(const sstring& name) { return dht::decorate_key(*_s, partition_key::from_single_value(*_s, data_value(name).serialize_nonnull())); } clustering_key make_clustering_key(sstring table_name, sstring snapshot_name) { return clustering_key::from_exploded(*_s, { data_value(std::move(table_name)).serialize_nonnull(), data_value(std::move(snapshot_name)).serialize_nonnull() }); } future<> execute(reader_permit permit, result_collector& result, const query_restrictions& qr) override { 
// NOTE(review): template arguments appear stripped by extraction in this chunk
// (e.g. "std::vector keyspace_names", "std::map>") -- restore from upstream.
//
// Body of snapshots_table::execute(): pick the keyspaces this shard owns,
// map-reduce snapshot details from every shard, then stream one partition
// per keyspace in ring order.
        struct decorated_keyspace_name {
            sstring name;
            dht::decorated_key key;
        };

        std::vector keyspace_names;

        for (const auto& [name, _] : _db.local().get_keyspaces()) {
            auto dk = make_partition_key(name);
            // Only emit partitions owned by this shard and inside the queried range.
            if (!this_shard_owns(dk) || !contains_key(qr.partition_range(), dk)) {
                continue;
            }
            keyspace_names.push_back({std::move(name), std::move(dk)});
        }

        // Partitions must be produced in ring order.
        boost::sort(keyspace_names, [less = dht::ring_position_less_comparator(*_s)]
                (const decorated_keyspace_name& l, const decorated_keyspace_name& r) {
            return less(l.key, r.key);
        });

        using snapshots_by_tables_map = std::map>;

        // Merges per-shard snapshot maps; identical (table, snapshot) entries
        // have their live/total sizes summed.
        class snapshot_reducer {
        private:
            snapshots_by_tables_map _result;
        public:
            future<> operator()(const snapshots_by_tables_map& value) {
                for (auto& [table_name, snapshots] : value) {
                    if (auto [_, added] = _result.try_emplace(table_name, std::move(snapshots)); added) {
                        continue;
                    }
                    auto& rp = _result.at(table_name);
                    for (auto&& [snapshot_name, snapshot_detail]: snapshots) {
                        if (auto [_, added] = rp.try_emplace(snapshot_name, std::move(snapshot_detail)); added) {
                            continue;
                        }
                        // Same snapshot seen on another shard: accumulate sizes.
                        auto& detail = rp.at(snapshot_name);
                        detail.live += snapshot_detail.live;
                        detail.total += snapshot_detail.total;
                    }
                }
                return make_ready_future<>();
            }
            snapshots_by_tables_map get() && {
                return std::move(_result);
            }
        };

        for (auto& ks_data : keyspace_names) {
            co_await result.emit_partition_start(ks_data.key);

            // Collect snapshot details for this keyspace from every shard.
            const auto snapshots_by_tables = co_await _db.map_reduce(snapshot_reducer(), [ks_name = ks_data.name] (replica::database& db) -> future {
                snapshots_by_tables_map snapshots_by_tables;
                for (auto& [_, table] : db.get_column_families()) {
                    if (table->schema()->ks_name() != ks_name) {
                        continue;
                    }
                    const auto unordered_snapshots = co_await table->get_snapshot_details();
                    // Re-key into an ordered map so rows come out in clustering order.
                    snapshots_by_tables.emplace(table->schema()->cf_name(), std::map(unordered_snapshots.begin(), unordered_snapshots.end()));
                }
                co_return snapshots_by_tables;
            });

            for (const auto& [table_name, snapshots] : snapshots_by_tables) {
                for (auto& [snapshot_name, details] : snapshots) {
                    clustering_row cr(make_clustering_key(table_name, snapshot_name));
                    set_cell(cr.cells(), "live", details.live);
                    set_cell(cr.cells(), "total", details.total);
                    co_await result.emit_row(std::move(cr));
                }
            }

            co_await result.emit_partition_end();
        }
    }
};

// Virtual table listing all client protocol servers (CQL, Thrift, ...) and
// the addresses they listen on. Server registry lives on shard 0 only.
class protocol_servers_table : public memtable_filling_virtual_table {
private:
    service::storage_service& _ss;

    // Plain-value snapshot of a protocol_server, safe to copy across shards.
    struct protocol_server_info {
        sstring name;
        sstring protocol;
        sstring protocol_version;
        std::vector listen_addresses;

        explicit protocol_server_info(protocol_server& s)
            : name(s.name())
            , protocol(s.protocol())
            , protocol_version(s.protocol_version()) {
            for (const auto& addr : s.listen_addresses()) {
                listen_addresses.push_back(format("{}:{}", addr.addr(), addr.port()));
            }
        }
    };
public:
    explicit protocol_servers_table(service::storage_service& ss)
        : memtable_filling_virtual_table(build_schema())
        , _ss(ss) {
        _shard_aware = true;
    }

    static schema_ptr build_schema() {
        auto id = generate_legacy_id(system_keyspace::NAME, "protocol_servers");
        return schema_builder(system_keyspace::NAME, "protocol_servers", std::make_optional(id))
            .with_column("name", utf8_type, column_kind::partition_key)
            .with_column("protocol", utf8_type)
            .with_column("protocol_version", utf8_type)
            .with_column("listen_addresses", list_type_impl::get_instance(utf8_type, false))
            .set_comment("Lists all client protocol servers and their status.")
            .with_version(system_keyspace::generate_schema_version(id))
            .build();
    }

    future<> execute(std::function mutation_sink) override {
        // Servers are registered on shard 0 only
        const auto server_infos = co_await smp::submit_to(0ul, [&ss = _ss.container()] {
            return boost::copy_range>(ss.local().protocol_servers()
                | boost::adaptors::transformed([] (protocol_server* s) { return protocol_server_info(*s); }));
        });
        for (auto server : server_infos) {
            auto dk = dht::decorate_key(*_s, partition_key::from_single_value(*schema(), data_value(server.name).serialize_nonnull()));
            if (!this_shard_owns(dk)) {
                continue;
            }
            mutation
// NOTE(review): template arguments appear stripped by extraction in this chunk
// (e.g. "std::optional _generic_key", "std::function& mutation_sink",
// "template future map_reduce_tables") -- restore from upstream.
//
// Tail of protocol_servers_table::execute(): build one single-row partition
// per server and hand it to the sink.
                                                                     m(schema(), std::move(dk));
            row& cr = m.partition().clustered_row(*schema(), clustering_key::make_empty()).cells();
            set_cell(cr, "protocol", server.protocol);
            set_cell(cr, "protocol_version", server.protocol_version);
            std::vector addresses(server.listen_addresses.begin(), server.listen_addresses.end());
            set_cell(cr, "listen_addresses", make_list_value(schema()->get_column_definition("listen_addresses")->type, std::move(addresses)));
            mutation_sink(std::move(m));
        }
    }
};

// Virtual table exposing runtime statistics (load, memory, cache, ...) as
// (group, item) -> value rows; values are aggregated across shards/tables.
class runtime_info_table : public memtable_filling_virtual_table {
private:
    distributed& _db;
    service::storage_service& _ss;
    // Pre-computed key for the "generic" partition, set only if this shard owns it.
    std::optional _generic_key;

private:
    // Returns the decorated key for `key` if this shard owns it, nullopt otherwise.
    std::optional maybe_make_key(sstring key) {
        auto dk = dht::decorate_key(*_s, partition_key::from_single_value(*schema(), data_value(std::move(key)).serialize_nonnull()));
        if (this_shard_owns(dk)) {
            return dk;
        }
        return std::nullopt;
    }

    // Emits one partition with a (item -> value) row per entry in `rows`.
    void do_add_partition(std::function& mutation_sink, dht::decorated_key key, std::vector> rows) {
        mutation m(schema(), std::move(key));
        for (auto&& [ckey, cvalue] : rows) {
            row& cr = m.partition().clustered_row(*schema(), clustering_key::from_single_value(*schema(), data_value(std::move(ckey)).serialize_nonnull())).cells();
            set_cell(cr, "value", std::move(cvalue));
        }
        mutation_sink(std::move(m));
    }

    // Adds a single-valued item to the shared "generic" partition (no-op when
    // this shard doesn't own it).
    void add_partition(std::function& mutation_sink, sstring key, sstring value) {
        if (_generic_key) {
            do_add_partition(mutation_sink, *_generic_key, {{key, std::move(value)}});
        }
    }

    // Adds a partition named `key` with the given (item, value) rows.
    void add_partition(std::function& mutation_sink, sstring key, std::initializer_list> rows) {
        auto dk = maybe_make_key(std::move(key));
        if (dk) {
            do_add_partition(mutation_sink, std::move(*dk), std::move(rows));
        }
    }

    // Async variant: value is produced only when this shard owns the generic
    // partition, so producers are not run needlessly on other shards.
    future<> add_partition(std::function& mutation_sink, sstring key, std::function()> value_producer) {
        if (_generic_key) {
            do_add_partition(mutation_sink, *_generic_key, {{key, co_await value_producer()}});
        }
    }

    // Async multi-row variant of the above.
    future<> add_partition(std::function& mutation_sink, sstring key, std::function>>()> value_producer) {
        auto dk = maybe_make_key(std::move(key));
        if (dk) {
            do_add_partition(mutation_sink, std::move(*dk), co_await value_producer());
        }
    }

    // Applies `map` to every table on every shard and folds the results with
    // `reduce` (defaults to summation).
    template
    future map_reduce_tables(std::function map, std::function reduce = std::plus{}) {
        class shard_reducer {
            T _v{};
            std::function _reduce;
        public:
            shard_reducer(std::function reduce) : _reduce(std::move(reduce)) { }
            future<> operator()(T v) {
                v = _reduce(_v, v);
                return make_ready_future<>();
            }
            T get() && { return std::move(_v); }
        };
        co_return co_await _db.map_reduce(shard_reducer(reduce), [map, reduce] (replica::database& db) {
            T val = {};
            for (auto& [_, table] : db.get_column_families()) {
               val = reduce(val, map(*table));
            }
            return val;
        });
    }

    // Runs `map` once per shard and folds the results with `reduce`.
    template
    future map_reduce_shards(std::function map, std::function reduce = std::plus{}, T initial = {}) {
        co_return co_await map_reduce(
                boost::irange(0u, smp::count),
                [map] (shard_id shard) {
                    return smp::submit_to(shard, [map] {
                        return map();
                    });
                },
                initial,
                reduce);
    }

public:
    explicit runtime_info_table(distributed& db, service::storage_service& ss)
        : memtable_filling_virtual_table(build_schema())
        , _db(db)
        , _ss(ss) {
        _shard_aware = true;
        _generic_key = maybe_make_key("generic");
    }

    static schema_ptr build_schema() {
        auto id = generate_legacy_id(system_keyspace::NAME, "runtime_info");
        return schema_builder(system_keyspace::NAME, "runtime_info", std::make_optional(id))
            .with_column("group", utf8_type, column_kind::partition_key)
            .with_column("item", utf8_type, column_kind::clustering_key)
            .with_column("value", utf8_type)
            .set_comment("Runtime system information.")
            .with_version(system_keyspace::generate_schema_version(id))
            .build();
    }

    future<> execute(std::function mutation_sink) override {
        co_await add_partition(mutation_sink, "gossip_active", [this] () -> future {
            return _ss.is_gossip_running().then([] (bool running){
                return format("{}", running);
            });
        });
        co_await add_partition(mutation_sink, "load", [this] () -> future {
            return map_reduce_tables([] (replica::table& tbl) {
                return
// NOTE(review): template arguments appear stripped by extraction here (e.g.
// "std::chrono::duration_cast(...)", "std::vector>{...}",
// "static_cast(s.hits)") -- restore from upstream.
//
// Remainder of runtime_info_table::execute(): load, uptime, trace
// probability, memory, memtable, cache and incremental-backup items.
                                                                           tbl.get_stats().live_disk_space_used;
            }).then([] (int64_t load) {
                return format("{}", load);
            });
        });
        add_partition(mutation_sink, "uptime", format("{} seconds", std::chrono::duration_cast(engine().uptime()).count()));
        add_partition(mutation_sink, "trace_probability", format("{:.2}", tracing::tracing::get_local_tracing_instance().get_trace_probability()));

        // Whole-node memory: totals summed across all shards.
        co_await add_partition(mutation_sink, "memory", [this] () {
            struct stats {
                uint64_t total = 0;
                uint64_t free = 0;
                static stats reduce(stats a, stats b) { return stats{a.total + b.total, a.free + b.free}; }
            };
            return map_reduce_shards([] () {
                const auto& s = memory::stats();
                return stats{s.total_memory(), s.free_memory()};
            }, stats::reduce, stats{}).then([] (stats s) {
                return std::vector>{
                        {"total", format("{}", s.total)},
                        {"used", format("{}", s.total - s.free)},
                        {"free", format("{}", s.free)}};
            });
        });

        // Memtable occupancy and entry counts, summed over every table.
        co_await add_partition(mutation_sink, "memtable", [this] () {
            struct stats {
                uint64_t total = 0;
                uint64_t free = 0;
                uint64_t entries = 0;
                static stats reduce(stats a, stats b) { return stats{a.total + b.total, a.free + b.free, a.entries + b.entries}; }
            };
            return map_reduce_tables([] (replica::table& t) {
                const auto s = t.active_memtable().region().occupancy();
                return stats{s.total_space(), s.free_space(), t.active_memtable().partition_count()};
            }, stats::reduce).then([] (stats s) {
                return std::vector>{
                        {"memory_total", format("{}", s.total)},
                        {"memory_used", format("{}", s.total - s.free)},
                        {"memory_free", format("{}", s.free)},
                        {"entries", format("{}", s.entries)}};
            });
        });

        // Row-cache occupancy plus hit/miss counters and moving-average rates.
        co_await add_partition(mutation_sink, "cache", [this] () {
            struct stats {
                uint64_t total = 0;
                uint64_t free = 0;
                uint64_t entries = 0;
                uint64_t hits = 0;
                uint64_t misses = 0;
                utils::rate_moving_average hits_moving_average;
                utils::rate_moving_average requests_moving_average;
                static stats reduce(stats a, stats b) {
                    return stats{
                        a.total + b.total,
                        a.free + b.free,
                        a.entries + b.entries,
                        a.hits + b.hits,
                        a.misses + b.misses,
                        a.hits_moving_average + b.hits_moving_average,
                        a.requests_moving_average + b.requests_moving_average};
                }
            };
            return _db.map_reduce0([] (replica::database& db) {
                stats res{};
                auto occupancy = db.row_cache_tracker().region().occupancy();
                res.total = occupancy.total_space();
                res.free = occupancy.free_space();
                res.entries = db.row_cache_tracker().partitions();
                for (const auto& [_, t] : db.get_column_families()) {
                    auto& cache_stats = t->get_row_cache().stats();
                    res.hits += cache_stats.hits.count();
                    res.misses += cache_stats.misses.count();
                    res.hits_moving_average += cache_stats.hits.rate();
                    res.requests_moving_average += (cache_stats.hits.rate() + cache_stats.misses.rate());
                }
                return res;
            }, stats{}, stats::reduce).then([] (stats s) {
                return std::vector>{
                        {"memory_total", format("{}", s.total)},
                        {"memory_used", format("{}", s.total - s.free)},
                        {"memory_free", format("{}", s.free)},
                        {"entries", format("{}", s.entries)},
                        {"hits", format("{}", s.hits)},
                        {"misses", format("{}", s.misses)},
                        {"hit_rate_total", format("{:.2}", static_cast(s.hits) / static_cast(s.hits + s.misses))},
                        {"hit_rate_recent", format("{:.2}", s.hits_moving_average.mean_rate)},
                        {"requests_total", format("{}", s.hits + s.misses)},
                        {"requests_recent", format("{}", static_cast(s.requests_moving_average.mean_rate))}};
            });
        });

        // "true" if any keyspace on any shard has incremental backups enabled.
        co_await add_partition(mutation_sink, "incremental_backup_enabled", [this] () {
            return _db.map_reduce0([] (replica::database& db) {
                return boost::algorithm::any_of(db.get_keyspaces(), [] (const auto& id_and_ks) {
                    return id_and_ks.second.incremental_backups_enabled();
                });
            }, false, std::logical_or{}).then([] (bool res) -> sstring {
                return res ? "true" : "false";
            });
        });
    }
};

// Virtual table exposing the node's version, build mode and build id as a
// single "local" partition. Not shard-aware: identical on every shard.
class versions_table : public memtable_filling_virtual_table {
public:
    explicit versions_table()
        : memtable_filling_virtual_table(build_schema()) {
        _shard_aware = false;
    }

    static schema_ptr build_schema() {
        auto id = generate_legacy_id(system_keyspace::NAME, "versions");
        return schema_builder(system_keyspace::NAME, "versions", std::make_optional(id))
            .with_column("key", utf8_type, column_kind::partition_key)
            .with_column("version", utf8_type)
            .with_column("build_mode", utf8_type)
            .with_column("build_id", utf8_type)
            .set_comment("Version information.")
            .with_version(system_keyspace::generate_schema_version(id))
            .build();
    }

    future<> execute(std::function mutation_sink) override {
        mutation m(schema(), partition_key::from_single_value(*schema(), data_value("local").serialize_nonnull()));
        row& cr = m.partition().clustered_row(*schema(), clustering_key::make_empty()).cells();
        set_cell(cr, "version", scylla_version());
        set_cell(cr, "build_mode", scylla_build_mode());
        set_cell(cr, "build_id", get_build_id());
        mutation_sink(std::move(m));
        return make_ready_future<>();
    }
};

// Virtual table over db::config: one partition per option with its type,
// source and current value; also supports live updates via apply().
class db_config_table final : public streaming_virtual_table {
    db::config& _cfg;

    static schema_ptr build_schema() {
        auto id = generate_legacy_id(system_keyspace::NAME, "config");
        return schema_builder(system_keyspace::NAME, "config", std::make_optional(id))
            .with_column("name", utf8_type, column_kind::partition_key)
            .with_column("type", utf8_type)
            .with_column("source", utf8_type)
            .with_column("value", utf8_type)
            .with_version(system_keyspace::generate_schema_version(id))
            .build();
    }

    future<> execute(reader_permit permit, result_collector& result, const query_restrictions& qr) override {
        struct config_entry {
            dht::decorated_key key;
            sstring_view type;
            sstring source;
            sstring value;
        };

        std::vector cfg;
        for (auto&& cfg_ref : _cfg.values()) {
            auto&& c = cfg_ref.get();
            dht::decorated_key dk = dht::decorate_key(*_s, partition_key::from_single_value(*_s,
// NOTE(review): template arguments appear stripped by extraction here (e.g.
// "rs.row(0).get(\"name\")" presumably was get<sstring>; "std::vector cfg")
// -- restore from upstream.
//
// Tail of db_config_table::execute() plus apply(), which implements
// "UPDATE system.config SET value = ..." live configuration updates.
                                                                                                          data_value(c.name()).serialize_nonnull()));
            if (this_shard_owns(dk)) {
                cfg.emplace_back(config_entry{ std::move(dk), c.type_name(), c.source_name(), c.value_as_json()._res });
            }
        }

        // Partitions must be produced in ring order.
        boost::sort(cfg, [less = dht::ring_position_less_comparator(*_s)]
                (const config_entry& l, const config_entry& r) {
            return less(l.key, r.key);
        });

        for (auto&& c : cfg) {
            co_await result.emit_partition_start(c.key);
            mutation m(schema(), c.key);
            clustering_row cr(clustering_key::make_empty());
            set_cell(cr.cells(), "type", c.type);
            set_cell(cr.cells(), "source", c.source);
            set_cell(cr.cells(), "value", c.value);
            co_await result.emit_row(std::move(cr));
            co_await result.emit_partition_end();
        }
    }

    virtual future<> apply(const frozen_mutation& fm) override {
        const mutation m = fm.unfreeze(_s);
        query::result_set rs(m);
        auto name = rs.row(0).get("name");
        auto value = rs.row(0).get("value");

        if (!_cfg.enable_cql_config_updates()) {
            return virtual_table::apply(fm); // will return back exceptional future
        }

        // Only "value" may be written; name is the key, type/source are read-only.
        if (!name) {
            return make_exception_future<>(virtual_table_update_exception("option name is required"));
        }

        if (!value) {
            return make_exception_future<>(virtual_table_update_exception("option value is required"));
        }

        if (rs.row(0).cells().contains("type")) {
            return make_exception_future<>(virtual_table_update_exception("option type is immutable"));
        }

        if (rs.row(0).cells().contains("source")) {
            return make_exception_future<>(virtual_table_update_exception("option source is not updateable"));
        }

        // Config is authoritative on shard 0; set there, then broadcast.
        return smp::submit_to(0, [&cfg = _cfg, name = std::move(*name), value = std::move(*value)] () mutable {
            for (auto& c_ref : cfg.values()) {
                auto& c = c_ref.get();
                if (c.name() == name) {
                    try {
                        if (c.set_value(value, utils::config_file::config_source::CQL)) {
                            return cfg.broadcast_to_all_shards();
                        } else {
                            return make_exception_future<>(virtual_table_update_exception("option is not live-updateable"));
                        }
                    } catch (boost::bad_lexical_cast&) {
                        return make_exception_future<>(virtual_table_update_exception("cannot parse option value"));
                    }
                }
            }
            return make_exception_future<>(virtual_table_update_exception("no such option"));
        });
    }

public:
    explicit db_config_table(db::config& cfg)
        : streaming_virtual_table(build_schema())
        , _cfg(cfg)
    {
        _shard_aware = true;
    }
};

// Virtual table listing currently connected clients, one partition per client
// IP with one row per (port, client_type) connection.
class clients_table : public streaming_virtual_table {
    service::storage_service& _ss;

    static schema_ptr build_schema() {
        auto id = generate_legacy_id(system_keyspace::NAME, "clients");
        return schema_builder(system_keyspace::NAME, "clients", std::make_optional(id))
            .with_column("address", inet_addr_type, column_kind::partition_key)
            .with_column("port", int32_type, column_kind::clustering_key)
            .with_column("client_type", utf8_type, column_kind::clustering_key)
            .with_column("shard_id", int32_type)
            .with_column("connection_stage", utf8_type)
            .with_column("driver_name", utf8_type)
            .with_column("driver_version", utf8_type)
            .with_column("hostname", utf8_type)
            .with_column("protocol_version", int32_type)
            .with_column("ssl_cipher_suite", utf8_type)
            .with_column("ssl_enabled", boolean_type)
            .with_column("ssl_protocol", utf8_type)
            .with_column("username", utf8_type)
            .with_version(system_keyspace::generate_schema_version(id))
            .build();
    }

    // Partition key is the client's IP address.
    dht::decorated_key make_partition_key(net::inet_address ip) {
        return dht::decorate_key(*_s, partition_key::from_single_value(*_s, data_value(ip).serialize_nonnull()));
    }

    // Clustering key is (port, client_type).
    clustering_key make_clustering_key(int32_t port, sstring clt) {
        return clustering_key::from_exploded(*_s, {
            data_value(port).serialize_nonnull(),
            data_value(clt).serialize_nonnull()
        });
    }

    future<> execute(reader_permit permit, result_collector& result, const query_restrictions& qr) override {
        // Collect
        using client_data_vec = utils::chunked_vector;
        using shard_client_data = std::vector;
        std::vector>> cd_vec;
        cd_vec.resize(smp::count);

        auto servers = co_await _ss.container().invoke_on(0, [] (auto& ss) { return ss.protocol_servers(); });
        co_await smp::invoke_on_all([&cd_vec_ = cd_vec, &servers_ = servers] () -> future<> {
            auto& cd_vec =
// NOTE(review): template arguments appear stripped by extraction here (e.g.
// "std::make_unique()", "std::set ips(cmp)", "std::vector r") -- restore
// from upstream.
//
// Tail of clients_table::execute(): gather per-shard client data, group it
// by client IP in ring order, then emit the partitions.
                                                                                                          cd_vec_;
            auto& servers = servers_;

            auto scd = std::make_unique();
            for (const auto& ps : servers) {
                client_data_vec cds = co_await ps->get_client_data();
                if (cds.size() != 0) {
                    scd->emplace_back(std::move(cds));
                }
            }
            cd_vec[this_shard_id()] = make_foreign(std::move(scd));
        });

        // Partition
        struct decorated_ip {
            dht::decorated_key key;
            net::inet_address ip;

            struct compare {
                dht::ring_position_less_comparator less;
                explicit compare(const class schema& s) : less(s) {}
                bool operator()(const decorated_ip& a, const decorated_ip& b) const {
                    return less(a.key, b.key);
                }
            };
        };

        decorated_ip::compare cmp(*_s);
        std::set ips(cmp);
        std::unordered_map cd_map;
        for (int i = 0; i < smp::count; i++) {
            for (auto&& ps_cdc : *cd_vec[i]) {
                for (auto&& cd : ps_cdc) {
                    if (cd_map.contains(cd.ip)) {
                        cd_map[cd.ip].emplace_back(std::move(cd));
                    } else {
                        dht::decorated_key key = make_partition_key(cd.ip);
                        // First sighting of this IP: admit it only if this shard
                        // owns it and it is inside the queried range.
                        if (this_shard_owns(key) && contains_key(qr.partition_range(), key)) {
                            ips.insert(decorated_ip{std::move(key), cd.ip});
                            cd_map[cd.ip].emplace_back(std::move(cd));
                        }
                    }
                    co_await coroutine::maybe_yield();
                }
            }
        }

        // Emit
        for (const auto& dip : ips) {
            co_await result.emit_partition_start(dip.key);
            auto& clients = cd_map[dip.ip];

            // Sort rows to match the (port, client_type) clustering order.
            boost::sort(clients, [] (const client_data& a, const client_data& b) {
                return a.port < b.port || a.client_type_str() < b.client_type_str();
            });

            for (const auto& cd : clients) {
                clustering_row cr(make_clustering_key(cd.port, cd.client_type_str()));
                set_cell(cr.cells(), "shard_id", cd.shard_id);
                set_cell(cr.cells(), "connection_stage", cd.stage_str());
                // Optional fields: only set cells for values the driver reported.
                if (cd.driver_name) {
                    set_cell(cr.cells(), "driver_name", *cd.driver_name);
                }
                if (cd.driver_version) {
                    set_cell(cr.cells(), "driver_version", *cd.driver_version);
                }
                if (cd.hostname) {
                    set_cell(cr.cells(), "hostname", *cd.hostname);
                }
                if (cd.protocol_version) {
                    set_cell(cr.cells(), "protocol_version", *cd.protocol_version);
                }
                if (cd.ssl_cipher_suite) {
                    set_cell(cr.cells(), "ssl_cipher_suite", *cd.ssl_cipher_suite);
                }
                if (cd.ssl_enabled) {
                    set_cell(cr.cells(), "ssl_enabled", *cd.ssl_enabled);
                }
                if (cd.ssl_protocol) {
                    set_cell(cr.cells(), "ssl_protocol", *cd.ssl_protocol);
                }
                set_cell(cr.cells(), "username", cd.username ? *cd.username : sstring("anonymous"));
                co_await result.emit_row(std::move(cr));
            }
            co_await result.emit_partition_end();
        }
    }

public:
    clients_table(service::storage_service& ss)
            : streaming_virtual_table(build_schema())
            , _ss(ss)
    {
        _shard_aware = true;
    }
};

// Map from table's schema ID to table itself. Helps avoiding accidental duplication.
static thread_local std::map> virtual_tables;

// Instantiates every built-in virtual table and registers it in the
// virtual_tables map, keyed by schema id.
// NOTE(review): the concrete table type names inside std::make_unique<...>
// were stripped by extraction -- restore from upstream.
void register_virtual_tables(distributed& dist_db, distributed& dist_ss, sharded& dist_gossiper, db::config& cfg) {
    auto add_table = [] (std::unique_ptr&& tbl) {
        virtual_tables[tbl->schema()->id()] = std::move(tbl);
    };

    auto& db = dist_db.local();
    auto& ss = dist_ss.local();
    auto& gossiper = dist_gossiper.local();

    // Add built-in virtual tables here.
    add_table(std::make_unique(ss, gossiper));
    add_table(std::make_unique(db, ss));
    add_table(std::make_unique(dist_db));
    add_table(std::make_unique(ss));
    add_table(std::make_unique(dist_db, ss));
    add_table(std::make_unique());
    add_table(std::make_unique(cfg));
    add_table(std::make_unique(ss));
}

// Returns the schemas of every system table: schema tables, the regular
// system tables, optional RAFT tables, legacy schema tables and all
// registered virtual tables.
std::vector system_keyspace::all_tables(const db::config& cfg) {
    std::vector r;
    auto schema_tables = db::schema_tables::all_tables(schema_features::full());
    std::copy(schema_tables.begin(), schema_tables.end(), std::back_inserter(r));
    r.insert(r.end(), { built_indexes(), hints(), batchlog(), paxos(), local(),
                    peers(), peer_events(), range_xfers(),
                    compactions_in_progress(), compaction_history(),
                    sstable_activity(), size_estimates(), large_partitions(), large_rows(), large_cells(),
                    scylla_local(), db::schema_tables::scylla_table_schema_history(),
                    repair_history(),
                    v3::views_builds_in_progress(), v3::built_views(),
                    v3::scylla_views_builds_in_progress(),
                    v3::truncated(),
                    v3::cdc_local(),
    });
    if (cfg.check_experimental(db::experimental_features_t::RAFT)) {
// NOTE(review): template arguments appear stripped by extraction here (e.g.
// "make_lw_shared(ks_name, ...)", "future system_keyspace::load_local_host_id",
// "rpc::tuple>, cache_temperature>") -- restore from upstream.
//
// Tail of all_tables() (RAFT + legacy schema tables), virtual-reader/writer
// installation, and the system-keyspace bootstrap path plus internal query
// helpers.
        r.insert(r.end(), {raft(), raft_snapshots(), raft_config(), group0_history(), discovery()});
    }
    // legacy schema
    r.insert(r.end(), {
                    // TODO: once we migrate hints/batchlog and add convertor
                    // legacy::hints(), legacy::batchlog(),
                    legacy::keyspaces(), legacy::column_families(),
                    legacy::columns(), legacy::triggers(), legacy::usertypes(),
                    legacy::functions(), legacy::aggregates(), });

    for (auto&& [id, vt] : virtual_tables) {
        r.push_back(vt->schema());
    }

    return r;
}

// Wires the virtual readers (and writers, for writable virtual tables) into
// the corresponding column families after the tables have been created.
static void install_virtual_readers(replica::database& db) {
    db.find_column_family(system_keyspace::size_estimates()).set_virtual_reader(mutation_source(db::size_estimates::virtual_reader(db)));
    db.find_column_family(system_keyspace::v3::views_builds_in_progress()).set_virtual_reader(mutation_source(db::view::build_progress_virtual_reader(db)));
    db.find_column_family(system_keyspace::built_indexes()).set_virtual_reader(mutation_source(db::index::built_indexes_virtual_reader(db)));

    for (auto&& [id, vt] : virtual_tables) {
        auto&& cf = db.find_column_family(vt->schema());
        cf.set_virtual_reader(vt->as_mutation_source());
        cf.set_virtual_writer([&vt = *vt] (const frozen_mutation& m) { return vt.apply(m); });
    }
}

// True for system tables whose memtables should be accounted against user
// (dirty) memory rather than the system memory pool.
static bool maybe_write_in_user_memory(schema_ptr s) {
    return (s.get() == system_keyspace::batchlog().get()) || (s.get() == system_keyspace::paxos().get())
            || s == system_keyspace::v3::scylla_views_builds_in_progress()
            || s == system_keyspace::raft();
}

// Creates the system keyspace(s) and every system column family, then
// installs the virtual readers. Durability of the keyspace follows whether
// any data directories are configured.
future<> system_keyspace_make(distributed& dist_db, distributed& dist_ss, sharded& dist_gossiper, db::config& cfg) {
    register_virtual_tables(dist_db, dist_ss, dist_gossiper, cfg);

    auto& db = dist_db.local();
    auto& db_config = db.get_config();
    auto enable_cache = db_config.enable_cache();
    bool durable = db_config.data_file_directories().size() > 0;
    for (auto&& table : system_keyspace::all_tables(db_config)) {
        auto ks_name = table->ks_name();
        if (!db.has_keyspace(ks_name)) {
            auto ksm = make_lw_shared(ks_name,
                    "org.apache.cassandra.locator.LocalStrategy",
                    std::map{},
                    durable
                    );
            co_await db.create_keyspace(ksm, dist_ss.local().get_erm_factory(), true, replica::database::system_keyspace::yes);
        }
        auto& ks = db.find_keyspace(ks_name);
        auto cfg = ks.make_column_family_config(*table, db);
        if (maybe_write_in_user_memory(table)) {
            cfg.dirty_memory_manager = &db._dirty_memory_manager;
        } else {
            cfg.memtable_scheduling_group = default_scheduling_group();
            cfg.memtable_to_cache_scheduling_group = default_scheduling_group();
        }
        db.add_column_family(ks, table, std::move(cfg));
    }

    install_virtual_readers(db);
}

future<> system_keyspace::make(distributed& db, distributed& ss, sharded& g, db::config& cfg) {
    return system_keyspace_make(db, ss, g, cfg);
}

// Loads the persisted host id from system.local, generating and persisting a
// new random one on first boot.
future system_keyspace::load_local_host_id() {
    sstring req = format("SELECT host_id FROM system.{} WHERE key=?", LOCAL);
    auto msg = co_await execute_cql(req, sstring(LOCAL));
    if (msg->empty() || !msg->one().has("host_id")) {
        co_return co_await set_local_host_id(utils::make_random_uuid());
    } else {
        auto host_id = msg->one().get_as("host_id");
        slogger.info("Loaded local host id: {}", host_id);
        co_return host_id;
    }
}

// Persists the given host id to system.local and flushes it to disk.
future system_keyspace::set_local_host_id(utils::UUID host_id) {
    slogger.info("Setting local host id to {}", host_id);

    sstring req = format("INSERT INTO system.{} (key, host_id) VALUES (?, ?)", LOCAL);
    co_await execute_cql(req, sstring(LOCAL), host_id);
    co_await force_blocking_flush(LOCAL);
    co_return host_id;
}

std::unordered_map system_keyspace::load_dc_rack_info() {
    return _cache->_cached_dc_rack_info;
}

// Reads a whole system table locally and returns its partitions as
// reconcilable_result mutations.
future>> system_keyspace::query_mutations(distributed& proxy, const sstring& ks_name, const sstring& cf_name) {
    replica::database& db = proxy.local().get_db().local();
    schema_ptr schema = db.find_schema(ks_name, cf_name);
    auto slice = partition_slice_builder(*schema).build();
    auto cmd = make_lw_shared(schema->id(), schema->version(), std::move(slice), proxy.local().get_max_result_size(slice));
    return proxy.local().query_mutations_locally(std::move(schema), std::move(cmd), query::full_partition_range, db::no_timeout)
            .then([] (rpc::tuple>, cache_temperature> rr_ht) { return std::get<0>(std::move(rr_ht)); });
}

// Full-table read at CL=ONE, returned as a query::result_set.
future> system_keyspace::query(distributed& proxy, const sstring& ks_name, const sstring& cf_name) {
    replica::database& db = proxy.local().get_db().local();
    schema_ptr schema = db.find_schema(ks_name, cf_name);
    auto slice = partition_slice_builder(*schema).build();
    auto cmd = make_lw_shared(schema->id(), schema->version(), std::move(slice), proxy.local().get_max_result_size(slice));
    return proxy.local().query(schema, cmd, {query::full_partition_range}, db::consistency_level::ONE,
            {db::no_timeout, empty_service_permit(), service::client_state::for_internal_calls(), nullptr}).then([schema, cmd] (auto&& qr) {
        return make_lw_shared(query::result_set::from_raw_result(schema, cmd->slice, *qr.query_result));
    });
}

// Single-partition read restricted to a clustering range, at CL=ONE.
future> system_keyspace::query(distributed& proxy, const sstring& ks_name, const sstring& cf_name, const dht::decorated_key& key, query::clustering_range row_range)
{
    auto&& db = proxy.local().get_db().local();
    auto schema = db.find_schema(ks_name, cf_name);
    auto slice = partition_slice_builder(*schema)
        .with_range(std::move(row_range))
        .build();
    auto cmd = make_lw_shared(schema->id(), schema->version(), std::move(slice), proxy.local().get_max_result_size(slice));

    return proxy.local().query(schema, cmd, {dht::partition_range::make_singular(key)}, db::consistency_level::ONE,
            {db::no_timeout, empty_service_permit(), service::client_state::for_internal_calls(), nullptr}).then([schema, cmd] (auto&& qr) {
        return make_lw_shared(query::result_set::from_raw_result(schema, cmd->slice, *qr.query_result));
    });
}

// Converts the rows_merged histogram into the native representation of a
// CQL map<int, bigint> value.
static map_type_impl::native_type prepare_rows_merged(std::unordered_map& rows_merged) {
    map_type_impl::native_type tmp;
    for (auto& r: rows_merged) {
        int32_t first = r.first;
        int64_t second = r.second;
        auto map_element = std::make_pair(data_value(first), data_value(second));
        tmp.push_back(std::move(map_element));
    }
    return tmp;
// NOTE(review): template arguments appear stripped by extraction here (e.g.
// "row.get_as(\"id\")", "future system_keyspace::increment_and_get_generation")
// -- restore from upstream.
}

// Records one compaction in system.compaction_history. Failures are logged
// and swallowed -- history is best-effort.
future<> system_keyspace::update_compaction_history(utils::UUID uuid, sstring ksname, sstring cfname, int64_t compacted_at, int64_t bytes_in, int64_t bytes_out,
                                   std::unordered_map rows_merged)
{
    // don't write anything when the history table itself is compacted, since that would in turn cause new compactions
    if (ksname == "system" && cfname == COMPACTION_HISTORY) {
        return make_ready_future<>();
    }
    auto map_type = map_type_impl::get_instance(int32_type, long_type, true);
    sstring req = format("INSERT INTO system.{} (id, keyspace_name, columnfamily_name, compacted_at, bytes_in, bytes_out, rows_merged) VALUES (?, ?, ?, ?, ?, ?, ?)"
            , COMPACTION_HISTORY);
    db_clock::time_point tp{db_clock::duration{compacted_at}};
    return qctx->execute_cql(req, uuid, ksname, cfname, tp, bytes_in, bytes_out,
                    make_map_value(map_type, prepare_rows_merged(rows_merged))).discard_result().handle_exception([] (auto ep) {
        slogger.error("update compaction history failed: {}: ignored", ep);
    });
}

// Streams every row of system.compaction_history through the consumer.
future<> system_keyspace::get_compaction_history(compaction_history_consumer&& f) {
    return do_with(compaction_history_consumer(std::move(f)), [](compaction_history_consumer& consumer) mutable {
        sstring req = format("SELECT * from system.{}", COMPACTION_HISTORY);
        return qctx->qp().query_internal(req, [&consumer] (const cql3::untyped_result_set::row& row) mutable {
            compaction_history_entry entry;
            entry.id = row.get_as("id");
            entry.ks = row.get_as("keyspace_name");
            entry.cf = row.get_as("columnfamily_name");
            entry.compacted_at = row.get_as("compacted_at");
            entry.bytes_in = row.get_as("bytes_in");
            entry.bytes_out = row.get_as("bytes_out");
            if (row.has("rows_merged")) {
                entry.rows_merged = row.get_map("rows_merged");
            }
            return consumer(std::move(entry)).then([] {
                return stop_iteration::no;
            });
        });
    });
}

// Inserts one row into system.repair_history.
future<> system_keyspace::update_repair_history(repair_history_entry entry) {
    sstring req = format("INSERT INTO system.{} (table_uuid, repair_time, repair_uuid, keyspace_name, table_name, range_start, range_end) VALUES (?, ?, ?, ?, ?, ?, ?)", REPAIR_HISTORY);
    co_await execute_cql(req, entry.table_uuid, entry.ts, entry.id, entry.ks, entry.cf, entry.range_start, entry.range_end).discard_result();
}

// Streams the repair history rows for one table through the consumer.
future<> system_keyspace::get_repair_history(utils::UUID table_id, repair_history_consumer f) {
    sstring req = format("SELECT * from system.{} WHERE table_uuid = {}", REPAIR_HISTORY, table_id);
    co_await _qp.local().query_internal(req, [&f] (const cql3::untyped_result_set::row& row) mutable -> future {
        repair_history_entry ent;
        ent.id = row.get_as("repair_uuid");
        ent.table_uuid = row.get_as("table_uuid");
        ent.range_start = row.get_as("range_start");
        ent.range_end = row.get_as("range_end");
        ent.ks = row.get_as("keyspace_name");
        ent.cf = row.get_as("table_name");
        ent.ts = row.get_as("repair_time");
        co_await f(std::move(ent));
        co_return stop_iteration::no;
    });
}

// Computes the next gossip generation (max of stored+1 and wall-clock
// seconds), persists it to system.local and flushes before returning it.
future system_keyspace::increment_and_get_generation() {
    auto req = format("SELECT gossip_generation FROM system.{} WHERE key='{}'", LOCAL, LOCAL);
    return qctx->qp().execute_internal(req, cql3::query_processor::cache_internal::yes).then([] (auto rs) {
        int generation;
        if (rs->empty() || !rs->one().has("gossip_generation")) {
            // seconds-since-epoch isn't a foolproof new generation
            // (where foolproof is "guaranteed to be larger than the last one seen at this ip address"),
            // but it's as close as sanely possible
            generation = utils::get_generation_number();
        } else {
            // Other nodes will ignore gossip messages about a node that have a lower generation than previously seen.
            int stored_generation = rs->one().template get_as("gossip_generation") + 1;
            int now = utils::get_generation_number();
            if (stored_generation >= now) {
                slogger.warn("Using stored Gossip Generation {} as it is greater than current system time {}."
                            "See CASSANDRA-3654 if you experience problems", stored_generation, now);
                generation = stored_generation;
            } else {
                generation = now;
            }
        }
        auto req = format("INSERT INTO system.{} (key, gossip_generation) VALUES ('{}', ?)", LOCAL, LOCAL);
        // Flush before returning so the new generation survives a crash.
        return qctx->qp().execute_internal(req, {generation}, cql3::query_processor::cache_internal::yes).then([generation] (auto rs) {
            return force_blocking_flush(LOCAL);
        }).then([generation] {
            return make_ready_future(generation);
        });
    });
}

// Builds the mutation for system.size_estimates covering the given keyspace:
// one clustered row per (table, range) with its estimate values.
mutation system_keyspace::make_size_estimates_mutation(const sstring& ks, std::vector estimates) {
    auto&& schema = db::system_keyspace::size_estimates();
    auto timestamp = api::new_timestamp();
    mutation m_to_apply{schema, partition_key::from_single_value(*schema, utf8_type->decompose(ks))};

    for (auto&& e : estimates) {
        auto ck = clustering_key_prefix(std::vector{
                utf8_type->decompose(e.schema->cf_name()), e.range_start_token, e.range_end_token});

        m_to_apply.set_clustered_cell(ck, "mean_partition_size", e.mean_partition_size, timestamp);
        m_to_apply.set_clustered_cell(ck, "partitions_count", e.partitions_count, timestamp);
    }

    return m_to_apply;
}

// Records that this shard started building a view, beginning at first_token.
future<> system_keyspace::register_view_for_building(sstring ks_name, sstring view_name, const dht::token& token) {
    sstring req = format("INSERT INTO system.{} (keyspace_name, view_name, generation_number, cpu_id, first_token) VALUES (?, ?, ?, ?, ?)",
            v3::SCYLLA_VIEWS_BUILDS_IN_PROGRESS);
    return qctx->execute_cql(
            std::move(req),
            std::move(ks_name),
            std::move(view_name),
            0,
            int32_t(this_shard_id()),
            token.to_sstring()).discard_result();
}

// Advances this shard's view-build progress marker to the given token.
future<> system_keyspace::update_view_build_progress(sstring ks_name, sstring view_name, const dht::token& token) {
    sstring req = format("INSERT INTO system.{} (keyspace_name, view_name, next_token, cpu_id) VALUES (?, ?, ?, ?)",
            v3::SCYLLA_VIEWS_BUILDS_IN_PROGRESS);
    return qctx->execute_cql(
            std::move(req),
            std::move(ks_name),
            std::move(view_name),
            token.to_sstring(),
            int32_t(this_shard_id())).discard_result();
}

future<>
system_keyspace::remove_view_build_progress_across_all_shards(sstring ks_name, sstring view_name) {
    // Deletes the whole (keyspace, view) partition, i.e. the progress rows of
    // every shard (cpu_id), not just the calling one.
    return qctx->execute_cql(
            format("DELETE FROM system.{} WHERE keyspace_name = ? AND view_name = ?", v3::SCYLLA_VIEWS_BUILDS_IN_PROGRESS),
            std::move(ks_name),
            std::move(view_name)).discard_result();
}

// Deletes only this shard's in-progress build row for the given view.
future<> system_keyspace::remove_view_build_progress(sstring ks_name, sstring view_name) {
    return qctx->execute_cql(
            format("DELETE FROM system.{} WHERE keyspace_name = ? AND view_name = ? AND cpu_id = ?", v3::SCYLLA_VIEWS_BUILDS_IN_PROGRESS),
            std::move(ks_name),
            std::move(view_name),
            int32_t(this_shard_id())).discard_result();
}

// Marks a view as fully built by inserting it into the built-views table.
future<> system_keyspace::mark_view_as_built(sstring ks_name, sstring view_name) {
    return qctx->execute_cql(
            format("INSERT INTO system.{} (keyspace_name, view_name) VALUES (?, ?)", v3::BUILT_VIEWS),
            std::move(ks_name),
            std::move(view_name)).discard_result();
}

// Removes a view's entry from the built-views table.
future<> system_keyspace::remove_built_view(sstring ks_name, sstring view_name) {
    return qctx->execute_cql(
            format("DELETE FROM system.{} WHERE keyspace_name = ?
AND view_name = ?", v3::BUILT_VIEWS),
            std::move(ks_name),
            std::move(view_name)).discard_result();
}

// Loads the full contents of the built-views table as (keyspace, view) pairs.
future> system_keyspace::load_built_views() {
    return qctx->execute_cql(format("SELECT * FROM system.{}", v3::BUILT_VIEWS)).then([] (::shared_ptr cql_result) {
        return boost::copy_range>(*cql_result | boost::adaptors::transformed([] (const cql3::untyped_result_set::row& row) {
            auto ks_name = row.get_as("keyspace_name");
            auto cf_name = row.get_as("view_name");
            return std::pair(std::move(ks_name), std::move(cf_name));
        }));
    });
}

// Loads every shard's persisted view-build progress. Rows without next_token
// (build not yet advanced past first_token) yield an empty optional. Any
// failure is downgraded to a warning plus an empty result so boot can proceed.
future> system_keyspace::load_view_build_progress() {
    return qctx->execute_cql(format("SELECT keyspace_name, view_name, first_token, next_token, cpu_id FROM system.{}",
            v3::SCYLLA_VIEWS_BUILDS_IN_PROGRESS)).then([] (::shared_ptr cql_result) {
        std::vector progress;
        for (auto& row : *cql_result) {
            auto ks_name = row.get_as("keyspace_name");
            auto cf_name = row.get_as("view_name");
            auto first_token = dht::token::from_sstring(row.get_as("first_token"));
            // next_token is optional: absent until the build makes progress.
            auto next_token_sstring = row.get_opt("next_token");
            std::optional next_token;
            if (next_token_sstring) {
                next_token = dht::token::from_sstring(std::move(next_token_sstring).value());
            }
            auto cpu_id = row.get_as("cpu_id");
            progress.emplace_back(view_build_progress{
                    view_name(std::move(ks_name), std::move(cf_name)),
                    std::move(first_token),
                    std::move(next_token),
                    static_cast(cpu_id)});
        }
        return progress;
    }).handle_exception([] (const std::exception_ptr& eptr) {
        slogger.warn("Failed to load view build progress: {}", eptr);
        return std::vector();
    });
}

// Reconstructs the Paxos state (promise, accepted proposal, most recent
// commit) for one partition key of table `s` from the system paxos table.
future system_keyspace::load_paxos_state(partition_key_view key, schema_ptr s, gc_clock::time_point now,
        db::timeout_clock::time_point timeout) {
    static auto cql = format("SELECT * FROM system.{} WHERE row_key = ?
AND cf_id = ?", PAXOS);
    // FIXME: we need execute_cql_with_now()
    (void)now;
    auto f = qctx->execute_cql_with_timeout(cql, timeout, to_legacy(*key.get_compound_type(*s), key.representation()), s->id());
    return f.then([s, key = std::move(key)] (shared_ptr results) mutable {
        if (results->empty()) {
            // No Paxos state was ever recorded for this key.
            return service::paxos::paxos_state();
        }
        auto& row = results->one();
        // A missing promise column means nothing was promised yet.
        auto promised = row.has("promise") ? row.get_as("promise") : utils::UUID_gen::min_time_UUID();
        std::optional accepted;
        if (row.has("proposal")) {
            accepted = service::paxos::proposal(row.get_as("proposal_ballot"),
                    ser::deserialize_from_buffer<>(row.get_blob("proposal"), boost::type(), 0));
        }
        std::optional most_recent;
        if (row.has("most_recent_commit_at")) {
            // The commit value can be missing if it was pruned; supply an empty
            // one since it is not going to be used anyway.
            auto fm = row.has("most_recent_commit")
                    ? ser::deserialize_from_buffer<>(row.get_blob("most_recent_commit"), boost::type(), 0)
                    : freeze(mutation(s, key));
            most_recent = service::paxos::proposal(row.get_as("most_recent_commit_at"), std::move(fm));
        }
        return service::paxos::paxos_state(promised, std::move(accepted), std::move(most_recent));
    });
}

// TTL (in seconds) applied to all Paxos table writes for table `s`.
static int32_t paxos_ttl_sec(const schema& s) {
    // Keep paxos state around for paxos_grace_seconds. If one of the Paxos participants
    // is down for longer than paxos_grace_seconds it is considered to be dead and must rebootstrap.
    // Otherwise its Paxos table state will be repaired by nodetool repair or Paxos repair.
    return std::chrono::duration_cast(s.paxos_grace_seconds()).count();
}

// Persists a Paxos promise (prepare phase). The write timestamp is derived
// from the ballot so concurrent rounds order correctly, and the row expires
// after paxos_ttl_sec().
future<> system_keyspace::save_paxos_promise(const schema& s, const partition_key& key, const utils::UUID& ballot,
        db::timeout_clock::time_point timeout) {
    static auto cql = format("UPDATE system.{} USING TIMESTAMP ? AND TTL ? SET promise = ? WHERE row_key = ?
AND cf_id = ?", PAXOS);
    return qctx->execute_cql_with_timeout(cql, timeout,
            utils::UUID_gen::micros_timestamp(ballot), paxos_ttl_sec(s), ballot,
            to_legacy(*key.get_compound_type(s), key.representation()), s.id()
        ).discard_result();
}

// Persists an accepted Paxos proposal (accept phase): stores the proposal
// ballot and the serialized update, refreshing the promise to the same ballot.
future<> system_keyspace::save_paxos_proposal(const schema& s, const service::paxos::proposal& proposal, db::timeout_clock::time_point timeout) {
    static auto cql = format("UPDATE system.{} USING TIMESTAMP ? AND TTL ? SET promise = ?, proposal_ballot = ?, proposal = ? WHERE row_key = ? AND cf_id = ?", PAXOS);
    partition_key_view key = proposal.update.key();
    return qctx->execute_cql_with_timeout(cql, timeout,
            utils::UUID_gen::micros_timestamp(proposal.ballot), paxos_ttl_sec(s),
            proposal.ballot, proposal.ballot, ser::serialize_to_buffer(proposal.update),
            to_legacy(*key.get_compound_type(s), key.representation()), s.id()
        ).discard_result();
}

// Persists a learned Paxos decision (most recent commit) and clears the
// accepted proposal in the same write.
future<> system_keyspace::save_paxos_decision(const schema& s, const service::paxos::proposal& decision, db::timeout_clock::time_point timeout) {
    // We always erase the last proposal when we learn about a new Paxos decision. The ballot
    // timestamp of the decision is used for entire mutation, so if the "erased" proposal is more
    // recent it will naturally stay on top.
    // Erasing the last proposal is just an optimization and does not affect correctness:
    // sp::begin_and_repair_paxos will exclude an accepted proposal if it is older than the most
    // recent commit.
    static auto cql = format("UPDATE system.{} USING TIMESTAMP ? AND TTL ? SET proposal_ballot = null, proposal = null,"
            " most_recent_commit_at = ?, most_recent_commit = ? WHERE row_key = ?
AND cf_id = ?", PAXOS);
    partition_key_view key = decision.update.key();
    return qctx->execute_cql_with_timeout(cql, timeout,
            utils::UUID_gen::micros_timestamp(decision.ballot), paxos_ttl_sec(s),
            decision.ballot, ser::serialize_to_buffer(decision.update),
            to_legacy(*key.get_compound_type(s), key.representation()), s.id()
        ).discard_result();
}

// Removes a learned Paxos value (most_recent_commit) at the ballot's timestamp.
future<> system_keyspace::delete_paxos_decision(const schema& s, const partition_key& key, const utils::UUID& ballot, db::timeout_clock::time_point timeout) {
    // This should be called only if a learn stage succeeded on all replicas.
    // In this case we can remove learned paxos value using ballot's timestamp which
    // guarantees that if there is more recent round it will not be affected.
    static auto cql = format("DELETE most_recent_commit FROM system.{} USING TIMESTAMP ? WHERE row_key = ? AND cf_id = ?", PAXOS);
    return qctx->execute_cql_with_timeout(cql, timeout,
            utils::UUID_gen::micros_timestamp(ballot),
            to_legacy(*key.get_compound_type(s), key.representation()), s.id()
        ).discard_result();
}

// On startup, re-enables every cluster feature previously persisted locally.
// Refuses to boot if a persisted feature is no longer supported: either its
// support was disabled by configuration, or the feature is unknown entirely
// (a prohibited downgrade).
future<> system_keyspace::enable_features_on_startup(sharded& feat) {
    auto pre_enabled_features = co_await get_scylla_local_param(gms::feature_service::ENABLED_FEATURES_KEY);
    if (!pre_enabled_features) {
        // Nothing was persisted yet; nothing to enable.
        co_return;
    }
    gms::feature_service& local_feat_srv = feat.local();
    const auto known_features = local_feat_srv.known_feature_set();
    const auto& registered_features = local_feat_srv.registered_features();
    const auto persisted_features = gms::feature_service::to_feature_set(*pre_enabled_features);
    for (auto&& f : persisted_features) {
        slogger.debug("Enabling persisted feature '{}'", f);
        const bool is_registered_feat = registered_features.contains(sstring(f));
        if (!is_registered_feat || !known_features.contains(f)) {
            if (is_registered_feat) {
                throw std::runtime_error(format(
                    "Feature '{}' was previously enabled in the cluster but its support is disabled by this node. 
" "Set the corresponding configuration option to enable the support for the feature.", f));
            } else {
                throw std::runtime_error(format("Unknown feature '{}' was previously enabled in the cluster. "
                        " That means this node is performing a prohibited downgrade procedure"
                        " and should not be allowed to boot.", f));
            }
        }
        if (is_registered_feat) {
            // `gms::feature::enable` should be run within a seastar thread context
            co_await seastar::async([&local_feat_srv, f] {
                local_feat_srv.enable(sstring(f));
            });
        }
        // If a feature is not in `registered_features` but still in `known_features` list
        // that means the feature name is used for backward compatibility and should be implicitly
        // enabled in the code by default, so just skip it.
    }
}

// Reads the persisted Raft group 0 id from the local parameters table
// (default-constructed value if it was never set).
future system_keyspace::get_raft_group0_id() {
    auto opt = co_await get_scylla_local_param_as("raft_group0_id");
    co_return opt.value_or({});
}

// Reads this node's persisted Raft server id from the local parameters table.
future system_keyspace::get_raft_server_id() {
    auto opt = co_await get_scylla_local_param_as("raft_server_id");
    co_return opt.value_or({});
}

// Persists the Raft group 0 id.
future<> system_keyspace::set_raft_group0_id(utils::UUID uuid) {
    return set_scylla_local_param_as("raft_group0_id", uuid);
}

// Persists this node's Raft server id.
future<> system_keyspace::set_raft_server_id(utils::UUID uuid) {
    return set_scylla_local_param_as("raft_server_id", uuid);
}

// Partition key under which all group 0 history rows are stored.
static constexpr auto GROUP0_HISTORY_KEY = "history";

// Returns the most recent state id recorded in group 0 history, or a null
// UUID if none. NOTE(review): LIMIT 1 returning the *last* state id relies on
// the table's clustering order being reversed (see the tombstone comment in
// make_group0_history_state_id_mutation) — confirm against the schema.
future system_keyspace::get_last_group0_state_id() {
    auto rs = co_await qctx->execute_cql(
        format(
            "SELECT state_id FROM system.{} WHERE key = '{}' LIMIT 1",
            GROUP0_HISTORY, GROUP0_HISTORY_KEY));
    assert(rs);
    if (rs->empty()) {
        co_return utils::UUID{};
    }
    co_return rs->one().get_as("state_id");
}

// Checks whether the given state id is already present in group 0 history.
future system_keyspace::group0_history_contains(utils::UUID state_id) {
    auto rs = co_await qctx->execute_cql(
        format(
            "SELECT state_id FROM system.{} WHERE key = '{}' AND state_id = ?",
            GROUP0_HISTORY, GROUP0_HISTORY_KEY),
        state_id);
    assert(rs);
    co_return !rs->empty();
}

// Builds the mutation that appends `state_id` (with optional description) to
// the group 0 history partition, optionally garbage-collecting entries older
// than `gc_older_than` with a range tombstone.
mutation system_keyspace::make_group0_history_state_id_mutation(
        utils::UUID state_id, std::optional
gc_older_than, std::string_view description) {
    auto s = group0_history();
    mutation m(s, partition_key::from_singular(*s, GROUP0_HISTORY_KEY));
    auto& row = m.partition().clustered_row(*s, clustering_key::from_singular(*s, state_id));
    // The whole row is written at the timestamp embedded in the state id.
    auto ts = utils::UUID_gen::micros_timestamp(state_id);
    row.apply(row_marker(ts));
    if (!description.empty()) {
        auto cdef = s->get_column_definition("description");
        assert(cdef);
        row.cells().apply(*cdef, atomic_cell::make_live(*cdef->type, ts, cdef->type->decompose(description)));
    }

    if (gc_older_than) {
        using namespace std::chrono;
        assert(*gc_older_than >= gc_clock::duration{0});

        auto ts_millis = duration_cast(microseconds{ts});
        auto gc_older_than_millis = duration_cast(*gc_older_than);
        assert(gc_older_than_millis < ts_millis);

        auto tomb_upper_bound = utils::UUID_gen::min_time_UUID(ts_millis - gc_older_than_millis);
        // We want to delete all entries with IDs smaller than `tomb_upper_bound`
        // but the deleted range is of the form (x, +inf) since the schema is reversed.
auto range = query::clustering_range::make_starting_with({
                clustering_key_prefix::from_single_value(*s, timeuuid_type->decompose(tomb_upper_bound)), false});
        auto bv = bound_view::from_range(range);

        m.partition().apply_delete(*s, range_tombstone{bv.first, bv.second, tombstone{ts, gc_clock::now()}});
    }

    return m;
}

// Fetches the group 0 history partition as a mutation. Unexpected partitions
// in the table are logged and skipped; if the history partition is absent,
// an empty mutation for it is returned (also with a warning).
future system_keyspace::get_group0_history(distributed& sp) {
    auto s = group0_history();
    auto rs = co_await db::system_keyspace::query_mutations(sp, db::system_keyspace::NAME, db::system_keyspace::GROUP0_HISTORY);
    assert(rs);
    auto& ps = rs->partitions();
    for (auto& p: ps) {
        auto mut = p.mut().unfreeze(s);
        auto partition_key = value_cast(utf8_type->deserialize(mut.key().get_component(*s, 0)));
        if (partition_key == GROUP0_HISTORY_KEY) {
            co_return mut;
        }
        slogger.warn("get_group0_history: unexpected partition in group0 history table: {}", partition_key);
    }

    slogger.warn("get_group0_history: '{}' partition not found", GROUP0_HISTORY_KEY);
    co_return mutation(s, partition_key::from_singular(*s, GROUP0_HISTORY_KEY));
}

// Name of the system keyspace, exposed as a free function.
sstring system_keyspace_name() {
    return system_keyspace::NAME;
}

system_keyspace::system_keyspace(sharded& qp, sharded& db) noexcept
        : _qp(qp)
        , _db(db)
        , _cache(std::make_unique())
{
}

system_keyspace::~system_keyspace() {
}

// Per-shard start hook. The global query context `qctx` is created on shard 0
// only (it is a process-wide singleton per the declaration at the top of this
// file).
future<> system_keyspace::start() {
    assert(_qp.local_is_initialized() && _db.local_is_initialized());

    if (this_shard_id() == 0) {
        qctx = std::make_unique(_qp);
    }

    co_return;
}

future<> system_keyspace::stop() {
    co_return;
}

// Runs an internal (unthrottled, cached) CQL statement via the local query
// processor and returns its untyped result set.
future<::shared_ptr> system_keyspace::execute_cql(const sstring& query_string, const std::initializer_list& values) {
    return _qp.local().execute_internal(query_string, values, cql3::query_processor::cache_internal::yes);
}

} // namespace db