Files
scylladb/db/system_keyspace.cc
Avi Kivity 8576502c48 Merge 'raft topology: ban left nodes from the cluster' from Kamil Braun
Use the new Seastar functionality for storing references to connections to implement banning hosts that have left the cluster (either decommissioned or using removenode) in raft-topology mode. Any attempts at communication from those nodes will be rejected.

This works not only for nodes that restart, but also for nodes that were running behind a network partition while we removed them. Even when the partition resolves, the existing nodes will effectively firewall themselves off from that node.

Some changes to the decommission algorithm had to be introduced for it to work with node banning. As a side effect a pre-existing problem with decommission was fixed. Read the "introduce `left_token_ring` state" and "prepare decommission path for node banning" commits for details.

Closes #13850

* github.com:scylladb/scylladb:
  test: pylib: increase checking period for `get_alive_endpoints`
  test: add node banning test
  test: pylib: manager_client: `get_cql()` helper
  test: pylib: ScyllaCluster: server pause/unpause API
  raft topology: ban left nodes
  raft topology: skip `left_token_ring` state during `removenode`
  raft topology: prepare decommission path for node banning
  raft topology: introduce `left_token_ring` state
  raft topology: `raft_topology_cmd` implicit constructor
  messaging_service: implement host banning
  messaging_service: exchange host IDs and map them to connections
  messaging_service: store the node's host ID
  messaging_service: don't use parameter defaults in constructor
  main: move messaging_service init after system_keyspace init
2023-06-21 20:16:45 +03:00

3808 lines
162 KiB
C++

/*
* Modified by ScyllaDB
* Copyright (C) 2015-present ScyllaDB
*/
/*
* SPDX-License-Identifier: (AGPL-3.0-or-later and Apache-2.0)
*/
#include <boost/range/algorithm.hpp>
#include <boost/range/algorithm_ext/push_back.hpp>
#include <boost/range/adaptor/transformed.hpp>
#include <boost/range/adaptor/filtered.hpp>
#include <boost/range/adaptor/map.hpp>
#include <seastar/core/coroutine.hh>
#include <seastar/coroutine/maybe_yield.hh>
#include <seastar/core/reactor.hh>
#include <seastar/json/json_elements.hh>
#include "system_keyspace.hh"
#include "types/types.hh"
#include "service/client_state.hh"
#include "service/query_state.hh"
#include "cql3/query_options.hh"
#include "cql3/query_processor.hh"
#include "cql3/untyped_result_set.hh"
#include "utils/fb_utilities.hh"
#include "utils/hash.hh"
#include "version.hh"
#include "thrift/server.hh"
#include "exceptions/exceptions.hh"
#include "cql3/query_processor.hh"
#include "query_context.hh"
#include "partition_slice_builder.hh"
#include "db/config.hh"
#include "gms/feature_service.hh"
#include "system_keyspace_view_types.hh"
#include "schema/schema_builder.hh"
#include "utils/hashers.hh"
#include "release.hh"
#include "log.hh"
#include <seastar/core/enum.hh>
#include "gms/inet_address.hh"
#include "index/secondary_index.hh"
#include "message/messaging_service.hh"
#include "mutation_query.hh"
#include "db/size_estimates_virtual_reader.hh"
#include "db/timeout_clock.hh"
#include "sstables/sstables.hh"
#include "db/view/build_progress_virtual_reader.hh"
#include "db/schema_tables.hh"
#include "index/built_indexes_virtual_reader.hh"
#include "gms/generation-number.hh"
#include "db/virtual_table.hh"
#include "service/storage_service.hh"
#include "protocol_server.hh"
#include "gms/gossiper.hh"
#include "service/paxos/paxos_state.hh"
#include "service/raft/raft_group_registry.hh"
#include "utils/build_id.hh"
#include "query-result-set.hh"
#include "idl/frozen_mutation.dist.hh"
#include "idl/frozen_mutation.dist.impl.hh"
#include <boost/algorithm/cxx11/any_of.hpp>
#include "client_data.hh"
#include "service/topology_state_machine.hh"
#include "sstables/open_info.hh"
#include "sstables/generation_type.hh"
#include "cdc/generation.hh"
#include "replica/tablets.hh"
#include "replica/query.hh"
// Duration type whose tick is one day (24 * 3600 seconds), counted in an int.
using days = std::chrono::duration<int, std::ratio<24 * 3600>>;
namespace db {
namespace {

// Static schema configurator: switches `use_null_sharder` on for a fixed list
// of tables in the system keyspace. Any other table is left untouched.
const auto set_null_sharder = schema_builder::register_static_configurator([](const sstring& ks_name, const sstring& cf_name, schema_static_props& props) {
    // system-keyspace tables that must be built with the null sharder
    static const std::unordered_set<sstring> null_sharded_tables = {
        schema_tables::SCYLLA_TABLE_SCHEMA_HISTORY,
        system_keyspace::RAFT,
        system_keyspace::RAFT_SNAPSHOTS,
        system_keyspace::RAFT_SNAPSHOT_CONFIG,
        system_keyspace::GROUP0_HISTORY,
        system_keyspace::DISCOVERY,
        system_keyspace::BROADCAST_KV_STORE,
        system_keyspace::TOPOLOGY,
        system_keyspace::CDC_GENERATIONS_V3,
        system_keyspace::TABLETS,
    };
    if (!(ks_name == system_keyspace::NAME)) {
        return;
    }
    if (null_sharded_tables.contains(cf_name)) {
        props.use_null_sharder = true;
    }
});

// Static schema configurator: switches `wait_for_sync_to_commitlog` on for a
// fixed list of tables in the system keyspace.
const auto set_wait_for_sync_to_commitlog = schema_builder::register_static_configurator([](const sstring& ks_name, const sstring& cf_name, schema_static_props& props) {
    static const std::unordered_set<sstring> synced_tables = {
        system_keyspace::PAXOS,
        system_keyspace::SCYLLA_LOCAL,
        system_keyspace::RAFT,
        system_keyspace::RAFT_SNAPSHOTS,
        system_keyspace::RAFT_SNAPSHOT_CONFIG,
        system_keyspace::DISCOVERY,
        system_keyspace::BROADCAST_KV_STORE,
        system_keyspace::TOPOLOGY,
        system_keyspace::CDC_GENERATIONS_V3,
        system_keyspace::TABLETS,
    };
    if (!(ks_name == system_keyspace::NAME)) {
        return;
    }
    if (synced_tables.contains(cf_name)) {
        props.wait_for_sync_to_commitlog = true;
    }
});

// Static schema configurator: for a fixed list of tables in the system
// keyspace, enables `use_schema_commitlog` and sets the load phase to phase2.
const auto set_use_schema_commitlog = schema_builder::register_static_configurator([](const sstring& ks_name, const sstring& cf_name, schema_static_props& props) {
    static const std::unordered_set<sstring> schema_commitlog_tables = {
        system_keyspace::RAFT,
        system_keyspace::RAFT_SNAPSHOTS,
        system_keyspace::RAFT_SNAPSHOT_CONFIG,
        system_keyspace::GROUP0_HISTORY,
        system_keyspace::DISCOVERY,
        system_keyspace::TABLETS,
    };
    if (!(ks_name == system_keyspace::NAME)) {
        return;
    }
    if (schema_commitlog_tables.contains(cf_name)) {
        props.use_schema_commitlog = true;
        props.load_phase = system_table_load_phase::phase2;
    }
});

}
// Global query context pointer; starts out empty.
std::unique_ptr<query_context> qctx = {};
// Logger for this file, registered under the name "system_keyspace".
static logging::logger slogger("system_keyspace");
// Timestamp taken once during static initialization; exposed through
// system_keyspace::schema_creation_timestamp().
static const api::timestamp_type creation_timestamp = api::new_timestamp();
// Returns the file-level `creation_timestamp`, i.e. the same value on every call.
api::timestamp_type system_keyspace::schema_creation_timestamp() {
return creation_timestamp;
}
// Increase whenever changing schema of any system table.
// FIXME: Make automatic by calculating from schema structure.
static const uint16_t version_sequence_number = 1;

// Produces the schema version of a system table as a name-based UUID over an
// MD5 digest. The digest is fed the table id followed by the sum of
// `version_sequence_number` and the caller-supplied `offset`, so the same
// (table_id, offset) pair always yields the same version.
table_schema_version system_keyspace::generate_schema_version(::table_id table_id, uint16_t offset) {
    md5_hasher hasher;
    feed_hash(hasher, table_id);
    // The sum is deliberately left as written (integral promotion applies);
    // what gets hashed must not change.
    feed_hash(hasher, version_sequence_number + offset);
    return table_schema_version(utils::UUID_gen::get_name_UUID(hasher.finalize()));
}
// Currently, the type variables (uuid_type, etc.) are thread-local reference-
// counted shared pointers. This forces us to also make the built in schemas
// below thread-local as well.
// We return schema_ptr, not schema&, because that's the "tradition" in our
// other code.
// We hide the thread_local variable inside a function, because if we later
// remove the thread_local, we'll start having initialization order
// problems (we need the type variables to be constructed first), and using
// functions will solve this problem. So we use functions right now.
// Schema of the HINTS table in the system keyspace ("hints awaiting delivery").
// The schema is built on first use and cached per thread.
schema_ptr system_keyspace::hints() {
    static thread_local auto cached_schema = [] {
        schema_builder builder(generate_legacy_id(NAME, HINTS), NAME, HINTS,
            // partition key columns
            {{"target_id", uuid_type}},
            // clustering key columns
            {{"hint_id", timeuuid_type}, {"message_version", int32_type}},
            // regular columns
            {{"mutation", bytes_type}},
            // static columns (none)
            {},
            // type of regular column names
            utf8_type,
            // table comment
            "hints awaiting delivery"
        );
        builder.set_gc_grace_seconds(0);
        builder.set_compaction_strategy_options({{ "enabled", "false" }});
        builder.with_version(generate_schema_version(builder.uuid()));
        // This table is built with compact storage.
        return builder.build(schema_builder::compact_storage::yes);
    }();
    return cached_schema;
}
// Schema of the BATCHLOG table in the system keyspace ("batches awaiting replay").
// The schema is built on first use and cached per thread.
schema_ptr system_keyspace::batchlog() {
    static thread_local auto cached_schema = [] {
        schema_builder builder(generate_legacy_id(NAME, BATCHLOG), NAME, BATCHLOG,
            // partition key columns
            {{"id", uuid_type}},
            // clustering key columns (none)
            {},
            // regular columns
            {{"data", bytes_type}, {"version", int32_type}, {"written_at", timestamp_type}},
            // static columns (none)
            {},
            // type of regular column names
            utf8_type,
            // table comment
            "batches awaiting replay"
            // FIXME: the original Java code also had:
            // operations on resulting CFMetaData:
            // .compactionStrategyOptions(Collections.singletonMap("min_threshold", "2"))
        );
        builder.set_gc_grace_seconds(0);
        builder.with_version(generate_schema_version(builder.uuid()));
        return builder.build(schema_builder::compact_storage::no);
    }();
    return cached_schema;
}
// Schema of the PAXOS table in the system keyspace ("in-progress paxos proposals").
// The schema is built on first use and cached per thread.
/*static*/ schema_ptr system_keyspace::paxos() {
    static thread_local auto cached_schema = [] {
        // FIXME: switch to the new schema_builder interface (with_column(...), etc)
        schema_builder builder(generate_legacy_id(NAME, PAXOS), NAME, PAXOS,
            // partition key columns
            // row_key: byte representation of a row key that hashes to the same token as original
            {{"row_key", bytes_type}},
            // clustering key columns
            {{"cf_id", uuid_type}},
            // regular columns
            {
                {"promise", timeuuid_type},
                // serialization format is defined by frozen_mutation idl
                {"most_recent_commit", bytes_type},
                {"most_recent_commit_at", timeuuid_type},
                // serialization format is defined by frozen_mutation idl
                {"proposal", bytes_type},
                {"proposal_ballot", timeuuid_type},
            },
            // static columns (none)
            {},
            // type of regular column names
            utf8_type,
            // table comment
            "in-progress paxos proposals"
            // FIXME: the original Java code also had:
            // operations on resulting CFMetaData:
            // .compactionStrategyClass(LeveledCompactionStrategy.class);
        );
        builder.set_gc_grace_seconds(0);
        builder.with_version(generate_schema_version(builder.uuid()));
        return builder.build(schema_builder::compact_storage::no);
    }();
    return cached_schema;
}
// Schema of the TOPOLOGY table in the system keyspace
// ("Current state of topology change machine").
// Partitioned by "key", clustered by "host_id"; the columns in the second
// group below are static. Built on first use and cached per thread.
schema_ptr system_keyspace::topology() {
    static thread_local auto cached_schema = [] {
        auto legacy_id = generate_legacy_id(NAME, TOPOLOGY);
        schema_builder builder(NAME, TOPOLOGY, std::optional(legacy_id));

        // keys and regular (per-row) columns
        builder.with_column("key", utf8_type, column_kind::partition_key);
        builder.with_column("host_id", uuid_type, column_kind::clustering_key);
        builder.with_column("datacenter", utf8_type);
        builder.with_column("rack", utf8_type);
        builder.with_column("tokens", set_type_impl::get_instance(utf8_type, true));
        builder.with_column("node_state", utf8_type);
        builder.with_column("release_version", utf8_type);
        builder.with_column("topology_request", utf8_type);
        builder.with_column("replaced_id", uuid_type);
        builder.with_column("rebuild_option", utf8_type);
        builder.with_column("num_tokens", int32_type);
        builder.with_column("shard_count", int32_type);
        builder.with_column("ignore_msb", int32_type);
        builder.with_column("supported_features", set_type_impl::get_instance(utf8_type, true));

        // static (per-partition) columns
        builder.with_column("new_cdc_generation_data_uuid", uuid_type, column_kind::static_column);
        builder.with_column("version", long_type, column_kind::static_column);
        builder.with_column("transition_state", utf8_type, column_kind::static_column);
        builder.with_column("current_cdc_generation_uuid", uuid_type, column_kind::static_column);
        builder.with_column("current_cdc_generation_timestamp", timestamp_type, column_kind::static_column);
        builder.with_column("global_topology_request", utf8_type, column_kind::static_column);
        builder.with_column("enabled_features", set_type_impl::get_instance(utf8_type, true), column_kind::static_column);

        builder.set_comment("Current state of topology change machine");
        builder.with_version(generate_schema_version(legacy_id));
        return builder.build();
    }();
    return cached_schema;
}
// Type of the "streams" column below; defined in another translation unit.
extern thread_local data_type cdc_streams_set_type;
/* An internal table used by nodes to store CDC generation data.
 * Written to by Raft Group 0.
 *
 * The schema is built on first use and cached per thread. */
schema_ptr system_keyspace::cdc_generations_v3() {
    // Spelled `static thread_local`, with `std::optional(id)` and an unqualified
    // generate_schema_version(), to match the other schema accessors in this file.
    static thread_local auto schema = [] {
        auto id = generate_legacy_id(NAME, CDC_GENERATIONS_V3);
        return schema_builder(NAME, CDC_GENERATIONS_V3, std::optional(id))
            /* The unique identifier of this generation. */
            .with_column("id", uuid_type, column_kind::partition_key)
            /* The generation describes a mapping from all tokens in the token ring to a set of stream IDs.
             * This mapping is built from a bunch of smaller mappings, each describing how tokens in a
             * subrange of the token ring are mapped to stream IDs; these subranges together cover the entire
             * token ring. Each such range-local mapping is represented by a row of this table. The
             * clustering key of the row is the end of the range being described by this row. The start of
             * this range is the range_end of the previous row (in the clustering order, which is the integer
             * order) or of the last row of this partition if this is the first row. */
            .with_column("range_end", long_type, column_kind::clustering_key)
            /* The set of streams mapped to in this range. The number of streams mapped to a single range in
             * a CDC generation is bounded from above by the number of shards on the owner of that range in
             * the token ring. In other words, the number of elements of this set is bounded by the maximum
             * of the number of shards over all nodes. The serialized size is obtained by counting about 20B
             * for each stream. For example, if all nodes in the cluster have at most 128 shards, the
             * serialized size of this set will be bounded by ~2.5 KB. */
            .with_column("streams", cdc_streams_set_type)
            /* The value of the `ignore_msb` sharding parameter of the node which was the owner of this token
             * range when the generation was first created. Together with the set of streams above it fully
             * describes the mapping for this particular range. */
            .with_column("ignore_msb", byte_type)
            /* Column used for sanity checking. For a given generation it's equal to the number of ranges in
             * this generation; thus, after the generation is fully inserted, it must be equal to the number
             * of rows in the partition. */
            .with_column("num_ranges", int32_type, column_kind::static_column)
            .with_version(generate_schema_version(id))
            .build();
    }();
    return schema;
}
// Schema of the RAFT table in the system keyspace
// ("Persisted RAFT log, votes and snapshot info").
// Log entries are rows keyed by (group_id, index); the vote, snapshot id and
// commit index are static columns of the group's partition.
// Built on first use and cached per thread.
schema_ptr system_keyspace::raft() {
    static thread_local auto cached_schema = [] {
        auto legacy_id = generate_legacy_id(NAME, RAFT);
        schema_builder builder(NAME, RAFT, std::optional(legacy_id));

        builder.with_column("group_id", timeuuid_type, column_kind::partition_key);

        // the raft log itself
        builder.with_column("index", long_type, column_kind::clustering_key);
        builder.with_column("term", long_type);
        // decltype(raft::log_entry::data) - serialized variant
        builder.with_column("data", bytes_type);

        // persisted term and vote
        builder.with_column("vote_term", long_type, column_kind::static_column);
        builder.with_column("vote", uuid_type, column_kind::static_column);

        // id of the most recent persisted snapshot
        builder.with_column("snapshot_id", uuid_type, column_kind::static_column);
        builder.with_column("commit_idx", long_type, column_kind::static_column);

        builder.set_comment("Persisted RAFT log, votes and snapshot info");
        builder.with_version(generate_schema_version(legacy_id));
        return builder.build();
    }();
    return cached_schema;
}
// Schema of the RAFT_SNAPSHOTS table ("Persisted RAFT snapshot descriptors info").
//
// Note that this table does not include the actual user snapshot data, since
// that depends on the user-provided state machine and could be stored anywhere
// else in any other form. Treat a row as a snapshot descriptor instead.
//
// Built on first use and cached per thread.
schema_ptr system_keyspace::raft_snapshots() {
    static thread_local auto cached_schema = [] {
        auto legacy_id = generate_legacy_id(NAME, RAFT_SNAPSHOTS);
        schema_builder builder(NAME, RAFT_SNAPSHOTS, std::optional(legacy_id));

        builder.with_column("group_id", timeuuid_type, column_kind::partition_key);
        builder.with_column("snapshot_id", uuid_type);
        // index and term of the last entry covered by the snapshot
        builder.with_column("idx", long_type);
        builder.with_column("term", long_type);

        builder.set_comment("Persisted RAFT snapshot descriptors info");
        builder.with_version(generate_schema_version(legacy_id));
        return builder.build();
    }();
    return cached_schema;
}
// Schema of the RAFT_SNAPSHOT_CONFIG table
// ("RAFT configuration for the latest snapshot descriptor").
// One row per (group_id, disposition, server_id).
// Built on first use and cached per thread.
schema_ptr system_keyspace::raft_snapshot_config() {
    static thread_local auto cached_schema = [] {
        auto legacy_id = generate_legacy_id(system_keyspace::NAME, RAFT_SNAPSHOT_CONFIG);
        schema_builder builder(system_keyspace::NAME, RAFT_SNAPSHOT_CONFIG, std::optional(legacy_id));

        builder.with_column("group_id", timeuuid_type, column_kind::partition_key);
        // disposition can be `CURRENT` or `PREVIOUS`
        builder.with_column("disposition", ascii_type, column_kind::clustering_key);
        builder.with_column("server_id", uuid_type, column_kind::clustering_key);
        builder.with_column("can_vote", boolean_type);

        builder.set_comment("RAFT configuration for the latest snapshot descriptor");
        builder.with_version(generate_schema_version(legacy_id));
        return builder.build();
    }();
    return cached_schema;
}
// Schema of the REPAIR_HISTORY table ("Record repair history").
// Partitioned by table_uuid; each row identifies one repaired token range of
// one repair. Keyspace and table names are static columns.
// Built on first use and cached per thread.
schema_ptr system_keyspace::repair_history() {
    static thread_local auto cached_schema = [] {
        auto legacy_id = generate_legacy_id(NAME, REPAIR_HISTORY);
        schema_builder builder(NAME, REPAIR_HISTORY, std::optional(legacy_id));

        builder.with_column("table_uuid", uuid_type, column_kind::partition_key);
        // repair_time is the time the repair started
        builder.with_column("repair_time", timestamp_type, column_kind::clustering_key);
        builder.with_column("repair_uuid", uuid_type, column_kind::clustering_key);
        // the token range is (range_start, range_end]
        builder.with_column("range_start", long_type, column_kind::clustering_key);
        builder.with_column("range_end", long_type, column_kind::clustering_key);
        builder.with_column("keyspace_name", utf8_type, column_kind::static_column);
        builder.with_column("table_name", utf8_type, column_kind::static_column);

        builder.set_comment("Record repair history");
        builder.with_version(generate_schema_version(legacy_id));
        return builder.build();
    }();
    return cached_schema;
}
// Schema of the BUILT_INDEXES table in the system keyspace ("built column indexes").
// The schema is built on first use and cached per thread.
schema_ptr system_keyspace::built_indexes() {
    static thread_local auto cached_schema = [] {
        schema_builder builder(generate_legacy_id(NAME, BUILT_INDEXES), NAME, BUILT_INDEXES,
            // partition key columns
            // beware: despite its name, "table_name" holds the name of the keyspace
            {{"table_name", utf8_type}},
            // clustering key columns
            {{"index_name", utf8_type}},
            // regular columns (none)
            {},
            // static columns (none)
            {},
            // type of regular column names
            utf8_type,
            // table comment
            "built column indexes"
        );
        builder.set_gc_grace_seconds(0);
        builder.with_version(generate_schema_version(builder.uuid()));
        // This table is built with compact storage.
        return builder.build(schema_builder::compact_storage::yes);
    }();
    return cached_schema;
}
// Schema of the LOCAL table in the system keyspace
// ("information about the local node").
// The schema is built on first use and cached per thread.
/*static*/ schema_ptr system_keyspace::local() {
    static thread_local auto cached_schema = [] {
        schema_builder builder(generate_legacy_id(NAME, LOCAL), NAME, LOCAL,
            // partition key columns
            {{"key", utf8_type}},
            // clustering key columns (none)
            {},
            // regular columns
            {
                {"bootstrapped", utf8_type},
                {"cluster_name", utf8_type},
                {"cql_version", utf8_type},
                {"data_center", utf8_type},
                {"gossip_generation", int32_type},
                {"host_id", uuid_type},
                {"native_protocol_version", utf8_type},
                {"partitioner", utf8_type},
                {"rack", utf8_type},
                {"release_version", utf8_type},
                {"schema_version", uuid_type},
                {"thrift_version", utf8_type},
                {"tokens", set_type_impl::get_instance(utf8_type, true)},
                {"truncated_at", map_type_impl::get_instance(uuid_type, bytes_type, true)},
                // The following 3 columns are only present up until 2.1.8 tables
                {"rpc_address", inet_addr_type},
                {"broadcast_address", inet_addr_type},
                {"listen_address", inet_addr_type},
                // Advertised local features, i.e. the features the node
                // advertises via gossip after passing the feature check
                // against remote features in the cluster
                {"supported_features", utf8_type},
                {"scylla_cpu_sharding_algorithm", utf8_type},
                {"scylla_nr_shards", int32_type},
                {"scylla_msb_ignore", int32_type},
            },
            // static columns (none)
            {},
            // type of regular column names
            utf8_type,
            // table comment
            "information about the local node"
        );
        builder.set_gc_grace_seconds(0);
        builder.with_version(generate_schema_version(builder.uuid()));
        // The three scylla_* sharding columns declared above are removed again
        // here. NOTE(review): presumably so that they are recorded as dropped
        // columns of this table -- confirm against schema_builder::remove_column.
        builder.remove_column("scylla_cpu_sharding_algorithm");
        builder.remove_column("scylla_nr_shards");
        builder.remove_column("scylla_msb_ignore");
        return builder.build(schema_builder::compact_storage::no);
    }();
    return cached_schema;
}
// Schema of the PEERS table in the system keyspace
// ("information about known peers in the cluster").
// The schema is built on first use and cached per thread.
/*static*/ schema_ptr system_keyspace::peers() {
    // Extra offset mixed into this table's schema version (currently zero).
    constexpr uint16_t schema_version_offset = 0;
    static thread_local auto cached_schema = [] {
        schema_builder builder(generate_legacy_id(NAME, PEERS), NAME, PEERS,
            // partition key columns
            {{"peer", inet_addr_type}},
            // clustering key columns (none)
            {},
            // regular columns
            {
                {"data_center", utf8_type},
                {"host_id", uuid_type},
                {"preferred_ip", inet_addr_type},
                {"rack", utf8_type},
                {"release_version", utf8_type},
                {"rpc_address", inet_addr_type},
                {"schema_version", uuid_type},
                {"tokens", set_type_impl::get_instance(utf8_type, true)},
                {"supported_features", utf8_type},
            },
            // static columns (none)
            {},
            // type of regular column names
            utf8_type,
            // table comment
            "information about known peers in the cluster"
        );
        builder.set_gc_grace_seconds(0);
        builder.with_version(generate_schema_version(builder.uuid(), schema_version_offset));
        return builder.build(schema_builder::compact_storage::no);
    }();
    return cached_schema;
}
// Schema of the PEER_EVENTS table in the system keyspace ("events related to peers").
// The schema is built on first use and cached per thread.
/*static*/ schema_ptr system_keyspace::peer_events() {
    static thread_local auto cached_schema = [] {
        schema_builder builder(generate_legacy_id(NAME, PEER_EVENTS), NAME, PEER_EVENTS,
            // partition key columns
            {{"peer", inet_addr_type}},
            // clustering key columns (none)
            {},
            // regular columns
            {
                {"hints_dropped", map_type_impl::get_instance(uuid_type, int32_type, true)},
            },
            // static columns (none)
            {},
            // type of regular column names
            utf8_type,
            // table comment
            "events related to peers"
        );
        builder.set_gc_grace_seconds(0);
        builder.with_version(generate_schema_version(builder.uuid()));
        return builder.build(schema_builder::compact_storage::no);
    }();
    return cached_schema;
}
// Schema of the RANGE_XFERS table in the system keyspace
// ("ranges requested for transfer").
// The schema is built on first use and cached per thread.
/*static*/ schema_ptr system_keyspace::range_xfers() {
    static thread_local auto cached_schema = [] {
        schema_builder builder(generate_legacy_id(NAME, RANGE_XFERS), NAME, RANGE_XFERS,
            // partition key columns
            {{"token_bytes", bytes_type}},
            // clustering key columns (none)
            {},
            // regular columns
            {{"requested_at", timestamp_type}},
            // static columns (none)
            {},
            // type of regular column names
            utf8_type,
            // table comment
            "ranges requested for transfer"
        );
        builder.set_gc_grace_seconds(0);
        builder.with_version(generate_schema_version(builder.uuid()));
        return builder.build(schema_builder::compact_storage::no);
    }();
    return cached_schema;
}
// Schema of the COMPACTIONS_IN_PROGRESS table in the system keyspace
// ("unfinished compactions").
// Unlike most sibling tables, gc_grace_seconds is not overridden here.
// The schema is built on first use and cached per thread.
/*static*/ schema_ptr system_keyspace::compactions_in_progress() {
    static thread_local auto cached_schema = [] {
        schema_builder builder(generate_legacy_id(NAME, COMPACTIONS_IN_PROGRESS), NAME, COMPACTIONS_IN_PROGRESS,
            // partition key columns
            {{"id", uuid_type}},
            // clustering key columns (none)
            {},
            // regular columns
            {
                {"columnfamily_name", utf8_type},
                {"inputs", set_type_impl::get_instance(int32_type, true)},
                {"keyspace_name", utf8_type},
            },
            // static columns (none)
            {},
            // type of regular column names
            utf8_type,
            // table comment
            "unfinished compactions"
        );
        builder.with_version(generate_schema_version(builder.uuid()));
        return builder.build(schema_builder::compact_storage::no);
    }();
    return cached_schema;
}
// Schema of the COMPACTION_HISTORY table in the system keyspace
// ("week-long compaction history"). Rows get a default TTL of seven days.
// The schema is built on first use and cached per thread.
/*static*/ schema_ptr system_keyspace::compaction_history() {
    static thread_local auto cached_schema = [] {
        schema_builder builder(generate_legacy_id(NAME, COMPACTION_HISTORY), NAME, COMPACTION_HISTORY,
            // partition key columns
            {{"id", uuid_type}},
            // clustering key columns (none)
            {},
            // regular columns
            {
                {"bytes_in", long_type},
                {"bytes_out", long_type},
                {"columnfamily_name", utf8_type},
                {"compacted_at", timestamp_type},
                {"keyspace_name", utf8_type},
                {"rows_merged", map_type_impl::get_instance(int32_type, long_type, true)},
            },
            // static columns (none)
            {},
            // type of regular column names
            utf8_type,
            // table comment
            "week-long compaction history"
        );
        // Keep entries for one week, expressed in seconds.
        builder.set_default_time_to_live(std::chrono::duration_cast<std::chrono::seconds>(days(7)));
        builder.with_version(generate_schema_version(builder.uuid()));
        return builder.build(schema_builder::compact_storage::no);
    }();
    return cached_schema;
}
// Schema of the SSTABLE_ACTIVITY table in the system keyspace
// ("historic sstable read rates"). Uses a three-column composite partition key.
// The schema is built on first use and cached per thread.
/*static*/ schema_ptr system_keyspace::sstable_activity() {
    static thread_local auto cached_schema = [] {
        schema_builder builder(generate_legacy_id(NAME, SSTABLE_ACTIVITY), NAME, SSTABLE_ACTIVITY,
            // partition key columns
            {
                {"keyspace_name", utf8_type},
                {"columnfamily_name", utf8_type},
                {"generation", int32_type},
            },
            // clustering key columns (none)
            {},
            // regular columns
            {
                {"rate_120m", double_type},
                {"rate_15m", double_type},
            },
            // static columns (none)
            {},
            // type of regular column names
            utf8_type,
            // table comment
            "historic sstable read rates"
        );
        builder.with_version(generate_schema_version(builder.uuid()));
        return builder.build(schema_builder::compact_storage::no);
    }();
    return cached_schema;
}
// Schema of the SIZE_ESTIMATES table in the system keyspace
// ("per-table primary range size estimates").
// The schema is built on first use and cached per thread.
schema_ptr system_keyspace::size_estimates() {
    static thread_local auto cached_schema = [] {
        schema_builder builder(generate_legacy_id(NAME, SIZE_ESTIMATES), NAME, SIZE_ESTIMATES,
            // partition key columns
            {{"keyspace_name", utf8_type}},
            // clustering key columns
            {{"table_name", utf8_type}, {"range_start", utf8_type}, {"range_end", utf8_type}},
            // regular columns
            {
                {"mean_partition_size", long_type},
                {"partitions_count", long_type},
            },
            // static columns (none)
            {},
            // type of regular column names
            utf8_type,
            // table comment
            "per-table primary range size estimates"
        );
        builder.set_gc_grace_seconds(0);
        builder.with_version(generate_schema_version(builder.uuid()));
        return builder.build(schema_builder::compact_storage::no);
    }();
    return cached_schema;
}
// Schema of the LARGE_PARTITIONS table in the system keyspace
// ("partitions larger than specified threshold"). Caching is disabled for it.
// The schema is built on first use and cached per thread.
/*static*/ schema_ptr system_keyspace::large_partitions() {
    static thread_local auto cached_schema = [] {
        schema_builder builder(generate_legacy_id(NAME, LARGE_PARTITIONS), NAME, LARGE_PARTITIONS,
            // partition key columns
            {{"keyspace_name", utf8_type}, {"table_name", utf8_type}},
            // clustering key columns; partition_size uses a reversed type,
            // i.e. CLUSTERING ORDER BY (partition_size DESC)
            {
                {"sstable_name", utf8_type},
                {"partition_size", reversed_type_impl::get_instance(long_type)},
                {"partition_key", utf8_type}
            },
            // regular columns
            {
                {"rows", long_type},
                {"compaction_time", timestamp_type}
            },
            // static columns (none)
            {},
            // type of regular column names
            utf8_type,
            // table comment
            "partitions larger than specified threshold"
        );
        builder.set_gc_grace_seconds(0);
        builder.with_version(generate_schema_version(builder.uuid()));
        // FIXME re-enable caching for this and the other two
        // system.large_* tables once
        // https://github.com/scylladb/scylla/issues/3288 is fixed
        builder.set_caching_options(caching_options::get_disabled_caching_options());
        return builder.build(schema_builder::compact_storage::no);
    }();
    return cached_schema;
}
// Schema of the LARGE_ROWS table in the system keyspace
// ("rows larger than specified threshold"). Caching is disabled for it.
// The schema is built on first use and cached per thread.
schema_ptr system_keyspace::large_rows() {
    static thread_local auto cached_schema = [] {
        auto legacy_id = generate_legacy_id(NAME, LARGE_ROWS);
        schema_builder builder(NAME, LARGE_ROWS, std::optional(legacy_id));

        builder.with_column("keyspace_name", utf8_type, column_kind::partition_key);
        builder.with_column("table_name", utf8_type, column_kind::partition_key);
        builder.with_column("sstable_name", utf8_type, column_kind::clustering_key);
        // reversed_type_impl makes the largest rows sort first
        builder.with_column("row_size", reversed_type_impl::get_instance(long_type), column_kind::clustering_key);
        builder.with_column("partition_key", utf8_type, column_kind::clustering_key);
        builder.with_column("clustering_key", utf8_type, column_kind::clustering_key);
        builder.with_column("compaction_time", timestamp_type);

        builder.set_comment("rows larger than specified threshold");
        builder.with_version(generate_schema_version(legacy_id));
        builder.set_gc_grace_seconds(0);
        builder.set_caching_options(caching_options::get_disabled_caching_options());
        return builder.build();
    }();
    return cached_schema;
}
// Schema of the LARGE_CELLS table in the system keyspace
// ("cells larger than specified threshold"). Caching is disabled for it.
// The schema is built on first use and cached per thread.
schema_ptr system_keyspace::large_cells() {
    constexpr uint16_t schema_version_offset = 1; // collection_elements
    static thread_local auto cached_schema = [] {
        auto legacy_id = generate_legacy_id(NAME, LARGE_CELLS);
        schema_builder builder(NAME, LARGE_CELLS, legacy_id);

        builder.with_column("keyspace_name", utf8_type, column_kind::partition_key);
        builder.with_column("table_name", utf8_type, column_kind::partition_key);
        builder.with_column("sstable_name", utf8_type, column_kind::clustering_key);
        // reversed_type_impl makes the largest cells sort first
        builder.with_column("cell_size", reversed_type_impl::get_instance(long_type), column_kind::clustering_key);
        builder.with_column("partition_key", utf8_type, column_kind::clustering_key);
        builder.with_column("clustering_key", utf8_type, column_kind::clustering_key);
        builder.with_column("column_name", utf8_type, column_kind::clustering_key);

        // regular columns
        builder.with_column("collection_elements", long_type);
        builder.with_column("compaction_time", timestamp_type);

        builder.set_comment("cells larger than specified threshold");
        builder.with_version(generate_schema_version(legacy_id, schema_version_offset));
        builder.set_gc_grace_seconds(0);
        builder.set_caching_options(caching_options::get_disabled_caching_options());
        return builder.build();
    }();
    return cached_schema;
}
// Schema of the SCYLLA_LOCAL table in the system keyspace
// ("Scylla specific information about the local node"): a key -> value table.
// The schema is built on first use and cached per thread.
/*static*/ schema_ptr system_keyspace::scylla_local() {
    static thread_local auto cached_schema = [] {
        schema_builder builder(generate_legacy_id(NAME, SCYLLA_LOCAL), NAME, SCYLLA_LOCAL,
            // partition key columns
            {{"key", utf8_type}},
            // clustering key columns (none)
            {},
            // regular columns
            {
                {"value", utf8_type},
            },
            // static columns (none)
            {},
            // type of regular column names
            utf8_type,
            // table comment
            "Scylla specific information about the local node"
        );
        builder.set_gc_grace_seconds(0);
        builder.with_version(generate_schema_version(builder.uuid()));
        return builder.build(schema_builder::compact_storage::no);
    }();
    return cached_schema;
}
// Schema of the v3 BATCHES table in the system keyspace ("batches awaiting replay").
// Uses size-tiered compaction with min_threshold set to "2".
// The schema is built on first use and cached per thread.
schema_ptr system_keyspace::v3::batches() {
    static thread_local auto schema = [] {
        schema_builder builder(generate_legacy_id(NAME, BATCHES), NAME, BATCHES,
            // partition key
            {{"id", timeuuid_type}},
            // clustering key
            {},
            // regular columns
            {{"mutations", list_type_impl::get_instance(bytes_type, true)}, {"version", int32_type}},
            // static columns
            {},
            // regular column name type
            utf8_type,
            // comment
            "batches awaiting replay"
        );
        // FIXME: the original Java code also had:
        //.copy(new LocalPartitioner(TimeUUIDType.instance))
        // gc_grace_seconds is set exactly once; the original repeated this
        // call with the same value.
        builder.set_gc_grace_seconds(0);
        builder.set_compaction_strategy(sstables::compaction_strategy_type::size_tiered);
        builder.set_compaction_strategy_options({{"min_threshold", "2"}});
        builder.with_version(generate_schema_version(builder.uuid()));
        return builder.build(schema_builder::compact_storage::no);
    }();
    return schema;
}
// v3 accessor for the built-indexes schema; delegates to
// db::system_keyspace::built_indexes().
schema_ptr system_keyspace::v3::built_indexes() {
// identical to ours, but ours otoh is a mix-in of the 3.x series cassandra one
return db::system_keyspace::built_indexes();
}
// Schema of the v3 LOCAL table in the system keyspace
// ("information about the local node").
// The schema is built on first use and cached per thread.
schema_ptr system_keyspace::v3::local() {
    static thread_local auto cached_schema = [] {
        schema_builder builder(generate_legacy_id(NAME, LOCAL), NAME, LOCAL,
            // partition key columns
            {{"key", utf8_type}},
            // clustering key columns (none)
            {},
            // regular columns
            {
                {"bootstrapped", utf8_type},
                {"broadcast_address", inet_addr_type},
                {"cluster_name", utf8_type},
                {"cql_version", utf8_type},
                {"data_center", utf8_type},
                {"gossip_generation", int32_type},
                {"host_id", uuid_type},
                {"listen_address", inet_addr_type},
                {"native_protocol_version", utf8_type},
                {"partitioner", utf8_type},
                {"rack", utf8_type},
                {"release_version", utf8_type},
                {"rpc_address", inet_addr_type},
                {"schema_version", uuid_type},
                {"thrift_version", utf8_type},
                {"tokens", set_type_impl::get_instance(utf8_type, true)},
                {"truncated_at", map_type_impl::get_instance(uuid_type, bytes_type, true)},
            },
            // static columns (none)
            {},
            // type of regular column names
            utf8_type,
            // table comment
            "information about the local node"
        );
        builder.set_gc_grace_seconds(0);
        builder.with_version(generate_schema_version(builder.uuid()));
        return builder.build(schema_builder::compact_storage::no);
    }();
    return cached_schema;
}
// Schema of the v3 TRUNCATED table in the system keyspace
// ("information about table truncation").
// One row per (table_uuid, shard); truncated_at is a static column.
// The schema is built on first use and cached per thread.
schema_ptr system_keyspace::v3::truncated() {
    static thread_local auto cached_schema = [] {
        schema_builder builder(generate_legacy_id(NAME, TRUNCATED), NAME, TRUNCATED,
            // partition key columns
            {{"table_uuid", uuid_type}},
            // clustering key columns
            {{"shard", int32_type}},
            // regular columns
            {
                {"position", int32_type},
                {"segment_id", long_type}
            },
            // static columns
            {
                {"truncated_at", timestamp_type},
            },
            // type of regular column names
            utf8_type,
            // table comment
            "information about table truncation"
        );
        builder.set_gc_grace_seconds(0);
        builder.with_version(generate_schema_version(builder.uuid()));
        return builder.build(schema_builder::compact_storage::no);
    }();
    return cached_schema;
}
// v3 accessor for the peers schema; delegates to db::system_keyspace::peers().
schema_ptr system_keyspace::v3::peers() {
// identical to the non-v3 schema, so reuse it
return db::system_keyspace::peers();
}
// v3 accessor for the peer-events schema; delegates to
// db::system_keyspace::peer_events().
schema_ptr system_keyspace::v3::peer_events() {
// identical to the non-v3 schema, so reuse it
return db::system_keyspace::peer_events();
}
// v3 accessor for the range-transfers schema; delegates to
// db::system_keyspace::range_xfers().
schema_ptr system_keyspace::v3::range_xfers() {
// identical to the non-v3 schema, so reuse it
return db::system_keyspace::range_xfers();
}
// v3 accessor for the compaction-history schema; delegates to
// db::system_keyspace::compaction_history().
schema_ptr system_keyspace::v3::compaction_history() {
// identical to the non-v3 schema, so reuse it
return db::system_keyspace::compaction_history();
}
// v3 accessor for the sstable-activity schema; delegates to
// db::system_keyspace::sstable_activity().
schema_ptr system_keyspace::v3::sstable_activity() {
// identical to the non-v3 schema, so reuse it
return db::system_keyspace::sstable_activity();
}
// v3 accessor for the size-estimates schema; delegates to
// db::system_keyspace::size_estimates().
schema_ptr system_keyspace::v3::size_estimates() {
// identical to the non-v3 schema, so reuse it
return db::system_keyspace::size_estimates();
}
// v3 accessor for the large-partitions schema; delegates to
// db::system_keyspace::large_partitions().
schema_ptr system_keyspace::v3::large_partitions() {
// identical to the non-v3 schema, so reuse it
return db::system_keyspace::large_partitions();
}
// v3 accessor for the scylla-local schema; delegates to
// db::system_keyspace::scylla_local().
schema_ptr system_keyspace::v3::scylla_local() {
// identical to the non-v3 schema, so reuse it
return db::system_keyspace::scylla_local();
}
// Schema of the v3 AVAILABLE_RANGES table: one partition per keyspace_name
// with a single "ranges" column holding a set of blobs.
// The schema object is built on first use and cached per thread.
schema_ptr system_keyspace::v3::available_ranges() {
static thread_local auto schema = [] {
schema_builder builder(generate_legacy_id(NAME, AVAILABLE_RANGES), NAME, AVAILABLE_RANGES,
// partition key
{{"keyspace_name", utf8_type}},
// clustering key
{},
// regular columns
{{"ranges", set_type_impl::get_instance(bytes_type, true)}},
// static columns
{},
// regular column name type
utf8_type,
// comment
"available keyspace/ranges during bootstrap/replace that are ready to be served"
);
builder.set_gc_grace_seconds(0);
// The schema version is derived from the table id.
builder.with_version(generate_schema_version(builder.uuid()));
return builder.build();
}();
return schema;
}
// Schema of the v3 VIEWS_BUILDS_IN_PROGRESS table: partitioned by
// keyspace_name, clustered by view_name, with last_token and
// generation_number as regular columns.
// The schema object is built on first use and cached per thread.
schema_ptr system_keyspace::v3::views_builds_in_progress() {
static thread_local auto schema = [] {
schema_builder builder(generate_legacy_id(NAME, VIEWS_BUILDS_IN_PROGRESS), NAME, VIEWS_BUILDS_IN_PROGRESS,
// partition key
{{"keyspace_name", utf8_type}},
// clustering key
{{"view_name", utf8_type}},
// regular columns
{{"last_token", utf8_type}, {"generation_number", int32_type}},
// static columns
{},
// regular column name type
utf8_type,
// comment
"views builds current progress"
);
builder.set_gc_grace_seconds(0);
// The schema version is derived from the table id.
builder.with_version(generate_schema_version(builder.uuid()));
return builder.build();
}();
return schema;
}
// Schema of the v3 BUILT_VIEWS table: partitioned by keyspace_name and
// clustered by view_name; it has no regular or static columns, so a row's
// existence is the only information stored.
// The schema object is built on first use and cached per thread.
schema_ptr system_keyspace::v3::built_views() {
static thread_local auto schema = [] {
schema_builder builder(generate_legacy_id(NAME, BUILT_VIEWS), NAME, BUILT_VIEWS,
// partition key
{{"keyspace_name", utf8_type}},
// clustering key
{{"view_name", utf8_type}},
// regular columns
{},
// static columns
{},
// regular column name type
utf8_type,
// comment
"built views"
);
builder.set_gc_grace_seconds(0);
// The schema version is derived from the table id.
builder.with_version(generate_schema_version(builder.uuid()));
return builder.build();
}();
return schema;
}
// Schema of the SCYLLA_VIEWS_BUILDS_IN_PROGRESS table: partitioned by
// keyspace_name, clustered by (view_name, cpu_id), with next_token,
// generation_number and first_token as regular columns.
schema_ptr system_keyspace::v3::scylla_views_builds_in_progress() {
    // Constructed on first use; every later call on this thread returns the cached pointer.
    static thread_local auto cached = [] {
        auto table_uuid = generate_legacy_id(NAME, SCYLLA_VIEWS_BUILDS_IN_PROGRESS);
        schema_builder sb(NAME, SCYLLA_VIEWS_BUILDS_IN_PROGRESS, std::make_optional(table_uuid));
        sb.with_column("keyspace_name", utf8_type, column_kind::partition_key);
        sb.with_column("view_name", utf8_type, column_kind::clustering_key);
        sb.with_column("cpu_id", int32_type, column_kind::clustering_key);
        sb.with_column("next_token", utf8_type);
        sb.with_column("generation_number", int32_type);
        sb.with_column("first_token", utf8_type);
        // The schema version is derived from the table id.
        sb.with_version(generate_schema_version(table_uuid));
        return sb.build();
    }();
    return cached;
}
// Schema of the CDC_LOCAL table ("CDC-specific information that the local
// node stores"): partitioned by a text key, no clustering key, with the
// generation identifier stored in the streams_timestamp and uuid columns.
// The schema object is built on first use and cached per thread.
/*static*/ schema_ptr system_keyspace::v3::cdc_local() {
static thread_local auto cdc_local = [] {
schema_builder builder(generate_legacy_id(NAME, CDC_LOCAL), NAME, CDC_LOCAL,
// partition key
{{"key", utf8_type}},
// clustering key
{},
// regular columns
{
/* Every node announces the identifier of the newest known CDC generation to other nodes.
* The identifier consists of two things: a timestamp (which is the generation's timestamp,
* denoting the time point from which it starts operating) and an UUID (randomly generated
* when the generation is created).
* This identifier is persisted here and restored on node restart.
*
* Some identifiers - identifying generations created in older clusters - have only the timestamp.
* For these the uuid column is empty.
*/
{"streams_timestamp", timestamp_type},
{"uuid", uuid_type},
},
// static columns
{},
// regular column name type
utf8_type,
// comment
"CDC-specific information that the local node stores"
);
builder.set_gc_grace_seconds(0);
// The schema version is derived from the table id.
builder.with_version(generate_schema_version(builder.uuid()));
return builder.build(schema_builder::compact_storage::no);
}();
return cdc_local;
}
// Schema of the GROUP0_HISTORY table ("History of Raft group 0 state changes").
schema_ptr system_keyspace::group0_history() {
    // Constructed on first use; every later call on this thread returns the cached pointer.
    static thread_local auto cached = [] {
        auto table_uuid = generate_legacy_id(NAME, GROUP0_HISTORY);
        schema_builder sb(NAME, GROUP0_HISTORY, table_uuid);
        // Single-partition table; the only partition key value is 'history'.
        sb.with_column("key", utf8_type, column_kind::partition_key);
        // Group 0 state timeuuid; the reversed type gives descending clustering order.
        sb.with_column("state_id", reversed_type_impl::get_instance(timeuuid_type), column_kind::clustering_key);
        // Human-readable description of the change.
        sb.with_column("description", utf8_type);
        sb.set_comment("History of Raft group 0 state changes");
        // The schema version is derived from the table id.
        sb.with_version(generate_schema_version(table_uuid));
        return sb.build();
    }();
    return cached;
}
// Schema of the DISCOVERY table
// ("State of cluster discovery algorithm: the set of discovered peers").
schema_ptr system_keyspace::discovery() {
    // Constructed on first use; every later call on this thread returns the cached pointer.
    static thread_local auto cached = [] {
        auto table_uuid = generate_legacy_id(NAME, DISCOVERY);
        schema_builder sb(NAME, DISCOVERY, table_uuid);
        // Single-partition table; the only partition key value is 'peers'.
        sb.with_column("key", utf8_type, column_kind::partition_key);
        // IP address of the peer.
        sb.with_column("ip_addr", inet_addr_type, column_kind::clustering_key);
        // ID of the group 0 server on that peer. It may still be unknown
        // during discovery, in which case it is stored as UUID 0.
        sb.with_column("raft_server_id", uuid_type);
        sb.set_comment("State of cluster discovery algorithm: the set of discovered peers");
        // The schema version is derived from the table id.
        sb.with_version(generate_schema_version(table_uuid));
        return sb.build();
    }();
    return cached;
}
// Schema of the BROADCAST_KV_STORE table: a text key mapped to a text value.
schema_ptr system_keyspace::broadcast_kv_store() {
    // Constructed on first use; every later call on this thread returns the cached pointer.
    static thread_local auto cached = [] {
        auto table_uuid = generate_legacy_id(NAME, BROADCAST_KV_STORE);
        schema_builder sb(NAME, BROADCAST_KV_STORE, table_uuid);
        sb.with_column("key", utf8_type, column_kind::partition_key);
        sb.with_column("value", utf8_type);
        // The schema version is derived from the table id.
        sb.with_version(generate_schema_version(table_uuid));
        return sb.build();
    }();
    return cached;
}
// Schema of the SSTABLES_REGISTRY table ("SSTables ownership table"):
// partitioned by location, clustered by generation.
schema_ptr system_keyspace::sstables_registry() {
    // Constructed on first use; every later call on this thread returns the cached pointer.
    static thread_local auto cached = [] {
        auto table_uuid = generate_legacy_id(NAME, SSTABLES_REGISTRY);
        schema_builder sb(NAME, SSTABLES_REGISTRY, table_uuid);
        sb.with_column("location", utf8_type, column_kind::partition_key);
        sb.with_column("generation", timeuuid_type, column_kind::clustering_key);
        sb.with_column("uuid", uuid_type);
        sb.with_column("status", utf8_type);
        sb.with_column("version", utf8_type);
        sb.with_column("format", utf8_type);
        sb.set_comment("SSTables ownership table");
        // The schema version is derived from the table id.
        sb.with_version(generate_schema_version(table_uuid));
        return sb.build();
    }();
    return cached;
}
// Returns the schema of the tablets table. The definition itself comes from
// replica::make_tablets_schema(); it is evaluated on first use and cached per thread.
schema_ptr system_keyspace::tablets() {
static thread_local auto schema = replica::make_tablets_schema();
return schema;
}
// Schema of the deprecated legacy HINTS table: partitioned by target_id,
// clustered by (hint_id, message_version), with the hinted mutation stored as a blob.
// The schema object is built on first use and cached per thread.
schema_ptr system_keyspace::legacy::hints() {
static thread_local auto schema = [] {
schema_builder builder(generate_legacy_id(NAME, HINTS), NAME, HINTS,
// partition key
{{"target_id", uuid_type}},
// clustering key
{{"hint_id", timeuuid_type}, {"message_version", int32_type}},
// regular columns
{{"mutation", bytes_type}},
// static columns
{},
// regular column name type
utf8_type,
// comment
"*DEPRECATED* hints awaiting delivery"
);
builder.set_gc_grace_seconds(0);
// Size-tiered strategy with the "enabled" option set to "false".
builder.set_compaction_strategy(sstables::compaction_strategy_type::size_tiered);
builder.set_compaction_strategy_options({{"enabled", "false"}});
// The schema version is derived from the table id.
builder.with_version(generate_schema_version(builder.uuid()));
// Unlike most tables in this file, this one is declared with compact storage.
builder.with(schema_builder::compact_storage::yes);
return builder.build();
}();
return schema;
}
// Schema of the deprecated legacy BATCHLOG table: partitioned by id, no
// clustering key, with data, version and written_at as regular columns.
// The schema object is built on first use and cached per thread.
schema_ptr system_keyspace::legacy::batchlog() {
static thread_local auto schema = [] {
schema_builder builder(generate_legacy_id(NAME, BATCHLOG), NAME, BATCHLOG,
// partition key
{{"id", uuid_type}},
// clustering key
{},
// regular columns
{{"data", bytes_type}, {"version", int32_type}, {"written_at", timestamp_type}},
// static columns
{},
// regular column name type
utf8_type,
// comment
"*DEPRECATED* batchlog entries"
);
builder.set_gc_grace_seconds(0);
// Size-tiered strategy with the "min_threshold" option set to "2".
builder.set_compaction_strategy(sstables::compaction_strategy_type::size_tiered);
builder.set_compaction_strategy_options({{"min_threshold", "2"}});
builder.with(schema_builder::compact_storage::no);
// The schema version is derived from the table id.
builder.with_version(generate_schema_version(builder.uuid()));
return builder.build();
}();
return schema;
}
// gc grace period applied to the legacy schema tables defined below:
// seven days, expressed as a count of seconds.
static constexpr auto schema_gc_grace = std::chrono::duration_cast<std::chrono::seconds>(days(7)).count();
// Schema of the deprecated legacy KEYSPACES table: one partition per
// keyspace_name holding durable_writes, strategy_class and strategy_options.
// The schema object is built on first use and cached per thread.
schema_ptr system_keyspace::legacy::keyspaces() {
static thread_local auto schema = [] {
schema_builder builder(generate_legacy_id(NAME, KEYSPACES), NAME, KEYSPACES,
// partition key
{{"keyspace_name", utf8_type}},
// clustering key
{},
// regular columns
{
{"durable_writes", boolean_type},
{"strategy_class", utf8_type},
{"strategy_options", utf8_type}
},
// static columns
{},
// regular column name type
utf8_type,
// comment
"*DEPRECATED* keyspace definitions"
);
builder.set_gc_grace_seconds(schema_gc_grace);
// This table is declared with compact storage.
builder.with(schema_builder::compact_storage::yes);
// The schema version is derived from the table id.
builder.with_version(generate_schema_version(builder.uuid()));
return builder.build();
}();
return schema;
}
// Schema of the deprecated legacy COLUMNFAMILIES table: partitioned by
// keyspace_name, clustered by columnfamily_name, with one regular column per
// table property.
// The schema object is built on first use and cached per thread.
schema_ptr system_keyspace::legacy::column_families() {
static thread_local auto schema = [] {
schema_builder builder(generate_legacy_id(NAME, COLUMNFAMILIES), NAME, COLUMNFAMILIES,
// partition key
{{"keyspace_name", utf8_type}},
// clustering key
{{"columnfamily_name", utf8_type}},
// regular columns
{
{"bloom_filter_fp_chance", double_type},
{"caching", utf8_type},
{"cf_id", uuid_type},
{"comment", utf8_type},
{"compaction_strategy_class", utf8_type},
{"compaction_strategy_options", utf8_type},
{"comparator", utf8_type},
{"compression_parameters", utf8_type},
{"default_time_to_live", int32_type},
{"default_validator", utf8_type},
{"dropped_columns", map_type_impl::get_instance(utf8_type, long_type, true)},
{"gc_grace_seconds", int32_type},
{"is_dense", boolean_type},
{"key_validator", utf8_type},
{"local_read_repair_chance", double_type},
{"max_compaction_threshold", int32_type},
{"max_index_interval", int32_type},
{"memtable_flush_period_in_ms", int32_type},
{"min_compaction_threshold", int32_type},
{"min_index_interval", int32_type},
{"read_repair_chance", double_type},
{"speculative_retry", utf8_type},
{"subcomparator", utf8_type},
{"type", utf8_type},
// The following 4 columns are only present up until 2.1.8 tables
{"key_aliases", utf8_type},
{"value_alias", utf8_type},
{"column_aliases", utf8_type},
{"index_interval", int32_type},},
// static columns
{},
// regular column name type
utf8_type,
// comment
"*DEPRECATED* table definitions"
);
builder.set_gc_grace_seconds(schema_gc_grace);
builder.with(schema_builder::compact_storage::no);
// The schema version is derived from the table id.
builder.with_version(generate_schema_version(builder.uuid()));
return builder.build();
}();
return schema;
}
// Schema of the legacy COLUMNS table ("column definitions"): partitioned by
// keyspace_name and clustered by (columnfamily_name, column_name).
// The schema object is built on first use and cached per thread.
schema_ptr system_keyspace::legacy::columns() {
static thread_local auto schema = [] {
schema_builder builder(generate_legacy_id(NAME, COLUMNS), NAME, COLUMNS,
// partition key
{{"keyspace_name", utf8_type}},
// clustering key
{{"columnfamily_name", utf8_type}, {"column_name", utf8_type}},
// regular columns
{
{"component_index", int32_type},
{"index_name", utf8_type},
{"index_options", utf8_type},
{"index_type", utf8_type},
{"type", utf8_type},
{"validator", utf8_type},
},
// static columns
{},
// regular column name type
utf8_type,
// comment
"column definitions"
);
builder.set_gc_grace_seconds(schema_gc_grace);
builder.with(schema_builder::compact_storage::no);
// The schema version is derived from the table id.
builder.with_version(generate_schema_version(builder.uuid()));
return builder.build();
}();
return schema;
}
// Schema of the legacy TRIGGERS table ("trigger definitions"): partitioned by
// keyspace_name, clustered by (columnfamily_name, trigger_name), with the
// options stored as a text-to-text map.
// The schema object is built on first use and cached per thread.
schema_ptr system_keyspace::legacy::triggers() {
static thread_local auto schema = [] {
schema_builder builder(generate_legacy_id(NAME, TRIGGERS), NAME, TRIGGERS,
// partition key
{{"keyspace_name", utf8_type}},
// clustering key
{{"columnfamily_name", utf8_type}, {"trigger_name", utf8_type}},
// regular columns
{
{"trigger_options", map_type_impl::get_instance(utf8_type, utf8_type, true)},
},
// static columns
{},
// regular column name type
utf8_type,
// comment
"trigger definitions"
);
builder.set_gc_grace_seconds(schema_gc_grace);
builder.with(schema_builder::compact_storage::no);
// The schema version is derived from the table id.
builder.with_version(generate_schema_version(builder.uuid()));
return builder.build();
}();
return schema;
}
// Schema of the legacy USERTYPES table ("user defined type definitions"):
// partitioned by keyspace_name, clustered by type_name, with the field names
// and field types stored as two text lists.
// The schema object is built on first use and cached per thread.
schema_ptr system_keyspace::legacy::usertypes() {
static thread_local auto schema = [] {
schema_builder builder(generate_legacy_id(NAME, USERTYPES), NAME, USERTYPES,
// partition key
{{"keyspace_name", utf8_type}},
// clustering key
{{"type_name", utf8_type}},
// regular columns
{
{"field_names", list_type_impl::get_instance(utf8_type, true)},
{"field_types", list_type_impl::get_instance(utf8_type, true)},
},
// static columns
{},
// regular column name type
utf8_type,
// comment
"user defined type definitions"
);
builder.set_gc_grace_seconds(schema_gc_grace);
builder.with(schema_builder::compact_storage::no);
// The schema version is derived from the table id.
builder.with_version(generate_schema_version(builder.uuid()));
return builder.build();
}();
return schema;
}
// Schema of the deprecated legacy FUNCTIONS table: partitioned by
// keyspace_name and clustered by (function_name, signature).
// The schema object is built on first use and cached per thread.
schema_ptr system_keyspace::legacy::functions() {
/**
* Note: we have our own "legacy" version of this table (in schema_tables),
* but it is (afaik) not used, and differs slightly from the origin one.
* This is based on the origin schema, since we're more likely to encounter
* installations of that to migrate, rather than our own (if we don't use the table).
*/
static thread_local auto schema = [] {
schema_builder builder(generate_legacy_id(NAME, FUNCTIONS), NAME, FUNCTIONS,
// partition key
{{"keyspace_name", utf8_type}},
// clustering key
{{"function_name", utf8_type},{"signature", list_type_impl::get_instance(utf8_type, false)}},
// regular columns
{
{"argument_names", list_type_impl::get_instance(utf8_type, true)},
{"argument_types", list_type_impl::get_instance(utf8_type, true)},
{"body", utf8_type},
{"language", utf8_type},
{"return_type", utf8_type},
{"called_on_null_input", boolean_type},
},
// static columns
{},
// regular column name type
utf8_type,
// comment
// NOTE(review): this text says "type definitions" although the table holds
// functions; it looks copied from usertypes - confirm before changing it.
"*DEPRECATED* user defined type definitions"
);
builder.set_gc_grace_seconds(schema_gc_grace);
builder.with(schema_builder::compact_storage::no);
// The schema version is derived from the table id.
builder.with_version(generate_schema_version(builder.uuid()));
return builder.build();
}();
return schema;
}
// Schema of the deprecated legacy AGGREGATES table: partitioned by
// keyspace_name and clustered by (aggregate_name, signature).
// The schema object is built on first use and cached per thread.
schema_ptr system_keyspace::legacy::aggregates() {
static thread_local auto schema = [] {
schema_builder builder(generate_legacy_id(NAME, AGGREGATES), NAME, AGGREGATES,
// partition key
{{"keyspace_name", utf8_type}},
// clustering key
{{"aggregate_name", utf8_type},{"signature", list_type_impl::get_instance(utf8_type, false)}},
// regular columns
{
{"argument_types", list_type_impl::get_instance(utf8_type, true)},
{"final_func", utf8_type},
{"initcond", bytes_type},
{"return_type", utf8_type},
{"state_func", utf8_type},
{"state_type", utf8_type},
},
// static columns
{},
// regular column name type
utf8_type,
// comment
"*DEPRECATED* user defined aggregate definition"
);
builder.set_gc_grace_seconds(schema_gc_grace);
builder.with(schema_builder::compact_storage::no);
// The schema version is derived from the table id.
builder.with_version(generate_schema_version(builder.uuid()));
return builder.build();
}();
return schema;
}
// Writes this node's version, location and address information into its row
// of the LOCAL table: release/CQL/thrift/native-protocol versions, DC and rack,
// partitioner name, and the broadcast RPC, broadcast and listen addresses.
future<> system_keyspace::setup_version(sharded<netw::messaging_service>& ms) {
    auto& config = _db.get_config();
    sstring insert_stmt = fmt::format("INSERT INTO system.{} (key, release_version, cql_version, thrift_version, native_protocol_version, data_center, rack, partitioner, rpc_address, broadcast_address, listen_address) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
            , db::system_keyspace::LOCAL);

    // Values that need a conversion are prepared up front; the rest are
    // passed straight through in the column order of the statement above.
    auto native_protocol_version = to_sstring(unsigned(cql_serialization_format::latest().protocol_version()));
    auto partitioner_name = sstring(config.partitioner());

    return execute_cql(insert_stmt, sstring(db::system_keyspace::LOCAL),
            version::release(),
            cql3::query_processor::CQL_VERSION,
            ::cassandra::thrift_version,
            std::move(native_protocol_version),
            local_dc_rack().dc,
            local_dc_rack().rack,
            std::move(partitioner_name),
            utils::fb_utilities::get_broadcast_rpc_address().addr(),
            utils::fb_utilities::get_broadcast_address().addr(),
            ms.local().listen_address().addr()
    ).discard_result();
}
// Stores the given feature names, joined with commas into a single string,
// in the supported_features column of this node's row in the LOCAL table.
future<> system_keyspace::save_local_supported_features(const std::set<std::string_view>& feats) {
    static const auto insert_stmt = format("INSERT INTO system.{} (key, supported_features) VALUES (?, ?)", LOCAL);
    auto joined_features = fmt::to_string(fmt::join(feats, ","));
    return execute_cql(insert_stmt, sstring(db::system_keyspace::LOCAL), std::move(joined_features))
            .discard_result();
}
// Per-shard cache of node-local state.
// The cache must be distributed, because the values themselves may not update atomically, so a shard
// other than the one that wrote a value may see it corrupted. invoke_on_all is used to guarantee that
// all updates are propagated correctly.
struct local_cache {
// NOTE(review): presumably this node's own datacenter/rack - confirm against the code that fills it.
locator::endpoint_dc_rack _local_dc_rack_info;
// Bootstrap state; written on every shard by build_bootstrap_info() and returned by get_bootstrap_state().
system_keyspace::bootstrap_state _state;
};
// Reads the PEERS table and returns a map from peer address to its
// datacenter/rack. Rows that lack either the data_center or the rack value
// are left out of the result.
future<std::unordered_map<gms::inet_address, locator::endpoint_dc_rack>> system_keyspace::load_dc_rack_info() {
    auto rows = co_await execute_cql(format("SELECT peer, data_center, rack from system.{}", PEERS));

    std::unordered_map<gms::inet_address, locator::endpoint_dc_rack> location_by_peer;
    for (const auto& row : *rows) {
        net::inet_address raw_peer = row.template get_as<net::inet_address>("peer");
        const bool has_location = row.has("data_center") && row.has("rack");
        if (has_location) {
            gms::inet_address peer_addr(std::move(raw_peer));
            sstring datacenter = row.template get_as<sstring>("data_center");
            sstring rack_name = row.template get_as<sstring>("rack");
            location_by_peer.emplace(peer_addr, locator::endpoint_dc_rack{ datacenter, rack_name });
        }
    }

    co_return location_by_peer;
}
// Loads the persisted bootstrap state from this node's row in the LOCAL table
// and stores it in the local cache of every shard. When there is no row, or
// the row has no "bootstrapped" value, the state is NEEDS_BOOTSTRAP.
// A stored name that is not in the lookup table makes std::unordered_map::at throw.
future<> system_keyspace::build_bootstrap_info() {
    sstring query = format("SELECT bootstrapped FROM system.{} WHERE key = ? ", LOCAL);
    return execute_cql(query, sstring(LOCAL)).then([this] (auto result) {
        static auto name_to_state = std::unordered_map<sstring, bootstrap_state>({
            { "NEEDS_BOOTSTRAP", bootstrap_state::NEEDS_BOOTSTRAP },
            { "COMPLETED", bootstrap_state::COMPLETED },
            { "IN_PROGRESS", bootstrap_state::IN_PROGRESS },
            { "DECOMMISSIONED", bootstrap_state::DECOMMISSIONED }
        });

        const bool has_saved_state = !result->empty() && result->one().has("bootstrapped");
        bootstrap_state loaded = has_saved_state
                ? name_to_state.at(result->one().template get_as<sstring>("bootstrapped"))
                : bootstrap_state::NEEDS_BOOTSTRAP;

        // Every shard keeps its own copy of the state, so update them all.
        return container().invoke_on_all([loaded] (auto& sys_ks) {
            sys_ks._cache->_state = loaded;
        });
    });
}
// Startup initialization of the system keyspace. Must be called on shard 0
// (asserted). The steps below run strictly one after another.
future<> system_keyspace::setup(sharded<locator::snitch_ptr>& snitch, sharded<netw::messaging_service>& ms) {
assert(this_shard_id() == 0);
// Persist version/address information and the current schema version in the LOCAL table.
co_await setup_version(ms);
co_await update_schema_version(_db.get_version());
// Load the saved bootstrap state into the per-shard cache.
co_await build_bootstrap_info();
// Verify (or, on a new node, record) the cluster name.
co_await check_health();
co_await db::schema_tables::save_system_keyspace_schema(_qp);
// #2514 - make sure "system" is written to system_schema.keyspaces.
co_await db::schema_tables::save_system_schema(_qp, NAME);
co_await cache_truncation_record();
if (snitch.local()->prefer_local()) {
// preferred_ips lives in this coroutine's frame, so capturing it by
// reference is safe for the duration of the awaited invoke_on_all.
auto preferred_ips = co_await get_preferred_ips();
co_await ms.invoke_on_all([&preferred_ips] (auto& ms) {
return ms.init_local_preferred_ip_cache(preferred_ips);
});
}
}
// In-memory form of a table's truncation information, as assembled by
// system_keyspace::get_truncation_record().
struct truncation_record {
// Value that get_truncation_record() stores in `magic`.
static constexpr uint32_t current_magic = 0x53435452; // 'S' 'C' 'T' 'R'
uint32_t magic;
// One replay position per row read for the table (rows are per shard).
std::vector<db::replay_position> positions;
// The truncated_at value read for the table.
db_clock::time_point time_stamp;
};
}
namespace db {
// Builds the truncation record of table `cf_id` from the TRUNCATED table:
// one replay position per stored row, plus the truncated_at timestamp.
// If the ignore_truncation_record config option is set, the table is not read
// and an empty record (magic only) is returned.
future<truncation_record> system_keyspace::get_truncation_record(table_id cf_id) {
    if (_db.get_config().ignore_truncation_record.is_set()) {
        return make_ready_future<truncation_record>(truncation_record{truncation_record::current_magic});
    }

    sstring query = format("SELECT * from system.{} WHERE table_uuid = ?", TRUNCATED);
    return execute_cql(query, {cf_id.uuid()}).then([](::shared_ptr<cql3::untyped_result_set> rows) {
        truncation_record record{truncation_record::current_magic};
        for (const cql3::untyped_result_set_row& row : *rows) {
            auto shard_no = row.get_as<int32_t>("shard");
            auto truncated_at = row.get_as<db_clock::time_point>("truncated_at");
            auto position = row.get_as<int32_t>("position");
            auto segment = row.get_as<int64_t>("segment_id");

            record.positions.emplace_back(replay_position(shard_no, segment, position));
            // Overwritten on every row, so the value of the last row read is kept.
            record.time_stamp = truncated_at;
        }
        return make_ready_future<truncation_record>(std::move(record));
    });
}
// Reads the truncation time of every table recorded in the TRUNCATED table and
// caches it in the matching `table` object on every shard.
// Does nothing when the ignore_truncation_record config option is set.
future<> system_keyspace::cache_truncation_record() {
    if (_db.get_config().ignore_truncation_record.is_set()) {
        return make_ready_future<>();
    }

    sstring query = format("SELECT DISTINCT table_uuid, truncated_at from system.{}", TRUNCATED);
    return execute_cql(query).then([this] (::shared_ptr<cql3::untyped_result_set> rows) {
        return parallel_for_each(rows->begin(), rows->end(), [this] (const cql3::untyped_result_set_row& row) {
            // Copy the values out of the row; the cross-shard lambda captures them by value.
            auto id = table_id(row.get_as<utils::UUID>("table_uuid"));
            auto truncated_at = row.get_as<db_clock::time_point>("truncated_at");

            return _db.container().invoke_on_all([id, truncated_at] (replica::database& shard_db) mutable {
                try {
                    replica::table& tbl = shard_db.find_column_family(id);
                    tbl.cache_truncation_record(truncated_at);
                } catch (replica::no_such_column_family&) {
                    // A record may exist for a table that is no longer present; skip it.
                    slogger.debug("Skip caching truncation time for {} since the table is no longer present", id);
                }
            });
        });
    });
}
// Records a truncation of table `id`: writes the replay position `rp` (shard,
// position, segment id) and the truncation time into the TRUNCATED table, then
// flushes that table.
future<> system_keyspace::save_truncation_record(table_id id, db_clock::time_point truncated_at, db::replay_position rp) {
    sstring insert_stmt = format("INSERT INTO system.{} (table_uuid, shard, position, segment_id, truncated_at) VALUES(?,?,?,?,?)", TRUNCATED);
    auto written = qctx->qp().execute_internal(
            insert_stmt,
            {id.uuid(), int32_t(rp.shard_id()), int32_t(rp.pos), int64_t(rp.base_id()), truncated_at},
            cql3::query_processor::cache_internal::yes).discard_result();
    return std::move(written).then([] {
        return force_blocking_flush(TRUNCATED);
    });
}
// Convenience overload: records the truncation under the id taken from the table's schema.
future<> system_keyspace::save_truncation_record(const replica::column_family& cf, db_clock::time_point truncated_at, db::replay_position rp) {
return save_truncation_record(cf.schema()->id(), truncated_at, rp);
}
// Returns the replay positions stored in the truncation record of table `cf_id`.
future<replay_positions> system_keyspace::get_truncated_position(table_id cf_id) {
    return get_truncation_record(cf_id).then([](truncation_record e) {
        // `e` is this continuation's own copy and is dropped right after the
        // return, so hand the positions over instead of copying the vector.
        return make_ready_future<replay_positions>(std::move(e.positions));
    });
}
// Returns the truncation timestamp stored in the truncation record of table `cf_id`.
future<db_clock::time_point> system_keyspace::get_truncated_at(table_id cf_id) {
    auto record_future = get_truncation_record(cf_id);
    return std::move(record_future).then([](truncation_record record) {
        db_clock::time_point truncated_at = record.time_stamp;
        return make_ready_future<db_clock::time_point>(truncated_at);
    });
}
// Deserializes the set-typed column `name` of `row`, using the column type
// that schema `s` declares for it, and returns the native set representation.
// NOTE(review): the column definition pointer is dereferenced without a null
// check, so `name` must be a column of `s`.
static set_type_impl::native_type deserialize_set_column(const schema& s, const cql3::untyped_result_set_row& row, const char* name) {
    auto raw_value = row.get_blob(name);
    auto column = s.get_column_definition(name);
    return value_cast<set_type_impl::native_type>(column->type->deserialize(raw_value));
}
// Converts a set of tokens into the native value of a set<text> column:
// one element per token, each holding the token's string form.
static set_type_impl::native_type prepare_tokens(const std::unordered_set<dht::token>& tokens) {
    set_type_impl::native_type tset;
    // The final size is known up front; reserve once instead of growing step by step.
    tset.reserve(tokens.size());
    for (const auto& t : tokens) {
        tset.push_back(t.to_sstring());
    }
    return tset;
}
// Converts the native value of a set<text> column back into a set of tokens.
// Each element is parsed with dht::token::from_sstring; an assertion checks
// that the parsed token converts back to exactly the stored string.
std::unordered_set<dht::token> decode_tokens(const set_type_impl::native_type& tokens) {
    std::unordered_set<dht::token> tset;
    for (const auto& t : tokens) {
        auto str = value_cast<sstring>(t);
        // Parse once and reuse the result for both the round-trip check and the insert.
        auto parsed = dht::token::from_sstring(str);
        assert(str == parsed.to_sstring());
        tset.insert(std::move(parsed));
    }
    return tset;
}
// Persists the tokens of peer `ep` in the PEERS table and flushes the table.
// When `ep` is this node's own broadcast address nothing is inserted; the
// PEERS row for that address is removed instead.
future<> system_keyspace::update_tokens(gms::inet_address ep, const std::unordered_set<dht::token>& tokens)
{
    const bool is_local_node = (ep == utils::fb_utilities::get_broadcast_address());
    if (is_local_node) {
        co_await remove_endpoint(ep);
        co_return;
    }

    slogger.debug("INSERT INTO system.{} (peer, tokens) VALUES ({}, {})", PEERS, ep, tokens);

    sstring insert_stmt = format("INSERT INTO system.{} (peer, tokens) VALUES (?, ?)", PEERS);
    auto tokens_type = set_type_impl::get_instance(utf8_type, true);
    auto tokens_value = make_set_value(tokens_type, prepare_tokens(tokens));
    co_await execute_cql(insert_stmt, ep.addr(), std::move(tokens_value)).discard_result();
    co_await force_blocking_flush(PEERS);
}
// Reads the PEERS table and returns, for every peer that has a tokens value,
// the decoded set of its tokens keyed by the peer's address.
future<std::unordered_map<gms::inet_address, std::unordered_set<dht::token>>> system_keyspace::load_tokens() {
    sstring query = format("SELECT peer, tokens FROM system.{}", PEERS);
    return execute_cql(query).then([] (::shared_ptr<cql3::untyped_result_set> rows) {
        std::unordered_map<gms::inet_address, std::unordered_set<dht::token>> tokens_by_peer;
        for (auto& row : *rows) {
            auto peer_addr = gms::inet_address(row.get_as<net::inet_address>("peer"));
            if (!row.has("tokens")) {
                continue;
            }
            auto stored_tokens = deserialize_set_column(*peers(), row, "tokens");
            tokens_by_peer.emplace(peer_addr, decode_tokens(stored_tokens));
        }
        return tokens_by_peer;
    });
}
// Reads the PEERS table and returns the host id of every peer that has one,
// keyed by the peer's address.
future<std::unordered_map<gms::inet_address, locator::host_id>> system_keyspace::load_host_ids() {
    sstring query = format("SELECT peer, host_id FROM system.{}", PEERS);
    return execute_cql(query).then([] (::shared_ptr<cql3::untyped_result_set> rows) {
        std::unordered_map<gms::inet_address, locator::host_id> host_id_by_peer;
        for (auto& row : *rows) {
            auto peer_addr = gms::inet_address(row.get_as<net::inet_address>("peer"));
            if (!row.has("host_id")) {
                continue;
            }
            host_id_by_peer.emplace(peer_addr, locator::host_id(row.get_as<utils::UUID>("host_id")));
        }
        return host_id_by_peer;
    });
}
// Returns the addresses of all peers in the PEERS table that have a tokens value.
future<std::vector<gms::inet_address>> system_keyspace::load_peers() {
    auto rows = co_await execute_cql(format("SELECT peer, tokens FROM system.{}", PEERS));
    assert(rows);

    std::vector<gms::inet_address> peer_addrs;
    for (auto& row : *rows) {
        // Rows without tokens are skipped. Such rows may be introduced by
        // code that persists parts of peer information (such as RAFT_ID),
        // which may potentially race with deleting a peer (during node removal).
        if (row.has("tokens")) {
            peer_addrs.emplace_back(row.get_as<net::inet_address>("peer"));
        }
    }
    co_return peer_addrs;
}
// Reads the PEERS table and returns the supported_features string of every
// peer that has one, keyed by the peer's address.
future<std::unordered_map<gms::inet_address, sstring>> system_keyspace::load_peer_features() {
    sstring query = format("SELECT peer, supported_features FROM system.{}", PEERS);
    return execute_cql(query).then([] (::shared_ptr<cql3::untyped_result_set> rows) {
        std::unordered_map<gms::inet_address, sstring> features_by_peer;
        for (auto& row : *rows) {
            if (!row.has("supported_features")) {
                continue;
            }
            features_by_peer.emplace(row.get_as<net::inet_address>("peer"),
                    row.get_as<sstring>("supported_features"));
        }
        return features_by_peer;
    });
}
// Reads the PEERS table and returns the preferred_ip of every peer that has
// one, keyed by the peer's address.
future<std::unordered_map<gms::inet_address, gms::inet_address>> system_keyspace::get_preferred_ips() {
    sstring query = format("SELECT peer, preferred_ip FROM system.{}", PEERS);
    return execute_cql(query).then([] (::shared_ptr<cql3::untyped_result_set> rows) {
        std::unordered_map<gms::inet_address, gms::inet_address> preferred_by_peer;
        for (auto& row : *rows) {
            if (!row.has("preferred_ip")) {
                continue;
            }
            preferred_by_peer.emplace(gms::inet_address(row.get_as<net::inet_address>("peer")),
                    gms::inet_address(row.get_as<net::inet_address>("preferred_ip")));
        }
        return preferred_by_peer;
    });
}
// Hook called by update_peer_info() before it writes a peer column.
// This generic version does nothing and returns an already-resolved future.
template <typename Value>
future<> system_keyspace::update_cached_values(gms::inet_address ep, sstring column_name, Value value) {
return make_ready_future<>();
}
// Writes `value` into column `column_name` of the PEERS row of peer `ep`.
// Nothing is written when `ep` is this node's own broadcast address.
// `column_name` is spliced into the statement text, so it must be a valid
// column name of the PEERS table.
template <typename Value>
future<> system_keyspace::update_peer_info(gms::inet_address ep, sstring column_name, Value value) {
    const bool is_local_node = (ep == utils::fb_utilities::get_broadcast_address());
    if (!is_local_node) {
        co_await update_cached_values(ep, column_name, value);

        slogger.debug("INSERT INTO system.{} (peer, {}) VALUES ({}, {})", PEERS, column_name, ep, value);
        sstring insert_stmt = format("INSERT INTO system.{} (peer, {}) VALUES (?, ?)", PEERS, column_name);
        co_await execute_cql(insert_stmt, ep.addr(), value).discard_result();
    }
}
// Explicit instantiations of update_peer_info for the value types it is used with.
// No set-typed instantiation is needed, since tokens are updated by another method (update_tokens).
template future<> system_keyspace::update_peer_info<sstring>(gms::inet_address ep, sstring column_name, sstring);
template future<> system_keyspace::update_peer_info<utils::UUID>(gms::inet_address ep, sstring column_name, utils::UUID);
template future<> system_keyspace::update_peer_info<net::inet_address>(gms::inet_address ep, sstring column_name, net::inet_address);
// Stores `value`, converted to its string form, under `key` in the
// SCYLLA_LOCAL table, then flushes that table on every shard.
template <typename T>
future<> set_scylla_local_param_as(const sstring& key, const T& value) {
    sstring update_stmt = format("UPDATE system.{} SET value = ? WHERE key = ?", system_keyspace::SCYLLA_LOCAL);
    auto value_type = data_type_for<T>();
    auto value_as_string = value_type->to_string_impl(data_value(value));
    co_await qctx->execute_cql(update_stmt, std::move(value_as_string), key).discard_result();

    // Flush the table so that the value is available on boot before commitlog replay.
    // database::maybe_init_schema_commitlog() depends on it.
    co_await smp::invoke_on_all([] {
        return qctx->qp().db().real_database().flush(db::system_keyspace::NAME, system_keyspace::SCYLLA_LOCAL);
    });
}
// Reads the value stored under `key` in the SCYLLA_LOCAL table and converts it
// from its string form back to T. Returns an empty optional when there is no
// row for the key or the row has no value.
template <typename T>
future<std::optional<T>> get_scylla_local_param_as(const sstring& key) {
    sstring query = format("SELECT value FROM system.{} WHERE key = ?", system_keyspace::SCYLLA_LOCAL);
    return qctx->execute_cql(query, key).then([] (::shared_ptr<cql3::untyped_result_set> rows)
            -> future<std::optional<T>> {
        const bool has_value = !rows->empty() && rows->one().has("value");
        if (!has_value) {
            return make_ready_future<std::optional<T>>(std::optional<T>());
        }
        auto value_type = data_type_for<T>();
        auto serialized = value_type->from_string(rows->one().get_as<sstring>("value"));
        return make_ready_future<std::optional<T>>(value_cast<T>(value_type->deserialize(serialized)));
    });
}
// Stores the string `value` under `key` in the SCYLLA_LOCAL table
// (string specialization of set_scylla_local_param_as).
future<> system_keyspace::set_scylla_local_param(const sstring& key, const sstring& value) {
return set_scylla_local_param_as<sstring>(key, value);
}
// Reads the string stored under `key` in the SCYLLA_LOCAL table; the optional
// is empty when no value is stored (string specialization of get_scylla_local_param_as).
future<std::optional<sstring>> system_keyspace::get_scylla_local_param(const sstring& key){
return get_scylla_local_param_as<sstring>(key);
}
// Stores `version` in the schema_version column of this node's row in the LOCAL table.
future<> system_keyspace::update_schema_version(table_schema_version version) {
    return execute_cql(
            format("INSERT INTO system.{} (key, schema_version) VALUES (?, ?)", LOCAL),
            sstring(LOCAL),
            version.uuid()).discard_result();
}
/**
 * Deletes the row of peer `ep` from the PEERS table, dropping everything
 * stored for that node (its tokens included), and then flushes the table.
 */
future<> system_keyspace::remove_endpoint(gms::inet_address ep) {
    slogger.debug("DELETE FROM system.{} WHERE peer = {}", PEERS, ep);

    sstring delete_stmt = format("DELETE FROM system.{} WHERE peer = ?", PEERS);
    co_await execute_cql(delete_stmt, ep.addr()).discard_result();
    co_await force_blocking_flush(PEERS);
}
// Persists this node's own tokens in the LOCAL table and flushes the table.
// An empty token set is rejected: the returned future fails with
// std::invalid_argument.
future<> system_keyspace::update_tokens(const std::unordered_set<dht::token>& tokens) {
    if (!tokens.empty()) {
        sstring insert_stmt = format("INSERT INTO system.{} (key, tokens) VALUES (?, ?)", LOCAL);
        auto tokens_type = set_type_impl::get_instance(utf8_type, true);
        auto written = execute_cql(insert_stmt, sstring(LOCAL), make_set_value(tokens_type, prepare_tokens(tokens))).discard_result();
        return std::move(written).then([] {
            return force_blocking_flush(LOCAL);
        });
    }
    return make_exception_future<>(std::invalid_argument("remove_endpoint should be used instead"));
}
// Flushes table `cfname` of the system keyspace on every shard; the returned
// future resolves once all shards are done. Requires qctx to be set (asserted).
future<> system_keyspace::force_blocking_flush(sstring cfname) {
    assert(qctx);
    auto flush_on_shard = [table_name = std::move(cfname)] (cql3::query_processor& shard_qp) {
        // Origin guards this with: if (!Boolean.getBoolean("cassandra.unsafesystem"))
        // FIXME: get real database in another way
        return shard_qp.db().real_database().flush(NAME, table_name);
    };
    return qctx->_qp.invoke_on_all(std::move(flush_on_shard));
}
/**
 * Checks the cluster name saved in the LOCAL table against the configured one.
 *
 * One of three things will happen if you try to read the system keyspace:
 * 1. files are present and you can read them: great
 * 2. no files are there: great (new node is assumed)
 * 3. files are present but you can't read them: bad
 *
 * With no saved name the node is treated as brand new and the configured name
 * is saved. A saved name that differs from the configured one raises
 * exceptions::configuration_exception.
 */
future<> system_keyspace::check_health() {
    using namespace cql_transport::messages;
    sstring query = format("SELECT cluster_name FROM system.{} WHERE key=?", LOCAL);
    return execute_cql(query, sstring(LOCAL)).then([this] (::shared_ptr<cql3::untyped_result_set> rows) {
        auto configured_name = _db.get_config().cluster_name();

        const bool has_saved_name = !rows->empty() && rows->one().has("cluster_name");
        if (!has_saved_name) {
            // this is a brand new node
            sstring insert_stmt = format("INSERT INTO system.{} (key, cluster_name) VALUES (?, ?)", LOCAL);
            return execute_cql(insert_stmt, sstring(LOCAL), configured_name).discard_result();
        }

        auto saved_name = rows->one().get_as<sstring>("cluster_name");
        if (configured_name != saved_name) {
            throw exceptions::configuration_exception("Saved cluster name " + saved_name + " != configured name " + configured_name);
        }
        return make_ready_future<>();
    });
}
// Returns this node's tokens as stored in the LOCAL table, or an empty set
// when there is no row or the row has no tokens value.
future<std::unordered_set<dht::token>> system_keyspace::get_saved_tokens() {
    sstring query = format("SELECT tokens FROM system.{} WHERE key = ?", LOCAL);
    return execute_cql(query, sstring(LOCAL)).then([] (auto rows) {
        const bool has_tokens = !rows->empty() && rows->one().has("tokens");
        if (!has_tokens) {
            return make_ready_future<std::unordered_set<dht::token>>();
        }
        auto stored_tokens = deserialize_set_column(*local(), rows->one(), "tokens");
        return make_ready_future<std::unordered_set<dht::token>>(decode_tokens(stored_tokens));
    });
}
// Like get_saved_tokens(), but an empty result is an error: it is logged and
// reported as std::runtime_error.
future<std::unordered_set<dht::token>> system_keyspace::get_local_tokens() {
    return get_saved_tokens().then([] (auto&& saved) {
        if (!saved.empty()) {
            return std::move(saved);
        }
        auto err = format("get_local_tokens: tokens is empty");
        slogger.error("{}", err);
        throw std::runtime_error(err);
    });
}
// Persists the given CDC generation id in the CDC_LOCAL table and flushes the table.
// A v1 id consists of a timestamp only; a v2 id also carries a UUID, which is
// written to the uuid column.
future<> system_keyspace::update_cdc_generation_id(cdc::generation_id gen_id) {
// The visitor (and the lambdas in it) is a temporary of the awaited
// expression; keep the visit and the co_await in one expression.
co_await std::visit(make_visitor(
[this] (cdc::generation_id_v1 id) -> future<> {
co_await execute_cql(
format("INSERT INTO system.{} (key, streams_timestamp) VALUES (?, ?)", v3::CDC_LOCAL),
sstring(v3::CDC_LOCAL), id.ts);
},
[this] (cdc::generation_id_v2 id) -> future<> {
co_await execute_cql(
format("INSERT INTO system.{} (key, streams_timestamp, uuid) VALUES (?, ?, ?)", v3::CDC_LOCAL),
sstring(v3::CDC_LOCAL), id.ts, id.id);
}
), gen_id);
co_await force_blocking_flush(v3::CDC_LOCAL);
}
// Loads the CDC generation id persisted in the CDC_LOCAL table.
// Returns nullopt when there is no row or the row has no timestamp, a v1 id
// when only the timestamp is stored, and a v2 id when a UUID is stored as well.
future<std::optional<cdc::generation_id>> system_keyspace::get_cdc_generation_id() {
    auto rows = co_await execute_cql(
            format("SELECT streams_timestamp, uuid FROM system.{} WHERE key = ?", v3::CDC_LOCAL),
            sstring(v3::CDC_LOCAL));

    // A row without a timestamp should not happen, but is treated like a missing row.
    if (rows->empty() || !rows->one().has("streams_timestamp")) {
        co_return std::nullopt;
    }

    auto& stored = rows->one();
    auto generation_ts = stored.get_as<db_clock::time_point>("streams_timestamp");
    if (stored.has("uuid")) {
        auto generation_uuid = stored.get_as<utils::UUID>("uuid");
        co_return cdc::generation_id_v2{generation_ts, generation_uuid};
    }
    co_return cdc::generation_id_v1{generation_ts};
}
// Partition key of the system.<v3::CDC_LOCAL> row written by cdc_set_rewritten()
// and looked up by cdc_is_rewritten().
static const sstring CDC_REWRITTEN_KEY = "rewritten";
// Writes the CDC_REWRITTEN_KEY row into system.<v3::CDC_LOCAL>.
// When gen_id is engaged its timestamp is stored in streams_timestamp;
// otherwise only the key (row marker) is inserted.
future<> system_keyspace::cdc_set_rewritten(std::optional<cdc::generation_id_v1> gen_id) {
    if (!gen_id) {
        // Insert just the row marker.
        return execute_cql(
                format("INSERT INTO system.{} (key) VALUES (?)", v3::CDC_LOCAL),
                CDC_REWRITTEN_KEY).discard_result();
    }
    return execute_cql(
            format("INSERT INTO system.{} (key, streams_timestamp) VALUES (?, ?)", v3::CDC_LOCAL),
            CDC_REWRITTEN_KEY, gen_id->ts).discard_result();
}
// Resolves to true when the CDC_REWRITTEN_KEY row exists in system.<v3::CDC_LOCAL>.
future<bool> system_keyspace::cdc_is_rewritten() {
    // Only the presence of the row matters; the timestamp stored with it is
    // extra information for debugging.
    auto query = format("SELECT key FROM system.{} WHERE key = ?", v3::CDC_LOCAL);
    return execute_cql(std::move(query), CDC_REWRITTEN_KEY)
            .then([] (::shared_ptr<cql3::untyped_result_set> rows) {
        const bool marker_present = !rows->empty();
        return marker_present;
    });
}
// True when get_bootstrap_state() reports NEEDS_BOOTSTRAP.
bool system_keyspace::bootstrap_needed() const {
    const auto current = get_bootstrap_state();
    return current == bootstrap_state::NEEDS_BOOTSTRAP;
}
// True when get_bootstrap_state() reports COMPLETED.
bool system_keyspace::bootstrap_complete() const {
    const auto current = get_bootstrap_state();
    return current == bootstrap_state::COMPLETED;
}
// True when get_bootstrap_state() reports IN_PROGRESS.
bool system_keyspace::bootstrap_in_progress() const {
    const auto current = get_bootstrap_state();
    return current == bootstrap_state::IN_PROGRESS;
}
// True when get_bootstrap_state() reports DECOMMISSIONED.
bool system_keyspace::was_decommissioned() const {
    const auto current = get_bootstrap_state();
    return current == bootstrap_state::DECOMMISSIONED;
}
// Returns the bootstrap state held in this instance's cache (_cache->_state);
// no query is issued.
system_keyspace::bootstrap_state system_keyspace::get_bootstrap_state() const {
    const auto& cache = *_cache;
    return cache._state;
}
// Persists the bootstrap state and then publishes it to every shard's cache.
// Steps, in order: write the state name into the "bootstrapped" column of the
// system.<LOCAL> row, flush that table, and finally update _cache->_state on
// all instances of the sharded service.
// std::unordered_map::at() throws std::out_of_range for a state missing from
// the name table.
future<> system_keyspace::set_bootstrap_state(bootstrap_state state) {
    static std::unordered_map<bootstrap_state, sstring, enum_hash<bootstrap_state>> names_by_state({
        { bootstrap_state::NEEDS_BOOTSTRAP, "NEEDS_BOOTSTRAP" },
        { bootstrap_state::COMPLETED, "COMPLETED" },
        { bootstrap_state::IN_PROGRESS, "IN_PROGRESS" },
        { bootstrap_state::DECOMMISSIONED, "DECOMMISSIONED" }
    });

    sstring name = names_by_state.at(state);
    sstring insert_stmt = format("INSERT INTO system.{} (key, bootstrapped) VALUES (?, ?)", LOCAL);

    co_await execute_cql(insert_stmt, sstring(LOCAL), name).discard_result();
    co_await force_blocking_flush(LOCAL);

    // The in-memory copies are updated only after the write has been flushed.
    co_await container().invoke_on_all([state] (auto& instance) {
        instance._cache->_state = state;
    });
}
// Virtual table system.cluster_status.
// Produces one partition per endpoint found in the gossiper's endpoint states,
// filled from gossiper state, token metadata and the ownership map returned by
// storage_service::get_ownership().
class cluster_status_table : public memtable_filling_virtual_table {
private:
    service::storage_service& _ss;
    gms::gossiper& _gossiper;
public:
    cluster_status_table(service::storage_service& ss, gms::gossiper& g)
            : memtable_filling_virtual_table(build_schema())
            , _ss(ss), _gossiper(g) {}

    // Schema: partition key "peer"; all other columns are regular columns.
    static schema_ptr build_schema() {
        auto table_id = generate_legacy_id(system_keyspace::NAME, "cluster_status");
        return schema_builder(system_keyspace::NAME, "cluster_status", std::make_optional(table_id))
            .with_column("peer", inet_addr_type, column_kind::partition_key)
            .with_column("dc", utf8_type)
            .with_column("up", boolean_type)
            .with_column("status", utf8_type)
            .with_column("load", utf8_type)
            .with_column("tokens", int32_type)
            .with_column("owns", float_type)
            .with_column("host_id", uuid_type)
            .with_version(system_keyspace::generate_schema_version(table_id))
            .build();
    }

    // Feeds one mutation per gossiper endpoint into mutation_sink.
    // "host_id", "dc" and "owns" are set only when the respective information
    // is available for the endpoint.
    future<> execute(std::function<void(mutation)> mutation_sink) override {
        return _ss.get_ownership().then([this, mutation_sink] (std::map<gms::inet_address, float> ownership) {
            const locator::token_metadata& tm = _ss.get_token_metadata();

            for (auto&& state_entry : _gossiper.get_endpoint_states()) {
                auto endpoint = state_entry.first;

                mutation m(schema(), partition_key::from_single_value(*schema(), data_value(endpoint).serialize_nonnull()));
                row& cells = m.partition().clustered_row(*schema(), clustering_key::make_empty()).cells();

                set_cell(cells, "up", _gossiper.is_alive(endpoint));
                set_cell(cells, "status", _gossiper.get_gossip_status(endpoint));
                set_cell(cells, "load", _gossiper.get_application_state_value(endpoint, gms::application_state::LOAD));

                if (auto host_id = tm.get_host_id_if_known(endpoint)) {
                    set_cell(cells, "host_id", host_id->uuid());
                }

                if (tm.is_normal_token_owner(endpoint)) {
                    sstring dc = tm.get_topology().get_location(endpoint).dc;
                    set_cell(cells, "dc", dc);
                }

                if (auto owned = ownership.find(endpoint); owned != ownership.end()) {
                    set_cell(cells, "owns", owned->second);
                }

                set_cell(cells, "tokens", int32_t(tm.get_tokens(endpoint).size()));

                mutation_sink(std::move(m));
            }
        });
    }
};
// Virtual table system.token_ring.
// Emits one partition per keyspace whose replication strategy is vnode-based;
// each row is keyed by (start_token, endpoint) and carries end_token, dc and rack
// taken from storage_service::describe_ring().
class token_ring_table : public streaming_virtual_table {
private:
replica::database& _db;
service::storage_service& _ss;
public:
token_ring_table(replica::database& db, service::storage_service& ss)
: streaming_virtual_table(build_schema())
, _db(db)
, _ss(ss)
{
_shard_aware = true;
}
// Schema: partition key keyspace_name, clustering key (start_token, endpoint).
static schema_ptr build_schema() {
auto id = generate_legacy_id(system_keyspace::NAME, "token_ring");
return schema_builder(system_keyspace::NAME, "token_ring", std::make_optional(id))
.with_column("keyspace_name", utf8_type, column_kind::partition_key)
.with_column("start_token", utf8_type, column_kind::clustering_key)
.with_column("endpoint", inet_addr_type, column_kind::clustering_key)
.with_column("end_token", utf8_type)
.with_column("dc", utf8_type)
.with_column("rack", utf8_type)
.with_version(system_keyspace::generate_schema_version(id))
.build();
}
// Builds the decorated partition key for a keyspace name.
dht::decorated_key make_partition_key(const sstring& name) {
return dht::decorate_key(*_s, partition_key::from_single_value(*_s, data_value(name).serialize_nonnull()));
}
// Builds the (start_token, endpoint) clustering key.
clustering_key make_clustering_key(sstring start_token, gms::inet_address host) {
return clustering_key::from_exploded(*_s, {
data_value(start_token).serialize_nonnull(),
data_value(host).serialize_nonnull()
});
}
// Orders endpoint details by host, comparing the serialized addresses with
// inet_addr_type->less().
struct endpoint_details_cmp {
bool operator()(const dht::endpoint_details& l, const dht::endpoint_details& r) const {
return inet_addr_type->less(
data_value(l._host).serialize_nonnull(),
data_value(r._host).serialize_nonnull());
}
};
// Streams the table contents into `result`.
// Keyspaces are visited in ring-position order of their partition keys; within
// a partition, ranges are sorted by start token and endpoints by address.
future<> execute(reader_permit permit, result_collector& result, const query_restrictions& qr) override {
struct decorated_keyspace_name {
sstring name;
dht::decorated_key key;
};
// Snapshot the names of vnode-based keyspaces together with their decorated keys.
auto keyspace_names = boost::copy_range<std::vector<decorated_keyspace_name>>(
_db.get_keyspaces()
| boost::adaptors::filtered([] (auto&& e) {
auto&& rs = e.second.get_replication_strategy();
return rs.is_vnode_based();
})
| boost::adaptors::transformed([this] (auto&& e) {
return decorated_keyspace_name{e.first, make_partition_key(e.first)};
}));
boost::sort(keyspace_names, [less = dht::ring_position_less_comparator(*_s)]
(const decorated_keyspace_name& l, const decorated_keyspace_name& r) {
return less(l.key, r.key);
});
for (const decorated_keyspace_name& e : keyspace_names) {
auto&& dk = e.key;
// Skip partitions owned by another shard or outside the queried range.
// has_keyspace() is re-checked on every iteration because the loop body
// suspends (co_await) between iterations.
if (!this_shard_owns(dk) || !contains_key(qr.partition_range(), dk) || !_db.has_keyspace(e.name)) {
continue;
}
// The ring description is fetched before the partition is opened.
std::vector<dht::token_range_endpoints> ranges = co_await _ss.describe_ring(e.name);
co_await result.emit_partition_start(dk);
boost::sort(ranges, [] (const dht::token_range_endpoints& l, const dht::token_range_endpoints& r) {
return l._start_token < r._start_token;
});
for (dht::token_range_endpoints& range : ranges) {
boost::sort(range._endpoint_details, endpoint_details_cmp());
for (const dht::endpoint_details& detail : range._endpoint_details) {
clustering_row cr(make_clustering_key(range._start_token, detail._host));
set_cell(cr.cells(), "end_token", sstring(range._end_token));
set_cell(cr.cells(), "dc", sstring(detail._datacenter));
set_cell(cr.cells(), "rack", sstring(detail._rack));
co_await result.emit_row(std::move(cr));
}
}
co_await result.emit_partition_end();
}
}
};
// Virtual table system.snapshots.
// Emits one partition per keyspace; each row is keyed by (table_name, snapshot_name)
// and carries the "live" and "total" values of the snapshot details, summed over
// all shards.
class snapshots_table : public streaming_virtual_table {
distributed<replica::database>& _db;
public:
explicit snapshots_table(distributed<replica::database>& db)
: streaming_virtual_table(build_schema())
, _db(db)
{
_shard_aware = true;
}
// Schema: partition key keyspace_name, clustering key (table_name, snapshot_name).
static schema_ptr build_schema() {
auto id = generate_legacy_id(system_keyspace::NAME, "snapshots");
return schema_builder(system_keyspace::NAME, "snapshots", std::make_optional(id))
.with_column("keyspace_name", utf8_type, column_kind::partition_key)
.with_column("table_name", utf8_type, column_kind::clustering_key)
.with_column("snapshot_name", utf8_type, column_kind::clustering_key)
.with_column("live", long_type)
.with_column("total", long_type)
.set_comment("Lists all the snapshots along with their size, dropped tables are not part of the listing.")
.with_version(system_keyspace::generate_schema_version(id))
.build();
}
// Builds the decorated partition key for a keyspace name.
dht::decorated_key make_partition_key(const sstring& name) {
return dht::decorate_key(*_s, partition_key::from_single_value(*_s, data_value(name).serialize_nonnull()));
}
// Builds the (table_name, snapshot_name) clustering key.
clustering_key make_clustering_key(sstring table_name, sstring snapshot_name) {
return clustering_key::from_exploded(*_s, {
data_value(std::move(table_name)).serialize_nonnull(),
data_value(std::move(snapshot_name)).serialize_nonnull()
});
}
// Streams the table contents into `result`, visiting keyspaces in
// ring-position order of their partition keys.
future<> execute(reader_permit permit, result_collector& result, const query_restrictions& qr) override {
struct decorated_keyspace_name {
sstring name;
dht::decorated_key key;
};
// Keep only the keyspaces whose partition this shard owns and that fall
// inside the queried partition range.
std::vector<decorated_keyspace_name> keyspace_names;
for (const auto& [name, _] : _db.local().get_keyspaces()) {
auto dk = make_partition_key(name);
if (!this_shard_owns(dk) || !contains_key(qr.partition_range(), dk)) {
continue;
}
// NOTE: `name` is bound through a const reference, so std::move(name) copies.
keyspace_names.push_back({std::move(name), std::move(dk)});
}
boost::sort(keyspace_names, [less = dht::ring_position_less_comparator(*_s)]
(const decorated_keyspace_name& l, const decorated_keyspace_name& r) {
return less(l.key, r.key);
});
// table name -> (snapshot name -> details); std::map keeps both levels sorted.
using snapshots_by_tables_map = std::map<sstring, std::map<sstring, replica::table::snapshot_details>>;
// Reducer for map_reduce(): merges per-shard maps, adding up `live` and `total`
// when the same table/snapshot pair is reported more than once.
class snapshot_reducer {
private:
snapshots_by_tables_map _result;
public:
future<> operator()(const snapshots_by_tables_map& value) {
// NOTE: `value` is const, so the std::move calls below copy rather than move.
for (auto& [table_name, snapshots] : value) {
// First time this table is seen: take its snapshot map as-is.
if (auto [_, added] = _result.try_emplace(table_name, std::move(snapshots)); added) {
continue;
}
auto& rp = _result.at(table_name);
for (auto&& [snapshot_name, snapshot_detail]: snapshots) {
if (auto [_, added] = rp.try_emplace(snapshot_name, std::move(snapshot_detail)); added) {
continue;
}
// Snapshot already present for this table: accumulate the sizes.
auto& detail = rp.at(snapshot_name);
detail.live += snapshot_detail.live;
detail.total += snapshot_detail.total;
}
}
return make_ready_future<>();
}
snapshots_by_tables_map get() && {
return std::move(_result);
}
};
for (auto& ks_data : keyspace_names) {
co_await result.emit_partition_start(ks_data.key);
// Gather the snapshot details of this keyspace's tables from every database instance.
const auto snapshots_by_tables = co_await _db.map_reduce(snapshot_reducer(), [ks_name_ = ks_data.name] (replica::database& db) mutable -> future<snapshots_by_tables_map> {
// The keyspace name is moved out of the capture into a local of the coroutine.
auto ks_name = std::move(ks_name_);
snapshots_by_tables_map snapshots_by_tables;
for (auto& [_, table] : db.get_column_families()) {
if (table->schema()->ks_name() != ks_name) {
continue;
}
const auto unordered_snapshots = co_await table->get_snapshot_details();
snapshots_by_tables.emplace(table->schema()->cf_name(), std::map<sstring, replica::table::snapshot_details>(unordered_snapshots.begin(), unordered_snapshots.end()));
}
co_return snapshots_by_tables;
});
// Iterating the nested std::maps yields rows ordered by (table_name, snapshot_name).
for (const auto& [table_name, snapshots] : snapshots_by_tables) {
for (auto& [snapshot_name, details] : snapshots) {
clustering_row cr(make_clustering_key(table_name, snapshot_name));
set_cell(cr.cells(), "live", details.live);
set_cell(cr.cells(), "total", details.total);
co_await result.emit_row(std::move(cr));
}
}
co_await result.emit_partition_end();
}
}
};
// Virtual table system.protocol_servers.
// Produces one partition per protocol server registered with the storage
// service, holding the server's protocol, protocol version and listen addresses.
class protocol_servers_table : public memtable_filling_virtual_table {
private:
    service::storage_service& _ss;

    // Plain-value snapshot of a protocol_server, taken on the shard that owns
    // the server so that it can be handed over to the querying shard.
    struct protocol_server_info {
        sstring name;
        sstring protocol;
        sstring protocol_version;
        // Each entry is formatted as "<address>:<port>".
        std::vector<sstring> listen_addresses;

        explicit protocol_server_info(protocol_server& s)
                : name(s.name())
                , protocol(s.protocol())
                , protocol_version(s.protocol_version()) {
            for (const auto& addr : s.listen_addresses()) {
                listen_addresses.push_back(format("{}:{}", addr.addr(), addr.port()));
            }
        }
    };
public:
    explicit protocol_servers_table(service::storage_service& ss)
            : memtable_filling_virtual_table(build_schema())
            , _ss(ss) {
        _shard_aware = true;
    }

    // Schema: partition key "name"; protocol, protocol_version and
    // listen_addresses (list<text>) are regular columns.
    static schema_ptr build_schema() {
        auto id = generate_legacy_id(system_keyspace::NAME, "protocol_servers");
        return schema_builder(system_keyspace::NAME, "protocol_servers", std::make_optional(id))
            .with_column("name", utf8_type, column_kind::partition_key)
            .with_column("protocol", utf8_type)
            .with_column("protocol_version", utf8_type)
            .with_column("listen_addresses", list_type_impl::get_instance(utf8_type, false))
            .set_comment("Lists all client protocol servers and their status.")
            .with_version(system_keyspace::generate_schema_version(id))
            .build();
    }

    // Feeds one mutation per protocol server whose partition key is owned by
    // this shard into mutation_sink.
    future<> execute(std::function<void(mutation)> mutation_sink) override {
        // Servers are registered on shard 0 only
        const auto server_infos = co_await smp::submit_to(0ul, [&ss = _ss.container()] {
            return boost::copy_range<std::vector<protocol_server_info>>(ss.local().protocol_servers()
                    | boost::adaptors::transformed([] (protocol_server* s) { return protocol_server_info(*s); }));
        });
        // Iterate by reference: each info holds three strings and a vector, and
        // nothing below needs a private copy of it.
        for (const auto& server : server_infos) {
            auto dk = dht::decorate_key(*_s, partition_key::from_single_value(*schema(), data_value(server.name).serialize_nonnull()));
            if (!this_shard_owns(dk)) {
                continue;
            }
            mutation m(schema(), std::move(dk));
            row& cr = m.partition().clustered_row(*schema(), clustering_key::make_empty()).cells();
            set_cell(cr, "protocol", server.protocol);
            set_cell(cr, "protocol_version", server.protocol_version);
            std::vector<data_value> addresses(server.listen_addresses.begin(), server.listen_addresses.end());
            set_cell(cr, "listen_addresses", make_list_value(schema()->get_column_definition("listen_addresses")->type, std::move(addresses)));
            mutation_sink(std::move(m));
        }
    }
};
// Virtual table system.runtime_info.
// Rows are (group, item) -> value. Single-valued items are stored as rows of the
// "generic" group; multi-valued items get a group (partition) of their own.
// Every shard only produces the partitions whose key it owns.
class runtime_info_table : public memtable_filling_virtual_table {
private:
    distributed<replica::database>& _db;
    service::storage_service& _ss;
    // Decorated key of the "generic" partition; engaged only when this shard owns it.
    std::optional<dht::decorated_key> _generic_key;
private:
    // Decorates `key` as a partition key of this table.
    // Returns std::nullopt when the resulting partition is not owned by this shard.
    std::optional<dht::decorated_key> maybe_make_key(sstring key) {
        auto dk = dht::decorate_key(*_s, partition_key::from_single_value(*schema(), data_value(std::move(key)).serialize_nonnull()));
        if (this_shard_owns(dk)) {
            return dk;
        }
        return std::nullopt;
    }

    // Builds one mutation for partition `key` with a clustered row per
    // (item, value) pair in `rows`, and hands it to mutation_sink.
    void do_add_partition(std::function<void(mutation)>& mutation_sink, dht::decorated_key key, std::vector<std::pair<sstring, sstring>> rows) {
        mutation m(schema(), std::move(key));
        for (auto&& [ckey, cvalue] : rows) {
            row& cr = m.partition().clustered_row(*schema(), clustering_key::from_single_value(*schema(), data_value(std::move(ckey)).serialize_nonnull())).cells();
            set_cell(cr, "value", std::move(cvalue));
        }
        mutation_sink(std::move(m));
    }

    // Adds a single (key, value) row to the "generic" partition, if this shard owns it.
    void add_partition(std::function<void(mutation)>& mutation_sink, sstring key, sstring value) {
        if (_generic_key) {
            do_add_partition(mutation_sink, *_generic_key, {{key, std::move(value)}});
        }
    }

    // Adds a partition named `key` holding `rows`, if this shard owns that partition.
    void add_partition(std::function<void(mutation)>& mutation_sink, sstring key, std::initializer_list<std::pair<sstring, sstring>> rows) {
        auto dk = maybe_make_key(std::move(key));
        if (dk) {
            do_add_partition(mutation_sink, std::move(*dk), std::move(rows));
        }
    }

    // Asynchronous variant of the single-value overload: value_producer is only
    // invoked when this shard owns the "generic" partition.
    future<> add_partition(std::function<void(mutation)>& mutation_sink, sstring key, std::function<future<sstring>()> value_producer) {
        if (_generic_key) {
            do_add_partition(mutation_sink, *_generic_key, {{key, co_await value_producer()}});
        }
    }

    // Asynchronous variant of the multi-row overload: value_producer is only
    // invoked when this shard owns the partition named `key`.
    future<> add_partition(std::function<void(mutation)>& mutation_sink, sstring key, std::function<future<std::vector<std::pair<sstring, sstring>>>()> value_producer) {
        auto dk = maybe_make_key(std::move(key));
        if (dk) {
            do_add_partition(mutation_sink, std::move(*dk), co_await value_producer());
        }
    }

    // Applies `map` to every table of every database instance and folds the
    // results with `reduce`, starting from a value-initialized T both within a
    // shard and across shards.
    template <typename T>
    future<T> map_reduce_tables(std::function<T(replica::table&)> map, std::function<T(T, T)> reduce = std::plus<T>{}) {
        // Accumulates the per-shard results handed over by sharded::map_reduce().
        class shard_reducer {
            T _v{};
            std::function<T(T, T)> _reduce;
        public:
            shard_reducer(std::function<T(T, T)> reduce) : _reduce(std::move(reduce)) { }
            future<> operator()(T v) {
                // Fold into the member: writing the result back into the by-value
                // parameter would drop it and leave get() returning T{}.
                _v = _reduce(_v, v);
                return make_ready_future<>();
            }
            T get() && { return std::move(_v); }
        };
        co_return co_await _db.map_reduce(shard_reducer(reduce), [map, reduce] (replica::database& db) {
            T val = {};
            for (auto& [_, table] : db.get_column_families()) {
                val = reduce(val, map(*table));
            }
            return val;
        });
    }

    // Runs `map` on every shard (0 .. smp::count - 1) and folds the results with
    // `reduce`, starting from `initial`.
    template <typename T>
    future<T> map_reduce_shards(std::function<T()> map, std::function<T(T, T)> reduce = std::plus<T>{}, T initial = {}) {
        co_return co_await map_reduce(
                boost::irange(0u, smp::count),
                [map] (shard_id shard) {
                    return smp::submit_to(shard, [map] {
                        return map();
                    });
                },
                initial,
                reduce);
    }

public:
    explicit runtime_info_table(distributed<replica::database>& db, service::storage_service& ss)
            : memtable_filling_virtual_table(build_schema())
            , _db(db)
            , _ss(ss) {
        _shard_aware = true;
        _generic_key = maybe_make_key("generic");
    }

    // Schema: partition key "group", clustering key "item", regular column "value".
    static schema_ptr build_schema() {
        auto id = generate_legacy_id(system_keyspace::NAME, "runtime_info");
        return schema_builder(system_keyspace::NAME, "runtime_info", std::make_optional(id))
            .with_column("group", utf8_type, column_kind::partition_key)
            .with_column("item", utf8_type, column_kind::clustering_key)
            .with_column("value", utf8_type)
            .set_comment("Runtime system information.")
            .with_version(system_keyspace::generate_schema_version(id))
            .build();
    }

    // Produces all partitions of the table that this shard owns.
    future<> execute(std::function<void(mutation)> mutation_sink) override {
        co_await add_partition(mutation_sink, "gossip_active", [this] () -> future<sstring> {
            return _ss.is_gossip_running().then([] (bool running){
                return format("{}", running);
            });
        });
        // Sum of live_disk_space_used over all tables on all shards.
        co_await add_partition(mutation_sink, "load", [this] () -> future<sstring> {
            return map_reduce_tables<int64_t>([] (replica::table& tbl) {
                return tbl.get_stats().live_disk_space_used;
            }).then([] (int64_t load) {
                return format("{}", load);
            });
        });
        add_partition(mutation_sink, "uptime", format("{} seconds", std::chrono::duration_cast<std::chrono::seconds>(engine().uptime()).count()));
        add_partition(mutation_sink, "trace_probability", format("{:.2}", tracing::tracing::get_local_tracing_instance().get_trace_probability()));
        co_await add_partition(mutation_sink, "memory", [this] () {
            struct stats {
                // take the pre-reserved memory into account, as seastar only returns
                // the stats of memory managed by the seastar allocator, but we instruct
                // it to reserve addition memory for non-seastar allocator on per-shard
                // basis.
                uint64_t total = 0;
                uint64_t free = 0;
                static stats reduce(stats a, stats b) {
                    return stats{
                        a.total + b.total + db::config::wasm_udf_reserved_memory,
                        a.free + b.free};
                }
            };
            return map_reduce_shards<stats>([] () {
                const auto& s = memory::stats();
                return stats{s.total_memory(), s.free_memory()};
            }, stats::reduce, stats{}).then([] (stats s) {
                return std::vector<std::pair<sstring, sstring>>{
                        {"total", format("{}", s.total)},
                        {"used", format("{}", s.total - s.free)},
                        {"free", format("{}", s.free)}};
            });
        });
        // Occupancy and partition count of the active memtables of all tables.
        co_await add_partition(mutation_sink, "memtable", [this] () {
            struct stats {
                uint64_t total = 0;
                uint64_t free = 0;
                uint64_t entries = 0;
                static stats reduce(stats a, stats b) { return stats{a.total + b.total, a.free + b.free, a.entries + b.entries}; }
            };
            return map_reduce_tables<stats>([] (replica::table& t) {
                logalloc::occupancy_stats s;
                uint64_t partition_count = 0;
                for (replica::memtable* active_memtable : t.active_memtables()) {
                    s += active_memtable->region().occupancy();
                    partition_count += active_memtable->partition_count();
                }
                return stats{s.total_space(), s.free_space(), partition_count};
            }, stats::reduce).then([] (stats s) {
                return std::vector<std::pair<sstring, sstring>>{
                        {"memory_total", format("{}", s.total)},
                        {"memory_used", format("{}", s.total - s.free)},
                        {"memory_free", format("{}", s.free)},
                        {"entries", format("{}", s.entries)}};
            });
        });
        // Row-cache occupancy per shard plus hit/miss counters of every table's cache.
        co_await add_partition(mutation_sink, "cache", [this] () {
            struct stats {
                uint64_t total = 0;
                uint64_t free = 0;
                uint64_t entries = 0;
                uint64_t hits = 0;
                uint64_t misses = 0;
                utils::rate_moving_average hits_moving_average;
                utils::rate_moving_average requests_moving_average;
                static stats reduce(stats a, stats b) {
                    return stats{
                        a.total + b.total,
                        a.free + b.free,
                        a.entries + b.entries,
                        a.hits + b.hits,
                        a.misses + b.misses,
                        a.hits_moving_average + b.hits_moving_average,
                        a.requests_moving_average + b.requests_moving_average};
                }
            };
            return _db.map_reduce0([] (replica::database& db) {
                stats res{};
                auto occupancy = db.row_cache_tracker().region().occupancy();
                res.total = occupancy.total_space();
                res.free = occupancy.free_space();
                res.entries = db.row_cache_tracker().partitions();
                for (const auto& [_, t] : db.get_column_families()) {
                    auto& cache_stats = t->get_row_cache().stats();
                    res.hits += cache_stats.hits.count();
                    res.misses += cache_stats.misses.count();
                    res.hits_moving_average += cache_stats.hits.rate();
                    res.requests_moving_average += (cache_stats.hits.rate() + cache_stats.misses.rate());
                }
                return res;
            }, stats{}, stats::reduce).then([] (stats s) {
                return std::vector<std::pair<sstring, sstring>>{
                        {"memory_total", format("{}", s.total)},
                        {"memory_used", format("{}", s.total - s.free)},
                        {"memory_free", format("{}", s.free)},
                        {"entries", format("{}", s.entries)},
                        {"hits", format("{}", s.hits)},
                        {"misses", format("{}", s.misses)},
                        // NOTE: with hits + misses == 0 this is 0.0 / 0.0, i.e. NaN.
                        {"hit_rate_total", format("{:.2}", static_cast<double>(s.hits) / static_cast<double>(s.hits + s.misses))},
                        {"hit_rate_recent", format("{:.2}", s.hits_moving_average.mean_rate)},
                        {"requests_total", format("{}", s.hits + s.misses)},
                        {"requests_recent", format("{}", static_cast<uint64_t>(s.requests_moving_average.mean_rate))}};
            });
        });
        // "true" when any keyspace on any shard reports incremental backups as enabled.
        co_await add_partition(mutation_sink, "incremental_backup_enabled", [this] () {
            return _db.map_reduce0([] (replica::database& db) {
                return boost::algorithm::any_of(db.get_keyspaces(), [] (const auto& id_and_ks) {
                    return id_and_ks.second.incremental_backups_enabled();
                });
            }, false, std::logical_or{}).then([] (bool res) -> sstring {
                return res ? "true" : "false";
            });
        });
    }
};
// Virtual table system.versions.
// Holds a single partition keyed "local" with the version, build mode and build
// id reported by scylla_version(), scylla_build_mode() and get_build_id().
class versions_table : public memtable_filling_virtual_table {
public:
    explicit versions_table()
            : memtable_filling_virtual_table(build_schema()) {
        _shard_aware = false;
    }

    // Schema: partition key "key"; version, build_mode and build_id are regular columns.
    static schema_ptr build_schema() {
        auto table_id = generate_legacy_id(system_keyspace::NAME, "versions");
        return schema_builder(system_keyspace::NAME, "versions", std::make_optional(table_id))
            .with_column("key", utf8_type, column_kind::partition_key)
            .with_column("version", utf8_type)
            .with_column("build_mode", utf8_type)
            .with_column("build_id", utf8_type)
            .set_comment("Version information.")
            .with_version(system_keyspace::generate_schema_version(table_id))
            .build();
    }

    // Emits the single "local" partition into mutation_sink; completes immediately.
    future<> execute(std::function<void(mutation)> mutation_sink) override {
        auto local_key = partition_key::from_single_value(*schema(), data_value("local").serialize_nonnull());
        mutation version_info(schema(), std::move(local_key));

        row& cells = version_info.partition().clustered_row(*schema(), clustering_key::make_empty()).cells();
        set_cell(cells, "version", scylla_version());
        set_cell(cells, "build_mode", scylla_build_mode());
        set_cell(cells, "build_id", get_build_id());

        mutation_sink(std::move(version_info));
        return make_ready_future<>();
    }
};
// Virtual table system.config.
// Reading lists every configuration value (name, type, source, value as JSON).
// Writing a row updates the named option on all shards, provided
// enable_cql_config_updates() is set and the option is live-updateable.
class db_config_table final : public streaming_virtual_table {
    db::config& _cfg;

    // Schema: partition key "name"; type, source and value are regular columns.
    static schema_ptr build_schema() {
        auto id = generate_legacy_id(system_keyspace::NAME, "config");
        return schema_builder(system_keyspace::NAME, "config", std::make_optional(id))
            .with_column("name", utf8_type, column_kind::partition_key)
            .with_column("type", utf8_type)
            .with_column("source", utf8_type)
            .with_column("value", utf8_type)
            .with_version(system_keyspace::generate_schema_version(id))
            .build();
    }

    // Streams one single-row partition per configuration value owned by this
    // shard, in ring-position order of the partition keys.
    future<> execute(reader_permit permit, result_collector& result, const query_restrictions& qr) override {
        struct config_entry {
            dht::decorated_key key;
            sstring_view type;
            sstring source;
            sstring value;
        };

        std::vector<config_entry> cfg;
        for (auto&& cfg_ref : _cfg.values()) {
            auto&& c = cfg_ref.get();
            dht::decorated_key dk = dht::decorate_key(*_s, partition_key::from_single_value(*_s, data_value(c.name()).serialize_nonnull()));
            if (this_shard_owns(dk)) {
                cfg.emplace_back(config_entry{ std::move(dk), c.type_name(), c.source_name(), c.value_as_json()._res });
            }
        }

        boost::sort(cfg, [less = dht::ring_position_less_comparator(*_s)]
                (const config_entry& l, const config_entry& r) {
            return less(l.key, r.key);
        });

        for (auto&& c : cfg) {
            co_await result.emit_partition_start(c.key);
            // The row is built directly as a clustering_row; no mutation object
            // is needed for streaming output.
            clustering_row cr(clustering_key::make_empty());
            set_cell(cr.cells(), "type", c.type);
            set_cell(cr.cells(), "source", c.source);
            set_cell(cr.cells(), "value", c.value);
            co_await result.emit_row(std::move(cr));
            co_await result.emit_partition_end();
        }
    }

    // Applies a write to the table as a configuration update.
    // Fails with virtual_table_update_exception when name or value is missing,
    // when the write touches "type" or "source", when the option does not exist,
    // is not live-updateable, or its value cannot be parsed. When CQL config
    // updates are disabled the call is forwarded to virtual_table::apply().
    virtual future<> apply(const frozen_mutation& fm) override {
        const mutation m = fm.unfreeze(_s);
        query::result_set rs(m);
        auto name = rs.row(0).get<sstring>("name");
        auto value = rs.row(0).get<sstring>("value");

        if (!_cfg.enable_cql_config_updates()) {
            return virtual_table::apply(fm); // will return back exceptional future
        }

        if (!name) {
            return make_exception_future<>(virtual_table_update_exception("option name is required"));
        }

        if (!value) {
            return make_exception_future<>(virtual_table_update_exception("option value is required"));
        }

        if (rs.row(0).cells().contains("type")) {
            return make_exception_future<>(virtual_table_update_exception("option type is immutable"));
        }

        if (rs.row(0).cells().contains("source")) {
            return make_exception_future<>(virtual_table_update_exception("option source is not updateable"));
        }

        // The lookup and the update are submitted to shard 0.
        return smp::submit_to(0, [&cfg = _cfg, name = std::move(*name), value = std::move(*value)] () mutable -> future<> {
            for (auto& c_ref : cfg.values()) {
                auto& c = c_ref.get();
                if (c.name() == name) {
                    // co_await is not allowed inside a catch handler, so the failure
                    // is recorded here and reported after the try block.
                    std::exception_ptr ex;
                    try {
                        if (co_await c.set_value_on_all_shards(value, utils::config_file::config_source::CQL)) {
                            co_return;
                        } else {
                            ex = std::make_exception_ptr(virtual_table_update_exception("option is not live-updateable"));
                        }
                    } catch (boost::bad_lexical_cast&) {
                        ex = std::make_exception_ptr(virtual_table_update_exception("cannot parse option value"));
                    }
                    co_await coroutine::return_exception_ptr(std::move(ex));
                }
            }
            co_await coroutine::return_exception(virtual_table_update_exception("no such option"));
        });
    }

public:
    explicit db_config_table(db::config& cfg)
            : streaming_virtual_table(build_schema())
            , _cfg(cfg)
    {
        _shard_aware = true;
    }
};
// Virtual table system.clients.
// Emits one partition per client IP address; rows are keyed by
// (port, client_type) and describe a single client connection as reported by
// the protocol servers' get_client_data().
class clients_table : public streaming_virtual_table {
    service::storage_service& _ss;

    // Schema: partition key "address", clustering key (port, client_type).
    static schema_ptr build_schema() {
        auto id = generate_legacy_id(system_keyspace::NAME, "clients");
        return schema_builder(system_keyspace::NAME, "clients", std::make_optional(id))
            .with_column("address", inet_addr_type, column_kind::partition_key)
            .with_column("port", int32_type, column_kind::clustering_key)
            .with_column("client_type", utf8_type, column_kind::clustering_key)
            .with_column("shard_id", int32_type)
            .with_column("connection_stage", utf8_type)
            .with_column("driver_name", utf8_type)
            .with_column("driver_version", utf8_type)
            .with_column("hostname", utf8_type)
            .with_column("protocol_version", int32_type)
            .with_column("ssl_cipher_suite", utf8_type)
            .with_column("ssl_enabled", boolean_type)
            .with_column("ssl_protocol", utf8_type)
            .with_column("username", utf8_type)
            .with_version(system_keyspace::generate_schema_version(id))
            .build();
    }

    // Builds the decorated partition key for a client address.
    dht::decorated_key make_partition_key(net::inet_address ip) {
        return dht::decorate_key(*_s, partition_key::from_single_value(*_s, data_value(ip).serialize_nonnull()));
    }

    // Builds the (port, client_type) clustering key.
    clustering_key make_clustering_key(int32_t port, sstring clt) {
        return clustering_key::from_exploded(*_s, {
            data_value(port).serialize_nonnull(),
            data_value(clt).serialize_nonnull()
        });
    }

    // Streams the table contents into `result` in three phases: collect client
    // data from every shard, group it by IP address, then emit partitions in
    // ring-position order.
    future<> execute(reader_permit permit, result_collector& result, const query_restrictions& qr) override {
        // Collect
        using client_data_vec = utils::chunked_vector<client_data>;
        using shard_client_data = std::vector<client_data_vec>;
        // One slot per shard; each shard fills in its own slot.
        std::vector<foreign_ptr<std::unique_ptr<shard_client_data>>> cd_vec;
        cd_vec.resize(smp::count);

        auto servers = co_await _ss.container().invoke_on(0, [] (auto& ss) { return ss.protocol_servers(); });
        co_await smp::invoke_on_all([&cd_vec_ = cd_vec, &servers_ = servers] () -> future<> {
            // Re-bind the captures to locals of the coroutine before the first co_await.
            auto& cd_vec = cd_vec_;
            auto& servers = servers_;

            auto scd = std::make_unique<shard_client_data>();
            for (const auto& ps : servers) {
                client_data_vec cds = co_await ps->get_client_data();
                if (cds.size() != 0) {
                    scd->emplace_back(std::move(cds));
                }
            }
            cd_vec[this_shard_id()] = make_foreign(std::move(scd));
        });

        // Partition
        struct decorated_ip {
            dht::decorated_key key;
            net::inet_address ip;

            struct compare {
                dht::ring_position_less_comparator less;
                explicit compare(const class schema& s) : less(s) {}
                bool operator()(const decorated_ip& a, const decorated_ip& b) const {
                    return less(a.key, b.key);
                }
            };
        };

        decorated_ip::compare cmp(*_s);
        // `ips` fixes the emission order; `cd_map` holds the connections of each IP.
        std::set<decorated_ip, decorated_ip::compare> ips(cmp);
        std::unordered_map<net::inet_address, client_data_vec> cd_map;
        for (unsigned i = 0; i < smp::count; i++) {
            for (auto&& ps_cdc : *cd_vec[i]) {
                for (auto&& cd : ps_cdc) {
                    if (cd_map.contains(cd.ip)) {
                        cd_map[cd.ip].emplace_back(std::move(cd));
                    } else {
                        // First connection seen for this IP: keep it only when this
                        // shard owns the partition and it is inside the queried range.
                        dht::decorated_key key = make_partition_key(cd.ip);
                        if (this_shard_owns(key) && contains_key(qr.partition_range(), key)) {
                            ips.insert(decorated_ip{std::move(key), cd.ip});
                            cd_map[cd.ip].emplace_back(std::move(cd));
                        }
                    }
                    co_await coroutine::maybe_yield();
                }
            }
        }

        // Emit
        for (const auto& dip : ips) {
            co_await result.emit_partition_start(dip.key);
            auto& clients = cd_map[dip.ip];

            // Lexicographic order on (port, client_type). The comparator has to be
            // a strict weak ordering, so client_type is compared only for equal ports.
            boost::sort(clients, [] (const client_data& a, const client_data& b) {
                if (a.port != b.port) {
                    return a.port < b.port;
                }
                return a.client_type_str() < b.client_type_str();
            });

            for (const auto& cd : clients) {
                clustering_row cr(make_clustering_key(cd.port, cd.client_type_str()));
                set_cell(cr.cells(), "shard_id", cd.shard_id);
                set_cell(cr.cells(), "connection_stage", cd.stage_str());
                // Optional attributes are written only when present.
                if (cd.driver_name) {
                    set_cell(cr.cells(), "driver_name", *cd.driver_name);
                }
                if (cd.driver_version) {
                    set_cell(cr.cells(), "driver_version", *cd.driver_version);
                }
                if (cd.hostname) {
                    set_cell(cr.cells(), "hostname", *cd.hostname);
                }
                if (cd.protocol_version) {
                    set_cell(cr.cells(), "protocol_version", *cd.protocol_version);
                }
                if (cd.ssl_cipher_suite) {
                    set_cell(cr.cells(), "ssl_cipher_suite", *cd.ssl_cipher_suite);
                }
                if (cd.ssl_enabled) {
                    set_cell(cr.cells(), "ssl_enabled", *cd.ssl_enabled);
                }
                if (cd.ssl_protocol) {
                    set_cell(cr.cells(), "ssl_protocol", *cd.ssl_protocol);
                }
                // A missing username is reported as "anonymous".
                set_cell(cr.cells(), "username", cd.username ? *cd.username : sstring("anonymous"));
                co_await result.emit_row(std::move(cr));
            }
            co_await result.emit_partition_end();
        }
    }

public:
    clients_table(service::storage_service& ss)
            : streaming_virtual_table(build_schema())
            , _ss(ss)
    {
        _shard_aware = true;
    }
};
// Shows the current state of each Raft group.
// Currently it shows only the configuration.
// In the future we plan to add additional columns with more information.
//
// One partition is emitted per Raft group (partition key: `group_id`). Within a partition,
// one row is emitted per member of the `current` configuration set, followed by one row per
// member of the `previous` configuration set.
class raft_state_table : public streaming_virtual_table {
private:
sharded<service::raft_group_registry>& _raft_gr;
public:
raft_state_table(sharded<service::raft_group_registry>& raft_gr)
: streaming_virtual_table(build_schema())
, _raft_gr(raft_gr) {
}
// Produces the table contents:
//  1. gathers the group IDs reported by each registry instance, paired with the shard that reported them;
//  2. sorts the groups by their decorated partition key;
//  3. for every group whose key falls in the queried partition range, fetches the group's
//     configuration from the owning shard and emits it as a partition.
// Groups that can no longer be found on their shard are silently skipped.
future<> execute(reader_permit permit, result_collector& result, const query_restrictions& qr) override {
// A group ID together with its partition key and the shard it was reported by.
struct decorated_gid {
raft::group_id gid;
dht::decorated_key key;
unsigned shard;
};
auto groups_and_shards = co_await _raft_gr.map([] (service::raft_group_registry& raft_gr) {
return std::pair{raft_gr.all_groups(), this_shard_id()};
});
std::vector<decorated_gid> decorated_gids;
for (auto& [groups, shard]: groups_and_shards) {
for (auto& gid: groups) {
decorated_gids.push_back(decorated_gid{gid, make_partition_key(gid), shard});
}
}
// Must return partitions in token order.
std::sort(decorated_gids.begin(), decorated_gids.end(), [less = dht::ring_position_less_comparator(*_s)]
(const decorated_gid& l, const decorated_gid& r) { return less(l.key, r.key); });
for (auto& [gid, dk, shard]: decorated_gids) {
// Skip groups outside of the range requested by the query.
if (!contains_key(qr.partition_range(), dk)) {
continue;
}
auto cfg_opt = co_await _raft_gr.invoke_on(shard,
[gid=gid] (service::raft_group_registry& raft_gr) -> std::optional<raft::configuration> {
// Be ready for a group to disappear while we're querying.
auto* srv = raft_gr.find_server(gid);
if (!srv) {
return std::nullopt;
}
// FIXME: the configuration returned here is obtained from raft::fsm, it may not be
// persisted yet, so this is not 100% correct. It may happen that we crash after
// a config entry is appended in-memory in fsm but before it's persisted. It would be
// incorrect to return the configuration observed during this window - after restart
// the configuration would revert to the previous one. Perhaps this is unlikely to
// happen in practice, but for correctness we should add a way of querying the
// latest persisted configuration.
return srv->get_configuration();
});
if (!cfg_opt) {
continue;
}
co_await result.emit_partition_start(dk);
// List current config first, because 'C' < 'P' and the disposition
// (ascii_type, 'CURRENT' vs 'PREVIOUS') is the first column in the clustering key.
co_await emit_member_set(result, "CURRENT", cfg_opt->current);
co_await emit_member_set(result, "PREVIOUS", cfg_opt->previous);
co_await result.emit_partition_end();
}
}
private:
// Builds the schema of `system.raft_state`:
// partition key `group_id`, clustering key (`disposition`, `server_id`), regular column `can_vote`.
static schema_ptr build_schema() {
auto id = generate_legacy_id(system_keyspace::NAME, "raft_state");
return schema_builder(system_keyspace::NAME, "raft_state", std::make_optional(id))
.with_column("group_id", timeuuid_type, column_kind::partition_key)
.with_column("disposition", ascii_type, column_kind::clustering_key) // can be 'CURRENT' or 'PREVIOUS'
.with_column("server_id", uuid_type, column_kind::clustering_key)
.with_column("can_vote", boolean_type)
.set_comment("Currently operating RAFT configuration")
.with_version(system_keyspace::generate_schema_version(id))
.build();
}
// Converts a Raft group ID into the decorated partition key of this table.
dht::decorated_key make_partition_key(raft::group_id gid) {
// Make sure to use timeuuid_native_type so comparisons are done correctly
// (we must emit partitions in the correct token order).
return dht::decorate_key(*_s, partition_key::from_single_value(
*_s, data_value(timeuuid_native_type{gid.uuid()}).serialize_nonnull()));
}
// Builds the clustering key (`disposition`, `server_id`) for a single configuration member.
clustering_key make_clustering_key(std::string_view disposition, raft::server_id id) {
return clustering_key::from_exploded(*_s, {
data_value(disposition).serialize_nonnull(),
data_value(id.uuid()).serialize_nonnull()
});
}
// Emits one clustering row per member of `set`, tagged with the given `disposition`.
// Each row carries the member's `can_vote` flag.
future<> emit_member_set(result_collector& result, std::string_view disposition,
const raft::config_member_set& set) {
// Must sort servers in clustering order (i.e. according to their IDs).
// This is how `config_member::operator<` works so no need for custom comparator.
std::vector<raft::config_member> members{set.begin(), set.end()};
std::sort(members.begin(), members.end());
for (auto& member: members) {
clustering_row cr{make_clustering_key(disposition, member.addr.id)};
set_cell(cr.cells(), "can_vote", member.can_vote);
co_await result.emit_row(std::move(cr));
}
}
};
// Map from a table's schema ID to the table itself. Keying by ID helps avoid accidental duplication.
// The map is thread_local, so every thread has its own instance; it owns the registered tables.
static thread_local std::map<table_id, std::unique_ptr<virtual_table>> virtual_tables;
void register_virtual_tables(distributed<replica::database>& dist_db, distributed<service::storage_service>& dist_ss, sharded<gms::gossiper>& dist_gossiper, sharded<service::raft_group_registry>& dist_raft_gr, db::config& cfg) {
    // Stores `vt` in the per-thread registry under the ID of its schema.
    const auto install = [] (std::unique_ptr<virtual_table>&& vt) {
        auto schema_id = vt->schema()->id();
        virtual_tables[schema_id] = std::move(vt);
    };

    // Local (this-shard) instances of the services some tables need.
    auto& local_db = dist_db.local();
    auto& local_ss = dist_ss.local();
    auto& local_gossiper = dist_gossiper.local();

    // Add built-in virtual tables here.
    install(std::make_unique<cluster_status_table>(local_ss, local_gossiper));
    install(std::make_unique<token_ring_table>(local_db, local_ss));
    install(std::make_unique<snapshots_table>(dist_db));
    install(std::make_unique<protocol_servers_table>(local_ss));
    install(std::make_unique<runtime_info_table>(dist_db, local_ss));
    install(std::make_unique<versions_table>());
    install(std::make_unique<db_config_table>(cfg));
    install(std::make_unique<clients_table>(local_ss));
    install(std::make_unique<raft_state_table>(dist_raft_gr));
}
// Returns the schemas of all system tables that should exist under the given configuration.
// Does not include virtual tables.
std::vector<schema_ptr> system_keyspace::all_tables(const db::config& cfg) {
    using feature = db::experimental_features_t::feature;

    // The schema tables always come first.
    auto schema_tables = db::schema_tables::all_tables(schema_features::full());
    std::vector<schema_ptr> tables(schema_tables.begin(), schema_tables.end());

    // Appends a group of schemas at the end, preserving their order.
    const auto append = [&tables] (std::initializer_list<schema_ptr> extra) {
        tables.insert(tables.end(), extra);
    };

    append({ built_indexes(), hints(), batchlog(), paxos(), local(),
             peers(), peer_events(), range_xfers(),
             compactions_in_progress(), compaction_history(),
             sstable_activity(), size_estimates(), large_partitions(), large_rows(), large_cells(),
             scylla_local(), db::schema_tables::scylla_table_schema_history(),
             repair_history(),
             v3::views_builds_in_progress(), v3::built_views(),
             v3::scylla_views_builds_in_progress(),
             v3::truncated(),
             v3::cdc_local(),
    });

    // Raft-related tables exist only with consistent cluster management;
    // some of them additionally depend on experimental features.
    if (cfg.consistent_cluster_management()) {
        append({raft(), raft_snapshots(), raft_snapshot_config(), group0_history(), discovery()});

        if (cfg.check_experimental(feature::RAFT)) {
            append({topology(), cdc_generations_v3()});
        }
        if (cfg.check_experimental(feature::BROADCAST_TABLES)) {
            append({broadcast_kv_store()});
        }
        if (cfg.check_experimental(feature::TABLETS)) {
            append({tablets()});
        }
    }

    if (cfg.check_experimental(feature::KEYSPACE_STORAGE_OPTIONS)) {
        append({sstables_registry()});
    }

    // legacy schema
    append({
        // TODO: once we migrate hints/batchlog and add convertor
        // legacy::hints(), legacy::batchlog(),
        legacy::keyspaces(), legacy::column_families(),
        legacy::columns(), legacy::triggers(), legacy::usertypes(),
        legacy::functions(), legacy::aggregates(), });

    return tables;
}
// Precondition: `register_virtual_tables` has finished.
static std::vector<schema_ptr> all_virtual_tables() {
std::vector<schema_ptr> r;
for (auto&& [_, vt] : virtual_tables) {
r.push_back(vt->schema());
}
return r;
}
// Attaches virtual readers to the tables whose content is computed rather than stored,
// and wires every registered virtual table to its column family (reader and writer).
static void install_virtual_readers(db::system_keyspace& sys_ks, replica::database& db) {
    // Tables served by dedicated reader implementations.
    auto& size_estimates_cf = db.find_column_family(system_keyspace::size_estimates());
    size_estimates_cf.set_virtual_reader(mutation_source(db::size_estimates::virtual_reader(db, sys_ks)));

    auto& views_builds_cf = db.find_column_family(system_keyspace::v3::views_builds_in_progress());
    views_builds_cf.set_virtual_reader(mutation_source(db::view::build_progress_virtual_reader(db)));

    auto& built_indexes_cf = db.find_column_family(system_keyspace::built_indexes());
    built_indexes_cf.set_virtual_reader(mutation_source(db::index::built_indexes_virtual_reader(db)));

    // Registered virtual tables: reads and writes are both delegated to the table object.
    for (auto& entry : virtual_tables) {
        auto& table_ptr = entry.second;
        auto& cf = db.find_column_family(table_ptr->schema());
        cf.set_virtual_reader(table_ptr->as_mutation_source());
        cf.set_virtual_writer([&table = *table_ptr] (const frozen_mutation& m) { return table.apply(m); });
    }
}
// Tells whether `s` is one of the system tables selected here: batchlog, paxos,
// scylla_views_builds_in_progress or raft. The result is passed to
// `create_local_system_table` by `system_keyspace::make`.
static bool maybe_write_in_user_memory(schema_ptr s) {
    if (s.get() == system_keyspace::batchlog().get()) {
        return true;
    }
    if (s.get() == system_keyspace::paxos().get()) {
        return true;
    }
    if (s == system_keyspace::v3::scylla_views_builds_in_progress()) {
        return true;
    }
    return s == system_keyspace::raft();
}
// Creates the local system tables that belong to the given load `phase`.
// The list of tables is derived from the database's own configuration.
future<> system_keyspace::make(
        locator::effective_replication_map_factory& erm_factory,
        replica::database& db, db::config& cfg, system_table_load_phase phase) {
    auto tables = system_keyspace::all_tables(db.get_config());
    for (auto& tbl : tables) {
        // Only tables scheduled for this phase are created now.
        if (tbl->static_props().load_phase == phase) {
            co_await db.create_local_system_table(tbl, maybe_write_in_user_memory(tbl), erm_factory);
        }
    }
}
// Registers the built-in virtual tables, creates a local system table for each of them
// and finally installs the virtual readers/writers.
future<> system_keyspace::initialize_virtual_tables(
        distributed<replica::database>& dist_db, distributed<service::storage_service>& dist_ss,
        sharded<gms::gossiper>& dist_gossiper, distributed<service::raft_group_registry>& dist_raft_gr,
        db::config& cfg) {
    register_virtual_tables(dist_db, dist_ss, dist_gossiper, dist_raft_gr, cfg);

    auto& local_db = dist_db.local();
    auto vt_schemas = all_virtual_tables();
    for (auto& vt_schema : vt_schemas) {
        co_await local_db.create_local_system_table(vt_schema, false, dist_ss.local().get_erm_factory());
    }

    install_virtual_readers(*this, local_db);
}
// Reads this node's host ID from the local system table.
// If no host ID is stored yet, a random one is generated and persisted instead.
future<locator::host_id> system_keyspace::load_local_host_id() {
    sstring select = format("SELECT host_id FROM system.{} WHERE key=?", LOCAL);
    auto rows = co_await execute_cql(select, sstring(LOCAL));

    const bool has_stored_id = !rows->empty() && rows->one().has("host_id");
    if (has_stored_id) {
        auto host_id = locator::host_id(rows->one().get_as<utils::UUID>("host_id"));
        slogger.info("Loaded local host id: {}", host_id);
        co_return host_id;
    }

    co_return co_await set_local_random_host_id();
}
// Generates a random host ID, stores it in the local system table,
// flushes that table and returns the new ID.
future<locator::host_id> system_keyspace::set_local_random_host_id() {
    auto new_id = locator::host_id::create_random_id();
    slogger.info("Setting local host id to {}", new_id);

    sstring insert = format("INSERT INTO system.{} (key, host_id) VALUES (?, ?)", LOCAL);
    co_await execute_cql(insert, sstring(LOCAL), new_id.uuid());
    co_await force_blocking_flush(LOCAL);

    co_return new_id;
}
// Returns a copy of the datacenter/rack information held in the cache object (`_cache->_local_dc_rack_info`).
locator::endpoint_dc_rack system_keyspace::local_dc_rack() const {
return _cache->_local_dc_rack_info;
}
// Queries all mutations of table `ks_name.cf_name`: full partition range, full slice, no timeout.
future<foreign_ptr<lw_shared_ptr<reconcilable_result>>>
system_keyspace::query_mutations(distributed<replica::database>& db, const sstring& ks_name, const sstring& cf_name) {
    auto table_schema = db.local().find_schema(ks_name, cf_name);
    return replica::query_mutations(
            db,
            table_schema,
            query::full_partition_range,
            table_schema->full_slice(),
            db::no_timeout);
}
// Queries the mutations of table `ks_name.cf_name` restricted to `partition_range` and to the
// clustering range `row_range`, without a timeout.
//
// The partition slice is heap-allocated and its owner is moved into the `finally` continuation,
// so the slice stays alive until the query future resolves.
// NOTE(review): `partition_range` is forwarded as-is and is not kept alive here; presumably the
// caller must keep it valid until the returned future resolves - confirm.
future<foreign_ptr<lw_shared_ptr<reconcilable_result>>>
system_keyspace::query_mutations(distributed<replica::database>& db, const sstring& ks_name, const sstring& cf_name, const dht::partition_range& partition_range, query::clustering_range row_range) {
auto schema = db.local().find_schema(ks_name, cf_name);
auto slice_ptr = std::make_unique<query::partition_slice>(partition_slice_builder(*schema)
.with_range(std::move(row_range))
.build());
// `*slice_ptr` is evaluated as part of the query call, before `slice_ptr` is moved into the continuation.
return replica::query_mutations(db, std::move(schema), partition_range, *slice_ptr, db::no_timeout).finally([slice_ptr = std::move(slice_ptr)] { });
}
// Reads the whole table `ks_name.cf_name` (full partition range, full slice, no timeout)
// and converts the raw result into a `query::result_set`.
future<lw_shared_ptr<query::result_set>>
system_keyspace::query(distributed<replica::database>& db, const sstring& ks_name, const sstring& cf_name) {
    schema_ptr table_schema = db.local().find_schema(ks_name, cf_name);
    auto raw = replica::query_data(db, table_schema, query::full_partition_range, table_schema->full_slice(), db::no_timeout);
    return std::move(raw).then([table_schema] (auto&& qr) {
        auto converted = query::result_set::from_raw_result(table_schema, table_schema->full_slice(), *qr);
        return make_lw_shared<query::result_set>(std::move(converted));
    });
}
// Reads the rows of a single partition (`key`) of table `ks_name.cf_name`, restricted to the
// clustering range `row_range`, and converts the raw result into a `query::result_set`.
//
// The partition range and the slice are heap-allocated; their owners are moved into the
// continuation so that both stay alive until the query result arrives.
future<lw_shared_ptr<query::result_set>>
system_keyspace::query(distributed<replica::database>& db, const sstring& ks_name, const sstring& cf_name, const dht::decorated_key& key, query::clustering_range row_range)
{
auto schema = db.local().find_schema(ks_name, cf_name);
auto pr_ptr = std::make_unique<dht::partition_range>(dht::partition_range::make_singular(key));
auto slice_ptr = std::make_unique<query::partition_slice>(partition_slice_builder(*schema)
.with_range(std::move(row_range))
.build());
// `*pr_ptr` and `*slice_ptr` are evaluated for the query call before the owners are moved into the lambda.
return replica::query_data(db, schema, *pr_ptr, *slice_ptr, db::no_timeout).then(
[schema, pr_ptr = std::move(pr_ptr), slice_ptr = std::move(slice_ptr)] (auto&& qr) {
// Note: the result is decoded with the schema's full slice, not with `*slice_ptr`.
return make_lw_shared<query::result_set>(query::result_set::from_raw_result(schema, schema->full_slice(), *qr));
});
}
// Converts the `rows_merged` statistics (int32 key -> int64 value) into the native
// representation of a CQL map: a sequence of (data_value, data_value) pairs.
// The input is only read; entries are emitted in the iteration order of `rows_merged`.
static map_type_impl::native_type prepare_rows_merged(std::unordered_map<int32_t, int64_t>& rows_merged) {
    map_type_impl::native_type tmp;
    // The final size is known up front, so allocate once instead of growing repeatedly.
    tmp.reserve(rows_merged.size());
    for (const auto& r : rows_merged) {
        // Explicitly typed copies make sure the int32/int64 `data_value` constructors are selected.
        const int32_t first = r.first;
        const int64_t second = r.second;
        tmp.emplace_back(data_value(first), data_value(second));
    }
    return tmp;
}
// Records one finished compaction in the compaction history table.
// Failures to write the record are logged and otherwise ignored.
future<> system_keyspace::update_compaction_history(utils::UUID uuid, sstring ksname, sstring cfname, int64_t compacted_at, int64_t bytes_in, int64_t bytes_out,
        std::unordered_map<int32_t, int64_t> rows_merged)
{
    // don't write anything when the history table itself is compacted, since that would in turn cause new compactions
    const bool compacting_history_table = ksname == "system" && cfname == COMPACTION_HISTORY;
    if (compacting_history_table) {
        return make_ready_future<>();
    }

    sstring req = format("INSERT INTO system.{} (id, keyspace_name, columnfamily_name, compacted_at, bytes_in, bytes_out, rows_merged) VALUES (?, ?, ?, ?, ?, ?, ?)"
                    , COMPACTION_HISTORY);
    db_clock::time_point tp{db_clock::duration{compacted_at}};

    // `rows_merged` is stored as a CQL map<int, bigint>.
    auto rows_merged_type = map_type_impl::get_instance(int32_type, long_type, true);
    auto rows_merged_value = make_map_value(rows_merged_type, prepare_rows_merged(rows_merged));

    return execute_cql(req, uuid, ksname, cfname, tp, bytes_in, bytes_out, std::move(rows_merged_value))
            .discard_result()
            .handle_exception([] (auto ep) {
        slogger.error("update compaction history failed: {}: ignored", ep);
    });
}
// Reads every row of the compaction history table and passes each one,
// converted to a `compaction_history_entry`, to `consumer`. Iteration never stops early.
future<> system_keyspace::get_compaction_history(compaction_history_consumer consumer) {
    sstring select_all = format("SELECT * from system.{}", COMPACTION_HISTORY);
    co_await _qp.query_internal(select_all, [&consumer] (const cql3::untyped_result_set::row& row) mutable -> future<stop_iteration> {
        compaction_history_entry e;

        // Identification of the compaction.
        e.id = row.get_as<utils::UUID>("id");
        e.ks = row.get_as<sstring>("keyspace_name");
        e.cf = row.get_as<sstring>("columnfamily_name");

        // Statistics.
        e.compacted_at = row.get_as<int64_t>("compacted_at");
        e.bytes_in = row.get_as<int64_t>("bytes_in");
        e.bytes_out = row.get_as<int64_t>("bytes_out");

        // `rows_merged` is only set when the column is present in the row.
        if (row.has("rows_merged")) {
            e.rows_merged = row.get_map<int32_t, int64_t>("rows_merged");
        }

        co_await consumer(std::move(e));
        co_return stop_iteration::no;
    });
}
// Inserts one entry into the repair history table.
future<> system_keyspace::update_repair_history(repair_history_entry entry) {
    sstring insert = format("INSERT INTO system.{} (table_uuid, repair_time, repair_uuid, keyspace_name, table_name, range_start, range_end) VALUES (?, ?, ?, ?, ?, ?, ?)", REPAIR_HISTORY);
    co_await execute_cql(insert,
            entry.table_uuid.uuid(),
            entry.ts,
            entry.id.uuid(),
            entry.ks,
            entry.cf,
            entry.range_start,
            entry.range_end).discard_result();
}
// Reads the repair history rows of the table identified by `table_id` and passes each one,
// converted to a `repair_history_entry`, to `f`. Iteration never stops early.
future<> system_keyspace::get_repair_history(::table_id table_id, repair_history_consumer f) {
    sstring select = format("SELECT * from system.{} WHERE table_uuid = {}", REPAIR_HISTORY, table_id);
    co_await _qp.query_internal(select, [&f] (const cql3::untyped_result_set::row& row) mutable -> future<stop_iteration> {
        repair_history_entry e;

        // What was repaired.
        e.table_uuid = ::table_id(row.get_as<utils::UUID>("table_uuid"));
        e.ks = row.get_as<sstring>("keyspace_name");
        e.cf = row.get_as<sstring>("table_name");
        e.range_start = row.get_as<int64_t>("range_start");
        e.range_end = row.get_as<int64_t>("range_end");

        // Which repair did it, and when.
        e.id = tasks::task_id(row.get_as<utils::UUID>("repair_uuid"));
        e.ts = row.get_as<db_clock::time_point>("repair_time");

        co_await f(std::move(e));
        co_return stop_iteration::no;
    });
}
// Computes the gossip generation to use for this start of the node, persists it in the
// local system table (followed by a flush of that table) and returns it.
//
// - If no generation is stored, the current generation number is used.
// - Otherwise the larger of "stored generation + 1" and the current generation number is used,
//   so the generation never goes backwards.
future<int> system_keyspace::increment_and_get_generation() {
    auto req = format("SELECT gossip_generation FROM system.{} WHERE key='{}'", LOCAL, LOCAL);
    auto rs = co_await _qp.execute_internal(req, cql3::query_processor::cache_internal::yes);
    gms::generation_type generation;
    if (rs->empty() || !rs->one().has("gossip_generation")) {
        // seconds-since-epoch isn't a foolproof new generation
        // (where foolproof is "guaranteed to be larger than the last one seen at this ip address"),
        // but it's as close as sanely possible
        generation = gms::get_generation_number();
    } else {
        // Other nodes will ignore gossip messages about a node that have a lower generation than previously seen.
        auto stored_generation = gms::generation_type(rs->one().template get_as<int>("gossip_generation") + 1);
        auto now = gms::get_generation_number();
        if (stored_generation >= now) {
            // The two literals below are concatenated by the compiler, so the first one
            // must end with a space to keep the sentences apart in the log output.
            slogger.warn("Using stored Gossip Generation {} as it is greater than current system time {}. "
                        "See CASSANDRA-3654 if you experience problems", stored_generation, now);
            generation = stored_generation;
        } else {
            generation = now;
        }
    }
    req = format("INSERT INTO system.{} (key, gossip_generation) VALUES ('{}', ?)", LOCAL, LOCAL);
    co_await _qp.execute_internal(req, {generation.value()}, cql3::query_processor::cache_internal::yes);
    // Flush the local table right after the write.
    co_await force_blocking_flush(LOCAL);
    co_return generation;
}
// Builds a single mutation for the size estimates table. The partition key is the keyspace
// name `ks`; every element of `estimates` becomes one clustering row (table name, range start
// token, range end token) carrying `mean_partition_size` and `partitions_count`.
// All cells share one freshly generated timestamp.
mutation system_keyspace::make_size_estimates_mutation(const sstring& ks, std::vector<system_keyspace::range_estimates> estimates) {
    schema_ptr schema = db::system_keyspace::size_estimates();
    auto timestamp = api::new_timestamp();

    auto pk = partition_key::from_single_value(*schema, utf8_type->decompose(ks));
    mutation m_to_apply{schema, std::move(pk)};

    for (auto& estimate : estimates) {
        std::vector<bytes> ck_components{
            utf8_type->decompose(estimate.schema->cf_name()), estimate.range_start_token, estimate.range_end_token};
        auto ck = clustering_key_prefix(std::move(ck_components));

        m_to_apply.set_clustered_cell(ck, "mean_partition_size", estimate.mean_partition_size, timestamp);
        m_to_apply.set_clustered_cell(ck, "partitions_count", estimate.partitions_count, timestamp);
    }

    return m_to_apply;
}
// Records, for the current shard, that building of view `ks_name.view_name` starts at `token`.
// The row is written with generation number 0 and the current shard's ID as `cpu_id`.
future<> system_keyspace::register_view_for_building(sstring ks_name, sstring view_name, const dht::token& token) {
    return execute_cql(
            format("INSERT INTO system.{} (keyspace_name, view_name, generation_number, cpu_id, first_token) VALUES (?, ?, ?, ?, ?)",
                    v3::SCYLLA_VIEWS_BUILDS_IN_PROGRESS),
            std::move(ks_name),
            std::move(view_name),
            0,
            int32_t(this_shard_id()),
            token.to_sstring()).discard_result();
}
// Stores `token` as the next token to process for view `ks_name.view_name` on the current shard.
future<> system_keyspace::update_view_build_progress(sstring ks_name, sstring view_name, const dht::token& token) {
    return execute_cql(
            format("INSERT INTO system.{} (keyspace_name, view_name, next_token, cpu_id) VALUES (?, ?, ?, ?)",
                    v3::SCYLLA_VIEWS_BUILDS_IN_PROGRESS),
            std::move(ks_name),
            std::move(view_name),
            token.to_sstring(),
            int32_t(this_shard_id())).discard_result();
}
// Deletes the build progress of view `ks_name.view_name` for every shard
// (the statement does not restrict `cpu_id`).
future<> system_keyspace::remove_view_build_progress_across_all_shards(sstring ks_name, sstring view_name) {
    auto delete_cql = format("DELETE FROM system.{} WHERE keyspace_name = ? AND view_name = ?", v3::SCYLLA_VIEWS_BUILDS_IN_PROGRESS);
    return execute_cql(std::move(delete_cql), std::move(ks_name), std::move(view_name)).discard_result();
}
// Deletes the build progress of view `ks_name.view_name` for the current shard only.
future<> system_keyspace::remove_view_build_progress(sstring ks_name, sstring view_name) {
    auto delete_cql = format("DELETE FROM system.{} WHERE keyspace_name = ? AND view_name = ? AND cpu_id = ?", v3::SCYLLA_VIEWS_BUILDS_IN_PROGRESS);
    return execute_cql(
            std::move(delete_cql),
            std::move(ks_name),
            std::move(view_name),
            int32_t(this_shard_id())).discard_result();
}
// Adds view `ks_name.view_name` to the table of built views.
future<> system_keyspace::mark_view_as_built(sstring ks_name, sstring view_name) {
    auto insert_cql = format("INSERT INTO system.{} (keyspace_name, view_name) VALUES (?, ?)", v3::BUILT_VIEWS);
    return execute_cql(std::move(insert_cql), std::move(ks_name), std::move(view_name)).discard_result();
}
// Removes view `ks_name.view_name` from the table of built views.
future<> system_keyspace::remove_built_view(sstring ks_name, sstring view_name) {
    auto delete_cql = format("DELETE FROM system.{} WHERE keyspace_name = ? AND view_name = ?", v3::BUILT_VIEWS);
    return execute_cql(std::move(delete_cql), std::move(ks_name), std::move(view_name)).discard_result();
}
// Loads the (keyspace name, view name) pairs of all views recorded as built.
future<std::vector<system_keyspace::view_name>> system_keyspace::load_built_views() {
    return execute_cql(format("SELECT * FROM system.{}", v3::BUILT_VIEWS)).then([] (::shared_ptr<cql3::untyped_result_set> rows) {
        std::vector<view_name> built;
        for (const cql3::untyped_result_set::row& row : *rows) {
            auto ks_name = row.get_as<sstring>("keyspace_name");
            auto cf_name = row.get_as<sstring>("view_name");
            built.emplace_back(std::move(ks_name), std::move(cf_name));
        }
        return built;
    });
}
// Loads the per-shard build progress of all views currently being built.
// If loading fails for any reason, a warning is logged and an empty list is returned.
future<std::vector<system_keyspace::view_build_progress>> system_keyspace::load_view_build_progress() {
    auto select_cql = format("SELECT keyspace_name, view_name, first_token, next_token, cpu_id FROM system.{}",
            v3::SCYLLA_VIEWS_BUILDS_IN_PROGRESS);
    return execute_cql(std::move(select_cql)).then([] (::shared_ptr<cql3::untyped_result_set> rows) {
        std::vector<view_build_progress> result;
        for (auto& row : *rows) {
            auto ks_name = row.get_as<sstring>("keyspace_name");
            auto cf_name = row.get_as<sstring>("view_name");
            auto first_token = dht::token::from_sstring(row.get_as<sstring>("first_token"));

            // `next_token` is optional: it stays empty when the column holds no value.
            std::optional<dht::token> next_token;
            if (auto next_token_str = row.get_opt<sstring>("next_token")) {
                next_token = dht::token::from_sstring(std::move(*next_token_str));
            }

            auto cpu_id = row.get_as<int32_t>("cpu_id");

            result.emplace_back(view_build_progress{
                    view_name(std::move(ks_name), std::move(cf_name)),
                    std::move(first_token),
                    std::move(next_token),
                    static_cast<shard_id>(cpu_id)});
        }
        return result;
    }).handle_exception([] (const std::exception_ptr& eptr) {
        slogger.warn("Failed to load view build progress: {}", eptr);
        return std::vector<view_build_progress>();
    });
}
// Loads the Paxos state stored for partition `key` of table `s`.
//
// Returns a default-constructed state when no row exists. Otherwise the state is assembled from:
//  - the promised ballot (`promise`), or the minimal time-UUID when the column is absent;
//  - the accepted proposal (`proposal_ballot` + `proposal`), if a proposal is stored;
//  - the most recent commit (`most_recent_commit_at` + `most_recent_commit`), if recorded.
// `now` is currently unused (see the FIXME below); `timeout` bounds the query.
future<service::paxos::paxos_state> system_keyspace::load_paxos_state(partition_key_view key, schema_ptr s, gc_clock::time_point now,
db::timeout_clock::time_point timeout) {
// The statement text is built once and reused by later calls.
static auto cql = format("SELECT * FROM system.{} WHERE row_key = ? AND cf_id = ?", PAXOS);
// FIXME: we need execute_cql_with_now()
(void)now;
auto f = qctx->execute_cql_with_timeout(cql, timeout, to_legacy(*key.get_compound_type(*s), key.representation()), s->id().uuid());
return f.then([s, key = std::move(key)] (shared_ptr<cql3::untyped_result_set> results) mutable {
if (results->empty()) {
return service::paxos::paxos_state();
}
auto& row = results->one();
auto promised = row.has("promise")
? row.get_as<utils::UUID>("promise") : utils::UUID_gen::min_time_UUID();
std::optional<service::paxos::proposal> accepted;
if (row.has("proposal")) {
accepted = service::paxos::proposal(row.get_as<utils::UUID>("proposal_ballot"),
ser::deserialize_from_buffer<>(row.get_blob("proposal"), boost::type<frozen_mutation>(), 0));
}
std::optional<service::paxos::proposal> most_recent;
if (row.has("most_recent_commit_at")) {
// the value can be missing if it was pruned, supply an empty one since
// it is not going to be used anyway
auto fm = row.has("most_recent_commit") ?
ser::deserialize_from_buffer<>(row.get_blob("most_recent_commit"), boost::type<frozen_mutation>(), 0) :
freeze(mutation(s, key));
most_recent = service::paxos::proposal(row.get_as<utils::UUID>("most_recent_commit_at"),
std::move(fm));
}
return service::paxos::paxos_state(promised, std::move(accepted), std::move(most_recent));
});
}
// Returns the TTL, in whole seconds, to use for Paxos state rows of table `s`.
static int32_t paxos_ttl_sec(const schema& s) {
    // Keep paxos state around for paxos_grace_seconds. If one of the Paxos participants
    // is down for longer than paxos_grace_seconds it is considered to be dead and must rebootstrap.
    // Otherwise its Paxos table state will be repaired by nodetool repair or Paxos repair.
    using std::chrono::seconds;
    const auto grace = std::chrono::duration_cast<seconds>(s.paxos_grace_seconds());
    return grace.count();
}
// Persists `ballot` as the promised ballot for partition `key` of table `s`.
// The write uses the ballot's timestamp and the table's Paxos TTL.
future<> system_keyspace::save_paxos_promise(const schema& s, const partition_key& key, const utils::UUID& ballot, db::timeout_clock::time_point timeout) {
    static auto cql = format("UPDATE system.{} USING TIMESTAMP ? AND TTL ? SET promise = ? WHERE row_key = ? AND cf_id = ?", PAXOS);

    auto row_key = to_legacy(*key.get_compound_type(s), key.representation());

    return qctx->execute_cql_with_timeout(cql,
            timeout,
            // USING TIMESTAMP ? AND TTL ?
            utils::UUID_gen::micros_timestamp(ballot),
            paxos_ttl_sec(s),
            // SET promise = ?
            ballot,
            // WHERE row_key = ? AND cf_id = ?
            std::move(row_key),
            s.id().uuid()
        ).discard_result();
}
// Persists an accepted proposal: the proposal's ballot is stored both as the promise and as
// the proposal ballot, together with the serialized update. The row is addressed by the key
// of the proposed update; the write uses the ballot's timestamp and the table's Paxos TTL.
future<> system_keyspace::save_paxos_proposal(const schema& s, const service::paxos::proposal& proposal, db::timeout_clock::time_point timeout) {
    static auto cql = format("UPDATE system.{} USING TIMESTAMP ? AND TTL ? SET promise = ?, proposal_ballot = ?, proposal = ? WHERE row_key = ? AND cf_id = ?", PAXOS);

    partition_key_view key = proposal.update.key();
    auto row_key = to_legacy(*key.get_compound_type(s), key.representation());

    return qctx->execute_cql_with_timeout(cql,
            timeout,
            // USING TIMESTAMP ? AND TTL ?
            utils::UUID_gen::micros_timestamp(proposal.ballot),
            paxos_ttl_sec(s),
            // SET promise = ?, proposal_ballot = ?, proposal = ?
            proposal.ballot,
            proposal.ballot,
            ser::serialize_to_buffer<bytes>(proposal.update),
            // WHERE row_key = ? AND cf_id = ?
            std::move(row_key),
            s.id().uuid()
        ).discard_result();
}
// Persists a learned Paxos decision as the most recent commit and clears the stored proposal.
future<> system_keyspace::save_paxos_decision(const schema& s, const service::paxos::proposal& decision, db::timeout_clock::time_point timeout) {
    // We always erase the last proposal when we learn about a new Paxos decision. The ballot
    // timestamp of the decision is used for entire mutation, so if the "erased" proposal is more
    // recent it will naturally stay on top.
    // Erasing the last proposal is just an optimization and does not affect correctness:
    // sp::begin_and_repair_paxos will exclude an accepted proposal if it is older than the most
    // recent commit.
    static auto cql = format("UPDATE system.{} USING TIMESTAMP ? AND TTL ? SET proposal_ballot = null, proposal = null,"
            " most_recent_commit_at = ?, most_recent_commit = ? WHERE row_key = ? AND cf_id = ?", PAXOS);

    partition_key_view key = decision.update.key();
    auto row_key = to_legacy(*key.get_compound_type(s), key.representation());

    return qctx->execute_cql_with_timeout(cql,
            timeout,
            // USING TIMESTAMP ? AND TTL ?
            utils::UUID_gen::micros_timestamp(decision.ballot),
            paxos_ttl_sec(s),
            // most_recent_commit_at = ?, most_recent_commit = ?
            decision.ballot,
            ser::serialize_to_buffer<bytes>(decision.update),
            // WHERE row_key = ? AND cf_id = ?
            std::move(row_key),
            s.id().uuid()
        ).discard_result();
}
// Deletes the stored `most_recent_commit` value for partition `key` of table `s`,
// using the timestamp of `ballot` for the deletion.
future<> system_keyspace::delete_paxos_decision(const schema& s, const partition_key& key, const utils::UUID& ballot, db::timeout_clock::time_point timeout) {
    // This should be called only if a learn stage succeeded on all replicas.
    // In this case we can remove learned paxos value using ballot's timestamp which
    // guarantees that if there is more recent round it will not be affected.
    static auto cql = format("DELETE most_recent_commit FROM system.{} USING TIMESTAMP ? WHERE row_key = ? AND cf_id = ?", PAXOS);

    auto row_key = to_legacy(*key.get_compound_type(s), key.representation());

    return qctx->execute_cql_with_timeout(cql,
            timeout,
            // USING TIMESTAMP ?
            utils::UUID_gen::micros_timestamp(ballot),
            // WHERE row_key = ? AND cf_id = ?
            std::move(row_key),
            s.id().uuid()
        ).discard_result();
}
// Loads the set of locally enabled features stored under the enabled-features key.
// Returns an empty set when nothing is stored.
future<std::set<sstring>> system_keyspace::load_local_enabled_features() {
    auto stored = co_await get_scylla_local_param(gms::feature_service::ENABLED_FEATURES_KEY);
    if (!stored) {
        co_return std::set<sstring>{};
    }
    std::set<sstring> features = gms::feature_service::to_feature_set(*stored);
    co_return features;
}
// Stores `features` under the enabled-features key as a single comma-separated string.
future<> system_keyspace::save_local_enabled_features(std::set<sstring> features) {
    auto joined = fmt::to_string(fmt::join(features, ","));
    co_await set_scylla_local_param(gms::feature_service::ENABLED_FEATURES_KEY, joined);
}
// Returns the stored Raft group 0 ID, or a default-constructed UUID when none is stored.
future<utils::UUID> system_keyspace::get_raft_group0_id() {
    auto stored = co_await get_scylla_local_param_as<utils::UUID>("raft_group0_id");
    if (stored) {
        co_return *stored;
    }
    co_return utils::UUID{};
}
// Stores `uuid` as the Raft group 0 ID under the "raft_group0_id" key
// (the same key that get_raft_group0_id() reads).
future<> system_keyspace::set_raft_group0_id(utils::UUID uuid) {
return set_scylla_local_param_as<utils::UUID>("raft_group0_id", uuid);
}
// Partition key used for the group 0 history entries read and written by the functions below.
static constexpr auto GROUP0_HISTORY_KEY = "history";
// Returns the `state_id` of the first row of the group 0 history partition,
// or a default-constructed UUID when the partition is empty.
future<utils::UUID> system_keyspace::get_last_group0_state_id() {
    auto select_cql = format(
            "SELECT state_id FROM system.{} WHERE key = '{}' LIMIT 1",
            GROUP0_HISTORY, GROUP0_HISTORY_KEY);
    auto rs = co_await qctx->execute_cql(std::move(select_cql));
    assert(rs);

    if (!rs->empty()) {
        co_return rs->one().get_as<utils::UUID>("state_id");
    }
    co_return utils::UUID{};
}
// Tells whether the group 0 history partition contains an entry with the given `state_id`.
future<bool> system_keyspace::group0_history_contains(utils::UUID state_id) {
    auto select_cql = format(
            "SELECT state_id FROM system.{} WHERE key = '{}' AND state_id = ?",
            GROUP0_HISTORY, GROUP0_HISTORY_KEY);
    auto rs = co_await qctx->execute_cql(std::move(select_cql), state_id);
    assert(rs);

    const bool found = !rs->empty();
    co_return found;
}
// Builds a mutation for the group 0 history table that adds an entry for `state_id`.
//
// - The row is created with a row marker whose timestamp is derived from `state_id`.
// - If `description` is non-empty it is stored in the `description` column with the same timestamp.
// - If `gc_older_than` is set, a range tombstone is added that deletes the entries whose IDs are
//   smaller than the minimal time-UUID at (timestamp of `state_id` - `gc_older_than`).
//
// Asserts that `gc_older_than`, when given, is non-negative and smaller than the timestamp of `state_id`.
mutation system_keyspace::make_group0_history_state_id_mutation(
utils::UUID state_id, std::optional<gc_clock::duration> gc_older_than, std::string_view description) {
auto s = group0_history();
mutation m(s, partition_key::from_singular(*s, GROUP0_HISTORY_KEY));
auto& row = m.partition().clustered_row(*s, clustering_key::from_singular(*s, state_id));
// The timestamp embedded in the state ID is used for everything written by this mutation.
auto ts = utils::UUID_gen::micros_timestamp(state_id);
row.apply(row_marker(ts));
if (!description.empty()) {
auto cdef = s->get_column_definition("description");
assert(cdef);
row.cells().apply(*cdef, atomic_cell::make_live(*cdef->type, ts, cdef->type->decompose(description)));
}
if (gc_older_than) {
using namespace std::chrono;
assert(*gc_older_than >= gc_clock::duration{0});
auto ts_micros = microseconds{ts};
auto gc_older_than_micros = duration_cast<microseconds>(*gc_older_than);
assert(gc_older_than_micros < ts_micros);
auto tomb_upper_bound = utils::UUID_gen::min_time_UUID(ts_micros - gc_older_than_micros);
// We want to delete all entries with IDs smaller than `tomb_upper_bound`
// but the deleted range is of the form (x, +inf) since the schema is reversed.
auto range = query::clustering_range::make_starting_with({
clustering_key_prefix::from_single_value(*s, timeuuid_type->decompose(tomb_upper_bound)), false});
auto bv = bound_view::from_range(range);
m.partition().apply_delete(*s, range_tombstone{bv.first, bv.second, tombstone{ts, gc_clock::now()}});
}
return m;
}
// Returns the group 0 history partition as a mutation.
// Partitions with any other key are reported with a warning and skipped; when the expected
// partition is missing, a warning is logged and an empty mutation for that key is returned.
future<mutation> system_keyspace::get_group0_history(distributed<replica::database>& db) {
    auto s = group0_history();
    auto rs = co_await db::system_keyspace::query_mutations(db, db::system_keyspace::NAME, db::system_keyspace::GROUP0_HISTORY);
    assert(rs);

    for (auto& part : rs->partitions()) {
        auto mut = part.mut().unfreeze(s);
        auto key_str = value_cast<sstring>(utf8_type->deserialize(mut.key().get_component(*s, 0)));

        if (key_str != GROUP0_HISTORY_KEY) {
            slogger.warn("get_group0_history: unexpected partition in group0 history table: {}", key_str);
            continue;
        }

        co_return mut;
    }

    slogger.warn("get_group0_history: '{}' partition not found", GROUP0_HISTORY_KEY);
    co_return mutation(s, partition_key::from_singular(*s, GROUP0_HISTORY_KEY));
}
// Key under which the group 0 upgrade state is loaded and saved as a scylla-local parameter.
static constexpr auto GROUP0_UPGRADE_STATE_KEY = "group0_upgrade_state";
// Loads the stored group 0 upgrade state as a string; the optional is empty when the
// underlying parameter lookup yields no value.
future<std::optional<sstring>> system_keyspace::load_group0_upgrade_state() {
return get_scylla_local_param_as<sstring>(GROUP0_UPGRADE_STATE_KEY);
}
// Stores `value` as the group 0 upgrade state.
future<> system_keyspace::save_group0_upgrade_state(sstring value) {
return set_scylla_local_param(GROUP0_UPGRADE_STATE_KEY, value);
}
// Key under which the "must synchronize topology" flag is loaded and saved as a scylla-local parameter.
static constexpr auto MUST_SYNCHRONIZE_TOPOLOGY_KEY = "must_synchronize_topology";
// Returns the stored "must synchronize topology" flag; a missing value counts as `false`.
future<bool> system_keyspace::get_must_synchronize_topology() {
    auto stored = co_await get_scylla_local_param_as<bool>(MUST_SYNCHRONIZE_TOPOLOGY_KEY);
    if (!stored) {
        co_return false;
    }
    co_return *stored;
}
// Stores `value` as the "must synchronize topology" flag.
future<> system_keyspace::set_must_synchronize_topology(bool value) {
return set_scylla_local_param_as<bool>(MUST_SYNCHRONIZE_TOPOLOGY_KEY, value);
}
// Converts the native representation of a CQL set of strings into a `std::set<sstring>`.
// The input is only read: its elements are const, so each string is copied into the result.
static std::set<sstring> decode_features(const set_type_impl::native_type& features) {
    std::set<sstring> fset;
    for (const auto& f : features) {
        // No std::move here: `f` refers to a const element, so moving from it is not possible.
        fset.insert(value_cast<sstring>(f));
    }
    return fset;
}
future<service::topology> system_keyspace::load_topology_state() {
auto rs = co_await qctx->execute_cql(
format("SELECT * FROM system.{} WHERE key = '{}'", TOPOLOGY, TOPOLOGY));
assert(rs);
service::topology_state_machine::topology_type ret;
if (rs->empty()) {
co_return ret;
}
for (auto& row : *rs) {
raft::server_id host_id{row.get_as<utils::UUID>("host_id")};
auto datacenter = row.get_as<sstring>("datacenter");
auto rack = row.get_as<sstring>("rack");
auto release_version = row.get_as<sstring>("release_version");
uint32_t num_tokens = row.get_as<int32_t>("num_tokens");
size_t shard_count = row.get_as<int32_t>("shard_count");
uint8_t ignore_msb = row.get_as<int32_t>("ignore_msb");
service::node_state nstate = service::node_state_from_string(row.get_as<sstring>("node_state"));
std::optional<service::ring_slice> ring_slice;
if (row.has("tokens")) {
auto tokens = decode_tokens(deserialize_set_column(*topology(), row, "tokens"));
if (tokens.empty()) {
on_fatal_internal_error(slogger, format(
"load_topology_state: node {} has tokens column present but tokens are empty",
host_id));
}
ring_slice = service::ring_slice {
.tokens = std::move(tokens),
};
}
std::optional<raft::server_id> replaced_id;
if (row.has("replaced_id")) {
replaced_id = raft::server_id(row.get_as<utils::UUID>("replaced_id"));
}
std::optional<sstring> rebuild_option;
if (row.has("rebuild_option")) {
rebuild_option = row.get_as<sstring>("rebuild_option");
}
std::set<sstring> supported_features;
if (row.has("supported_features")) {
supported_features = decode_features(deserialize_set_column(*topology(), row, "supported_features"));
}
if (row.has("topology_request")) {
auto req = service::topology_request_from_string(row.get_as<sstring>("topology_request"));
ret.requests.emplace(host_id, req);
switch(req) {
case service::topology_request::replace:
if (!replaced_id) {
on_internal_error(slogger, fmt::format("replaced_id is missing for a node {}", host_id));
}
ret.req_param.emplace(host_id, *replaced_id);
break;
case service::topology_request::rebuild:
if (!rebuild_option) {
on_internal_error(slogger, fmt::format("rebuild_option is missing for a node {}", host_id));
}
ret.req_param.emplace(host_id, *rebuild_option);
break;
case service::topology_request::join:
ret.req_param.emplace(host_id, num_tokens);
break;
default:
// no parameters for other requests
break;
}
} else {
switch (nstate) {
case service::node_state::replacing:
// If a node is replacing another node we need to know which node it is replacing
if (!replaced_id) {
on_internal_error(slogger, fmt::format("replaced_id is missing for a node {}", host_id));
}
ret.req_param.emplace(host_id, *replaced_id);
break;
case service::node_state::rebuilding:
// If a node is rebuilding it needs to know the parameter for the operation
if (!rebuild_option) {
on_internal_error(slogger, fmt::format("rebuild_option is missing for a node {}", host_id));
}
ret.req_param.emplace(host_id, *rebuild_option);
break;
default:
// no parameters for other operations
break;
}
}
std::unordered_map<raft::server_id, service::replica_state>* map = nullptr;
if (nstate == service::node_state::normal) {
map = &ret.normal_nodes;
if (!ring_slice) {
on_fatal_internal_error(slogger, format(
"load_topology_state: node {} in normal state but missing ring slice", host_id));
}
} else if (nstate == service::node_state::left) {
ret.left_nodes.emplace(host_id);
} else if (nstate == service::node_state::none) {
map = &ret.new_nodes;
} else {
map = &ret.transition_nodes;
if (nstate != service::node_state::left_token_ring && !ring_slice) {
on_fatal_internal_error(slogger, format(
"load_topology_state: node {} in transitioning state but missing ring slice", host_id));
}
}
if (map) {
map->emplace(host_id, service::replica_state{
nstate, std::move(datacenter), std::move(rack), std::move(release_version),
ring_slice, shard_count, ignore_msb, std::move(supported_features)});
}
}
{
// Here we access static columns, any row will do.
auto& some_row = *rs->begin();
if (some_row.has("version")) {
ret.version = some_row.get_as<service::topology::version_t>("version");
}
if (some_row.has("transition_state")) {
ret.tstate = service::transition_state_from_string(some_row.get_as<sstring>("transition_state"));
} else {
// Any remaining transition_nodes must be in left_token_ring state
auto it = std::find_if(ret.transition_nodes.begin(), ret.transition_nodes.end(),
[] (auto& p) { return p.second.state != service::node_state::left_token_ring; });
if (it != ret.transition_nodes.end()) {
on_internal_error(slogger, format(
"load_topology_state: topology not in transition state"
" but transition node {} in state {} is present", it->first, it->second.state));
}
}
if (some_row.has("new_cdc_generation_data_uuid")) {
ret.new_cdc_generation_data_uuid = some_row.get_as<utils::UUID>("new_cdc_generation_data_uuid");
}
if (some_row.has("current_cdc_generation_uuid")) {
auto gen_uuid = some_row.get_as<utils::UUID>("current_cdc_generation_uuid");
if (!some_row.has("current_cdc_generation_timestamp")) {
on_internal_error(slogger, format(
"load_topology_state: current CDC generation UUID ({}) present, but timestamp missing", gen_uuid));
}
auto gen_ts = some_row.get_as<db_clock::time_point>("current_cdc_generation_timestamp");
ret.current_cdc_generation_id = cdc::generation_id_v2 {
.ts = gen_ts,
.id = gen_uuid
};
// Sanity check for CDC generation data consistency.
{
auto gen_rows = co_await qctx->execute_cql(
format("SELECT count(range_end) as cnt, num_ranges FROM system.{} WHERE id = ?",
CDC_GENERATIONS_V3),
gen_uuid);
assert(gen_rows);
if (gen_rows->empty()) {
on_internal_error(slogger, format(
"load_topology_state: current CDC generation UUID ({}) present, but data missing", gen_uuid));
}
auto& row = gen_rows->one();
auto counted_ranges = row.get_as<int64_t>("cnt");
auto num_ranges = row.get_as<int32_t>("num_ranges");
if (counted_ranges != num_ranges) {
on_internal_error(slogger, format(
"load_topology_state: inconsistency in CDC generation data (UUID {}):"
" counted {} ranges, should be {}", gen_uuid, counted_ranges, num_ranges));
}
}
} else {
if (!ret.normal_nodes.empty()) {
on_internal_error(slogger,
"load_topology_state: normal nodes present but no current CDC generation ID");
}
}
if (some_row.has("global_topology_request")) {
auto req = service::global_topology_request_from_string(
some_row.get_as<sstring>("global_topology_request"));
ret.global_request.emplace(req);
}
if (some_row.has("enabled_features")) {
ret.enabled_features = decode_features(deserialize_set_column(*topology(), some_row, "enabled_features"));
}
}
co_return ret;
}
// Fetches the "topology_fence_version" value via get_scylla_local_param_as<int64_t>().
// Resolves to 0 when that lookup yields no value.
future<int64_t> system_keyspace::get_topology_fence_version() {
    const auto stored_version = co_await get_scylla_local_param_as<int64_t>("topology_fence_version");
    co_return stored_version ? *stored_version : int64_t{0};
}
// Stores `value` under the "topology_fence_version" key via set_scylla_local_param_as<int64_t>().
// The returned future is the one produced by that call.
future<> system_keyspace::update_topology_fence_version(int64_t value) {
    static constexpr auto param_key = "topology_fence_version";
    return set_scylla_local_param_as<int64_t>(param_key, value);
}
// Reads all rows stored for CDC generation `id` in the CDC_GENERATIONS_V3 table and
// assembles them into a cdc::topology_description (one token_range_description per row).
//
// Raises an internal error when no rows were read, and throws std::runtime_error when
// the number of rows read differs from the value of the `num_ranges` column.
future<cdc::topology_description>
system_keyspace::read_cdc_generation(utils::UUID id) {
    std::vector<cdc::token_range_description> range_descriptions;
    // Overwritten by every row; after the query it holds the value seen in the last row.
    int claimed_ranges = 0;

    co_await _qp.query_internal(
            format("SELECT range_end, streams, ignore_msb, num_ranges FROM {}.{} WHERE id = ?",
                    NAME, CDC_GENERATIONS_V3),
            db::consistency_level::ONE,
            { id },
            1000, // for ~1KB rows, ~1MB page size
            [&range_descriptions, &claimed_ranges] (const cql3::untyped_result_set_row& row) {
        std::vector<cdc::stream_id> stream_ids;
        row.get_list_data<bytes>("streams", std::back_inserter(stream_ids));

        auto range_end = dht::token::from_int64(row.get_as<int64_t>("range_end"));
        const auto ignore_msb = static_cast<uint8_t>(row.get_as<int8_t>("ignore_msb"));
        range_descriptions.push_back(cdc::token_range_description{
                std::move(range_end), std::move(stream_ids), ignore_msb});

        claimed_ranges = row.get_as<int32_t>("num_ranges");
        // Never stop early: every row of the partition is consumed.
        return make_ready_future<stop_iteration>(stop_iteration::no);
    });

    if (range_descriptions.empty()) {
        // The data must be present by precondition.
        on_internal_error(slogger, format(
            "read_cdc_generation: data for CDC generation {} not present", id));
    }

    // Same conversion the implicit int/size_t comparison would perform, spelled out.
    const bool row_count_matches = range_descriptions.size() == static_cast<size_t>(claimed_ranges);
    if (!row_count_matches) {
        throw std::runtime_error(format(
            "read_cdc_generation: wrong number of rows. The `num_ranges` column claimed {} rows,"
            " but reading the partition returned {}.", claimed_ranges, range_descriptions.size()));
    }

    co_return cdc::topology_description{std::move(range_descriptions)};
}
// Inserts one row into the SSTABLES_REGISTRY table for the sstable identified by
// (`location`, `desc.generation`), recording `uuid`, `status` and the textual forms of
// `desc.version` and `desc.format`. The query result is discarded.
future<> system_keyspace::sstables_registry_create_entry(sstring location, utils::UUID uuid, sstring status, sstables::entry_descriptor desc) {
    // The statement text is built on first use and reused by later calls.
    static const auto insert_stmt = format("INSERT INTO system.{} (location, generation, uuid, status, version, format) VALUES (?, ?, ?, ?, ?, ?)", SSTABLES_REGISTRY);
    slogger.trace("Inserting {}.{}:{} into {}", location, desc.generation, uuid, SSTABLES_REGISTRY);

    auto version_text = fmt::to_string(desc.version);
    auto format_text = fmt::to_string(desc.format);
    co_await execute_cql(insert_stmt, location, desc.generation, uuid, status,
            std::move(version_text), std::move(format_text)).discard_result();
}
// Looks up the `uuid` column of the SSTABLES_REGISTRY row keyed by (`location`, `gen`).
//
// Fails with std::runtime_error("No entry in sstables registry") when the query returns
// no rows or the returned row has no `uuid` value.
future<utils::UUID> system_keyspace::sstables_registry_lookup_entry(sstring location, sstables::generation_type gen) {
    static const auto select_stmt = format("SELECT uuid FROM system.{} WHERE location = ? AND generation = ?", SSTABLES_REGISTRY);
    slogger.trace("Looking up {}.{} in {}", location, gen, SSTABLES_REGISTRY);

    auto result = co_await execute_cql(select_stmt, location, gen);

    // one() is only consulted once the result is known to be non-empty.
    const bool has_uuid = !result->empty() && result->one().has("uuid");
    if (!has_uuid) {
        slogger.trace("ERROR: Cannot find {}.{} in {}", location, gen, SSTABLES_REGISTRY);
        co_await coroutine::return_exception(std::runtime_error("No entry in sstables registry"));
    }

    auto entry_uuid = result->one().get_as<utils::UUID>("uuid");
    slogger.trace("Found {}.{}:{} in {}", location, gen, entry_uuid, SSTABLES_REGISTRY);
    co_return entry_uuid;
}
// Sets the `status` column of the SSTABLES_REGISTRY row keyed by (`location`, `gen`)
// to `status`. The query result is discarded.
future<> system_keyspace::sstables_registry_update_entry_status(sstring location, sstables::generation_type gen, sstring status) {
    static const auto update_stmt = format("UPDATE system.{} SET status = ? WHERE location = ? AND generation = ?", SSTABLES_REGISTRY);
    slogger.trace("Updating {}.{} -> {} in {}", location, gen, status, SSTABLES_REGISTRY);

    auto update_done = execute_cql(update_stmt, status, location, gen).discard_result();
    co_await std::move(update_done);
}
// Deletes the SSTABLES_REGISTRY row keyed by (`location`, `gen`).
// The query result is discarded.
future<> system_keyspace::sstables_registry_delete_entry(sstring location, sstables::generation_type gen) {
    static const auto delete_stmt = format("DELETE FROM system.{} WHERE location = ? AND generation = ?", SSTABLES_REGISTRY);
    slogger.trace("Removing {}.{} from {}", location, gen, SSTABLES_REGISTRY);

    auto delete_done = execute_cql(delete_stmt, location, gen).discard_result();
    co_await std::move(delete_done);
}
// Streams every SSTABLES_REGISTRY row stored under `location` to `consumer`.
//
// For each row the consumer is awaited with the row's uuid, its status string and an
// entry_descriptor built from the row's generation, version and format (component type
// TOC, remaining string fields left empty). Iteration never stops early.
future<> system_keyspace::sstables_registry_list(sstring location, sstable_registry_entry_consumer consumer) {
    static const auto list_stmt = format("SELECT uuid, status, generation, version, format FROM system.{} WHERE location = ?", SSTABLES_REGISTRY);
    slogger.trace("Listing {} entries from {}", location, SSTABLES_REGISTRY);

    auto on_row = [ consumer = std::move(consumer) ] (const cql3::untyped_result_set::row& row) -> future<stop_iteration> {
        auto entry_uuid = row.get_as<utils::UUID>("uuid");
        auto entry_status = row.get_as<sstring>("status");
        auto generation = sstables::generation_type(row.get_as<utils::UUID>("generation"));
        auto sst_version = sstables::version_from_string(row.get_as<sstring>("version"));
        auto sst_format = sstables::format_from_string(row.get_as<sstring>("format"));

        sstables::entry_descriptor descriptor("", "", "", generation, sst_version, sst_format, sstables::component_type::TOC);
        co_await consumer(std::move(entry_uuid), std::move(entry_status), std::move(descriptor));
        co_return stop_iteration::no;
    };

    co_await _qp.query_internal(list_stmt, db::consistency_level::ONE, { location }, 1000, std::move(on_row));
}
// Free-function accessor returning system_keyspace::NAME as an sstring.
sstring system_keyspace_name() {
    sstring keyspace_name = system_keyspace::NAME;
    return keyspace_name;
}
// Constructs the system keyspace service on the current shard.
//
// Stores references to the query processor and the database, allocates an empty
// local_cache, and then:
//  - on shard 0 only, creates the global `qctx` query context from `_qp.container()`;
//  - plugs this object into the database via plug_system_keyspace();
//  - caches the datacenter and rack names reported by `snitch`.
system_keyspace::system_keyspace(
cql3::query_processor& qp, replica::database& db, const locator::snitch_ptr& snitch) noexcept
: _qp(qp)
, _db(db)
, _cache(std::make_unique<local_cache>())
{
// The global query context is only created on shard 0.
if (this_shard_id() == 0) {
qctx = std::make_unique<query_context>(_qp.container());
}
// Matched by unplug_system_keyspace() in shutdown().
_db.plug_system_keyspace(*this);
// FIXME
// This should be coupled with setup_version()'s part committing these values into
// the system.local table. However, cql_test_env needs cached local_dc_rack strings,
// but it doesn't call system_keyspace::setup() and thus ::setup_version() either
_cache->_local_dc_rack_info.dc = snitch->get_datacenter();
_cache->_local_dc_rack_info.rack = snitch->get_rack();
}
// Nothing to release explicitly; members clean up after themselves.
system_keyspace::~system_keyspace() = default;
// Detaches the system keyspace from the database by calling
// _db.unplug_system_keyspace() (the counterpart of the plug_system_keyspace() call made
// in the constructor), then completes.
future<> system_keyspace::shutdown() {
_db.unplug_system_keyspace();
co_return;
}
// Runs `query_string` with the bound `values` through the query processor's
// execute_internal(), passing cache_internal::yes, and returns its result future.
future<::shared_ptr<cql3::untyped_result_set>> system_keyspace::execute_cql(const sstring& query_string, const std::initializer_list<data_value>& values) {
    const auto cache_policy = cql3::query_processor::cache_internal::yes;
    return _qp.execute_internal(query_string, values, cache_policy);
}
} // namespace db