/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * Modified by ScyllaDB * Copyright (C) 2015 ScyllaDB */ /* * This file is part of Scylla. * * Scylla is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Scylla is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Scylla. If not, see . */ #include #include #include #include #include "system_keyspace.hh" #include "types.hh" #include "service/storage_service.hh" #include "service/storage_proxy.hh" #include "service/client_state.hh" #include "service/query_state.hh" #include "cql3/query_options.hh" #include "cql3/query_processor.hh" #include "utils/fb_utilities.hh" #include "utils/hash.hh" #include "dht/i_partitioner.hh" #include "version.hh" #include "thrift/server.hh" #include "exceptions/exceptions.hh" #include "cql3/query_processor.hh" #include "query_context.hh" #include "partition_slice_builder.hh" #include "db/config.hh" #include "schema_builder.hh" #include "md5_hasher.hh" #include "release.hh" #include "log.hh" #include "serializer.hh" #include #include "service/storage_proxy.hh" #include "message/messaging_service.hh" #include "mutation_query.hh" #include "db/size_estimates_virtual_reader.hh" using days = std::chrono::duration>; namespace db { std::unique_ptr qctx = {}; namespace system_keyspace { static logging::logger logger("system_keyspace"); static const api::timestamp_type creation_timestamp = api::new_timestamp(); api::timestamp_type schema_creation_timestamp() { return creation_timestamp; } // Increase whenever changing schema of any system table. // FIXME: Make automatic by calculating from schema structure. static const uint16_t version_sequence_number = 1; table_schema_version generate_schema_version(utils::UUID table_id) { md5_hasher h; feed_hash(h, table_id); feed_hash(h, version_sequence_number); return utils::UUID_gen::get_name_UUID(h.finalize()); } // Currently, the type variables (uuid_type, etc.) are thread-local reference- // counted shared pointers. This forces us to also make the built in schemas // below thread-local as well. // We return schema_ptr, not schema&, because that's the "tradition" in our // other code. // We hide the thread_local variable inside a function, because if we later // we remove the thread_local, we'll start having initialization order // problems (we need the type variables to be constructed first), and using // functions will solve this problem. So we use functions right now. schema_ptr hints() { static thread_local auto hints = [] { schema_builder builder(make_lw_shared(schema(generate_legacy_id(NAME, HINTS), NAME, HINTS, // partition key {{"target_id", uuid_type}}, // clustering key {{"hint_id", timeuuid_type}, {"message_version", int32_type}}, // regular columns {{"mutation", bytes_type}}, // static columns {}, // regular column name type utf8_type, // comment "hints awaiting delivery" ))); builder.set_gc_grace_seconds(0); builder.set_compaction_strategy_options({{ "enabled", "false" }}); builder.with_version(generate_schema_version(builder.uuid())); return builder.build(schema_builder::compact_storage::yes); }(); return hints; } schema_ptr batchlog() { static thread_local auto batchlog = [] { schema_builder builder(make_lw_shared(schema(generate_legacy_id(NAME, BATCHLOG), NAME, BATCHLOG, // partition key {{"id", uuid_type}}, // clustering key {}, // regular columns {{"data", bytes_type}, {"version", int32_type}, {"written_at", timestamp_type}}, // static columns {}, // regular column name type utf8_type, // comment "batches awaiting replay" // FIXME: the original Java code also had: // operations on resulting CFMetaData: // .compactionStrategyOptions(Collections.singletonMap("min_threshold", "2")) ))); builder.set_gc_grace_seconds(0); builder.with_version(generate_schema_version(builder.uuid())); return builder.build(schema_builder::compact_storage::no); }(); return batchlog; } /*static*/ schema_ptr paxos() { static thread_local auto paxos = [] { schema_builder builder(make_lw_shared(schema(generate_legacy_id(NAME, PAXOS), NAME, PAXOS, // partition key {{"row_key", bytes_type}}, // clustering key {{"cf_id", uuid_type}}, // regular columns {{"in_progress_ballot", timeuuid_type}, {"most_recent_commit", bytes_type}, {"most_recent_commit_at", timeuuid_type}, {"proposal", bytes_type}, {"proposal_ballot", timeuuid_type}}, // static columns {}, // regular column name type utf8_type, // comment "in-progress paxos proposals" // FIXME: the original Java code also had: // operations on resulting CFMetaData: // .compactionStrategyClass(LeveledCompactionStrategy.class); ))); builder.with_version(generate_schema_version(builder.uuid())); return builder.build(schema_builder::compact_storage::no); }(); return paxos; } schema_ptr built_indexes() { static thread_local auto built_indexes = [] { schema_builder builder(make_lw_shared(schema(generate_legacy_id(NAME, BUILT_INDEXES), NAME, BUILT_INDEXES, // partition key {{"table_name", utf8_type}}, // clustering key {{"index_name", utf8_type}}, // regular columns {}, // static columns {}, // regular column name type utf8_type, // comment "built column indexes" ))); builder.with_version(generate_schema_version(builder.uuid())); return builder.build(schema_builder::compact_storage::yes); }(); return built_indexes; } /*static*/ schema_ptr local() { static thread_local auto local = [] { schema_builder builder(make_lw_shared(schema(generate_legacy_id(NAME, LOCAL), NAME, LOCAL, // partition key {{"key", utf8_type}}, // clustering key {}, // regular columns { {"bootstrapped", utf8_type}, {"cluster_name", utf8_type}, {"cql_version", utf8_type}, {"data_center", utf8_type}, {"gossip_generation", int32_type}, {"host_id", uuid_type}, {"native_protocol_version", utf8_type}, {"partitioner", utf8_type}, {"rack", utf8_type}, {"release_version", utf8_type}, {"schema_version", uuid_type}, {"thrift_version", utf8_type}, {"tokens", set_type_impl::get_instance(utf8_type, true)}, {"truncated_at", map_type_impl::get_instance(uuid_type, bytes_type, true)}, // The following 3 columns are only present up until 2.1.8 tables {"rpc_address", inet_addr_type}, {"broadcast_address", inet_addr_type}, {"listen_address", inet_addr_type}, {"supported_features", utf8_type}, }, // static columns {}, // regular column name type utf8_type, // comment "information about the local node" ))); builder.with_version(generate_schema_version(builder.uuid())); return builder.build(schema_builder::compact_storage::no); }(); return local; } /*static*/ schema_ptr peers() { static thread_local auto peers = [] { schema_builder builder(make_lw_shared(schema(generate_legacy_id(NAME, PEERS), NAME, PEERS, // partition key {{"peer", inet_addr_type}}, // clustering key {}, // regular columns { {"data_center", utf8_type}, {"host_id", uuid_type}, {"preferred_ip", inet_addr_type}, {"rack", utf8_type}, {"release_version", utf8_type}, {"rpc_address", inet_addr_type}, {"schema_version", uuid_type}, {"tokens", set_type_impl::get_instance(utf8_type, true)}, {"supported_features", utf8_type}, }, // static columns {}, // regular column name type utf8_type, // comment "information about known peers in the cluster" ))); builder.with_version(generate_schema_version(builder.uuid())); return builder.build(schema_builder::compact_storage::no); }(); return peers; } /*static*/ schema_ptr peer_events() { static thread_local auto peer_events = [] { schema_builder builder(make_lw_shared(schema(generate_legacy_id(NAME, PEER_EVENTS), NAME, PEER_EVENTS, // partition key {{"peer", inet_addr_type}}, // clustering key {}, // regular columns { {"hints_dropped", map_type_impl::get_instance(uuid_type, int32_type, true)}, }, // static columns {}, // regular column name type utf8_type, // comment "events related to peers" ))); builder.with_version(generate_schema_version(builder.uuid())); return builder.build(schema_builder::compact_storage::no); }(); return peer_events; } /*static*/ schema_ptr range_xfers() { static thread_local auto range_xfers = [] { schema_builder builder(make_lw_shared(schema(generate_legacy_id(NAME, RANGE_XFERS), NAME, RANGE_XFERS, // partition key {{"token_bytes", bytes_type}}, // clustering key {}, // regular columns {{"requested_at", timestamp_type}}, // static columns {}, // regular column name type utf8_type, // comment "ranges requested for transfer" ))); builder.with_version(generate_schema_version(builder.uuid())); return builder.build(schema_builder::compact_storage::no); }(); return range_xfers; } /*static*/ schema_ptr compactions_in_progress() { static thread_local auto compactions_in_progress = [] { schema_builder builder(make_lw_shared(schema(generate_legacy_id(NAME, COMPACTIONS_IN_PROGRESS), NAME, COMPACTIONS_IN_PROGRESS, // partition key {{"id", uuid_type}}, // clustering key {}, // regular columns { {"columnfamily_name", utf8_type}, {"inputs", set_type_impl::get_instance(int32_type, true)}, {"keyspace_name", utf8_type}, }, // static columns {}, // regular column name type utf8_type, // comment "unfinished compactions" ))); builder.with_version(generate_schema_version(builder.uuid())); return builder.build(schema_builder::compact_storage::no); }(); return compactions_in_progress; } /*static*/ schema_ptr compaction_history() { static thread_local auto compaction_history = [] { schema_builder builder(make_lw_shared(schema(generate_legacy_id(NAME, COMPACTION_HISTORY), NAME, COMPACTION_HISTORY, // partition key {{"id", uuid_type}}, // clustering key {}, // regular columns { {"bytes_in", long_type}, {"bytes_out", long_type}, {"columnfamily_name", utf8_type}, {"compacted_at", timestamp_type}, {"keyspace_name", utf8_type}, {"rows_merged", map_type_impl::get_instance(int32_type, long_type, true)}, }, // static columns {}, // regular column name type utf8_type, // comment "week-long compaction history" ))); builder.set_default_time_to_live(std::chrono::duration_cast(days(7))); builder.with_version(generate_schema_version(builder.uuid())); return builder.build(schema_builder::compact_storage::no); }(); return compaction_history; } /*static*/ schema_ptr sstable_activity() { static thread_local auto sstable_activity = [] { schema_builder builder(make_lw_shared(schema(generate_legacy_id(NAME, SSTABLE_ACTIVITY), NAME, SSTABLE_ACTIVITY, // partition key { {"keyspace_name", utf8_type}, {"columnfamily_name", utf8_type}, {"generation", int32_type}, }, // clustering key {}, // regular columns { {"rate_120m", double_type}, {"rate_15m", double_type}, }, // static columns {}, // regular column name type utf8_type, // comment "historic sstable read rates" ))); builder.with_version(generate_schema_version(builder.uuid())); return builder.build(schema_builder::compact_storage::no); }(); return sstable_activity; } schema_ptr size_estimates() { static thread_local auto size_estimates = [] { schema_builder builder(make_lw_shared(schema(generate_legacy_id(NAME, SIZE_ESTIMATES), NAME, SIZE_ESTIMATES, // partition key {{"keyspace_name", utf8_type}}, // clustering key {{"table_name", utf8_type}, {"range_start", utf8_type}, {"range_end", utf8_type}}, // regular columns { {"mean_partition_size", long_type}, {"partitions_count", long_type}, }, // static columns {}, // regular column name type utf8_type, // comment "per-table primary range size estimates" ))); builder.set_gc_grace_seconds(0); builder.with_version(generate_schema_version(builder.uuid())); return builder.build(schema_builder::compact_storage::no); }(); return size_estimates; } static future<> setup_version() { return gms::inet_address::lookup(qctx->db().get_config().rpc_address()).then([](gms::inet_address a) { sstring req = "INSERT INTO system.%s (key, release_version, cql_version, thrift_version, native_protocol_version, data_center, rack, partitioner, rpc_address, broadcast_address, listen_address, supported_features) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"; auto& snitch = locator::i_endpoint_snitch::get_local_snitch_ptr(); return execute_cql(req, db::system_keyspace::LOCAL, sstring(db::system_keyspace::LOCAL), version::release(), cql3::query_processor::CQL_VERSION, org::apache::cassandra::thrift_version, to_sstring(cql_serialization_format::latest_version), snitch->get_datacenter(utils::fb_utilities::get_broadcast_address()), snitch->get_rack(utils::fb_utilities::get_broadcast_address()), sstring(dht::global_partitioner().name()), a.addr(), utils::fb_utilities::get_broadcast_address().addr(), net::get_local_messaging_service().listen_address().addr(), service::storage_service::get_config_supported_features() ).discard_result(); }); } future<> check_health(); future<> force_blocking_flush(sstring cfname); // Changing the real load_dc_rack_info into a future would trigger a tidal wave of futurization that would spread // even into simple string operations like get_rack() / get_dc(). We will cache those at startup, and then change // our view of it every time we do updates on those values. // // The cache must be distributed, because the values themselves may not update atomically, so a shard reading that // is different than the one that wrote, may see a corrupted value. invoke_on_all will be used to guarantee that all // updates are propagated correctly. struct local_cache { std::unordered_map _cached_dc_rack_info; bootstrap_state _state; future<> stop() { return make_ready_future<>(); } }; static distributed _local_cache; static future<> build_dc_rack_info() { return execute_cql("SELECT peer, data_center, rack from system.%s", PEERS).then([] (::shared_ptr msg) { return do_for_each(*msg, [] (auto& row) { // Not ideal to assume ipv4 here, but currently this is what the cql types wraps. net::ipv4_address peer = row.template get_as("peer"); if (!row.has("data_center") || !row.has("rack")) { return make_ready_future<>(); } gms::inet_address gms_addr(std::move(peer)); sstring dc = row.template get_as("data_center"); sstring rack = row.template get_as("rack"); locator::endpoint_dc_rack element = { dc, rack }; return _local_cache.invoke_on_all([gms_addr = std::move(gms_addr), element = std::move(element)] (local_cache& lc) { lc._cached_dc_rack_info.emplace(gms_addr, element); }); }).then([msg] { // Keep msg alive. }); }); } static future<> build_bootstrap_info() { sstring req = "SELECT bootstrapped FROM system.%s WHERE key = ? "; return execute_cql(req, LOCAL, sstring(LOCAL)).then([] (auto msg) { static auto state_map = std::unordered_map({ { "NEEDS_BOOTSTRAP", bootstrap_state::NEEDS_BOOTSTRAP }, { "COMPLETED", bootstrap_state::COMPLETED }, { "IN_PROGRESS", bootstrap_state::IN_PROGRESS }, { "DECOMMISSIONED", bootstrap_state::DECOMMISSIONED } }); bootstrap_state state = bootstrap_state::NEEDS_BOOTSTRAP; if (!msg->empty() && msg->one().has("bootstrapped")) { state = state_map.at(msg->one().template get_as("bootstrapped")); } return _local_cache.invoke_on_all([state] (local_cache& lc) { lc._state = state; }); }); } future<> init_local_cache() { return _local_cache.start().then([] { engine().at_exit([] { return _local_cache.stop(); }); }); } future<> deinit_local_cache() { return _local_cache.stop(); } void minimal_setup(distributed& db, distributed& qp) { qctx = std::make_unique(db, qp); } future<> setup(distributed& db, distributed& qp) { minimal_setup(db, qp); return setup_version().then([&db] { return update_schema_version(db.local().get_version()); }).then([] { return init_local_cache(); }).then([] { return build_dc_rack_info(); }).then([] { return build_bootstrap_info(); }).then([] { return check_health(); }).then([] { return db::schema_tables::save_system_keyspace_schema(); }).then([] { return net::get_messaging_service().invoke_on_all([] (auto& ms){ return ms.init_local_preferred_ip_cache(); }); }); } struct truncation_record { static constexpr uint32_t current_magic = 0x53435452; // 'S' 'C' 'T' 'R' uint32_t magic; std::vector positions; db_clock::time_point time_stamp; }; } } #include "idl/replay_position.dist.hh" #include "idl/truncation_record.dist.hh" #include "serializer_impl.hh" #include "idl/replay_position.dist.impl.hh" #include "idl/truncation_record.dist.impl.hh" namespace db { namespace system_keyspace { typedef utils::UUID truncation_key; typedef std::unordered_map truncation_map; static constexpr uint8_t current_version = 1; static thread_local std::experimental::optional truncation_records; future<> save_truncation_records(const column_family& cf, db_clock::time_point truncated_at, replay_positions positions) { truncation_record r; r.magic = truncation_record::current_magic; r.time_stamp = truncated_at; r.positions = std::move(positions); auto buf = ser::serialize_to_buffer(r, sizeof(current_version)); buf[0] = current_version; static_assert(sizeof(current_version) == 1, "using this as mark"); assert(buf.size() & 1); // verify we've created an odd-numbered buffer map_type_impl::native_type tmp; tmp.emplace_back(cf.schema()->id(), data_value(buf)); auto map_type = map_type_impl::get_instance(uuid_type, bytes_type, true); sstring req = sprint("UPDATE system.%s SET truncated_at = truncated_at + ? WHERE key = '%s'", LOCAL, LOCAL); return qctx->qp().execute_internal(req, {make_map_value(map_type, tmp)}).then([](auto rs) { truncation_records = {}; return force_blocking_flush(LOCAL); }); } /** * This method is used to remove information about truncation time for specified column family */ future<> remove_truncation_record(utils::UUID id) { sstring req = sprint("DELETE truncated_at[?] from system.%s WHERE key = '%s'", LOCAL, LOCAL); return qctx->qp().execute_internal(req, {id}).then([](auto rs) { truncation_records = {}; return force_blocking_flush(LOCAL); }); } static future get_truncation_record(utils::UUID cf_id) { if (!truncation_records) { sstring req = sprint("SELECT truncated_at FROM system.%s WHERE key = '%s'", LOCAL, LOCAL); return qctx->qp().execute_internal(req).then([cf_id](::shared_ptr rs) { truncation_map tmp; if (!rs->empty() && rs->one().has("truncated_at")) { auto map = rs->one().get_map("truncated_at"); for (auto& p : map) { auto uuid = p.first; auto buf = p.second; try { truncation_record e; if (buf.size() & 1) { // new record. if (buf[0] != current_version) { logger.warn("Found truncation record of unknown version {}. Ignoring.", int(buf[0])); continue; } e = ser::deserialize_from_buffer(buf, boost::type(), 1); if (e.magic == truncation_record::current_magic) { tmp[uuid] = e; continue; } } else { // old scylla records. (We hope) // Read 64+64 bit RP:s, even though the // struct (and official serial size) is 64+32. data_input in(buf); logger.debug("Reading old type record"); while (in.avail() > sizeof(db_clock::rep)) { auto id = in.read(); auto pos = in.read(); e.positions.emplace_back(id, position_type(pos)); } if (in.avail() == sizeof(db_clock::rep)) { e.time_stamp = db_clock::time_point(db_clock::duration(in.read())); tmp[uuid] = e; continue; } } } catch (std::out_of_range &) { } // Trying to load an origin table. // This is useless to us, because the only usage for this // data is commit log and batch replay, and we cannot replay // either from origin anyway. logger.warn("Error reading truncation record for {}. " "Most likely this is data from a cassandra instance." "Make sure you have cleared commit and batch logs before upgrading.", uuid ); } } truncation_records = std::move(tmp); return get_truncation_record(cf_id); }); } return make_ready_future((*truncation_records)[cf_id]); } future<> save_truncation_record(const column_family& cf, db_clock::time_point truncated_at, db::replay_position rp) { // TODO: this is horribly ineffective, we're doing a full flush of all system tables for all cores // once, for each core (calling us). But right now, redesigning so that calling here (or, rather, // save_truncation_records), is done from "somewhere higher, once per machine, not shard" is tricky. // Mainly because drop_tables also uses truncate. And is run per-core as well. Gah. return get_truncation_record(cf.schema()->id()).then([&cf, truncated_at, rp](truncation_record e) { auto i = std::find_if(e.positions.begin(), e.positions.end(), [rp](replay_position& p) { return p.shard_id() == rp.shard_id(); }); if (i == e.positions.end()) { e.positions.emplace_back(rp); } else { *i = rp; } return save_truncation_records(cf, std::max(truncated_at, e.time_stamp), e.positions); }); } future get_truncated_position(utils::UUID cf_id, uint32_t shard) { return get_truncated_position(std::move(cf_id)).then([shard](replay_positions positions) { for (auto& rp : positions) { if (shard == rp.shard_id()) { return make_ready_future(rp); } } return make_ready_future(); }); } future get_truncated_position(utils::UUID cf_id) { return get_truncation_record(cf_id).then([](truncation_record e) { return make_ready_future(e.positions); }); } future get_truncated_at(utils::UUID cf_id) { return get_truncation_record(cf_id).then([](truncation_record e) { return make_ready_future(e.time_stamp); }); } set_type_impl::native_type prepare_tokens(std::unordered_set& tokens) { set_type_impl::native_type tset; for (auto& t: tokens) { tset.push_back(dht::global_partitioner().to_sstring(t)); } return tset; } std::unordered_set decode_tokens(set_type_impl::native_type& tokens) { std::unordered_set tset; for (auto& t: tokens) { auto str = value_cast(t); assert(str == dht::global_partitioner().to_sstring(dht::global_partitioner().from_sstring(str))); tset.insert(dht::global_partitioner().from_sstring(str)); } return tset; } /** * Record tokens being used by another node */ future<> update_tokens(gms::inet_address ep, std::unordered_set tokens) { if (ep == utils::fb_utilities::get_broadcast_address()) { return remove_endpoint(ep); } sstring req = "INSERT INTO system.%s (peer, tokens) VALUES (?, ?)"; auto set_type = set_type_impl::get_instance(utf8_type, true); return execute_cql(req, PEERS, ep.addr(), make_set_value(set_type, prepare_tokens(tokens))).discard_result().then([] { return force_blocking_flush(PEERS); }); } future> update_local_tokens( const std::unordered_set add_tokens, const std::unordered_set rm_tokens) { return get_saved_tokens().then([add_tokens = std::move(add_tokens), rm_tokens = std::move(rm_tokens)] (auto tokens) { for (auto& x : rm_tokens) { tokens.erase(x); } for (auto& x : add_tokens) { tokens.insert(x); } return update_tokens(tokens).then([tokens] { return tokens; }); }); } future>> load_tokens() { sstring req = "SELECT peer, tokens FROM system.%s"; return execute_cql(req, PEERS).then([] (::shared_ptr cql_result) { std::unordered_map> ret; for (auto& row : *cql_result) { auto peer = gms::inet_address(row.get_as("peer")); if (row.has("tokens")) { auto blob = row.get_blob("tokens"); auto cdef = peers()->get_column_definition("tokens"); auto deserialized = cdef->type->deserialize(blob); auto tokens = value_cast(deserialized); ret.emplace(peer, decode_tokens(tokens)); } } return ret; }); } future> load_host_ids() { sstring req = "SELECT peer, host_id FROM system.%s"; return execute_cql(req, PEERS).then([] (::shared_ptr cql_result) { std::unordered_map ret; for (auto& row : *cql_result) { auto peer = gms::inet_address(row.get_as("peer")); if (row.has("host_id")) { ret.emplace(peer, row.get_as("host_id")); } } return ret; }); } future> load_peer_features() { sstring req = "SELECT peer, supported_features FROM system.%s"; return execute_cql(req, PEERS).then([] (::shared_ptr cql_result) { std::unordered_map ret; for (auto& row : *cql_result) { if (row.has("supported_features")) { ret.emplace(row.get_as("peer"), row.get_as("supported_features")); } } return ret; }); } future<> update_preferred_ip(gms::inet_address ep, gms::inet_address preferred_ip) { sstring req = "INSERT INTO system.%s (peer, preferred_ip) VALUES (?, ?)"; return execute_cql(req, PEERS, ep.addr(), preferred_ip.addr()).discard_result().then([] { return force_blocking_flush(PEERS); }); } future> get_preferred_ips() { sstring req = "SELECT peer, preferred_ip FROM system.%s"; return execute_cql(req, PEERS).then([] (::shared_ptr cql_res_set) { std::unordered_map res; for (auto& r : *cql_res_set) { if (r.has("preferred_ip")) { res.emplace(gms::inet_address(r.get_as("peer")), gms::inet_address(r.get_as("preferred_ip"))); } } return res; }); } template static future<> update_cached_values(gms::inet_address ep, sstring column_name, Value value) { return make_ready_future<>(); } template <> future<> update_cached_values(gms::inet_address ep, sstring column_name, sstring value) { return _local_cache.invoke_on_all([ep = std::move(ep), column_name = std::move(column_name), value = std::move(value)] (local_cache& lc) { if (column_name == "data_center") { lc._cached_dc_rack_info[ep].dc = value; } else if (column_name == "rack") { lc._cached_dc_rack_info[ep].rack = value; } return make_ready_future<>(); }); } template future<> update_peer_info(gms::inet_address ep, sstring column_name, Value value) { if (ep == utils::fb_utilities::get_broadcast_address()) { return make_ready_future<>(); } return update_cached_values(ep, column_name, value).then([ep, column_name, value] { sstring clause = sprint("(peer, %s) VALUES (?, ?)", column_name); sstring req = "INSERT INTO system.%s " + clause; return execute_cql(req, PEERS, ep.addr(), value).discard_result(); }); } // sets are not needed, since tokens are updated by another method template future<> update_peer_info(gms::inet_address ep, sstring column_name, sstring); template future<> update_peer_info(gms::inet_address ep, sstring column_name, utils::UUID); template future<> update_peer_info(gms::inet_address ep, sstring column_name, net::ipv4_address); future<> update_hints_dropped(gms::inet_address ep, utils::UUID time_period, int value) { // with 30 day TTL sstring req = "UPDATE system.%s USING TTL 2592000 SET hints_dropped[ ? ] = ? WHERE peer = ?"; return execute_cql(req, PEER_EVENTS, time_period, value, ep.addr()).discard_result(); } future<> update_schema_version(utils::UUID version) { sstring req = "INSERT INTO system.%s (key, schema_version) VALUES (?, ?)"; return execute_cql(req, LOCAL, sstring(LOCAL), version).discard_result(); } /** * Remove stored tokens being used by another node */ future<> remove_endpoint(gms::inet_address ep) { return _local_cache.invoke_on_all([ep] (local_cache& lc) { lc._cached_dc_rack_info.erase(ep); }).then([ep] { sstring req = "DELETE FROM system.%s WHERE peer = ?"; return execute_cql(req, PEERS, ep.addr()).discard_result(); }).then([] { return force_blocking_flush(PEERS); }); } /** * This method is used to update the System Keyspace with the new tokens for this node */ future<> update_tokens(std::unordered_set tokens) { if (tokens.empty()) { throw std::invalid_argument("remove_endpoint should be used instead"); } sstring req = "INSERT INTO system.%s (key, tokens) VALUES (?, ?)"; auto set_type = set_type_impl::get_instance(utf8_type, true); return execute_cql(req, LOCAL, sstring(LOCAL), make_set_value(set_type, prepare_tokens(tokens))).discard_result().then([] { return force_blocking_flush(LOCAL); }); } future<> force_blocking_flush(sstring cfname) { assert(qctx); return qctx->_db.invoke_on_all([cfname = std::move(cfname)](database& db) { // if (!Boolean.getBoolean("cassandra.unsafesystem")) column_family& cf = db.find_column_family(NAME, cfname); return cf.flush(); }); } /** * One of three things will happen if you try to read the system keyspace: * 1. files are present and you can read them: great * 2. no files are there: great (new node is assumed) * 3. files are present but you can't read them: bad */ future<> check_health() { using namespace transport::messages; sstring req = "SELECT cluster_name FROM system.%s WHERE key=?"; return execute_cql(req, LOCAL, sstring(LOCAL)).then([] (::shared_ptr msg) { if (msg->empty() || !msg->one().has("cluster_name")) { // this is a brand new node sstring ins_req = "INSERT INTO system.%s (key, cluster_name) VALUES (?, ?)"; return execute_cql(ins_req, LOCAL, sstring(LOCAL), qctx->db().get_config().cluster_name()).discard_result(); } else { auto saved_cluster_name = msg->one().get_as("cluster_name"); auto cluster_name = qctx->db().get_config().cluster_name(); if (cluster_name != saved_cluster_name) { throw exceptions::configuration_exception("Saved cluster name " + saved_cluster_name + " != configured name " + cluster_name); } return make_ready_future<>(); } }); } future> get_saved_tokens() { sstring req = "SELECT tokens FROM system.%s WHERE key = ?"; return execute_cql(req, LOCAL, sstring(LOCAL)).then([] (auto msg) { if (msg->empty() || !msg->one().has("tokens")) { return make_ready_future>(); } auto blob = msg->one().get_blob("tokens"); auto cdef = local()->get_column_definition("tokens"); auto deserialized = cdef->type->deserialize(blob); auto tokens = value_cast(deserialized); return make_ready_future>(decode_tokens(tokens)); }); } bool bootstrap_complete() { return get_bootstrap_state() == bootstrap_state::COMPLETED; } bool bootstrap_in_progress() { return get_bootstrap_state() == bootstrap_state::IN_PROGRESS; } bool was_decommissioned() { return get_bootstrap_state() == bootstrap_state::DECOMMISSIONED; } bootstrap_state get_bootstrap_state() { return _local_cache.local()._state; } future<> set_bootstrap_state(bootstrap_state state) { static std::unordered_map> state_to_name({ { bootstrap_state::NEEDS_BOOTSTRAP, "NEEDS_BOOTSTRAP" }, { bootstrap_state::COMPLETED, "COMPLETED" }, { bootstrap_state::IN_PROGRESS, "IN_PROGRESS" }, { bootstrap_state::DECOMMISSIONED, "DECOMMISSIONED" } }); sstring state_name = state_to_name.at(state); sstring req = "INSERT INTO system.%s (key, bootstrapped) VALUES (?, ?)"; return execute_cql(req, LOCAL, sstring(LOCAL), state_name).discard_result().then([state] { return force_blocking_flush(LOCAL).then([state] { return _local_cache.invoke_on_all([state] (local_cache& lc) { lc._state = state; }); }); }); } std::vector all_tables() { std::vector r; auto legacy_tables = db::schema_tables::all_tables(); std::copy(legacy_tables.begin(), legacy_tables.end(), std::back_inserter(r)); r.push_back(built_indexes()); r.push_back(hints()); r.push_back(batchlog()); r.push_back(paxos()); r.push_back(local()); r.push_back(peers()); r.push_back(peer_events()); r.push_back(range_xfers()); r.push_back(compactions_in_progress()); r.push_back(compaction_history()); r.push_back(sstable_activity()); r.push_back(size_estimates()); return r; } static void maybe_add_virtual_reader(schema_ptr s, database& db) { if (s.get() == size_estimates().get()) { db.find_column_family(s).set_virtual_reader(mutation_source(db::size_estimates::virtual_reader())); } } static bool maybe_write_in_user_memory(schema_ptr s, database& db) { return (s.get() == batchlog().get()); } void make(database& db, bool durable, bool volatile_testing_only) { auto ksm = make_lw_shared(NAME, "org.apache.cassandra.locator.LocalStrategy", std::map{}, durable ); auto kscfg = db.make_keyspace_config(*ksm); kscfg.enable_disk_reads = !volatile_testing_only; kscfg.enable_disk_writes = !volatile_testing_only; kscfg.enable_commitlog = !volatile_testing_only; kscfg.enable_cache = true; // don't make system keyspace reads wait for user reads kscfg.read_concurrency_config.sem = &db.system_keyspace_read_concurrency_sem(); kscfg.read_concurrency_config.timeout = {}; kscfg.read_concurrency_config.max_queue_length = std::numeric_limits::max(); // don't make system keyspace writes wait for user writes (if under pressure) kscfg.dirty_memory_manager = &db._system_dirty_memory_manager; keyspace _ks{ksm, std::move(kscfg)}; auto rs(locator::abstract_replication_strategy::create_replication_strategy(NAME, "LocalStrategy", service::get_local_storage_service().get_token_metadata(), ksm->strategy_options())); _ks.set_replication_strategy(std::move(rs)); db.add_keyspace(NAME, std::move(_ks)); auto& ks = db.find_keyspace(NAME); for (auto&& table : all_tables()) { auto cfg = ks.make_column_family_config(*table, db.get_config()); if (maybe_write_in_user_memory(table, db)) { cfg.dirty_memory_manager = &db._dirty_memory_manager; } db.add_column_family(ks, table, std::move(cfg)); maybe_add_virtual_reader(table, db); } } future get_local_host_id() { using namespace transport::messages; sstring req = "SELECT host_id FROM system.%s WHERE key=?"; return execute_cql(req, LOCAL, sstring(LOCAL)).then([] (::shared_ptr msg) { auto new_id = [] { auto host_id = utils::make_random_uuid(); return set_local_host_id(host_id); }; if (msg->empty() || !msg->one().has("host_id")) { return new_id(); } auto host_id = msg->one().get_as("host_id"); return make_ready_future(host_id); }); } future set_local_host_id(const utils::UUID& host_id) { sstring req = "INSERT INTO system.%s (key, host_id) VALUES (?, ?)"; return execute_cql(req, LOCAL, sstring(LOCAL), host_id).then([] (auto msg) { return force_blocking_flush(LOCAL); }).then([host_id] { return host_id; }); } std::unordered_map load_dc_rack_info() { return _local_cache.local()._cached_dc_rack_info; } future>> query_mutations(distributed& proxy, const sstring& cf_name) { database& db = proxy.local().get_db().local(); schema_ptr schema = db.find_schema(db::system_keyspace::NAME, cf_name); auto slice = partition_slice_builder(*schema).build(); auto cmd = make_lw_shared(schema->id(), schema->version(), std::move(slice), std::numeric_limits::max()); return proxy.local().query_mutations_locally(std::move(schema), std::move(cmd), query::full_partition_range); } future> query(distributed& proxy, const sstring& cf_name) { database& db = proxy.local().get_db().local(); schema_ptr schema = db.find_schema(db::system_keyspace::NAME, cf_name); auto slice = partition_slice_builder(*schema).build(); auto cmd = make_lw_shared(schema->id(), schema->version(), std::move(slice), std::numeric_limits::max()); return proxy.local().query(schema, cmd, {query::full_partition_range}, db::consistency_level::ONE, nullptr).then([schema, cmd] (auto&& result) { return make_lw_shared(query::result_set::from_raw_result(schema, cmd->slice, *result)); }); } future> query(distributed& proxy, const sstring& cf_name, const dht::decorated_key& key, query::clustering_range row_range) { auto&& db = proxy.local().get_db().local(); auto schema = db.find_schema(db::system_keyspace::NAME, cf_name); auto slice = partition_slice_builder(*schema) .with_range(std::move(row_range)) .build(); auto cmd = make_lw_shared(schema->id(), schema->version(), std::move(slice), query::max_rows); return proxy.local().query(schema, cmd, {dht::partition_range::make_singular(key)}, db::consistency_level::ONE, nullptr).then([schema, cmd] (auto&& result) { return make_lw_shared(query::result_set::from_raw_result(schema, cmd->slice, *result)); }); } static map_type_impl::native_type prepare_rows_merged(std::unordered_map& rows_merged) { map_type_impl::native_type tmp; for (auto& r: rows_merged) { int32_t first = r.first; int64_t second = r.second; auto map_element = std::make_pair(data_value(first), data_value(second)); tmp.push_back(std::move(map_element)); } return tmp; } future<> update_compaction_history(sstring ksname, sstring cfname, int64_t compacted_at, int64_t bytes_in, int64_t bytes_out, std::unordered_map rows_merged) { // don't write anything when the history table itself is compacted, since that would in turn cause new compactions if (ksname == "system" && cfname == COMPACTION_HISTORY) { return make_ready_future<>(); } auto map_type = map_type_impl::get_instance(int32_type, long_type, true); sstring req = "INSERT INTO system.%s (id, keyspace_name, columnfamily_name, compacted_at, bytes_in, bytes_out, rows_merged) VALUES (?, ?, ?, ?, ?, ?, ?)"; return execute_cql(req, COMPACTION_HISTORY, utils::UUID_gen::get_time_UUID(), ksname, cfname, compacted_at, bytes_in, bytes_out, make_map_value(map_type, prepare_rows_merged(rows_merged))).discard_result(); } future> get_compaction_history() { sstring req = "SELECT * from system.%s"; return execute_cql(req, COMPACTION_HISTORY).then([] (::shared_ptr msg) { std::vector history; for (auto& row : *msg) { compaction_history_entry entry; entry.id = row.get_as("id"); entry.ks = row.get_as("keyspace_name"); entry.cf = row.get_as("columnfamily_name"); entry.compacted_at = row.get_as("compacted_at"); entry.bytes_in = row.get_as("bytes_in"); entry.bytes_out = row.get_as("bytes_out"); if (row.has("rows_merged")) { entry.rows_merged = row.get_map("rows_merged"); } history.push_back(std::move(entry)); } return std::move(history); }); } future increment_and_get_generation() { auto req = sprint("SELECT gossip_generation FROM system.%s WHERE key='%s'", LOCAL, LOCAL); return qctx->qp().execute_internal(req).then([] (auto rs) { int generation; if (rs->empty() || !rs->one().has("gossip_generation")) { // seconds-since-epoch isn't a foolproof new generation // (where foolproof is "guaranteed to be larger than the last one seen at this ip address"), // but it's as close as sanely possible generation = service::get_generation_number(); } else { // Other nodes will ignore gossip messages about a node that have a lower generation than previously seen. int stored_generation = rs->one().template get_as("gossip_generation") + 1; int now = service::get_generation_number(); if (stored_generation >= now) { logger.warn("Using stored Gossip Generation {} as it is greater than current system time {}." "See CASSANDRA-3654 if you experience problems", stored_generation, now); generation = stored_generation; } else { generation = now; } } auto req = sprint("INSERT INTO system.%s (key, gossip_generation) VALUES ('%s', ?)", LOCAL, LOCAL); return qctx->qp().execute_internal(req, {generation}).then([generation] (auto rs) { return force_blocking_flush(LOCAL); }).then([generation] { return make_ready_future(generation); }); }); } mutation make_size_estimates_mutation(const sstring& ks, std::vector estimates) { auto&& schema = db::system_keyspace::size_estimates(); auto timestamp = api::new_timestamp(); mutation m_to_apply{partition_key::from_single_value(*schema, utf8_type->decompose(ks)), schema}; for (auto&& e : estimates) { auto ck = clustering_key_prefix(std::vector{ utf8_type->decompose(e.schema->cf_name()), e.range_start_token, e.range_end_token}); m_to_apply.set_clustered_cell(ck, "mean_partition_size", e.mean_partition_size, timestamp); m_to_apply.set_clustered_cell(ck, "partitions_count", e.partitions_count, timestamp); } return m_to_apply; } } // namespace system_keyspace } // namespace db