/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * Copyright (C) 2016 ScyllaDB
 *
 * Modified by ScyllaDB
 */

/*
 * This file is part of Scylla.
 *
 * Scylla is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Scylla is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/ #include #include "types.hh" #include "tracing/trace_keyspace_helper.hh" #include "service/migration_manager.hh" #include "cql3/statements/create_table_statement.hh" #include "cql3/statements/batch_statement.hh" namespace tracing { static logging::logger logger("trace_keyspace_helper"); const sstring trace_keyspace_helper::KEYSPACE_NAME("system_traces"); const sstring trace_keyspace_helper::SESSIONS("sessions"); const sstring trace_keyspace_helper::EVENTS("events"); const sstring trace_keyspace_helper::NODE_SLOW_QUERY_LOG("node_slow_log"); struct trace_keyspace_backend_sesssion_state final : public backend_session_state_base { int64_t last_nanos = 0; semaphore write_sem {1}; virtual ~trace_keyspace_backend_sesssion_state() {} }; trace_keyspace_helper::trace_keyspace_helper(tracing& tr) : i_tracing_backend_helper(tr) , _dummy_query_state(service::client_state(service::client_state::external_tag{})) , _sessions(SESSIONS, *this, sprint("CREATE TABLE IF NOT EXISTS %s.%s (" "session_id uuid," "command text," "client inet," "coordinator inet," "duration int," "parameters map," "request text," "started_at timestamp," "PRIMARY KEY ((session_id))) " "WITH default_time_to_live = 86400", KEYSPACE_NAME, SESSIONS), sprint("INSERT INTO %s.%s (" "session_id," "command," "client," "coordinator," "duration," "parameters," "request," "started_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?) " "USING TTL ?", KEYSPACE_NAME, SESSIONS)) , _events(EVENTS, *this, sprint("CREATE TABLE IF NOT EXISTS %s.%s (" "session_id uuid," "event_id timeuuid," "activity text," "source inet," "source_elapsed int," "thread text," "PRIMARY KEY ((session_id), event_id)) " "WITH default_time_to_live = 86400", KEYSPACE_NAME, EVENTS), sprint("INSERT INTO %s.%s (" "session_id, " "event_id, " "activity, " "source, " "source_elapsed, " "thread) VALUES (?, ?, ?, ?, ?, ?) 
" "USING TTL ?", KEYSPACE_NAME, EVENTS)) , _slow_query_log(NODE_SLOW_QUERY_LOG, *this, sprint("CREATE TABLE IF NOT EXISTS %s.%s (" "node_ip inet," "shard int," "session_id uuid," "date timestamp," "start_time timeuuid," "command text," "duration int," "parameters map," "source_ip inet," "table_names set," "username text," "PRIMARY KEY (start_time, node_ip, shard)) " "WITH default_time_to_live = 86400", KEYSPACE_NAME, NODE_SLOW_QUERY_LOG), sprint("INSERT INTO %s.%s (" "node_ip," "shard," "session_id," "date," "start_time," "command," "duration," "parameters," "source_ip," "table_names," "username) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) " "USING TTL ?", KEYSPACE_NAME, NODE_SLOW_QUERY_LOG)) { namespace sm = seastar::metrics; _metrics.add_group("tracing_keyspace_helper", { sm::make_derive("tracing_errors", [this] { return _stats.tracing_errors; }, sm::description("Counts a number of errors during writing to a system_traces keyspace. " "One error may cause one or more tracing records to be lost.")), sm::make_derive("bad_column_family_errors", [this] { return _stats.bad_column_family_errors; }, sm::description("Counts a number of times write failed due to one of the tables in the system_traces keyspace has an incompatible schema. " "One error may result one or more tracing records to be lost. " "Non-zero value indicates that the administrator has to take immediate steps to fix the corresponding schema. 
" "The appropriate error message will be printed in the syslog.")), }); } future<> trace_keyspace_helper::table_helper::setup_table() const { auto& qp = cql3::get_local_query_processor(); auto& db = qp.db().local(); if (db.has_schema(KEYSPACE_NAME, _name)) { return make_ready_future<>(); } ::shared_ptr parsed = static_pointer_cast< cql3::statements::raw::cf_statement>(cql3::query_processor::parse_statement(_create_cql)); parsed->prepare_keyspace(KEYSPACE_NAME); ::shared_ptr statement = static_pointer_cast( parsed->prepare(db, qp.get_cql_stats())->statement); auto schema = statement->get_cf_meta_data(); // Generate the CF UUID based on its KF names. This is needed to ensure that // all Nodes that create it would create it with the same UUID and we don't // hit the #420 issue. auto uuid = generate_legacy_id(schema->ks_name(), schema->cf_name()); schema_builder b(schema); b.set_uuid(uuid); // We don't care it it fails really - this may happen due to concurrent // "CREATE TABLE" invocation on different Nodes. // The important thing is that it will converge eventually (some traces may // be lost in a process but that's ok). return service::get_local_migration_manager().announce_new_column_family(b.build(), false).discard_result().handle_exception([this] (auto ep) {});; } future<> trace_keyspace_helper::start() { if (engine().cpu_id() == 0) { return seastar::async([this] { auto& db = cql3::get_local_query_processor().db().local(); // Create a keyspace if (!db.has_keyspace(KEYSPACE_NAME)) { std::map opts; opts["replication_factor"] = "2"; auto ksm = keyspace_metadata::new_keyspace(KEYSPACE_NAME, "org.apache.cassandra.locator.SimpleStrategy", std::move(opts), true); // We use min_timestamp so that default keyspace metadata will loose with any manual adjustments. See issue #2129. 
service::get_local_migration_manager().announce_new_keyspace(ksm, api::min_timestamp, false).get(); } _dummy_query_state.get_client_state().set_keyspace(cql3::get_local_query_processor().db(), KEYSPACE_NAME); // Create tables _sessions.setup_table().get(); _events.setup_table().get(); _slow_query_log.setup_table().get(); }); } else { return make_ready_future<>(); } } void trace_keyspace_helper::write_one_session_records(lw_shared_ptr records) { with_gate(_pending_writes, [this, records = std::move(records)] { auto num_records = records->size(); return this->flush_one_session_mutations(std::move(records)).finally([this, num_records] { _local_tracing.write_complete(num_records); }); }).handle_exception([this] (auto ep) { try { ++_stats.tracing_errors; std::rethrow_exception(ep); } catch (exceptions::overloaded_exception&) { logger.warn("Too many nodes are overloaded to save trace events"); } catch (bad_column_family& e) { if (_stats.bad_column_family_errors++ % bad_column_family_message_period == 0) { logger.warn("Tracing is enabled but {}", e.what()); } } catch (std::logic_error& e) { logger.error(e.what()); } catch (...) { // TODO: Handle some more exceptions maybe? 
} }).discard_result(); } void trace_keyspace_helper::write_records_bulk(records_bulk& bulk) { logger.trace("Writing {} sessions", bulk.size()); std::for_each(bulk.begin(), bulk.end(), [this] (records_bulk::value_type& one_session_records_ptr) { write_one_session_records(std::move(one_session_records_ptr)); }); } cql3::query_options trace_keyspace_helper::make_session_mutation_data(const one_session_records& session_records) { const session_record& record = session_records.session_rec; auto millis_since_epoch = std::chrono::duration_cast(record.started_at.time_since_epoch()).count(); std::vector> parameters_values_vector; parameters_values_vector.reserve(record.parameters.size()); std::for_each(record.parameters.begin(), record.parameters.end(), [¶meters_values_vector] (auto& val_pair) { parameters_values_vector.emplace_back(val_pair.first, val_pair.second); }); auto my_map_type = map_type_impl::get_instance(utf8_type, utf8_type, true); std::vector values { cql3::raw_value::make_value(uuid_type->decompose(session_records.session_id)), cql3::raw_value::make_value(utf8_type->decompose(type_to_string(record.command))), cql3::raw_value::make_value(inet_addr_type->decompose(record.client.addr())), cql3::raw_value::make_value(inet_addr_type->decompose(utils::fb_utilities::get_broadcast_address().addr())), cql3::raw_value::make_value(int32_type->decompose(elapsed_to_micros(record.elapsed))), cql3::raw_value::make_value(make_map_value(my_map_type, map_type_impl::native_type(std::move(parameters_values_vector))).serialize()), cql3::raw_value::make_value(utf8_type->decompose(record.request)), cql3::raw_value::make_value(timestamp_type->decompose(millis_since_epoch)), cql3::raw_value::make_value(int32_type->decompose((int32_t)(session_records.ttl.count()))) }; return cql3::query_options(db::consistency_level::ANY, std::experimental::nullopt, std::move(values), false, cql3::query_options::specific_options::DEFAULT, cql_serialization_format::latest()); } cql3::query_options 
trace_keyspace_helper::make_slow_query_mutation_data(const one_session_records& session_records, const utils::UUID& start_time_id) { const session_record& record = session_records.session_rec; auto millis_since_epoch = std::chrono::duration_cast(record.started_at.time_since_epoch()).count(); // query command is stored on a parameters map with a 'query' key auto query_str_it = record.parameters.find("query"); if (query_str_it == record.parameters.end()) { throw std::logic_error("No \"query\" parameter set for a session requesting a slow_query_log record"); } // parameters map std::vector> parameters_values_vector; parameters_values_vector.reserve(record.parameters.size()); std::for_each(record.parameters.begin(), record.parameters.end(), [¶meters_values_vector] (auto& val_pair) { parameters_values_vector.emplace_back(val_pair.first, val_pair.second); }); auto my_map_type = map_type_impl::get_instance(utf8_type, utf8_type, true); // set of tables involved in this query std::vector tables_names_vector; tables_names_vector.reserve(record.tables.size()); std::for_each(record.tables.begin(), record.tables.end(), [&tables_names_vector] (auto& val) { tables_names_vector.emplace_back(val); }); auto my_set_type = set_type_impl::get_instance(utf8_type, true); std::vector values({ cql3::raw_value::make_value(inet_addr_type->decompose(utils::fb_utilities::get_broadcast_address().addr())), cql3::raw_value::make_value(int32_type->decompose((int32_t)(engine().cpu_id()))), cql3::raw_value::make_value(uuid_type->decompose(session_records.session_id)), cql3::raw_value::make_value(timestamp_type->decompose(millis_since_epoch)), cql3::raw_value::make_value(timeuuid_type->decompose(start_time_id)), cql3::raw_value::make_value(utf8_type->decompose(query_str_it->second)), cql3::raw_value::make_value(int32_type->decompose(elapsed_to_micros(record.elapsed))), cql3::raw_value::make_value(make_map_value(my_map_type, 
map_type_impl::native_type(std::move(parameters_values_vector))).serialize()), cql3::raw_value::make_value(inet_addr_type->decompose(record.client.addr())), cql3::raw_value::make_value(make_set_value(my_set_type, set_type_impl::native_type(std::move(tables_names_vector))).serialize()), cql3::raw_value::make_value(utf8_type->decompose(record.username)), cql3::raw_value::make_value(int32_type->decompose((int32_t)(record.slow_query_record_ttl.count()))) }); return cql3::query_options(db::consistency_level::ANY, std::experimental::nullopt, std::move(values), false, cql3::query_options::specific_options::DEFAULT, cql_serialization_format::latest()); } std::vector trace_keyspace_helper::make_event_mutation_data(one_session_records& session_records, const event_record& record) { auto backend_state_ptr = static_cast(session_records.backend_state_ptr.get()); std::vector values({ cql3::raw_value::make_value(uuid_type->decompose(session_records.session_id)), cql3::raw_value::make_value(timeuuid_type->decompose(utils::UUID_gen::get_time_UUID(make_monotonic_UUID_tp(backend_state_ptr->last_nanos, record.event_time_point)))), cql3::raw_value::make_value(utf8_type->decompose(record.message)), cql3::raw_value::make_value(inet_addr_type->decompose(utils::fb_utilities::get_broadcast_address().addr())), cql3::raw_value::make_value(int32_type->decompose(elapsed_to_micros(record.elapsed))), cql3::raw_value::make_value(utf8_type->decompose(_local_tracing.get_thread_name())), cql3::raw_value::make_value(int32_type->decompose((int32_t)(session_records.ttl.count()))) }); return values; } future<> trace_keyspace_helper::apply_events_mutation(lw_shared_ptr records, std::deque& events_records) { if (events_records.empty()) { return now(); } return _events.cache_table_info().then([this, records, &events_records] { logger.trace("{}: storing {} events records", records->session_id, events_records.size()); std::vector> modifications(events_records.size(), _events.insert_stmt()); std::vector> 
values; auto& qp = cql3::get_local_query_processor(); values.reserve(events_records.size()); std::for_each(events_records.begin(), events_records.end(), [&values, all_records = records, this] (event_record& one_event_record) { values.emplace_back(make_event_mutation_data(*all_records, one_event_record)); }); return do_with( cql3::query_options::make_batch_options(cql3::query_options(db::consistency_level::ANY, std::experimental::nullopt, std::vector{}, false, cql3::query_options::specific_options::DEFAULT, cql_serialization_format::latest()), std::move(values)), cql3::statements::batch_statement(cql3::statements::batch_statement::type::UNLOGGED, std::move(modifications), cql3::attributes::none(), qp.get_cql_stats()), [this] (auto& batch_options, auto& batch) { return batch.execute(service::get_storage_proxy(), _dummy_query_state, batch_options).then([] (shared_ptr res) { return now(); }); } ); }); } future<> trace_keyspace_helper::flush_one_session_mutations(lw_shared_ptr records) { // grab events records available so far return do_with(std::move(records->events_recs), [this, records] (std::deque& events_records) { records->events_recs.clear(); // Check if a session's record is ready before handling events' records. // // New event's records and a session's record may become ready while a // mutation with the current events' records is being written. We don't want // to allow the situation when a session's record is written before the last // event record from the same session. bool session_record_is_ready = records->session_rec.ready(); // From this point on - all new data will have to be handled in the next write event records->data_consumed(); // We want to serialize the creation of events mutations in order to ensure // that mutations for events that were created first are going to be // created first too. 
auto backend_state_ptr = static_cast(records->backend_state_ptr.get()); semaphore& write_sem = backend_state_ptr->write_sem; return with_semaphore(write_sem, 1, [this, records, session_record_is_ready, &events_records] { return apply_events_mutation(records, events_records).then([this, session_record_is_ready, records] { if (session_record_is_ready) { // if session is finished - store a session and a session time index entries logger.trace("{}: going to store a session event", records->session_id); return _sessions.insert(make_session_mutation_data, *records).then([this, records] { if (!records->do_log_slow_query) { return now(); } auto start_time_id = utils::UUID_gen::get_time_UUID(make_monotonic_UUID_tp(_slow_query_last_nanos, records->session_rec.started_at)); logger.trace("{}: going to store a slow query event", records->session_id); return _slow_query_log.insert(make_slow_query_mutation_data, *records, start_time_id); }); } else { return now(); } }); }).finally([records] {}); }); } std::unique_ptr trace_keyspace_helper::allocate_session_state() const { return std::make_unique(); } future<> trace_keyspace_helper::table_helper::cache_table_info() { if (_prepared_stmt) { return now(); } else { // if prepared statement has been invalidated - drop cached pointers _insert_stmt = nullptr; } return cql3::get_local_query_processor().prepare(_insert_cql, _ks_helper.get_dummy_qs().get_client_state(), false).then([this] (shared_ptr msg_ptr) { _prepared_stmt = std::move(msg_ptr->get_prepared()); shared_ptr cql_stmt = _prepared_stmt->statement; _insert_stmt = dynamic_pointer_cast(cql_stmt); }).handle_exception([this] (auto eptr) { // One of the possible causes for an error here could be the table that doesn't exist. this->setup_table().discard_result(); // We throw the bad_column_family exception because the caller // expects and accounts this type of errors. try { std::rethrow_exception(eptr); } catch (std::exception& e) { throw bad_column_family(_name, e); } catch (...) 
{ throw bad_column_family(_name); } }); } using registry = class_registrator; static registry registrator1("trace_keyspace_helper"); }