/* * Copyright (C) 2016-present ScyllaDB * * Modified by ScyllaDB */ /* * SPDX-License-Identifier: (AGPL-3.0-or-later and Apache-2.0) */ #pragma once #include #include #include #include #include #include "gc_clock.hh" #include "utils/UUID.hh" #include "gms/inet_address.hh" #include "enum_set.hh" #include "log.hh" #include "seastarx.hh" namespace service { class migration_manager; } namespace cql3 { class query_processor; } namespace tracing { using elapsed_clock = std::chrono::steady_clock; extern logging::logger tracing_logger; class trace_state_ptr; class tracing; enum class trace_type : uint8_t { NONE, QUERY, REPAIR, }; extern std::vector trace_type_names; inline const sstring& type_to_string(trace_type t) { return trace_type_names.at(static_cast(t)); } /** * Returns a TTL for a given trace type * @param t trace type * * @return TTL */ inline std::chrono::seconds ttl_by_type(const trace_type t) { switch (t) { case trace_type::NONE: case trace_type::QUERY: return std::chrono::seconds(86400); // 1 day case trace_type::REPAIR: return std::chrono::seconds(604800); // 7 days default: // unknown type value - must be a SW bug throw std::invalid_argument("unknown trace type: " + std::to_string(int(t))); } } /** * @brief represents an ID of a single tracing span. * * Currently span ID is a random 64-bit integer. */ class span_id { private: uint64_t _id = illegal_id; public: static constexpr uint64_t illegal_id = 0; public: span_id() = default; uint64_t get_id() const { return _id; } span_id(uint64_t id) : _id(id) {} /** * @return New span_id with a random legal value */ static span_id make_span_id(); }; // !!!!IMPORTANT!!!! // // The enum_set based on this enum is serialized using IDL, therefore new items // should always be added to the end of this enum - never before the existing // ones. // // Otherwise this may break IDL's backward compatibility. 
enum class trace_state_props {
    write_on_close, primary, log_slow_query, full_tracing, ignore_events
};

// Set of trace_state_props flags describing a tracing session.
using trace_state_props_set = enum_set>;

/**
 * Plain description of a tracing session: its ID, type, properties, slow
 * query settings, parent span and start timestamp.
 */
class trace_info {
public:
    utils::UUID session_id;
    trace_type type;
    bool write_on_close;
    trace_state_props_set state_props;
    uint32_t slow_query_threshold_us; // in microseconds
    uint32_t slow_query_ttl_sec; // in seconds
    span_id parent_id;
    uint64_t start_ts_us = 0u; // sentinel value (== "unset")

public:
    /**
     * Stores every argument in the member of the matching meaning.
     *
     * @param sid session ID
     * @param t session type
     * @param w_o_c value for write_on_close
     * @param s_p session properties set
     * @param slow_query_threshold slow query threshold (stored in slow_query_threshold_us)
     * @param slow_query_ttl slow query record TTL (stored in slow_query_ttl_sec)
     * @param p_id parent span ID
     * @param s_t_u start timestamp (stored in start_ts_us)
     */
    trace_info(utils::UUID sid, trace_type t, bool w_o_c, trace_state_props_set s_p, uint32_t slow_query_threshold, uint32_t slow_query_ttl, span_id p_id, uint64_t s_t_u)
        : session_id(std::move(sid))
        , type(t)
        , write_on_close(w_o_c)
        , state_props(s_p)
        , slow_query_threshold_us(slow_query_threshold)
        , slow_query_ttl_sec(slow_query_ttl)
        , parent_id(std::move(p_id))
        , start_ts_us(s_t_u)
    {
        // NOTE(review): presumably mirrors the standalone write_on_close flag
        // into state_props - confirm against enum_set::set_if.
        state_props.set_if(write_on_close);
    }
};

struct one_session_records;
// Container of session records that is handed to the backend in one write.
using records_bulk = std::deque>;

// Polymorphic base with nothing but a virtual destructor.
struct backend_session_state_base {
    virtual ~backend_session_state_base() {};
};

/**
 * Abstract interface of a tracing backend. Every operation is pure virtual;
 * the object only keeps a reference to the tracing service it was created
 * with.
 */
struct i_tracing_backend_helper {
    using wall_clock = std::chrono::system_clock;

protected:
    tracing& _local_tracing;

public:
    using ptr_type = std::unique_ptr;

    i_tracing_backend_helper(tracing& tr) : _local_tracing(tr) {}
    virtual ~i_tracing_backend_helper() {}
    virtual future<> start(cql3::query_processor& qp, service::migration_manager& mm) = 0;
    virtual future<> shutdown() = 0;

    /**
     * Write a bulk of tracing records.
     *
     * This function has to clear a scheduled state of each one_session_records object
     * in the @param bulk after it has been actually passed to the backend for writing.
     *
     * @param bulk a bulk of records
     */
    virtual void write_records_bulk(records_bulk& bulk) = 0;

    virtual std::unique_ptr allocate_session_state() const = 0;

private:
    friend class tracing;
};

/**
 * A single trace event: its text, the elapsed time and the wall clock time
 * point it is associated with.
 */
struct event_record {
    std::string message;
    elapsed_clock::duration elapsed;
    i_tracing_backend_helper::wall_clock::time_point event_time_point;

    // The message is accepted as an sstring and stored as a std::string.
    event_record(sstring message_, elapsed_clock::duration elapsed_, i_tracing_backend_helper::wall_clock::time_point event_time_point_)
        : message(std::move(message_))
        , elapsed(elapsed_)
        , event_time_point(event_time_point_) {}
};

/**
 * Per-session data of a tracing session.
 */
struct session_record {
    gms::inet_address client;
    // Keep the containers below sorted since some backends require that and
    // it's very cheap to always do that because the amount of elements in a
    // container is very small.
    std::map parameters;
    std::set tables;
    sstring username;
    sstring request;
    size_t request_size = 0;
    size_t response_size = 0;
    std::chrono::system_clock::time_point started_at;
    trace_type command = trace_type::NONE;
    elapsed_clock::duration elapsed;
    std::chrono::seconds slow_query_record_ttl;

private:
    // Set by set_consumed(); a consumed record is never ready() again.
    bool _consumed = false;

public:
    /**
     * @param cmd session type, stored in command
     * @param ttl stored in slow_query_record_ttl
     *
     * elapsed starts at -1, so ready() returns false until a non-negative
     * value is stored in it.
     */
    session_record(trace_type cmd, std::chrono::seconds ttl)
        : username("")
        , command(cmd)
        , elapsed(-1)
        , slow_query_record_ttl(ttl)
    {}

    /**
     * @return TRUE if elapsed holds a non-negative value and the record has
     *         not been marked as consumed yet
     */
    bool ready() const {
        return elapsed.count() >= 0 && !_consumed;
    }

    void set_consumed() {
        _consumed = true;
    }
};

/**
 * All records of one tracing session: the session record, the event records
 * and the bookkeeping needed to account them against the per-shard counters
 * of the tracing service.
 */
class one_session_records {
private:
    // The tracing service whose counters budget_ptr is pointed at.
    shared_ptr _local_tracing_ptr;

public:
    utils::UUID session_id;
    session_record session_rec;
    std::chrono::seconds ttl;
    std::deque events_recs;
    std::unique_ptr backend_state_ptr;
    bool do_log_slow_query = false;

    // A pointer to the records counter of the corresponding state new records
    // of this tracing session should consume from (e.g. "cached" or "pending
    // for write").
    uint64_t* budget_ptr;

    // Each tracing session object represents a single tracing span.
    //
    // Each span has a span ID. In order to be able to build a full tree of all
    // spans of the same query we need a parent span ID as well.
    span_id parent_id;
    span_id my_span_id;

    // Defined out of line. When session_id is not given it defaults to
    // std::nullopt and parent_id defaults to the illegal span ID.
    one_session_records(trace_type type, std::chrono::seconds slow_query_ttl, std::chrono::seconds slow_query_rec_ttl,
            std::optional session_id = std::nullopt, span_id parent_id = span_id::illegal_id);

    /**
     * Consume a single record from the per-shard budget.
     *
     * Increments the counter budget_ptr currently points at.
     */
    void consume_from_budget() {
        ++(*budget_ptr);
    }

    /**
     * Drop all pending records and return the budget.
     */
    void drop_records() {
        // size() has to be evaluated before the records are dropped: both
        // statements below make it smaller.
        (*budget_ptr) -= size();
        events_recs.clear();
        session_rec.set_consumed();
    }

    /**
     * Should be called when a record is scheduled for write.
     * From that point till data_consumed() call all new records will be written
     * in the next write event.
     */
    inline void set_pending_for_write();

    /**
     * Should be called after all data pending to be written in this record has
     * been processed.
     * From that point on new records are cached internally and have to be
     * explicitly committed for write in order to be written during the write event.
     */
    inline void data_consumed();

    bool is_pending_for_write() const {
        return _is_pending_for_write;
    }

    /**
     * @return the number of event records plus one if the session record is
     *         ready()
     */
    uint64_t size() const {
        return events_recs.size() + session_rec.ready();
    }

private:
    bool _is_pending_for_write = false;
};

/**
 * The tracing service. It accounts the tracing records of a shard, collects
 * the records that are scheduled for write and passes them to the backend
 * helper.
 */
class tracing : public seastar::async_sharded_service {
public:
    static const gc_clock::duration write_period;
    // maximum number of sessions pending for write per shard
    static constexpr int max_pending_sessions = 1000;
    // expectation of an average number of trace records per session
    static constexpr int exp_trace_events_per_session = 10;
    // maximum allowed pending records per-shard
    static constexpr int max_pending_trace_records = max_pending_sessions * exp_trace_events_per_session;
    // number of pending sessions that would trigger a write event
    static constexpr int write_event_sessions_threshold = 100;
    // number of pending records that would trigger a write event
    static constexpr int write_event_records_threshold = write_event_sessions_threshold * exp_trace_events_per_session;
    // Number of events when an info message is printed
    static constexpr int log_warning_period = 10000;

    static const std::chrono::microseconds default_slow_query_duraion_threshold;
    static const std::chrono::seconds default_slow_query_record_ttl;

    // Statistics counters of this service instance.
    struct stats {
        uint64_t dropped_sessions = 0;
        uint64_t dropped_records = 0;
        uint64_t trace_records_count = 0;
        uint64_t trace_errors = 0;
    } stats;

private:
    // A number of currently active tracing sessions
    uint64_t _active_sessions = 0;

    // Below are 3 counters that describe the total amount of tracing records on
    // this shard. Each counter describes a state in which a record may be.
    //
    // Each record may only be in a specific state at every point of time and
    // thereby it must be accounted only in one and only one of the three
    // counters below at any given time.
    //
    // The sum of all three counters should not be greater than
    // (max_pending_trace_records + write_event_records_threshold) at any time
    // (actually it can get as high as a value above plus (max_pending_sessions)
    // if all sessions are primary but we won't take this into an account for
    // simplicity).
    //
    // The same is about the number of outstanding sessions: it may not be
    // greater than (max_pending_sessions + write_event_sessions_threshold) at
    // any time.
    //
    // If total number of tracing records is greater or equal to the limit
    // above, the new trace point is going to be dropped.
    //
    // If current number of records plus the expected number of trace records
    // per session (exp_trace_events_per_session) is greater than the limit
    // above new sessions will be dropped. A new session will also be dropped if
    // there are too many active sessions.
    //
    // When the record or a session is dropped the appropriate statistics
    // counters are updated and there is a rate-limited warning message printed
    // to the log.
    //
    // Every time a number of records pending for write is greater or equal to
    // (write_event_records_threshold) or a number of sessions pending for
    // write is greater or equal to (write_event_sessions_threshold) a write
    // event is issued.
    //
    // Every 2 seconds a timer would write all pending for write records
    // available so far.

    // Total number of records cached in the active sessions that are not going
    // to be written in the next write event
    uint64_t _cached_records = 0;
    // Total number of records that are currently being written to I/O
    uint64_t _flushing_records = 0;
    // Total number of records in the _pending_for_write_records_bulk. All of
    // them are going to be written to the I/O during the next write event.
    uint64_t _pending_for_write_records_count = 0;
    records_bulk _pending_for_write_records_bulk;
    timer _write_timer;
    // _down becomes FALSE after the local service is fully initialized and
    // tracing records are allowed to be created and collected. It becomes TRUE
    // after the shutdown() call and prevents further write attempts to I/O
    // backend.
    bool _down = true;
    // If _slow_query_logging_enabled is enabled, a query processor keeps all
    // trace events related to the query until in the end it can decide
    // if the query was slow to be saved.
    bool _slow_query_logging_enabled = false;
    // If _ignore_trace_events is enabled, tracing::trace ignores all tracing
    // events as well as creating trace_state descendants with trace_info to
    // track tracing sessions only. This is used to implement lightweight
    // slow query tracing.
    bool _ignore_trace_events = false;
    std::unique_ptr _tracing_backend_helper_ptr;
    sstring _thread_name;
    sstring _tracing_backend_helper_class_name;
    seastar::metrics::metric_groups _metrics;
    double _trace_probability = 0.0; // keep this one for querying purposes
    // The value the output of _gen is compared against in trace_next_query().
    uint64_t _normalized_trace_probability = 0;
    std::ranlux48_base _gen;
    std::chrono::microseconds _slow_query_duration_threshold;
    std::chrono::seconds _slow_query_record_ttl;

public:
    /**
     * @return the next value produced by the random generator of this instance
     */
    uint64_t get_next_rand_uint64() {
        return _gen();
    }

    i_tracing_backend_helper& backend_helper() {
        return *_tracing_backend_helper_ptr;
    }

    const sstring& get_thread_name() const {
        return _thread_name;
    }

    /**
     * @return the sharded tracing service; it is created on the first call
     *         and never destroyed
     */
    static seastar::sharded& tracing_instance() {
        // FIXME: leaked intentionally to avoid shutdown problems, see #293
        static seastar::sharded* tracing_inst = new seastar::sharded();

        return *tracing_inst;
    }

    static tracing& get_local_tracing_instance() {
        return tracing_instance().local();
    }

    /**
     * @return TRUE while the service is not down (see _down)
     */
    bool started() const {
        return !_down;
    }

    tracing(sstring tracing_backend_helper_class_name);

    // Initialize a tracing backend (e.g. tracing_keyspace or logstash)
    future<> start(cql3::query_processor& qp, service::migration_manager& mm);

    future<> stop();

    /**
     * Waits until all pending tracing records are flushed to the backend and
     * shuts down the backend. The following calls to
     * write_session_record()/write_event_record() methods of a backend instance
     * should be a NOOP.
     *
     * @return a ready future when the shutdown is complete
     */
    future<> shutdown();

    /**
     * Passes all sessions that are pending for write to the backend helper.
     *
     * The pending records are re-accounted as "flushing" and added to the
     * trace_records_count statistic before the bulk is handed over; the bulk
     * is cleared afterwards. Does nothing if no session is pending.
     */
    void write_pending_records() {
        if (_pending_for_write_records_bulk.size()) {
            _flushing_records += _pending_for_write_records_count;
            stats.trace_records_count += _pending_for_write_records_count;
            _pending_for_write_records_count = 0;
            _tracing_backend_helper_ptr->write_records_bulk(_pending_for_write_records_bulk);
            _pending_for_write_records_bulk.clear();
        }
    }

    /**
     * Reports that @param nr records are no longer being flushed.
     *
     * @param nr number of completed records
     * @throw std::logic_error if @ref nr is greater than the current number
     *        of flushing records
     */
    void write_complete(uint64_t nr = 1) {
        if (nr > _flushing_records) {
            throw std::logic_error(seastar::format("completing more records ({:d}) than there are pending ({:d})", nr, _flushing_records));
        }
        _flushing_records -= nr;
    }

    /**
     * Create a new primary tracing session.
     *
     * @param type a tracing session type
     * @param props trace session properties set
     *
     * @return tracing state handle
     */
    trace_state_ptr create_session(trace_type type, trace_state_props_set props) noexcept;

    /**
     * Create a new secondary tracing session.
     *
     * @param secondary_session_info tracing session info
     *
     * @return tracing state handle
     */
    trace_state_ptr create_session(const trace_info& secondary_session_info) noexcept;

    /**
     * Issues a write event if either the number of pending records or the
     * number of pending sessions has reached its write event threshold.
     */
    void write_maybe() {
        if (_pending_for_write_records_count >= write_event_records_threshold || _pending_for_write_records_bulk.size() >= write_event_sessions_threshold) {
            write_pending_records();
        }
    }

    /**
     * Accounts the end of a tracing session: decrements the number of active
     * sessions.
     */
    void end_session() {
        --_active_sessions;
    }

    /**
     * Schedules the records of a session for write.
     *
     * @param records records of the session
     * @param write_now if TRUE all pending records are written right away,
     *                  otherwise they are written only if one of the write
     *                  event thresholds has been reached
     */
    void write_session_records(lw_shared_ptr records, bool write_now) {
        // if service is down - drop the records and return
        if (_down) {
            return;
        }

        try {
            schedule_for_write(std::move(records));
        } catch (...) {
            // OOM: bump up the error counter and ignore
            // (any exception thrown while scheduling is handled this way)
            ++stats.trace_errors;
            return;
        }

        if (write_now) {
            write_pending_records();
        } else {
            write_maybe();
        }
    }

    /**
     * Sets a probability for tracing a CQL request.
     *
     * @param p a new tracing probability - a floating point value in a [0,1]
     *          range. It would effectively define a portion of CQL requests
     *          initiated on the current Node that will be traced.
     * @throw std::invalid_argument if @ref p is out of range
     */
    void set_trace_probability(double p);
    double get_trace_probability() const {
        return _trace_probability;
    }

    /**
     * Decides whether the next query should be traced.
     *
     * A random number is drawn only if the normalized probability is not
     * zero.
     *
     * @return TRUE if the drawn number is below the normalized probability
     */
    bool trace_next_query() {
        return _normalized_trace_probability != 0 && _gen() < _normalized_trace_probability;
    }

    std::unique_ptr allocate_backend_session_state() const {
        return _tracing_backend_helper_ptr->allocate_session_state();
    }

    /**
     * Checks if there is enough budget for the @param nr new records
     * @param nr number of new records
     *
     * @return TRUE if there is enough budget, FALSE otherwise
     */
    bool have_records_budget(uint64_t nr = 1) {
        // We don't want the total amount of pending, active and flushing records to
        // exceed the maximum number of pending records plus the number of
        // records that are possibly being written right now.
        //
        // If either records are being created too fast or a backend doesn't
        // keep up we want to start dropping records.
        // In any case, this should be rare.
        if (_pending_for_write_records_count + _cached_records + _flushing_records + nr > max_pending_trace_records + write_event_records_threshold) {
            return false;
        }

        return true;
    }

    // The two getters below expose the counters a one_session_records object
    // points its budget_ptr at.
    uint64_t* get_pending_records_ptr() {
        return &_pending_for_write_records_count;
    }

    uint64_t* get_cached_records_ptr() {
        return &_cached_records;
    }

    /**
     * Adds the session to the bulk that is written during the next write
     * event. Does nothing if the session is already pending for write.
     *
     * @param records records of the session
     */
    void schedule_for_write(lw_shared_ptr records) {
        if (records->is_pending_for_write()) {
            return;
        }

        _pending_for_write_records_bulk.emplace_back(records);
        records->set_pending_for_write();

        // move the current records from a "cached" to "pending for write" state
        auto current_records_num = records->size();
        _cached_records -= current_records_num;
        _pending_for_write_records_count += current_records_num;
    }

    void set_slow_query_enabled(bool enable = true) {
        _slow_query_logging_enabled = enable;
    }

    bool slow_query_tracing_enabled() const {
        return _slow_query_logging_enabled;
    }

    void set_ignore_trace_events(bool enable = true) {
        _ignore_trace_events = enable;
    }

    bool ignore_trace_events_enabled() const {
        return _ignore_trace_events;
    }

    /**
     * Set the slow query threshold
     *
     * We limit the number of microseconds in the threshold by a maximal unsigned 32-bit
     * integer.
     *
     * If a new threshold value exceeds the above limitation we will override it
     * with the value based on a limit above.
     *
     * @param new_threshold new threshold value
     */
    void set_slow_query_threshold(std::chrono::microseconds new_threshold) {
        if (new_threshold.count() > std::numeric_limits::max()) {
            _slow_query_duration_threshold = std::chrono::microseconds(std::numeric_limits::max());
            return;
        }

        _slow_query_duration_threshold = new_threshold;
    }

    std::chrono::microseconds slow_query_threshold() const {
        return _slow_query_duration_threshold;
    }

    /**
     * Set the slow query record TTL
     *
     * We limit the number of seconds in the TTL by a maximal signed 32-bit
     * integer.
     *
     * If a new TTL value exceeds the above limitation we will override it
     * with the value based on a limit above.
     *
     * @param new_ttl new TTL
     */
    void set_slow_query_record_ttl(std::chrono::seconds new_ttl) {
        if (new_ttl.count() > std::numeric_limits::max()) {
            _slow_query_record_ttl = std::chrono::seconds(std::numeric_limits::max());
            return;
        }

        _slow_query_record_ttl = new_ttl;
    }

    std::chrono::seconds slow_query_record_ttl() const {
        return _slow_query_record_ttl;
    }

private:
    void write_timer_callback();

    /**
     * Check if we may create a new tracing session.
     *
     * @return TRUE if conditions are allowing creating a new tracing session
     */
    bool may_create_new_session(const std::optional& session_id = std::nullopt);
};

// Marks the session as pending for write and makes new records of the session
// consume from the "pending for write" counter of the tracing service.
void one_session_records::set_pending_for_write() {
    _is_pending_for_write = true;
    budget_ptr = _local_tracing_ptr->get_pending_records_ptr();
}

// Marks a ready session record as consumed, clears the "pending for write"
// state and makes new records of the session consume from the "cached"
// counter of the tracing service again.
void one_session_records::data_consumed() {
    if (session_rec.ready()) {
        session_rec.set_consumed();
    }

    _is_pending_for_write = false;
    budget_ptr = _local_tracing_ptr->get_cached_records_ptr();
}

inline span_id span_id::make_span_id() {
    // make sure the value is always greater than 0: the shift clears the
    // lowest bit and adding 1 sets it, so the result is always odd and can
    // never be equal to illegal_id
    return 1 + (tracing::get_local_tracing_instance().get_next_rand_uint64() << 1);
}
} // namespace tracing

// Formats a span_id as its numeric ID. No format specification is parsed.
template <> struct fmt::formatter {
    constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
    auto format(const tracing::span_id& id, fmt::format_context& ctx) const {
        return fmt::format_to(ctx.out(), "{}", id.get_id());
    }
};