The main idea is to log queries that take "too long" to complete.
The "too long" is above the given threshold.
To achieve the above this patch does the following:
- Introduce two new properties to the tracing::trace_state:
- "Full tracing": when the tracing of this query was explicitly requested.
In this state we will record all possible traces related to this query:
both on the coordinator and on any replica involved.
- "Log slow query": when slow query logging is enabled.
If slow query logging is enabled and a session's "duration" is above
the specified threshold we will create a record in the "slow queries log"
and write all trace records created on the coordinator and on a replica
if a replica's session lasts longer than that threshold.
(We will propagate the Coordinator's slow query logging threshold to replicas
in the context of a specific tracing/logging session).
The properties above are independent, namely they may be enabled and/or disabled
independently and any combination of them is legal (naturally, creating a tracing
session when both states above are disabled makes no sense).
- Instrument the tracing::tracing service to allow the following:
- Enable/disable slow query logging.
- Set/get the slow query duration threshold (in microseconds).
- Set/get the slow query log record TTL value (in seconds).
- Instrument the trace_keyspace_helper to write a slow query log entry
when requested.
- The slow query logging is disabled by default and the threshold is set to half a second.
- The TTL of a slow log record is set to 86400 seconds by default.
- It makes sense to use the same "slow query logging threshold" and a "slow query record TTL"
both on a coordinator and on a replica Nodes in a context of the same tracing session:
- Pass both TTL and a threshold to the replica in a trace_info.
This patch also implements the new slow query logging specific logic:
- Don't write the pending tracing records before the end of a tracing session
until "duration" reaches the logging threshold.
- Don't build the parameters<sstring, sstring> map unless we know we will write it
to I/O.
Signed-off-by: Vlad Zolotarov <vladz@cloudius-systems.com>
149 lines
5.1 KiB
C++
149 lines
5.1 KiB
C++
/*
|
|
* Licensed to the Apache Software Foundation (ASF) under one
|
|
* or more contributor license agreements. See the NOTICE file
|
|
* distributed with this work for additional information
|
|
* regarding copyright ownership. The ASF licenses this file
|
|
* to you under the Apache License, Version 2.0 (the
|
|
* "License"); you may not use this file except in compliance
|
|
* with the License. You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
/*
|
|
* Copyright (C) 2016 ScyllaDB
|
|
*
|
|
* Modified by ScyllaDB
|
|
*/
|
|
|
|
/*
|
|
* This file is part of Scylla.
|
|
*
|
|
* Scylla is free software: you can redistribute it and/or modify
|
|
* it under the terms of the GNU Affero General Public License as published by
|
|
* the Free Software Foundation, either version 3 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* Scylla is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
#include <chrono>
|
|
#include "tracing/trace_state.hh"
|
|
#include "tracing/trace_keyspace_helper.hh"
|
|
#include "service/storage_proxy.hh"
|
|
#include "to_string.hh"
|
|
#include "timestamp.hh"
|
|
|
|
namespace tracing {
|
|
|
|
logging::logger trace_state_logger("trace_state");
|
|
|
|
void trace_state::build_parameters_map() {
|
|
if (!_params_ptr) {
|
|
return;
|
|
}
|
|
|
|
auto& params_map = _records->session_rec.parameters;
|
|
params_values& vals = *_params_ptr;
|
|
|
|
if (vals.batchlog_endpoints) {
|
|
params_map.emplace("batch_endpoints", join(sstring(","), *vals.batchlog_endpoints | boost::adaptors::transformed([](gms::inet_address ep) {return seastar::format("/{}", ep);})));
|
|
}
|
|
|
|
if (vals.cl) {
|
|
params_map.emplace("consistency_level", seastar::format("{}", *vals.cl));
|
|
}
|
|
|
|
if (vals.serial_cl) {
|
|
params_map.emplace("serial_consistency_level", seastar::format("{}", *vals.serial_cl));
|
|
}
|
|
|
|
if (vals.page_size) {
|
|
params_map.emplace("page_size", seastar::format("{:d}", *vals.page_size));
|
|
}
|
|
|
|
if (vals.query) {
|
|
params_map.emplace("query", *vals.query);
|
|
}
|
|
|
|
if (vals.user_timestamp) {
|
|
params_map.emplace("user_timestamp", seastar::format("{:d}", *vals.user_timestamp));
|
|
}
|
|
}
|
|
|
|
trace_state::~trace_state() {
|
|
if (!is_primary() && is_in_state(state::background)) {
|
|
trace_state_logger.error("Secondary session is in a background state! session_id: {}", session_id());
|
|
}
|
|
|
|
stop_foreground_and_write();
|
|
_local_tracing_ptr->end_session();
|
|
|
|
trace_state_logger.trace("{}: destructing", session_id());
|
|
}
|
|
|
|
void trace_state::stop_foreground_and_write() {
|
|
// Do nothing if state hasn't been initiated
|
|
if (is_in_state(state::inactive)) {
|
|
return;
|
|
}
|
|
|
|
if (is_in_state(state::foreground)) {
|
|
auto e = elapsed();
|
|
_records->do_log_slow_query = should_log_slow_query(e);
|
|
|
|
if (is_primary()) {
|
|
// We don't account the session_record event when checking a limit
|
|
// of maximum events per session because there may be only one such
|
|
// event and we don't want to cripple the primary session by
|
|
// "stealing" one trace() event from it.
|
|
//
|
|
// We do want to account them however. If for instance there are a
|
|
// lot of tracing sessions that only open itself and then do nothing
|
|
// - they will create a lot of session_record events and we do want
|
|
// to handle this case properly.
|
|
_records->consume_from_budget();
|
|
|
|
_records->session_rec.elapsed = e;
|
|
|
|
// build_parameters_map() may throw. We don't want to record the
|
|
// session's record in this case since its data may be incomplete.
|
|
// These events should be really rare however, therefore we don't
|
|
// want to optimize this flow (e.g. rollback the corresponding
|
|
// events' records that have already been sent to I/O).
|
|
if (should_write_records()) {
|
|
try {
|
|
build_parameters_map();
|
|
} catch (...) {
|
|
// Bump up an error counter, drop any pending records and
|
|
// continue
|
|
++_local_tracing_ptr->stats.trace_errors;
|
|
_records->drop_records();
|
|
}
|
|
}
|
|
}
|
|
|
|
set_state(state::background);
|
|
}
|
|
|
|
trace_state_logger.trace("{}: Current records count is {}", session_id(), _records->size());
|
|
|
|
if (should_write_records()) {
|
|
_local_tracing_ptr->write_session_records(_records, write_on_close());
|
|
} else {
|
|
_records->drop_records();
|
|
}
|
|
}
|
|
}
|