Currently, perf-simple-query summarizes the statistics
only for the throughput, printing the median,
median absolute deviation, minimum, and maximum.
But the throughput is typically highly variable
and its median is noisy.
This patch calculates also the mean and standard deviation
and does that also for instructions_per_op and cpu_cycles_per_op
to present a fuller picture of the performance metrics.
Output example:
```
random-seed=3383668492
enable-cache=1
Running test with config: {partitions=10000, concurrency=100, mode=read, frontend=cql, query_single_key=no, counters=no}
Disabling auto compaction
Creating 10000 partitions...
95613.97 tps ( 63.1 allocs/op, 0.0 logallocs/op, 14.1 tasks/op, 42456 insns/op, 22117 cycles/op, 0 errors)
97538.45 tps ( 63.1 allocs/op, 0.0 logallocs/op, 14.1 tasks/op, 42454 insns/op, 22094 cycles/op, 0 errors)
95883.37 tps ( 63.1 allocs/op, 0.0 logallocs/op, 14.1 tasks/op, 42438 insns/op, 22268 cycles/op, 0 errors)
96791.45 tps ( 63.1 allocs/op, 0.0 logallocs/op, 14.1 tasks/op, 42433 insns/op, 22256 cycles/op, 0 errors)
97894.71 tps ( 63.1 allocs/op, 0.0 logallocs/op, 14.1 tasks/op, 42420 insns/op, 22010 cycles/op, 0 errors)
throughput: mean=96744.39 standard-deviation=996.89 median=96791.45 median-absolute-deviation=861.02 maximum=97894.71 minimum=95613.97
instructions_per_op: mean=42440.08 standard-deviation=14.99 median=42437.59 median-absolute-deviation=13.58 maximum=42456.15 minimum=42420.10
cpu_cycles_per_op: mean=22148.98 standard-deviation=110.43 median=22117.04 median-absolute-deviation=106.89 maximum=22267.70 minimum=22010.42
```
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Closes scylladb/scylladb#19450
330 lines
11 KiB
C++
/*
|
|
* Copyright (C) 2015-present ScyllaDB
|
|
*/
|
|
|
|
/*
|
|
* SPDX-License-Identifier: AGPL-3.0-or-later
|
|
*/
|
|
|
|
#pragma once
|
|
|
|
#include <seastar/core/print.hh>
|
|
#include <seastar/core/future-util.hh>
|
|
#include <seastar/core/distributed.hh>
|
|
#include <seastar/core/weak_ptr.hh>
|
|
#include <seastar/coroutine/as_future.hh>
|
|
#include "seastarx.hh"
|
|
#include "utils/extremum_tracking.hh"
|
|
#include "utils/estimated_histogram.hh"
|
|
#include <seastar/testing/linux_perf_event.hh>
|
|
#include <seastar/util/defer.hh>
|
|
#include "reader_permit.hh"
|
|
|
|
#include <chrono>
#include <iosfwd>
#include <utility>
#include <vector>

#include <boost/range/irange.hpp>
|
|
|
|
template <typename Func>
|
|
static
|
|
void time_it(Func func, int iterations = 5, int iterations_between_clock_readings = 1000) {
|
|
using clk = std::chrono::steady_clock;
|
|
|
|
for (int i = 0; i < iterations; i++) {
|
|
auto start = clk::now();
|
|
auto end_at = start + std::chrono::seconds(1);
|
|
uint64_t count = 0;
|
|
|
|
while (clk::now() < end_at) {
|
|
for (int i = 0; i < iterations_between_clock_readings; i++) { // amortize clock reading cost
|
|
func();
|
|
count++;
|
|
}
|
|
}
|
|
|
|
auto end = clk::now();
|
|
auto duration = std::chrono::duration<double>(end - start).count();
|
|
std::cout << format("{:.2f}", (double)count / duration) << " tps\n";
|
|
}
|
|
}
|
|
|
|
// Per-shard counters collected around a benchmark run.  Two snapshots
// are subtracted to obtain the deltas for the measured interval, and
// the per-shard deltas are summed across shards.
struct executor_shard_stats {
    uint64_t invocations = 0;
    uint64_t allocations = 0;
    uint64_t log_allocations = 0;
    uint64_t tasks_executed = 0;
    uint64_t instructions_retired = 0;
    uint64_t cpu_cycles_retired = 0;
    uint64_t errors = 0;
};

// Component-wise sum of two stat sets (aggregates shards).
inline
executor_shard_stats
operator+(executor_shard_stats lhs, executor_shard_stats rhs) {
    return executor_shard_stats{
        lhs.invocations + rhs.invocations,
        lhs.allocations + rhs.allocations,
        lhs.log_allocations + rhs.log_allocations,
        lhs.tasks_executed + rhs.tasks_executed,
        lhs.instructions_retired + rhs.instructions_retired,
        lhs.cpu_cycles_retired + rhs.cpu_cycles_retired,
        lhs.errors + rhs.errors,
    };
}

// Component-wise difference of two stat sets (snapshot deltas).
inline
executor_shard_stats
operator-(executor_shard_stats lhs, executor_shard_stats rhs) {
    return executor_shard_stats{
        lhs.invocations - rhs.invocations,
        lhs.allocations - rhs.allocations,
        lhs.log_allocations - rhs.log_allocations,
        lhs.tasks_executed - rhs.tasks_executed,
        lhs.instructions_retired - rhs.instructions_retired,
        lhs.cpu_cycles_retired - rhs.cpu_cycles_retired,
        lhs.errors - rhs.errors,
    };
}
|
|
|
|
// Process-wide counters exported by the perf harness (defined elsewhere);
// sampled before and after a run to compute per-interval deltas.
uint64_t perf_tasks_processed();
uint64_t perf_mallocs();
uint64_t perf_logallocs();
|
|
|
|
// Drives concurrent and continuous execution of given asynchronous action
// until a deadline. Counts invocations and collects statistics.
template <typename Func>
class executor {
    const Func _func;
    const lowres_clock::time_point _end_at;   // wall-clock deadline (used when _end_at_count == 0)
    const uint64_t _end_at_count;             // if non-zero, stop after this many invocations instead
    const unsigned _n_workers;                // concurrent workers on this shard
    const bool _stop_on_error;                // propagate the first failure instead of only counting it
    uint64_t _count;                          // invocations so far, shared by all workers on this shard
    uint64_t _errors;                         // failed invocations so far
    linux_perf_event _instructions_retired_counter = linux_perf_event::user_instructions_retired();
    linux_perf_event _cpu_cycles_retired_counter = linux_perf_event::user_cpu_cycles_retired();
private:
    // Samples the current values of all tracked counters (defined out of line).
    executor_shard_stats executor_shard_stats_snapshot();
    // One worker loop: keeps invoking _func until the invocation budget
    // (if set) or the deadline is reached.
    future<> run_worker() {
        while (_end_at_count ? _count < _end_at_count : lowres_clock::now() < _end_at) {
            ++_count;
            // as_future() captures a failure instead of throwing, so errors
            // can be counted without unwinding the loop.
            future<> f = co_await coroutine::as_future(_func());
            if (f.failed()) {
                ++_errors;
                if (_stop_on_error) [[unlikely]] {
                    co_return co_await std::move(f);  // rethrow the failure to the caller
                }
                f.ignore_ready_future();
            }
        }
    }
public:
    executor(unsigned n_workers, Func func, lowres_clock::time_point end_at, uint64_t end_at_count = 0, bool stop_on_error = true)
        : _func(std::move(func))
        , _end_at(end_at)
        , _end_at_count(end_at_count)
        , _n_workers(n_workers)
        , _stop_on_error(stop_on_error)
        , _count(0)
        , _errors(0)
    { }

    // Returns the number of invocations of @func
    future<executor_shard_stats> run() {
        // Snapshot counters just before enabling the perf events so the
        // returned deltas cover exactly the measured interval.
        auto stats_start = executor_shard_stats_snapshot();
        _instructions_retired_counter.enable();
        _cpu_cycles_retired_counter.enable();
        auto idx = boost::irange(0, (int)_n_workers);
        // Run _n_workers worker loops concurrently on this shard.
        return parallel_for_each(idx.begin(), idx.end(), [this] (auto idx) mutable {
            return this->run_worker();
        }).then([this, stats_start] {
            _instructions_retired_counter.disable();
            _cpu_cycles_retired_counter.disable();
            auto stats_end = executor_shard_stats_snapshot();
            return stats_end - stats_start;
        });
    }

    future<> stop() {
        // Nothing to tear down; exists to satisfy distributed<> service protocol.
        return make_ready_future<>();
    }
};
|
|
|
|
// Samples the current values of all tracked counters for this shard.
// Two snapshots taken around a run are subtracted to get the deltas.
template <typename Func>
executor_shard_stats
executor<Func>::executor_shard_stats_snapshot() {
    return executor_shard_stats{
        .invocations = _count,
        .allocations = perf_mallocs(),
        .log_allocations = perf_logallocs(),
        .tasks_executed = perf_tasks_processed(),
        .instructions_retired = _instructions_retired_counter.read(),
        .cpu_cycles_retired = _cpu_cycles_retired_counter.read(),
        .errors = _errors,
    };
}
|
|
|
|
// Normalized results of one benchmark iteration: everything except
// throughput and errors is expressed per executed operation.
struct perf_result {
    double throughput;          // operations per second
    double mallocs_per_op;      // perf_mallocs() delta per operation
    double logallocs_per_op;    // perf_logallocs() delta per operation
    double tasks_per_op;        // tasks executed per operation
    double instructions_per_op; // instructions retired per operation
    double cpu_cycles_per_op;   // CPU cycles per operation
    uint64_t errors;            // total failed operations in the iteration
};
|
|
|
|
|
|
// Summary statistics computed over the perf_result entries produced by
// all iterations of a run (one stats_t per metric of interest).
struct aggregated_perf_results {
    struct stats_t {
        double median;
        double median_absolute_deviation;
        double min;
        double max;
        double mean;
        double stdev;   // standard deviation
    };
    // Keyed by metric name, e.g. "throughput", "instructions_per_op",
    // "cpu_cycles_per_op".
    std::unordered_map<std::string, stats_t> stats;
    perf_result median_by_throughput; // Simplification, median element is considered based on throughput value

    // Computes the summary over `results` (defined out of line).
    aggregated_perf_results(std::vector<perf_result>& results);

private:
    // Computes stats_t for the metric extracted by get_stat from each result.
    stats_t calculate_stats(std::vector<perf_result>&, std::function<double(const perf_result&)> get_stat) const;
};
|
|
|
|
std::ostream& operator<<(std::ostream& os, const aggregated_perf_results& result);
|
|
// Use to make a perf_result with aio_writes added. Need to give "update" as
// update-func to time_parallel_ex to make it work.
struct aio_writes_result_mixin {
    double aio_writes;       // filled in by update() — see out-of-line definition
    double aio_write_bytes;  // filled in by update() — see out-of-line definition

    aio_writes_result_mixin();

    // Intended as the UpdateFunc for time_parallel_ex(): folds the
    // aggregated shard stats into the mixin's fields.
    static void update(aio_writes_result_mixin& result, const executor_shard_stats& stats);
};
|
|
|
|
// perf_result extended with aio write statistics; requires passing
// aio_writes_result_mixin::update as the update-func to time_parallel_ex().
struct perf_result_with_aio_writes : public perf_result, public aio_writes_result_mixin {};

// fmt formatter so the extended result can be printed with fmt::print("{}", r).
template <> struct fmt::formatter<perf_result_with_aio_writes> : fmt::formatter<string_view> {
    auto format(const perf_result_with_aio_writes&, fmt::format_context& ctx) const -> decltype(ctx.out());
};
|
|
|
|
/**
 * Measures throughput of an asynchronous action. Executes the action on all cores
 * in parallel, with given number of concurrent executions per core.
 *
 * Runs many iterations. Prints partial total throughput after each iteration.
 *
 * Returns a vector of throughputs achieved in each iteration.
 */
template <typename Res, typename Func, typename UpdateFunc = void(*)(const Res&, const executor_shard_stats&)>
requires (std::is_base_of_v<perf_result, Res> && std::is_invocable_v<UpdateFunc, Res&, const executor_shard_stats&>)
static
std::vector<Res> time_parallel_ex(Func func, unsigned concurrency_per_core, int iterations = 5, unsigned operations_per_shard = 0, bool stop_on_error = true, UpdateFunc uf = [](const auto&, const auto&) {}) {
    using clk = std::chrono::steady_clock;
    // With a fixed per-shard operation budget a single iteration suffices.
    if (operations_per_shard) {
        iterations = 1;
    }
    std::vector<Res> results;
    for (int i = 0; i < iterations; ++i) {
        auto start = clk::now();
        // Each iteration runs workers for roughly one second (unless
        // operations_per_shard caps them earlier).
        auto end_at = lowres_clock::now() + std::chrono::seconds(1);
        distributed<executor<Func>> exec;
        Res result;
        exec.start(concurrency_per_core, func, std::move(end_at), operations_per_shard, stop_on_error).get();
        // Make sure the distributed service is stopped even if run() throws.
        auto stop_exec = defer([&exec] {
            exec.stop().get();
        });
        // Run the executor on every shard and sum the per-shard stats.
        auto stats = exec.map_reduce0(std::mem_fn(&executor<Func>::run),
                executor_shard_stats(), std::plus<executor_shard_stats>()).get();
        auto end = clk::now();
        auto duration = std::chrono::duration<double>(end - start).count();

        // Normalize the raw counters to per-operation values.
        result.throughput = static_cast<double>(stats.invocations) / duration;
        result.mallocs_per_op = double(stats.allocations) / stats.invocations;
        result.logallocs_per_op = double(stats.log_allocations) / stats.invocations;
        result.tasks_per_op = double(stats.tasks_executed) / stats.invocations;
        result.instructions_per_op = double(stats.instructions_retired) / stats.invocations;
        result.cpu_cycles_per_op = double(stats.cpu_cycles_retired) / stats.invocations;
        result.errors = stats.errors;

        // Let the caller fold extra counters (e.g. aio writes) into Res.
        uf(result, stats);

        fmt::print("{}\n", result);
        results.emplace_back(result);
    }
    return results;
}
|
|
|
|
// Convenience wrapper around time_parallel_ex() that returns plain
// perf_result entries and uses the default (no-op) update function.
template <typename Func>
static
std::vector<perf_result> time_parallel(Func func, unsigned concurrency_per_core, int iterations = 5, unsigned operations_per_shard = 0, bool stop_on_error = true) {
    return time_parallel_ex<perf_result>(std::move(func), concurrency_per_core, iterations, operations_per_shard, stop_on_error);
}
|
|
|
|
// Measures the wall-clock time taken by a single synchronous call to `f`.
//
// Returns the elapsed time as a floating-point std::chrono duration
// counted in seconds, so fractional seconds are preserved.
template<typename Func>
auto duration_in_seconds(Func&& f) {
    using clk = std::chrono::steady_clock;
    auto start = clk::now();
    // Forward the callable to preserve the caller's value category
    // (the original invoked it as an lvalue despite taking Func&&).
    std::forward<Func>(f)();
    auto end = clk::now();
    return std::chrono::duration_cast<std::chrono::duration<float>>(end - start);
}
|
|
|
|
// Measures task scheduling latency by repeatedly scheduling a tick and
// recording the time elapsed between consecutive ticks in a histogram
// and a min/max tracker.
class scheduling_latency_measurer : public weakly_referencable<scheduling_latency_measurer> {
    using clk = std::chrono::steady_clock;
    clk::time_point _last = clk::now();   // time of the previous tick
    utils::estimated_histogram _hist{300};
    min_max_tracker<clk::duration> _minmax;
    bool _stop = false;                   // set by stop() to break the tick chain
private:
    // Schedules the next tick() invocation (defined out of line).
    void schedule_tick();
    // Records the latency since the previous tick and re-arms itself
    // unless stop() was called.
    void tick() {
        auto old = _last;
        _last = clk::now();
        auto latency = _last - old;
        _minmax.update(latency);
        _hist.add(latency.count());
        if (!_stop) {
            schedule_tick();
        }
    }
public:
    void start() {
        schedule_tick();
    }
    void stop() {
        _stop = true;
        yield().get(); // so that the last scheduled tick is counted
    }
    const utils::estimated_histogram& histogram() const {
        return _hist;
    }
    clk::duration min() const { return _minmax.min(); }
    clk::duration max() const { return _minmax.max(); }
};
|
|
|
|
std::ostream& operator<<(std::ostream& out, const scheduling_latency_measurer& slm);
|
|
|
|
namespace perf {

// Closes the semaphore in the background when destroyed
class reader_concurrency_semaphore_wrapper {
    std::unique_ptr<reader_concurrency_semaphore> _semaphore;

public:
    explicit reader_concurrency_semaphore_wrapper(sstring name);
    ~reader_concurrency_semaphore_wrapper();
    // Obtains a permit from the wrapped semaphore (defined out of line).
    reader_permit make_permit();
};

} // namespace perf
|
|
|
|
// fmt formatters so the measurement types can be printed with fmt::print
// (format() bodies are defined out of line).
template <> struct fmt::formatter<scheduling_latency_measurer> : fmt::formatter<string_view> {
    auto format(const scheduling_latency_measurer&, fmt::format_context& ctx) const -> decltype(ctx.out());
};

template <> struct fmt::formatter<perf_result> : fmt::formatter<string_view> {
    auto format(const perf_result&, fmt::format_context& ctx) const -> decltype(ctx.out());
};
|