Files
scylladb/test/perf/perf.hh
Kefu Chai f9091066b7 treewide: replace boost::irange with std::views::iota where possible
When building Scylla with the standard library from GCC 14.2, as shipped by
Fedora 41, we hit the following build failure:

```
/home/kefu/.local/bin/clang++ -DDEBUG -DDEBUG_LSA_SANITIZER -DFMT_SHARED -DSANITIZE -DSCYLLA_BUILD_MODE=debug -DSCYLLA_ENABLE_ERROR_INJECTION -DSEASTAR_API_LEVEL=7 -DSEASTAR_DEBUG -DSEASTAR_DEBUG_PROMISE -DSEASTAR_DEBUG_SHARED_PTR -DSEASTAR_DEFAULT_ALLOCATOR -DSEASTAR_LOGGER_COMPILE_TIME_FMT -DSEASTAR_LOGGER_TYPE_STDOUT -DSEASTAR_SCHEDULING_GROUPS_COUNT=16 -DSEASTAR_SHUFFLE_TASK_QUEUE -DSEASTAR_SSTRING -DSEASTAR_TYPE_ERASE_MORE -DXXH_PRIVATE_API -DCMAKE_INTDIR=\"Debug\" -I/home/kefu/dev/scylladb -I/home/kefu/dev/scylladb/build/gen -I/home/kefu/dev/scylladb/seastar/include -I/home/kefu/dev/scylladb/build/seastar/gen/include -I/home/kefu/dev/scylladb/build/seastar/gen/src -isystem /home/kefu/dev/scylladb/abseil -g -Og -g -gz -std=gnu++23 -fvisibility=hidden -Wall -Werror -Wextra -Wno-error=deprecated-declarations -Wimplicit-fallthrough -Wno-c++11-narrowing -Wno-deprecated-copy -Wno-mismatched-tags -Wno-missing-field-initializers -Wno-overloaded-virtual -Wno-unsupported-friend -Wno-unused-parameter -ffile-prefix-map=/home/kefu/dev/scylladb/build=. -march=x86-64-v3 -mpclmul -Xclang -fexperimental-assignment-tracking=disabled -Werror=unused-result -fstack-clash-protection -fsanitize=address -fsanitize=undefined -MD -MT CMakeFiles/scylla-main.dir/Debug/init.cc.o -MF CMakeFiles/scylla-main.dir/Debug/init.cc.o.d -o CMakeFiles/scylla-main.dir/Debug/init.cc.o -c /home/kefu/dev/scylladb/init.cc
In file included from /home/kefu/dev/scylladb/init.cc:12:
In file included from /home/kefu/dev/scylladb/db/config.hh:20:
In file included from /home/kefu/dev/scylladb/locator/abstract_replication_strategy.hh:26:
/home/kefu/dev/scylladb/locator/tablets.hh:410:30: error: unexpected type name 'size_t': expected expression
  410 |         return boost::irange<size_t>(0, tablet_count()) | boost::adaptors::transformed([] (size_t i) {
      |                              ^
/home/kefu/dev/scylladb/locator/tablets.hh:410:23: error: no member named 'irange' in namespace 'boost'
  410 |         return boost::irange<size_t>(0, tablet_count()) | boost::adaptors::transformed([] (size_t i) {
      |                ~~~~~~~^
/home/kefu/dev/scylladb/locator/tablets.hh:410:38: error: left operand of comma operator has no effect [-Werror,-Wunused-value]
  410 |         return boost::irange<size_t>(0, tablet_count()) | boost::adaptors::transformed([] (size_t i) {
      |                                      ^
3 errors generated.
[16/782] Building CXX object CMakeFiles/scylla-main.dir/Debug/keys.cc.o
[17/782] Building CXX object CMakeFiles/scylla-main.dir/Debug/counters.cc.o
[18/782] Building CXX object CMakeFiles/scylla-main.dir/Debug/partition_slice_builder.cc.o
[19/782] Building CXX object CMakeFiles/scylla-main.dir/Debug/mutation_query.cc.o
FAILED: CMakeFiles/scylla-main.dir/Debug/mutation_query.cc.o
/home/kefu/.local/bin/clang++ -DDEBUG -DDEBUG_LSA_SANITIZER -DFMT_SHARED -DSANITIZE -DSCYLLA_BUILD_MODE=debug -DSCYLLA_ENABLE_ERROR_INJECTION -DSEASTAR_API_LEVEL=7 -DSEASTAR_DEBUG -DSEASTAR_DEBUG_PROMISE -DSEASTAR_DEBUG_SHARED_PTR -DSEASTAR_DEFAULT_ALLOCATOR -DSEASTAR_LOGGER_COMPILE_TIME_FMT -DSEASTAR_LOGGER_TYPE_STDOUT -DSEASTAR_SCHEDULING_GROUPS_COUNT=16 -DSEASTAR_SHUFFLE_TASK_QUEUE -DSEASTAR_SSTRING -DSEASTAR_TYPE_ERASE_MORE -DXXH_PRIVATE_API -DCMAKE_INTDIR=\"Debug\" -I/home/kefu/dev/scylladb -I/home/kefu/dev/scylladb/build/gen -I/home/kefu/dev/scylladb/seastar/include -I/home/kefu/dev/scylladb/build/seastar/gen/include -I/home/kefu/dev/scylladb/build/seastar/gen/src -isystem /home/kefu/dev/scylladb/abseil -g -Og -g -gz -std=gnu++23 -fvisibility=hidden -Wall -Werror -Wextra -Wno-error=deprecated-declarations -Wimplicit-fallthrough -Wno-c++11-narrowing -Wno-deprecated-copy -Wno-mismatched-tags -Wno-missing-field-initializers -Wno-overloaded-virtual -Wno-unsupported-friend -Wno-unused-parameter -ffile-prefix-map=/home/kefu/dev/scylladb/build=. -march=x86-64-v3 -mpclmul -Xclang -fexperimental-assignment-tracking=disabled -Werror=unused-result -fstack-clash-protection -fsanitize=address -fsanitize=undefined -MD -MT CMakeFiles/scylla-main.dir/Debug/mutation_query.cc.o -MF CMakeFiles/scylla-main.dir/Debug/mutation_query.cc.o.d -o CMakeFiles/scylla-main.dir/Debug/mutation_query.cc.o -c /home/kefu/dev/scylladb/mutation_query.cc
In file included from /home/kefu/dev/scylladb/mutation_query.cc:12:
In file included from /home/kefu/dev/scylladb/schema/schema_registry.hh:17:
In file included from /home/kefu/dev/scylladb/replica/database.hh:11:
In file included from /home/kefu/dev/scylladb/locator/abstract_replication_strategy.hh:26:
/home/kefu/dev/scylladb/locator/tablets.hh:410:30: error: unexpected type name 'size_t': expected expression
  410 |         return boost::irange<size_t>(0, tablet_count()) | boost::adaptors::transformed([] (size_t i) {
      |                              ^
/home/kefu/dev/scylladb/locator/tablets.hh:410:23: error: no member named 'irange' in namespace 'boost'
  410 |         return boost::irange<size_t>(0, tablet_count()) | boost::adaptors::transformed([] (size_t i) {
      |                ~~~~~~~^
/home/kefu/dev/scylladb/locator/tablets.hh:410:38: error: left operand of comma operator has no effect [-Werror,-Wunused-value]
  410 |         return boost::irange<size_t>(0, tablet_count()) | boost::adaptors::transformed([] (size_t i) {
      |                                      ^
In file included from /home/kefu/dev/scylladb/mutation_query.cc:12:
In file included from /home/kefu/dev/scylladb/schema/schema_registry.hh:17:
In file included from /home/kefu/dev/scylladb/replica/database.hh:37:
In file included from /home/kefu/dev/scylladb/db/snapshot-ctl.hh:20:
/home/kefu/dev/scylladb/tasks/task_manager.hh:403:54: error: no member named 'irange' in namespace 'boost'
  403 |         co_await coroutine::parallel_for_each(boost::irange(0u, smp::count), [&tm, id, &res, &func] (unsigned shard) -> future<> {
      |                                               ~~~~~~~^
4 errors generated.
```

so let's take the opportunity to switch from `boost::irange` to
`std::views::iota`.

in this change, we:

- switch from boost::irange to std::views::iota for better standard library compatibility
- retain boost::irange where step parameter is used, as std::views::iota doesn't support it
- this change partially modernizes our range usage while maintaining
  existing functionality

Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>

Closes scylladb/scylladb#20924
2024-10-03 10:33:33 +03:00

331 lines
11 KiB
C++

/*
* Copyright (C) 2015-present ScyllaDB
*/
/*
* SPDX-License-Identifier: AGPL-3.0-or-later
*/
#pragma once
#include <ranges>
#include <seastar/core/print.hh>
#include <seastar/core/future-util.hh>
#include <seastar/core/distributed.hh>
#include <seastar/core/weak_ptr.hh>
#include <seastar/coroutine/as_future.hh>
#include "seastarx.hh"
#include "utils/extremum_tracking.hh"
#include "utils/estimated_histogram.hh"
#include <seastar/testing/linux_perf_event.hh>
#include <seastar/util/defer.hh>
#include "reader_permit.hh"
#include <chrono>
#include <iosfwd>
#include <boost/range/irange.hpp>
#include <vector>
// Measures how many times per second `func` can be called synchronously.
//
// Performs `iterations` measurement rounds of roughly one second each and
// prints the achieved calls-per-second ("tps") to std::cout after every
// round. The clock is consulted only once per
// `iterations_between_clock_readings` calls, so the cost of reading it is
// spread over a batch of invocations.
template <typename Func>
static
void time_it(Func func, int iterations = 5, int iterations_between_clock_readings = 1000) {
    using clk = std::chrono::steady_clock;
    for (int round = 0; round < iterations; ++round) {
        const auto start = clk::now();
        const auto end_at = start + std::chrono::seconds(1);
        uint64_t count = 0;
        while (clk::now() < end_at) {
            // Run a whole batch between two clock readings.
            for (int remaining = iterations_between_clock_readings; remaining > 0; --remaining) {
                func();
                ++count;
            }
        }
        // The last batch may overshoot end_at, so divide by the real elapsed time.
        const std::chrono::duration<double> elapsed = clk::now() - start;
        std::cout << format("{:.2f}", static_cast<double>(count) / elapsed.count()) << " tps\n";
    }
}
// Plain bundle of counters describing what one executor observed.
// Every counter starts at zero; member order is significant because the
// struct is filled with designated initializers elsewhere in this file.
struct executor_shard_stats {
    uint64_t invocations{0};
    uint64_t allocations{0};
    uint64_t log_allocations{0};
    uint64_t tasks_executed{0};
    uint64_t instructions_retired{0};
    uint64_t cpu_cycles_retired{0};
    uint64_t errors{0};
};

// Field-wise sum of two statistics bundles.
inline
executor_shard_stats
operator+(executor_shard_stats a, executor_shard_stats b) {
    executor_shard_stats sum;
    sum.invocations = a.invocations + b.invocations;
    sum.allocations = a.allocations + b.allocations;
    sum.log_allocations = a.log_allocations + b.log_allocations;
    sum.tasks_executed = a.tasks_executed + b.tasks_executed;
    sum.instructions_retired = a.instructions_retired + b.instructions_retired;
    sum.cpu_cycles_retired = a.cpu_cycles_retired + b.cpu_cycles_retired;
    sum.errors = a.errors + b.errors;
    return sum;
}

// Field-wise difference (a - b) of two statistics bundles. The counters are
// unsigned, so a field of `b` larger than the one in `a` wraps around.
inline
executor_shard_stats
operator-(executor_shard_stats a, executor_shard_stats b) {
    executor_shard_stats delta;
    delta.invocations = a.invocations - b.invocations;
    delta.allocations = a.allocations - b.allocations;
    delta.log_allocations = a.log_allocations - b.log_allocations;
    delta.tasks_executed = a.tasks_executed - b.tasks_executed;
    delta.instructions_retired = a.instructions_retired - b.instructions_retired;
    delta.cpu_cycles_retired = a.cpu_cycles_retired - b.cpu_cycles_retired;
    delta.errors = a.errors - b.errors;
    return delta;
}
// Counter readers that are only declared here; their definitions live
// outside this header. executor::executor_shard_stats_snapshot() stores
// their return values in the tasks_executed, allocations and
// log_allocations fields of executor_shard_stats, respectively.
// NOTE(review): whether the values are per-shard or process-wide cannot be
// told from this header - confirm against the definitions.
uint64_t perf_tasks_processed();
uint64_t perf_mallocs();
uint64_t perf_logallocs();
// Repeatedly invokes an asynchronous action from several concurrent workers
// until either an invocation budget or a deadline is reached, keeping track
// of how many invocations were started and how many of them failed.
template <typename Func>
class executor {
    const Func _func;
    const lowres_clock::time_point _end_at;
    // When non-zero, the run is bounded by this many invocations instead of by _end_at.
    const uint64_t _end_at_count;
    const unsigned _n_workers;
    const bool _stop_on_error;
    // Both counters are shared by all workers of this executor.
    uint64_t _count = 0;
    uint64_t _errors = 0;
    linux_perf_event _instructions_retired_counter = linux_perf_event::user_instructions_retired();
    linux_perf_event _cpu_cycles_retired_counter = linux_perf_event::user_cpu_cycles_retired();
private:
    // Captures the current values of all tracked counters.
    executor_shard_stats executor_shard_stats_snapshot();

    // One worker: keeps calling _func back-to-back while the stop condition
    // does not hold. _count is bumped before each call, so failed
    // invocations are counted as invocations too, and because _count is
    // shared, _end_at_count limits the total across all workers.
    future<> run_worker() {
        while (_end_at_count ? _count < _end_at_count : lowres_clock::now() < _end_at) {
            ++_count;
            future<> f = co_await coroutine::as_future(_func());
            if (!f.failed()) {
                continue;
            }
            ++_errors;
            if (_stop_on_error) [[unlikely]] {
                // Awaiting the failed future propagates its failure out of this worker.
                co_return co_await std::move(f);
            }
            // Errors are tolerated: drop the failure and keep going.
            f.ignore_ready_future();
        }
    }
public:
    executor(unsigned n_workers, Func func, lowres_clock::time_point end_at, uint64_t end_at_count = 0, bool stop_on_error = true)
        : _func(std::move(func))
        , _end_at(end_at)
        , _end_at_count(end_at_count)
        , _n_workers(n_workers)
        , _stop_on_error(stop_on_error)
    { }

    // Runs _n_workers workers concurrently and resolves, once all of them
    // have finished, to the difference between the counter snapshots taken
    // after and before the run. The perf-event counters are enabled only
    // for the duration of the run.
    future<executor_shard_stats> run() {
        auto before = executor_shard_stats_snapshot();
        _instructions_retired_counter.enable();
        _cpu_cycles_retired_counter.enable();
        auto worker_ids = std::views::iota(0, static_cast<int>(_n_workers));
        // The worker id itself is not needed; the range only fixes the worker count.
        return parallel_for_each(worker_ids.begin(), worker_ids.end(), [this] (int) {
            return run_worker();
        }).then([this, before] {
            _instructions_retired_counter.disable();
            _cpu_cycles_retired_counter.disable();
            return executor_shard_stats_snapshot() - before;
        });
    }

    // Nothing to tear down; resolves immediately.
    // NOTE(review): presumably present because distributed<> expects a
    // stop() member on the service type - confirm.
    future<> stop() {
        return make_ready_future<>();
    }
};
// Reads every tracked counter once and returns the values as a bundle.
// The counters are read in the declaration order of executor_shard_stats.
template <typename Func>
executor_shard_stats
executor<Func>::executor_shard_stats_snapshot() {
    executor_shard_stats snapshot;
    snapshot.invocations = _count;
    snapshot.allocations = perf_mallocs();
    snapshot.log_allocations = perf_logallocs();
    snapshot.tasks_executed = perf_tasks_processed();
    snapshot.instructions_retired = _instructions_retired_counter.read();
    snapshot.cpu_cycles_retired = _cpu_cycles_retired_counter.read();
    snapshot.errors = _errors;
    return snapshot;
}
// Outcome of a single measurement iteration, as filled in by
// time_parallel_ex(): the overall throughput plus several per-operation
// averages and the number of failed invocations.
//
// All members are zero-initialized so that a default-constructed result
// never carries indeterminate values. The type stays an aggregate, so brace
// and designated initialization keep working.
struct perf_result {
    double throughput = 0;          // invocations per second
    double mallocs_per_op = 0;      // allocations / invocations
    double logallocs_per_op = 0;    // log_allocations / invocations
    double tasks_per_op = 0;        // tasks_executed / invocations
    double instructions_per_op = 0; // instructions_retired / invocations
    double cpu_cycles_per_op = 0;   // cpu_cycles_retired / invocations
    uint64_t errors = 0;            // failed invocations
};
// Summary built from a collection of perf_result samples. Only the
// interface is declared here; the constructor and calculate_stats() are
// defined outside this header.
struct aggregated_perf_results {
// Descriptive statistics of one numeric metric.
struct stats_t {
double median;
double median_absolute_deviation;
double min;
double max;
double mean;
double stdev;
};
// Statistics keyed by a string.
// NOTE(review): presumably one entry per perf_result metric, keyed by the
// metric's name - confirm against the constructor's definition.
std::unordered_map<std::string, stats_t> stats;
perf_result median_by_throughput; // Simplification, median element is considered based on throughput value
// Builds the summary from `results`. The vector is taken by non-const
// reference, so the definition is allowed to modify it (e.g. reorder it)
// - NOTE(review): confirm whether it actually does.
aggregated_perf_results(std::vector<perf_result>& results);
private:
// Produces a stats_t for the value that `get_stat` extracts from a result.
stats_t calculate_stats(std::vector<perf_result>&, std::function<double(const perf_result&)> get_stat) const;
};
// Stream output for aggregated_perf_results; declared here, defined
// outside this header.
std::ostream& operator<<(std::ostream& os, const aggregated_perf_results& result);
// Mixin that extends a perf_result with AIO write statistics (see
// perf_result_with_aio_writes). For the extra fields to be filled in,
// aio_writes_result_mixin::update must be passed as the update function
// (the `uf` argument) of time_parallel_ex.
struct aio_writes_result_mixin {
double aio_writes;
double aio_write_bytes;
// Defined outside this header.
// NOTE(review): presumably initializes the two fields - confirm.
aio_writes_result_mixin();
// Update function with the signature time_parallel_ex expects for `uf`:
// receives the result being built and the stats of the finished iteration.
static void update(aio_writes_result_mixin& result, const executor_shard_stats& stats);
};
// perf_result extended with the fields of aio_writes_result_mixin.
struct perf_result_with_aio_writes : public perf_result, public aio_writes_result_mixin {};
// fmt support for perf_result_with_aio_writes. The specialization derives
// from the string_view formatter; format() is only declared here and is
// defined outside this header.
template <> struct fmt::formatter<perf_result_with_aio_writes> : fmt::formatter<string_view> {
auto format(const perf_result_with_aio_writes&, fmt::format_context& ctx) const -> decltype(ctx.out());
};
/**
 * Measures the throughput of an asynchronous action by running it through a
 * distributed<executor<Func>>, with `concurrency_per_core` concurrent
 * workers in each executor instance.
 *
 * By default performs `iterations` rounds, each bounded by a one second
 * deadline. When `operations_per_shard` is non-zero, a single round is run
 * instead and every executor stops after that many invocations.
 *
 * After each round the per-operation metrics are derived from the summed
 * executor statistics, `uf(result, stats)` is called so that the caller can
 * fill in additional fields of `Res`, and the result is printed.
 *
 * Blocks on futures with get().
 *
 * Returns one `Res` per round, in execution order.
 */
template <typename Res, typename Func, typename UpdateFunc = void(*)(const Res&, const executor_shard_stats&)>
requires (std::is_base_of_v<perf_result, Res> && std::is_invocable_v<UpdateFunc, Res&, const executor_shard_stats&>)
static
std::vector<Res> time_parallel_ex(Func func, unsigned concurrency_per_core, int iterations = 5, unsigned operations_per_shard = 0, bool stop_on_error = true, UpdateFunc uf = [](const auto&, const auto&) {}) {
    using clk = std::chrono::steady_clock;
    // A fixed operation budget is measured in exactly one round.
    const int rounds = operations_per_shard ? 1 : iterations;
    std::vector<Res> results;
    for (int round = 0; round < rounds; ++round) {
        auto started = clk::now();
        auto deadline = lowres_clock::now() + std::chrono::seconds(1);
        distributed<executor<Func>> exec;
        Res result;
        exec.start(concurrency_per_core, func, std::move(deadline), operations_per_shard, stop_on_error).get();
        // Make sure the service is stopped on every exit path from this round.
        auto stop_exec = defer([&exec] {
            exec.stop().get();
        });
        auto stats = exec.map_reduce0(std::mem_fn(&executor<Func>::run),
                executor_shard_stats(), std::plus<executor_shard_stats>()).get();
        const std::chrono::duration<double> elapsed = clk::now() - started;
        const auto invocations = static_cast<double>(stats.invocations);

        result.throughput = invocations / elapsed.count();
        result.mallocs_per_op = static_cast<double>(stats.allocations) / invocations;
        result.logallocs_per_op = static_cast<double>(stats.log_allocations) / invocations;
        result.tasks_per_op = static_cast<double>(stats.tasks_executed) / invocations;
        result.instructions_per_op = static_cast<double>(stats.instructions_retired) / invocations;
        result.cpu_cycles_per_op = static_cast<double>(stats.cpu_cycles_retired) / invocations;
        result.errors = stats.errors;

        uf(result, stats);
        fmt::print("{}\n", result);
        results.emplace_back(result);
    }
    return results;
}
// Convenience form of time_parallel_ex() that produces plain perf_result
// entries and uses the default (no-op) update function.
template <typename Func>
static
std::vector<perf_result> time_parallel(Func func, unsigned concurrency_per_core, int iterations = 5, unsigned operations_per_shard = 0, bool stop_on_error = true) {
    return time_parallel_ex<perf_result>(
            std::move(func),
            concurrency_per_core,
            iterations,
            operations_per_shard,
            stop_on_error);
}
// Calls `f` once and returns how long the call took on the steady clock,
// expressed as a std::chrono::duration<float> (seconds).
template<typename Func>
auto duration_in_seconds(Func&& f) {
    using clock = std::chrono::steady_clock;
    using float_seconds = std::chrono::duration<float>;

    const auto started = clock::now();
    f();
    const auto elapsed = clock::now() - started;
    return std::chrono::duration_cast<float_seconds>(elapsed);
}
// Records the time elapsed between consecutive ticks: each interval is fed
// into a histogram and into a min/max tracker. Ticks keep being scheduled
// through schedule_tick() (defined outside this header) until stop() is
// called.
class scheduling_latency_measurer : public weakly_referencable<scheduling_latency_measurer> {
    using clk = std::chrono::steady_clock;
    // Time of the most recent tick (initially the construction time).
    clk::time_point _last = clk::now();
    utils::estimated_histogram _hist{300};
    min_max_tracker<clk::duration> _minmax;
    bool _stop = false;
private:
    void schedule_tick();

    // Accounts for the interval since the previous tick and, unless the
    // measurer was stopped, arranges for the next tick.
    void tick() {
        const auto now = clk::now();
        const auto latency = now - _last;
        _last = now;
        _minmax.update(latency);
        _hist.add(latency.count());
        if (_stop) {
            return;
        }
        schedule_tick();
    }
public:
    // Schedules the first tick.
    void start() {
        schedule_tick();
    }

    // Requests that no further ticks be scheduled, then waits on a yield.
    void stop() {
        _stop = true;
        yield().get(); // so that the last scheduled tick is counted
    }

    // Histogram of the recorded intervals, in clk::duration ticks.
    const utils::estimated_histogram& histogram() const {
        return _hist;
    }

    clk::duration min() const { return _minmax.min(); }
    clk::duration max() const { return _minmax.max(); }
};
// Stream output for scheduling_latency_measurer; declared here, defined
// outside this header.
std::ostream& operator<<(std::ostream& out, const scheduling_latency_measurer& slm);
namespace perf {
// Owns a reader_concurrency_semaphore through a unique_ptr.
// Closes the semaphore in the background when destroyed
// All member functions are defined outside this header.
class reader_concurrency_semaphore_wrapper {
std::unique_ptr<reader_concurrency_semaphore> _semaphore;
public:
// NOTE(review): `name` is presumably used to name the owned semaphore -
// confirm against the definition.
explicit reader_concurrency_semaphore_wrapper(sstring name);
~reader_concurrency_semaphore_wrapper();
// Returns a reader_permit.
// NOTE(review): presumably obtained from the owned semaphore - confirm.
reader_permit make_permit();
};
} // namespace perf
// fmt support for scheduling_latency_measurer. The specialization derives
// from the string_view formatter; format() is only declared here and is
// defined outside this header.
template <> struct fmt::formatter<scheduling_latency_measurer> : fmt::formatter<string_view> {
auto format(const scheduling_latency_measurer&, fmt::format_context& ctx) const -> decltype(ctx.out());
};
// fmt support for perf_result (time_parallel_ex prints each result with
// fmt::print). The specialization derives from the string_view formatter;
// format() is only declared here and is defined outside this header.
template <> struct fmt::formatter<perf_result> : fmt::formatter<string_view> {
auto format(const perf_result&, fmt::format_context& ctx) const -> decltype(ctx.out());
};