test: perf: report instructions retired per operation

Instructions retired per op is a much more stable metric than time per op
(inverse throughput) since it isn't much affected by changes in
CPU frequency or other load on the test system (it's still somewhat
affected since a slower system will run more reactor polls per op).
It's also less indicative of real performance, since it's possible for
fewer instructions to execute in more time than more instructions,
but that isn't an issue for comparative tests.

This allows incremental changes to the code base to be compared with
more confidence.
This commit is contained in:
Avi Kivity
2021-04-28 18:46:55 +03:00
parent 0bc98caf3e
commit 863b49af03
2 changed files with 12 additions and 2 deletions

View File

@@ -61,7 +61,7 @@ std::ostream& operator<<(std::ostream& out, const scheduling_latency_measurer& s
std::ostream&
operator<<(std::ostream& os, const perf_result& result) {
    // Render one summary line per result. The diff view had left both the
    // old print (without insns/op) and the new one in the body, which would
    // emit the stats twice; only the post-change line is kept.
    // insns/op uses {:7.0f}: instruction counts are large integers, so a
    // wide, zero-decimal field keeps columns aligned across runs.
    fmt::print(os, "{:.2f} tps ({:5.1f} allocs/op, {:5.1f} tasks/op, {:7.0f} insns/op)",
               result.throughput, result.mallocs_per_op, result.tasks_per_op,
               result.instructions_per_op);
    return os;
}

View File

@@ -28,6 +28,7 @@
#include "seastarx.hh"
#include "utils/extremum_tracking.hh"
#include "utils/estimated_histogram.hh"
#include "linux-perf-event.hh"
#include <chrono>
#include <iosfwd>
@@ -60,6 +61,7 @@ struct executor_shard_stats {
uint64_t invocations = 0;
uint64_t allocations = 0;
uint64_t tasks_executed = 0;
uint64_t instructions_retired = 0;
};
inline
@@ -68,6 +70,7 @@ operator+(executor_shard_stats a, executor_shard_stats b) {
a.invocations += b.invocations;
a.allocations += b.allocations;
a.tasks_executed += b.tasks_executed;
a.instructions_retired += b.instructions_retired;
return a;
}
@@ -77,6 +80,7 @@ operator-(executor_shard_stats a, executor_shard_stats b) {
a.invocations -= b.invocations;
a.allocations -= b.allocations;
a.tasks_executed -= b.tasks_executed;
a.instructions_retired -= b.instructions_retired;
return a;
}
@@ -93,10 +97,12 @@ class executor {
const uint64_t _end_at_count;
const unsigned _n_workers;
uint64_t _count;
linux_perf_event _instructions_retired_counter = linux_perf_event::user_instructions_retired();
private:
executor_shard_stats executor_shard_stats_snapshot();
future<> run_worker() {
auto stats_begin = executor_shard_stats_snapshot();
_instructions_retired_counter.enable();
return do_until([this] {
return _end_at_count ? _count == _end_at_count : lowres_clock::now() >= _end_at;
}, [this] () mutable {
@@ -120,6 +126,7 @@ public:
return parallel_for_each(idx.begin(), idx.end(), [this] (auto idx) mutable {
return this->run_worker();
}).then([this, stats_start] {
_instructions_retired_counter.disable();
auto stats_end = executor_shard_stats_snapshot();
stats_end.invocations = _count;
return stats_end - stats_start;
@@ -137,6 +144,7 @@ executor<Func>::executor_shard_stats_snapshot() {
return executor_shard_stats{
.allocations = perf_mallocs(),
.tasks_executed = perf_tasks_processed(),
.instructions_retired = _instructions_retired_counter.read(),
};
}
@@ -144,6 +152,7 @@ struct perf_result {
double throughput;
double mallocs_per_op;
double tasks_per_op;
double instructions_per_op;
};
std::ostream& operator<<(std::ostream& os, const perf_result& result);
@@ -177,6 +186,7 @@ std::vector<perf_result> time_parallel(Func func, unsigned concurrency_per_core,
.throughput = static_cast<double>(stats.invocations) / duration,
.mallocs_per_op = double(stats.allocations) / stats.invocations,
.tasks_per_op = double(stats.tasks_executed) / stats.invocations,
.instructions_per_op = double(stats.instructions_retired) / stats.invocations,
};
std::cout << result << "\n";
results.emplace_back(result);