Merge "Adding sampled histogram" from Amnon
"Histograms are used to collect latency information. In Origin, many of the operations are timed, which is a potential performance issue. This series adds an option to sample the operations, where a small number will be timed and most will only be counted. This gives an estimation of the statistics, while keeping an accurate count of the total events and having negligible performance impact. The first to use the modified histogram are the column families, for their read and write latencies." Conflicts: database.hh
This commit is contained in:
@@ -958,6 +958,49 @@
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"path":"/column_family/metrics/read_latency/histogram/{name}",
|
||||
"operations":[
|
||||
{
|
||||
"method":"GET",
|
||||
"summary":"Get read latency histogram",
|
||||
"$ref": "#/utils/histogram",
|
||||
"nickname":"get_read_latency_histogram",
|
||||
"produces":[
|
||||
"application/json"
|
||||
],
|
||||
"parameters":[
|
||||
{
|
||||
"name":"name",
|
||||
"description":"The column family name in keyspace:name format",
|
||||
"required":true,
|
||||
"allowMultiple":false,
|
||||
"type":"string",
|
||||
"paramType":"path"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"path":"/column_family/metrics/read_latency/histogram/",
|
||||
"operations":[
|
||||
{
|
||||
"method":"GET",
|
||||
"summary":"Get read latency histogram from all column families",
|
||||
"type":"array",
|
||||
"items":{
|
||||
"$ref": "#/utils/histogram"
|
||||
},
|
||||
"nickname":"get_all_read_latency_histogram",
|
||||
"produces":[
|
||||
"application/json"
|
||||
],
|
||||
"parameters":[
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"path":"/column_family/metrics/read_latency",
|
||||
"operations":[
|
||||
@@ -1081,6 +1124,49 @@
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"path":"/column_family/metrics/write_latency/histogram/{name}",
|
||||
"operations":[
|
||||
{
|
||||
"method":"GET",
|
||||
"summary":"Get write latency histogram",
|
||||
"$ref": "#/utils/histogram",
|
||||
"nickname":"get_write_latency_histogram",
|
||||
"produces":[
|
||||
"application/json"
|
||||
],
|
||||
"parameters":[
|
||||
{
|
||||
"name":"name",
|
||||
"description":"The column family name in keyspace:name format",
|
||||
"required":true,
|
||||
"allowMultiple":false,
|
||||
"type":"string",
|
||||
"paramType":"path"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"path":"/column_family/metrics/write_latency/histogram/",
|
||||
"operations":[
|
||||
{
|
||||
"method":"GET",
|
||||
"summary":"Get write latency histogram of all column families",
|
||||
"type":"array",
|
||||
"items":{
|
||||
"$ref": "#/utils/histogram"
|
||||
},
|
||||
"nickname":"get_all_write_latency_histogram",
|
||||
"produces":[
|
||||
"application/json"
|
||||
],
|
||||
"parameters":[
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"path":"/column_family/metrics/write_latency",
|
||||
"operations":[
|
||||
|
||||
@@ -111,13 +111,13 @@ inline double pow2(double a) {
|
||||
|
||||
inline httpd::utils_json::histogram add_histogram(httpd::utils_json::histogram res,
|
||||
const utils::ihistogram& val) {
|
||||
if (val.count == 0) {
|
||||
return res;
|
||||
}
|
||||
if (!res.count._set) {
|
||||
res = val;
|
||||
return res;
|
||||
}
|
||||
if (val.count == 0) {
|
||||
return res;
|
||||
}
|
||||
if (res.min() > val.min) {
|
||||
res.min = val.min;
|
||||
}
|
||||
|
||||
@@ -81,15 +81,40 @@ static future<json::json_return_type> get_cf_stats(http_context& ctx,
|
||||
}, std::plus<int64_t>());
|
||||
}
|
||||
|
||||
static future<json::json_return_type> map_cf_stats(http_context& ctx,
|
||||
int64_t column_family::stats::*f) {
|
||||
return ctx.db.map([f](const database& db) {
|
||||
int64_t res = 0;
|
||||
static future<json::json_return_type> get_cf_stats_sum(http_context& ctx, const sstring& name,
|
||||
utils::ihistogram column_family::stats::*f) {
|
||||
return map_reduce_cf(ctx, name, 0, [f](const column_family& cf) {
|
||||
return (cf.get_stats().*f).count;
|
||||
}, std::plus<int64_t>());
|
||||
}
|
||||
|
||||
static future<json::json_return_type> get_cf_stats_sum(http_context& ctx,
|
||||
utils::ihistogram column_family::stats::*f) {
|
||||
return map_reduce_cf(ctx, 0, [f](const column_family& cf) {
|
||||
return (cf.get_stats().*f).count;
|
||||
}, std::plus<int64_t>());
|
||||
}
|
||||
|
||||
static future<json::json_return_type> get_cf_histogram(http_context& ctx, const sstring& name,
|
||||
utils::ihistogram column_family::stats::*f) {
|
||||
utils::UUID uuid = get_uuid(name, ctx.db.local());
|
||||
return ctx.db.map_reduce0([f, uuid](const database& p) {return p.find_column_family(uuid).get_stats().*f;},
|
||||
httpd::utils_json::histogram(),
|
||||
add_histogram)
|
||||
.then([](const httpd::utils_json::histogram& val) {
|
||||
return make_ready_future<json::json_return_type>(val);
|
||||
});
|
||||
}
|
||||
|
||||
static future<json::json_return_type> get_cf_histogram(http_context& ctx, utils::ihistogram column_family::stats::*f) {
|
||||
std::function<httpd::utils_json::histogram(const database&)> fun = [f] (const database& db) {
|
||||
httpd::utils_json::histogram res;
|
||||
for (auto i : db.get_column_families()) {
|
||||
res += i.second.get()->get_stats().*f;
|
||||
res = add_histogram(res, i.second->get_stats().*f);
|
||||
}
|
||||
return res;
|
||||
}).then([](const std::vector<int64_t>& res) {
|
||||
};
|
||||
return ctx.db.map(fun).then([](const std::vector<httpd::utils_json::histogram> &res) {
|
||||
return make_ready_future<json::json_return_type>(res);
|
||||
});
|
||||
}
|
||||
@@ -243,19 +268,35 @@ void set_column_family(http_context& ctx, routes& r) {
|
||||
});
|
||||
|
||||
cf::get_read.set(r, [&ctx] (std::unique_ptr<request> req) {
|
||||
return get_cf_stats(ctx,req->param["name"] ,&column_family::stats::reads);
|
||||
return get_cf_stats_sum(ctx,req->param["name"] ,&column_family::stats::reads);
|
||||
});
|
||||
|
||||
cf::get_all_read.set(r, [&ctx] (std::unique_ptr<request> req) {
|
||||
return map_cf_stats(ctx, &column_family::stats::reads);
|
||||
return get_cf_stats_sum(ctx, &column_family::stats::reads);
|
||||
});
|
||||
|
||||
cf::get_write.set(r, [&ctx] (std::unique_ptr<request> req) {
|
||||
return get_cf_stats(ctx,req->param["name"] ,&column_family::stats::writes);
|
||||
return get_cf_stats_sum(ctx, req->param["name"] ,&column_family::stats::writes);
|
||||
});
|
||||
|
||||
cf::get_all_write.set(r, [&ctx] (std::unique_ptr<request> req) {
|
||||
return map_cf_stats(ctx, &column_family::stats::writes);
|
||||
return get_cf_stats_sum(ctx, &column_family::stats::writes);
|
||||
});
|
||||
|
||||
cf::get_read_latency_histogram.set(r, [&ctx] (std::unique_ptr<request> req) {
|
||||
return get_cf_histogram(ctx, req->param["name"], &column_family::stats::reads);
|
||||
});
|
||||
|
||||
cf::get_all_read_latency_histogram.set(r, [&ctx] (std::unique_ptr<request> req) {
|
||||
return get_cf_histogram(ctx, &column_family::stats::writes);
|
||||
});
|
||||
|
||||
cf::get_write_latency_histogram.set(r, [&ctx] (std::unique_ptr<request> req) {
|
||||
return get_cf_histogram(ctx, req->param["name"], &column_family::stats::reads);
|
||||
});
|
||||
|
||||
cf::get_all_write_latency_histogram.set(r, [&ctx] (std::unique_ptr<request> req) {
|
||||
return get_cf_histogram(ctx, &column_family::stats::writes);
|
||||
});
|
||||
|
||||
cf::get_pending_compactions.set(r, [&ctx] (std::unique_ptr<request> req) {
|
||||
@@ -518,5 +559,6 @@ void set_column_family(http_context& ctx, routes& r) {
|
||||
std::vector<double> res;
|
||||
return make_ready_future<json::json_return_type>(res);
|
||||
});
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
14
database.hh
14
database.hh
@@ -47,6 +47,8 @@
|
||||
#include "row_cache.hh"
|
||||
#include "compaction_strategy.hh"
|
||||
#include "utils/compaction_manager.hh"
|
||||
#include "utils/exponential_backoff_retry.hh"
|
||||
#include "utils/histogram.hh"
|
||||
|
||||
class frozen_mutation;
|
||||
class reconcilable_result;
|
||||
@@ -93,13 +95,13 @@ public:
|
||||
int64_t memtable_switch_count = 0;
|
||||
/** Estimated number of tasks pending for this column family */
|
||||
int64_t pending_flushes = 0;
|
||||
int64_t reads = 0;
|
||||
int64_t writes = 0;
|
||||
int64_t live_disk_space_used = 0;
|
||||
int64_t total_disk_space_used = 0;
|
||||
int64_t live_sstable_count = 0;
|
||||
/** Estimated number of compactions pending for this column family */
|
||||
int64_t pending_compactions = 0;
|
||||
utils::ihistogram reads{256, 100};
|
||||
utils::ihistogram writes{256, 100};
|
||||
};
|
||||
|
||||
private:
|
||||
@@ -456,9 +458,11 @@ class secondary_index_manager {};
|
||||
inline
|
||||
void
|
||||
column_family::apply(const mutation& m, const db::replay_position& rp) {
|
||||
utils::latency_counter lc;
|
||||
_stats.writes.set_latency(lc);
|
||||
active_memtable().apply(m, rp);
|
||||
_stats.writes++;
|
||||
seal_on_overflow();
|
||||
_stats.writes.mark(lc);
|
||||
}
|
||||
|
||||
inline
|
||||
@@ -482,10 +486,12 @@ column_family::check_valid_rp(const db::replay_position& rp) const {
|
||||
inline
|
||||
void
|
||||
column_family::apply(const frozen_mutation& m, const db::replay_position& rp) {
|
||||
utils::latency_counter lc;
|
||||
_stats.writes.set_latency(lc);
|
||||
check_valid_rp(rp);
|
||||
active_memtable().apply(m, rp);
|
||||
_stats.writes++;
|
||||
seal_on_overflow();
|
||||
_stats.writes.mark(lc);
|
||||
}
|
||||
|
||||
future<> update_schema_version_and_announce(service::storage_proxy& proxy);
|
||||
|
||||
@@ -5,46 +5,86 @@
|
||||
#pragma once
|
||||
|
||||
#include <boost/circular_buffer.hpp>
|
||||
#include "latency.hh"
|
||||
|
||||
namespace utils {
|
||||
|
||||
template<typename T>
|
||||
class histogram {
|
||||
class ihistogram {
|
||||
public:
|
||||
// count holds all the events
|
||||
int64_t count;
|
||||
T min;
|
||||
T max;
|
||||
T sum;
|
||||
// total holds only the events we sample
|
||||
int64_t total;
|
||||
int64_t min;
|
||||
int64_t max;
|
||||
int64_t sum;
|
||||
double mean;
|
||||
double variance;
|
||||
boost::circular_buffer<T> sample;
|
||||
histogram(size_t size = 1024)
|
||||
: count(0), min(0), max(0), sum(0), mean(0), variance(0), sample(
|
||||
int64_t sample_mask;
|
||||
boost::circular_buffer<int64_t> sample;
|
||||
ihistogram(size_t size = 1024, int64_t _sample_mask = 0x80)
|
||||
: count(0), total(0), min(0), max(0), sum(0), mean(0), variance(0),
|
||||
sample_mask(_sample_mask), sample(
|
||||
size) {
|
||||
}
|
||||
void mark(T value) {
|
||||
if (count == 0 || value < min) {
|
||||
void mark(int64_t value) {
|
||||
if (total == 0 || value < min) {
|
||||
min = value;
|
||||
}
|
||||
if (count == 0 || value > max) {
|
||||
if (total == 0 || value > max) {
|
||||
max = value;
|
||||
}
|
||||
if (count == 0) {
|
||||
if (total == 0) {
|
||||
mean = value;
|
||||
variance = 0;
|
||||
} else {
|
||||
double old_m = mean;
|
||||
double old_s = variance;
|
||||
|
||||
mean = old_m + ((value - old_m) / (count + 1));
|
||||
mean = old_m + ((value - old_m) / (total + 1));
|
||||
variance = old_s + ((value - old_m) * (value - mean));
|
||||
}
|
||||
sum += value;
|
||||
total++;
|
||||
count++;
|
||||
sample.push_back(value);
|
||||
}
|
||||
|
||||
void mark(latency_counter& lc) {
|
||||
if (lc.is_start()) {
|
||||
mark(lc.stop().latency_in_nano());
|
||||
} else {
|
||||
count++;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Return true if the current event should be sampled.
|
||||
* In the typical case, there is no need to use this method
|
||||
* Call set_latency, which will start a latency object if needed.
|
||||
*/
|
||||
bool should_sample() const {
|
||||
return total & sample_mask;
|
||||
}
|
||||
/**
|
||||
* Set the latency according to the sample rate.
|
||||
*/
|
||||
ihistogram& set_latency(latency_counter& lc) {
|
||||
if (should_sample()) {
|
||||
lc.start();
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Allow to use the histogram as a counter
|
||||
* Increment the total number of events without
|
||||
* sampling the value.
|
||||
*/
|
||||
ihistogram& inc() {
|
||||
count++;
|
||||
return *this;
|
||||
}
|
||||
};
|
||||
|
||||
using ihistogram = histogram<int64_t>;
|
||||
|
||||
}
|
||||
|
||||
@@ -22,6 +22,10 @@ public:
|
||||
_start = now();
|
||||
}
|
||||
|
||||
bool is_start() const {
|
||||
// if start is not set it is still zero
|
||||
return _start.time_since_epoch().count();
|
||||
}
|
||||
latency_counter& stop() {
|
||||
_stop = now();
|
||||
return *this;
|
||||
|
||||
Reference in New Issue
Block a user