From af2ec7c7e88379f6004c926192ac1c1b4b8da696 Mon Sep 17 00:00:00 2001 From: Amnon Heiman Date: Wed, 5 Aug 2015 17:38:07 +0300 Subject: [PATCH 1/6] Utils add an is start method to latency_counter When doing a spares latency check, it is required to know if a latency object was started. This returns true if the start timer was set. Signed-off-by: Amnon Heiman --- utils/latency.hh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/utils/latency.hh b/utils/latency.hh index 35c34f94fb..474ccae2d2 100644 --- a/utils/latency.hh +++ b/utils/latency.hh @@ -22,6 +22,10 @@ public: _start = now(); } + bool is_start() const { + // if start is not set it is still zero + return _start.time_since_epoch().count(); + } latency_counter& stop() { _stop = now(); return *this; From bd9a758b80290a27d2630fa9d2309c2dbaf0e6ed Mon Sep 17 00:00:00 2001 From: Amnon Heiman Date: Wed, 5 Aug 2015 17:40:27 +0300 Subject: [PATCH 2/6] Utils: Support sample based histogram The histogrm object is used both as a general counter for the number of events and for statistics and sampling. This chanage the histogram implementation, so it would support spares sampling while keeping the total number of event accurate. The implementation includes the following: Remove the template nature of the histogram, as it is used only for timer and use the name ihistogram instead. If in the future we'll need a histogram for other types, we can use the histogrma name for it. a total counter was added that count the number of events that are part of the statistic calculation. A helper methods where added to the ihistogram to handle the latency counter object. According to the sample mask it would mark the latency object as start if the counter and the mask are non zero and it would accept the latency object in its mark method, in which if the latency was not start, it will not be added and only the 'count' counter that counts the total number of events will be incremented. This should reduce the impact of latency calculation to a neglectable effect. Signed-off-by: Amnon Heiman --- utils/histogram.hh | 70 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 55 insertions(+), 15 deletions(-) diff --git a/utils/histogram.hh b/utils/histogram.hh index 9e77f14f83..de97141f21 100644 --- a/utils/histogram.hh +++ b/utils/histogram.hh @@ -5,46 +5,86 @@ #pragma once #include +#include "latency.hh" namespace utils { -template -class histogram { +class ihistogram { public: + // count holds all the events int64_t count; - T min; - T max; - T sum; + // total holds only the events we sample + int64_t total; + int64_t min; + int64_t max; + int64_t sum; double mean; double variance; - boost::circular_buffer sample; - histogram(size_t size = 1024) - : count(0), min(0), max(0), sum(0), mean(0), variance(0), sample( + int64_t sample_mask; + boost::circular_buffer sample; + ihistogram(size_t size = 1024, int64_t _sample_mask = 0x80) + : count(0), total(0), min(0), max(0), sum(0), mean(0), variance(0), + sample_mask(_sample_mask), sample( size) { } - void mark(T value) { - if (count == 0 || value < min) { + void mark(int64_t value) { + if (total == 0 || value < min) { min = value; } - if (count == 0 || value > max) { + if (total == 0 || value > max) { max = value; } - if (count == 0) { + if (total == 0) { mean = value; variance = 0; } else { double old_m = mean; double old_s = variance; - mean = old_m + ((value - old_m) / (count + 1)); + mean = old_m + ((value - old_m) / (total + 1)); variance = old_s + ((value - old_m) * (value - mean)); } sum += value; + total++; count++; sample.push_back(value); } + + void mark(latency_counter& lc) { + if (lc.is_start()) { + mark(lc.stop().latency_in_nano()); + } else { + count++; + } + } + + /** + * Return true if the current event should be sample. + * In the typical case, there is no need to use this method + * Call set_latency, that would start a latency object if needed. + */ + bool should_sample() const { + return total & sample_mask; + } + /** + * Set the latency according to the sample rate. + */ + ihistogram& set_latency(latency_counter& lc) { + if (should_sample()) { + lc.start(); + } + return *this; + } + + /** + * Allow to use the histogram as a counter + * Increment the total number of events without + * sampling the value. + */ + ihistogram& inc() { + count++; + return *this; + } }; -using ihistogram = histogram; - } From 3ef36681cc3a324aed6f2b78fa3da4e3624f43ff Mon Sep 17 00:00:00 2001 From: Amnon Heiman Date: Sun, 26 Jul 2015 09:14:31 +0300 Subject: [PATCH 3/6] API: Adding read, write latency histogram to column_family This adds the latency histogram to the column_family swagger definitions. The definitions are based on the ColumnFamilyMetrics. It adds the following commands: get_read_latency_histogram get_all_read_latency_histogram get_write_latency_histogram get_all_write_latency_histogram Signed-off-by: Amnon Heiman --- api/api-doc/column_family.json | 86 ++++++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) diff --git a/api/api-doc/column_family.json b/api/api-doc/column_family.json index 2e4df149ff..a5163ad804 100644 --- a/api/api-doc/column_family.json +++ b/api/api-doc/column_family.json @@ -958,6 +958,49 @@ } ] }, + { + "path":"/column_family/metrics/read_latency/histogram/{name}", + "operations":[ + { + "method":"GET", + "summary":"Get read latency histogram", + "$ref": "#/utils/histogram", + "nickname":"get_read_latency_histogram", + "produces":[ + "application/json" + ], + "parameters":[ + { + "name":"name", + "description":"The column family name in keysspace:name format", + "required":true, + "allowMultiple":false, + "type":"string", + "paramType":"path" + } + ] + } + ] + }, + { + "path":"/column_family/metrics/read_latency/histogram/", + "operations":[ + { + "method":"GET", + "summary":"Get read latency histogram from all column family", + "type":"array", + "items":{ + "$ref": "#/utils/histogram" + }, + "nickname":"get_all_read_latency_histogram", + "produces":[ + "application/json" + ], + "parameters":[ + ] + } + ] + }, { "path":"/column_family/metrics/read_latency", "operations":[ @@ -1081,6 +1124,49 @@ } ] }, + { + "path":"/column_family/metrics/write_latency/histogram/{name}", + "operations":[ + { + "method":"GET", + "summary":"Get write latency histogram", + "$ref": "#/utils/histogram", + "nickname":"get_write_latency_histogram", + "produces":[ + "application/json" + ], + "parameters":[ + { + "name":"name", + "description":"The column family name in keysspace:name format", + "required":true, + "allowMultiple":false, + "type":"string", + "paramType":"path" + } + ] + } + ] + }, + { + "path":"/column_family/metrics/write_latency/histogram/", + "operations":[ + { + "method":"GET", + "summary":"Get write latency histogram of all column family", + "type":"array", + "items":{ + "$ref": "#/utils/histogram" + }, + "nickname":"get_all_write_latency_histogram", + "produces":[ + "application/json" + ], + "parameters":[ + ] + } + ] + }, { "path":"/column_family/metrics/write_latency", "operations":[ From 4329377556babf2bee168279a44c3100294ea4d4 Mon Sep 17 00:00:00 2001 From: Amnon Heiman Date: Wed, 5 Aug 2015 17:48:39 +0300 Subject: [PATCH 4/6] column_family to use histogram for read and write latency With the use of sparse histogram, the read and write counters in the column_family stats can be used. The total impact on performanc should be neglectible. Signed-off-by: Amnon Heiman --- database.hh | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/database.hh b/database.hh index d89e105937..679a2a9d6a 100644 --- a/database.hh +++ b/database.hh @@ -47,6 +47,7 @@ #include "row_cache.hh" #include "compaction_strategy.hh" #include "utils/exponential_backoff_retry.hh" +#include "utils/histogram.hh" class frozen_mutation; class reconcilable_result; @@ -93,13 +94,13 @@ public: int64_t memtable_switch_count = 0; /** Estimated number of tasks pending for this column family */ int64_t pending_flushes = 0; - int64_t reads = 0; - int64_t writes = 0; int64_t live_disk_space_used = 0; int64_t total_disk_space_used = 0; int64_t live_sstable_count = 0; /** Estimated number of compactions pending for this column family */ int64_t pending_compactions = 0; + utils::ihistogram reads{256, 100}; + utils::ihistogram writes{256, 100}; }; private: @@ -456,9 +457,11 @@ class secondary_index_manager {}; inline void column_family::apply(const mutation& m, const db::replay_position& rp) { + utils::latency_counter lc; + _stats.writes.set_latency(lc); active_memtable().apply(m, rp); - _stats.writes++; seal_on_overflow(); + _stats.writes.mark(lc); } inline @@ -482,10 +485,12 @@ column_family::check_valid_rp(const db::replay_position& rp) const { inline void column_family::apply(const frozen_mutation& m, const db::replay_position& rp) { + utils::latency_counter lc; + _stats.writes.set_latency(lc); check_valid_rp(rp); active_memtable().apply(m, rp); - _stats.writes++; seal_on_overflow(); + _stats.writes.mark(lc); } future<> update_schema_version_and_announce(service::storage_proxy& proxy); From 17ebebf268fc2d8849344ef6d5ff4772f3eea796 Mon Sep 17 00:00:00 2001 From: Amnon Heiman Date: Thu, 6 Aug 2015 12:53:41 +0300 Subject: [PATCH 5/6] API: When combining histogram, return zeroed histogram on empty This change make sure that when there are no results (ie. all the histogram that are summed are empty) the return result will be a zerroed histogram and not an empty object. Signed-off-by: Amnon Heiman --- api/api.hh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/api/api.hh b/api/api.hh index 65efc9d3b6..7f85da252e 100644 --- a/api/api.hh +++ b/api/api.hh @@ -111,13 +111,13 @@ inline double pow2(double a) { inline httpd::utils_json::histogram add_histogram(httpd::utils_json::histogram res, const utils::ihistogram& val) { - if (val.count == 0) { - return res; - } if (!res.count._set) { res = val; return res; } + if (val.count == 0) { + return res; + } if (res.min() > val.min) { res.min = val.min; } From dab068dde9b02141d3b5b8a532f1184e5292fe5f Mon Sep 17 00:00:00 2001 From: Amnon Heiman Date: Wed, 5 Aug 2015 17:52:38 +0300 Subject: [PATCH 6/6] API: modify column family API to use the histogram With the change in column_family stats, the API needs to get the counter from the read and write histogram. It also adds the implementation for the read and write latency histogram. Signed-off-by: Amnon Heiman --- api/column_family.cc | 62 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 52 insertions(+), 10 deletions(-) diff --git a/api/column_family.cc b/api/column_family.cc index 89cc84c225..c72288faa3 100644 --- a/api/column_family.cc +++ b/api/column_family.cc @@ -81,15 +81,40 @@ static future get_cf_stats(http_context& ctx, }, std::plus()); } -static future map_cf_stats(http_context& ctx, - int64_t column_family::stats::*f) { - return ctx.db.map([f](const database& db) { - int64_t res = 0; +static future get_cf_stats_sum(http_context& ctx, const sstring& name, + utils::ihistogram column_family::stats::*f) { + return map_reduce_cf(ctx, name, 0, [f](const column_family& cf) { + return (cf.get_stats().*f).count; + }, std::plus()); +} + +static future get_cf_stats_sum(http_context& ctx, + utils::ihistogram column_family::stats::*f) { + return map_reduce_cf(ctx, 0, [f](const column_family& cf) { + return (cf.get_stats().*f).count; + }, std::plus()); +} + +static future get_cf_histogram(http_context& ctx, const sstring& name, + utils::ihistogram column_family::stats::*f) { + utils::UUID uuid = get_uuid(name, ctx.db.local()); + return ctx.db.map_reduce0([f, uuid](const database& p) {return p.find_column_family(uuid).get_stats().*f;}, + httpd::utils_json::histogram(), + add_histogram) + .then([](const httpd::utils_json::histogram& val) { + return make_ready_future(val); + }); +} + +static future get_cf_histogram(http_context& ctx, utils::ihistogram column_family::stats::*f) { + std::function fun = [f] (const database& db) { + httpd::utils_json::histogram res; for (auto i : db.get_column_families()) { - res += i.second.get()->get_stats().*f; + res = add_histogram(res, i.second->get_stats().*f); } return res; - }).then([](const std::vector& res) { + }; + return ctx.db.map(fun).then([](const std::vector &res) { return make_ready_future(res); }); } @@ -243,19 +268,35 @@ void set_column_family(http_context& ctx, routes& r) { }); cf::get_read.set(r, [&ctx] (std::unique_ptr req) { - return get_cf_stats(ctx,req->param["name"] ,&column_family::stats::reads); + return get_cf_stats_sum(ctx,req->param["name"] ,&column_family::stats::reads); }); cf::get_all_read.set(r, [&ctx] (std::unique_ptr req) { - return map_cf_stats(ctx, &column_family::stats::reads); + return get_cf_stats_sum(ctx, &column_family::stats::reads); }); cf::get_write.set(r, [&ctx] (std::unique_ptr req) { - return get_cf_stats(ctx,req->param["name"] ,&column_family::stats::writes); + return get_cf_stats_sum(ctx, req->param["name"] ,&column_family::stats::writes); }); cf::get_all_write.set(r, [&ctx] (std::unique_ptr req) { - return map_cf_stats(ctx, &column_family::stats::writes); + return get_cf_stats_sum(ctx, &column_family::stats::writes); + }); + + cf::get_read_latency_histogram.set(r, [&ctx] (std::unique_ptr req) { + return get_cf_histogram(ctx, req->param["name"], &column_family::stats::reads); + }); + + cf::get_all_read_latency_histogram.set(r, [&ctx] (std::unique_ptr req) { + return get_cf_histogram(ctx, &column_family::stats::writes); + }); + + cf::get_write_latency_histogram.set(r, [&ctx] (std::unique_ptr req) { + return get_cf_histogram(ctx, req->param["name"], &column_family::stats::reads); + }); + + cf::get_all_write_latency_histogram.set(r, [&ctx] (std::unique_ptr req) { + return get_cf_histogram(ctx, &column_family::stats::writes); }); cf::get_pending_compactions.set(r, [&ctx] (std::unique_ptr req) { @@ -518,5 +559,6 @@ void set_column_family(http_context& ctx, routes& r) { std::vector res; return make_ready_future(res); }); + } }