scylladb/utils/estimated_histogram.hh

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * Copyright (C) 2015 ScyllaDB
 *
 * Modified by ScyllaDB
 */

/*
 * This file is part of Scylla.
 *
 * Scylla is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Scylla is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
 */

#pragma once

#include <cmath>
#include <algorithm>
#include <vector>
#include <chrono>
#include "core/metrics_types.hh"

namespace utils {

struct estimated_histogram {
    using clock = std::chrono::steady_clock;
    using duration = clock::duration;
    /**
     * The series of values to which the counts in `buckets` correspond:
     * 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 17, 20, etc.
     * Thus, a `buckets` of [0, 0, 1, 10] would mean we had seen one value of 3 and 10 values of 4.
     *
     * The series starts at 1 and grows by 1.2 each time (rounding and removing duplicates). It goes from 1
     * to around 36M by default (creating 90+1 buckets), which will give us timing resolution from microseconds to
     * 36 seconds, with less precision as the numbers get larger.
     *
     * Each bucket represents values from (previous bucket offset, current offset].
     */
    std::vector<int64_t> bucket_offsets;

    // buckets is one element longer than bucket_offsets -- the extra last element is for values greater than the last offset
    std::vector<int64_t> buckets;

    // Running sample total: incremented by add(int64_t), set to the caller-supplied
    // total by add_nano(), and exported as sample_count by get_histogram().
    int64_t _count = 0;

    // Builds a histogram with `bucket_count` bucket offsets (generated by
    // new_offsets()) and one zeroed counter per offset plus one extra trailing
    // counter.
    estimated_histogram(int bucket_count = 90) {
        new_offsets(bucket_count);
        // The counters must be sized after the offsets exist: there is always
        // exactly one more counter than there are offsets.
        const auto counter_slots = bucket_offsets.size() + 1;
        buckets.resize(counter_slots, 0);
    }

    // Exports the histogram in seastar metrics form: one output bucket per
    // bucket offset, carrying that offset as its upper bound and that bucket's
    // own counter as its count. The extra trailing counter in `buckets` is not
    // exported; `_count` is reported as the sample count.
    seastar::metrics::histogram get_histogram() const {
        const size_t offset_count = bucket_offsets.size();
        seastar::metrics::histogram exported;
        exported.buckets.resize(offset_count);
        for (size_t idx = 0; idx != offset_count; ++idx) {
            exported.buckets[idx].upper_bound = bucket_offsets[idx];
            exported.buckets[idx].count = buckets[idx];
        }
        exported.sample_count = _count;
        return exported;
    }


    // FIXME: convert Java code below.
#if 0
    public EstimatedHistogram(long[] offsets, long[] bucketData)
    {
        assert bucketData.length == offsets.length +1;
        bucketOffsets = offsets;
        buckets = new AtomicLongArray(bucketData);
    }
#endif
private:
    // Regenerates `bucket_offsets` with `size` entries. The series starts at 1;
    // every following entry is the previous one multiplied by 1.2 and rounded,
    // bumped by one whenever rounding would repeat the previous value, so the
    // offsets are strictly increasing.
    void new_offsets(int size) {
        bucket_offsets.resize(size);
        int64_t current = 1;
        for (int idx = 0; idx < size; ++idx) {
            if (idx > 0) {
                const int64_t grown = round(current * 1.2);
                // Small values round back to themselves (e.g. 1 * 1.2 -> 1);
                // step by one instead so no offset is duplicated.
                current = (grown == current) ? current + 1 : grown;
            }
            bucket_offsets[idx] = current;
        }
    }
public:
    /**
     * @return the histogram values corresponding to each bucket index
     */
    const std::vector<int64_t>& get_bucket_offsets() const {
        // Hands out a read-only reference to the member; nothing is copied.
        const auto& offsets = bucket_offsets;
        return offsets;
    }

    /**
     * @return the histogram buckets
     */
    const std::vector<int64_t>& get_buckets() const {
        // Hands out a read-only reference to the counters; nothing is copied.
        const auto& counters = buckets;
        return counters;
    }

    void clear() {
        buckets.resize(buckets.size(), 0);
    }
    /**
     * Increments the count of the bucket closest to n, rounding UP.
     * @param n
     */
    void add(int64_t n) {
        // lower_bound yields the first offset >= n, i.e. the bucket covering
        // (previous offset, offset]. When n is larger than every offset the
        // result is end(), whose distance from begin() is bucket_offsets.size():
        // exactly the index of the extra trailing counter that `buckets` keeps
        // for values beyond the last offset. Using that index directly (rather
        // than stepping the iterator back) keeps such values out of the last
        // regular bucket and is also well defined when there are no offsets.
        const auto slot = std::lower_bound(bucket_offsets.begin(), bucket_offsets.end(), n);
        const auto index = std::distance(bucket_offsets.begin(), slot);
        // at() throws std::out_of_range should the two vectors ever disagree in size.
        buckets.at(index)++;
        _count++;
    }

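    /**
     * Increments the count of the bucket closest to n, rounding UP.
     * n is given in nanoseconds and is divided by 1000 before it is bucketed.
     * When sampling is used, the bucket is increased by as many items as are
     * needed for the overall number of items to become equal to new_count;
     * nothing is recorded if new_count is not larger than the current total.
     * @param n the value to record, in nanoseconds
     * @param new_count the overall number of items after this call
     */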
    void add_nano(int64_t n, int64_t new_count) {
        // Nothing to record unless the caller's total moved past ours.
        if (new_count <= _count) {
            return;
        }
        // The value arrives in nanoseconds; bucketing is done on n / 1000.
        n /= 1000;
        // First offset >= n selects the bucket. If n exceeds every offset the
        // iterator is end(), and its distance from begin() is the index of the
        // extra trailing counter kept for values beyond the last offset, so
        // such values no longer inflate the last regular bucket (and an empty
        // offsets vector is handled without stepping before begin()).
        const auto slot = std::lower_bound(bucket_offsets.begin(), bucket_offsets.end(), n);
        const auto index = std::distance(bucket_offsets.begin(), slot);
        // Credit the whole gap between the two totals to this bucket, then
        // adopt the caller's total.
        buckets.at(index) += new_count - _count;
        _count = new_count;
    }

    // Convenience overload: converts `latency` to a nanosecond tick count and
    // forwards it, together with `new_count`, to add_nano().
    void add(duration latency, int64_t new_count) {
        const auto as_nanos = std::chrono::duration_cast<std::chrono::nanoseconds>(latency);
        add_nano(as_nanos.count(), new_count);
    }

    /**
     * @return the smallest value that could have been added to this histogram
     */
    int64_t min() const {
        // Scan upwards for the first populated bucket; its lower edge is one
        // past the previous bucket's offset (or 0 for the very first bucket).
        for (size_t idx = 0; idx < buckets.size(); ++idx) {
            if (buckets[idx] <= 0) {
                continue;
            }
            if (idx == 0) {
                return 0;
            }
            return bucket_offsets[idx - 1] + 1;
        }
        // No bucket holds anything.
        return 0;
    }

    /**
     * @return the largest value that could have been added to this histogram.  If the histogram
     * overflowed, returns INT64_MAX.
     */
    int64_t max() const {
        // The trailing counter has no upper bound, so any hit there means the
        // maximum is unknown and INT64_MAX is reported.
        const int overflow_slot = buckets.size() - 1;
        if (buckets[overflow_slot] > 0) {
            return INT64_MAX;
        }
        // Otherwise walk down from the highest regular bucket and report the
        // offset of the first populated one.
        int idx = overflow_slot;
        while (--idx >= 0) {
            if (buckets[idx] > 0) {
                return bucket_offsets[idx];
            }
        }
        // No bucket holds anything.
        return 0;
    }

    /**
     * merge a histogram to the current one.
     */
    estimated_histogram& merge(const estimated_histogram& b) {
        // Adds b's counters and sample total into this histogram and returns
        // *this. If b has more bucket offsets, this histogram is first grown to
        // b's layout so that every index of b.buckets exists here.
        if (bucket_offsets.size() < b.bucket_offsets.size()) {
            new_offsets(b.bucket_offsets.size());
            buckets.resize(b.bucket_offsets.size() + 1, 0);
        }
        // NOTE(review): counters are combined purely by index, so when the two
        // histograms have different sizes the smaller one's trailing counter
        // lands in a regular bucket of the larger layout -- confirm that this
        // is acceptable for callers.
        for (size_t i = 0; i < b.buckets.size(); ++i) {
            buckets[i] += b.buckets[i];
        }
        // Keep the sample total (reported by get_histogram()) in step with the
        // bucket counters that were just added.
        _count += b._count;
        return *this;
    }

    // NOTE(review): no function named `merge` with this signature is defined in the
    // visible part of this file (the free helper after the struct is named
    // estimated_histogram_merge) -- confirm whether this declaration is still needed.
    friend estimated_histogram merge(estimated_histogram a, const estimated_histogram& b);

    // FIXME: convert Java code below.
#if 0
    /**
     * @return the count in the given bucket
     */
    long get(int bucket)
    {
        return buckets.get(bucket);
    }

    /**
     * @param reset zero out buckets afterwards if true
     * @return a long[] containing the current histogram buckets
     */
    public long[] getBuckets(boolean reset)
    {
        final int len = buckets.length();
        long[] rv = new long[len];

        if (reset)
            for (int i = 0; i < len; i++)
                rv[i] = buckets.getAndSet(i, 0L);
        else
            for (int i = 0; i < len; i++)
                rv[i] = buckets.get(i);

        return rv;
    }

    /**
     * @return the smallest value that could have been added to this histogram
     */
    public long min()
    {
        for (int i = 0; i < buckets.length(); i++)
        {
            if (buckets.get(i) > 0)
                return i == 0 ? 0 : 1 + bucketOffsets[i - 1];
        }
        return 0;
    }

    /**
     * @return the largest value that could have been added to this histogram.  If the histogram
     * overflowed, returns Long.MAX_VALUE.
     */
    public long max()
    {
        int lastBucket = buckets.length() - 1;
        if (buckets.get(lastBucket) > 0)
            return Long.MAX_VALUE;

        for (int i = lastBucket - 1; i >= 0; i--)
        {
            if (buckets.get(i) > 0)
                return bucketOffsets[i];
        }
        return 0;
    }

    /**
     * @param percentile
     * @return estimated value at given percentile
     */
    public long percentile(double percentile)
    {
        assert percentile >= 0 && percentile <= 1.0;
        int lastBucket = buckets.length() - 1;
        if (buckets.get(lastBucket) > 0)
            throw new IllegalStateException("Unable to compute when histogram overflowed");

        long pcount = (long) Math.floor(count() * percentile);
        if (pcount == 0)
            return 0;

        long elements = 0;
        for (int i = 0; i < lastBucket; i++)
        {
            elements += buckets.get(i);
            if (elements >= pcount)
                return bucketOffsets[i];
        }
        return 0;
    }

#endif

    /**
     * @return the mean histogram value (average of bucket offsets, weighted by count)
     */
    int64_t mean() const {
        // Weighted average of the bucket offsets, rounded up. The trailing
        // counter (values beyond the last offset) has no offset of its own and
        // is left out. Returns 0 when no regular bucket holds anything.
        if (buckets.empty()) {
            return 0;
        }
        const size_t overflow_slot = buckets.size() - 1;
        int64_t elements = 0;
        int64_t sum = 0;
        for (size_t i = 0; i < overflow_slot; i++) {
            // int64_t rather than long: the counters are 64-bit on every platform.
            const int64_t bucket_count = buckets[i];
            elements += bucket_count;
            sum += bucket_count * bucket_offsets[i];
        }
        // An empty histogram used to divide by zero here.
        if (elements == 0) {
            return 0;
        }
        // Integer ceiling division; avoids the precision loss of going through double.
        return (sum + elements - 1) / elements;
    }

    /**
     * @return the total number of non-zero values
     */
    int64_t count() const {
        // Sum of every counter, the trailing one included.
        int64_t total = 0;
        for (const auto in_bucket : buckets) {
            total += in_bucket;
        }
        return total;
    }
#if 0
    /**
     * @return true if this histogram has overflowed -- that is, a value larger than our largest bucket could bound was added
     */
    public boolean isOverflowed()
    {
        return buckets.get(buckets.length() - 1) > 0;
    }

    /**
     * log.debug() every record in the histogram
     *
     * @param log
     */
    public void log(Logger log)
    {
        // only print overflow if there is any
        int nameCount;
        if (buckets.get(buckets.length() - 1) == 0)
            nameCount = buckets.length() - 1;
        else
            nameCount = buckets.length();
        String[] names = new String[nameCount];

        int maxNameLength = 0;
        for (int i = 0; i < nameCount; i++)
        {
            names[i] = nameOfRange(bucketOffsets, i);
            maxNameLength = Math.max(maxNameLength, names[i].length());
        }

        // emit log records
        String formatstr = "%" + maxNameLength + "s: %d";
        for (int i = 0; i < nameCount; i++)
        {
            long count = buckets.get(i);
            // sort-of-hack to not print empty ranges at the start that are only used to demarcate the
            // first populated range. for code clarity we don't omit this record from the maxNameLength
            // calculation, and accept the unnecessary whitespace prefixes that will occasionally occur
            if (i == 0 && count == 0)
                continue;
            log.debug(String.format(formatstr, names[i], count));
        }
    }

    private static String nameOfRange(long[] bucketOffsets, int index)
    {
        StringBuilder sb = new StringBuilder();
        appendRange(sb, bucketOffsets, index);
        return sb.toString();
    }

    private static void appendRange(StringBuilder sb, long[] bucketOffsets, int index)
    {
        sb.append("[");
        if (index == 0)
            if (bucketOffsets[0] > 0)
                // by original definition, this histogram is for values greater than zero only;
                // if values of 0 or less are required, an entry of lb-1 must be inserted at the start
                sb.append("1");
            else
                sb.append("-Inf");
        else
            sb.append(bucketOffsets[index - 1] + 1);
        sb.append("..");
        if (index == bucketOffsets.length)
            sb.append("Inf");
        else
            sb.append(bucketOffsets[index]);
        sb.append("]");
    }

    @Override
    public boolean equals(Object o)
    {
        if (this == o)
            return true;

        if (!(o instanceof EstimatedHistogram))
            return false;

        EstimatedHistogram that = (EstimatedHistogram) o;
        return Arrays.equals(getBucketOffsets(), that.getBucketOffsets()) &&
               Arrays.equals(getBuckets(false), that.getBuckets(false));
    }

    @Override
    public int hashCode()
    {
        return Objects.hashCode(getBucketOffsets(), getBuckets(false));
    }

    public static class EstimatedHistogramSerializer implements ISerializer<EstimatedHistogram>
    {
        public void serialize(EstimatedHistogram eh, DataOutputPlus out) throws IOException
        {
            long[] offsets = eh.getBucketOffsets();
            long[] buckets = eh.getBuckets(false);
            out.writeInt(buckets.length);
            for (int i = 0; i < buckets.length; i++)
            {
                out.writeLong(offsets[i == 0 ? 0 : i - 1]);
                out.writeLong(buckets[i]);
            }
        }

        public EstimatedHistogram deserialize(DataInput in) throws IOException
        {
            int size = in.readInt();
            long[] offsets = new long[size - 1];
            long[] buckets = new long[size];

            for (int i = 0; i < size; i++) {
                offsets[i == 0 ? 0 : i - 1] = in.readLong();
                buckets[i] = in.readLong();
            }
            return new EstimatedHistogram(offsets, buckets);
        }

        public long serializedSize(EstimatedHistogram eh, TypeSizes typeSizes)
        {
            int size = 0;

            long[] offsets = eh.getBucketOffsets();
            long[] buckets = eh.getBuckets(false);
            size += typeSizes.sizeof(buckets.length);
            for (int i = 0; i < buckets.length; i++)
            {
                size += typeSizes.sizeof(offsets[i == 0 ? 0 : i - 1]);
                size += typeSizes.sizeof(buckets[i]);
            }
            return size;
        }
    }
#endif
};

// Returns the combination of two histograms without modifying the caller's
// objects: `a` is taken by value, `b` is merged into that copy, and the copy
// is returned.
inline estimated_histogram estimated_histogram_merge(estimated_histogram a, const estimated_histogram& b) {
    a.merge(b);
    // Returning the parameter by name lets it be moved into the result;
    // returning through the reference that merge() yields would force a
    // second full copy of both vectors.
    return a;
}

}