scylladb/utils/bloom_calculations.hh

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * Copyright (C) 2015 ScyllaDB
 *
 * Modified by ScyllaDB
 */

/*
 * This file is part of Scylla.
 *
 * Scylla is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Scylla is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
 */

#pragma once

#include "exceptions/exceptions.hh"

namespace utils {

/**
 * The following calculations are taken from:
 * http://www.cs.wisc.edu/~cao/papers/summary-cache/node8.html
 * "Bloom Filters - the math"
 *
 * This class's static methods are meant to facilitate the use of the Bloom
 * Filter class by helping to choose correct values of 'bits per element' and
 * 'number of hash functions, k'.
 */
namespace bloom_calculations {

    /**
     * A wrapper class that holds two key parameters for a Bloom Filter: the
     * number of hash functions used, and the number of buckets per element used.
     */
    struct bloom_specification final {
        int K; // number of hash functions.
        int buckets_per_element;

        bloom_specification(int k, int buckets_per_element) : K(k), buckets_per_element(buckets_per_element) { }

        operator sstring() {
            return sprint("bloom_specification(K=%d, buckets_per_element=%d)", K, buckets_per_element);
        }
    };

    int constexpr min_buckets = 2;
    int constexpr min_k = 1;
    int constexpr EXCESS = 20;

    extern std::vector<std::vector<double>> probs;
    extern std::vector<int> opt_k_per_buckets;

    /**
     * Given the number of buckets that can be used per element, return a
     * specification that minimizes the false positive rate.
     *
     * @param buckets_per_element The number of buckets per element for the filter.
     * @return A spec that minimizes the false positive rate.
     */
    inline bloom_specification compute_bloom_spec(int buckets_per_element) {
        assert(buckets_per_element >= 1);
        assert(buckets_per_element <= int(probs.size()) - 1);
        return bloom_specification(opt_k_per_buckets[buckets_per_element], buckets_per_element);
    }

    /**
     * Given a maximum tolerable false positive probability, compute a Bloom
     * specification which will give less than the specified false positive rate,
     * but minimize the number of buckets per element and the number of hash
     * functions used.  Because bandwidth (and therefore total bitvector size)
     * is considered more expensive than computing power, preference is given
     * to minimizing buckets per element rather than number of hash functions.
     *
     * @param max_buckets_per_element The maximum number of buckets available for the filter.
     * @param max_false_pos_prob The maximum tolerable false positive rate.
     * @return A Bloom Specification which would result in a false positive rate
     * less than specified by the function call
     * @throws unsupported_operation_exception if a filter satisfying the parameters cannot be met
     */
    inline bloom_specification compute_bloom_spec(int max_buckets_per_element, double max_false_pos_prob) {
        assert(max_buckets_per_element >= 1);
        assert(max_buckets_per_element <= int(probs.size()) - 1);

        auto max_k = int(probs[max_buckets_per_element].size()) - 1;

        // Handle the trivial cases
        if(max_false_pos_prob >= probs[min_buckets][min_k]) {
            return bloom_specification(2, opt_k_per_buckets[2]);
        }

        if (max_false_pos_prob < probs[max_buckets_per_element][max_k]) {
            throw exceptions::unsupported_operation_exception(sprint("Unable to satisfy %f with %d buckets per element", max_false_pos_prob, max_buckets_per_element));
        }

        // First find the minimal required number of buckets:
        int buckets_per_element = 2;
        int K = opt_k_per_buckets[2];

        while(probs[buckets_per_element][K] > max_false_pos_prob){
            buckets_per_element++;
            K = opt_k_per_buckets[buckets_per_element];
        }
        // Now that the number of buckets is sufficient, see if we can relax K
        // without losing too much precision.
        while(probs[buckets_per_element][K - 1] <= max_false_pos_prob){
            K--;
        }

        return bloom_specification(K, buckets_per_element);
    }

    /**
     * Calculates the maximum number of buckets per element that this implementation
     * can support.  Crucially, it will lower the bucket count if necessary to meet
     * BitSet's size restrictions.
     */
    inline int max_buckets_per_element(long num_elements) {
        num_elements = std::max(1l, num_elements);

        double v = std::numeric_limits<long>::max() - EXCESS;
        v = v / num_elements;

        if (v < 1.0) {
            throw exceptions::unsupported_operation_exception(sprint("Cannot compute probabilities for %ld elements.", num_elements));
        }
        return std::min(probs.size() - 1, size_t(v));
    }
}

}

#if 0
package org.apache.cassandra.utils;

/**
 * The following calculations are taken from:
 * http://www.cs.wisc.edu/~cao/papers/summary-cache/node8.html
 * "Bloom Filters - the math"
 *
 * This class's static methods are meant to facilitate the use of the Bloom
 * Filter class by helping to choose correct values of 'bits per element' and
 * 'number of hash functions, k'.
 */
class BloomCalculations {

    private static final int minBuckets = 2;
    private static final int minK = 1;

    private static final int EXCESS = 20;

    /**
     * In the following keyspaceName, the row 'i' shows false positive rates if i buckets
     * per element are used.  Cell 'j' shows false positive rates if j hash
     * functions are used.  The first row is 'i=0', the first column is 'j=0'.
     * Each cell (i,j) the false positive rate determined by using i buckets per
     * element and j hash functions.
     */
    static final double[][] probs = new double[][]{
        {1.0}, // dummy row representing 0 buckets per element
        {1.0, 1.0}, // dummy row representing 1 buckets per element
        {1.0, 0.393,  0.400},
        {1.0, 0.283,  0.237,   0.253},
        {1.0, 0.221,  0.155,   0.147,   0.160},
        {1.0, 0.181,  0.109,   0.092,   0.092,   0.101}, // 5
        {1.0, 0.154,  0.0804,  0.0609,  0.0561,  0.0578,   0.0638},
        {1.0, 0.133,  0.0618,  0.0423,  0.0359,  0.0347,   0.0364},
        {1.0, 0.118,  0.0489,  0.0306,  0.024,   0.0217,   0.0216,   0.0229},
        {1.0, 0.105,  0.0397,  0.0228,  0.0166,  0.0141,   0.0133,   0.0135,   0.0145},
        {1.0, 0.0952, 0.0329,  0.0174,  0.0118,  0.00943,  0.00844,  0.00819,  0.00846}, // 10
        {1.0, 0.0869, 0.0276,  0.0136,  0.00864, 0.0065,   0.00552,  0.00513,  0.00509},
        {1.0, 0.08,   0.0236,  0.0108,  0.00646, 0.00459,  0.00371,  0.00329,  0.00314},
        {1.0, 0.074,  0.0203,  0.00875, 0.00492, 0.00332,  0.00255,  0.00217,  0.00199,  0.00194},
        {1.0, 0.0689, 0.0177,  0.00718, 0.00381, 0.00244,  0.00179,  0.00146,  0.00129,  0.00121,  0.0012},
        {1.0, 0.0645, 0.0156,  0.00596, 0.003,   0.00183,  0.00128,  0.001,    0.000852, 0.000775, 0.000744}, // 15
        {1.0, 0.0606, 0.0138,  0.005,   0.00239, 0.00139,  0.000935, 0.000702, 0.000574, 0.000505, 0.00047,  0.000459},
        {1.0, 0.0571, 0.0123,  0.00423, 0.00193, 0.00107,  0.000692, 0.000499, 0.000394, 0.000335, 0.000302, 0.000287, 0.000284},
        {1.0, 0.054,  0.0111,  0.00362, 0.00158, 0.000839, 0.000519, 0.00036,  0.000275, 0.000226, 0.000198, 0.000183, 0.000176},
        {1.0, 0.0513, 0.00998, 0.00312, 0.0013,  0.000663, 0.000394, 0.000264, 0.000194, 0.000155, 0.000132, 0.000118, 0.000111, 0.000109},
        {1.0, 0.0488, 0.00906, 0.0027,  0.00108, 0.00053,  0.000303, 0.000196, 0.00014,  0.000108, 8.89e-05, 7.77e-05, 7.12e-05, 6.79e-05, 6.71e-05} // 20
    };  // the first column is a dummy column representing K=0.

    /**
     * The optimal number of hashes for a given number of bits per element.
     * These values are automatically calculated from the data above.
     */
    private static final int[] optKPerBuckets = new int[probs.length];

    static
    {
        for (int i = 0; i < probs.length; i++)
        {
            double min = Double.MAX_VALUE;
            double[] prob = probs[i];
            for (int j = 0; j < prob.length; j++)
            {
                if (prob[j] < min)
                {
                    min = prob[j];
                    optKPerBuckets[i] = Math.max(minK, j);
                }
            }
        }
    }

    /**
     * Given the number of buckets that can be used per element, return a
     * specification that minimizes the false positive rate.
     *
     * @param bucketsPerElement The number of buckets per element for the filter.
     * @return A spec that minimizes the false positive rate.
     */
    public static BloomSpecification computeBloomSpec(int bucketsPerElement)
    {
        assert bucketsPerElement >= 1;
        assert bucketsPerElement <= probs.length - 1;
        return new BloomSpecification(optKPerBuckets[bucketsPerElement], bucketsPerElement);
    }

    /**
     * A wrapper class that holds two key parameters for a Bloom Filter: the
     * number of hash functions used, and the number of buckets per element used.
     */
    public static class BloomSpecification
    {
        final int K; // number of hash functions.
        final int bucketsPerElement;

        public BloomSpecification(int k, int bucketsPerElement)
        {
            K = k;
            this.bucketsPerElement = bucketsPerElement;
        }

        public String toString()
        {
            return String.format("BloomSpecification(K=%d, bucketsPerElement=%d)", K, bucketsPerElement);
        }
    }

    /**
     * Given a maximum tolerable false positive probability, compute a Bloom
     * specification which will give less than the specified false positive rate,
     * but minimize the number of buckets per element and the number of hash
     * functions used.  Because bandwidth (and therefore total bitvector size)
     * is considered more expensive than computing power, preference is given
     * to minimizing buckets per element rather than number of hash functions.
     *
     * @param maxBucketsPerElement The maximum number of buckets available for the filter.
     * @param maxFalsePosProb The maximum tolerable false positive rate.
     * @return A Bloom Specification which would result in a false positive rate
     * less than specified by the function call
     * @throws UnsupportedOperationException if a filter satisfying the parameters cannot be met
     */
    public static BloomSpecification computeBloomSpec(int maxBucketsPerElement, double maxFalsePosProb)
    {
        assert maxBucketsPerElement >= 1;
        assert maxBucketsPerElement <= probs.length - 1;
        int maxK = probs[maxBucketsPerElement].length - 1;

        // Handle the trivial cases
        if(maxFalsePosProb >= probs[minBuckets][minK]) {
            return new BloomSpecification(2, optKPerBuckets[2]);
        }
        if (maxFalsePosProb < probs[maxBucketsPerElement][maxK]) {
            throw new UnsupportedOperationException(String.format("Unable to satisfy %s with %s buckets per element",
                                                                  maxFalsePosProb, maxBucketsPerElement));
        }

        // First find the minimal required number of buckets:
        int bucketsPerElement = 2;
        int K = optKPerBuckets[2];
        while(probs[bucketsPerElement][K] > maxFalsePosProb){
            bucketsPerElement++;
            K = optKPerBuckets[bucketsPerElement];
        }
        // Now that the number of buckets is sufficient, see if we can relax K
        // without losing too much precision.
        while(probs[bucketsPerElement][K - 1] <= maxFalsePosProb){
            K--;
        }

        return new BloomSpecification(K, bucketsPerElement);
    }

    /**
     * Calculates the maximum number of buckets per element that this implementation
     * can support.  Crucially, it will lower the bucket count if necessary to meet
     * BitSet's size restrictions.
     */
    public static int maxBucketsPerElement(long numElements)
    {
        numElements = Math.max(1, numElements);
        double v = (Long.MAX_VALUE - EXCESS) / (double)numElements;
        if (v < 1.0)
        {
            throw new UnsupportedOperationException("Cannot compute probabilities for " + numElements + " elements.");
        }
        return Math.min(BloomCalculations.probs.length - 1, (int)v);
    }
}
#endif