Files
scylladb/sstables/hyperloglog.hh
Avi Kivity fcb8d040e8 treewide: use Software Package Data Exchange (SPDX) license identifiers
Instead of lengthy blurbs, switch to single-line, machine-readable
standardized (https://spdx.dev) license identifiers. The Linux kernel
switched long ago, so there is strong precedent.

Three cases are handled: AGPL-only, Apache-only, and dual licensed.
For the latter case, I chose (AGPL-3.0-or-later and Apache-2.0),
reasoning that our changes are extensive enough to apply our license.

The changes were applied mechanically with a script, except to
licenses/README.md.

Closes #9937
2022-01-18 12:15:18 +01:00

336 lines
9.1 KiB
C++

/*
* Copyright (c) 2013 Hideaki Ohno <hide.o.j55{at}gmail.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the 'Software'), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so.
*/
/*
* SPDX-License-Identifier: AGPL-3.0-or-later
*/
/*
* Copyright (C) 2011 Clearspring Technologies, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* Copyright (C) 2015-present ScyllaDB
*
* Modified by ScyllaDB
*/
#pragma once
/**
* @file hyperloglog.hh
* @brief HyperLogLog cardinality estimator
* @date Created 2013/3/20
* @author Hideaki Ohno
*/
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <sstream>
#include <stdexcept>
#include <vector>
#include <seastar/core/byteorder.hh>
#include <seastar/core/temporary_buffer.hh>
#include "seastarx.hh"
#if 0
#include "murmur3.h"
#endif
#define HLL_HASH_SEED 313
namespace hll {
// The value 2^32 (4294967296) expressed as a double.
static constexpr double pow_2_32 = 4294967296.0; ///< 2^32
// The negation of pow_2_32.
static constexpr double neg_pow_2_32 = -4294967296.0; ///< -(2^32)
/**
 * Computes how many bytes write_unsigned_var_int() emits for a value.
 *
 * The encoding stores 7 payload bits per byte, so one byte is always needed
 * and one more is added for every further 7-bit group that is non-zero.
 *
 * @param[in] value value to be measured
 *
 * @return Number of bytes of the variable-length encoding (at least 1).
 */
static inline size_t size_unsigned_var_int(unsigned int value) {
    size_t groups = 1; // the final (or only) byte is always present
    for (; (value & 0xFFFFFF80) != 0L; value >>= 7) {
        ++groups;
    }
    return groups;
}
/**
 * Serializes a value as a variable-length integer.
 *
 * The value is emitted least-significant 7-bit group first. Every byte except
 * the last has its high bit (0x80) set to signal that another byte follows.
 *
 * @param[in]  value value to be encoded
 * @param[out] to    destination; the caller must provide room for
 *                   size_unsigned_var_int(value) bytes
 *
 * @return Number of bytes written.
 */
static inline size_t write_unsigned_var_int(unsigned int value, uint8_t* to) {
    uint8_t* cursor = to;
    for (; (value & 0xFFFFFF80) != 0L; value >>= 7) {
        // continuation byte: low 7 bits of the value plus the "more" flag
        *cursor++ = (value & 0x7F) | 0x80;
    }
    // terminating byte: whatever is left fits in 7 bits, flag stays clear
    *cursor++ = value & 0x7F;
    return static_cast<size_t>(cursor - to);
}
/** @class HyperLogLog
 * @brief Implementation of the 'HyperLogLog' cardinality estimation algorithm.
 *
 * Elements are fed in as pre-computed 64-bit hashes (see offer_hashed()).
 * The top b bits of a hash select one of m = 2^b one-byte registers; each
 * register remembers the largest "rank" (1-based position of the first set
 * bit among the remaining 64 - b hash bits) ever offered to it.
 */
class HyperLogLog {
public:
    /**
     * Constructor
     *
     * @param[in] b bit width (register size will be 2 to the b power).
     *            This value must be in the range [4,16]. Default value is 4.
     *
     * @exception std::invalid_argument the argument is out of range.
     */
    HyperLogLog(uint8_t b = 4) :
            // b is validated *before* it is used as a shift count / allocation
            // size: shifting by >= 32 would be undefined behaviour.
            b_(checked_bit_width(b)), m_(uint32_t(1) << b_), alphaMM_(alpha_mm(m_)), M_(m_, 0) {
    }

    /**
     * Not implemented: always aborts the process.
     *
     * @param[in] bytes serialized estimator (currently ignored)
     */
    static HyperLogLog from_bytes(temporary_buffer<uint8_t> bytes) {
        // FIXME: implement class that creates a HyperLogLog from an array of bytes.
        // This will be useful if we need to work with the cardinality data from the
        // compaction metadata.
        std::abort();
    }

    /**
     * Adds element to the estimator
     *
     * @param[in] str string to add
     * @param[in] len length of string
     */
#if 0
    void add(const char* str, uint32_t len) {
        uint32_t hash;
        MurmurHash3_x86_32(str, len, HLL_HASH_SEED, (void*) &hash);
        uint32_t index = hash >> (32 - b_);
        // rho() scans a 64-bit word from its top bit, so left-align the
        // 32 - b_ hash bits that remain after the index.
        uint8_t rank = rho(static_cast<uint64_t>(hash) << (32 + b_), 32 - b_);
        if (rank > M_[index]) {
            M_[index] = rank;
        }
    }
#endif

    /**
     * Adds an already hashed element to the estimator.
     *
     * The top b_ bits of the hash pick the register, the remaining 64 - b_
     * bits determine the rank stored in it.
     *
     * @param[in] hash 64-bit hash of the element
     */
    void offer_hashed(uint64_t hash) {
        const uint32_t index = hash >> (64 - b_);
        // hash << b_ drops the index bits and left-aligns the rest; rho() must
        // see all 64 bits of it (a 32-bit parameter would silently truncate).
        const uint8_t rank = rho(hash << b_, 64 - b_);
        if (rank > M_[index]) {
            M_[index] = rank;
        }
    }

    /**
     * Calculates the size of the buffer returned by get_bytes().
     *
     * @return Size in bytes of the serialized estimator.
     */
    size_t get_bytes_size() const {
        size_t size = 0;
        size += sizeof(int32_t); // version (written as a big-endian int32_t)
        size += size_unsigned_var_int(b_); // p; register width = b_.
        size += size_unsigned_var_int(0); // sp; sparse set = 0.
        size += size_unsigned_var_int(0); // type;
        size += size_unsigned_var_int(M_.size()); // register size;
        size += M_.size();
        return size;
    }

    /**
     * Serializes the estimator.
     *
     * Layout: negated version as big-endian int32, then four variable-length
     * integers (register width, sparse precision = 0, type = 0, number of
     * registers), followed by the raw register bytes.
     *
     * @return Buffer of get_bytes_size() bytes holding the serialized form.
     *
     * @exception std::runtime_error the number of bytes produced differs from
     *            get_bytes_size().
     */
    temporary_buffer<uint8_t> get_bytes() const {
        // FIXME: add support to SPARSE format.
        static constexpr int version = 2;
        const size_t s = get_bytes_size();
        temporary_buffer<uint8_t> bytes(s);
        uint8_t* const out = bytes.get_write();
        size_t offset = 0;
        // write version
        write_be<int32_t>(reinterpret_cast<char*>(out + offset), -version);
        offset += sizeof(int32_t);
        // write register width
        offset += write_unsigned_var_int(b_, out + offset);
        // NOTE: write precision value for sparse set (not supported).
        offset += write_unsigned_var_int(0, out + offset);
        // write type (NORMAL always!)
        offset += write_unsigned_var_int(0, out + offset);
        // write register size
        offset += write_unsigned_var_int(M_.size(), out + offset);
        // write register
        std::memcpy(out + offset, M_.data(), M_.size());
        offset += M_.size();
        if (s != offset) {
            throw std::runtime_error("possible overflow while generating cardinality metadata");
        }
        bytes.trim(offset);
        return bytes;
    }

    /**
     * Estimates cardinality value.
     *
     * Computes the raw HyperLogLog estimate alpha * m^2 / sum(2^-M[i]) and,
     * for small results (<= 2.5 * m) with at least one empty register, falls
     * back to linear counting. No large-range correction is applied: that
     * correction compensates for collisions in a 32-bit hash space, whereas
     * the registers here are populated from 64-bit hashes.
     *
     * @return Estimated cardinality value.
     */
    double estimate() const {
        double sum = 0.0;
        uint32_t zeros = 0;
        for (const uint8_t reg : M_) {
            sum += std::ldexp(1.0, -static_cast<int>(reg)); // 2^-reg, exact
            if (reg == 0) {
                zeros++;
            }
        }
        double e = alphaMM_ / sum; // E in the original paper
        if (e <= 2.5 * m_ && zeros != 0) {
            // small-range correction (linear counting)
            e = m_ * std::log(static_cast<double>(m_) / zeros);
        }
        return e;
    }

    /**
     * Merges the estimate from 'other' into this object, so that this object
     * holds the estimate of their union.
     * The number of registers in each must be the same.
     *
     * @param[in] other HyperLogLog instance to be merged
     *
     * @exception std::invalid_argument number of registers doesn't match.
     */
    void merge(const HyperLogLog& other) {
        if (m_ != other.m_) {
            std::stringstream ss;
            ss << "number of registers doesn't match: " << m_ << " != " << other.m_;
            throw std::invalid_argument(ss.str().c_str());
        }
        // the union keeps, per register, the larger of the two ranks
        for (uint32_t r = 0; r < m_; ++r) {
            if (M_[r] < other.M_[r]) {
                M_[r] = other.M_[r];
            }
        }
    }

    /**
     * Clears all internal registers.
     */
    void clear() {
        std::fill(M_.begin(), M_.end(), 0);
    }

    /**
     * Returns size of register.
     *
     * @return Register size
     */
    uint32_t registerSize() const {
        return m_;
    }

    /**
     * Exchanges the content of the instance
     *
     * @param[in,out] rhs Another HyperLogLog instance
     */
    void swap(HyperLogLog& rhs) noexcept {
        std::swap(b_, rhs.b_);
        std::swap(m_, rhs.m_);
        std::swap(alphaMM_, rhs.alphaMM_);
        M_.swap(rhs.M_);
    }

    /**
     * Dump the current status to a stream
     *
     * Writes the one-byte bit width followed by the raw register bytes.
     *
     * @param[out] os The output stream where the data is saved
     *
     * @exception std::runtime_error When failed to dump.
     */
    void dump(std::ostream& os) const {
        os.write(reinterpret_cast<const char*>(&b_), sizeof(b_));
        os.write(reinterpret_cast<const char*>(M_.data()), sizeof(M_[0]) * M_.size());
        if (os.fail()) {
            throw std::runtime_error("Failed to dump");
        }
    }

    /**
     * Restore the status from a stream
     *
     * Reads the format produced by dump(). This object is only modified once
     * the whole status has been read successfully.
     *
     * @param[in] is The input stream where the status is saved
     *
     * @exception std::runtime_error When failed to restore.
     * @exception std::invalid_argument The stored bit width is out of range.
     */
    void restore(std::istream& is) {
        uint8_t b = 0;
        is.read(reinterpret_cast<char*>(&b), sizeof(b));
        // Without this check a failed read would leave b == 0 and be reported
        // as an out-of-range bit width instead of a restore failure.
        if (is.fail()) {
            throw std::runtime_error("Failed to restore");
        }
        HyperLogLog tempHLL(b);
        is.read(reinterpret_cast<char*>(tempHLL.M_.data()), sizeof(M_[0]) * tempHLL.M_.size());
        if (is.fail()) {
            throw std::runtime_error("Failed to restore");
        }
        swap(tempHLL);
    }

private:
    uint8_t b_; ///< register bit width
    uint32_t m_; ///< register size
    double alphaMM_; ///< alpha * m^2
    std::vector<uint8_t> M_; ///< registers

    /// Returns b unchanged if it lies in [4,16], throws std::invalid_argument otherwise.
    static uint8_t checked_bit_width(uint8_t b) {
        if (b < 4 || 16 < b) {
            throw std::invalid_argument("bit width must be in the range [4,16]");
        }
        return b;
    }

    /// Computes alpha * m^2, where alpha is the bias-correction constant for m registers.
    static double alpha_mm(uint32_t m) {
        double alpha;
        switch (m) {
        case 16:
            alpha = 0.673;
            break;
        case 32:
            alpha = 0.697;
            break;
        case 64:
            alpha = 0.709;
            break;
        default:
            alpha = 0.7213 / (1.0 + 1.079 / m);
            break;
        }
        return alpha * m * m;
    }

    /**
     * Rank of a left-aligned bit string: the 1-based position of the first
     * set bit among the top b bits of x, or b + 1 if none of them is set.
     */
    static uint8_t rho(uint64_t x, uint8_t b) {
        constexpr uint64_t top_bit = uint64_t(1) << 63;
        uint8_t v = 1;
        while (v <= b && !(x & top_bit)) {
            v++;
            x <<= 1;
        }
        return v;
    }
};
} // namespace hll