/*
 * Copyright (C) 2018-present ScyllaDB
 */
/*
 * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
 *
 */
/*
 * CRC32(M(x)) = M(x) * x^32 mod G(x)
 *
 * Where M(x) is a polynomial with binary coefficients from the field of integers modulo 2.
 *
 *    M(x) = a_0 * x^0 + a_1 * x^1 + ... + a_N * x^N
 *
 *    a_i in { 0, 1 }
 *
 * Since it's a modulo-2 field, addition of coefficients is done by XORing.
 *
 * G(x) is a constant polynomial of degree 32:
 *
 *    G(x) = x^32 + x^26 + x^23 + x^22 + x^16 + x^12 + x^11 + x^10 + x^8 + x^7 + x^5 + x^4 + x^2 + x + 1
 *
 * Polynomials are represented in memory as sequences of bits, where each
 * bit represents the coefficient of some x^N term. Bit-reversed form is used,
 * meaning that the coefficients for higher powers are stored in less significant bits:
 *
 *           MSB                            LSB
 *    bit:    31   30   29   ...    2    1    0
 *          |  1 |  0 |  1 | ... |  1 |  0 |  1 |
 *
 *    P(x) = x^0 + x^2 + ... + x^29 + x^31
 *
 */
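/*
 * Illustrative note (not part of the original source): a familiar sanity
 * check for this bit-reversed convention is the standard CRC-32 constant.
 * Writing the terms of G(x) below x^32 MSB-first gives 0x04C11DB7;
 * reflecting those 32 bits, so that the coefficient of x^0 lands in bit 31
 * and the coefficient of x^31 in bit 0, gives the well-known reflected
 * polynomial 0xEDB88320.
 */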
#include "crc_combine.hh"
#include "crc_combine_table.hh"
#include "utils/clmul.hh"
using u32 = uint32_t;
using u64 = uint64_t;
#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__)
#include "barrett.hh"
/*
 * Calculates:
 *
 *    P1(x) * P2(x)
 */
static
u64 pmul(u32 p1, u32 p2) {
    // Because the representation is bit-reversed, we need to shift left
    // one bit after multiplying so that the bit holding the highest power,
    // i.e. the least significant bit of the 64-bit result, corresponds to
    // the coefficient of x^63 and not x^62.
    return clmul(p1, p2) << 1;
}
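/*
 * Worked example (added for illustration, not in the original source): in the
 * 32-bit reversed form, x^0 is stored as 0x80000000. clmul(0x80000000,
 * 0x80000000) produces 0x4000000000000000 (bit 62 set), but in the 64-bit
 * reversed form the coefficient of x^0 belongs in bit 63; the extra shift left
 * by one moves it there, yielding 0x8000000000000000 = x^0, as expected for
 * x^0 * x^0.
 */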
/*
 * Calculates:
 *
 *    P1(x) * P2(x) mod G(x)
 */
static
u32 pmul_mod(u32 p1, u32 p2) {
    return crc32_fold_barrett_u64(pmul(p1, p2));
}
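/*
 * Note (added for illustration): pmul() above multiplies two polynomials of
 * degree <= 31, so its result has degree <= 62 in the 64-bit reversed form;
 * crc32_fold_barrett_u64() (from barrett.hh) is assumed here to reduce such a
 * 64-bit polynomial modulo G(x), folding it back into the 32-bit
 * representation.
 */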
/*
 * Calculates:
 *
 *    R(x) = P(x) * x^(e*8) mod G(x)
 *
 */
static
u32 mul_by_x_pow_mul8(u32 p, u64 e) {
    /*
     * Let's expand e as a series with coefficients corresponding to the binary form:
     *
     *    e = e_0 * 2^0 + e_1 * 2^1 + ... + e_(n-1) * 2^(n-1) + e_n * 2^n
     *
     * for short:
     *
     *    e = SUM(e_i * 2^i)
     *        i in 0..63
     *
     * then:
     *
     *    R(x) = P(x) * x^(e*8) mod G(x)
     *         = P(x) * x^(SUM(e_i * 2^i) * 8) mod G(x)
     *         = P(x) * x^(SUM(e_i * 2^(i+3))) mod G(x)
     *         = P(x) * MUL(x^(e_i * 2^(i+3))) mod G(x)
     *
     * To simplify notation:
     *
     *    t_i(x, u) = x^(u * 2^(i+3))
     *
     * then:
     *
     *    R(x) = P(x) * MUL(t_i(x, e_i)) mod G(x)
     *
     * One observation is that we could apply modulo G(x) to each of the t_i terms without
     * changing the result.
     *
     * Also, each t_i can have 2 possible values:
     *
     *    t_i(x, u) = { x^(2^(i+3)) mod G(x)   for u = 1,
     *                  1                      for u = 0 }
     *
     * So t_i(x, u) is a polynomial of degree up to 31 and its 2 possible values could be precomputed.
     *
     * There are 64 t_i terms, selected for multiplication by the bits of e. That means we have to
     * do up to 64 multiplications. Each multiplication roughly doubles the degree of the polynomial and
     * multiplication is natively supported only for up to degree 64, so we would have to also
     * fold (apply the modulo operator) along the way.
     *
     * We can reduce the number of multiplications by grouping the terms, e.g. by 8:
     *
     *    R(x) = P(x) * (t_0(x, e_0)   * t_1(x, e_1)   * ... * t_7(x, e_7)   mod G(x))
     *                * (t_8(x, e_8)   * t_9(x, e_9)   * ... * t_15(x, e_15) mod G(x))
     *                  ...
     *                * (t_56(x, e_56) * t_57(x, e_57) * ... * t_63(x, e_63) mod G(x)) mod G(x)
     *
     * Each group multiplies 8 t_i terms and can have 256 possible values depending on 8 bits of e.
     * The trick is to pre-compute the values. They are stored in crc32_x_pow_radix_8_table_base_{0,8,16,24}.
     * We can also observe that t_(i+32)(x, u) = t_i(x, u), so we actually need to compute values for
     * only the first 4 groups.
     *
     * When we know that all bits in e relevant for a given group are 0, that group will evaluate to 1
     * and thus doesn't participate in the end result. So for small values of e we only have to multiply
     * the lower-order groups.
     */
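    /*
     * Illustrative note (not part of the original source): under the grouping
     * above, each table entry is assumed to hold one pre-folded group value,
     * e.g. crc32_x_pow_radix_8_table_base_0[k] = x^(8*k) mod G(x) and
     * crc32_x_pow_radix_8_table_base_8[k] = x^(8*256*k) mod G(x). So for a
     * small exponent such as e = 5, only the first lookup below fires and the
     * result is simply P(x) * (x^40 mod G(x)) mod G(x).
     */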
    // Multiply in the pre-folded group values, 8 bits of e at a time, folding
    // partial products back to 32 bits with crc32_fold_barrett_u64() as we go.
    u32 x0 = crc32_x_pow_radix_8_table_base_0[e & 0xff];
    if (e < 0x100) [[unlikely]] {
        return pmul_mod(p, x0);
    }
    u32 x1 = crc32_x_pow_radix_8_table_base_8[(e >> 8) & 0xff];
    u32 x2 = crc32_x_pow_radix_8_table_base_16[(e >> 16) & 0xff];
    u64 y0 = pmul(p, x0);
    u64 y1 = pmul(x1, x2);
    u32 z0 = crc32_fold_barrett_u64(y0);
    u32 z1 = crc32_fold_barrett_u64(y1);
    if (e < 0x1000000) [[likely]] {
        return pmul_mod(z0, z1);
    }
    u32 x3 = crc32_x_pow_radix_8_table_base_24[(e >> 24) & 0xff];
    z1 = pmul_mod(z1, x3);
    if (e < 0x100000000) [[likely]] {
        return pmul_mod(z0, z1);
    }
    // e has bits set above bit 31; since t_(i+32)(x, u) = t_i(x, u), the same
    // four tables are reused for the upper four groups.
    u32 x4 = crc32_x_pow_radix_8_table_base_0[(e >> 32) & 0xff];
    u32 x5 = crc32_x_pow_radix_8_table_base_8[(e >> 40) & 0xff];
    u32 x6 = crc32_x_pow_radix_8_table_base_16[(e >> 48) & 0xff];
    u32 x7 = crc32_x_pow_radix_8_table_base_24[(e >> 56) & 0xff];
    u64 y2 = pmul(x4, x5);
    u64 y3 = pmul(x6, x7);
    u64 u0 = pmul(z0, z1);
    u32 z2 = crc32_fold_barrett_u64(y2);
    u32 z3 = crc32_fold_barrett_u64(y3);
    u64 u1 = pmul(z2, z3);
    u32 v0 = crc32_fold_barrett_u64(u0);
    u32 v1 = crc32_fold_barrett_u64(u1);
    return pmul_mod(v0, v1);
}
u32 fast_crc32_combine(u32 crc, u32 crc2, ssize_t len2) {
    if (len2 == 0) {
        return crc;
    }
    /*
     * We need to calculate:
     *
     *    CRC(A(x) * x^|B(x)| + B(x))
     *
     * given:
     *
     *    CRC(A(x)) = crc
     *    CRC(B(x)) = crc2
     *    |B(x)|    = len2 * 8
     *
     *    CRC(A(x)) = A(x) * x^32 mod G(x)
     *    CRC(B(x)) = B(x) * x^32 mod G(x)
     *
     *    CRC(A(x) * x^|B(x)| + B(x))
     *        = (A(x) * x^|B(x)| + B(x)) * x^32 mod G(x)
     *        = ((A(x) * x^32 mod G(x)) * x^|B(x)| + (B(x) * x^32 mod G(x))) mod G(x)
     *        = (crc * x^(len2 * 8) mod G(x)) + crc2
     *
     * Addition of polynomials over the modulo-2 field is XOR, hence the ^ below.
     */
    return mul_by_x_pow_mul8(crc, len2) ^ crc2;
}
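/*
 * Usage sketch (illustrative, not part of the original source): if crc_a is
 * the CRC32 of buffer A and crc_b is the CRC32 of buffer B, both computed
 * with the matching CRC32 routine, then
 *
 *    fast_crc32_combine(crc_a, crc_b, b_len)
 *
 * where b_len is B's length in bytes, yields the CRC32 of A immediately
 * followed by B, without re-reading either buffer.
 */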
#else
// FIXME: Optimize for other archs
// That boils down to implementing crc32_fold_barrett_u64() and clmul()
// and reusing the algorithm above. For now, delegate to zlib.
#include <zlib.h>
u32 fast_crc32_combine(u32 crc, u32 crc2, ssize_t len2) {
    if (len2 == 0) {
        return crc;
    }
    return crc32_combine(crc, crc2, len2);
}
#endif