Files
scylladb/utils/crc.hh
Yibo Cai (Arm Technology China) 79136e895f utils/crc: calculate crc in parallel
It achieves 2.0x speedup on intel E5 and 1.1x to 2.5x speedup on
various arm64 microarchitectures.

The algorithm cuts data into blocks of 1024 bytes and calculates the crc
for each block, which is further divided into three subblocks of 336
bytes (42 uint64) each, plus 16 remaining bytes (2 uint64).

In each iteration, three independent crcs are calculated, each consuming
one uint64 from its own subblock. This increases IPC (instructions per
cycle) considerably. After the subblocks are done, the three crcs and the
remaining two uint64 are combined using carry-less multiplication to reach
the final result for one block of 1024 bytes.

Signed-off-by: Yibo Cai <yibo.cai@arm.com>
Message-Id: <1541042759-24767-1-git-send-email-yibo.cai@arm.com>
2018-11-01 10:19:32 +02:00

250 lines
7.6 KiB
C++

/*
* Copyright (C) 2015 ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*
* A crc32 calculation for __PPC64__ uses the code from https://github.com/antonblanchard/crc32-vpmsum
* written by Anton Blanchard <anton@au.ibm.com>, IBM
*/
#pragma once
#include <cstdint>
#include <type_traits>
#include <seastar/net/byteorder.hh>
#include <seastar/core/byteorder.hh>
#if defined(__x86_64__) || defined(__i386__)
#include <smmintrin.h>
#elif defined(__aarch64__)
#include <arm_acle.h>
/* Implement the x86-64 intrinsics with the corresponding aarch64 ones */
// aarch64 stand-in for the x86 intrinsic of the same name: folds the single
// byte `in` into the running checksum `crc` via the ACLE __crc32cb intrinsic
// and returns the updated checksum.
static inline uint32_t _mm_crc32_u8(uint32_t crc, uint8_t in)
{
    const uint32_t folded = __crc32cb(crc, in);
    return folded;
}
// aarch64 stand-in for the x86 intrinsic of the same name: folds the 16-bit
// value `in` into the running checksum `crc` via the ACLE __crc32ch intrinsic
// and returns the updated checksum.
static inline uint32_t _mm_crc32_u16(uint32_t crc, uint16_t in)
{
    const uint32_t folded = __crc32ch(crc, in);
    return folded;
}
// aarch64 stand-in for the x86 intrinsic of the same name: folds the 32-bit
// value `in` into the running checksum `crc` via the ACLE __crc32cw intrinsic
// and returns the updated checksum.
static inline uint32_t _mm_crc32_u32(uint32_t crc, uint32_t in)
{
    const uint32_t folded = __crc32cw(crc, in);
    return folded;
}
// aarch64 stand-in for the x86 intrinsic of the same name: folds the 64-bit
// value `in` into the running checksum `crc` via the ACLE __crc32cd intrinsic
// and returns the updated checksum.
static inline uint32_t _mm_crc32_u64(uint32_t crc, uint64_t in)
{
    const uint32_t folded = __crc32cd(crc, in);
    return folded;
}
#else
#include <zlib.h>
#endif
// Carry-less multiplication
#if defined(__x86_64__) || defined(__i386__)
#include <wmmintrin.h>
static inline uint64_t clmul_u32(uint32_t p1, uint32_t p2)
{
__m128i p = _mm_set_epi64x(p1, p2);
p = _mm_clmulepi64_si128(p, p, 0x01);
return _mm_extract_epi64(p, 0);
}
#elif defined(__aarch64__)
#include <arm_neon.h>
// Carry-less (polynomial, GF(2)) product of two 32-bit operands, computed
// with the NEON vmull_p64 intrinsic and narrowed to 64 bits on return.
static inline uint64_t clmul_u32(uint32_t p1, uint32_t p2)
{
    const poly128_t product = vmull_p64(p1, p2);
    return product;
}
#endif
#include "utils/fragment_range.hh"
namespace utils {

/// Incremental checksum accumulator.
///
/// The running value starts at 0, is updated in place by every process*()
/// call, and is returned unmodified by get().
///
/// The backend is selected at compile time:
///  - x86 / aarch64: the hardware CRC instructions, reached through the
///    _mm_crc32_u*() names (real intrinsics on x86, shims on aarch64);
///  - PPC64: crc32_vpmsum();
///  - anything else: zlib's ::crc32().
///
/// NOTE(review): the hardware instructions compute CRC-32C, whereas zlib's
/// crc32() is presumably the IEEE CRC-32 polynomial, so the fallback would
/// not produce the same values as the accelerated paths — confirm whether
/// cross-platform agreement is required.
class crc32 {
    // Running checksum.
    uint32_t _r = 0;

#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__)
    // Reads a little-endian T from the byte stream at `p`.
    //
    // Every multi-byte load in process() goes through seastar::read_le so
    // that the buffer is never accessed through a reinterpret_cast'ed
    // uint16_t/uint32_t/uint64_t pointer (a type-aliasing violation), and so
    // that the prologue, the parallel loop and the tail all interpret the
    // bytes the same way.
    template <typename T>
    static T load_le(const uint8_t* p) {
        return seastar::read_le<T>(reinterpret_cast<const char*>(p));
    }
#endif

public:
    // All process() functions assume input is in
    // host byte order (i.e. equivalent to storing
    // the value in a buffer and crcing the buffer).
#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__)
    // On x86 use the crc32 instruction added in SSE 4.2; on aarch64 the same
    // _mm_crc32_u*() names are provided by shims over the ACLE intrinsics.

    // Each process_le() overload folds one fixed-width value into the
    // checksum. Signed arguments are converted to the unsigned parameter
    // type of the corresponding intrinsic.
    void process_le(int8_t in) {
        _r = _mm_crc32_u8(_r, in);
    }
    void process_le(uint8_t in) {
        _r = _mm_crc32_u8(_r, in);
    }
    void process_le(int16_t in) {
        _r = _mm_crc32_u16(_r, in);
    }
    void process_le(uint16_t in) {
        _r = _mm_crc32_u16(_r, in);
    }
    void process_le(int32_t in) {
        _r = _mm_crc32_u32(_r, in);
    }
    void process_le(uint32_t in) {
        _r = _mm_crc32_u32(_r, in);
    }
    void process_le(int64_t in) {
        _r = _mm_crc32_u64(_r, in);
    }
    void process_le(uint64_t in) {
        _r = _mm_crc32_u64(_r, in);
    }

    // Converts `in` with seastar::net::hton() and then folds it in through
    // the matching process_le() overload.
    template <typename T>
    void process_be(T in) {
        in = seastar::net::hton(in);
        process_le(in);
    }

    // Folds `size` bytes starting at `in` into the checksum.
    //
    // Layout of the work:
    //  1. up to 1 + 2 + 4 leading bytes are consumed so that the wide loops
    //     start on an 8-byte boundary (when the input is long enough);
    //  2. whole 1024-byte blocks are processed as three interleaved lanes;
    //  3. the remainder is consumed in 8-, 4-, 2- and 1-byte steps.
    void process(const uint8_t* in, size_t size) {
        constexpr size_t u64_bytes = sizeof(uint64_t);
        // Each 1024-byte block is three lanes of 42 u64 plus two trailing
        // u64. The clmul multipliers used below are tied to this lane
        // length and must be recomputed if it ever changes.
        constexpr int lane_words = 42;
        constexpr size_t lane_bytes = lane_words * u64_bytes;
        constexpr size_t block_bytes = 3 * lane_bytes + 2 * u64_bytes;
        static_assert(block_bytes == 1024, "parallel CRC block must be 1024 bytes");

        // --- 1. alignment prologue ---
        if ((reinterpret_cast<uintptr_t>(in) & 1) && size >= 1) {
            process_le(*in);
            ++in;
            --size;
        }
        if ((reinterpret_cast<uintptr_t>(in) & 3) && size >= 2) {
            process_le(load_le<uint16_t>(in));
            in += 2;
            size -= 2;
        }
        if ((reinterpret_cast<uintptr_t>(in) & 7) && size >= 4) {
            process_le(load_le<uint32_t>(in));
            in += 4;
            size -= 4;
        }

        // --- 2. three parallel lanes per 1024-byte block ---
        while (size >= block_bytes) {
            // Only the first lane continues from the running checksum; the
            // other two start from 0 and are merged in afterwards.
            uint32_t crc0 = _r, crc1 = 0, crc2 = 0;
            // calculate three blocks in parallel
            // - crc0: in64[ 0,  1, ...,  41]
            // - crc1: in64[42, 43, ...,  83]
            // - crc2: in64[84, 85, ..., 125]
            for (int i = 0; i < lane_words; ++i, in += u64_bytes) {
                crc0 = _mm_crc32_u64(crc0, load_le<uint64_t>(in));
                crc1 = _mm_crc32_u64(crc1, load_le<uint64_t>(in + lane_bytes));
                crc2 = _mm_crc32_u64(crc2, load_le<uint64_t>(in + 2 * lane_bytes));
            }
            // `in` has walked the first lane; skip over the other two.
            in += 2 * lane_bytes;

            // combine three blocks' crc and last two u64
            // - CRC32(crc0 * CRC32(x^(42*64*2)))
            crc0 = _mm_crc32_u64(0, clmul_u32(crc0, 0xe417f38a));
            // - CRC32(crc1 * CRC32(x^(42*64)))
            crc1 = _mm_crc32_u64(0, clmul_u32(crc1, 0x8f158014));
            // - CRC32(crc2 * x^32 + u64[-2])
            crc2 = _mm_crc32_u64(crc2, load_le<uint64_t>(in));
            in += u64_bytes;
            // - Last u64
            _r = _mm_crc32_u64(crc0 ^ crc1 ^ crc2, load_le<uint64_t>(in));
            in += u64_bytes;
            size -= block_bytes;
        }

        // --- 3. tail ---
        while (size >= 8) {
            process_le(load_le<uint64_t>(in));
            in += 8;
            size -= 8;
        }
        if (size >= 4) {
            process_le(load_le<uint32_t>(in));
            in += 4;
            size -= 4;
        }
        if (size >= 2) {
            process_le(load_le<uint16_t>(in));
            in += 2;
            size -= 2;
        }
        if (size >= 1) {
            process_le(*in);
        }
    }
#elif defined(__PPC64__)
    // Declared here and defined elsewhere; every operation in this branch
    // forwards to it.
    uint32_t crc32_vpmsum(uint32_t crc, const uint8_t* p, size_t len);

    // Folds the bytes of `in` into the checksum. On big-endian hosts the
    // value is byte-swapped first so that its bytes are fed in little-endian
    // order.
    template <class T>
    void process_le(T in) {
        static_assert(std::is_integral<T>::value, "T must be integral type.");
#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
        switch (sizeof(T)) {
        case 2: in = __builtin_bswap16(in); break;
        case 4: in = __builtin_bswap32(in); break;
        case 8: in = __builtin_bswap64(in); break;
        }
#endif
        _r = crc32_vpmsum(_r, reinterpret_cast<const uint8_t*>(&in), sizeof(T));
    }

    // Converts `in` with seastar::net::hton() and folds the resulting bytes
    // into the checksum.
    template <class T>
    void process_be(T in) {
        static_assert(std::is_integral<T>::value, "T must be integral type.");
        in = seastar::net::hton(in);
        _r = crc32_vpmsum(_r, reinterpret_cast<const uint8_t*>(&in), sizeof(T));
    }

    // Folds `size` bytes starting at `in` into the checksum.
    void process(const uint8_t* in, size_t size) {
        _r = crc32_vpmsum(_r, in, size);
    }
#else
    // Portable fallback built on zlib's ::crc32().

    // Folds the bytes of `in` into the checksum. On big-endian hosts the
    // value is byte-swapped first so that its bytes are fed in little-endian
    // order.
    template <class T>
    void process_le(T in) {
        static_assert(std::is_integral<T>::value, "T must be integral type.");
#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
        switch (sizeof(T)) {
        case 2: in = __builtin_bswap16(in); break;
        case 4: in = __builtin_bswap32(in); break;
        case 8: in = __builtin_bswap64(in); break;
        }
#endif
        _r = ::crc32(_r, reinterpret_cast<const uint8_t*>(&in), sizeof(T));
    }

    // Converts `in` with seastar::net::hton() and folds the resulting bytes
    // into the checksum.
    template <class T>
    void process_be(T in) {
        static_assert(std::is_integral<T>::value, "T must be integral type.");
        in = seastar::net::hton(in);
        _r = ::crc32(_r, reinterpret_cast<const uint8_t*>(&in), sizeof(T));
    }

    // Folds `size` bytes starting at `in` into the checksum.
    void process(const uint8_t* in, size_t size) {
        _r = ::crc32(_r, in, size);
    }
#endif

    // Folds every fragment of `buffer`, in iteration order, into the
    // checksum by calling process() on each one.
    template<typename FragmentedBuffer>
    GCC6_CONCEPT(requires FragmentRange<FragmentedBuffer>)
    void process_fragmented(const FragmentedBuffer& buffer) {
        using boost::range::for_each;
        for_each(buffer, [this] (bytes_view bv) {
            process(reinterpret_cast<const uint8_t*>(bv.data()), bv.size());
        });
    }

    // Returns the current checksum value.
    uint32_t get() const {
        return _r;
    }
};

}