From 42e7fbb4f7ee18265624b49b3870934b3a2b560e Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Thu, 2 Jul 2020 15:10:53 -0700 Subject: [PATCH] scoutfs: switch to using fnv1a for hashing We had a few uses of crc for hashing. That was fine enough for initial testing but the huge number of xattrs that srch is recording was seeing very bad collisions from the clumsy combination of crc32c into a 64bit hash. Replace it with FNV for now. This also takes the opportunity to use 3 hash functions in the forest bloom filter so that we can extract them from the 64bit hash of the key rather than iterating and recalculating hashes for each function. Signed-off-by: Zach Brown --- kmod/src/btree.c | 4 ++-- kmod/src/dir.c | 4 ++-- kmod/src/forest.c | 16 +++++++++------- kmod/src/format.h | 5 +++-- kmod/src/hash.h | 46 ++++++++++++++++++++++++++++++++++++++++------ 5 files changed, 56 insertions(+), 19 deletions(-) diff --git a/kmod/src/btree.c b/kmod/src/btree.c index 5bf6f23d..e162cc40 100644 --- a/kmod/src/btree.c +++ b/kmod/src/btree.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include @@ -29,6 +28,7 @@ #include "block.h" #include "radix.h" #include "avl.h" +#include "hash.h" #include "scoutfs_trace.h" @@ -210,7 +210,7 @@ static int cmp_key_item(void *arg, struct scoutfs_avl_node *node) */ static int leaf_item_hash_ind(struct scoutfs_key *key) { - return crc32c(~0, key, sizeof(struct scoutfs_key)) % + return scoutfs_hash32(key, sizeof(struct scoutfs_key)) % SCOUTFS_BTREE_LEAF_ITEM_HASH_NR; } diff --git a/kmod/src/dir.c b/kmod/src/dir.c index a84d2134..89fda146 100644 --- a/kmod/src/dir.c +++ b/kmod/src/dir.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include @@ -31,6 +30,7 @@ #include "kvec.h" #include "forest.h" #include "lock.h" +#include "hash.h" #include "counters.h" #include "scoutfs_trace.h" @@ -249,7 +249,7 @@ static u32 dirent_name_fingerprint(const char *name, unsigned int name_len) static u64 dirent_name_hash(const char *name, unsigned int name_len) { - return crc32c(~0, name, name_len) | + return scoutfs_hash32(name, name_len) | ((u64)dirent_name_fingerprint(name, name_len) << 32); } diff --git a/kmod/src/forest.c b/kmod/src/forest.c index 7dc863ad..6fe22d0d 100644 --- a/kmod/src/forest.c +++ b/kmod/src/forest.c @@ -14,7 +14,6 @@ #include #include #include -#include #include "super.h" #include "format.h" @@ -24,6 +23,7 @@ #include "radix.h" #include "block.h" #include "forest.h" +#include "hash.h" #include "counters.h" #include "scoutfs_trace.h" @@ -240,18 +240,20 @@ static void read_unlock_forest_root(struct forest_info *finf, } } -/* - * XXX need something better. - */ static void calc_bloom_nrs(struct forest_bloom_nrs *bloom, struct scoutfs_key *key) { - u32 crc = ~0; + u64 hash; int i; + BUILD_BUG_ON((SCOUTFS_FOREST_BLOOM_FUNC_BITS * + SCOUTFS_FOREST_BLOOM_NRS) > 64); + + hash = scoutfs_hash64(key, sizeof(struct scoutfs_key)); + for (i = 0; i < ARRAY_SIZE(bloom->nrs); i++) { - crc = crc32c(crc, key, sizeof(struct scoutfs_key)); - bloom->nrs[i] = crc % SCOUTFS_FOREST_BLOOM_BITS; + bloom->nrs[i] = (u32)hash % SCOUTFS_FOREST_BLOOM_BITS; + hash >>= SCOUTFS_FOREST_BLOOM_FUNC_BITS; } } diff --git a/kmod/src/format.h b/kmod/src/format.h index 8418a638..ac05fecf 100644 --- a/kmod/src/format.h +++ b/kmod/src/format.h @@ -328,11 +328,12 @@ struct scoutfs_bloom_block { * before the bloom filters fill up and start returning excessive false * positives. */ -#define SCOUTFS_FOREST_BLOOM_NRS 7 +#define SCOUTFS_FOREST_BLOOM_NRS 3 #define SCOUTFS_FOREST_BLOOM_BITS \ (((SCOUTFS_BLOCK_LG_SIZE - sizeof(struct scoutfs_bloom_block)) / \ member_sizeof(struct scoutfs_bloom_block, bits[0])) * \ - member_sizeof(struct scoutfs_bloom_block, bits[0]) * 8) \ + member_sizeof(struct scoutfs_bloom_block, bits[0]) * 8) +#define SCOUTFS_FOREST_BLOOM_FUNC_BITS (SCOUTFS_BLOCK_LG_SHIFT + 3) /* * Keys are first sorted by major key zones. diff --git a/kmod/src/hash.h b/kmod/src/hash.h index 3ad3de09..9b169877 100644 --- a/kmod/src/hash.h +++ b/kmod/src/hash.h @@ -1,15 +1,49 @@ #ifndef _SCOUTFS_HASH_H_ #define _SCOUTFS_HASH_H_ -#include +/* + * We're using FNV1a for now. It's fine. Ish. + * + * The longer term plan is xxh3 but it looks like it'll take just a bit + * more time to be declared stable and then it needs to be ported to the + * kernel. + * + * - https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html + * - https://github.com/Cyan4973/xxHash/releases/tag/v0.7.4 + */ + +static inline u32 fnv1a32(const void *data, unsigned int len) +{ + u32 hash = 0x811c9dc5; + + while (len--) { + hash ^= *(u8 *)(data++); + hash *= 0x01000193; + } + + return hash; +} + +static inline u64 fnv1a64(const void *data, unsigned int len) +{ + u64 hash = 0xcbf29ce484222325; + + while (len--) { + hash ^= *(u8 *)(data++); + hash *= 0x100000001b3; + } + + return hash; +} + +static inline u32 scoutfs_hash32(const void *data, unsigned int len) +{ + return fnv1a32(data, len); +} -/* XXX replace with xxhash */ static inline u64 scoutfs_hash64(const void *data, unsigned int len) { - unsigned int half = (len + 1) / 2; - - return crc32c(~0, data, half) | - ((u64)crc32c(~0, data + len - half, half) << 32); + return fnv1a64(data, len); } #endif