scoutfs: add dirent name fingerprint

Entries in a directory are indexed by the hash of their name.  This
introduces a perfectly random access pattern, which results in a COW
(copy-on-write) storm once directories get large enough that the leaf
blocks that store their entries are larger than our commits.  Each commit
ends up being full of COWed leaf blocks that contain a single new entry.

The dirent name fingerprints change the dirent key to start with a
fingerprint of the name.  This reduces the scope of hash randomization
from the entire directory to entries with the same fingerprint.

On real customer dir sizes and file names we saw roughly 3x create rate
improvements from being able to create more entries in leaf blocks
within a commit.

Signed-off-by: Zach Brown <zab@versity.com>
This commit is contained in:
Zach Brown
2020-05-14 15:07:52 -07:00
committed by Zach Brown
parent 0a47e8f936
commit f9ff25db23

View File

@@ -213,12 +213,44 @@ static struct scoutfs_dirent *alloc_dirent(unsigned int name_len)
return kmalloc(dirent_bytes(name_len), GFP_NOFS);
}
/*
 * Treat an array of bytes as one big-endian integer that is len bits
 * wide and test a single bit of it.  Bit numbers count up from the
 * least significant end: nr 0 is the low bit of the last byte and
 * nr (len - 1) is the high bit of the first byte.
 *
 * Returns the masked bit of the containing byte, so the result is
 * non-zero when the bit is set and 0 when it is clear.
 */
static int test_be_bytes_bit(int nr, const char *bytes, int len)
{
	/* higher bit numbers live in earlier bytes */
	int byte_idx = (len - 1 - nr) >> 3;
	int bit_mask = 1 << (nr & 7);

	return bytes[byte_idx] & bit_mask;
}
/*
* Generate a 32bit "fingerprint" of the name by extracting 32 evenly
* distributed bits from the name. The intent is to have the sort order
* of the fingerprints reflect the memcmp() sort order of the names
* while mapping large names down to small fs keys.
*
* Names that are smaller than 32bits are biased towards the high bits
* of the fingerprint so that most significant bits of the fingerprints
* consistently reflect the initial characters of the names.
*/
static u32 dirent_name_fingerprint(const char *name, unsigned int name_len)
{
int name_bits = name_len * 8;
int skip = max(name_bits / 32, 1);
u32 fp = 0;
int f;
int n;
for (f = 31, n = name_bits - 1; f >= 0 && n >= 0; f--, n -= skip)
fp |= !!test_be_bytes_bit(n, name, name_bits) << f;
return fp;
}
/*
 * Build the 64bit hash value for a dirent name.  The high 32 bits are
 * the name's fingerprint and the low 32 bits are the crc32c of the
 * entire name, seeded with ~0.
 *
 * The previous implementation combined the crc32c of each half of the
 * name and must not remain ahead of this return: it would make the
 * fingerprint unreachable and leave the hash fully randomized.
 */
static u64 dirent_name_hash(const char *name, unsigned int name_len)
{
	u64 fp = dirent_name_fingerprint(name, name_len);

	return crc32c(~0, name, name_len) | (fp << 32);
}
static u64 dirent_names_equal(const char *a_name, unsigned int a_len,