scoutfs: store dirents at multiple hash values

Previously we dealt with colliding dirent hash values by storing all the
dirents that share a hash value in a big item with multiple dirents.

This complicated the code and strongly encouraged resizing items as
dirents come and go.  Resizing items isn't very easy with our simple log
segment item creation mechanism.

Instead let's deal with collisions by allowing a dirent to be stored at
multiple hash values.  The code is much simpler.

Lookup has to iterate over all possible hash values.  We can track the
greatest hash iteration stored in the directory inode to limit the
overhead of negative lookups in small directories.

Signed-off-by: Zach Brown <zab@versity.com>
This commit is contained in:
Zach Brown
2016-03-24 20:11:58 -07:00
parent 1270553f1f
commit 3755adddd5
4 changed files with 104 additions and 131 deletions

View File

@@ -24,30 +24,32 @@
#include "super.h"
/*
* Directory entries are stored in items whose offset is determined by
* the hash of the entry's name. This was primarily chosen to minimize
* the amount of data stored for each entry.
* Directory entries are stored in entries with offsets calculated from
* the hash of their entry name.
*
* Because we're hashing the name we need to worry about collisions. We
* store all the entries with the same hash value in the item. This was
* done so that create works with one specific item.
* The upside of having a single namespace of items used for both lookup
* and readdir iteration reduces the storage overhead of directories.
* The downside is that dirent operations produce random item access
* patterns.
*
* readdir iterates over these items in hash order. The high bits of
* the entry's readdir f_pos come from the item offset while the low
* bits come from a collision number in the entry.
* Hash values are limited to 31 bits to avoid bugs from use of 31 bit
* signed offsets. We also avoid bugs in network protocols limited to
* 32 bit directory positions.
*
* The full readdir position, and thus the absolute max number of
* entries in a directory, is limited to 2^31 to avoid the risk of
* breaking legacy environments. Even with a relatively small 27bit
* item offset allowing 16 colliding entries gets well into hundreds of
* millions of entries before an item fills up and we return a premature
* ENOSPC. Hundreds of millions in a single dir ought to be, wait for
* it, good enough for anybody.
* We have to worry about collisions because we're using the hash of the
* name. We simply allow a name to be stored at multiple hash value
* locations. Create iterates until it finds an unused value and lookup
* iterates until it finds an entry at a hash that matches the name. We
* can store the max iteration used during create in the directory to
* limit the number of values we'll check in lookup. With 31bit hash
* values we can get tens of thousands of entries before we use two
* hashes, hundreds for three, millions for four, and so on. The vast
* majority of directories will use one hash value.
*
* Each item's contents are protected by the dir inode's i_mutex that
* callers acquire before calling our dir operations. If we wanted more
* fine grained concurrency, and we might, we'd have to be careful to
* manage the shared items.
* This would be a crazy design in systems where dirent lookups perform
* dependent block reads down a radix or btree structure for each hash
* value. scoutfs makes this a lot cheaper by using the bloom filters
* in the log segments to short circuit negative item lookups.
*/
static unsigned int mode_to_type(umode_t mode)
@@ -94,15 +96,17 @@ static int names_equal(const char *name_a, int len_a, const char *name_b,
/*
* Return the offset portion of a dirent key from the hash of the name.
* The hash can't be 0 or 1 for . and .. and we chose to limit the max
* file->f_pos.
*
* XXX This crc nonsense is a quick hack. We'll want something a
* lot stronger like siphash.
*/
static u32 name_hash(struct inode *dir, const char *name, unsigned int len)
static u32 name_hash(const char *name, unsigned int len, u32 salt)
{
struct scoutfs_inode_info *ci = SCOUTFS_I(dir);
u32 h = crc32c(salt, name, len) & SCOUTFS_DIRENT_OFF_MASK;
return crc32c(ci->salt, name, len) >> (32 - SCOUTFS_DIRENT_OFF_BITS);
return max_t(u32, 2, min_t(u32, h, SCOUTFS_DIRENT_LAST_POS));
}
static unsigned int dent_bytes(unsigned int name_len)
@@ -110,35 +114,16 @@ static unsigned int dent_bytes(unsigned int name_len)
return sizeof(struct scoutfs_dirent) + name_len;
}
static unsigned int dent_val_off(struct scoutfs_item_ref *ref,
struct scoutfs_dirent *dent)
static unsigned int item_name_len(struct scoutfs_item_ref *ref)
{
return (char *)dent - (char *)ref->val;
return ref->val_len - sizeof(struct scoutfs_dirent);
}
static inline struct scoutfs_dirent *next_dent(struct scoutfs_item_ref *ref,
struct scoutfs_dirent *dent)
{
unsigned int next_off;
next_off = dent_val_off(ref, dent) + dent_bytes(dent->name_len);
if (next_off == ref->val_len)
return NULL;
return ref->val + next_off;
}
#define for_each_item_dent(ref, dent) \
for (dent = (ref)->val; dent; dent = next_dent(ref, dent))
/*
* Store the dirent item hash in the dentry so that we don't have to
* calculate and search to remove the item.
*/
struct dentry_info {
/*
* The key offset and collision nr are stored so that we don't
* have to either hash the name to find the item or compare
* names to find the dirent in the item.
*/
u32 key_offset;
u8 coll_nr;
u32 hash;
};
static struct kmem_cache *scoutfs_dentry_cachep;
@@ -173,16 +158,23 @@ static struct dentry_info *alloc_dentry_info(struct dentry *dentry)
static struct dentry *scoutfs_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags)
{
struct scoutfs_inode_info *si = SCOUTFS_I(dir);
struct super_block *sb = dir->i_sb;
DECLARE_SCOUTFS_ITEM_REF(ref);
struct scoutfs_dirent *dent;
struct dentry_info *di;
struct scoutfs_key key;
unsigned int name_len;
struct inode *inode;
DECLARE_SCOUTFS_ITEM_REF(ref);
u64 ino = 0;
u32 h = 0;
u32 nr = 0;
int ret;
int i;
if (si->max_dirent_hash_nr == 0) {
ret = -ENOENT;
goto out;
}
di = alloc_dentry_info(dentry);
if (IS_ERR(di)) {
@@ -195,21 +187,27 @@ static struct dentry *scoutfs_lookup(struct inode *dir, struct dentry *dentry,
goto out;
}
h = name_hash(dir, dentry->d_name.name, dentry->d_name.len);
scoutfs_set_key(&key, scoutfs_ino(dir), SCOUTFS_DIRENT_KEY, h);
h = si->salt;
for (i = 0; i < si->max_dirent_hash_nr; i++) {
h = name_hash(dentry->d_name.name, dentry->d_name.len, h);
scoutfs_set_key(&key, scoutfs_ino(dir), SCOUTFS_DIRENT_KEY, h);
ret = scoutfs_read_item(sb, &key, &ref);
if (ret)
goto out;
scoutfs_put_ref(&ref);
ret = scoutfs_read_item(sb, &key, &ref);
if (ret == -ENOENT)
continue;
if (ret < 0)
break;
ret = -ENOENT;
for_each_item_dent(&ref, dent) {
dent = ref.val;
name_len = item_name_len(&ref);
if (names_equal(dentry->d_name.name, dentry->d_name.len,
dent->name, dent->name_len)) {
dent->name, name_len)) {
ino = le64_to_cpu(dent->ino);
nr = dent->coll_nr;
ret = 0;
break;
} else {
ret = -ENOENT;
}
}
@@ -220,8 +218,7 @@ out:
} else if (ret) {
inode = ERR_PTR(ret);
} else {
di->key_offset = h;
di->coll_nr = nr;
di->hash = h;
inode = scoutfs_iget(sb, ino);
}
@@ -265,52 +262,35 @@ static int scoutfs_readdir(struct file *file, void *dirent, filldir_t filldir)
struct scoutfs_dirent *dent;
struct scoutfs_key first;
struct scoutfs_key last;
unsigned int name_len;
LIST_HEAD(iter_list);
int ret = 0;
u32 off;
u32 pos;
u32 nr;
if (!dir_emit_dots(file, dirent, filldir))
return 0;
scoutfs_set_key(&first, scoutfs_ino(inode), SCOUTFS_DIRENT_KEY,
file->f_pos >> SCOUTFS_DIRENT_COLL_BITS);
scoutfs_set_key(&last, scoutfs_ino(inode), SCOUTFS_DIRENT_KEY,
SCOUTFS_DIRENT_OFF_MASK);
SCOUTFS_DIRENT_LAST_POS);
while (file->f_pos <= SCOUTFS_DIRENT_LAST_POS) {
scoutfs_set_key(&first, scoutfs_ino(inode), SCOUTFS_DIRENT_KEY,
file->f_pos);
for(;;) {
scoutfs_put_ref(&ref);
ret = scoutfs_next_item(sb, &first, &last, &iter_list, &ref);
if (ret)
break;
/* start from first collision if we're in a new item */
if (scoutfs_key_offset(&first) == scoutfs_key_offset(ref.key))
nr = file->f_pos & SCOUTFS_DIRENT_COLL_MASK;
else
nr = 0;
dent = ref.val;
name_len = item_name_len(&ref);
pos = scoutfs_key_offset(ref.key);
off = scoutfs_key_offset(ref.key) << SCOUTFS_DIRENT_COLL_BITS;
for_each_item_dent(&ref, dent) {
if (dent->coll_nr < nr)
continue;
pos = off | dent->coll_nr;
if (filldir(dirent, dent->name, dent->name_len, pos,
le64_to_cpu(dent->ino),
dentry_type(dent->type)))
break;
file->f_pos = pos + 1;
}
/* done if filldir broke the loop */
if (dent)
if (filldir(dirent, dent->name, name_len, pos,
le64_to_cpu(dent->ino), dentry_type(dent->type)))
break;
first = *ref.key;
scoutfs_inc_key(&first);
file->f_pos = pos + 1;
}
scoutfs_put_ref(&ref);
@@ -326,6 +306,7 @@ static int scoutfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
dev_t rdev)
{
struct super_block *sb = dir->i_sb;
struct scoutfs_inode_info *si = SCOUTFS_I(dir);
struct inode *inode = NULL;
struct scoutfs_dirent *dent;
DECLARE_SCOUTFS_ITEM_REF(ref);
@@ -333,8 +314,8 @@ static int scoutfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
struct scoutfs_key key;
int bytes;
int ret;
u64 nr;
u64 h;
int i;
di = alloc_dentry_info(dentry);
if (IS_ERR(di))
@@ -347,38 +328,34 @@ static int scoutfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
if (IS_ERR(inode))
return PTR_ERR(inode);
h = name_hash(dir, dentry->d_name.name, dentry->d_name.len);
scoutfs_set_key(&key, scoutfs_ino(dir), SCOUTFS_DIRENT_KEY, h);
bytes = dent_bytes(dentry->d_name.len);
ret = scoutfs_read_item(sb, &key, &ref);
if (ret != -ENOENT) {
/* XXX implement many hashes, not coll nr */
if (WARN_ON_ONCE(!ret)) {
scoutfs_put_ref(&ref);
h = si->salt;
for (i = 0; i < SCOUTFS_MAX_DENT_HASH_NR; i++) {
h = name_hash(dentry->d_name.name, dentry->d_name.len, h);
scoutfs_set_key(&key, scoutfs_ino(dir), SCOUTFS_DIRENT_KEY, h);
ret = scoutfs_create_item(sb, &key, bytes, &ref);
if (ret != -EEXIST)
break;
}
if (ret) {
if (ret == -EEXIST)
ret = -ENOSPC;
}
goto out;
}
ret = scoutfs_create_item(sb, &key, bytes, &ref);
if (ret)
goto out;
dent = ref.val;
nr = 0;
dent->ino = cpu_to_le64(scoutfs_ino(inode));
dent->type = mode_to_type(inode->i_mode);
dent->coll_nr = nr;
dent->name_len = dentry->d_name.len;
memcpy(dent->name, dentry->d_name.name, dent->name_len);
di->key_offset = h;
di->coll_nr = nr;
memcpy(dent->name, dentry->d_name.name, dentry->d_name.len);
di->hash = h;
scoutfs_put_ref(&ref);
i_size_write(dir, i_size_read(dir) + dentry->d_name.len);
dir->i_mtime = dir->i_ctime = CURRENT_TIME;
si->max_dirent_hash_nr = max_t(int, i + 1, si->max_dirent_hash_nr);
inode->i_mtime = inode->i_atime = inode->i_ctime = dir->i_mtime;
if (S_ISDIR(mode)) {
@@ -428,14 +405,10 @@ static int scoutfs_unlink(struct inode *dir, struct dentry *dentry)
return -EINVAL;
di = dentry->d_fsdata;
trace_printk("dir size %llu entry k_off nr %u %u\n",
i_size_read(inode), di->key_offset, di->coll_nr);
if (S_ISDIR(inode->i_mode) && i_size_read(inode))
return -ENOTEMPTY;
scoutfs_set_key(&key, scoutfs_ino(dir), SCOUTFS_DIRENT_KEY,
di->key_offset);
scoutfs_set_key(&key, scoutfs_ino(dir), SCOUTFS_DIRENT_KEY, di->hash);
ret = scoutfs_read_item(sb, &key, &ref);
if (ret)

View File

@@ -225,6 +225,7 @@ struct scoutfs_inode {
__le32 mode;
__le32 rdev;
__le32 salt;
__u8 max_dirent_hash_nr;
struct scoutfs_timespec atime;
struct scoutfs_timespec ctime;
struct scoutfs_timespec mtime;
@@ -238,17 +239,17 @@ struct scoutfs_inode {
*/
struct scoutfs_dirent {
__le64 ino;
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u8 type:4,
coll_nr:4;
#else
__u8 coll_nr:4,
type:4;
#endif
__u8 name_len;
__u8 type;
__u8 name[0];
} __packed;
/*
* The max number of dirent hash values determines the overhead of
* lookups in very large directories. With 31bit offsets the number
* of entries stored before enospc tends to plateau around 200 million
* entries around 8 functions. That seems OK for now.
*/
#define SCOUTFS_MAX_DENT_HASH_NR 8
#define SCOUTFS_NAME_LEN 255
/*
@@ -257,14 +258,10 @@ struct scoutfs_dirent {
* network protocols that have limited readir positions.
*/
#define SCOUTFS_DIRENT_OFF_BITS 27
#define SCOUTFS_DIRENT_OFF_MASK ((1 << SCOUTFS_DIRENT_OFF_BITS) - 1)
#define SCOUTFS_DIRENT_COLL_BITS 4
#define SCOUTFS_DIRENT_COLL_MASK ((1 << SCOUTFS_DIRENT_COLL_BITS) - 1)
/* getdents returns the *next* pos with each entry. so we can't return ~0 */
#define SCOUTFS_DIRENT_MAX_POS \
(((1 << (SCOUTFS_DIRENT_OFF_BITS + SCOUTFS_DIRENT_COLL_BITS)) - 1) - 1)
#define SCOUTFS_DIRENT_OFF_BITS 31
#define SCOUTFS_DIRENT_OFF_MASK ((1U << SCOUTFS_DIRENT_OFF_BITS) - 1)
/* getdents returns next pos with an entry, no entry at (f_pos)~0 */
#define SCOUTFS_DIRENT_LAST_POS (INT_MAX - 1)
enum {
SCOUTFS_DT_FIFO = 0,

View File

@@ -105,6 +105,7 @@ static void load_inode(struct inode *inode, struct scoutfs_inode *cinode)
inode->i_ctime.tv_nsec = le32_to_cpu(cinode->ctime.nsec);
ci->salt = le32_to_cpu(cinode->salt);
ci->max_dirent_hash_nr = cinode->max_dirent_hash_nr;
}
static int scoutfs_read_locked_inode(struct inode *inode)
@@ -186,6 +187,7 @@ static void store_inode(struct scoutfs_inode *cinode, struct inode *inode)
cinode->mtime.nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
cinode->salt = cpu_to_le32(ci->salt);
cinode->max_dirent_hash_nr = ci->max_dirent_hash_nr;
}
/*

View File

@@ -4,6 +4,7 @@
struct scoutfs_inode_info {
u64 ino;
u32 salt;
u8 max_dirent_hash_nr;
struct inode inode;
};