diff --git a/kmod/src/dir.c b/kmod/src/dir.c index e0aac04d..50a39290 100644 --- a/kmod/src/dir.c +++ b/kmod/src/dir.c @@ -24,30 +24,32 @@ #include "super.h" /* - * Directory entries are stored in items whose offset is determined by - * the hash of the entry's name. This was primarily chosen to minimize - * the amount of data stored for each entry. + * Directory entries are stored in entries with offsets calculated from + * the hash of their entry name. * - * Because we're hashing the name we need to worry about collisions. We - * store all the entries with the same hash value in the item. This was - * done so that create works with one specific item. + * The upside of having a single namespace of items used for both lookup + * and readdir iteration reduces the storage overhead of directories. + * The downside is that dirent operations produce random item access + * patterns. * - * readdir iterates over these items in hash order. The high bits of - * the entry's readdir f_pos come from the item offset while the low - * bits come from a collision number in the entry. + * Hash values are limited to 31 bits to avoid bugs from use of 31 bit + * signed offsets. We also avoid bugs in network protocols limited to + * 32 bit directory positions. * - * The full readdir position, and thus the absolute max number of - * entries in a directory, is limited to 2^31 to avoid the risk of - * breaking legacy environments. Even with a relatively small 27bit - * item offset allowing 16 colliding entries gets well into hundreds of - * millions of entries before an item fills up and we return a premature - * ENOSPC. Hundreds of millions in a single dir ought to be, wait for - * it, good enough for anybody. + * We have to worry about collisions because we're using the hash of the + * name. We simply allow a name to be stored at multiple hash value + * locations. Create iterates until it finds an unused value and lookup + * iterates until it finds an entry at a hash that matches the name. We + * can store the max iteration used during create in the directory to + * limit the number of values we'll check in lookup. With 31bit hash + * values we can get tens of thousands of entries before we use two + * hashes, hundreds for three, millions for four, and so on. The vast + * majority of directories will use one hash value. * - * Each item's contents are protected by the dir inode's i_mutex that - * callers acquire before calling our dir operations. If we wanted more - * fine grained concurrency, and we might, we'd have to be careful to - * manage the shared items. + * This would be a crazy design in systems where dirent lookups perform + * dependent block reads down a radix or btree structure for each hash + * value. scoutfs makes this a lot cheaper by using the bloom filters + * in the log segments to short circuit negative item lookups. */ static unsigned int mode_to_type(umode_t mode) @@ -94,15 +96,17 @@ static int names_equal(const char *name_a, int len_a, const char *name_b, /* * Return the offset portion of a dirent key from the hash of the name. + * The hash can't be 0 or 1 for . and .. and we chose to limit the max + * file->f_pos. * * XXX This crc nonsense is a quick hack. We'll want something a * lot stronger like siphash. */ -static u32 name_hash(struct inode *dir, const char *name, unsigned int len) +static u32 name_hash(const char *name, unsigned int len, u32 salt) { - struct scoutfs_inode_info *ci = SCOUTFS_I(dir); + u32 h = crc32c(salt, name, len) & SCOUTFS_DIRENT_OFF_MASK; - return crc32c(ci->salt, name, len) >> (32 - SCOUTFS_DIRENT_OFF_BITS); + return max_t(u32, 2, min_t(u32, h, SCOUTFS_DIRENT_LAST_POS)); } static unsigned int dent_bytes(unsigned int name_len) @@ -110,35 +114,16 @@ static unsigned int dent_bytes(unsigned int name_len) return sizeof(struct scoutfs_dirent) + name_len; } -static unsigned int dent_val_off(struct scoutfs_item_ref *ref, - struct scoutfs_dirent *dent) +static unsigned int item_name_len(struct scoutfs_item_ref *ref) { - return (char *)dent - (char *)ref->val; + return ref->val_len - sizeof(struct scoutfs_dirent); } - -static inline struct scoutfs_dirent *next_dent(struct scoutfs_item_ref *ref, - struct scoutfs_dirent *dent) -{ - unsigned int next_off; - - next_off = dent_val_off(ref, dent) + dent_bytes(dent->name_len); - if (next_off == ref->val_len) - return NULL; - - return ref->val + next_off; -} - -#define for_each_item_dent(ref, dent) \ - for (dent = (ref)->val; dent; dent = next_dent(ref, dent)) - +/* + * Store the dirent item hash in the dentry so that we don't have to + * calculate and search to remove the item. + */ struct dentry_info { - /* - * The key offset and collision nr are stored so that we don't - * have to either hash the name to find the item or compare - * names to find the dirent in the item. - */ - u32 key_offset; - u8 coll_nr; + u32 hash; }; static struct kmem_cache *scoutfs_dentry_cachep; @@ -173,16 +158,23 @@ static struct dentry_info *alloc_dentry_info(struct dentry *dentry) static struct dentry *scoutfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { + struct scoutfs_inode_info *si = SCOUTFS_I(dir); struct super_block *sb = dir->i_sb; + DECLARE_SCOUTFS_ITEM_REF(ref); struct scoutfs_dirent *dent; struct dentry_info *di; struct scoutfs_key key; + unsigned int name_len; struct inode *inode; - DECLARE_SCOUTFS_ITEM_REF(ref); u64 ino = 0; u32 h = 0; - u32 nr = 0; int ret; + int i; + + if (si->max_dirent_hash_nr == 0) { + ret = -ENOENT; + goto out; + } di = alloc_dentry_info(dentry); if (IS_ERR(di)) { @@ -195,21 +187,27 @@ static struct dentry *scoutfs_lookup(struct inode *dir, struct dentry *dentry, goto out; } - h = name_hash(dir, dentry->d_name.name, dentry->d_name.len); - scoutfs_set_key(&key, scoutfs_ino(dir), SCOUTFS_DIRENT_KEY, h); + h = si->salt; + for (i = 0; i < si->max_dirent_hash_nr; i++) { + h = name_hash(dentry->d_name.name, dentry->d_name.len, h); + scoutfs_set_key(&key, scoutfs_ino(dir), SCOUTFS_DIRENT_KEY, h); - ret = scoutfs_read_item(sb, &key, &ref); - if (ret) - goto out; + scoutfs_put_ref(&ref); + ret = scoutfs_read_item(sb, &key, &ref); + if (ret == -ENOENT) + continue; + if (ret < 0) + break; - ret = -ENOENT; - for_each_item_dent(&ref, dent) { + dent = ref.val; + name_len = item_name_len(&ref); if (names_equal(dentry->d_name.name, dentry->d_name.len, - dent->name, dent->name_len)) { + dent->name, name_len)) { ino = le64_to_cpu(dent->ino); - nr = dent->coll_nr; ret = 0; break; + } else { + ret = -ENOENT; } } @@ -220,8 +218,7 @@ out: } else if (ret) { inode = ERR_PTR(ret); } else { - di->key_offset = h; - di->coll_nr = nr; + di->hash = h; inode = scoutfs_iget(sb, ino); } @@ -265,52 +262,35 @@ static int scoutfs_readdir(struct file *file, void *dirent, filldir_t filldir) struct scoutfs_dirent *dent; struct scoutfs_key first; struct scoutfs_key last; + unsigned int name_len; LIST_HEAD(iter_list); int ret = 0; - u32 off; u32 pos; - u32 nr; if (!dir_emit_dots(file, dirent, filldir)) return 0; - scoutfs_set_key(&first, scoutfs_ino(inode), SCOUTFS_DIRENT_KEY, - file->f_pos >> SCOUTFS_DIRENT_COLL_BITS); scoutfs_set_key(&last, scoutfs_ino(inode), SCOUTFS_DIRENT_KEY, - SCOUTFS_DIRENT_OFF_MASK); + SCOUTFS_DIRENT_LAST_POS); + + while (file->f_pos <= SCOUTFS_DIRENT_LAST_POS) { + scoutfs_set_key(&first, scoutfs_ino(inode), SCOUTFS_DIRENT_KEY, + file->f_pos); - for(;;) { scoutfs_put_ref(&ref); ret = scoutfs_next_item(sb, &first, &last, &iter_list, &ref); if (ret) break; - /* start from first collision if we're in a new item */ - if (scoutfs_key_offset(&first) == scoutfs_key_offset(ref.key)) - nr = file->f_pos & SCOUTFS_DIRENT_COLL_MASK; - else - nr = 0; + dent = ref.val; + name_len = item_name_len(&ref); + pos = scoutfs_key_offset(ref.key); - off = scoutfs_key_offset(ref.key) << SCOUTFS_DIRENT_COLL_BITS; - for_each_item_dent(&ref, dent) { - if (dent->coll_nr < nr) - continue; - - pos = off | dent->coll_nr; - - if (filldir(dirent, dent->name, dent->name_len, pos, - le64_to_cpu(dent->ino), - dentry_type(dent->type))) - break; - - file->f_pos = pos + 1; - } - /* done if filldir broke the loop */ - if (dent) + if (filldir(dirent, dent->name, name_len, pos, + le64_to_cpu(dent->ino), dentry_type(dent->type))) break; - first = *ref.key; - scoutfs_inc_key(&first); + file->f_pos = pos + 1; } scoutfs_put_ref(&ref); @@ -326,6 +306,7 @@ static int scoutfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct super_block *sb = dir->i_sb; + struct scoutfs_inode_info *si = SCOUTFS_I(dir); struct inode *inode = NULL; struct scoutfs_dirent *dent; DECLARE_SCOUTFS_ITEM_REF(ref); @@ -333,8 +314,8 @@ static int scoutfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, struct scoutfs_key key; int bytes; int ret; - u64 nr; u64 h; + int i; di = alloc_dentry_info(dentry); if (IS_ERR(di)) @@ -347,38 +328,34 @@ static int scoutfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, if (IS_ERR(inode)) return PTR_ERR(inode); - h = name_hash(dir, dentry->d_name.name, dentry->d_name.len); - scoutfs_set_key(&key, scoutfs_ino(dir), SCOUTFS_DIRENT_KEY, h); bytes = dent_bytes(dentry->d_name.len); - ret = scoutfs_read_item(sb, &key, &ref); - if (ret != -ENOENT) { - /* XXX implement many hashes, not coll nr */ - if (WARN_ON_ONCE(!ret)) { - scoutfs_put_ref(&ref); + h = si->salt; + for (i = 0; i < SCOUTFS_MAX_DENT_HASH_NR; i++) { + h = name_hash(dentry->d_name.name, dentry->d_name.len, h); + scoutfs_set_key(&key, scoutfs_ino(dir), SCOUTFS_DIRENT_KEY, h); + + ret = scoutfs_create_item(sb, &key, bytes, &ref); + if (ret != -EEXIST) + break; + } + if (ret) { + if (ret == -EEXIST) ret = -ENOSPC; - } goto out; } - ret = scoutfs_create_item(sb, &key, bytes, &ref); - if (ret) - goto out; - dent = ref.val; - nr = 0; dent->ino = cpu_to_le64(scoutfs_ino(inode)); dent->type = mode_to_type(inode->i_mode); - dent->coll_nr = nr; - dent->name_len = dentry->d_name.len; - memcpy(dent->name, dentry->d_name.name, dent->name_len); - di->key_offset = h; - di->coll_nr = nr; + memcpy(dent->name, dentry->d_name.name, dentry->d_name.len); + di->hash = h; scoutfs_put_ref(&ref); i_size_write(dir, i_size_read(dir) + dentry->d_name.len); dir->i_mtime = dir->i_ctime = CURRENT_TIME; + si->max_dirent_hash_nr = max_t(int, i + 1, si->max_dirent_hash_nr); inode->i_mtime = inode->i_atime = inode->i_ctime = dir->i_mtime; if (S_ISDIR(mode)) { @@ -428,14 +405,10 @@ static int scoutfs_unlink(struct inode *dir, struct dentry *dentry) return -EINVAL; di = dentry->d_fsdata; - trace_printk("dir size %llu entry k_off nr %u %u\n", - i_size_read(inode), di->key_offset, di->coll_nr); - if (S_ISDIR(inode->i_mode) && i_size_read(inode)) return -ENOTEMPTY; - scoutfs_set_key(&key, scoutfs_ino(dir), SCOUTFS_DIRENT_KEY, - di->key_offset); + scoutfs_set_key(&key, scoutfs_ino(dir), SCOUTFS_DIRENT_KEY, di->hash); ret = scoutfs_read_item(sb, &key, &ref); if (ret) diff --git a/kmod/src/format.h b/kmod/src/format.h index 1310f8d7..989592e3 100644 --- a/kmod/src/format.h +++ b/kmod/src/format.h @@ -225,6 +225,7 @@ struct scoutfs_inode { __le32 mode; __le32 rdev; __le32 salt; + __u8 max_dirent_hash_nr; struct scoutfs_timespec atime; struct scoutfs_timespec ctime; struct scoutfs_timespec mtime; @@ -238,17 +239,17 @@ struct scoutfs_inode { */ struct scoutfs_dirent { __le64 ino; -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u8 type:4, - coll_nr:4; -#else - __u8 coll_nr:4, - type:4; -#endif - __u8 name_len; + __u8 type; __u8 name[0]; } __packed; +/* + * The max number of dirent hash values determines the overhead of + * lookups in very large directories. With 31bit offsets the number + * of entries stored before enospc tends to plateau around 200 million + * entries around 8 functions. That seems OK for now. + */ +#define SCOUTFS_MAX_DENT_HASH_NR 8 #define SCOUTFS_NAME_LEN 255 /* @@ -257,14 +258,10 @@ struct scoutfs_dirent { * network protocols that have limited readir positions. */ -#define SCOUTFS_DIRENT_OFF_BITS 27 -#define SCOUTFS_DIRENT_OFF_MASK ((1 << SCOUTFS_DIRENT_OFF_BITS) - 1) -#define SCOUTFS_DIRENT_COLL_BITS 4 -#define SCOUTFS_DIRENT_COLL_MASK ((1 << SCOUTFS_DIRENT_COLL_BITS) - 1) - -/* getdents returns the *next* pos with each entry. so we can't return ~0 */ -#define SCOUTFS_DIRENT_MAX_POS \ - (((1 << (SCOUTFS_DIRENT_OFF_BITS + SCOUTFS_DIRENT_COLL_BITS)) - 1) - 1) +#define SCOUTFS_DIRENT_OFF_BITS 31 +#define SCOUTFS_DIRENT_OFF_MASK ((1U << SCOUTFS_DIRENT_OFF_BITS) - 1) +/* getdents returns next pos with an entry, no entry at (f_pos)~0 */ +#define SCOUTFS_DIRENT_LAST_POS (INT_MAX - 1) enum { SCOUTFS_DT_FIFO = 0, diff --git a/kmod/src/inode.c b/kmod/src/inode.c index 99187b2c..ba5eb8c4 100644 --- a/kmod/src/inode.c +++ b/kmod/src/inode.c @@ -105,6 +105,7 @@ static void load_inode(struct inode *inode, struct scoutfs_inode *cinode) inode->i_ctime.tv_nsec = le32_to_cpu(cinode->ctime.nsec); ci->salt = le32_to_cpu(cinode->salt); + ci->max_dirent_hash_nr = cinode->max_dirent_hash_nr; } static int scoutfs_read_locked_inode(struct inode *inode) @@ -186,6 +187,7 @@ static void store_inode(struct scoutfs_inode *cinode, struct inode *inode) cinode->mtime.nsec = cpu_to_le32(inode->i_mtime.tv_nsec); cinode->salt = cpu_to_le32(ci->salt); + cinode->max_dirent_hash_nr = ci->max_dirent_hash_nr; } /* diff --git a/kmod/src/inode.h b/kmod/src/inode.h index d7009352..3da21640 100644 --- a/kmod/src/inode.h +++ b/kmod/src/inode.h @@ -4,6 +4,7 @@ struct scoutfs_inode_info { u64 ino; u32 salt; + u8 max_dirent_hash_nr; struct inode inode; };