diff --git a/kmod/src/count.h b/kmod/src/count.h index 30b55ca5..f115d034 100644 --- a/kmod/src/count.h +++ b/kmod/src/count.h @@ -71,17 +71,14 @@ static inline const struct scoutfs_item_count SIC_DIRTY_INODE(void) } /* - * Adding a dirent adds the entry key, readdir key, and backref. + * Directory entries are stored in three items. */ static inline void __count_dirents(struct scoutfs_item_count *cnt, unsigned name_len) { - cnt->items += 3; - cnt->keys += offsetof(struct scoutfs_dirent_key, name[name_len]) + - sizeof(struct scoutfs_readdir_key) + - offsetof(struct scoutfs_link_backref_key, name[name_len]); - cnt->vals += 2 * offsetof(struct scoutfs_dirent, name[name_len]); + cnt->keys += 3 * sizeof(struct scoutfs_dirent_key); + cnt->vals += 3 * offsetof(struct scoutfs_dirent, name[name_len]); } static inline void __count_sym_target(struct scoutfs_item_count *cnt, diff --git a/kmod/src/dir.c b/kmod/src/dir.c index 3f980737..f32dfe6c 100644 --- a/kmod/src/dir.c +++ b/kmod/src/dir.c @@ -35,34 +35,31 @@ #include "scoutfs_trace.h" /* - * Directory entries are stored in entries with offsets calculated from - * the hash of their entry name. + * Directory entries are stored in three different items. Each has the + * same key format and all have identical values which contain the full + * entry name. * - * Having a single index of items used for both lookup and readdir - * iteration reduces the storage overhead of directories. It also - * avoids having to manage the allocation of readdir positions as - * directories age and the aggregate create count inches towards the - * small 31 bit position limit. The downside is that dirent name - * operations produce random item access patterns. + * Entries for name lookup are stored at the hash of the name and the + * readdir position. Including the position lets us create names + * without having to read the items to check for hash collisions. 
+ * Lookup iterates over all the positions with the same hash values and + * compares the names. * - * Hash values are limited to 31 bits primarily to support older - * deployed protocols that only support 31 bits of file entry offsets, - * but also to avoid unlikely bugs in programs that store offsets in - * signed ints. + * Entries for readdir are stored in an increasing unique readdir + * position. This results in returning entries in creation order which + * matches inode allocation order and avoids random inode access + * patterns during readdir. * - * We have to worry about hash collisions. We linearly probe a fixed - * number of hash values past the natural value. In a typical small - * directory this search will terminate immediately because adjacent - * items will have distant offset values. It's only as the directory - * gets very large that hash values will start to be this dense and - * sweeping over items in a btree leaf is reasonably efficient. + * Entries for link backref traversal are stored at the target inode + * sorted by the parent dir and the entry's position in the parent dir. + * This keeps link backref users away from the higher contention area of + * dirent items in parent dirs. * - * For each directory entry item stored in a directory inode there is a - * corresponding link backref item stored at the target inode. This - * lets us find all the paths that refer to a given inode. The link - * backref offset comes from an advancing counter in the inode and the - * item value contains the dir inode and dirent offset of the referring - * link. + * All the entries have a dirent struct with the full name in their + * value. The dirent struct contains the name hash and readdir position + * so that any item use can reference all the items for a given entry. + * This is important for deleting all the items given a dentry that was + * populated by lookup. 
*/ static unsigned int mode_to_type(umode_t mode) @@ -102,14 +99,15 @@ static unsigned int dentry_type(unsigned int type) } /* - * @readdir_pos lets us remove items on final unlink without having to - * look them up. + * @lock_cov: tells revalidation that the dentry is still locked and valid. * - * @lock_cov tells revalidation that the dentry is still locked and valid. + * @pos, @hash: lets us remove items on final unlink without having to + * look them up. */ struct dentry_info { - u64 readdir_pos; struct scoutfs_lock_coverage lock_cov; + u64 hash; + u64 pos; }; static struct kmem_cache *dentry_info_cache; @@ -161,15 +159,26 @@ static int alloc_dentry_info(struct dentry *dentry) } static void update_dentry_info(struct super_block *sb, struct dentry *dentry, - u64 pos, struct scoutfs_lock *lock) + u64 hash, u64 pos, struct scoutfs_lock *lock) { struct dentry_info *di = dentry->d_fsdata; if (WARN_ON_ONCE(di == NULL)) return; - di->readdir_pos = pos; scoutfs_lock_add_coverage(sb, lock, &di->lock_cov); + di->hash = hash; + di->pos = pos; +} + +static u64 dentry_info_hash(struct dentry *dentry) +{ + struct dentry_info *di = dentry->d_fsdata; + + if (WARN_ON_ONCE(di == NULL)) + return 0; + + return di->hash; } static u64 dentry_info_pos(struct dentry *dentry) @@ -179,88 +188,102 @@ static u64 dentry_info_pos(struct dentry *dentry) if (WARN_ON_ONCE(di == NULL)) return 0; - return di->readdir_pos; + return di->pos; } -static struct scoutfs_key_buf *alloc_dirent_key(struct super_block *sb, - u64 dir_ino, const char *name, - unsigned name_len) +static void init_dirent_key(struct scoutfs_key_buf *key, + struct scoutfs_dirent_key *dkey, u8 type, + u64 ino, u64 major, u64 minor) { - struct scoutfs_dirent_key *dkey; - struct scoutfs_key_buf *key; + dkey->zone = SCOUTFS_FS_ZONE; + dkey->ino = cpu_to_be64(ino); + dkey->type = type; + dkey->major = cpu_to_be64(major); + dkey->minor = cpu_to_be64(minor); - key = scoutfs_key_alloc(sb, offsetof(struct scoutfs_dirent_key, - 
name[name_len])); - if (key) { - dkey = key->data; - dkey->zone = SCOUTFS_FS_ZONE; - dkey->ino = cpu_to_be64(dir_ino); - dkey->type = SCOUTFS_DIRENT_TYPE; - memcpy(dkey->name, (void *)name, name_len); - } - - return key; + scoutfs_key_init(key, dkey, sizeof(struct scoutfs_dirent_key)); } -static void init_link_backref_key(struct scoutfs_key_buf *key, - struct scoutfs_link_backref_key *lbrkey, - u64 ino, u64 dir_ino, - const char *name, unsigned name_len) +static unsigned int dirent_bytes(unsigned int name_len) { - lbrkey->zone = SCOUTFS_FS_ZONE; - lbrkey->ino = cpu_to_be64(ino); - lbrkey->type = SCOUTFS_LINK_BACKREF_TYPE; - lbrkey->dir_ino = cpu_to_be64(dir_ino); - if (name_len) - memcpy(lbrkey->name, name, name_len); - - scoutfs_key_init(key, lbrkey, offsetof(struct scoutfs_link_backref_key, - name[name_len])); + return offsetof(struct scoutfs_dirent, name[name_len]); } -static struct scoutfs_key_buf *alloc_link_backref_key(struct super_block *sb, - u64 ino, u64 dir_ino, - const char *name, - unsigned name_len) +static struct scoutfs_dirent *alloc_dirent(unsigned int name_len) { - struct scoutfs_link_backref_key *lbkey; - struct scoutfs_key_buf *key; + return kmalloc(dirent_bytes(name_len), GFP_NOFS); +} - key = scoutfs_key_alloc(sb, offsetof(struct scoutfs_link_backref_key, - name[name_len])); - if (key) { - lbkey = key->data; - init_link_backref_key(key, lbkey, ino, dir_ino, - name, name_len); - } +static u64 dirent_name_hash(const char *name, unsigned int name_len) +{ + unsigned int half = (name_len + 1) / 2; - return key; + return crc32c(~0, name, half) | + ((u64)crc32c(~0, name + name_len - half, half) << 32); +} + +static u64 dirent_names_equal(const char *a_name, unsigned int a_len, + const char *b_name, unsigned int b_len) +{ + return a_len == b_len && memcmp(a_name, b_name, a_len) == 0; } /* * Looks for the dirent item and fills the caller's dirent if it finds * it. Returns item lookup errors including -ENOENT if it's not found. 
*/ -static int lookup_dirent(struct super_block *sb, struct inode *dir, - const char *name, unsigned name_len, - struct scoutfs_dirent *dent, +static int lookup_dirent(struct super_block *sb, u64 dir_ino, const char *name, + unsigned name_len, u64 hash, + struct scoutfs_dirent *dent_ret, struct scoutfs_lock *lock) { - struct scoutfs_key_buf *key = NULL; + struct scoutfs_dirent_key last_dkey; + struct scoutfs_dirent_key dkey; + struct scoutfs_key_buf last_key; + struct scoutfs_key_buf key; + struct scoutfs_dirent *dent = NULL; struct kvec val; int ret; - key = alloc_dirent_key(sb, scoutfs_ino(dir), name, name_len); - if (!key) { + dent = alloc_dirent(SCOUTFS_NAME_LEN); + if (!dent) { ret = -ENOMEM; goto out; } - kvec_init(&val, dent, sizeof(struct scoutfs_dirent)); + init_dirent_key(&key, &dkey, SCOUTFS_DIRENT_TYPE, + dir_ino, hash, 0); + init_dirent_key(&last_key, &last_dkey, SCOUTFS_DIRENT_TYPE, + dir_ino, hash, U64_MAX); + kvec_init(&val, dent, dirent_bytes(SCOUTFS_NAME_LEN)); + + for (;;) { + ret = scoutfs_item_next(sb, &key, &last_key, &val, lock); + if (ret < 0) + break; + + ret -= sizeof(struct scoutfs_dirent); + /* XXX corruption */ + if (ret < 1 || ret > SCOUTFS_NAME_LEN) { + ret = -EIO; + goto out; + } + + if (dirent_names_equal(name, name_len, dent->name, ret)) { + *dent_ret = *dent; + ret = 0; + break; + } + + if (be64_to_cpu(dkey.minor) == U64_MAX) { + ret = -ENOENT; + break; + } + be64_add_cpu(&dkey.minor, 1); + } - ret = scoutfs_item_lookup_exact(sb, key, &val, lock); out: - scoutfs_key_free(sb, key); + kfree(dent); return ret; } @@ -318,18 +341,24 @@ static int scoutfs_d_revalidate(struct dentry *dentry, unsigned int flags) if (ret) goto out; - ret = lookup_dirent(sb, dir, dentry->d_name.name, dentry->d_name.len, + ret = lookup_dirent(sb, scoutfs_ino(dir), + dentry->d_name.name, dentry->d_name.len, + dirent_name_hash(dentry->d_name.name, + dentry->d_name.len), &dent, lock); - if (ret == -ENOENT) + if (ret == -ENOENT) { dent.ino = 0; - else if (ret < 
0) + dent.hash = 0; + dent.pos = 0; + } else if (ret < 0) { goto out; + } dentry_ino = dentry->d_inode ? scoutfs_ino(dentry->d_inode) : 0; if ((dentry_ino == le64_to_cpu(dent.ino))) { - update_dentry_info(sb, dentry, le64_to_cpu(dent.readdir_pos), - lock); + update_dentry_info(sb, dentry, le64_to_cpu(dent.hash), + le64_to_cpu(dent.pos), lock); scoutfs_inc_counter(sb, dentry_revalidate_valid); ret = 1; } else { @@ -368,8 +397,11 @@ static struct dentry *scoutfs_lookup(struct inode *dir, struct dentry *dentry, struct scoutfs_dirent dent; struct inode *inode; u64 ino = 0; + u64 hash; int ret; + hash = dirent_name_hash(dentry->d_name.name, dentry->d_name.len); + if (dentry->d_name.len > SCOUTFS_NAME_LEN) { ret = -ENAMETOOLONG; goto out; @@ -383,15 +415,15 @@ static struct dentry *scoutfs_lookup(struct inode *dir, struct dentry *dentry, if (ret) goto out; - ret = lookup_dirent(sb, dir, dentry->d_name.name, dentry->d_name.len, - &dent, dir_lock); + ret = lookup_dirent(sb, scoutfs_ino(dir), dentry->d_name.name, + dentry->d_name.len, hash, &dent, dir_lock); if (ret == -ENOENT) { ino = 0; ret = 0; } else if (ret == 0) { ino = le64_to_cpu(dent.ino); - update_dentry_info(sb, dentry, le64_to_cpu(dent.readdir_pos), - dir_lock); + update_dentry_info(sb, dentry, le64_to_cpu(dent.hash), + le64_to_cpu(dent.pos), dir_lock); } scoutfs_unlock(sb, dir_lock, DLM_LOCK_PR); @@ -428,18 +460,6 @@ static int dir_emit_dots(struct file *file, void *dirent, filldir_t filldir) return 1; } -static void init_readdir_key(struct scoutfs_key_buf *key, - struct scoutfs_readdir_key *rkey, u64 dir_ino, - loff_t pos) -{ - rkey->zone = SCOUTFS_FS_ZONE; - rkey->ino = cpu_to_be64(dir_ino); - rkey->type = SCOUTFS_READDIR_TYPE; - rkey->pos = cpu_to_be64(pos); - - scoutfs_key_init(key, rkey, sizeof(struct scoutfs_readdir_key)); -} - /* * readdir simply iterates over the dirent items for the dir inode and * uses their offset as the readdir position. 
@@ -454,10 +474,9 @@ static int scoutfs_readdir(struct file *file, void *dirent, filldir_t filldir) struct scoutfs_dirent *dent; struct scoutfs_key_buf key; struct scoutfs_key_buf last_key; - struct scoutfs_readdir_key rkey; - struct scoutfs_readdir_key last_rkey; + struct scoutfs_dirent_key dkey; + struct scoutfs_dirent_key last_dkey; struct scoutfs_lock *dir_lock; - unsigned int item_len; unsigned int name_len; struct kvec val; u64 pos; @@ -466,27 +485,26 @@ static int scoutfs_readdir(struct file *file, void *dirent, filldir_t filldir) if (!dir_emit_dots(file, dirent, filldir)) return 0; - ret = scoutfs_lock_inode(sb, DLM_LOCK_PR, 0, inode, &dir_lock); - if (ret) - return ret; - - init_readdir_key(&last_key, &last_rkey, scoutfs_ino(inode), - SCOUTFS_DIRENT_LAST_POS); - - item_len = offsetof(struct scoutfs_dirent, name[SCOUTFS_NAME_LEN]); - dent = kmalloc(item_len, GFP_KERNEL); + dent = alloc_dirent(SCOUTFS_NAME_LEN); if (!dent) { ret = -ENOMEM; goto out; } - for (;;) { - init_readdir_key(&key, &rkey, scoutfs_ino(inode), file->f_pos); + init_dirent_key(&last_key, &last_dkey, SCOUTFS_READDIR_TYPE, + scoutfs_ino(inode), SCOUTFS_DIRENT_LAST_POS, 0); + kvec_init(&val, dent, dirent_bytes(SCOUTFS_NAME_LEN)); + + ret = scoutfs_lock_inode(sb, DLM_LOCK_PR, 0, inode, &dir_lock); + if (ret) + goto out; + + for (;;) { + init_dirent_key(&key, &dkey, SCOUTFS_READDIR_TYPE, + scoutfs_ino(inode), file->f_pos, 0); - kvec_init(&val, dent, item_len); ret = scoutfs_item_next_same_min(sb, &key, &last_key, &val, - offsetof(struct scoutfs_dirent, name[1]), - dir_lock); + dirent_bytes(1), dir_lock); if (ret < 0) { if (ret == -ENOENT) ret = 0; @@ -494,7 +512,7 @@ static int scoutfs_readdir(struct file *file, void *dirent, filldir_t filldir) } name_len = ret - sizeof(struct scoutfs_dirent); - pos = be64_to_cpu(rkey.pos); + pos = be64_to_cpu(dkey.major); if (filldir(dirent, dent->name, name_len, pos, le64_to_cpu(dent->ino), dentry_type(dent->type))) { @@ -519,69 +537,63 @@ out: * * If this 
returns an error then nothing will have changed. */ -static int add_entry_items(struct super_block *sb, u64 dir_ino, u64 pos, - const char *name, unsigned name_len, u64 ino, - umode_t mode, struct scoutfs_lock *dir_lock, +static int add_entry_items(struct super_block *sb, u64 dir_ino, u64 hash, + u64 pos, const char *name, unsigned name_len, + u64 ino, umode_t mode, struct scoutfs_lock *dir_lock, struct scoutfs_lock *inode_lock) { - struct scoutfs_key_buf *ent_key = NULL; - struct scoutfs_key_buf *lb_key = NULL; - struct scoutfs_dirent *dent = NULL; + struct scoutfs_dirent_key rdir_dkey; + struct scoutfs_dirent_key ent_dkey; + struct scoutfs_dirent_key lb_dkey; struct scoutfs_key_buf rdir_key; - struct scoutfs_readdir_key rkey; + struct scoutfs_key_buf ent_key; + struct scoutfs_key_buf lb_key; + struct scoutfs_dirent *dent; bool del_ent = false; bool del_rdir = false; struct kvec val; int ret; - ent_key = alloc_dirent_key(sb, dir_ino, name, name_len); - dent = kmalloc(offsetof(struct scoutfs_dirent, name[name_len]), - GFP_NOFS); - if (!ent_key || !dent) { + dent = alloc_dirent(name_len); + if (!dent) { ret = -ENOMEM; goto out; } /* initialize the dent */ dent->ino = cpu_to_le64(ino); - dent->readdir_pos = cpu_to_le64(pos); + dent->hash = cpu_to_le64(hash); + dent->pos = cpu_to_le64(pos); dent->type = mode_to_type(mode); memcpy(dent->name, name, name_len); - /* dirent item for lookup */ - kvec_init(&val, dent, sizeof(struct scoutfs_dirent)); - ret = scoutfs_item_create(sb, ent_key, &val, dir_lock); + init_dirent_key(&ent_key, &ent_dkey, SCOUTFS_DIRENT_TYPE, + dir_ino, hash, pos); + init_dirent_key(&rdir_key, &rdir_dkey, SCOUTFS_READDIR_TYPE, + dir_ino, pos, 0); + init_dirent_key(&lb_key, &lb_dkey, SCOUTFS_LINK_BACKREF_TYPE, + ino, dir_ino, pos); + kvec_init(&val, dent, dirent_bytes(name_len)); + + ret = scoutfs_item_create(sb, &ent_key, &val, dir_lock); if (ret) goto out; del_ent = true; - /* readdir item for .. 
readdir */ - init_readdir_key(&rdir_key, &rkey, dir_ino, pos); - kvec_init(&val, dent, offsetof(struct scoutfs_dirent, name[name_len])); - ret = scoutfs_item_create(sb, &rdir_key, &val, dir_lock); if (ret) goto out; del_rdir = true; - /* link backref item for inode to path resolution */ - lb_key = alloc_link_backref_key(sb, ino, dir_ino, name, name_len); - if (!lb_key) { - ret = -ENOMEM; - goto out; - } - - ret = scoutfs_item_create(sb, lb_key, NULL, inode_lock); + ret = scoutfs_item_create(sb, &lb_key, &val, inode_lock); out: if (ret < 0) { if (del_ent) - scoutfs_item_delete_dirty(sb, ent_key); + scoutfs_item_delete_dirty(sb, &ent_key); if (del_rdir) scoutfs_item_delete_dirty(sb, &rdir_key); } - scoutfs_key_free(sb, ent_key); - scoutfs_key_free(sb, lb_key); kfree(dent); return ret; @@ -592,49 +604,40 @@ out: * Only items are modified. The caller is responsible for locking, * entering a transaction, dirtying items, and managing the vfs structs. * - * The items match the items used in add_entry_items() but we don't have - * to worry about values here and we can dirty all the items before - * starting to delete them which makes cleanup a little easier. - * * If this returns an error then nothing will have changed. 
*/ -static int del_entry_items(struct super_block *sb, u64 dir_ino, u64 pos, - const char *name, unsigned name_len, u64 ino, - struct scoutfs_lock *dir_lock, +static int del_entry_items(struct super_block *sb, u64 dir_ino, u64 hash, + u64 pos, u64 ino, struct scoutfs_lock *dir_lock, struct scoutfs_lock *inode_lock) { - struct scoutfs_key_buf *ent_key; - struct scoutfs_key_buf *lb_key; + struct scoutfs_dirent_key rdir_dkey; + struct scoutfs_dirent_key ent_dkey; + struct scoutfs_dirent_key lb_dkey; struct scoutfs_key_buf rdir_key; - struct scoutfs_readdir_key rkey; + struct scoutfs_key_buf ent_key; + struct scoutfs_key_buf lb_key; + LIST_HEAD(dir_saved); + LIST_HEAD(inode_saved); int ret; - ent_key = alloc_dirent_key(sb, dir_ino, name, name_len); - if (!ent_key) - return -ENOMEM; + init_dirent_key(&ent_key, &ent_dkey, SCOUTFS_DIRENT_TYPE, + dir_ino, hash, pos); + init_dirent_key(&rdir_key, &rdir_dkey, SCOUTFS_READDIR_TYPE, + dir_ino, pos, 0); + init_dirent_key(&lb_key, &lb_dkey, SCOUTFS_LINK_BACKREF_TYPE, + ino, dir_ino, pos); - init_readdir_key(&rdir_key, &rkey, dir_ino, pos); - - lb_key = alloc_link_backref_key(sb, ino, dir_ino, name, name_len); - if (!lb_key) { - ret = -ENOMEM; - goto out; + ret = scoutfs_item_delete_save(sb, &ent_key, &dir_saved, dir_lock) ?: + scoutfs_item_delete_save(sb, &rdir_key, &dir_saved, dir_lock) ?: + scoutfs_item_delete_save(sb, &lb_key, &inode_saved, inode_lock); + if (ret < 0) { + scoutfs_item_restore(sb, &dir_saved, dir_lock); + scoutfs_item_restore(sb, &inode_saved, inode_lock); + } else { + scoutfs_item_free_batch(sb, &dir_saved); + scoutfs_item_free_batch(sb, &inode_saved); } - ret = scoutfs_item_dirty(sb, ent_key, dir_lock) ?: - scoutfs_item_dirty(sb, &rdir_key, dir_lock) ?: - scoutfs_item_dirty(sb, lb_key, inode_lock); - if (ret) - goto out; - - scoutfs_item_delete_dirty(sb, ent_key); - scoutfs_item_delete_dirty(sb, &rdir_key); - scoutfs_item_delete_dirty(sb, lb_key); - ret = 0; - -out: - kfree(ent_key); - kfree(lb_key); return 
ret; } @@ -724,12 +727,14 @@ static int scoutfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, struct scoutfs_lock *dir_lock = NULL; struct scoutfs_lock *inode_lock = NULL; LIST_HEAD(ind_locks); + u64 hash; u64 pos; int ret; if (dentry->d_name.len > SCOUTFS_NAME_LEN) return -ENAMETOOLONG; + hash = dirent_name_hash(dentry->d_name.name, dentry->d_name.len); inode = lock_hold_create(dir, dentry, mode, rdev, SIC_MKNOD(dentry->d_name.len), &dir_lock, &inode_lock, &ind_locks); @@ -738,13 +743,14 @@ static int scoutfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, pos = SCOUTFS_I(dir)->next_readdir_pos++; - ret = add_entry_items(sb, scoutfs_ino(dir), pos, dentry->d_name.name, - dentry->d_name.len, scoutfs_ino(inode), - inode->i_mode, dir_lock, inode_lock); + ret = add_entry_items(sb, scoutfs_ino(dir), hash, pos, + dentry->d_name.name, dentry->d_name.len, + scoutfs_ino(inode), inode->i_mode, dir_lock, + inode_lock); if (ret) goto out; - update_dentry_info(sb, dentry, pos, dir_lock); + update_dentry_info(sb, dentry, hash, pos, dir_lock); i_size_write(dir, i_size_read(dir) + dentry->d_name.len); dir->i_mtime = dir->i_ctime = CURRENT_TIME; @@ -795,9 +801,12 @@ static int scoutfs_link(struct dentry *old_dentry, LIST_HEAD(ind_locks); u64 dir_size; u64 ind_seq; + u64 hash; u64 pos; int ret; + hash = dirent_name_hash(dentry->d_name.name, dentry->d_name.len); + if (dentry->d_name.len > SCOUTFS_NAME_LEN) return -ENAMETOOLONG; @@ -834,12 +843,13 @@ retry: pos = SCOUTFS_I(dir)->next_readdir_pos++; - ret = add_entry_items(sb, scoutfs_ino(dir), pos, dentry->d_name.name, - dentry->d_name.len, scoutfs_ino(inode), - inode->i_mode, dir_lock, inode_lock); + ret = add_entry_items(sb, scoutfs_ino(dir), hash, pos, + dentry->d_name.name, dentry->d_name.len, + scoutfs_ino(inode), inode->i_mode, dir_lock, + inode_lock); if (ret) goto out; - update_dentry_info(sb, dentry, pos, dir_lock); + update_dentry_info(sb, dentry, hash, pos, dir_lock); i_size_write(dir, 
dir_size); dir->i_mtime = dir->i_ctime = CURRENT_TIME; @@ -908,9 +918,9 @@ retry: if (ret) goto unlock; - ret = del_entry_items(sb, scoutfs_ino(dir), dentry_info_pos(dentry), - dentry->d_name.name, dentry->d_name.len, - scoutfs_ino(inode), dir_lock, inode_lock); + ret = del_entry_items(sb, scoutfs_ino(dir), dentry_info_hash(dentry), + dentry_info_pos(dentry), scoutfs_ino(inode), + dir_lock, inode_lock); if (ret) goto out; @@ -1108,9 +1118,12 @@ static int scoutfs_symlink(struct inode *dir, struct dentry *dentry, struct scoutfs_lock *dir_lock = NULL; struct scoutfs_lock *inode_lock = NULL; LIST_HEAD(ind_locks); + u64 hash; u64 pos; int ret; + hash = dirent_name_hash(dentry->d_name.name, dentry->d_name.len); + /* path_max includes null as does our value for nd_set_link */ if (dentry->d_name.len > SCOUTFS_NAME_LEN || name_len > PATH_MAX || name_len > SCOUTFS_SYMLINK_MAX_SIZE) @@ -1133,13 +1146,14 @@ static int scoutfs_symlink(struct inode *dir, struct dentry *dentry, pos = SCOUTFS_I(dir)->next_readdir_pos++; - ret = add_entry_items(sb, scoutfs_ino(dir), pos, dentry->d_name.name, - dentry->d_name.len, scoutfs_ino(inode), - inode->i_mode, dir_lock, inode_lock); + ret = add_entry_items(sb, scoutfs_ino(dir), hash, pos, + dentry->d_name.name, dentry->d_name.len, + scoutfs_ino(inode), inode->i_mode, dir_lock, + inode_lock); if (ret) goto out; - update_dentry_info(sb, dentry, pos, dir_lock); + update_dentry_info(sb, dentry, hash, pos, dir_lock); i_size_write(dir, i_size_read(dir) + dentry->d_name.len); dir->i_mtime = dir->i_ctime = CURRENT_TIME; @@ -1185,7 +1199,7 @@ int scoutfs_symlink_drop(struct super_block *sb, u64 ino, /* * Find the next link backref key for the given ino starting from the - * given dir inode and null terminated name. If we find a backref item + * given dir inode and final entry position. If we find a backref item * we add an allocated copy of it to the head of the caller's list. 
* * Returns 0 if we added an entry, -ENOENT if we didn't, and -errno for @@ -1195,40 +1209,37 @@ int scoutfs_symlink_drop(struct super_block *sb, u64 ino, * building up a path with individual locked backref item lookups. */ int scoutfs_dir_add_next_linkref(struct super_block *sb, u64 ino, - u64 dir_ino, char *name, unsigned int name_len, + u64 dir_ino, u64 dir_pos, struct list_head *list) { - struct scoutfs_link_backref_key last_lbkey; struct scoutfs_link_backref_entry *ent; - struct scoutfs_lock *lock = NULL; - struct scoutfs_key_buf last; + struct scoutfs_dirent_key last_dkey; + struct scoutfs_dirent_key dkey; + struct scoutfs_key_buf last_key; struct scoutfs_key_buf key; + struct scoutfs_lock *lock = NULL; + struct kvec val; int len; int ret; ent = kmalloc(offsetof(struct scoutfs_link_backref_entry, - lbkey.name[SCOUTFS_NAME_LEN + 1]), GFP_KERNEL); + dent.name[SCOUTFS_NAME_LEN]), GFP_KERNEL); if (!ent) return -ENOMEM; INIT_LIST_HEAD(&ent->head); - /* put search key in ent */ - init_link_backref_key(&key, &ent->lbkey, ino, dir_ino, name, name_len); - /* we actually have room for a full backref item */ - scoutfs_key_init_buf_len(&key, key.data, key.key_len, - offsetof(struct scoutfs_link_backref_key, - name[SCOUTFS_NAME_LEN + 1])); + init_dirent_key(&key, &dkey, SCOUTFS_LINK_BACKREF_TYPE, + ino, dir_ino, dir_pos); + init_dirent_key(&last_key, &last_dkey, SCOUTFS_LINK_BACKREF_TYPE, + ino, U64_MAX, U64_MAX); + kvec_init(&val, &ent->dent, dirent_bytes(SCOUTFS_NAME_LEN)); - /* small last key to avoid full name copy, XXX enforce no U64_MAX ino */ - init_link_backref_key(&last, &last_lbkey, ino, U64_MAX, NULL, 0); - - /* next backref key is now in ent */ ret = scoutfs_lock_ino(sb, DLM_LOCK_PR, 0, ino, &lock); if (ret) goto out; - ret = scoutfs_item_next(sb, &key, &last, NULL, lock); + ret = scoutfs_item_next(sb, &key, &last_key, &val, lock); scoutfs_unlock(sb, lock, DLM_LOCK_PR); lock = NULL; @@ -1236,15 +1247,17 @@ int scoutfs_dir_add_next_linkref(struct super_block 
*sb, u64 ino, if (ret < 0) goto out; - len = (int)key.key_len - sizeof(struct scoutfs_link_backref_key); + len = ret - sizeof(struct scoutfs_dirent); /* XXX corruption */ if (len < 1 || len > SCOUTFS_NAME_LEN) { ret = -EIO; goto out; } - ent->name_len = len; list_add(&ent->head, list); + ent->dir_ino = be64_to_cpu(dkey.major); + ent->dir_pos = be64_to_cpu(dkey.minor); + ent->name_len = len; ret = 0; out: if (list_empty(&ent->head)) @@ -1257,7 +1270,7 @@ static u64 first_backref_dir_ino(struct list_head *list) struct scoutfs_link_backref_entry *ent; ent = list_first_entry(list, struct scoutfs_link_backref_entry, head); - return be64_to_cpu(ent->lbkey.dir_ino); + return ent->dir_ino; } void scoutfs_dir_free_backref_path(struct super_block *sb, @@ -1310,8 +1323,7 @@ void scoutfs_dir_free_backref_path(struct super_block *sb, * sync if we see our dirty seq. */ int scoutfs_dir_get_backref_path(struct super_block *sb, u64 ino, u64 dir_ino, - char *name, u16 name_len, - struct list_head *list) + u64 dir_pos, struct list_head *list) { u64 par_ino; int ret; @@ -1323,15 +1335,14 @@ retry: * confident we won't hit an endless loop here again. */ if (WARN_ONCE(++iters >= 4000, "scoutfs: Excessive retries in " - "dir_get_backref_path. ino %llu dir_ino %llu name %.*s\n", - ino, dir_ino, name_len, name)) { + "dir_get_backref_path. 
ino %llu dir_ino %llu pos %llu\n", + ino, dir_ino, dir_pos)) { ret = -EINVAL; goto out; } /* get the next link name to the given inode */ - ret = scoutfs_dir_add_next_linkref(sb, ino, dir_ino, name, name_len, - list); + ret = scoutfs_dir_add_next_linkref(sb, ino, dir_ino, dir_pos, list); if (ret < 0) goto out; @@ -1339,8 +1350,7 @@ retry: par_ino = first_backref_dir_ino(list); while (par_ino != SCOUTFS_ROOT_INO) { - ret = scoutfs_dir_add_next_linkref(sb, par_ino, 0, NULL, 0, - list); + ret = scoutfs_dir_add_next_linkref(sb, par_ino, 0, 0, list); if (ret < 0) { if (ret == -ENOENT) { /* restart if there was no parent component */ @@ -1374,26 +1384,23 @@ static int item_d_ancestor(struct super_block *sb, u64 p1, u64 p2, u64 *p_ret) { struct scoutfs_link_backref_entry *ent; LIST_HEAD(list); - u64 dir_ino; int ret; u64 p; *p_ret = 0; - ret = scoutfs_dir_get_backref_path(sb, p2, 0, NULL, 0, &list); + ret = scoutfs_dir_get_backref_path(sb, p2, 0, 0, &list); if (ret) goto out; p = p2; list_for_each_entry(ent, &list, head) { - dir_ino = be64_to_cpu(ent->lbkey.dir_ino); - - if (dir_ino == p1) { + if (ent->dir_ino == p1) { *p_ret = p; ret = 0; break; } - p = dir_ino; + p = ent->dir_ino; } out: @@ -1434,27 +1441,18 @@ static int verify_ancestors(struct super_block *sb, u64 p1, u64 p2, * The caller has the name locked in the dir. 
*/ static int verify_entry(struct super_block *sb, u64 dir_ino, const char *name, - unsigned name_len, u64 ino, + unsigned name_len, u64 hash, u64 ino, struct scoutfs_lock *lock) { - struct scoutfs_key_buf *key = NULL; struct scoutfs_dirent dent; - struct kvec val; int ret; - key = alloc_dirent_key(sb, dir_ino, name, name_len); - if (!key) - return -ENOMEM; - - kvec_init(&val, &dent, sizeof(dent)); - - ret = scoutfs_item_lookup_exact(sb, key, &val, lock); + ret = lookup_dirent(sb, dir_ino, name, name_len, hash, &dent, lock); if (ret == 0 && le64_to_cpu(dent.ino) != ino) ret = -ENOENT; else if (ret == -ENOENT && ino == 0) ret = 0; - scoutfs_key_free(sb, key); return ret; } @@ -1503,12 +1501,19 @@ static int scoutfs_rename(struct inode *old_dir, struct dentry *old_dentry, bool ins_old = false; LIST_HEAD(ind_locks); u64 ind_seq; + u64 old_hash; + u64 new_hash; u64 new_pos; int ret; int err; trace_scoutfs_rename(sb, old_dir, old_dentry, new_dir, new_dentry); + old_hash = dirent_name_hash(old_dentry->d_name.name, + old_dentry->d_name.len); + new_hash = dirent_name_hash(new_dentry->d_name.name, + new_dentry->d_name.len); + if (new_dentry->d_name.len > SCOUTFS_NAME_LEN) return -ENAMETOOLONG; @@ -1545,10 +1550,10 @@ static int scoutfs_rename(struct inode *old_dir, struct dentry *old_dentry, /* make sure that the entries assumed by the argument still exist */ ret = verify_entry(sb, scoutfs_ino(old_dir), old_dentry->d_name.name, - old_dentry->d_name.len, scoutfs_ino(old_inode), - old_dir_lock) ?: + old_dentry->d_name.len, old_hash, + scoutfs_ino(old_inode), old_dir_lock) ?: verify_entry(sb, scoutfs_ino(new_dir), new_dentry->d_name.name, - new_dentry->d_name.len, + new_dentry->d_name.len, new_hash, new_inode ? 
scoutfs_ino(new_inode) : 0, new_dir_lock); if (ret) @@ -1586,9 +1591,8 @@ retry: /* remove the new entry if it exists */ if (new_inode) { ret = del_entry_items(sb, scoutfs_ino(new_dir), + dentry_info_hash(new_dentry), dentry_info_pos(new_dentry), - new_dentry->d_name.name, - new_dentry->d_name.len, scoutfs_ino(new_inode), new_dir_lock, new_inode_lock); if (ret) @@ -1597,7 +1601,7 @@ retry: } /* create the new entry */ - ret = add_entry_items(sb, scoutfs_ino(new_dir), new_pos, + ret = add_entry_items(sb, scoutfs_ino(new_dir), new_hash, new_pos, new_dentry->d_name.name, new_dentry->d_name.len, scoutfs_ino(old_inode), old_inode->i_mode, new_dir_lock, old_inode_lock); @@ -1607,9 +1611,8 @@ retry: /* remove the old entry */ ret = del_entry_items(sb, scoutfs_ino(old_dir), + dentry_info_hash(old_dentry), dentry_info_pos(old_dentry), - old_dentry->d_name.name, - old_dentry->d_name.len, scoutfs_ino(old_inode), old_dir_lock, old_inode_lock); if (ret) @@ -1625,7 +1628,7 @@ retry: /* won't fail from here on out, update all the vfs structs */ /* the caller will use d_move to move the old_dentry into place */ - update_dentry_info(sb, old_dentry, new_pos, new_dir_lock); + update_dentry_info(sb, old_dentry, new_hash, new_pos, new_dir_lock); i_size_write(old_dir, i_size_read(old_dir) - old_dentry->d_name.len); if (!new_inode) @@ -1664,7 +1667,6 @@ retry: if (new_inode) scoutfs_update_inode_item(new_inode, new_inode_lock, &ind_locks); - ret = 0; out: if (ret) { @@ -1677,10 +1679,14 @@ out: * succeed. Maybe we could have an item replace call * that gives us the dupe to re-insert on cleanup? Not * sure. + * + * It's safe to use dentry_info here 'cause they haven't + * been updated if we saw an error. 
*/ err = 0; if (ins_old) err = add_entry_items(sb, scoutfs_ino(old_dir), + dentry_info_hash(old_dentry), dentry_info_pos(old_dentry), old_dentry->d_name.name, old_dentry->d_name.len, @@ -1691,14 +1697,13 @@ out: if (del_new && err == 0) err = del_entry_items(sb, scoutfs_ino(new_dir), - new_pos, - new_dentry->d_name.name, - new_dentry->d_name.len, + new_hash, new_pos, scoutfs_ino(old_inode), new_dir_lock, old_inode_lock); if (ins_new && err == 0) err = add_entry_items(sb, scoutfs_ino(new_dir), + dentry_info_hash(new_dentry), dentry_info_pos(new_dentry), new_dentry->d_name.name, new_dentry->d_name.len, diff --git a/kmod/src/dir.h b/kmod/src/dir.h index 79aaaa2a..ee43930e 100644 --- a/kmod/src/dir.h +++ b/kmod/src/dir.h @@ -10,18 +10,20 @@ extern const struct inode_operations scoutfs_symlink_iops; struct scoutfs_link_backref_entry { struct list_head head; + u64 dir_ino; + u64 dir_pos; u16 name_len; - struct scoutfs_link_backref_key lbkey; + struct scoutfs_dirent dent; + /* the full name is allocated and stored in dent.name[0] */ }; -int scoutfs_dir_get_backref_path(struct super_block *sb, u64 target_ino, - u64 dir_ino, char *name, u16 name_len, - struct list_head *list); +int scoutfs_dir_get_backref_path(struct super_block *sb, u64 ino, u64 dir_ino, + u64 dir_pos, struct list_head *list); void scoutfs_dir_free_backref_path(struct super_block *sb, struct list_head *list); int scoutfs_dir_add_next_linkref(struct super_block *sb, u64 ino, - u64 dir_ino, char *name, unsigned int name_len, + u64 dir_ino, u64 dir_pos, struct list_head *list); int scoutfs_symlink_drop(struct super_block *sb, u64 ino, diff --git a/kmod/src/export.c b/kmod/src/export.c index 90c14cb9..5ee59b41 100644 --- a/kmod/src/export.c +++ b/kmod/src/export.c @@ -114,13 +114,12 @@ static struct dentry *scoutfs_get_parent(struct dentry *child) int ret; u64 ino; - ret = scoutfs_dir_add_next_linkref(sb, scoutfs_ino(inode), 0, NULL, 0, - &list); + ret = scoutfs_dir_add_next_linkref(sb, scoutfs_ino(inode), 0, 
0, &list); if (ret) return ERR_PTR(ret); ent = list_first_entry(&list, struct scoutfs_link_backref_entry, head); - ino = be64_to_cpu(ent->lbkey.dir_ino); + ino = ent->dir_ino; scoutfs_dir_free_backref_path(sb, &list); trace_scoutfs_get_parent(sb, inode, ino); @@ -140,16 +139,16 @@ static int scoutfs_get_name(struct dentry *parent, char *name, int ret; ret = scoutfs_dir_add_next_linkref(sb, scoutfs_ino(inode), dir_ino, - NULL, 0, &list); + 0, &list); if (ret) return ret; ret = -ENOENT; ent = list_first_entry(&list, struct scoutfs_link_backref_entry, head); - if (be64_to_cpu(ent->lbkey.ino) == scoutfs_ino(inode) && - be64_to_cpu(ent->lbkey.dir_ino) == dir_ino && + if (le64_to_cpu(ent->dent.ino) == scoutfs_ino(inode) && + ent->dir_ino == dir_ino && ent->name_len <= NAME_MAX) { - memcpy(name, ent->lbkey.name, ent->name_len); + memcpy(name, ent->dent.name, ent->name_len); name[ent->name_len] = '\0'; ret = 0; trace_scoutfs_get_name(sb, parent->d_inode, inode, name); diff --git a/kmod/src/format.h b/kmod/src/format.h index ff3e4872..0509e647 100644 --- a/kmod/src/format.h +++ b/kmod/src/format.h @@ -266,29 +266,13 @@ struct scoutfs_inode_key { __u8 type; } __packed; -/* value is struct scoutfs_dirent without the name */ +/* value is struct scoutfs_dirent with the name */ struct scoutfs_dirent_key { __u8 zone; __be64 ino; __u8 type; - __u8 name[0]; -} __packed; - -/* value is struct scoutfs_dirent with the name */ -struct scoutfs_readdir_key { - __u8 zone; - __be64 ino; - __u8 type; - __be64 pos; -} __packed; - -/* value is empty */ -struct scoutfs_link_backref_key { - __u8 zone; - __be64 ino; - __u8 type; - __be64 dir_ino; - __u8 name[0]; + __be64 major; + __be64 minor; } __packed; /* key is bytes of encoded block mapping */ @@ -494,13 +478,17 @@ struct scoutfs_inode { #define SCOUTFS_SYMLINK_MAX_SIZE 4096 /* - * Dirents are stored in items with an offset of the hash of their name. - * Colliding names are packed into the value. 
+ * Dirents are stored in multiple places to isolate contention when + * performing different operations: hashed by name for creation and + * lookup, at incrementing positions for readdir and resolving inodes to + * paths. Each entry has all the metadata needed to reference all the + * items (so an entry cached by lookup can be used to unlink all the + * items). */ struct scoutfs_dirent { __le64 ino; - __le64 counter; - __le64 readdir_pos; + __le64 hash; + __le64 pos; __u8 type; __u8 name[0]; } __packed; @@ -526,9 +514,8 @@ enum { SCOUTFS_DT_WHT, }; -/* ino_path can search for backref items with a null term */ #define SCOUTFS_MAX_KEY_SIZE \ - offsetof(struct scoutfs_link_backref_key, name[SCOUTFS_NAME_LEN + 1]) + sizeof(struct scoutfs_dirent_key) #define SCOUTFS_MAX_VAL_SIZE SCOUTFS_BLOCK_MAPPING_MAX_BYTES diff --git a/kmod/src/ioctl.c b/kmod/src/ioctl.c index 57faec81..fdfabab6 100644 --- a/kmod/src/ioctl.c +++ b/kmod/src/ioctl.c @@ -201,116 +201,79 @@ out: return ret; } -struct ino_path_cursor { - __u64 dir_ino; - __u8 name[SCOUTFS_NAME_LEN + 1]; -} __packed; - /* - * see the definition of scoutfs_ioctl_ino_path for ioctl semantics. - * - * The null termination of the cursor name is a trick to skip past the - * last name we read without having to try and "increment" the name. - * Adding a null sorts the cursor after the non-null name and before all - * the next names because the item names aren't null terminated. + * See the comment above the definition of struct scoutfs_ioctl_ino_path + * for ioctl semantics. 
*/ static long scoutfs_ioc_ino_path(struct file *file, unsigned long arg) { struct super_block *sb = file_inode(file)->i_sb; - struct scoutfs_ioctl_ino_path __user *uargs; + struct scoutfs_ioctl_ino_path_result __user *ures; + struct scoutfs_link_backref_entry *last_ent; struct scoutfs_link_backref_entry *ent; - struct ino_path_cursor __user *ucurs; struct scoutfs_ioctl_ino_path args; - char __user *upath; LIST_HEAD(list); - u64 dir_ino; - u16 name_len; + u16 copied; char term; - char *name; int ret; - BUILD_BUG_ON(SCOUTFS_IOC_INO_PATH_CURSOR_BYTES != - sizeof(struct ino_path_cursor)); - if (!capable(CAP_DAC_READ_SEARCH)) return -EPERM; - uargs = (void __user *)arg; - if (copy_from_user(&args, uargs, sizeof(args))) + if (copy_from_user(&args, (void __user *)arg, sizeof(args))) return -EFAULT; - if (args.cursor_bytes != sizeof(struct ino_path_cursor)) - return -EINVAL; + ures = (void __user *)(unsigned long)args.result_ptr; - ucurs = (void __user *)(unsigned long)args.cursor_ptr; - upath = (void __user *)(unsigned long)args.path_ptr; - - if (get_user(dir_ino, &ucurs->dir_ino)) - return -EFAULT; - - /* alloc/copy the small cursor name, requires and includes null */ - name_len = strnlen_user(ucurs->name, sizeof(ucurs->name)); - if (name_len < 1 || name_len > sizeof(ucurs->name)) - return -EINVAL; - - name = kmalloc(name_len, GFP_KERNEL); - if (!name) - return -ENOMEM; - - if (copy_from_user(name, ucurs->name, name_len)) { - ret = -EFAULT; + ret = scoutfs_dir_get_backref_path(sb, args.ino, args.dir_ino, + args.dir_pos, &list); + if (ret < 0) goto out; - } - ret = scoutfs_dir_get_backref_path(sb, args.ino, dir_ino, name, - name_len, &list); - if (ret < 0) { - if (ret == -ENOENT) - ret = 0; - goto out; - } - - ret = 0; + last_ent = list_last_entry(&list, struct scoutfs_link_backref_entry, + head); + copied = 0; list_for_each_entry(ent, &list, head) { - if (ret + ent->name_len + 1 > args.path_bytes) { + + if (offsetof(struct scoutfs_ioctl_ino_path_result, + path[copied +
ent->name_len + 1]) + > args.result_bytes) { ret = -ENAMETOOLONG; goto out; } - if (copy_to_user(upath, ent->lbkey.name, ent->name_len)) { + if (copy_to_user(&ures->path[copied], + ent->dent.name, ent->name_len)) { ret = -EFAULT; goto out; } - upath += ent->name_len; - ret += ent->name_len; + copied += ent->name_len; - if (ent->head.next == &list) + if (ent == last_ent) term = '\0'; else term = '/'; - if (put_user(term, upath)) { + if (put_user(term, &ures->path[copied])) { ret = -EFAULT; - break; + goto out; /* the header fill below would reset ret to 0 */ } - upath++; - ret++; + copied++; } - /* copy the last entry into the cursor */ - ent = list_last_entry(&list, struct scoutfs_link_backref_entry, head); - - if (put_user(be64_to_cpu(ent->lbkey.dir_ino), &ucurs->dir_ino) || - copy_to_user(ucurs->name, ent->lbkey.name, ent->name_len) || - put_user('\0', &ucurs->name[ent->name_len])) { + /* fill the result header now that we know the copied path length */ + if (put_user(last_ent->dir_ino, &ures->dir_ino) || + put_user(last_ent->dir_pos, &ures->dir_pos) || + put_user(copied, &ures->path_bytes)) { ret = -EFAULT; + } else { + ret = 0; } out: scoutfs_dir_free_backref_path(sb, &list); - kfree(name); return ret; } diff --git a/kmod/src/ioctl.h b/kmod/src/ioctl.h index 33c94b95..721f1cde 100644 --- a/kmod/src/ioctl.h +++ b/kmod/src/ioctl.h @@ -64,24 +64,36 @@ enum { struct scoutfs_ioctl_walk_inodes) /* - * Fill the path buffer with the next path to the target inode. An - * iteration cursor is stored in the cursor buffer which advances - * through the paths to the inode at each call. + * Fill the result buffer with the next absolute path to the target + * inode searching from a given position in a parent directory. * * @ino: The target ino that we're finding paths to. Constant across * all the calls that make up an iteration over all the inode's paths. * - * @cursor_ptr: A pointer to the buffer that will hold the iteration - * cursor. It must be initialized to 0 before iterating.
Each call - * modifies it to skip past the result of that call. + * @dir_ino: The inode number of the directory containing the entry to + * our inode to search from. If this parent directory contains no more + * entries to our inode then we'll search through other parent directory + * inodes in inode order. * - * @cusur_bytes: The length of the cursor buffer. Must be - * SCOUTFS_IOC_INO_PATH_CURSOR_BYTES. + * @dir_pos: The position in the dir_ino parent directory of the entry + * to our inode to search from. If there is no entry at this position + * then we'll search through other entry positions in increasing order. + * If we exhaust the parent directory then we'll search through + * additional parent directories in inode order. * - * @path_ptr: The buffer to store each found path. + * @result_ptr: A pointer to the buffer where the result struct and + * absolute path will be stored. * - * @path_bytes: The size of the buffer that will the found path - * including null termination. (PATH_MAX is a solid choice.) + * @result_bytes: The size of the buffer that will contain the result + * struct and the null terminated absolute path name. + * + * To start iterating set the desired target inode, dir_ino to 0, + * dir_pos to 0, and set result_ptr and _bytes to a sufficiently large + * buffer (sizeof(result) + PATH_MAX is a solid choice). + * + * After each returned result set the next search dir_ino and dir_pos to + * the returned dir_ino and dir_pos. Then increment the search dir_pos, + * and if it wrapped to 0, increment dir_ino. * * This only walks back through full hard links. None of the returned * paths will reflect symlinks to components in the path. @@ -90,28 +102,39 @@ enum { * returned paths to the inode. It requires CAP_DAC_READ_SEARCH which * bypasses permissions checking. * - * ENAMETOOLONG is returned when the next path found from the cursor - * doesn't fit in the path buffer.
- * This call is not serialized with any modification (create, rename, * unlink) of the path components. It will return all the paths that * were stable both before and after the call. It may or may not return * paths which are created or unlinked during the call. * - * The number of bytes in the path, including the null terminator, are - * returned when a path is found. 0 is returned when there are no more - * paths to the link to the inode from the cursor. + * On success 0 is returned and the result struct is filled with the next + * absolute path. The path_bytes length of the path includes a null + * terminating byte. dir_ino and dir_pos refer to the position of the + * final component in its parent directory and can be advanced to search + * for the next terminal entry whose path is then built by walking up + * parent directories. + * + * ENOENT is returned when no paths are found. + * + * ENAMETOOLONG is returned when the result struct and path found + * don't fit in the result buffer. + * + * Many other errnos indicate hard failure to find the next path.
*/ struct scoutfs_ioctl_ino_path { __u64 ino; - __u64 cursor_ptr; - __u64 path_ptr; - __u16 cursor_bytes; - __u16 path_bytes; + __u64 dir_ino; + __u64 dir_pos; + __u64 result_ptr; + __u16 result_bytes; } __packed; -#define SCOUTFS_IOC_INO_PATH_CURSOR_BYTES \ - (sizeof(__u64) + SCOUTFS_NAME_LEN + 1) +struct scoutfs_ioctl_ino_path_result { + __u64 dir_ino; + __u64 dir_pos; + __u16 path_bytes; + __u8 path[0]; +} __packed; /* Get a single path from the root to the given inode number */ #define SCOUTFS_IOC_INO_PATH _IOW(SCOUTFS_IOCTL_MAGIC, 2, \ diff --git a/kmod/src/key.c b/kmod/src/key.c index 5befe0e0..9e8bb5cc 100644 --- a/kmod/src/key.c +++ b/kmod/src/key.c @@ -276,35 +276,17 @@ static int pr_xattr(char *buf, struct scoutfs_key_buf *key, size_t size) static int pr_dirent(char *buf, struct scoutfs_key_buf *key, size_t size) { struct scoutfs_dirent_key *dkey = key->data; - int len = (int)key->key_len - sizeof(struct scoutfs_dirent_key); + char *which = dkey->type == SCOUTFS_DIRENT_TYPE ? "dnt" : + dkey->type == SCOUTFS_READDIR_TYPE ? "rdr" : + dkey->type == SCOUTFS_LINK_BACKREF_TYPE ? 
"lbr" : + "unk"; return snprintf_key(buf, size, key, sizeof(struct scoutfs_dirent_key), key->key_len, - "fs.%llu.dnt.%.*s", - be64_to_cpu(dkey->ino), len, dkey->name); -} - -static int pr_readdir(char *buf, struct scoutfs_key_buf *key, size_t size) -{ - struct scoutfs_readdir_key *rkey = key->data; - - return snprintf_key(buf, size, key, - sizeof(struct scoutfs_readdir_key), 0, - "fs.%llu.rdr.%llu", - be64_to_cpu(rkey->ino), be64_to_cpu(rkey->pos)); -} - -static int pr_link_backref(char *buf, struct scoutfs_key_buf *key, size_t size) -{ - struct scoutfs_link_backref_key *lkey = key->data; - int len = (int)key->key_len - sizeof(*lkey); - - return snprintf_key(buf, size, key, - sizeof(struct scoutfs_link_backref_key), - key->key_len, - "fs.%llu.lbr.%llu.%.*s", - be64_to_cpu(lkey->ino), be64_to_cpu(lkey->dir_ino), - len, lkey->name); + "fs.%llu.%s.%llu.%llu", + be64_to_cpu(dkey->ino), which, + be64_to_cpu(dkey->major), + be64_to_cpu(dkey->minor)); } static int pr_symlink(char *buf, struct scoutfs_key_buf *key, size_t size) @@ -339,8 +321,8 @@ const static key_printer_t key_printers[SCOUTFS_MAX_ZONE][SCOUTFS_MAX_TYPE] = { [SCOUTFS_FS_ZONE][SCOUTFS_INODE_TYPE] = pr_inode, [SCOUTFS_FS_ZONE][SCOUTFS_XATTR_TYPE] = pr_xattr, [SCOUTFS_FS_ZONE][SCOUTFS_DIRENT_TYPE] = pr_dirent, - [SCOUTFS_FS_ZONE][SCOUTFS_READDIR_TYPE] = pr_readdir, - [SCOUTFS_FS_ZONE][SCOUTFS_LINK_BACKREF_TYPE] = pr_link_backref, + [SCOUTFS_FS_ZONE][SCOUTFS_READDIR_TYPE] = pr_dirent, + [SCOUTFS_FS_ZONE][SCOUTFS_LINK_BACKREF_TYPE] = pr_dirent, [SCOUTFS_FS_ZONE][SCOUTFS_SYMLINK_TYPE] = pr_symlink, [SCOUTFS_FS_ZONE][SCOUTFS_BLOCK_MAPPING_TYPE] = pr_block_mapping, };