scoutfs: add inode_paths ioctl

This adds the ioctl that returns all the paths from the root to a given
inode.  The implementation only traverses btree items to keep it
isolated from the vfs object locking and life cycles, but that could be
a performance problem.  This is another motivation to accelerate the
btree code.

Signed-off-by: Zach Brown <zab@versity.com>
Author: Zach Brown
Date:   2016-08-11 16:46:18 -07:00
parent 77e0ffb981
commit 0991622a21
7 changed files with 417 additions and 10 deletions


@@ -49,6 +49,13 @@
* items will have distant offset values. It's only as the directory
* gets very large that hash values will start to be this dense and
* sweeping over items in a btree leaf is reasonably efficient.
*
* For each directory entry item stored in a directory inode there is a
* corresponding link backref item stored at the target inode. This
* lets us find all the paths that refer to a given inode. The link
* backref offset comes from an advancing counter in the inode and the
* item value contains the dir inode and dirent offset of the referring
* link.
*/
static unsigned int mode_to_type(umode_t mode)
@@ -108,11 +115,14 @@ static unsigned int item_name_len(struct scoutfs_btree_cursor *curs)
{
return curs->val_len - sizeof(struct scoutfs_dirent);
}
/*
* Store the dirent item hash in the dentry so that we don't have to
* calculate and search to remove the item.
* Each dirent stores the values that are needed to build the keys of
the items that are removed on unlink so that we don't have to search through
* items on unlink.
*/
struct dentry_info {
u64 lref_counter;
u32 hash;
};
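As an illustration of what this caching buys (a sketch, not part of the diff): with these two fields populated at lookup/create time, unlink can rebuild both keys it needs directly with the existing scoutfs_set_key() helper. The backref key matches the set_lref_key() call added further down; the dirent key line is an assumption based on the cached hash.

	/* illustrative sketch only, variable names as in scoutfs_unlink() */
	struct scoutfs_key dirent_key;
	struct scoutfs_key lref_key;

	/* the dirent item sits in the parent dir at the cached name hash */
	scoutfs_set_key(&dirent_key, scoutfs_ino(dir), SCOUTFS_DIRENT_KEY,
			di->hash);
	/* the link backref item sits at the target inode's cached counter */
	scoutfs_set_key(&lref_key, scoutfs_ino(inode), SCOUTFS_LINK_BACKREF_KEY,
			di->lref_counter);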
@@ -158,6 +168,13 @@ static struct dentry_info *alloc_dentry_info(struct dentry *dentry)
return dentry->d_fsdata;
}
static void update_dentry_info(struct dentry_info *di, struct scoutfs_key *key,
struct scoutfs_dirent *dent)
{
di->lref_counter = le64_to_cpu(dent->counter);
di->hash = scoutfs_key_offset(key);
}
static u64 last_dirent_key_offset(u32 h)
{
return min_t(u64, (u64)h + SCOUTFS_DIRENT_COLL_NR - 1,
@@ -210,7 +227,7 @@ static struct dentry *scoutfs_lookup(struct inode *dir, struct dentry *dentry,
if (scoutfs_names_equal(dentry->d_name.name, dentry->d_name.len,
dent->name, name_len)) {
ino = le64_to_cpu(dent->ino);
di->hash = scoutfs_key_offset(curs.key);
update_dentry_info(di, curs.key, dent);
break;
}
}
@@ -296,6 +313,34 @@ static int scoutfs_readdir(struct file *file, void *dirent, filldir_t filldir)
return ret;
}
static void set_lref_key(struct scoutfs_key *key, u64 ino, u64 ctr)
{
scoutfs_set_key(key, ino, SCOUTFS_LINK_BACKREF_KEY, ctr);
}
static int update_lref_item(struct super_block *sb, struct scoutfs_key *key,
			    u64 dir_ino, u64 dir_off, bool update)
{
	DECLARE_SCOUTFS_BTREE_CURSOR(curs);
	struct scoutfs_link_backref *lref;
	int ret;

	if (update)
		ret = scoutfs_btree_update(sb, key, &curs);
	else
		ret = scoutfs_btree_insert(sb, key, sizeof(*lref), &curs);

	/* XXX verify size */
	if (ret == 0) {
		lref = curs.val;
		lref->ino = cpu_to_le64(dir_ino);
		lref->offset = cpu_to_le64(dir_off);
		scoutfs_btree_release(&curs);
	}

	return ret;
}
static int scoutfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
dev_t rdev)
{
@@ -308,6 +353,7 @@ static int scoutfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
struct scoutfs_key first;
struct scoutfs_key last;
struct scoutfs_key key;
struct scoutfs_key lref_key;
int bytes;
int ret;
u64 h;
@@ -343,15 +389,25 @@ static int scoutfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
if (ret)
goto out;
ret = scoutfs_btree_insert(sb, &key, bytes, &curs);
set_lref_key(&lref_key, scoutfs_ino(inode),
atomic64_inc_return(&SCOUTFS_I(inode)->link_counter));
ret = update_lref_item(sb, &lref_key, scoutfs_ino(dir),
scoutfs_key_offset(&key), false);
if (ret)
goto out;
ret = scoutfs_btree_insert(sb, &key, bytes, &curs);
if (ret) {
scoutfs_btree_delete(sb, &lref_key);
goto out;
}
dent = curs.val;
dent->ino = cpu_to_le64(scoutfs_ino(inode));
dent->counter = lref_key.offset;
dent->type = mode_to_type(inode->i_mode);
memcpy(dent->name, dentry->d_name.name, dentry->d_name.len);
di->hash = scoutfs_key_offset(&key);
update_dentry_info(di, &key, dent);
scoutfs_btree_release(&curs);
@@ -400,6 +456,7 @@ static int scoutfs_unlink(struct inode *dir, struct dentry *dentry)
struct timespec ts = current_kernel_time();
struct dentry_info *di;
struct scoutfs_key key;
struct scoutfs_key lref_key;
int ret = 0;
if (WARN_ON_ONCE(!dentry->d_fsdata))
@@ -413,8 +470,11 @@ static int scoutfs_unlink(struct inode *dir, struct dentry *dentry)
if (ret)
return ret;
set_lref_key(&lref_key, scoutfs_ino(inode), di->lref_counter);
ret = scoutfs_dirty_inode_item(dir) ?:
scoutfs_dirty_inode_item(inode);
scoutfs_dirty_inode_item(inode) ?:
scoutfs_btree_dirty(sb, &lref_key);
if (ret)
goto out;
@@ -424,6 +484,8 @@ static int scoutfs_unlink(struct inode *dir, struct dentry *dentry)
if (ret)
goto out;
scoutfs_btree_delete(sb, &lref_key);
dir->i_ctime = ts;
dir->i_mtime = ts;
i_size_write(dir, i_size_read(dir) - dentry->d_name.len);
@@ -442,6 +504,200 @@ out:
return ret;
}
/*
* Add an allocated path component to the caller's list which links to
* the target inode at a counter past the given counter.
*
* This is implemented by searching for link backrefs on the inode
* starting from the given counter. Those contain references to the
* parent directory and dirent key offset that contain the link to the
* inode.
*
* The caller holds no locks that protect components in the path. We
* search the link backref to find the parent dir and then acquire its
* i_mutex to make sure that its entries and backrefs are stable. If
* the next backref points to a different dir after we acquire the lock
* we bounce off and retry.
*
* Backref counters are never reused and rename only modifies the
* existing backref counter under the dir's mutex.
*/
static int add_linkref_name(struct super_block *sb, u64 *dir_ino, u64 ino,
			    u64 *ctr, struct list_head *list)
{
	struct scoutfs_path_component *comp;
	DECLARE_SCOUTFS_BTREE_CURSOR(curs);
	struct scoutfs_link_backref *lref;
	struct scoutfs_dirent *dent;
	struct inode *inode = NULL;
	struct scoutfs_key first;
	struct scoutfs_key last;
	struct scoutfs_key key;
	u64 retried = 0;
	u64 off;
	int len;
	int ret;

	comp = kmalloc(sizeof(struct scoutfs_path_component), GFP_KERNEL);
	if (!comp)
		return -ENOMEM;

retry:
	scoutfs_set_key(&first, ino, SCOUTFS_LINK_BACKREF_KEY, *ctr);
	scoutfs_set_key(&last, ino, SCOUTFS_LINK_BACKREF_KEY, ~0ULL);

	ret = scoutfs_btree_next(sb, &first, &last, &curs);
	if (ret <= 0)
		goto out;

	lref = curs.val;
	*dir_ino = le64_to_cpu(lref->ino);
	off = le64_to_cpu(lref->offset);
	*ctr = scoutfs_key_offset(curs.key);

	trace_printk("ino %llu ctr %llu dir_ino %llu off %llu\n",
		     ino, *ctr, *dir_ino, off);

	scoutfs_btree_release(&curs);

	/* XXX corruption, should never be key == U64_MAX */
	if (*ctr == U64_MAX) {
		ret = -EIO;
		goto out;
	}

	/* XXX should verify ino and offset, too */

	if (inode && scoutfs_ino(inode) != *dir_ino) {
		mutex_unlock(&inode->i_mutex);
		iput(inode);
		inode = NULL;
	}

	if (!inode) {
		inode = scoutfs_iget(sb, *dir_ino);
		if (IS_ERR(inode)) {
			ret = PTR_ERR(inode);
			inode = NULL;
			if (ret == -ENOENT && retried != *dir_ino) {
				retried = *dir_ino;
				goto retry;
			}
			goto out;
		}

		mutex_lock(&inode->i_mutex);
		goto retry;
	}

	scoutfs_set_key(&key, *dir_ino, SCOUTFS_DIRENT_KEY, off);

	ret = scoutfs_btree_lookup(sb, &key, &curs);
	if (ret < 0) {
		/* XXX corruption, should always have dirent for backref */
		if (ret == -ENOENT)
			ret = -EIO;
		goto out;
	}

	dent = curs.val;
	len = item_name_len(&curs);

	trace_printk("dent ino %llu len %d\n", le64_to_cpu(dent->ino), len);

	/* XXX corruption */
	if (len < 1 || len > SCOUTFS_NAME_LEN) {
		ret = -EIO;
		goto out;
	}

	/* XXX corruption, dirents should always match link backref */
	if (le64_to_cpu(dent->ino) != ino) {
		ret = -EIO;
		goto out;
	}

	(*ctr)++;

	comp->len = len;
	memcpy(comp->name, dent->name, len);
	list_add(&comp->head, list);
	comp = NULL; /* won't be freed */

	scoutfs_btree_release(&curs);
	ret = 1;
out:
	if (inode) {
		mutex_unlock(&inode->i_mutex);
		iput(inode);
	}
	kfree(comp);
	return ret;
}
void scoutfs_dir_free_path(struct list_head *list)
{
	struct scoutfs_path_component *comp;
	struct scoutfs_path_component *tmp;

	list_for_each_entry_safe(comp, tmp, list, head) {
		list_del_init(&comp->head);
		kfree(comp);
	}
}
/*
* Fill the list with the allocated path components that link the root
* to the target inode. The caller's ctr gives the link counter to
* start from.
*
* This is racing with modification of components in the path. We can
* traverse a partial path only to find that it's been blown away
* entirely. If we see a component go missing we retry. The removal of
* the final link to the inode should prevent repeatedly traversing
* paths that no longer exist.
*
* Returns > 0 and *ctr is updated if an allocated name was added to the
* list, 0 if no name past *ctr was found, or -errno on errors.
*/
int scoutfs_dir_next_path(struct super_block *sb, u64 ino, u64 *ctr,
			  struct list_head *list)
{
	u64 our_ctr;
	u64 par_ctr;
	u64 par_ino;
	int ret;

	if (*ctr == U64_MAX)
		return 0;

retry:
	our_ctr = *ctr;

	/* get the next link name to the given inode */
	ret = add_linkref_name(sb, &par_ino, ino, &our_ctr, list);
	if (ret <= 0)
		goto out;

	/* then get the names of all the parent dirs */
	while (par_ino != SCOUTFS_ROOT_INO) {
		par_ctr = 0;
		ret = add_linkref_name(sb, &par_ino, par_ino, &par_ctr, list);
		if (ret < 0)
			goto out;

		/* restart if there was no parent component */
		if (ret == 0) {
			scoutfs_dir_free_path(list);
			goto retry;
		}
	}

out:
	if (ret > 0)
		*ctr = our_ctr;
	return ret;
}
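Note that add_linkref_name() adds each new component at the head of the caller's list as the walk climbs toward the root, so iterating the finished list from the head yields components in root-first order. A minimal sketch of a hypothetical in-kernel consumer follows; the ioctl added below does essentially this, plus copying to userspace:

	/* sketch only, not part of the patch */
	LIST_HEAD(list);
	struct scoutfs_path_component *comp;
	u64 ctr = 0;
	int ret;

	while ((ret = scoutfs_dir_next_path(sb, ino, &ctr, &list)) > 0) {
		/* components arrive root-first: "a", "b", "c" for /a/b/c */
		list_for_each_entry(comp, &list, head)
			trace_printk("component %.*s\n", (int)comp->len,
				     comp->name);
		scoutfs_dir_free_path(&list);
	}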
const struct file_operations scoutfs_dir_fops = {
.readdir = scoutfs_readdir,
};


@@ -1,10 +1,22 @@
#ifndef _SCOUTFS_DIR_H_
#define _SCOUTFS_DIR_H_

#include "format.h"

extern const struct file_operations scoutfs_dir_fops;
extern const struct inode_operations scoutfs_dir_iops;

int scoutfs_dir_init(void);
void scoutfs_dir_exit(void);

struct scoutfs_path_component {
	struct list_head head;
	unsigned int len;
	char name[SCOUTFS_NAME_LEN];
};

int scoutfs_dir_next_path(struct super_block *sb, u64 ino, u64 *ctr,
			  struct list_head *list);
void scoutfs_dir_free_path(struct list_head *list);

#endif


@@ -97,10 +97,11 @@ struct scoutfs_key {
* isn't necessary. We could have an arbitrary sort order. So we don't
* have to stress about cleverly allocating the types.
*/
#define SCOUTFS_INODE_KEY 1
#define SCOUTFS_XATTR_KEY 2
#define SCOUTFS_DIRENT_KEY 3
#define SCOUTFS_BMAP_KEY 4
#define SCOUTFS_INODE_KEY 1
#define SCOUTFS_XATTR_KEY 2
#define SCOUTFS_DIRENT_KEY 3
#define SCOUTFS_LINK_BACKREF_KEY 4
#define SCOUTFS_BMAP_KEY 5
#define SCOUTFS_MAX_ITEM_LEN 512
@@ -173,6 +174,7 @@ struct scoutfs_timespec {
struct scoutfs_inode {
__le64 size;
__le64 blocks;
__le64 link_counter;
__le32 nlink;
__le32 uid;
__le32 gid;
@@ -192,6 +194,7 @@ struct scoutfs_inode {
*/
struct scoutfs_dirent {
__le64 ino;
__le64 counter;
__u8 type;
__u8 name[0];
} __packed;
@@ -262,4 +265,14 @@ struct scoutfs_block_map {
__le64 blkno[SCOUTFS_BLOCK_MAP_COUNT];
};
/*
* link backrefs give us a way to find all the hard links that refer
* to a target inode. They're stored at an offset determined by an
* advancing counter in their inode.
*/
struct scoutfs_link_backref {
__le64 ino;
__le64 offset;
} __packed;
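A concrete (hypothetical) example of the pairing: if directory inode 42 holds the name "data" for inode 77, and inode 77's advancing link counter handed out 3 for that link, then the dirent item's counter field stores 3 and a backref item exists at key (77, SCOUTFS_LINK_BACKREF_KEY, 3) whose value records where that name lives:

	/* illustrative values only; dirent_off stands for the hypothetical
	 * key offset of the "data" dirent item in directory 42 */
	struct scoutfs_link_backref lref = {
		.ino = cpu_to_le64(42),
		.offset = cpu_to_le64(dirent_off),
	};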
#endif


@@ -118,6 +118,7 @@ static void load_inode(struct inode *inode, struct scoutfs_inode *cinode)
inode->i_ctime.tv_nsec = le32_to_cpu(cinode->ctime.nsec);
ci->salt = le32_to_cpu(cinode->salt);
atomic64_set(&ci->link_counter, le64_to_cpu(cinode->link_counter));
}
static int scoutfs_read_locked_inode(struct inode *inode)
@@ -199,6 +200,7 @@ static void store_inode(struct scoutfs_inode *cinode, struct inode *inode)
cinode->mtime.nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
cinode->salt = cpu_to_le32(ci->salt);
cinode->link_counter = cpu_to_le64(atomic64_read(&ci->link_counter));
}
/*
@@ -307,6 +309,7 @@ struct inode *scoutfs_new_inode(struct super_block *sb, struct inode *dir,
ci = SCOUTFS_I(inode);
ci->ino = ino;
get_random_bytes(&ci->salt, sizeof(ci->salt));
atomic64_set(&ci->link_counter, 0);
inode->i_ino = ino; /* XXX overflow */
inode_init_owner(inode, dir, mode);


@@ -5,6 +5,7 @@ struct scoutfs_inode_info {
u64 ino;
u32 salt;
atomic64_t link_counter;
struct rw_semaphore xattr_rwsem;
struct inode inode;


@@ -15,10 +15,12 @@
#include <linux/uaccess.h>
#include <linux/compiler.h>
#include <linux/uio.h>
#include <linux/slab.h>
#include "format.h"
#include "btree.h"
#include "key.h"
#include "dir.h"
#include "ioctl.h"
/*
@@ -93,11 +95,118 @@ static long scoutfs_ioc_inodes_since(struct file *file, unsigned long arg)
return ret;
}
static int copy_to_ptr(char __user **to, const void *from,
		       unsigned long n, int space)
{
	if (n > space)
		return -EOVERFLOW;

	if (copy_to_user(*to, from, n))
		return -EFAULT;

	*to += n;

	return space - n;
}
/*
* Fill the caller's buffer with all the paths from the on-disk root
* directory to the target inode. It will provide as many full paths as
* there are final links to the target inode.
*
* The null terminated paths are stored consecutively in the buffer. A
* final zero length null terminated string follows the last path.
*
* This only walks back through full hard links. None of the returned
* paths will reflect symlinks to components in the path.
*
* This doesn't ensure that the caller has permissions to traverse the
* returned paths to the inode. It requires CAP_DAC_READ_SEARCH which
* bypasses permissions checking.
*
* If the provided buffer isn't large enough EOVERFLOW will be returned.
* The buffer can be approximately sized by multiplying the inode's
* nlink by PATH_MAX.
*
* This call is not serialized with any modification (create, rename,
* unlink) of the path components. It will return all the paths that
* were stable both before and after the call. It may or may not return
* paths which are created or unlinked during the call.
*
* This will return failure if it fails to read any path. An empty
* buffer is returned if the target inode doesn't exist or is
* disconnected from the root.
*
* XXX
* - we may want to support partial failure
* - can dir renaming trick us into returning garbage paths? seems likely.
*/
static long scoutfs_ioc_inode_paths(struct file *file, unsigned long arg)
{
	struct super_block *sb = file_inode(file)->i_sb;
	struct scoutfs_ioctl_inode_paths __user *uargs = (void __user *)arg;
	struct scoutfs_ioctl_inode_paths args;
	struct scoutfs_path_component *comp;
	struct scoutfs_path_component *tmp;
	static char slash = '/';
	static char null = '\0';
	char __user *ptr;
	LIST_HEAD(list);
	u64 ctr;
	int ret;
	int len;

	if (!capable(CAP_DAC_READ_SEARCH))
		return -EPERM;

	if (copy_from_user(&args, uargs, sizeof(args)))
		return -EFAULT;

	if (args.buf_len > INT_MAX)
		return -EINVAL;

	ptr = (void __user *)(unsigned long)args.buf_ptr;
	len = args.buf_len;
	ctr = 0;

	while ((ret = scoutfs_dir_next_path(sb, args.ino, &ctr, &list)) > 0) {
		ret = 0;

		/* copy the components out as a path */
		list_for_each_entry_safe(comp, tmp, &list, head) {
			len = copy_to_ptr(&ptr, comp->name, comp->len, len);
			if (len < 0)
				goto out;

			list_del_init(&comp->head);
			kfree(comp);

			if (!list_empty(&list)) {
				len = copy_to_ptr(&ptr, &slash, 1, len);
				if (len < 0)
					goto out;
			}
		}

		len = copy_to_ptr(&ptr, &null, 1, len);
		if (len < 0)
			goto out;
	}

	len = copy_to_ptr(&ptr, &null, 1, len);
out:
	scoutfs_dir_free_path(&list);

	if (ret == 0 && len < 0)
		ret = len;

	return ret;
}
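To make the buffer layout described in the comment above concrete, suppose (hypothetically) that the target inode is linked at /a/x and /b/c/x; a successful call then fills the start of the caller's buffer with:

	/* two consecutive null terminated paths, then a final empty string */
	char example[] = "a/x\0" "b/c/x\0" "\0";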
long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
switch (cmd) {
case SCOUTFS_IOC_INODES_SINCE:
return scoutfs_ioc_inodes_since(file, arg);
case SCOUTFS_IOC_INODE_PATHS:
return scoutfs_ioc_inode_paths(file, arg);
}
return -ENOTTY;


@@ -26,4 +26,17 @@ struct scoutfs_ioctl_inodes_since {
#define SCOUTFS_IOC_INODES_SINCE _IOW(SCOUTFS_IOCTL_MAGIC, 1, \
struct scoutfs_ioctl_inodes_since)
struct scoutfs_ioctl_inode_paths {
__u64 ino;
__u64 buf_ptr;
__u32 buf_len;
} __packed;
/*
* Fills the caller's buffer with all the paths from the root to the
* target inode.
*/
#define SCOUTFS_IOC_INODE_PATHS _IOW(SCOUTFS_IOCTL_MAGIC, 2, \
struct scoutfs_ioctl_inode_paths)
#endif
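For completeness, a hedged userspace sketch of driving this ioctl; the mount point, file, inode number, and buffer sizing are illustrative assumptions, and it relies only on the structure and buffer format defined above:

/* illustrative userspace sketch only: paths, inode number, and buffer
 * sizing are assumptions, and error handling is minimal */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <limits.h>
#include <sys/ioctl.h>
#include <linux/types.h>
#include "ioctl.h"	/* the scoutfs ioctl header shown above */

int main(void)
{
	struct scoutfs_ioctl_inode_paths args;
	char buf[16 * PATH_MAX];	/* "roughly nlink * PATH_MAX" hint */
	char *p;
	int fd;

	/* any open fd on a scoutfs mount that accepts scoutfs ioctls */
	fd = open("/mnt/scoutfs/some_file", O_RDONLY);
	if (fd < 0)
		return 1;

	args.ino = 12345;			/* hypothetical target inode */
	args.buf_ptr = (__u64)(unsigned long)buf;
	args.buf_len = sizeof(buf);

	if (ioctl(fd, SCOUTFS_IOC_INODE_PATHS, &args) == 0) {
		/* paths are stored back to back; an empty string ends them */
		for (p = buf; *p; p += strlen(p) + 1)
			printf("%s\n", p);
	}

	close(fd);
	return 0;
}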