mirror of
https://github.com/versity/scoutfs.git
synced 2026-01-09 05:13:18 +00:00
scoutfs: add indexing of inodes by fields
Add items for indexing inodes by their fields. When we update the inode item we also delete the old index items and create the new items. We rename and refactor the old inodes-since ioctl so that it now walks the inode index items. Signed-off-by: Zach Brown <zab@versity.com>
This commit is contained in:
@@ -160,6 +160,9 @@ struct scoutfs_segment_block {
|
||||
#define SCOUTFS_ORPHAN_KEY 10
|
||||
#define SCOUTFS_FREE_EXTENT_BLKNO_KEY 11
|
||||
#define SCOUTFS_FREE_EXTENT_BLOCKS_KEY 12
|
||||
/*
 * Key types of the inode index items.  scoutfs_update_inode_item()
 * maintains one index item of each type per inode: the ctime and mtime
 * indexes are keyed by the timestamp's tv_sec/tv_nsec and the size
 * index is keyed by the inode's i_size.
 */
#define SCOUTFS_INODE_INDEX_CTIME_KEY 13
|
||||
#define SCOUTFS_INODE_INDEX_MTIME_KEY 14
|
||||
#define SCOUTFS_INODE_INDEX_SIZE_KEY 15
|
||||
/* not found in the fs */
|
||||
#define SCOUTFS_MAX_UNUSED_KEY 253
|
||||
#define SCOUTFS_NET_ADDR_KEY 254
|
||||
@@ -249,6 +252,18 @@ struct scoutfs_symlink_key {
|
||||
__be64 ino;
|
||||
} __packed;
|
||||
|
||||
/*
 * A packed time value made of a 64-bit big-endian 'sec' field followed
 * by a 32-bit big-endian 'nsec' field.
 *
 * NOTE(review): presumably seconds and nanoseconds of a timespec; no
 * user of this struct is visible in this change -- confirm.
 */
struct scoutfs_betimespec {
|
||||
__be64 sec;
|
||||
__be32 nsec;
|
||||
} __packed;
|
||||
|
||||
/*
 * Key of an item that indexes an inode by the value of one of its
 * fields.
 *
 * @type   one of the SCOUTFS_INODE_INDEX_*_KEY values, selecting which
 *         index the item belongs to.
 * @major  big-endian 64-bit primary value of the indexed field: tv_sec
 *         for the ctime and mtime indexes, i_size for the size index.
 * @minor  big-endian 32-bit secondary value: tv_nsec for the time
 *         indexes, 0 for the size index.
 * @ino    big-endian inode number of the inode being indexed.
 *
 * update_index() builds these keys when it creates and deletes index
 * items and the inode walking ioctl builds them from the user's first
 * and last entries.
 */
struct scoutfs_inode_index_key {
|
||||
__u8 type;
|
||||
__be64 major;
|
||||
__be32 minor;
|
||||
__be64 ino;
|
||||
} __packed;
|
||||
|
||||
/* XXX does this exist upstream somewhere? */
|
||||
#define member_sizeof(TYPE, MEMBER) (sizeof(((TYPE *)0)->MEMBER))
|
||||
|
||||
|
||||
@@ -184,6 +184,16 @@ static void set_inode_ops(struct inode *inode)
|
||||
mapping_set_gfp_mask(inode->i_mapping, GFP_USER);
|
||||
}
|
||||
|
||||
static void set_item_info(struct inode *inode)
|
||||
{
|
||||
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
|
||||
|
||||
si->have_item = true;
|
||||
si->item_size = i_size_read(inode);
|
||||
si->item_ctime = inode->i_ctime;
|
||||
si->item_mtime = inode->i_mtime;
|
||||
}
|
||||
|
||||
static void load_inode(struct inode *inode, struct scoutfs_inode *cinode)
|
||||
{
|
||||
struct scoutfs_inode_info *ci = SCOUTFS_I(inode);
|
||||
@@ -203,6 +213,8 @@ static void load_inode(struct inode *inode, struct scoutfs_inode *cinode)
|
||||
|
||||
ci->data_version = le64_to_cpu(cinode->data_version);
|
||||
ci->next_readdir_pos = le64_to_cpu(cinode->next_readdir_pos);
|
||||
|
||||
set_item_info(inode);
|
||||
}
|
||||
|
||||
void scoutfs_inode_init_key(struct scoutfs_key_buf *key,
|
||||
@@ -362,6 +374,77 @@ int scoutfs_dirty_inode_item(struct inode *inode)
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Make sure inode index items are kept in sync with the fields that are
|
||||
* set in the inode items. This must be called any time the contents of
|
||||
* the inode items are updated.
|
||||
*
|
||||
* This is effectively a RMW on the inode fields so the caller needs to
|
||||
* lock the inode so that it's the only one working with the index items
|
||||
* for a given set of fields in the inode.
|
||||
*
|
||||
* But it doesn't need to lock the index item keys. By locking the
|
||||
* inode we've ensured that we can safely log deletion and insertion
|
||||
* items in our log. The indexes are eventually consistent so we don't
|
||||
* need to wrap them in locks.
|
||||
*
|
||||
* XXX this needs more supporting work from the rest of the
|
||||
* infrastructure:
|
||||
*
|
||||
* - Deleting and creating the items needs to forcefully set those dirty
|
||||
* items in the cache without first trying to read them from segments.
|
||||
* - the reading ioctl needs to forcefully invalidate the index items
|
||||
* as it walks.
|
||||
* - maybe the reading ioctl needs to verify fields with inodes?
|
||||
* - final inode deletion needs to invalidate the index items for
|
||||
* each inode as it deletes items based on the locked inode fields.
|
||||
* - make sure deletion items safely vanish w/o finding existing item
|
||||
* - ... error handling :(
|
||||
*/
|
||||
static int update_index(struct inode *inode, u8 type, u64 now_major,
			u32 now_minor, u64 then_major, u32 then_minor)
{
	struct scoutfs_inode_info *info = SCOUTFS_I(inode);
	struct super_block *sb = inode->i_sb;
	struct scoutfs_inode_index_key new_ikey;
	struct scoutfs_inode_index_key old_ikey;
	struct scoutfs_key_buf new_key;
	struct scoutfs_key_buf old_key;
	int undo;
	int ret;

	trace_printk("ino %llu have %u now %llu.%u then %llu.%u \n",
		     scoutfs_ino(inode), info->have_item,
		     now_major, now_minor, then_major, then_minor);

	/* an existing index item that already matches needs no work */
	if (info->have_item) {
		if (now_major == then_major && now_minor == then_minor)
			return 0;
	}

	/* insert the item for the current value of the field first */
	new_ikey.ino = cpu_to_be64(scoutfs_ino(inode));
	new_ikey.minor = cpu_to_be32(now_minor);
	new_ikey.major = cpu_to_be64(now_major);
	new_ikey.type = type;
	scoutfs_key_init(&new_key, &new_ikey, sizeof(new_ikey));

	ret = scoutfs_item_create(sb, &new_key, NULL);
	if (ret)
		return ret;

	/* without a previously stored item there's no old entry to delete */
	if (!info->have_item)
		return 0;

	/* then remove the item that described the previous value */
	old_ikey.ino = cpu_to_be64(scoutfs_ino(inode));
	old_ikey.minor = cpu_to_be32(then_minor);
	old_ikey.major = cpu_to_be64(then_major);
	old_ikey.type = type;
	scoutfs_key_init(&old_key, &old_ikey, sizeof(old_ikey));

	ret = scoutfs_item_delete(sb, &old_key);
	if (ret == 0)
		return 0;

	/* the delete failed: back out the insertion so the index is unchanged */
	undo = scoutfs_item_delete(sb, &new_key);
	BUG_ON(undo);

	return ret;
}
|
||||
|
||||
/*
|
||||
* Every time we modify the inode in memory we copy it to its inode
|
||||
* item. This lets us write out items without having to track down
|
||||
@@ -373,13 +456,25 @@ int scoutfs_dirty_inode_item(struct inode *inode)
|
||||
*/
|
||||
void scoutfs_update_inode_item(struct inode *inode)
|
||||
{
|
||||
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
|
||||
struct super_block *sb = inode->i_sb;
|
||||
struct scoutfs_inode_key ikey;
|
||||
struct scoutfs_key_buf key;
|
||||
struct scoutfs_inode sinode;
|
||||
SCOUTFS_DECLARE_KVEC(val);
|
||||
int ret;
|
||||
int err;
|
||||
|
||||
ret = update_index(inode, SCOUTFS_INODE_INDEX_CTIME_KEY,
|
||||
inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
|
||||
si->item_ctime.tv_sec, si->item_ctime.tv_nsec) ?:
|
||||
update_index(inode, SCOUTFS_INODE_INDEX_MTIME_KEY,
|
||||
inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
|
||||
si->item_mtime.tv_sec, si->item_mtime.tv_nsec) ?:
|
||||
update_index(inode, SCOUTFS_INODE_INDEX_SIZE_KEY,
|
||||
i_size_read(inode), 0, si->item_size, 0);
|
||||
BUG_ON(ret);
|
||||
|
||||
store_inode(&sinode, inode);
|
||||
|
||||
scoutfs_inode_init_key(&key, &ikey, scoutfs_ino(inode));
|
||||
@@ -392,6 +487,7 @@ void scoutfs_update_inode_item(struct inode *inode)
|
||||
BUG_ON(err);
|
||||
}
|
||||
|
||||
set_item_info(inode);
|
||||
trace_scoutfs_update_inode(inode);
|
||||
}
|
||||
|
||||
@@ -562,6 +658,7 @@ struct inode *scoutfs_new_inode(struct super_block *sb, struct inode *dir,
|
||||
ci->ino = ino;
|
||||
ci->data_version = 0;
|
||||
ci->next_readdir_pos = SCOUTFS_DIRENT_FIRST_POS;
|
||||
ci->have_item = false;
|
||||
|
||||
inode->i_ino = ino; /* XXX overflow */
|
||||
inode_init_owner(inode, dir, mode);
|
||||
|
||||
@@ -8,6 +8,10 @@ struct scoutfs_inode_info {
|
||||
u64 ino;
|
||||
u64 data_version;
|
||||
u64 next_readdir_pos;
|
||||
bool have_item;
|
||||
u64 item_size;
|
||||
struct timespec item_ctime;
|
||||
struct timespec item_mtime;
|
||||
|
||||
/* initialized once for slab object */
|
||||
seqcount_t seqcount;
|
||||
|
||||
105
kmod/src/ioctl.c
105
kmod/src/ioctl.c
@@ -28,84 +28,81 @@
|
||||
#include "super.h"
|
||||
#include "inode.h"
|
||||
#include "trans.h"
|
||||
#include "item.h"
|
||||
#include "data.h"
|
||||
|
||||
/*
|
||||
* Find all the inodes that have had keys of a given type modified since
|
||||
* a given sequence number. The user's arg struct specifies the inode
|
||||
* range to search within and the sequence value to return results from.
|
||||
* Different ioctls call this for different key types.
|
||||
*
|
||||
* When this is used for file data items the user is trying to find
|
||||
* inodes whose data has changed since a given time in the past.
|
||||
*
|
||||
* XXX We'll need to improve the walk and search to notice when file
|
||||
* data items have been truncated away.
|
||||
*
|
||||
* Inodes and their sequence numbers are copied out to userspace in
|
||||
* inode order, not sequence order.
|
||||
* Walk one of the inode index items. This is a thin ioctl wrapper
|
||||
* around the core item interface.
|
||||
*/
|
||||
static long scoutfs_ioc_inodes_since(struct file *file, unsigned long arg,
|
||||
u8 type)
|
||||
static long scoutfs_ioc_walk_inodes(struct file *file, unsigned long arg)
|
||||
{
|
||||
struct super_block *sb = file_inode(file)->i_sb;
|
||||
struct scoutfs_ioctl_inodes_since __user *uargs = (void __user *)arg;
|
||||
struct scoutfs_ioctl_inodes_since args;
|
||||
struct scoutfs_ioctl_ino_seq __user *uiseq;
|
||||
struct scoutfs_ioctl_ino_seq iseq;
|
||||
struct scoutfs_inode_key last_ikey;
|
||||
struct scoutfs_inode_key ikey;
|
||||
struct scoutfs_key_buf last;
|
||||
struct scoutfs_ioctl_walk_inodes __user *uwalk = (void __user *)arg;
|
||||
struct scoutfs_ioctl_walk_inodes walk;
|
||||
struct scoutfs_ioctl_walk_inodes_entry ent;
|
||||
struct scoutfs_inode_index_key last_ikey;
|
||||
struct scoutfs_inode_index_key ikey;
|
||||
struct scoutfs_key_buf last_key;
|
||||
struct scoutfs_key_buf key;
|
||||
long bytes;
|
||||
u64 seq;
|
||||
int ret;
|
||||
int ret = 0;
|
||||
u32 nr;
|
||||
|
||||
if (copy_from_user(&args, uargs, sizeof(args)))
|
||||
if (copy_from_user(&walk, uwalk, sizeof(walk)))
|
||||
return -EFAULT;
|
||||
|
||||
uiseq = (void __user *)(unsigned long)args.buf_ptr;
|
||||
if (args.buf_len < sizeof(iseq) || args.buf_len > INT_MAX)
|
||||
trace_printk("index %u first %llu.%u.%llu last %llu.%u.%llu\n",
|
||||
walk.index, walk.first.major, walk.first.minor,
|
||||
walk.first.ino, walk.last.major, walk.last.minor,
|
||||
walk.last.ino);
|
||||
|
||||
if (walk.index == SCOUTFS_IOC_WALK_INODES_CTIME)
|
||||
ikey.type = SCOUTFS_INODE_INDEX_CTIME_KEY;
|
||||
else if (walk.index == SCOUTFS_IOC_WALK_INODES_MTIME)
|
||||
ikey.type = SCOUTFS_INODE_INDEX_MTIME_KEY;
|
||||
else if (walk.index == SCOUTFS_IOC_WALK_INODES_SIZE)
|
||||
ikey.type = SCOUTFS_INODE_INDEX_SIZE_KEY;
|
||||
else
|
||||
return -EINVAL;
|
||||
|
||||
scoutfs_inode_init_key(&key, &ikey, args.first_ino);
|
||||
scoutfs_inode_init_key(&last, &last_ikey, args.last_ino);
|
||||
ikey.major = cpu_to_be64(walk.first.major);
|
||||
ikey.minor = cpu_to_be32(walk.first.minor);
|
||||
ikey.ino = cpu_to_be64(walk.first.ino);
|
||||
scoutfs_key_init(&key, &ikey, sizeof(ikey));
|
||||
|
||||
bytes = 0;
|
||||
for (;;) {
|
||||
last_ikey.type = ikey.type;
|
||||
last_ikey.major = cpu_to_be64(walk.last.major);
|
||||
last_ikey.minor = cpu_to_be32(walk.last.minor);
|
||||
last_ikey.ino = cpu_to_be64(walk.last.ino);
|
||||
scoutfs_key_init(&last_key, &last_ikey, sizeof(last_ikey));
|
||||
|
||||
/* XXX item cache needs to search by seq */
|
||||
seq = !!sb;
|
||||
ret = WARN_ON_ONCE(-EINVAL);
|
||||
// ret = scoutfs_item_since(sb, &key, &last, args.seq, &seq, NULL);
|
||||
/* cap nr to the max the ioctl can return to a compat task */
|
||||
walk.nr_entries = min_t(u64, walk.nr_entries, INT_MAX);
|
||||
|
||||
for (nr = 0; nr < walk.nr_entries;
|
||||
nr++, walk.entries_ptr += sizeof(ent)) {
|
||||
|
||||
ret = scoutfs_item_next_same(sb, &key, &last_key, NULL);
|
||||
if (ret < 0) {
|
||||
if (ret == -ENOENT)
|
||||
ret = 0;
|
||||
break;
|
||||
}
|
||||
|
||||
iseq.ino = be64_to_cpu(ikey.ino);
|
||||
iseq.seq = seq;
|
||||
ent.major = be64_to_cpu(ikey.major);
|
||||
ent.minor = be32_to_cpu(ikey.minor);
|
||||
ent.ino = be64_to_cpu(ikey.ino);
|
||||
|
||||
if (copy_to_user(uiseq, &iseq, sizeof(iseq))) {
|
||||
if (copy_to_user((void __user *)walk.entries_ptr, &ent,
|
||||
sizeof(ent))) {
|
||||
ret = -EFAULT;
|
||||
break;
|
||||
}
|
||||
|
||||
uiseq++;
|
||||
bytes += sizeof(iseq);
|
||||
if (bytes + sizeof(iseq) > args.buf_len) {
|
||||
ret = 0;
|
||||
break;
|
||||
}
|
||||
|
||||
last_ikey.ino = cpu_to_be64(iseq.ino + 1);
|
||||
scoutfs_key_inc_cur_len(&key);
|
||||
}
|
||||
|
||||
if (bytes)
|
||||
ret = bytes;
|
||||
|
||||
return ret;
|
||||
return nr ?: ret;
|
||||
}
|
||||
|
||||
struct ino_path_cursor {
|
||||
@@ -419,12 +416,10 @@ out:
|
||||
long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
|
||||
{
|
||||
switch (cmd) {
|
||||
case SCOUTFS_IOC_INODES_SINCE:
|
||||
return scoutfs_ioc_inodes_since(file, arg, SCOUTFS_INODE_KEY);
|
||||
case SCOUTFS_IOC_WALK_INODES:
|
||||
return scoutfs_ioc_walk_inodes(file, arg);
|
||||
case SCOUTFS_IOC_INO_PATH:
|
||||
return scoutfs_ioc_ino_path(file, arg);
|
||||
case SCOUTFS_IOC_INODE_DATA_SINCE:
|
||||
return WARN_ON_ONCE(-EINVAL);
|
||||
case SCOUTFS_IOC_DATA_VERSION:
|
||||
return scoutfs_ioc_data_version(file, arg);
|
||||
case SCOUTFS_IOC_RELEASE:
|
||||
|
||||
@@ -6,25 +6,54 @@ long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
|
||||
/* XXX I have no idea how these are chosen. */
|
||||
#define SCOUTFS_IOCTL_MAGIC 's'
|
||||
|
||||
struct scoutfs_ioctl_ino_seq {
|
||||
struct scoutfs_ioctl_walk_inodes_entry {
|
||||
__u64 major;
|
||||
__u32 minor;
|
||||
__u64 ino;
|
||||
__u64 seq;
|
||||
} __packed;
|
||||
|
||||
struct scoutfs_ioctl_inodes_since {
|
||||
__u64 first_ino;
|
||||
__u64 last_ino;
|
||||
__u64 seq;
|
||||
__u64 buf_ptr;
|
||||
__u32 buf_len;
|
||||
} __packed;
|
||||
|
||||
/*
|
||||
* Adds entries to the user's buffer for each inode whose sequence
|
||||
* number is greater than or equal to the given seq.
|
||||
* Walk inodes in an index that is sorted by one of their fields.
|
||||
*
|
||||
* Each index is built from generic index items that have major and
|
||||
* minor values that are set to the field being indexed. In time
|
||||
* indices, for example, major is seconds and minor is nanoseconds.
|
||||
*
|
||||
* @first The first index entry that can be returned.
|
||||
* @last The last index entry that can be returned.
|
||||
* @entries_ptr Pointer to memory containing buffer for entry results.
|
||||
* @nr_entries The number of entries that can fit in the buffer.
|
||||
* @index Which index to walk, enumerated in _WALK_INODES_ constants.
|
||||
*
|
||||
* To start iterating first can be memset to 0 and last to 0xff. Then
|
||||
* after each set of results first can be set to the last entry returned
|
||||
* and then the fields can be incremented in reverse sort order (ino <
|
||||
* minor < major) as each increasingly significant value wraps around to
|
||||
* 0.
|
||||
*
|
||||
* If first is greater than last then the walk will return 0 entries.
|
||||
*/
|
||||
#define SCOUTFS_IOC_INODES_SINCE _IOW(SCOUTFS_IOCTL_MAGIC, 1, \
|
||||
struct scoutfs_ioctl_inodes_since)
|
||||
/*
 * Argument of SCOUTFS_IOC_WALK_INODES, copied in from userspace by
 * scoutfs_ioc_walk_inodes().
 *
 * The kernel caps nr_entries at INT_MAX, writes one
 * struct scoutfs_ioctl_walk_inodes_entry per index item found at
 * entries_ptr, advancing by the size of an entry each time, and returns
 * the number of entries written.  When no entries were written it
 * returns 0 or a negative errno; an index value that is not one of the
 * known _WALK_INODES_ constants gets -EINVAL.
 */
struct scoutfs_ioctl_walk_inodes {
|
||||
struct scoutfs_ioctl_walk_inodes_entry first;
|
||||
struct scoutfs_ioctl_walk_inodes_entry last;
|
||||
__u64 entries_ptr;
|
||||
__u32 nr_entries;
|
||||
__u8 index;
|
||||
} __packed;
|
||||
|
||||
/*
 * Values for the 'index' field of struct scoutfs_ioctl_walk_inodes.
 * The ioctl maps CTIME, MTIME and SIZE to the matching
 * SCOUTFS_INODE_INDEX_*_KEY item type and rejects every other value,
 * including UNKNOWN, with -EINVAL.
 */
enum {
|
||||
SCOUTFS_IOC_WALK_INODES_CTIME = 0,
|
||||
SCOUTFS_IOC_WALK_INODES_MTIME,
|
||||
SCOUTFS_IOC_WALK_INODES_SIZE,
|
||||
SCOUTFS_IOC_WALK_INODES_UNKNOWN,
|
||||
};
|
||||
|
||||
/*
|
||||
* Adds entries to the user's buffer for each inode that is found in the
|
||||
* given index between the first and last positions.
|
||||
*/
|
||||
#define SCOUTFS_IOC_WALK_INODES _IOW(SCOUTFS_IOCTL_MAGIC, 1, \
|
||||
struct scoutfs_ioctl_walk_inodes)
|
||||
|
||||
/*
|
||||
* Fill the path buffer with the next path to the target inode. An
|
||||
@@ -80,9 +109,6 @@ struct scoutfs_ioctl_ino_path {
|
||||
#define SCOUTFS_IOC_INO_PATH _IOW(SCOUTFS_IOCTL_MAGIC, 2, \
|
||||
struct scoutfs_ioctl_ino_path)
|
||||
|
||||
#define SCOUTFS_IOC_INODE_DATA_SINCE _IOW(SCOUTFS_IOCTL_MAGIC, 3, \
|
||||
struct scoutfs_ioctl_inodes_since)
|
||||
|
||||
#define SCOUTFS_IOC_DATA_VERSION _IOW(SCOUTFS_IOCTL_MAGIC, 4, __u64)
|
||||
|
||||
struct scoutfs_ioctl_release {
|
||||
|
||||
Reference in New Issue
Block a user