diff --git a/kmod/src/data.c b/kmod/src/data.c index cee53511..ce31e642 100644 --- a/kmod/src/data.c +++ b/kmod/src/data.c @@ -732,9 +732,9 @@ static int scoutfs_get_block(struct inode *inode, sector_t iblock, if (ext.len) trace_scoutfs_data_get_block_intersection(sb, &ext); - /* fail read and write if it's offline and we're not staging */ - if ((ext.flags & SEF_OFFLINE) && !si->staging) { - ret = -EINVAL; + /* non-staging callers should have waited on offline blocks */ + if (WARN_ON_ONCE((ext.flags & SEF_OFFLINE) && !si->staging)) { + ret = -EIO; goto out; } @@ -780,14 +780,28 @@ out: /* * This is almost never used. We can't block on a cluster lock while * holding the page lock because lock invalidation gets the page lock - * while blocking locks. If we can't use an existing lock then we drop - * the page lock and try again. + * while blocking locks. If a non blocking lock attempt fails we unlock + * the page and block acquiring the lock. We unlocked the page so it + * could have been truncated away, or whatever, so we return + * AOP_TRUNCATED_PAGE to have the caller try again. + * + * A similar process happens if we try to read from an offline extent + * that a caller hasn't already waited for. Instead of blocking + * acquiring the lock we block waiting for the offline extent. The page + * lock protects the page from release while we're checking and + * reading the extent. + * + * We can return errors from locking and checking offline extents. The + * page is unlocked if we return an error. 
*/ static int scoutfs_readpage(struct file *file, struct page *page) { struct inode *inode = file->f_inode; + struct scoutfs_inode_info *si = SCOUTFS_I(inode); struct super_block *sb = inode->i_sb; struct scoutfs_lock *inode_lock = NULL; + SCOUTFS_DECLARE_PER_TASK_ENTRY(pt_ent); + DECLARE_DATA_WAIT(dw); int flags; int ret; @@ -809,27 +823,77 @@ static int scoutfs_readpage(struct file *file, struct page *page) return ret; } + if (scoutfs_per_task_add_excl(&si->pt_data_lock, &pt_ent, inode_lock)) { + ret = scoutfs_data_wait_check(inode, page_offset(page), + PAGE_CACHE_SIZE, SEF_OFFLINE, + SCOUTFS_IOC_DWO_READ, &dw, + inode_lock); + if (ret != 0) { + unlock_page(page); + scoutfs_per_task_del(&si->pt_data_lock, &pt_ent); + scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_READ); + } + if (ret > 0) { + ret = scoutfs_data_wait(inode, &dw); + if (ret == 0) + ret = AOP_TRUNCATED_PAGE; + } + if (ret != 0) + return ret; + } + ret = mpage_readpage(page, scoutfs_get_block); + scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_READ); + scoutfs_per_task_del(&si->pt_data_lock, &pt_ent); + return ret; } +/* + * This is used for opportunistic read-ahead which can throw the pages + * away if it needs to. If the caller didn't deal with offline extents + * then we drop those pages rather than trying to wait. Whoever is + * staging offline extents should be doing it in enormous chunks so that + * read-ahead can ramp up within each staged region. The check for + * offline extents is cheap when the inode has no offline extents. 
+ */ static int scoutfs_readpages(struct file *file, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { struct inode *inode = file->f_inode; struct super_block *sb = inode->i_sb; struct scoutfs_lock *inode_lock = NULL; + struct page *page; + struct page *tmp; int ret; ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ, SCOUTFS_LKF_REFRESH_INODE, inode, &inode_lock); if (ret) - return ret; + goto out; + + list_for_each_entry_safe(page, tmp, pages, lru) { + ret = scoutfs_data_wait_check(inode, page_offset(page), + PAGE_CACHE_SIZE, SEF_OFFLINE, + SCOUTFS_IOC_DWO_READ, NULL, + inode_lock); + if (ret < 0) + goto out; + if (ret > 0) { + list_del(&page->lru); + page_cache_release(page); + if (--nr_pages == 0) { + ret = 0; + goto out; + } + } + } ret = mpage_readpages(mapping, pages, nr_pages, scoutfs_get_block); - +out: scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_READ); + BUG_ON(!list_empty(pages)); return ret; } @@ -1249,6 +1313,239 @@ out: return ret; } +/* + * Insert a new waiter. This supports multiple tasks waiting for the + * same ino and iblock by also comparing waiters by their addresses. 
+ */ +static void insert_offline_waiting(struct rb_root *root, + struct scoutfs_data_wait *ins) +{ + struct rb_node **node = &root->rb_node; + struct rb_node *parent = NULL; + struct scoutfs_data_wait *dw; + int cmp; + + while (*node) { + parent = *node; + dw = rb_entry(*node, struct scoutfs_data_wait, node); + + cmp = scoutfs_cmp_u64s(ins->ino, dw->ino) ?: + scoutfs_cmp_u64s(ins->iblock, dw->iblock) ?: + scoutfs_cmp(ins, dw); + if (cmp < 0) + node = &(*node)->rb_left; + else + node = &(*node)->rb_right; + } + + rb_link_node(&ins->node, parent, node); + rb_insert_color(&ins->node, root); +} + +static struct scoutfs_data_wait *next_data_wait(struct rb_root *root, u64 ino, + u64 iblock) +{ + struct rb_node **node = &root->rb_node; + struct rb_node *parent = NULL; + struct scoutfs_data_wait *next = NULL; + struct scoutfs_data_wait *dw; + int cmp; + + while (*node) { + parent = *node; + dw = rb_entry(*node, struct scoutfs_data_wait, node); + + /* go left when ino/iblock are equal to get first task */ + cmp = scoutfs_cmp_u64s(ino, dw->ino) ?: + scoutfs_cmp_u64s(iblock, dw->iblock); + if (cmp <= 0) { + node = &(*node)->rb_left; + next = dw; + } else if (cmp > 0) { + node = &(*node)->rb_right; + } + } + + return next; +} + +static struct scoutfs_data_wait *dw_next(struct scoutfs_data_wait *dw) +{ + struct rb_node *node = rb_next(&dw->node); + if (node) + return container_of(node, struct scoutfs_data_wait, node); + return NULL; +} + +/* + * Check if we should wait by looking for extents whose flags match. + * Returns 0 if no extents matched, or a negative errno on error. + * + * The caller must have locked the extents before calling, both across + * mounts and within this mount. + * + * Returns 1 if any file extents in the caller's region matched. If the + * wait struct is provided then it is initialized to be woken when the + * extents change after the caller unlocks after the check. 
The caller + * must come through _data_wait() to clean up the wait struct if we set + * it up. + */ +int scoutfs_data_wait_check(struct inode *inode, loff_t pos, loff_t len, + u8 sef, u8 op, struct scoutfs_data_wait *dw, + struct scoutfs_lock *lock) +{ + struct super_block *sb = inode->i_sb; + DECLARE_DATA_WAIT_ROOT(sb, rt); + DECLARE_DATA_WAITQ(inode, wq); + struct scoutfs_extent ext = {0,}; + u64 iblock; + u64 last_block; + u64 on; + u64 off; + int ret = 0; + + if (WARN_ON_ONCE(sef & SEF_UNKNOWN) || + WARN_ON_ONCE(op & SCOUTFS_IOC_DWO_UNKNOWN) || + WARN_ON_ONCE(dw && !RB_EMPTY_NODE(&dw->node)) || + WARN_ON_ONCE(pos + len < pos)) { + ret = -EINVAL; + goto out; + } + + if ((sef & SEF_OFFLINE)) { + scoutfs_inode_get_onoff(inode, &on, &off); + if (off == 0) { + ret = 0; + goto out; + } + } + + iblock = pos >> SCOUTFS_BLOCK_SHIFT; + last_block = (pos + len - 1) >> SCOUTFS_BLOCK_SHIFT; + + while(iblock <= last_block) { + scoutfs_extent_init(&ext, SCOUTFS_FILE_EXTENT_TYPE, + scoutfs_ino(inode), iblock, 1, 0, 0); + ret = scoutfs_extent_next(sb, data_extent_io, &ext, lock); + if (ret < 0) { + if (ret == -ENOENT) + ret = 0; + break; + } + + if (ext.start > last_block) + break; + + if (sef & ext.flags) { + if (dw) { + dw->chg = atomic64_read(&wq->changed); + dw->ino = scoutfs_ino(inode); + dw->iblock = max(iblock, ext.start); + dw->op = op; + + spin_lock(&rt->lock); + insert_offline_waiting(&rt->root, dw); + spin_unlock(&rt->lock); + } + + ret = 1; + break; + } + + iblock = ext.start + ext.len; + } + +out: + trace_scoutfs_data_wait_check(sb, scoutfs_ino(inode), pos, len, + sef, op, ext.start, ext.len, ext.flags, + ret); + return ret; +} + +bool scoutfs_data_wait_found(struct scoutfs_data_wait *dw) +{ + return !RB_EMPTY_NODE(&dw->node); +} + +int scoutfs_data_wait_check_iov(struct inode *inode, const struct iovec *iov, + unsigned long nr_segs, loff_t pos, u8 sef, + u8 op, struct scoutfs_data_wait *dw, + struct scoutfs_lock *lock) +{ + unsigned long i; + int ret = 0; + + for 
(i = 0; i < nr_segs; i++) { + if (iov[i].iov_len == 0) + continue; + + ret = scoutfs_data_wait_check(inode, pos, iov[i].iov_len, sef, + op, dw, lock); + if (ret != 0) + break; + + pos += iov[i].iov_len; + } + + return ret; +} + +int scoutfs_data_wait(struct inode *inode, struct scoutfs_data_wait *dw) +{ + DECLARE_DATA_WAIT_ROOT(inode->i_sb, rt); + DECLARE_DATA_WAITQ(inode, wq); + int ret; + + ret = wait_event_interruptible(wq->waitq, + atomic64_read(&wq->changed) != dw->chg); + + spin_lock(&rt->lock); + rb_erase(&dw->node, &rt->root); + RB_CLEAR_NODE(&dw->node); + spin_unlock(&rt->lock); + + return ret; +} + +void scoutfs_data_wait_changed(struct inode *inode) +{ + DECLARE_DATA_WAITQ(inode, wq); + + atomic64_inc(&wq->changed); + wake_up(&wq->waitq); +} + +int scoutfs_data_waiting(struct super_block *sb, u64 ino, u64 iblock, + struct scoutfs_ioctl_data_waiting_entry *dwe, + unsigned int nr) +{ + DECLARE_DATA_WAIT_ROOT(sb, rt); + struct scoutfs_data_wait *dw; + int ret = 0; + + spin_lock(&rt->lock); + + dw = next_data_wait(&rt->root, ino, iblock); + while (dw && ret < nr) { + + dwe->ino = dw->ino; + dwe->iblock = dw->iblock; + dwe->op = dw->op; + + while ((dw = dw_next(dw)) && + (dw->ino == dwe->ino && dw->iblock == dwe->iblock)) { + dwe->op |= dw->op; + } + + dwe++; + ret++; + } + + spin_unlock(&rt->lock); + + return ret; +} + const struct address_space_operations scoutfs_file_aops = { .readpage = scoutfs_readpage, .readpages = scoutfs_readpages, diff --git a/kmod/src/data.h b/kmod/src/data.h index bd9f84fa..d45114da 100644 --- a/kmod/src/data.h +++ b/kmod/src/data.h @@ -1,6 +1,41 @@ #ifndef _SCOUTFS_FILERW_H_ #define _SCOUTFS_FILERW_H_ +struct scoutfs_lock; +struct scoutfs_ioctl_data_waiting_entry; + +struct scoutfs_data_wait_root { + spinlock_t lock; + struct rb_root root; +}; + +#define DECLARE_DATA_WAIT_ROOT(sb, nm) \ + struct scoutfs_data_wait_root *nm = &SCOUTFS_SB(sb)->data_wait_root + +struct scoutfs_data_waitq { + atomic64_t changed; + wait_queue_head_t 
waitq; +}; + +#define DECLARE_DATA_WAITQ(in, nm) \ + struct scoutfs_data_waitq *nm = &SCOUTFS_I(in)->data_waitq + +/* + * Tasks can wait for data extents. + */ +struct scoutfs_data_wait { + struct rb_node node; + u64 chg; + u64 ino; + u64 iblock; + u8 op; +}; + +#define DECLARE_DATA_WAIT(nm) \ + struct scoutfs_data_wait nm = { \ + .node.__rb_parent_color = (unsigned long)(&nm.node), \ + } + extern const struct address_space_operations scoutfs_file_aops; extern const struct file_operations scoutfs_file_fops; @@ -11,6 +46,21 @@ int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len); +int scoutfs_data_wait_check(struct inode *inode, loff_t pos, loff_t len, + u8 sef, u8 op, struct scoutfs_data_wait *ow, + struct scoutfs_lock *lock); +int scoutfs_data_wait_check_iov(struct inode *inode, const struct iovec *iov, + unsigned long nr_segs, loff_t pos, u8 sef, + u8 op, struct scoutfs_data_wait *ow, + struct scoutfs_lock *lock); +bool scoutfs_data_wait_found(struct scoutfs_data_wait *ow); +int scoutfs_data_wait(struct inode *inode, + struct scoutfs_data_wait *ow); +void scoutfs_data_wait_changed(struct inode *inode); +int scoutfs_data_waiting(struct super_block *sb, u64 ino, u64 iblock, + struct scoutfs_ioctl_data_waiting_entry *dwe, + unsigned int nr); + int scoutfs_data_setup(struct super_block *sb); void scoutfs_data_destroy(struct super_block *sb); diff --git a/kmod/src/file.c b/kmod/src/file.c index f78e5721..765e5f1e 100644 --- a/kmod/src/file.c +++ b/kmod/src/file.c @@ -39,15 +39,40 @@ ssize_t scoutfs_file_aio_read(struct kiocb *iocb, const struct iovec *iov, struct super_block *sb = inode->i_sb; struct scoutfs_lock *inode_lock = NULL; SCOUTFS_DECLARE_PER_TASK_ENTRY(pt_ent); + DECLARE_DATA_WAIT(dw); int ret; +retry: ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ, SCOUTFS_LKF_REFRESH_INODE, inode, &inode_lock); - if (ret == 0) { - 
scoutfs_per_task_add(&si->pt_data_lock, &pt_ent, inode_lock); - ret = generic_file_aio_read(iocb, iov, nr_segs, pos); - scoutfs_per_task_del(&si->pt_data_lock, &pt_ent); - scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_READ); + if (ret) + goto out; + + if (scoutfs_per_task_add_excl(&si->pt_data_lock, &pt_ent, inode_lock)) { + /* protect checked extents from stage/release */ + mutex_lock(&inode->i_mutex); + atomic_inc(&inode->i_dio_count); + mutex_unlock(&inode->i_mutex); + + ret = scoutfs_data_wait_check_iov(inode, iov, nr_segs, pos, + SEF_OFFLINE, + SCOUTFS_IOC_DWO_READ, + &dw, inode_lock); + if (ret != 0) + goto out; + } + + ret = generic_file_aio_read(iocb, iov, nr_segs, pos); + +out: + if (scoutfs_per_task_del(&si->pt_data_lock, &pt_ent)) + inode_dio_done(inode); + scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_READ); + + if (scoutfs_data_wait_found(&dw)) { + ret = scoutfs_data_wait(inode, &dw); + if (ret == 0) + goto retry; } return ret; @@ -62,11 +87,13 @@ ssize_t scoutfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, struct super_block *sb = inode->i_sb; struct scoutfs_lock *inode_lock = NULL; SCOUTFS_DECLARE_PER_TASK_ENTRY(pt_ent); + DECLARE_DATA_WAIT(dw); int ret; if (iocb->ki_left == 0) /* Does this even happen? 
*/ return 0; +retry: mutex_lock(&inode->i_mutex); ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_WRITE, SCOUTFS_LKF_REFRESH_INODE, inode, &inode_lock); @@ -77,16 +104,31 @@ ssize_t scoutfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, if (ret) goto out; - scoutfs_per_task_add(&si->pt_data_lock, &pt_ent, inode_lock); + if (scoutfs_per_task_add_excl(&si->pt_data_lock, &pt_ent, inode_lock)) { + /* data_version is per inode, whole file must be online */ + ret = scoutfs_data_wait_check(inode, 0, i_size_read(inode), + SEF_OFFLINE, + SCOUTFS_IOC_DWO_WRITE, + &dw, inode_lock); + if (ret != 0) + goto out; + } /* XXX: remove SUID bit */ ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); + out: scoutfs_per_task_del(&si->pt_data_lock, &pt_ent); scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE); mutex_unlock(&inode->i_mutex); + if (scoutfs_data_wait_found(&dw)) { + ret = scoutfs_data_wait(inode, &dw); + if (ret == 0) + goto retry; + } + if (ret > 0 || ret == -EIOCBQUEUED) { ssize_t err; diff --git a/kmod/src/format.h b/kmod/src/format.h index 936b0717..9fcbc082 100644 --- a/kmod/src/format.h +++ b/kmod/src/format.h @@ -390,8 +390,9 @@ struct scoutfs_file_extent { __u8 flags; } __packed; -#define SEF_OFFLINE 0x1 -#define SEF_UNWRITTEN 0x2 +#define SEF_OFFLINE (1 << 0) +#define SEF_UNWRITTEN (1 << 1) +#define SEF_UNKNOWN (U8_MAX << 2) /* * The first xattr part item has a header that describes the xattr. 
The diff --git a/kmod/src/inode.c b/kmod/src/inode.c index b9e6fcf9..3bdc756a 100644 --- a/kmod/src/inode.c +++ b/kmod/src/inode.c @@ -70,6 +70,8 @@ static void scoutfs_inode_ctor(void *obj) seqcount_init(&ci->seqcount); ci->staging = false; scoutfs_per_task_init(&ci->pt_data_lock); + atomic64_set(&ci->data_waitq.changed, 0); + init_waitqueue_head(&ci->data_waitq.waitq); init_rwsem(&ci->xattr_rwsem); RB_CLEAR_NODE(&ci->writeback_node); spin_lock_init(&ci->ino_alloc.lock); @@ -340,6 +342,9 @@ static int set_inode_size(struct inode *inode, struct scoutfs_lock *lock, if (ret) return ret; + if (new_size != i_size_read(inode)) + scoutfs_inode_inc_data_version(inode); + truncate_setsize(inode, new_size); inode->i_ctime = inode->i_mtime = CURRENT_TIME; if (truncate) @@ -394,11 +399,22 @@ int scoutfs_complete_truncate(struct inode *inode, struct scoutfs_lock *lock) return ret ? ret : err; } +/* + * If we're changing the file size then the contents of the file are + * changing and we increment the data_version. This would prevent + * staging because the data_version is per-inode today, not per-extent. + * So if there are any offline extents within the new size then we need + * to stage them before we truncate. And this is called with the + * i_mutex held which would prevent staging so we release it and + * re-acquire it. Ideally we'd fix this so that we can acquire the lock + * instead of the caller. 
+ */ int scoutfs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; struct super_block *sb = inode->i_sb; struct scoutfs_lock *lock = NULL; + DECLARE_DATA_WAIT(dw); LIST_HEAD(ind_locks); bool truncate = false; u64 attr_size; @@ -406,6 +422,7 @@ int scoutfs_setattr(struct dentry *dentry, struct iattr *attr) trace_scoutfs_setattr(dentry, attr); +retry: ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_WRITE, SCOUTFS_LKF_REFRESH_INODE, inode, &lock); if (ret) @@ -427,6 +444,28 @@ int scoutfs_setattr(struct dentry *dentry, struct iattr *attr) if (ret) goto out; + /* data_version is per inode, all must be online */ + if (attr_size > 0 && attr_size != i_size_read(inode)) { + ret = scoutfs_data_wait_check(inode, 0, attr_size, + SEF_OFFLINE, + SCOUTFS_IOC_DWO_CHANGE_SIZE, + &dw, lock); + if (ret < 0) + goto out; + if (scoutfs_data_wait_found(&dw)) { + scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE); + + /* XXX callee locks instead? */ + mutex_unlock(&inode->i_mutex); + ret = scoutfs_data_wait(inode, &dw); + mutex_lock(&inode->i_mutex); + + if (ret == 0) + goto retry; + goto out; + } + } + /* truncating to current size truncates extents past size */ truncate = i_size_read(inode) >= attr_size; @@ -532,6 +571,10 @@ void scoutfs_inode_add_onoff(struct inode *inode, s64 on, s64 off) write_seqcount_end(&si->seqcount); preempt_enable(); } + + /* any time offline extents decreased we try and wake waiters */ + if (inode && off < 0) + scoutfs_data_wait_changed(inode); } static u64 read_seqcount_u64(struct inode *inode, u64 *val) diff --git a/kmod/src/inode.h b/kmod/src/inode.h index 7ae34de8..0ccd0184 100644 --- a/kmod/src/inode.h +++ b/kmod/src/inode.h @@ -6,6 +6,7 @@ #include "per_task.h" #include "count.h" #include "format.h" +#include "data.h" struct scoutfs_lock; @@ -48,8 +49,10 @@ struct scoutfs_inode_info { seqcount_t seqcount; bool staging; /* holder of i_mutex is staging */ struct scoutfs_per_task pt_data_lock; + struct scoutfs_data_waitq 
data_waitq; struct rw_semaphore xattr_rwsem; struct rb_node writeback_node; + struct inode inode; }; diff --git a/kmod/src/ioctl.c b/kmod/src/ioctl.c index 2c1fa74e..738173e9 100644 --- a/kmod/src/ioctl.c +++ b/kmod/src/ioctl.c @@ -541,6 +541,56 @@ static long scoutfs_ioc_item_cache_keys(struct file *file, unsigned long arg) return ret ?: total; } +static bool inc_wrapped(u64 *ino, u64 *iblock) +{ + return (++(*iblock) == 0) && (++(*ino) == 0); +} + +static long scoutfs_ioc_data_waiting(struct file *file, unsigned long arg) +{ + struct super_block *sb = file_inode(file)->i_sb; + struct scoutfs_ioctl_data_waiting idw; + struct scoutfs_ioctl_data_waiting_entry __user *udwe; + struct scoutfs_ioctl_data_waiting_entry dwe[16]; + unsigned int nr; + int total; + int ret; + + if (copy_from_user(&idw, (void __user *)arg, sizeof(idw))) + return -EFAULT; + + if (idw.flags & SCOUTFS_IOC_DATA_WAITING_FLAGS_UNKNOWN) + return -EINVAL; + + udwe = (void __user *)(long)idw.ents_ptr; + total = 0; + ret = 0; + while (idw.ents_nr && !inc_wrapped(&idw.after_ino, &idw.after_iblock)) { + nr = min_t(size_t, idw.ents_nr, ARRAY_SIZE(dwe)); + + ret = scoutfs_data_waiting(sb, idw.after_ino, idw.after_iblock, + dwe, nr); + BUG_ON(ret > nr); /* stack overflow \o/ */ + if (ret <= 0) + break; + + if (copy_to_user(udwe, dwe, ret * sizeof(dwe[0]))) { + ret = -EFAULT; + break; + } + + idw.after_ino = dwe[ret - 1].ino; + idw.after_iblock = dwe[ret - 1].iblock; + + udwe += ret; + idw.ents_nr -= ret; + total += ret; + ret = 0; + } + + return ret ?: total; +} + long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { switch (cmd) { @@ -556,6 +606,8 @@ long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return scoutfs_ioc_stat_more(file, arg); case SCOUTFS_IOC_ITEM_CACHE_KEYS: return scoutfs_ioc_item_cache_keys(file, arg); + case SCOUTFS_IOC_DATA_WAITING: + return scoutfs_ioc_data_waiting(file, arg); } return -ENOTTY; diff --git a/kmod/src/ioctl.h 
b/kmod/src/ioctl.h index 915a130b..1b592522 100644 --- a/kmod/src/ioctl.h +++ b/kmod/src/ioctl.h @@ -229,4 +229,28 @@ enum { #define SCOUTFS_IOC_ITEM_CACHE_KEYS _IOW(SCOUTFS_IOCTL_MAGIC, 8, \ struct scoutfs_ioctl_item_cache_keys) +struct scoutfs_ioctl_data_waiting_entry { + __u64 ino; + __u64 iblock; + __u8 op; +} __packed; + +#define SCOUTFS_IOC_DWO_READ (1 << 0) +#define SCOUTFS_IOC_DWO_WRITE (1 << 1) +#define SCOUTFS_IOC_DWO_CHANGE_SIZE (1 << 2) +#define SCOUTFS_IOC_DWO_UNKNOWN (U8_MAX << 3) + +struct scoutfs_ioctl_data_waiting { + __u64 flags; + __u64 after_ino; + __u64 after_iblock; + __u64 ents_ptr; + __u16 ents_nr; +} __packed; + +#define SCOUTFS_IOC_DATA_WAITING_FLAGS_UNKNOWN (U8_MAX << 0) + +#define SCOUTFS_IOC_DATA_WAITING _IOW(SCOUTFS_IOCTL_MAGIC, 9, \ + struct scoutfs_ioctl_data_waiting) + #endif diff --git a/kmod/src/lock.c b/kmod/src/lock.c index d268dd82..6697ec66 100644 --- a/kmod/src/lock.c +++ b/kmod/src/lock.c @@ -32,6 +32,7 @@ #include "triggers.h" #include "tseq.h" #include "client.h" +#include "data.h" /* * scoutfs uses a lock service to manage item cache consistency between @@ -126,8 +127,10 @@ static void invalidate_inode(struct super_block *sb, u64 ino) inode = scoutfs_ilookup(sb, ino); if (inode) { scoutfs_inc_counter(sb, lock_invalidate_inode); - if (S_ISREG(inode->i_mode)) + if (S_ISREG(inode->i_mode)) { truncate_inode_pages(inode->i_mapping, 0); + scoutfs_data_wait_changed(inode); + } iput(inode); } } diff --git a/kmod/src/scoutfs_trace.h b/kmod/src/scoutfs_trace.h index 5e85305e..08d4572b 100644 --- a/kmod/src/scoutfs_trace.h +++ b/kmod/src/scoutfs_trace.h @@ -514,6 +514,45 @@ TRACE_EVENT(scoutfs_data_truncate_items, __entry->iblock, __entry->last, __entry->offline) ); +TRACE_EVENT(scoutfs_data_wait_check, + TP_PROTO(struct super_block *sb, __u64 ino, __u64 pos, __u64 len, + __u8 sef, __u8 op, __u64 ext_start, __u64 ext_len, + __u8 ext_flags, int ret), + + TP_ARGS(sb, ino, pos, len, sef, op, ext_start, ext_len, ext_flags, ret), + + 
TP_STRUCT__entry( + __field(__u64, fsid) + __field(__u64, ino) + __field(__u64, pos) + __field(__u64, len) + __field(__u8, sef) + __field(__u8, op) + __field(__u64, ext_start) + __field(__u64, ext_len) + __field(__u8, ext_flags) + __field(int, ret) + ), + + TP_fast_assign( + __entry->fsid = FSID_ARG(sb); + __entry->ino = ino; + __entry->pos = pos; + __entry->len = len; + __entry->sef = sef; + __entry->op = op; + __entry->ext_start = ext_start; + __entry->ext_len = ext_len; + __entry->ext_flags = ext_flags; + __entry->ret = ret; + ), + + TP_printk(FSID_FMT" ino %llu pos %llu len %llu sef 0x%x op 0x%x ext_start %llu ext_len %llu ext_flags 0x%x ret %d", + __entry->fsid, __entry->ino, __entry->pos, __entry->len, + __entry->sef, __entry->op, __entry->ext_start, + __entry->ext_len, __entry->ext_flags, __entry->ret) +); + TRACE_EVENT(scoutfs_sync_fs, TP_PROTO(struct super_block *sb, int wait), diff --git a/kmod/src/super.c b/kmod/src/super.c index 02f38934..ff39a6fb 100644 --- a/kmod/src/super.c +++ b/kmod/src/super.c @@ -339,6 +339,8 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent) spin_lock_init(&sbi->next_ino_lock); init_waitqueue_head(&sbi->trans_hold_wq); + spin_lock_init(&sbi->data_wait_root.lock); + sbi->data_wait_root.root = RB_ROOT; spin_lock_init(&sbi->trans_write_lock); INIT_DELAYED_WORK(&sbi->trans_write_work, scoutfs_trans_write_func); init_waitqueue_head(&sbi->trans_write_wq); diff --git a/kmod/src/super.h b/kmod/src/super.h index e24f4d9a..6dd03ac6 100644 --- a/kmod/src/super.h +++ b/kmod/src/super.h @@ -6,6 +6,7 @@ #include "format.h" #include "options.h" +#include "data.h" struct scoutfs_counters; struct scoutfs_triggers; @@ -49,6 +50,9 @@ struct scoutfs_sb_info { wait_queue_head_t trans_hold_wq; struct task_struct *trans_task; + /* tracks tasks waiting for data extents */ + struct scoutfs_data_wait_root data_wait_root; + spinlock_t trans_write_lock; u64 trans_write_count; u64 trans_seq;