From c6b688c2bf13aaf17c4ab1f433578aabd4dab9b1 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Tue, 15 Nov 2016 15:45:02 -0800 Subject: [PATCH] Add staging ioctl This adds the ioctl for writing archived file contents back into the file if the data_version still matches. Signed-off-by: Zach Brown Reviewed-by: Mark Fasheh --- kmod/src/filerw.c | 7 ++++ kmod/src/inode.c | 13 +++--- kmod/src/inode.h | 3 ++ kmod/src/ioctl.c | 101 ++++++++++++++++++++++++++++++++++++++++++++++ kmod/src/ioctl.h | 10 +++++ 5 files changed, 129 insertions(+), 5 deletions(-) diff --git a/kmod/src/filerw.c b/kmod/src/filerw.c index 727737cd..5cdce097 100644 --- a/kmod/src/filerw.c +++ b/kmod/src/filerw.c @@ -331,6 +331,7 @@ static int contig_mapped_blocks(struct inode *inode, u64 iblock, u64 *blkno) */ static int map_writable_block(struct inode *inode, u64 iblock, u64 *blkno_ret) { + struct scoutfs_inode_info *si = SCOUTFS_I(inode); struct super_block *sb = inode->i_sb; struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); struct scoutfs_super_block *super = &sbi->stable_super; @@ -367,6 +368,12 @@ static int map_writable_block(struct inode *inode, u64 iblock, u64 *blkno_ret) goto out; inserted = true; } else { + if ((extent.flags & SCOUTFS_EXTENT_FLAG_OFFLINE) && + !si->staging) { + ret = -EINVAL; + goto out; + } + ret = scoutfs_btree_dirty(sb, meta, &key); if (ret) goto out; diff --git a/kmod/src/inode.c b/kmod/src/inode.c index 0d98d8bf..f990dcf9 100644 --- a/kmod/src/inode.c +++ b/kmod/src/inode.c @@ -153,11 +153,13 @@ void scoutfs_inode_inc_data_version(struct inode *inode) { struct scoutfs_inode_info *si = SCOUTFS_I(inode); - preempt_disable(); - write_seqcount_begin(&si->seqcount); - si->data_version++; - write_seqcount_end(&si->seqcount); - preempt_enable(); + if (!si->staging) { + preempt_disable(); + write_seqcount_begin(&si->seqcount); + si->data_version++; + write_seqcount_end(&si->seqcount); + preempt_enable(); + } } u64 scoutfs_inode_get_data_version(struct inode *inode) @@ -395,6 +397,7 
@@ struct inode *scoutfs_new_inode(struct super_block *sb, struct inode *dir, ci->ino = ino; seqcount_init(&ci->seqcount); ci->data_version = 0; + ci->staging = false; get_random_bytes(&ci->salt, sizeof(ci->salt)); atomic64_set(&ci->link_counter, 0); diff --git a/kmod/src/inode.h b/kmod/src/inode.h index 1becde1d..f0f74024 100644 --- a/kmod/src/inode.h +++ b/kmod/src/inode.h @@ -8,6 +8,9 @@ struct scoutfs_inode_info { seqcount_t seqcount; u64 data_version; + /* holder of i_mutex is staging */ + bool staging; + atomic64_t link_counter; struct rw_semaphore xattr_rwsem; diff --git a/kmod/src/ioctl.c b/kmod/src/ioctl.c index 3144f940..d40cfae1 100644 --- a/kmod/src/ioctl.c +++ b/kmod/src/ioctl.c @@ -18,6 +18,8 @@ #include #include #include +#include +#include #include "format.h" #include "btree.h" @@ -373,6 +375,103 @@ out: return ret; } +/* + * Write the archived contents of the file back if the data_version + * still matches. + * + * This is a data plane operation only. We don't want the write to + * change any fields in the inode. It only changes the file contents. + * + * Keep in mind that the staging writes can easily span transactions and + * can crash partway through. If we called the normal write path and + * restored the inode afterwards the modified inode could be committed + * partway through by a transaction and then left that way by a crash + * before the write finishes and we restore the fields. It also + * wouldn't be great if the temporarily updated inode was visible to + * paths that don't serialize with write. + * + * We're implementing the buffered write path down to the start of + * generic_file_buffered_write() without all the stuff that would + * change the inode: file_remove_suid(), file_update_time(). The + * easiest way to do that is to call generic_file_buffered_write(). + * We're careful to only allow staging writes inside i_size. 
+ * + * We set a bool on the inode which tells our code to update the + * offline extents and to not update the data_version counter. + * + * This doesn't support any fancy write modes or side-effects: aio, + * direct, append, sync, breaking suid, sending rlimit signals. + */ +static long scoutfs_ioc_stage(struct file *file, unsigned long arg) +{ + struct inode *inode = file_inode(file); + struct address_space *mapping = inode->i_mapping; + struct scoutfs_inode_info *si = SCOUTFS_I(inode); + struct scoutfs_ioctl_stage args; + struct kiocb kiocb; + struct iovec iov; + size_t written; + loff_t pos; + int ret; + + if (copy_from_user(&args, (void __user *)arg, sizeof(args))) + return -EFAULT; + + if (args.count < 0 || (args.offset + args.count < args.offset)) + return -EINVAL; + if (args.count == 0) + return 0; + + /* the iocb is really only used for the file pointer :P */ + init_sync_kiocb(&kiocb, file); + kiocb.ki_pos = args.offset; + kiocb.ki_left = args.count; + kiocb.ki_nbytes = args.count; + iov.iov_base = (void __user *)(unsigned long)args.buf_ptr; + iov.iov_len = args.count; + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + mutex_lock(&inode->i_mutex); + + if (!S_ISREG(inode->i_mode) || + !(file->f_mode & FMODE_WRITE) || + (file->f_flags & (O_APPEND | O_DIRECT | O_DSYNC)) || + IS_SYNC(file->f_mapping->host) || + (args.offset + args.count > i_size_read(inode))) { + ret = -EINVAL; + goto out; + } + + if (scoutfs_inode_get_data_version(inode) != args.data_version) { + ret = -ESTALE; + goto out; + } + + si->staging = true; + current->backing_dev_info = mapping->backing_dev_info; + + pos = args.offset; + written = 0; + do { + ret = generic_file_buffered_write(&kiocb, &iov, 1, pos, &pos, + args.count, written); + BUG_ON(ret == -EIOCBQUEUED); + if (ret > 0) + written += ret; + } while (ret > 0 && written < args.count); + + si->staging = false; + current->backing_dev_info = NULL; +out: + mutex_unlock(&inode->i_mutex); + mnt_drop_write_file(file); + + 
return ret; +} + long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { switch (cmd) { @@ -390,6 +489,8 @@ long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return scoutfs_ioc_data_version(file, arg); case SCOUTFS_IOC_RELEASE: return scoutfs_ioc_release(file, arg); + case SCOUTFS_IOC_STAGE: + return scoutfs_ioc_stage(file, arg); } return -ENOTTY; diff --git a/kmod/src/ioctl.h b/kmod/src/ioctl.h index 1be28b9b..d39c6272 100644 --- a/kmod/src/ioctl.h +++ b/kmod/src/ioctl.h @@ -67,4 +67,14 @@ struct scoutfs_ioctl_release { #define SCOUTFS_IOC_RELEASE _IOW(SCOUTFS_IOCTL_MAGIC, 7, \ struct scoutfs_ioctl_release) +struct scoutfs_ioctl_stage { + __u64 data_version; + __u64 buf_ptr; + __u64 offset; + __s32 count; +} __packed; + +#define SCOUTFS_IOC_STAGE _IOW(SCOUTFS_IOCTL_MAGIC, 8, \ + struct scoutfs_ioctl_stage) + #endif