diff --git a/kmod/src/data.c b/kmod/src/data.c
index 6e110e3a..52a42f38 100644
--- a/kmod/src/data.c
+++ b/kmod/src/data.c
@@ -39,6 +39,7 @@
 #include "msg.h"
 #include "count.h"
 #include "ext.h"
+#include "util.h"
 
 /*
  * We want to amortize work done after dirtying the shared transaction
@@ -1103,6 +1104,257 @@ out:
 	return ret;
 }
 
+/*
+ * We're using truncate_inode_pages_range to maintain consistency
+ * between the page cache and extents that just changed.  We have to
+ * call with full aligned page offsets or it thinks that it should leave
+ * behind a zeroed partial page.
+ */
+static void truncate_inode_pages_extent(struct inode *inode, u64 start, u64 len)
+{
+	truncate_inode_pages_range(&inode->i_data,
+				   start << SCOUTFS_BLOCK_SM_SHIFT,
+				   ((start + len) << SCOUTFS_BLOCK_SM_SHIFT) - 1);
+}
+
+/*
+ * Move extents from one file to another.  The behaviour is more fully
+ * explained above the move_blocks ioctl argument structure definition.
+ *
+ * The caller has processed the ioctl args and performed the most basic
+ * inode checks, but we perform more detailed inode checks once we have
+ * the inode lock and refreshed inodes.  Our job is to safely lock the
+ * two files and move the extents.
+ *
+ * Returns 0 once all extents in the region have been moved, otherwise
+ * a negative errno.  Extents moved before an error are left in the
+ * destination.
+ */
+#define MOVE_DATA_EXTENTS_PER_HOLD 16
+int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
+			     u64 byte_len, struct inode *to, u64 to_off)
+{
+	struct scoutfs_inode_info *from_si = SCOUTFS_I(from);
+	struct scoutfs_inode_info *to_si = SCOUTFS_I(to);
+	struct super_block *sb = from->i_sb;
+	struct scoutfs_lock *from_lock = NULL;
+	struct scoutfs_lock *to_lock = NULL;
+	struct data_ext_args from_args;
+	struct data_ext_args to_args;
+	struct scoutfs_extent ext;
+	LIST_HEAD(locks);
+	bool done = false;
+	loff_t from_size;
+	loff_t to_size;
+	u64 from_offline;
+	u64 to_offline;
+	u64 from_blocks;
+	u64 from_start;
+	u64 to_start;
+	u64 from_iblock;
+	u64 to_iblock;
+	u64 count;
+	u64 junk;
+	u64 seq;
+	u64 map;
+	u64 len;
+	int ret;
+	int err;
+	int i;
+
+	lock_two_nondirectories(from, to);
+
+	ret = scoutfs_lock_inodes(sb, SCOUTFS_LOCK_WRITE,
+				  SCOUTFS_LKF_REFRESH_INODE, from, &from_lock,
+				  to, &to_lock, NULL, NULL, NULL, NULL);
+	if (ret)
+		goto out;
+
+	if ((from_off & SCOUTFS_BLOCK_SM_MASK) ||
+	    (to_off & SCOUTFS_BLOCK_SM_MASK) ||
+	    ((byte_len & SCOUTFS_BLOCK_SM_MASK) &&
+	     (from_off + byte_len != i_size_read(from)))) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	from_iblock = from_off >> SCOUTFS_BLOCK_SM_SHIFT;
+	count = (byte_len + SCOUTFS_BLOCK_SM_MASK) >> SCOUTFS_BLOCK_SM_SHIFT;
+	to_iblock = to_off >> SCOUTFS_BLOCK_SM_SHIFT;
+
+	if (S_ISDIR(from->i_mode) || S_ISDIR(to->i_mode)) {
+		ret = -EISDIR;
+		goto out;
+	}
+
+	if (!S_ISREG(from->i_mode) || !S_ISREG(to->i_mode)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = inode_permission(from, MAY_WRITE) ?:
+	      inode_permission(to, MAY_WRITE);
+	if (ret < 0)
+		goto out;
+
+	/* can't stage once data_version changes */
+	scoutfs_inode_get_onoff(from, &junk, &from_offline);
+	scoutfs_inode_get_onoff(to, &junk, &to_offline);
+	if (from_offline || to_offline) {
+		ret = -ENODATA;
+		goto out;
+	}
+
+	from_args = (struct data_ext_args) {
+		.ino = scoutfs_ino(from),
+		.inode = from,
+		.lock = from_lock,
+	};
+
+	to_args = (struct data_ext_args) {
+		.ino = scoutfs_ino(to),
+		.inode = to,
+		.lock = to_lock,
+	};
+
+	inode_dio_wait(from);
+	inode_dio_wait(to);
+
+	ret = filemap_write_and_wait_range(&from->i_data, from_off,
+					   from_off + byte_len - 1);
+	if (ret < 0)
+		goto out;
+
+	for (;;) {
+		ret = scoutfs_inode_index_start(sb, &seq) ?:
+		      scoutfs_inode_index_prepare(sb, &locks, from, true) ?:
+		      scoutfs_inode_index_prepare(sb, &locks, to, true) ?:
+		      scoutfs_inode_index_try_lock_hold(sb, &locks, seq,
+							SIC_EXACT(1, 1));
+		if (ret > 0)
+			continue;
+		if (ret < 0)
+			goto out;
+
+		ret = scoutfs_dirty_inode_item(from, from_lock) ?:
+		      scoutfs_dirty_inode_item(to, to_lock);
+		if (ret < 0) {
+			/* the hold and index locks were acquired above */
+			scoutfs_release_trans(sb);
+			scoutfs_inode_index_unlock(sb, &locks);
+			goto out;
+		}
+
+		down_write_two(&from_si->extent_sem, &to_si->extent_sem);
+
+		/* arbitrarily limit the number of extents per trans hold */
+		for (i = 0; i < MOVE_DATA_EXTENTS_PER_HOLD; i++) {
+			/* find the next extent to move */
+			ret = scoutfs_ext_next(sb, &data_ext_ops, &from_args,
+					       from_iblock, 1, &ext);
+			if (ret < 0) {
+				if (ret == -ENOENT) {
+					done = true;
+					ret = 0;
+				}
+				break;
+			}
+
+			/*
+			 * Extents are indexed in blocks but i_size is in
+			 * bytes.  Convert i_size to a block count which
+			 * includes any final partial block.
+			 */
+			from_blocks = ((u64)i_size_read(from) +
+				       SCOUTFS_BLOCK_SM_MASK) >>
+				      SCOUTFS_BLOCK_SM_SHIFT;
+			from_start = max(ext.start, from_iblock);
+
+			/* only move extents within count and i_size */
+			if (from_start >= from_iblock + count ||
+			    from_start >= from_blocks) {
+				done = true;
+				ret = 0;
+				break;
+			}
+
+			map = ext.map + (from_start - ext.start);
+			len = min3(from_iblock + count, from_blocks,
+				   ext.start + ext.len) - from_start;
+
+			to_start = to_iblock + (from_start - from_iblock);
+
+			/* insert the new, fails if it overlaps */
+			ret = scoutfs_ext_insert(sb, &data_ext_ops, &to_args,
+						 to_start, len,
+						 map, ext.flags);
+			if (ret < 0)
+				break;
+
+			/* remove the old, possibly splitting */
+			ret = scoutfs_ext_set(sb, &data_ext_ops, &from_args,
+					      from_start, len, 0, 0);
+			if (ret < 0) {
+				/* remove inserted new on err */
+				err = scoutfs_ext_remove(sb, &data_ext_ops,
+							 &to_args, to_start,
+							 len);
+				BUG_ON(err); /* XXX inconsistent */
+				break;
+			}
+
+			trace_scoutfs_data_move_blocks(sb, scoutfs_ino(from),
+						       from_start, len, map,
+						       ext.flags,
+						       scoutfs_ino(to),
+						       to_start);
+
+			/* moved extent might extend i_size */
+			to_size = (to_start + len) << SCOUTFS_BLOCK_SM_SHIFT;
+			if (to_size > i_size_read(to)) {
+				/* while maintaining final partial */
+				from_size = (from_start + len) <<
+					    SCOUTFS_BLOCK_SM_SHIFT;
+				if (from_size > i_size_read(from))
+					to_size -= from_size -
+						   i_size_read(from);
+				i_size_write(to, to_size);
+			}
+		}
+
+
+		up_write(&from_si->extent_sem);
+		up_write(&to_si->extent_sem);
+
+		from->i_ctime = from->i_mtime =
+		to->i_ctime = to->i_mtime = CURRENT_TIME;
+		scoutfs_inode_inc_data_version(from);
+		scoutfs_inode_inc_data_version(to);
+		scoutfs_inode_set_data_seq(from);
+		scoutfs_inode_set_data_seq(to);
+
+		scoutfs_update_inode_item(from, from_lock, &locks);
+		scoutfs_update_inode_item(to, to_lock, &locks);
+		scoutfs_release_trans(sb);
+		scoutfs_inode_index_unlock(sb, &locks);
+
+		if (ret < 0 || done)
+			break;
+	}
+
+	/* remove any cached pages from old extents */
+	truncate_inode_pages_extent(from, from_iblock, count);
+	truncate_inode_pages_extent(to, to_iblock, count);
+
+out:
+	scoutfs_unlock(sb, from_lock, SCOUTFS_LOCK_WRITE);
+	scoutfs_unlock(sb, to_lock, SCOUTFS_LOCK_WRITE);
+
+	unlock_two_nondirectories(from, to);
+
+	return ret;
+}
+
 /*
  * This copies to userspace :/
  */
diff --git a/kmod/src/data.h b/kmod/src/data.h
index 09a64fe7..4668eca3 100644
--- a/kmod/src/data.h
+++ b/kmod/src/data.h
@@ -58,6 +58,8 @@ int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len);
 int scoutfs_data_init_offline_extent(struct inode *inode, u64 size,
 				     struct scoutfs_lock *lock);
+int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
+			     u64 byte_len, struct inode *to, u64 to_off);
 
 int scoutfs_data_wait_check(struct inode *inode, loff_t pos, loff_t len,
 			    u8 sef, u8 op, struct scoutfs_data_wait *ow,
diff --git a/kmod/src/ioctl.c b/kmod/src/ioctl.c
index 3fcaae34..96d02787 100644
--- a/kmod/src/ioctl.c
+++ b/kmod/src/ioctl.c
@@ -12,6 +12,7 @@
  */
 #include
 #include
+#include
 #include
 #include
 #include
@@ -937,6 +938,60 @@ static long scoutfs_ioc_alloc_detail(struct file *file, unsigned long arg)
 		args.copied;
 }
 
+/*
+ * Copy in and sanity check the move_blocks arguments, resolve from_fd
+ * to the source inode, and hand off to scoutfs_data_move_blocks().  The
+ * file the ioctl is called on is the destination.  A zero length is a
+ * successful no-op.
+ */
+static long scoutfs_ioc_move_blocks(struct file *file, unsigned long arg)
+{
+	struct inode *to = file_inode(file);
+	struct super_block *sb = to->i_sb;
+	struct scoutfs_ioctl_move_blocks __user *umb = (void __user *)arg;
+	struct scoutfs_ioctl_move_blocks mb;
+	struct file *from_file;
+	struct inode *from;
+	int ret;
+
+	if (copy_from_user(&mb, umb, sizeof(mb)))
+		return -EFAULT;
+
+	if (mb.len == 0)
+		return 0;
+
+	if (mb.from_off + mb.len < mb.from_off ||
+	    mb.to_off + mb.len < mb.to_off)
+		return -EOVERFLOW;
+
+	from_file = fget(mb.from_fd);
+	if (!from_file)
+		return -EBADF;
+	from = file_inode(from_file);
+
+	if (from == to) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (from->i_sb != sb) {
+		ret = -EXDEV;
+		goto out;
+	}
+
+	ret = mnt_want_write_file(file);
+	if (ret < 0)
+		goto out;
+
+	ret = scoutfs_data_move_blocks(from, mb.from_off, mb.len,
+				       to, mb.to_off);
+	mnt_drop_write_file(file);
+out:
+	fput(from_file);
+
+	return ret;
+}
+
 long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	switch (cmd) {
@@ -964,6 +1019,8 @@ long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 		return scoutfs_ioc_data_wait_err(file, arg);
 	case SCOUTFS_IOC_ALLOC_DETAIL:
 		return scoutfs_ioc_alloc_detail(file, arg);
+	case SCOUTFS_IOC_MOVE_BLOCKS:
+		return scoutfs_ioc_move_blocks(file, arg);
 	}
 
 	return -ENOTTY;
diff --git a/kmod/src/ioctl.h b/kmod/src/ioctl.h
index fdb7fb62..3eaa36f0 100644
--- a/kmod/src/ioctl.h
+++ b/kmod/src/ioctl.h
@@ -413,4 +413,60 @@ struct scoutfs_ioctl_alloc_detail_entry {
 #define SCOUTFS_IOC_ALLOC_DETAIL _IOR(SCOUTFS_IOCTL_MAGIC, 12, \
 				     struct scoutfs_ioctl_alloc_detail)
 
+/*
+ * Move extents from one regular file to another at a different offset,
+ * on the same file system.
+ *
+ * from_fd specifies the source file and the ioctl is called on the
+ * destination file.  Both files must have write access.  from_off
+ * specifies the byte offset in the source, to_off is the byte offset in
+ * the destination, and len is the number of bytes in the region to
+ * move.  All of the offsets and lengths must be in multiples of 4KB,
+ * except in the case where the from_off + len ends at the i_size of the
+ * source file.
+ *
+ * This interface only moves extents which are block granular, it does
+ * not perform RMW of sub-block byte extents and it does not overwrite
+ * existing extents in the destination.  It will split extents in the
+ * source.
+ *
+ * Only extents within i_size on the source are moved.  The destination
+ * i_size will be updated if extents are moved beyond its current
+ * i_size.  The i_size update will maintain final partial blocks in the
+ * source.
+ *
+ * It will return an error if either of the files has offline extents.
+ * It will return 0 when all of the extents in the source region have
+ * been moved to the destination.  Moving extents updates the ctime,
+ * mtime, meta_seq, data_seq, and data_version fields of both the source
+ * and destination inodes.  If an error is returned then partial
+ * progress may have been made and inode fields may have been updated.
+ *
+ * Errors specific to this interface include:
+ *
+ * EINVAL: from_off, len, or to_off aren't a multiple of 4KB; the source
+ *	and destination files are the same inode; either the source or
+ *	destination is not a regular file; the destination file has
+ *	an existing overlapping extent.
+ * EOVERFLOW: either from_off + len or to_off + len exceeded 64bits.
+ * EBADF: from_fd isn't a valid open file descriptor.
+ * EXDEV: the source and destination files are in different filesystems.
+ * EISDIR: either the source or destination is a directory.
+ * ENODATA: either the source or destination file has offline extents.
+ */
+struct scoutfs_ioctl_move_blocks {
+	__u64 from_fd;
+	__u64 from_off;
+	__u64 len;
+	__u64 to_off;
+};
+
+/*
+ * NOTE(review): the kernel only reads this struct from userspace, so
+ * _IOW would be the conventional direction encoding.  _IOR is kept
+ * here because changing it would change the ioctl command number.
+ */
+#define SCOUTFS_IOC_MOVE_BLOCKS _IOR(SCOUTFS_IOCTL_MAGIC, 13, \
+				     struct scoutfs_ioctl_move_blocks)
+
 #endif
diff --git a/kmod/src/scoutfs_trace.h b/kmod/src/scoutfs_trace.h
index a2e1cae8..5262126b 100644
--- a/kmod/src/scoutfs_trace.h
+++ b/kmod/src/scoutfs_trace.h
@@ -169,6 +169,40 @@ TRACE_EVENT(scoutfs_data_fallocate,
 		  __entry->len, __entry->ret)
 );
 
+TRACE_EVENT(scoutfs_data_move_blocks,
+	TP_PROTO(struct super_block *sb, u64 from_ino, u64 from_start, u64 len,
+		 u64 map, u8 flags, u64 to_ino, u64 to_start),
+
+	TP_ARGS(sb, from_ino, from_start, len, map, flags, to_ino, to_start),
+
+	TP_STRUCT__entry(
+		SCSB_TRACE_FIELDS
+		__field(__u64, from_ino)
+		__field(__u64, from_start)
+		__field(__u64, len)
+		__field(__u64, map)
+		__field(__u8, flags)
+		__field(__u64, to_ino)
+		__field(__u64, to_start)
+	),
+
+	TP_fast_assign(
+		SCSB_TRACE_ASSIGN(sb);
+		__entry->from_ino = from_ino;
+		__entry->from_start = from_start;
+		__entry->len = len;
+		__entry->map = map;
+		__entry->flags = flags;
+		__entry->to_ino = to_ino;
+		__entry->to_start = to_start;
+	),
+
+	TP_printk(SCSBF" from_ino %llu from_start %llu len %llu map %llu flags 0x%x to_ino %llu to_start %llu\n",
+		  SCSB_TRACE_ARGS, __entry->from_ino, __entry->from_start,
+		  __entry->len, __entry->map, __entry->flags, __entry->to_ino,
+		  __entry->to_start)
+);
+
 TRACE_EVENT(scoutfs_data_fiemap,
 	TP_PROTO(struct super_block *sb, __u64 start, __u64 len, int ret),
 