Add move_blocks ioctl

Add a relatively constrained ioctl that moves extents between regular
files.  This is intended to be used by tasks which combine many existing
files into a much larger file without reading and writing all the file
contents.

Signed-off-by: Zach Brown <zab@versity.com>
This commit is contained in:
Zach Brown
2021-01-06 10:40:34 -08:00
parent 4da3d47601
commit 3139d3ea68
5 changed files with 374 additions and 0 deletions

View File

@@ -39,6 +39,7 @@
#include "msg.h"
#include "count.h"
#include "ext.h"
#include "util.h"
/*
* We want to amortize work done after dirtying the shared transaction
@@ -1103,6 +1104,241 @@ out:
return ret;
}
/*
 * Drop the page cache pages that back a block extent whose mapping just
 * changed, so cached contents can't disagree with the new extents.
 *
 * @start and @len are in small-block units and are converted to a byte
 * range here.  truncate_inode_pages_range() has to be handed fully
 * aligned page offsets, otherwise it assumes it should leave behind a
 * zeroed partial page.
 */
static void truncate_inode_pages_extent(struct inode *inode, u64 start, u64 len)
{
	u64 first_byte = start << SCOUTFS_BLOCK_SM_SHIFT;
	u64 last_byte = ((start + len) << SCOUTFS_BLOCK_SM_SHIFT) - 1;

	truncate_inode_pages_range(&inode->i_data, first_byte, last_byte);
}
/*
 * Move extents from one file to another.  The behaviour is more fully
 * explained above the move_blocks ioctl argument structure definition.
 *
 * The caller has processed the ioctl args and performed the most basic
 * inode checks, but we perform more detailed inode checks once we have
 * the inode lock and refreshed inodes.  Our job is to safely lock the
 * two files and move the extents.
 *
 * @from, @from_off: source inode and byte offset of the region to move
 * @byte_len:        length of the region in bytes
 * @to, @to_off:     destination inode and byte offset
 *
 * Offsets are tracked in bytes at the interface and in small blocks
 * (SCOUTFS_BLOCK_SM_SHIFT) internally; i_size is converted to blocks
 * once so that extent positions are never compared against byte counts.
 *
 * Returns 0 once every extent in the source region (and within the
 * source i_size) has been moved, or a negative errno.  Extents are
 * moved in batches of MOVE_DATA_EXTENTS_PER_HOLD per transaction hold,
 * so an error can leave earlier batches already moved.
 */
#define MOVE_DATA_EXTENTS_PER_HOLD 16
int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
			     u64 byte_len, struct inode *to, u64 to_off)
{
	struct scoutfs_inode_info *from_si = SCOUTFS_I(from);
	struct scoutfs_inode_info *to_si = SCOUTFS_I(to);
	struct super_block *sb = from->i_sb;
	struct scoutfs_lock *from_lock = NULL;
	struct scoutfs_lock *to_lock = NULL;
	struct data_ext_args from_args;
	struct data_ext_args to_args;
	struct scoutfs_extent ext;
	LIST_HEAD(locks);
	bool done = false;
	loff_t from_size;
	loff_t to_size;
	u64 from_offline;
	u64 to_offline;
	u64 from_blocks;
	u64 from_start;
	u64 to_start;
	u64 from_iblock;
	u64 to_iblock;
	u64 count;
	u64 junk;
	u64 seq;
	u64 map;
	u64 len;
	int ret;
	int err;
	int i;

	lock_two_nondirectories(from, to);

	ret = scoutfs_lock_inodes(sb, SCOUTFS_LOCK_WRITE,
				  SCOUTFS_LKF_REFRESH_INODE, from, &from_lock,
				  to, &to_lock, NULL, NULL, NULL, NULL);
	if (ret)
		goto out;

	/* only the length may be unaligned, and only if it ends at i_size */
	if ((from_off & SCOUTFS_BLOCK_SM_MASK) ||
	    (to_off & SCOUTFS_BLOCK_SM_MASK) ||
	    ((byte_len & SCOUTFS_BLOCK_SM_MASK) &&
	     (from_off + byte_len != i_size_read(from)))) {
		ret = -EINVAL;
		goto out;
	}

	from_iblock = from_off >> SCOUTFS_BLOCK_SM_SHIFT;
	count = (byte_len + SCOUTFS_BLOCK_SM_MASK) >> SCOUTFS_BLOCK_SM_SHIFT;
	to_iblock = to_off >> SCOUTFS_BLOCK_SM_SHIFT;

	/*
	 * Number of blocks covered by the source i_size, rounding up a
	 * final partial block.  Nothing in this function changes the
	 * source i_size so it is sampled once after the inode refresh.
	 */
	from_blocks = ((u64)i_size_read(from) + SCOUTFS_BLOCK_SM_MASK) >>
		      SCOUTFS_BLOCK_SM_SHIFT;

	if (S_ISDIR(from->i_mode) || S_ISDIR(to->i_mode)) {
		ret = -EISDIR;
		goto out;
	}

	if (!S_ISREG(from->i_mode) || !S_ISREG(to->i_mode)) {
		ret = -EINVAL;
		goto out;
	}

	ret = inode_permission(from, MAY_WRITE) ?:
	      inode_permission(to, MAY_WRITE);
	if (ret < 0)
		goto out;

	/* can't stage once data_version changes */
	scoutfs_inode_get_onoff(from, &junk, &from_offline);
	scoutfs_inode_get_onoff(to, &junk, &to_offline);
	if (from_offline || to_offline) {
		ret = -ENODATA;
		goto out;
	}

	from_args = (struct data_ext_args) {
		.ino = scoutfs_ino(from),
		.inode = from,
		.lock = from_lock,
	};

	to_args = (struct data_ext_args) {
		.ino = scoutfs_ino(to),
		.inode = to,
		.lock = to_lock,
	};

	inode_dio_wait(from);
	inode_dio_wait(to);

	ret = filemap_write_and_wait_range(&from->i_data, from_off,
					   from_off + byte_len - 1);
	if (ret < 0)
		goto out;

	for (;;) {
		ret = scoutfs_inode_index_start(sb, &seq) ?:
		      scoutfs_inode_index_prepare(sb, &locks, from, true) ?:
		      scoutfs_inode_index_prepare(sb, &locks, to, true) ?:
		      scoutfs_inode_index_try_lock_hold(sb, &locks, seq,
							SIC_EXACT(1, 1));
		if (ret > 0)
			continue;
		if (ret < 0)
			goto out;

		ret = scoutfs_dirty_inode_item(from, from_lock) ?:
		      scoutfs_dirty_inode_item(to, to_lock);
		if (ret < 0) {
			/*
			 * Every other exit from the held section releases
			 * the transaction and index locks at the bottom of
			 * the loop; do the same here so they aren't leaked.
			 */
			scoutfs_release_trans(sb);
			scoutfs_inode_index_unlock(sb, &locks);
			goto out;
		}

		down_write_two(&from_si->extent_sem, &to_si->extent_sem);

		/* arbitrarily limit the number of extents per trans hold */
		for (i = 0; i < MOVE_DATA_EXTENTS_PER_HOLD; i++) {

			/*
			 * Find the next extent to move.  Moved extents are
			 * removed from the source so searching from
			 * from_iblock again finds the following one.
			 */
			ret = scoutfs_ext_next(sb, &data_ext_ops, &from_args,
					       from_iblock, 1, &ext);
			if (ret < 0) {
				if (ret == -ENOENT) {
					done = true;
					ret = 0;
				}
				break;
			}

			from_start = max(ext.start, from_iblock);

			/*
			 * Only move extents within count and i_size, all
			 * compared in blocks.  Testing from_start against
			 * i_size also covers a region that starts past
			 * i_size, which would otherwise underflow len.
			 */
			if (ext.start >= from_iblock + count ||
			    from_start >= from_blocks) {
				done = true;
				ret = 0;
				break;
			}

			map = ext.map + (from_start - ext.start);
			len = min3(from_iblock + count,
				   from_blocks,
				   ext.start + ext.len) - from_start;

			to_start = to_iblock + (from_start - from_iblock);

			/* insert the new, fails if it overlaps */
			ret = scoutfs_ext_insert(sb, &data_ext_ops, &to_args,
						 to_start, len,
						 map, ext.flags);
			if (ret < 0)
				break;

			/* remove the old, possibly splitting */
			ret = scoutfs_ext_set(sb, &data_ext_ops, &from_args,
					      from_start, len, 0, 0);
			if (ret < 0) {
				/* remove inserted new on err */
				err = scoutfs_ext_remove(sb, &data_ext_ops,
							 &to_args, to_start,
							 len);
				BUG_ON(err); /* XXX inconsistent */
				break;
			}

			trace_scoutfs_data_move_blocks(sb, scoutfs_ino(from),
						       from_start, len, map,
						       ext.flags,
						       scoutfs_ino(to),
						       to_start);

			/* moved extent might extend i_size */
			to_size = (to_start + len) << SCOUTFS_BLOCK_SM_SHIFT;
			if (to_size > i_size_read(to)) {
				/* while maintaining final partial */
				from_size = (from_start + len) <<
					    SCOUTFS_BLOCK_SM_SHIFT;
				if (from_size > i_size_read(from))
					to_size -= from_size -
						   i_size_read(from);
				i_size_write(to, to_size);
			}
		}

		up_write(&from_si->extent_sem);
		up_write(&to_si->extent_sem);

		from->i_ctime = from->i_mtime =
			to->i_ctime = to->i_mtime = CURRENT_TIME;
		scoutfs_inode_inc_data_version(from);
		scoutfs_inode_inc_data_version(to);
		scoutfs_inode_set_data_seq(from);
		scoutfs_inode_set_data_seq(to);

		scoutfs_update_inode_item(from, from_lock, &locks);
		scoutfs_update_inode_item(to, to_lock, &locks);
		scoutfs_release_trans(sb);
		scoutfs_inode_index_unlock(sb, &locks);

		if (ret < 0 || done)
			break;
	}

	/* remove any cached pages from old extents */
	truncate_inode_pages_extent(from, from_iblock, count);
	truncate_inode_pages_extent(to, to_iblock, count);

out:
	scoutfs_unlock(sb, from_lock, SCOUTFS_LOCK_WRITE);
	scoutfs_unlock(sb, to_lock, SCOUTFS_LOCK_WRITE);

	unlock_two_nondirectories(from, to);

	return ret;
}
/*
* This copies to userspace :/
*/

View File

@@ -58,6 +58,8 @@ int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len);
int scoutfs_data_init_offline_extent(struct inode *inode, u64 size,
struct scoutfs_lock *lock);
int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
u64 byte_len, struct inode *to, u64 to_off);
int scoutfs_data_wait_check(struct inode *inode, loff_t pos, loff_t len,
u8 sef, u8 op, struct scoutfs_data_wait *ow,

View File

@@ -12,6 +12,7 @@
*/
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/uaccess.h>
#include <linux/compiler.h>
#include <linux/uio.h>
@@ -937,6 +938,54 @@ static long scoutfs_ioc_alloc_detail(struct file *file, unsigned long arg)
args.copied;
}
/*
 * Handle SCOUTFS_IOC_MOVE_BLOCKS.  The ioctl is issued on the
 * destination file; the source file is named by from_fd in the
 * argument struct copied from userspace.
 *
 * Only argument-level checks are made here: a zero length succeeds
 * immediately, offset + len wrapping returns -EOVERFLOW, an unusable
 * from_fd returns -EBADF, moving a file onto itself returns -EINVAL and
 * files on different super blocks return -EXDEV.  Everything else is
 * checked by scoutfs_data_move_blocks(), whose return value is passed
 * back to the caller.
 */
static long scoutfs_ioc_move_blocks(struct file *file, unsigned long arg)
{
	struct inode *to = file_inode(file);
	struct super_block *sb = to->i_sb;
	struct scoutfs_ioctl_move_blocks __user *umb = (void __user *)arg;
	struct scoutfs_ioctl_move_blocks mb;
	struct file *from_file;
	struct inode *from;
	int ret;

	if (copy_from_user(&mb, umb, sizeof(mb)))
		return -EFAULT;

	if (mb.len == 0)
		return 0;

	if (mb.from_off + mb.len < mb.from_off ||
	    mb.to_off + mb.len < mb.to_off)
		return -EOVERFLOW;

	/*
	 * from_fd is 64 bits wide in the ioctl struct but fget() takes an
	 * unsigned int.  Reject values that don't fit rather than letting
	 * the truncated value alias some unrelated low descriptor.
	 */
	if (mb.from_fd != (unsigned int)mb.from_fd)
		return -EBADF;

	from_file = fget(mb.from_fd);
	if (!from_file)
		return -EBADF;
	from = file_inode(from_file);

	if (from == to) {
		ret = -EINVAL;
		goto out;
	}

	if (from->i_sb != sb) {
		ret = -EXDEV;
		goto out;
	}

	ret = mnt_want_write_file(file);
	if (ret < 0)
		goto out;

	ret = scoutfs_data_move_blocks(from, mb.from_off, mb.len,
				       to, mb.to_off);
	mnt_drop_write_file(file);
out:
	/* drop the reference fget() took on the source file */
	fput(from_file);

	return ret;
}
long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
switch (cmd) {
@@ -964,6 +1013,8 @@ long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
return scoutfs_ioc_data_wait_err(file, arg);
case SCOUTFS_IOC_ALLOC_DETAIL:
return scoutfs_ioc_alloc_detail(file, arg);
case SCOUTFS_IOC_MOVE_BLOCKS:
return scoutfs_ioc_move_blocks(file, arg);
}
return -ENOTTY;

View File

@@ -413,4 +413,55 @@ struct scoutfs_ioctl_alloc_detail_entry {
#define SCOUTFS_IOC_ALLOC_DETAIL _IOR(SCOUTFS_IOCTL_MAGIC, 12, \
struct scoutfs_ioctl_alloc_detail)
/*
 * Move extents from one regular file to another at a different offset,
 * on the same file system.
 *
 * from_fd specifies the source file and the ioctl is called on the
 * destination file.  The caller must have write permission on both
 * files.  from_off specifies the byte offset in the source, to_off is
 * the byte offset in the destination, and len is the number of bytes in
 * the region to move.  from_off and to_off must be multiples of 4KB.
 * len must also be a multiple of 4KB, except in the case where
 * from_off + len ends exactly at the i_size of the source file.  A len
 * of 0 succeeds immediately without moving anything.
 *
 * This interface only moves extents which are block granular, it does
 * not perform RMW of sub-block byte extents and it does not overwrite
 * existing extents in the destination.  It will split extents in the
 * source.
 *
 * Only extents within i_size on the source are moved.  The destination
 * i_size will be updated if extents are moved beyond its current
 * i_size.  The i_size update will maintain final partial blocks in the
 * source.
 *
 * It will return an error if either of the files has offline extents.
 * It will return 0 when all of the extents in the source region have
 * been moved to the destination.  Moving extents updates the ctime,
 * mtime, meta_seq, data_seq, and data_version fields of both the source
 * and destination inodes.  If an error is returned then partial
 * progress may have been made and inode fields may have been updated.
 *
 * Errors specific to this interface include:
 *
 * EINVAL: from_off or to_off aren't a multiple of 4KB; len isn't a
 *         multiple of 4KB and from_off + len doesn't end at the source
 *         i_size; the source and destination files are the same inode;
 *         either the source or destination is not a regular file; the
 *         destination file has an existing overlapping extent.
 * EOVERFLOW: either from_off + len or to_off + len exceeded 64bits.
 * EBADF: from_fd isn't a valid open file descriptor.
 * EXDEV: the source and destination files are in different filesystems.
 * EISDIR: either the source or destination is a directory.
 * ENODATA: either the source or destination file has offline extents.
 */
struct scoutfs_ioctl_move_blocks {
__u64 from_fd;
__u64 from_off;
__u64 len;
__u64 to_off;
};
/* issued on the destination file; the struct is read from userspace */
#define SCOUTFS_IOC_MOVE_BLOCKS _IOR(SCOUTFS_IOCTL_MAGIC, 13, \
struct scoutfs_ioctl_move_blocks)
#endif

View File

@@ -169,6 +169,40 @@ TRACE_EVENT(scoutfs_data_fallocate,
__entry->len, __entry->ret)
);
/*
 * Emitted once for each extent that move_blocks moves: the source inode
 * and starting block, the extent's length, mapping and flags, and the
 * destination inode and starting block.
 *
 * The format string deliberately has no trailing newline; the trace
 * output code terminates each event line itself, so adding one here
 * produces a blank line after every event.
 */
TRACE_EVENT(scoutfs_data_move_blocks,
	TP_PROTO(struct super_block *sb, u64 from_ino, u64 from_start, u64 len,
		 u64 map, u8 flags, u64 to_ino, u64 to_start),

	TP_ARGS(sb, from_ino, from_start, len, map, flags, to_ino, to_start),

	TP_STRUCT__entry(
		SCSB_TRACE_FIELDS
		__field(__u64, from_ino)
		__field(__u64, from_start)
		__field(__u64, len)
		__field(__u64, map)
		__field(__u8, flags)
		__field(__u64, to_ino)
		__field(__u64, to_start)
	),

	TP_fast_assign(
		SCSB_TRACE_ASSIGN(sb);
		__entry->from_ino = from_ino;
		__entry->from_start = from_start;
		__entry->len = len;
		__entry->map = map;
		__entry->flags = flags;
		__entry->to_ino = to_ino;
		__entry->to_start = to_start;
	),

	TP_printk(SCSBF" from_ino %llu from_start %llu len %llu map %llu flags 0x%x to_ino %llu to_start %llu",
		  SCSB_TRACE_ARGS, __entry->from_ino, __entry->from_start,
		  __entry->len, __entry->map, __entry->flags, __entry->to_ino,
		  __entry->to_start)
);
TRACE_EVENT(scoutfs_data_fiemap,
TP_PROTO(struct super_block *sb, __u64 start, __u64 len, int ret),