mirror of
https://github.com/versity/scoutfs.git
synced 2026-01-06 04:04:59 +00:00
Add move_blocks ioctl
Add a relatively constrained ioctl that moves extents between regular files. This is intended to be used by tasks which combine many existing files into a much larger file without reading and writing all the file contents. Signed-off-by: Zach Brown <zab@versity.com>
This commit is contained in:
236
kmod/src/data.c
236
kmod/src/data.c
@@ -39,6 +39,7 @@
|
||||
#include "msg.h"
|
||||
#include "count.h"
|
||||
#include "ext.h"
|
||||
#include "util.h"
|
||||
|
||||
/*
|
||||
* We want to amortize work done after dirtying the shared transaction
|
||||
@@ -1103,6 +1104,241 @@ out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* We're using truncate_inode_pages_range to maintain consistency
|
||||
* between the page cache and extents that just changed. We have to
|
||||
* call with full aligned page offsets or it thinks that it should leave
|
||||
* behind a zeroed partial page.
|
||||
*/
|
||||
static void truncate_inode_pages_extent(struct inode *inode, u64 start, u64 len)
|
||||
{
|
||||
truncate_inode_pages_range(&inode->i_data,
|
||||
start << SCOUTFS_BLOCK_SM_SHIFT,
|
||||
((start + len) << SCOUTFS_BLOCK_SM_SHIFT) - 1);
|
||||
}
|
||||
|
||||
/*
|
||||
* Move extents from one file to another. The behaviour is more fully
|
||||
* explained above the move_blocks ioctl argument structure definition.
|
||||
*
|
||||
* The caller has processed the ioctl args and performed the most basic
|
||||
* inode checks, but we perform more detailed inode checks once we have
|
||||
* the inode lock and refreshed inodes. Our job is to safely lock the
|
||||
* two files and move the extents.
|
||||
*/
|
||||
#define MOVE_DATA_EXTENTS_PER_HOLD 16
|
||||
int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
|
||||
u64 byte_len, struct inode *to, u64 to_off)
|
||||
{
|
||||
struct scoutfs_inode_info *from_si = SCOUTFS_I(from);
|
||||
struct scoutfs_inode_info *to_si = SCOUTFS_I(to);
|
||||
struct super_block *sb = from->i_sb;
|
||||
struct scoutfs_lock *from_lock = NULL;
|
||||
struct scoutfs_lock *to_lock = NULL;
|
||||
struct data_ext_args from_args;
|
||||
struct data_ext_args to_args;
|
||||
struct scoutfs_extent ext;
|
||||
LIST_HEAD(locks);
|
||||
bool done = false;
|
||||
loff_t from_size;
|
||||
loff_t to_size;
|
||||
u64 from_offline;
|
||||
u64 to_offline;
|
||||
u64 from_start;
|
||||
u64 to_start;
|
||||
u64 from_iblock;
|
||||
u64 to_iblock;
|
||||
u64 count;
|
||||
u64 junk;
|
||||
u64 seq;
|
||||
u64 map;
|
||||
u64 len;
|
||||
int ret;
|
||||
int err;
|
||||
int i;
|
||||
|
||||
lock_two_nondirectories(from, to);
|
||||
|
||||
ret = scoutfs_lock_inodes(sb, SCOUTFS_LOCK_WRITE,
|
||||
SCOUTFS_LKF_REFRESH_INODE, from, &from_lock,
|
||||
to, &to_lock, NULL, NULL, NULL, NULL);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
if ((from_off & SCOUTFS_BLOCK_SM_MASK) ||
|
||||
(to_off & SCOUTFS_BLOCK_SM_MASK) ||
|
||||
((byte_len & SCOUTFS_BLOCK_SM_MASK) &&
|
||||
(from_off + byte_len != i_size_read(from)))) {
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
from_iblock = from_off >> SCOUTFS_BLOCK_SM_SHIFT;
|
||||
count = (byte_len + SCOUTFS_BLOCK_SM_MASK) >> SCOUTFS_BLOCK_SM_SHIFT;
|
||||
to_iblock = to_off >> SCOUTFS_BLOCK_SM_SHIFT;
|
||||
|
||||
if (S_ISDIR(from->i_mode) || S_ISDIR(to->i_mode)) {
|
||||
ret = -EISDIR;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (!S_ISREG(from->i_mode) || !S_ISREG(to->i_mode)) {
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = inode_permission(from, MAY_WRITE) ?:
|
||||
inode_permission(to, MAY_WRITE);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
/* can't stage once data_version changes */
|
||||
scoutfs_inode_get_onoff(from, &junk, &from_offline);
|
||||
scoutfs_inode_get_onoff(to, &junk, &to_offline);
|
||||
if (from_offline || to_offline) {
|
||||
ret = -ENODATA;
|
||||
goto out;
|
||||
}
|
||||
|
||||
from_args = (struct data_ext_args) {
|
||||
.ino = scoutfs_ino(from),
|
||||
.inode = from,
|
||||
.lock = from_lock,
|
||||
};
|
||||
|
||||
to_args = (struct data_ext_args) {
|
||||
.ino = scoutfs_ino(to),
|
||||
.inode = to,
|
||||
.lock = to_lock,
|
||||
};
|
||||
|
||||
inode_dio_wait(from);
|
||||
inode_dio_wait(to);
|
||||
|
||||
ret = filemap_write_and_wait_range(&from->i_data, from_off,
|
||||
from_off + byte_len - 1);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
for (;;) {
|
||||
ret = scoutfs_inode_index_start(sb, &seq) ?:
|
||||
scoutfs_inode_index_prepare(sb, &locks, from, true) ?:
|
||||
scoutfs_inode_index_prepare(sb, &locks, to, true) ?:
|
||||
scoutfs_inode_index_try_lock_hold(sb, &locks, seq,
|
||||
SIC_EXACT(1, 1));
|
||||
if (ret > 0)
|
||||
continue;
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
ret = scoutfs_dirty_inode_item(from, from_lock) ?:
|
||||
scoutfs_dirty_inode_item(to, to_lock);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
down_write_two(&from_si->extent_sem, &to_si->extent_sem);
|
||||
|
||||
/* arbitrarily limit the number of extents per trans hold */
|
||||
for (i = 0; i < MOVE_DATA_EXTENTS_PER_HOLD; i++) {
|
||||
/* find the next extent to move */
|
||||
ret = scoutfs_ext_next(sb, &data_ext_ops, &from_args,
|
||||
from_iblock, 1, &ext);
|
||||
if (ret < 0) {
|
||||
if (ret == -ENOENT) {
|
||||
done = true;
|
||||
ret = 0;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
/* only move extents within count and i_size */
|
||||
if (ext.start >= from_iblock + count ||
|
||||
ext.start >= i_size_read(from)) {
|
||||
done = true;
|
||||
ret = 0;
|
||||
break;
|
||||
}
|
||||
|
||||
from_start = max(ext.start, from_iblock);
|
||||
map = ext.map + (from_start - ext.start);
|
||||
len = min3(from_iblock + count,
|
||||
round_up((u64)i_size_read(from),
|
||||
SCOUTFS_BLOCK_SM_SIZE),
|
||||
ext.start + ext.len) - from_start;
|
||||
|
||||
to_start = to_iblock + (from_start - from_iblock);
|
||||
|
||||
/* insert the new, fails if it overlaps */
|
||||
ret = scoutfs_ext_insert(sb, &data_ext_ops, &to_args,
|
||||
to_start, len,
|
||||
map, ext.flags);
|
||||
if (ret < 0)
|
||||
break;
|
||||
|
||||
/* remove the old, possibly splitting */
|
||||
ret = scoutfs_ext_set(sb, &data_ext_ops, &from_args,
|
||||
from_start, len, 0, 0);
|
||||
if (ret < 0) {
|
||||
/* remove inserted new on err */
|
||||
err = scoutfs_ext_remove(sb, &data_ext_ops,
|
||||
&to_args, to_start,
|
||||
len);
|
||||
BUG_ON(err); /* XXX inconsistent */
|
||||
break;
|
||||
}
|
||||
|
||||
trace_scoutfs_data_move_blocks(sb, scoutfs_ino(from),
|
||||
from_start, len, map,
|
||||
ext.flags,
|
||||
scoutfs_ino(to),
|
||||
to_start);
|
||||
|
||||
/* moved extent might extend i_size */
|
||||
to_size = (to_start + len) << SCOUTFS_BLOCK_SM_SHIFT;
|
||||
if (to_size > i_size_read(to)) {
|
||||
/* while maintaining final partial */
|
||||
from_size = (from_start + len) <<
|
||||
SCOUTFS_BLOCK_SM_SHIFT;
|
||||
if (from_size > i_size_read(from))
|
||||
to_size -= from_size -
|
||||
i_size_read(from);
|
||||
i_size_write(to, to_size);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
up_write(&from_si->extent_sem);
|
||||
up_write(&to_si->extent_sem);
|
||||
|
||||
from->i_ctime = from->i_mtime =
|
||||
to->i_ctime = to->i_mtime = CURRENT_TIME;
|
||||
scoutfs_inode_inc_data_version(from);
|
||||
scoutfs_inode_inc_data_version(to);
|
||||
scoutfs_inode_set_data_seq(from);
|
||||
scoutfs_inode_set_data_seq(to);
|
||||
|
||||
scoutfs_update_inode_item(from, from_lock, &locks);
|
||||
scoutfs_update_inode_item(to, to_lock, &locks);
|
||||
scoutfs_release_trans(sb);
|
||||
scoutfs_inode_index_unlock(sb, &locks);
|
||||
|
||||
if (ret < 0 || done)
|
||||
break;
|
||||
}
|
||||
|
||||
/* remove any cached pages from old extents */
|
||||
truncate_inode_pages_extent(from, from_iblock, count);
|
||||
truncate_inode_pages_extent(to, to_iblock, count);
|
||||
|
||||
out:
|
||||
scoutfs_unlock(sb, from_lock, SCOUTFS_LOCK_WRITE);
|
||||
scoutfs_unlock(sb, to_lock, SCOUTFS_LOCK_WRITE);
|
||||
|
||||
unlock_two_nondirectories(from, to);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* This copies to userspace :/
|
||||
*/
|
||||
|
||||
@@ -58,6 +58,8 @@ int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
|
||||
long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len);
|
||||
int scoutfs_data_init_offline_extent(struct inode *inode, u64 size,
|
||||
struct scoutfs_lock *lock);
|
||||
int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
|
||||
u64 byte_len, struct inode *to, u64 to_off);
|
||||
|
||||
int scoutfs_data_wait_check(struct inode *inode, loff_t pos, loff_t len,
|
||||
u8 sef, u8 op, struct scoutfs_data_wait *ow,
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
*/
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/uaccess.h>
|
||||
#include <linux/compiler.h>
|
||||
#include <linux/uio.h>
|
||||
@@ -937,6 +938,54 @@ static long scoutfs_ioc_alloc_detail(struct file *file, unsigned long arg)
|
||||
args.copied;
|
||||
}
|
||||
|
||||
static long scoutfs_ioc_move_blocks(struct file *file, unsigned long arg)
|
||||
{
|
||||
struct inode *to = file_inode(file);
|
||||
struct super_block *sb = to->i_sb;
|
||||
struct scoutfs_ioctl_move_blocks __user *umb = (void __user *)arg;
|
||||
struct scoutfs_ioctl_move_blocks mb;
|
||||
struct file *from_file;
|
||||
struct inode *from;
|
||||
int ret;
|
||||
|
||||
if (copy_from_user(&mb, umb, sizeof(mb)))
|
||||
return -EFAULT;
|
||||
|
||||
if (mb.len == 0)
|
||||
return 0;
|
||||
|
||||
if (mb.from_off + mb.len < mb.from_off ||
|
||||
mb.to_off + mb.len < mb.to_off)
|
||||
return -EOVERFLOW;
|
||||
|
||||
from_file = fget(mb.from_fd);
|
||||
if (!from_file)
|
||||
return -EBADF;
|
||||
from = file_inode(from_file);
|
||||
|
||||
if (from == to) {
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (from->i_sb != sb) {
|
||||
ret = -EXDEV;
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = mnt_want_write_file(file);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
ret = scoutfs_data_move_blocks(from, mb.from_off, mb.len,
|
||||
to, mb.to_off);
|
||||
mnt_drop_write_file(file);
|
||||
out:
|
||||
fput(from_file);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
|
||||
{
|
||||
switch (cmd) {
|
||||
@@ -964,6 +1013,8 @@ long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
|
||||
return scoutfs_ioc_data_wait_err(file, arg);
|
||||
case SCOUTFS_IOC_ALLOC_DETAIL:
|
||||
return scoutfs_ioc_alloc_detail(file, arg);
|
||||
case SCOUTFS_IOC_MOVE_BLOCKS:
|
||||
return scoutfs_ioc_move_blocks(file, arg);
|
||||
}
|
||||
|
||||
return -ENOTTY;
|
||||
|
||||
@@ -413,4 +413,55 @@ struct scoutfs_ioctl_alloc_detail_entry {
|
||||
#define SCOUTFS_IOC_ALLOC_DETAIL _IOR(SCOUTFS_IOCTL_MAGIC, 12, \
|
||||
struct scoutfs_ioctl_alloc_detail)
|
||||
|
||||
/*
|
||||
* Move extents from one regular file to another at a different offset,
|
||||
* on the same file system.
|
||||
*
|
||||
* from_fd specifies the source file and the ioctl is called on the
|
||||
* destination file. Both files must have write access. from_off
|
||||
* specifies the byte offset in the source, to_off is the byte offset in
|
||||
* the destination, and len is the number of bytes in the region to
|
||||
* move. from_off and to_off must be multiples of 4KB. len must also be
|
||||
* a multiple of 4KB unless from_off + len ends exactly at the i_size of
|
||||
* the source file.
|
||||
*
|
||||
* This interface only moves extents which are block granular, it does
|
||||
* not perform RMW of sub-block byte extents and it does not overwrite
|
||||
* existing extents in the destination. It will split extents in the
|
||||
* source.
|
||||
*
|
||||
* Only extents within i_size on the source are moved. The destination
|
||||
* i_size will be updated if extents are moved beyond its current
|
||||
* i_size. The i_size update will maintain final partial blocks in the
|
||||
* source.
|
||||
*
|
||||
* It will return an error if either of the files have offline extents.
|
||||
* It will return 0 when all of the extents in the source region have
|
||||
* been moved to the destination. Moving extents updates the ctime,
|
||||
* mtime, meta_seq, data_seq, and data_version fields of both the source
|
||||
* and destination inodes. If an error is returned then partial
|
||||
* progress may have been made and inode fields may have been updated.
|
||||
*
|
||||
* Errors specific to this interface include:
|
||||
*
|
||||
* EINVAL: from_off, len, or to_off aren't a multiple of 4KB; the source
|
||||
* and destination files are the same inode; either the source or
|
||||
* destination is not a regular file; the destination file has
|
||||
* an existing overlapping extent.
|
||||
* EOVERFLOW: either from_off + len or to_off + len exceeded 64bits.
|
||||
* EBADF: from_fd isn't a valid open file descriptor.
|
||||
* EXDEV: the source and destination files are in different filesystems.
|
||||
* EISDIR: either the source or destination is a directory.
|
||||
* ENODATA: either the source or destination file has offline extents.
|
||||
*/
|
||||
struct scoutfs_ioctl_move_blocks {
|
||||
/* descriptor of the source file; the ioctl handler resolves it with fget() */
__u64 from_fd;
|
||||
/* byte offset in the source file where the region to move begins */
__u64 from_off;
|
||||
/* length of the region in bytes; the handler returns 0 immediately when this is 0 */
__u64 len;
|
||||
/* byte offset in the destination file that receives the region */
__u64 to_off;
|
||||
};
|
||||
|
||||
/*
 * NOTE(review): the handler only copies this struct in from userspace
 * and never writes it back, which conventionally corresponds to _IOW
 * rather than _IOR.  The direction is encoded in the ioctl number so
 * changing it would break existing callers -- confirm before touching.
 */
#define SCOUTFS_IOC_MOVE_BLOCKS _IOR(SCOUTFS_IOCTL_MAGIC, 13, \
|
||||
struct scoutfs_ioctl_move_blocks)
|
||||
|
||||
#endif
|
||||
|
||||
@@ -169,6 +169,40 @@ TRACE_EVENT(scoutfs_data_fallocate,
|
||||
__entry->len, __entry->ret)
|
||||
);
|
||||
|
||||
TRACE_EVENT(scoutfs_data_move_blocks,
|
||||
TP_PROTO(struct super_block *sb, u64 from_ino, u64 from_start, u64 len,
|
||||
u64 map, u8 flags, u64 to_ino, u64 to_start),
|
||||
|
||||
TP_ARGS(sb, from_ino, from_start, len, map, flags, to_ino, to_start),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
SCSB_TRACE_FIELDS
|
||||
__field(__u64, from_ino)
|
||||
__field(__u64, from_start)
|
||||
__field(__u64, len)
|
||||
__field(__u64, map)
|
||||
__field(__u8, flags)
|
||||
__field(__u64, to_ino)
|
||||
__field(__u64, to_start)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
SCSB_TRACE_ASSIGN(sb);
|
||||
__entry->from_ino = from_ino;
|
||||
__entry->from_start = from_start;
|
||||
__entry->len = len;
|
||||
__entry->map = map;
|
||||
__entry->flags = flags;
|
||||
__entry->to_ino = to_ino;
|
||||
__entry->to_start = to_start;
|
||||
),
|
||||
|
||||
TP_printk(SCSBF" from_ino %llu from_start %llu len %llu map %llu flags 0x%x to_ino %llu to_start %llu\n",
|
||||
SCSB_TRACE_ARGS, __entry->from_ino, __entry->from_start,
|
||||
__entry->len, __entry->map, __entry->flags, __entry->to_ino,
|
||||
__entry->to_start)
|
||||
);
|
||||
|
||||
TRACE_EVENT(scoutfs_data_fiemap,
|
||||
TP_PROTO(struct super_block *sb, __u64 start, __u64 len, int ret),
|
||||
|
||||
|
||||
Reference in New Issue
Block a user