mirror of
https://github.com/versity/scoutfs.git
synced 2026-05-03 19:35:43 +00:00
Compare commits
2 Commits
zab/punch_
...
clk/max_ho
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2fcd56d0e2 | ||
|
|
e0d2aec2c0 |
@@ -86,18 +86,47 @@ static u64 smallest_order_length(u64 len)
|
||||
}
|
||||
|
||||
/*
|
||||
* An extent modification dirties three distinct leaves of an allocator
|
||||
* btree as it adds and removes the blkno and size sorted items for the
|
||||
* old and new lengths of the extent. Dirtying the paths to these
|
||||
* leaves can grow the tree and grow/shrink neighbours at each level.
|
||||
* We over-estimate the number of blocks allocated and freed (the paths
|
||||
* share a root, growth doesn't free) to err on the simpler and safer
|
||||
* side. The overhead is minimal given the relatively large list blocks
|
||||
* and relatively short allocator trees.
|
||||
* Moving an extent between trees can dirty blocks in several ways. This
|
||||
* function calculates worst case number of blocks across these scenarions.
|
||||
* We treat the alloc and free counts independently, so the values below are
|
||||
* max(allocated, freed), not the sum.
|
||||
*
|
||||
* We track extents with two separate btree items: by block number and by size.
|
||||
*
|
||||
* If we're removing an extent from the btree (allocating), we can dirty
|
||||
* two blocks if the keys are in different leaves. If we wind up merging
|
||||
* leaves because we fall below the low water mark, we can wind up freeing
|
||||
* three leaves.
|
||||
*
|
||||
* That sequence is as follows, assuming the original keys are removed from
|
||||
* blocks A and B:
|
||||
*
|
||||
* Allocate new dirty A' and B'
|
||||
* Free old stable A and B
|
||||
* B' has fallen below the low water mark, so copy B' into A'
|
||||
* Free B'
|
||||
*
|
||||
* An extent insertion (freeing an extent) can dirty up to five distinct items
|
||||
* in the btree as it adds and removes the blkno and size sorted items for the
|
||||
* old and new lengths of the extent:
|
||||
*
|
||||
* In the by-blkno portion of the btree, we can dirty (allocate for COW) up
|
||||
* to two blocks- either by merging adjacent extents, which can cause us to
|
||||
* join leaf blocks; or by an insertion that causes a split.
|
||||
*
|
||||
* In the by-size portion, we never merge extents, so normally we just dirty
|
||||
* a single item with a size insertion. But if we merged adjacent extents in
|
||||
* the by-blkno portion of the tree, we might be working with three by-sizex
|
||||
* items: removing the two old ones that were combined in the merge; and
|
||||
* adding the new one for the larger, merged size.
|
||||
*
|
||||
* Finally, dirtying the paths to these leaves can grow the tree and grow/shrink
|
||||
* neighbours at each level, so we multiply by the height of the tree after
|
||||
* accounting for a possible new level.
|
||||
*/
|
||||
static u32 extent_mod_blocks(u32 height)
|
||||
{
|
||||
return ((1 + height) * 2) * 3;
|
||||
return ((1 + height) * 3) * 5;
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
@@ -1515,93 +1515,6 @@ out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Punch holes in offline extents. This is a very specific tool that
|
||||
* only does one job: it converts extents from offline to sparse. It
|
||||
* returns an error if it encounters an extent that isn't offline or has
|
||||
* a block mapping. It ignores i_size completely; it does not test it,
|
||||
* and does not update it.
|
||||
*
|
||||
* The caller has the inode locked in the vfs and performed basic sanity
|
||||
* checks. We manage transactions and the extent_sem which is ordered
|
||||
* inside the transaction.
|
||||
*/
|
||||
int scoutfs_data_punch_offline(struct inode *inode, u64 iblock, u64 last, u64 data_version,
|
||||
struct scoutfs_lock *lock)
|
||||
{
|
||||
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
|
||||
struct super_block *sb = inode->i_sb;
|
||||
struct data_ext_args args = {
|
||||
.ino = scoutfs_ino(inode),
|
||||
.inode = inode,
|
||||
.lock = lock,
|
||||
};
|
||||
struct scoutfs_extent ext;
|
||||
LIST_HEAD(ind_locks);
|
||||
int ret;
|
||||
int i;
|
||||
|
||||
if (WARN_ON_ONCE(iblock > last)) {
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* idiomatic to call start,last with 0,~0, clamp last to last possible */
|
||||
last = min(last, SCOUTFS_BLOCK_SM_MAX);
|
||||
|
||||
ret = 0;
|
||||
while (iblock <= last) {
|
||||
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, true, false) ?:
|
||||
scoutfs_dirty_inode_item(inode, lock);
|
||||
if (ret < 0)
|
||||
break;
|
||||
|
||||
down_write(&si->extent_sem);
|
||||
|
||||
for (i = 0; i < 32 && (iblock <= last); i++) {
|
||||
ret = scoutfs_ext_next(sb, &data_ext_ops, &args, iblock, 1, &ext);
|
||||
if (ret == -ENOENT || ext.start > iblock) {
|
||||
iblock = last + 1;
|
||||
ret = 0;
|
||||
break;
|
||||
}
|
||||
|
||||
if (ext.map) {
|
||||
ret = -EINVAL;
|
||||
break;
|
||||
}
|
||||
|
||||
if (ext.flags & SEF_OFFLINE) {
|
||||
if (iblock > ext.start) {
|
||||
ext.len -= iblock - ext.start;
|
||||
ext.start = iblock;
|
||||
}
|
||||
ext.len = min(ext.len, last - ext.start + 1);
|
||||
ext.flags &= ~SEF_OFFLINE;
|
||||
|
||||
ret = scoutfs_ext_set(sb, &data_ext_ops, &args,
|
||||
ext.start, ext.len, ext.map, ext.flags);
|
||||
if (ret < 0)
|
||||
break;
|
||||
}
|
||||
|
||||
iblock = ext.start + ext.len;
|
||||
}
|
||||
|
||||
up_write(&si->extent_sem);
|
||||
|
||||
scoutfs_update_inode_item(inode, lock, &ind_locks);
|
||||
scoutfs_release_trans(sb);
|
||||
scoutfs_inode_index_unlock(sb, &ind_locks);
|
||||
|
||||
if (ret < 0)
|
||||
break;
|
||||
}
|
||||
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* This copies to userspace :/
|
||||
*/
|
||||
|
||||
@@ -57,8 +57,6 @@ int scoutfs_data_init_offline_extent(struct inode *inode, u64 size,
|
||||
int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
|
||||
u64 byte_len, struct inode *to, u64 to_off, bool to_stage,
|
||||
u64 data_version);
|
||||
int scoutfs_data_punch_offline(struct inode *inode, u64 iblock, u64 last, u64 data_version,
|
||||
struct scoutfs_lock *lock);
|
||||
|
||||
int scoutfs_data_wait_check(struct inode *inode, loff_t pos, loff_t len,
|
||||
u8 sef, u8 op, struct scoutfs_data_wait *ow,
|
||||
|
||||
@@ -1668,78 +1668,6 @@ out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static long scoutfs_ioc_punch_offline(struct file *file, unsigned long arg)
|
||||
{
|
||||
struct inode *inode = file_inode(file);
|
||||
struct super_block *sb = inode->i_sb;
|
||||
struct scoutfs_ioctl_punch_offline __user *upo = (void __user *)arg;
|
||||
struct scoutfs_ioctl_punch_offline po;
|
||||
struct scoutfs_lock *lock = NULL;
|
||||
u64 iblock;
|
||||
u64 last;
|
||||
u64 tmp;
|
||||
int ret;
|
||||
|
||||
if (copy_from_user(&po, upo, sizeof(po)))
|
||||
return -EFAULT;
|
||||
|
||||
if (po.len == 0)
|
||||
return 0;
|
||||
|
||||
if (check_add_overflow(po.offset, po.len - 1, &tmp) ||
|
||||
(po.offset & SCOUTFS_BLOCK_SM_MASK) ||
|
||||
(po.len & SCOUTFS_BLOCK_SM_MASK))
|
||||
return -EOVERFLOW;
|
||||
|
||||
if (po.flags)
|
||||
return -EINVAL;
|
||||
|
||||
ret = mnt_want_write_file(file);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
inode_lock(inode);
|
||||
|
||||
ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_WRITE,
|
||||
SCOUTFS_LKF_REFRESH_INODE, inode, &lock);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
if (!S_ISREG(inode->i_mode)) {
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (!(file->f_mode & FMODE_WRITE)) {
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = inode_permission(KC_VFS_INIT_NS inode, MAY_WRITE);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
if (scoutfs_inode_data_version(inode) != po.data_version) {
|
||||
ret = -ESTALE;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if ((ret = scoutfs_inode_check_retention(inode)))
|
||||
goto out;
|
||||
|
||||
iblock = po.offset >> SCOUTFS_BLOCK_SM_SHIFT;
|
||||
last = (po.offset + po.len - 1) >> SCOUTFS_BLOCK_SM_SHIFT;
|
||||
|
||||
ret = scoutfs_data_punch_offline(inode, iblock, last, po.data_version, lock);
|
||||
|
||||
out:
|
||||
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
|
||||
inode_unlock(inode);
|
||||
mnt_drop_write_file(file);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
|
||||
{
|
||||
switch (cmd) {
|
||||
@@ -1789,8 +1717,6 @@ long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
|
||||
return scoutfs_ioc_mod_quota_rule(file, arg, false);
|
||||
case SCOUTFS_IOC_READ_XATTR_INDEX:
|
||||
return scoutfs_ioc_read_xattr_index(file, arg);
|
||||
case SCOUTFS_IOC_PUNCH_OFFLINE:
|
||||
return scoutfs_ioc_punch_offline(file, arg);
|
||||
}
|
||||
|
||||
return -ENOTTY;
|
||||
|
||||
@@ -843,32 +843,4 @@ struct scoutfs_ioctl_read_xattr_index {
|
||||
#define SCOUTFS_IOC_READ_XATTR_INDEX \
|
||||
_IOR(SCOUTFS_IOCTL_MAGIC, 23, struct scoutfs_ioctl_read_xattr_index)
|
||||
|
||||
/*
|
||||
* This is a limited and specific version of hole punching. It's an
|
||||
* archive layer operation that only converts unmapped offline extents
|
||||
* into sparse extents. It is intended to be used when restoring sparse
|
||||
* files after the initial creation set the entire file size offline.
|
||||
*
|
||||
* The offset and len fields are in units of bytes and must be aligned
|
||||
* to the small (4KiB) block size. All regions of offline extents
|
||||
* covered by the region will be converted into sparse online extents,
|
||||
* including regions that straddle the boundaries of the region. Any
|
||||
* existing sparse extents in the region are ignored.
|
||||
*
|
||||
* The data_version must match the inode or EINVAL is returned. The
|
||||
* data_version is not modified by this operation.
|
||||
*
|
||||
* EINVAL is returned if any mapped extents are found in the region. If
|
||||
* an error is returned then partial progress may have been made.
|
||||
*/
|
||||
struct scoutfs_ioctl_punch_offline {
|
||||
__u64 offset;
|
||||
__u64 len;
|
||||
__u64 data_version;
|
||||
__u64 flags;
|
||||
};
|
||||
|
||||
#define SCOUTFS_IOC_PUNCH_OFFLINE \
|
||||
_IOW(SCOUTFS_IOCTL_MAGIC, 24, struct scoutfs_ioctl_punch_offline)
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1966,15 +1966,17 @@ DEFINE_EVENT(scoutfs_server_client_count_class, scoutfs_server_client_down,
|
||||
);
|
||||
|
||||
DECLARE_EVENT_CLASS(scoutfs_server_commit_users_class,
|
||||
TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders,
|
||||
u32 avail_before, u32 freed_before, int committing, int exceeded),
|
||||
TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, committing,
|
||||
exceeded),
|
||||
TP_PROTO(struct super_block *sb, int holding, int applying,
|
||||
int nr_holders, u32 budget,
|
||||
u32 avail_before, u32 freed_before,
|
||||
int committing, int exceeded),
|
||||
TP_ARGS(sb, holding, applying, nr_holders, budget, avail_before, freed_before, committing, exceeded),
|
||||
TP_STRUCT__entry(
|
||||
SCSB_TRACE_FIELDS
|
||||
__field(int, holding)
|
||||
__field(int, applying)
|
||||
__field(int, nr_holders)
|
||||
__field(u32, budget)
|
||||
__field(__u32, avail_before)
|
||||
__field(__u32, freed_before)
|
||||
__field(int, committing)
|
||||
@@ -1985,35 +1987,45 @@ DECLARE_EVENT_CLASS(scoutfs_server_commit_users_class,
|
||||
__entry->holding = !!holding;
|
||||
__entry->applying = !!applying;
|
||||
__entry->nr_holders = nr_holders;
|
||||
__entry->budget = budget;
|
||||
__entry->avail_before = avail_before;
|
||||
__entry->freed_before = freed_before;
|
||||
__entry->committing = !!committing;
|
||||
__entry->exceeded = !!exceeded;
|
||||
),
|
||||
TP_printk(SCSBF" holding %u applying %u nr %u avail_before %u freed_before %u committing %u exceeded %u",
|
||||
SCSB_TRACE_ARGS, __entry->holding, __entry->applying, __entry->nr_holders,
|
||||
__entry->avail_before, __entry->freed_before, __entry->committing,
|
||||
__entry->exceeded)
|
||||
TP_printk(SCSBF" holding %u applying %u nr %u budget %u avail_before %u freed_before %u committing %u exceeded %u",
|
||||
SCSB_TRACE_ARGS, __entry->holding, __entry->applying,
|
||||
__entry->nr_holders, __entry->budget,
|
||||
__entry->avail_before, __entry->freed_before,
|
||||
__entry->committing, __entry->exceeded)
|
||||
);
|
||||
DEFINE_EVENT(scoutfs_server_commit_users_class, scoutfs_server_commit_hold,
|
||||
TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders,
|
||||
u32 avail_before, u32 freed_before, int committing, int exceeded),
|
||||
TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, committing, exceeded)
|
||||
TP_PROTO(struct super_block *sb, int holding, int applying,
|
||||
int nr_holders, u32 budget,
|
||||
u32 avail_before, u32 freed_before,
|
||||
int committing, int exceeded),
|
||||
TP_ARGS(sb, holding, applying, nr_holders, budget, avail_before, freed_before, committing, exceeded)
|
||||
);
|
||||
DEFINE_EVENT(scoutfs_server_commit_users_class, scoutfs_server_commit_apply,
|
||||
TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders,
|
||||
u32 avail_before, u32 freed_before, int committing, int exceeded),
|
||||
TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, committing, exceeded)
|
||||
TP_PROTO(struct super_block *sb, int holding, int applying,
|
||||
int nr_holders, u32 budget,
|
||||
u32 avail_before, u32 freed_before,
|
||||
int committing, int exceeded),
|
||||
TP_ARGS(sb, holding, applying, nr_holders, budget, avail_before, freed_before, committing, exceeded)
|
||||
);
|
||||
DEFINE_EVENT(scoutfs_server_commit_users_class, scoutfs_server_commit_start,
|
||||
TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders,
|
||||
u32 avail_before, u32 freed_before, int committing, int exceeded),
|
||||
TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, committing, exceeded)
|
||||
TP_PROTO(struct super_block *sb, int holding, int applying,
|
||||
int nr_holders, u32 budget,
|
||||
u32 avail_before, u32 freed_before,
|
||||
int committing, int exceeded),
|
||||
TP_ARGS(sb, holding, applying, nr_holders, budget, avail_before, freed_before, committing, exceeded)
|
||||
);
|
||||
DEFINE_EVENT(scoutfs_server_commit_users_class, scoutfs_server_commit_end,
|
||||
TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders,
|
||||
u32 avail_before, u32 freed_before, int committing, int exceeded),
|
||||
TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, committing, exceeded)
|
||||
TP_PROTO(struct super_block *sb, int holding, int applying,
|
||||
int nr_holders, u32 budget,
|
||||
u32 avail_before, u32 freed_before,
|
||||
int committing, int exceeded),
|
||||
TP_ARGS(sb, holding, applying, nr_holders, budget, avail_before, freed_before, committing, exceeded)
|
||||
);
|
||||
|
||||
#define slt_symbolic(mode) \
|
||||
|
||||
@@ -65,6 +65,7 @@ struct commit_users {
|
||||
struct list_head holding;
|
||||
struct list_head applying;
|
||||
unsigned int nr_holders;
|
||||
u32 budget;
|
||||
u32 avail_before;
|
||||
u32 freed_before;
|
||||
bool committing;
|
||||
@@ -84,8 +85,9 @@ static void init_commit_users(struct commit_users *cusers)
|
||||
do { \
|
||||
__typeof__(cusers) _cusers = (cusers); \
|
||||
trace_scoutfs_server_commit_##which(sb, !list_empty(&_cusers->holding), \
|
||||
!list_empty(&_cusers->applying), _cusers->nr_holders, _cusers->avail_before, \
|
||||
_cusers->freed_before, _cusers->committing, _cusers->exceeded); \
|
||||
!list_empty(&_cusers->applying), _cusers->nr_holders, _cusers->budget, \
|
||||
_cusers->avail_before, _cusers->freed_before, _cusers->committing, \
|
||||
_cusers->exceeded); \
|
||||
} while (0)
|
||||
|
||||
struct server_info {
|
||||
@@ -303,7 +305,6 @@ static void check_holder_budget(struct super_block *sb, struct server_info *serv
|
||||
u32 freed_used;
|
||||
u32 avail_now;
|
||||
u32 freed_now;
|
||||
u32 budget;
|
||||
|
||||
assert_spin_locked(&cusers->lock);
|
||||
|
||||
@@ -318,15 +319,14 @@ static void check_holder_budget(struct super_block *sb, struct server_info *serv
|
||||
else
|
||||
freed_used = SCOUTFS_ALLOC_LIST_MAX_BLOCKS - freed_now;
|
||||
|
||||
budget = cusers->nr_holders * COMMIT_HOLD_ALLOC_BUDGET;
|
||||
if (avail_used <= budget && freed_used <= budget)
|
||||
if (avail_used <= cusers->budget && freed_used <= cusers->budget)
|
||||
return;
|
||||
|
||||
exceeded_once = true;
|
||||
cusers->exceeded = cusers->nr_holders;
|
||||
|
||||
scoutfs_err(sb, "%u holders exceeded alloc budget av: bef %u now %u, fr: bef %u now %u",
|
||||
cusers->nr_holders, cusers->avail_before, avail_now,
|
||||
scoutfs_err(sb, "holders exceeded alloc budget %u av: bef %u now %u, fr: bef %u now %u",
|
||||
cusers->budget, cusers->avail_before, avail_now,
|
||||
cusers->freed_before, freed_now);
|
||||
|
||||
list_for_each_entry(hold, &cusers->holding, entry) {
|
||||
@@ -349,7 +349,7 @@ static bool hold_commit(struct super_block *sb, struct server_info *server,
|
||||
{
|
||||
bool has_room;
|
||||
bool held;
|
||||
u32 budget;
|
||||
u32 new_budget;
|
||||
u32 av;
|
||||
u32 fr;
|
||||
|
||||
@@ -367,8 +367,8 @@ static bool hold_commit(struct super_block *sb, struct server_info *server,
|
||||
}
|
||||
|
||||
/* +2 for our additional hold and then for the final commit work the server does */
|
||||
budget = (cusers->nr_holders + 2) * COMMIT_HOLD_ALLOC_BUDGET;
|
||||
has_room = av >= budget && fr >= budget;
|
||||
new_budget = max(cusers->budget, (cusers->nr_holders + 2) * COMMIT_HOLD_ALLOC_BUDGET);
|
||||
has_room = av >= new_budget && fr >= new_budget;
|
||||
/* checking applying so holders drain once an apply caller starts waiting */
|
||||
held = !cusers->committing && has_room && list_empty(&cusers->applying);
|
||||
|
||||
@@ -388,6 +388,7 @@ static bool hold_commit(struct super_block *sb, struct server_info *server,
|
||||
list_add_tail(&hold->entry, &cusers->holding);
|
||||
|
||||
cusers->nr_holders++;
|
||||
cusers->budget = new_budget;
|
||||
|
||||
} else if (!has_room && cusers->nr_holders == 0 && !cusers->committing) {
|
||||
cusers->committing = true;
|
||||
@@ -516,6 +517,7 @@ static void commit_end(struct super_block *sb, struct commit_users *cusers, int
|
||||
list_for_each_entry_safe(hold, tmp, &cusers->applying, entry)
|
||||
list_del_init(&hold->entry);
|
||||
cusers->committing = false;
|
||||
cusers->budget = 0;
|
||||
spin_unlock(&cusers->lock);
|
||||
|
||||
wake_up(&cusers->waitq);
|
||||
|
||||
Reference in New Issue
Block a user