Compare commits

..

2 Commits

Author SHA1 Message Date
Chris Kirby
2fcd56d0e2 Fix commit budget calculation with multiple holders
The try_drain_data_freed() path was generating errors about overrunning
its commit budget:

scoutfs f.2b8928.r.02689f error: 1 holders exceeded alloc budget av: bef 8185 now 8036, fr: bef 8185 now 7602

The budget overrun check was using the current number of commit holders
(in this case one) instead of the the maximum number of concurrent holders
(in this case two). So even well behaved paths like try_drain_data_freed()
can appear to exceed their commit budget if other holders dirty some blocks
and apply their commits before the try_drain_data_freed() thread does its
final budget reconciliation.

Signed-off-by: Chris Kirby <ckirby@versity.com>
2025-06-17 11:38:07 -05:00
Chris Kirby
e0d2aec2c0 Fix dirtied block calculation in extent_mod_blocks()
Free extents are stored in two btrees: one sorted by block number, one
by size. So if you insert a new extent between two existing extents, you can
be modifying two items in the by-block-number tree. And depending on the size
of those items, that can result in three items over in the -by-size tree.
So that's a 5x multiplier per level.

If we're shrinking the tree and adding more freed blocks, we're conceptually
dirtying two blocks at each level to merge. (current *2 in the code).
But if they fall under the low water mark then one of them is freed, so we
can have *3 per level in this case.

Signed-off-by: Chris Kirby <ckirby@versity.com>
2025-06-17 11:38:07 -05:00
9 changed files with 134 additions and 113 deletions

View File

@@ -86,18 +86,47 @@ static u64 smallest_order_length(u64 len)
}
/*
* An extent modification dirties three distinct leaves of an allocator
* btree as it adds and removes the blkno and size sorted items for the
* old and new lengths of the extent. Dirtying the paths to these
* leaves can grow the tree and grow/shrink neighbours at each level.
* We over-estimate the number of blocks allocated and freed (the paths
* share a root, growth doesn't free) to err on the simpler and safer
* side. The overhead is minimal given the relatively large list blocks
* and relatively short allocator trees.
* Moving an extent between trees can dirty blocks in several ways. This
* function calculates worst case number of blocks across these scenarions.
* We treat the alloc and free counts independently, so the values below are
* max(allocated, freed), not the sum.
*
* We track extents with two separate btree items: by block number and by size.
*
* If we're removing an extent from the btree (allocating), we can dirty
* two blocks if the keys are in different leaves. If we wind up merging
* leaves because we fall below the low water mark, we can wind up freeing
* three leaves.
*
* That sequence is as follows, assuming the original keys are removed from
* blocks A and B:
*
* Allocate new dirty A' and B'
* Free old stable A and B
* B' has fallen below the low water mark, so copy B' into A'
* Free B'
*
* An extent insertion (freeing an extent) can dirty up to five distinct items
* in the btree as it adds and removes the blkno and size sorted items for the
* old and new lengths of the extent:
*
* In the by-blkno portion of the btree, we can dirty (allocate for COW) up
* to two blocks- either by merging adjacent extents, which can cause us to
* join leaf blocks; or by an insertion that causes a split.
*
* In the by-size portion, we never merge extents, so normally we just dirty
* a single item with a size insertion. But if we merged adjacent extents in
* the by-blkno portion of the tree, we might be working with three by-sizex
* items: removing the two old ones that were combined in the merge; and
* adding the new one for the larger, merged size.
*
* Finally, dirtying the paths to these leaves can grow the tree and grow/shrink
* neighbours at each level, so we multiply by the height of the tree after
* accounting for a possible new level.
*/
static u32 extent_mod_blocks(u32 height)
{
return ((1 + height) * 2) * 3;
return ((1 + height) * 3) * 5;
}
/*

View File

@@ -296,39 +296,28 @@ static s64 truncate_extents(struct super_block *sb, struct inode *inode,
* and offline blocks. If it's not provided then the inode is being
* destroyed and isn't reachable, we don't need to update it.
*
* If 'pause' is set, then we are destroying the inode and we should take
* breaks occasionally to allow other nodes access to this inode lock shard.
*
* The caller is in charge of locking the inode and data, but we may
* have to modify far more items than fit in a transaction so we're in
* charge of batching updates into transactions. If the inode is
* provided then we're responsible for updating its item as we go.
*/
int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode,
u64 ino, u64 *iblock, u64 last, bool offline,
struct scoutfs_lock *lock, bool pause)
u64 ino, u64 iblock, u64 last, bool offline,
struct scoutfs_lock *lock)
{
struct scoutfs_inode_info *si = NULL;
LIST_HEAD(ind_locks);
u64 cur_seq;
s64 ret = 0;
WARN_ON_ONCE(inode && !inode_is_locked(inode));
/*
* If the inode is provided, then we aren't destroying it. So it's not
* safe to pause while removing items- it needs to be done in one chunk.
*/
if (WARN_ON_ONCE(pause && inode))
return -EINVAL;
/* clamp last to the last possible block? */
if (last > SCOUTFS_BLOCK_SM_MAX)
last = SCOUTFS_BLOCK_SM_MAX;
trace_scoutfs_data_truncate_items(sb, *iblock, last, offline, pause);
trace_scoutfs_data_truncate_items(sb, iblock, last, offline);
if (WARN_ON_ONCE(last < *iblock))
if (WARN_ON_ONCE(last < iblock))
return -EINVAL;
if (inode) {
@@ -336,9 +325,7 @@ int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode,
down_write(&si->extent_sem);
}
cur_seq = scoutfs_trans_sample_seq(sb);
while (*iblock <= last) {
while (iblock <= last) {
if (inode)
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, true, false);
else
@@ -352,7 +339,7 @@ int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode,
ret = 0;
if (ret == 0)
ret = truncate_extents(sb, inode, ino, *iblock, last,
ret = truncate_extents(sb, inode, ino, iblock, last,
offline, lock);
if (inode)
@@ -364,19 +351,8 @@ int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode,
if (ret <= 0)
break;
*iblock = ret;
iblock = ret;
ret = 0;
/*
* We know there's more to do because truncate_extents()
* pauses every EXTENTS_PER_HOLD extents and it returned the
* next starting block. Our caller might also want us to pause,
* which we will do whenever we cross a transaction boundary.
*/
if (pause && (cur_seq != scoutfs_trans_sample_seq(sb))) {
ret = -EINPROGRESS;
break;
}
}
if (si)

View File

@@ -47,8 +47,8 @@ int scoutfs_get_block_write(struct inode *inode, sector_t iblock, struct buffer_
int create);
int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode,
u64 ino, u64 *iblock, u64 last, bool offline,
struct scoutfs_lock *lock, bool pause);
u64 ino, u64 iblock, u64 last, bool offline,
struct scoutfs_lock *lock);
int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len);
long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len);

View File

@@ -468,8 +468,8 @@ int scoutfs_complete_truncate(struct inode *inode, struct scoutfs_lock *lock)
start = (i_size_read(inode) + SCOUTFS_BLOCK_SM_SIZE - 1) >>
SCOUTFS_BLOCK_SM_SHIFT;
ret = scoutfs_data_truncate_items(inode->i_sb, inode,
scoutfs_ino(inode), &start, ~0ULL,
false, lock, false);
scoutfs_ino(inode), start, ~0ULL,
false, lock);
err = clear_truncate_flag(inode, lock);
return ret ? ret : err;
@@ -1635,8 +1635,7 @@ int scoutfs_inode_orphan_delete(struct super_block *sb, u64 ino, struct scoutfs_
* partial deletion until all deletion is complete and the orphan item
* is removed.
*/
static int delete_inode_items(struct super_block *sb, u64 ino,
struct scoutfs_inode *sinode, u64 *start,
static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_inode *sinode,
struct scoutfs_lock *lock, struct scoutfs_lock *orph_lock)
{
struct scoutfs_key key;
@@ -1655,8 +1654,8 @@ static int delete_inode_items(struct super_block *sb, u64 ino,
/* remove data items in their own transactions */
if (S_ISREG(mode)) {
ret = scoutfs_data_truncate_items(sb, NULL, ino, start, ~0ULL,
false, lock, true);
ret = scoutfs_data_truncate_items(sb, NULL, ino, 0, ~0ULL,
false, lock);
if (ret)
goto out;
}
@@ -1804,23 +1803,16 @@ out:
*/
static int try_delete_inode_items(struct super_block *sb, u64 ino)
{
struct inode_deletion_lock_data *ldata;
struct scoutfs_lock *orph_lock;
struct scoutfs_lock *lock;
struct inode_deletion_lock_data *ldata = NULL;
struct scoutfs_lock *orph_lock = NULL;
struct scoutfs_lock *lock = NULL;
struct scoutfs_inode sinode;
struct scoutfs_key key;
bool clear_trying = false;
bool more = false;
u64 group_nr;
u64 start = 0;
int bit_nr;
int ret;
again:
ldata = NULL;
orph_lock = NULL;
lock = NULL;
ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_WRITE, 0, ino, &lock);
if (ret < 0)
goto out;
@@ -1832,12 +1824,11 @@ again:
goto out;
/* only one local attempt per inode at a time */
if (!more && test_and_set_bit(bit_nr, ldata->trying)) {
if (test_and_set_bit(bit_nr, ldata->trying)) {
ret = 0;
goto out;
}
clear_trying = true;
more = false;
/* can't delete if it's cached in local or remote mounts */
if (scoutfs_omap_test(sb, ino) || test_bit_le(bit_nr, ldata->map.bits)) {
@@ -1862,15 +1853,7 @@ again:
if (ret < 0)
goto out;
ret = delete_inode_items(sb, ino, &sinode, &start, lock, orph_lock);
if (ret == -EINPROGRESS) {
more = true;
clear_trying = false;
} else {
more = false;
}
ret = delete_inode_items(sb, ino, &sinode, lock, orph_lock);
out:
if (clear_trying)
clear_bit(bit_nr, ldata->trying);
@@ -1878,9 +1861,6 @@ out:
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);
if (more)
goto again;
return ret;
}

View File

@@ -372,9 +372,9 @@ static long scoutfs_ioc_release(struct file *file, unsigned long arg)
sblock = args.offset >> SCOUTFS_BLOCK_SM_SHIFT;
eblock = (args.offset + args.length - 1) >> SCOUTFS_BLOCK_SM_SHIFT;
ret = scoutfs_data_truncate_items(sb, inode, scoutfs_ino(inode),
&sblock,
sblock,
eblock, true,
lock, false);
lock);
if (ret == 0) {
scoutfs_inode_get_onoff(inode, &online, &offline);
isize = i_size_read(inode);
@@ -383,8 +383,8 @@ static long scoutfs_ioc_release(struct file *file, unsigned long arg)
>> SCOUTFS_BLOCK_SM_SHIFT;
ret = scoutfs_data_truncate_items(sb, inode,
scoutfs_ino(inode),
&sblock, U64_MAX,
false, lock, false);
sblock, U64_MAX,
false, lock);
}
}

View File

@@ -378,16 +378,15 @@ DEFINE_EVENT(scoutfs_data_file_extent_class, scoutfs_data_fiemap_extent,
);
TRACE_EVENT(scoutfs_data_truncate_items,
TP_PROTO(struct super_block *sb, __u64 iblock, __u64 last, int offline, bool pause),
TP_PROTO(struct super_block *sb, __u64 iblock, __u64 last, int offline),
TP_ARGS(sb, iblock, last, offline, pause),
TP_ARGS(sb, iblock, last, offline),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, iblock)
__field(__u64, last)
__field(int, offline)
__field(bool, pause)
),
TP_fast_assign(
@@ -395,12 +394,10 @@ TRACE_EVENT(scoutfs_data_truncate_items,
__entry->iblock = iblock;
__entry->last = last;
__entry->offline = offline;
__entry->pause = pause;
),
TP_printk(SCSBF" iblock %llu last %llu offline %u pause %d",
SCSB_TRACE_ARGS, __entry->iblock, __entry->last,
__entry->offline, __entry->pause)
TP_printk(SCSBF" iblock %llu last %llu offline %u", SCSB_TRACE_ARGS,
__entry->iblock, __entry->last, __entry->offline)
);
TRACE_EVENT(scoutfs_data_wait_check,
@@ -1969,15 +1966,17 @@ DEFINE_EVENT(scoutfs_server_client_count_class, scoutfs_server_client_down,
);
DECLARE_EVENT_CLASS(scoutfs_server_commit_users_class,
TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders,
u32 avail_before, u32 freed_before, int committing, int exceeded),
TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, committing,
exceeded),
TP_PROTO(struct super_block *sb, int holding, int applying,
int nr_holders, u32 budget,
u32 avail_before, u32 freed_before,
int committing, int exceeded),
TP_ARGS(sb, holding, applying, nr_holders, budget, avail_before, freed_before, committing, exceeded),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(int, holding)
__field(int, applying)
__field(int, nr_holders)
__field(u32, budget)
__field(__u32, avail_before)
__field(__u32, freed_before)
__field(int, committing)
@@ -1988,35 +1987,45 @@ DECLARE_EVENT_CLASS(scoutfs_server_commit_users_class,
__entry->holding = !!holding;
__entry->applying = !!applying;
__entry->nr_holders = nr_holders;
__entry->budget = budget;
__entry->avail_before = avail_before;
__entry->freed_before = freed_before;
__entry->committing = !!committing;
__entry->exceeded = !!exceeded;
),
TP_printk(SCSBF" holding %u applying %u nr %u avail_before %u freed_before %u committing %u exceeded %u",
SCSB_TRACE_ARGS, __entry->holding, __entry->applying, __entry->nr_holders,
__entry->avail_before, __entry->freed_before, __entry->committing,
__entry->exceeded)
TP_printk(SCSBF" holding %u applying %u nr %u budget %u avail_before %u freed_before %u committing %u exceeded %u",
SCSB_TRACE_ARGS, __entry->holding, __entry->applying,
__entry->nr_holders, __entry->budget,
__entry->avail_before, __entry->freed_before,
__entry->committing, __entry->exceeded)
);
DEFINE_EVENT(scoutfs_server_commit_users_class, scoutfs_server_commit_hold,
TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders,
u32 avail_before, u32 freed_before, int committing, int exceeded),
TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, committing, exceeded)
TP_PROTO(struct super_block *sb, int holding, int applying,
int nr_holders, u32 budget,
u32 avail_before, u32 freed_before,
int committing, int exceeded),
TP_ARGS(sb, holding, applying, nr_holders, budget, avail_before, freed_before, committing, exceeded)
);
DEFINE_EVENT(scoutfs_server_commit_users_class, scoutfs_server_commit_apply,
TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders,
u32 avail_before, u32 freed_before, int committing, int exceeded),
TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, committing, exceeded)
TP_PROTO(struct super_block *sb, int holding, int applying,
int nr_holders, u32 budget,
u32 avail_before, u32 freed_before,
int committing, int exceeded),
TP_ARGS(sb, holding, applying, nr_holders, budget, avail_before, freed_before, committing, exceeded)
);
DEFINE_EVENT(scoutfs_server_commit_users_class, scoutfs_server_commit_start,
TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders,
u32 avail_before, u32 freed_before, int committing, int exceeded),
TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, committing, exceeded)
TP_PROTO(struct super_block *sb, int holding, int applying,
int nr_holders, u32 budget,
u32 avail_before, u32 freed_before,
int committing, int exceeded),
TP_ARGS(sb, holding, applying, nr_holders, budget, avail_before, freed_before, committing, exceeded)
);
DEFINE_EVENT(scoutfs_server_commit_users_class, scoutfs_server_commit_end,
TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders,
u32 avail_before, u32 freed_before, int committing, int exceeded),
TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, committing, exceeded)
TP_PROTO(struct super_block *sb, int holding, int applying,
int nr_holders, u32 budget,
u32 avail_before, u32 freed_before,
int committing, int exceeded),
TP_ARGS(sb, holding, applying, nr_holders, budget, avail_before, freed_before, committing, exceeded)
);
#define slt_symbolic(mode) \

View File

@@ -65,6 +65,7 @@ struct commit_users {
struct list_head holding;
struct list_head applying;
unsigned int nr_holders;
u32 budget;
u32 avail_before;
u32 freed_before;
bool committing;
@@ -84,8 +85,9 @@ static void init_commit_users(struct commit_users *cusers)
do { \
__typeof__(cusers) _cusers = (cusers); \
trace_scoutfs_server_commit_##which(sb, !list_empty(&_cusers->holding), \
!list_empty(&_cusers->applying), _cusers->nr_holders, _cusers->avail_before, \
_cusers->freed_before, _cusers->committing, _cusers->exceeded); \
!list_empty(&_cusers->applying), _cusers->nr_holders, _cusers->budget, \
_cusers->avail_before, _cusers->freed_before, _cusers->committing, \
_cusers->exceeded); \
} while (0)
struct server_info {
@@ -303,7 +305,6 @@ static void check_holder_budget(struct super_block *sb, struct server_info *serv
u32 freed_used;
u32 avail_now;
u32 freed_now;
u32 budget;
assert_spin_locked(&cusers->lock);
@@ -318,15 +319,14 @@ static void check_holder_budget(struct super_block *sb, struct server_info *serv
else
freed_used = SCOUTFS_ALLOC_LIST_MAX_BLOCKS - freed_now;
budget = cusers->nr_holders * COMMIT_HOLD_ALLOC_BUDGET;
if (avail_used <= budget && freed_used <= budget)
if (avail_used <= cusers->budget && freed_used <= cusers->budget)
return;
exceeded_once = true;
cusers->exceeded = cusers->nr_holders;
scoutfs_err(sb, "%u holders exceeded alloc budget av: bef %u now %u, fr: bef %u now %u",
cusers->nr_holders, cusers->avail_before, avail_now,
scoutfs_err(sb, "holders exceeded alloc budget %u av: bef %u now %u, fr: bef %u now %u",
cusers->budget, cusers->avail_before, avail_now,
cusers->freed_before, freed_now);
list_for_each_entry(hold, &cusers->holding, entry) {
@@ -349,7 +349,7 @@ static bool hold_commit(struct super_block *sb, struct server_info *server,
{
bool has_room;
bool held;
u32 budget;
u32 new_budget;
u32 av;
u32 fr;
@@ -367,8 +367,8 @@ static bool hold_commit(struct super_block *sb, struct server_info *server,
}
/* +2 for our additional hold and then for the final commit work the server does */
budget = (cusers->nr_holders + 2) * COMMIT_HOLD_ALLOC_BUDGET;
has_room = av >= budget && fr >= budget;
new_budget = max(cusers->budget, (cusers->nr_holders + 2) * COMMIT_HOLD_ALLOC_BUDGET);
has_room = av >= new_budget && fr >= new_budget;
/* checking applying so holders drain once an apply caller starts waiting */
held = !cusers->committing && has_room && list_empty(&cusers->applying);
@@ -388,6 +388,7 @@ static bool hold_commit(struct super_block *sb, struct server_info *server,
list_add_tail(&hold->entry, &cusers->holding);
cusers->nr_holders++;
cusers->budget = new_budget;
} else if (!has_room && cusers->nr_holders == 0 && !cusers->committing) {
cusers->committing = true;
@@ -516,6 +517,7 @@ static void commit_end(struct super_block *sb, struct commit_users *cusers, int
list_for_each_entry_safe(hold, tmp, &cusers->applying, entry)
list_del_init(&hold->entry);
cusers->committing = false;
cusers->budget = 0;
spin_unlock(&cusers->lock);
wake_up(&cusers->waitq);

View File

@@ -1,3 +1,4 @@
== setting longer hung task timeout
== creating fragmented extents
== unlink file with moved extents to free extents per block
== cleanup

View File

@@ -10,6 +10,30 @@ EXTENTS_PER_BTREE_BLOCK=600
EXTENTS_PER_LIST_BLOCK=8192
FREED_EXTENTS=$((EXTENTS_PER_BTREE_BLOCK * EXTENTS_PER_LIST_BLOCK))
#
# This test specifically creates a pathologically sparse file that will
# be as expensive as possible to free. This is usually fine on
# dedicated or reasonable hardware, but trying to run this in
# virtualized debug kernels can take a very long time. This test is
# about making sure that the server doesn't fail, not that the platform
# can handle the scale of work that our btree formats happen to require
# while execution is bogged down with use-after-free memory reference
# tracking. So we give the test a lot more breathing room before
# deciding that its hung.
#
echo "== setting longer hung task timeout"
if [ -w /proc/sys/kernel/hung_task_timeout_secs ]; then
secs=$(cat /proc/sys/kernel/hung_task_timeout_secs)
test "$secs" -gt 0 || \
t_fail "confusing value '$secs' from /proc/sys/kernel/hung_task_timeout_secs"
restore_hung_task_timeout()
{
echo "$secs" > /proc/sys/kernel/hung_task_timeout_secs
}
trap restore_hung_task_timeout EXIT
echo "$((secs * 5))" > /proc/sys/kernel/hung_task_timeout_secs
fi
echo "== creating fragmented extents"
fragmented_data_extents $FREED_EXTENTS $EXTENTS_PER_BTREE_BLOCK "$T_D0/alloc" "$T_D0/move"