Fix double free of metadata blocks in log merging

The log merging process is meant to provide parallelism across workers
in mounts.  The idea is that the server hands out a bunch of concurrent
non-intersecting work that's based on the structure of the stable input
fs_root btree.

The nature of the parallel work (cow of the blocks that intersect a key
range) means that the ranges of concurrently issued work can't overlap
or the work will all cow the same input blocks, freeing that input
stable block multiple times.  We're seeing this in testing.

Correctness was intended by having an advancing key that sweeps sorted
ranges.  Duplicate ranges would never be hit as the key advanced past
each range it visited.  This was broken by the mapping of the fs item keys to
log merge tree keys by clobbering the sk_zone key value.  It effectively
interleaves the ranges of each zone in the fs root (meta indexes,
orphans, fs items).  With just the right log merge conditions that
involve logged items in the right places and partially completed work to
insert remaining ranges behind the key, ranges can be stored at mapped
keys that end up with ranges out of order.  The server iterates over
these and ends up issuing overlapping work, which results in duplicated
frees of the input blocks.

The fix, without changing the format of the stored log tree items, is to
perform a full sweep of all the range items and determine the next item
by looking at the full precision stored keys.  This ensures that the
processed ranges always advance and never overlap.

Signed-off-by: Zach Brown <zab@versity.com>
This commit is contained in:
Zach Brown
2025-11-08 10:04:19 -08:00
parent 8484a58dd6
commit e182914e51

View File

@@ -994,10 +994,11 @@ static int for_each_rid_last_lt(struct super_block *sb, struct scoutfs_btree_roo
}
/*
* Log merge range items are stored at the starting fs key of the range.
* The only fs key field that doesn't hold information is the zone, so
* we use the zone to differentiate all types that we store in the log
* merge tree.
* Log merge range items are stored at the starting fs key of the range
* with the zone overwritten to indicate the log merge item type. This
* day0 mistake loses sorting information for items in the different
* zones in the fs root, so the range items aren't strictly sorted by
* the starting key of their range.
*/
static void init_log_merge_key(struct scoutfs_key *key, u8 zone, u64 first,
u64 second)
@@ -1029,6 +1030,51 @@ static int next_log_merge_item_key(struct super_block *sb, struct scoutfs_btree_
return ret;
}
/*
* The range items aren't sorted by their range.start because
* _RANGE_ZONE clobbers the range's zone. We sweep all the items and
* find the range with the next least starting key that's greater than
* the caller's starting key. We have to be careful to iterate over the
* log_merge tree keys because the ranges can overlap as they're mapped
* to the log_merge keys by clobbering their zone.
*/
static int next_log_merge_range(struct super_block *sb, struct scoutfs_btree_root *root,
struct scoutfs_key *start, struct scoutfs_log_merge_range *rng)
{
struct scoutfs_log_merge_range *next;
SCOUTFS_BTREE_ITEM_REF(iref);
struct scoutfs_key key;
int ret;
key = *start;
key.sk_zone = SCOUTFS_LOG_MERGE_RANGE_ZONE;
scoutfs_key_set_ones(&rng->start);
do {
ret = scoutfs_btree_next(sb, root, &key, &iref);
if (ret == 0) {
if (iref.key->sk_zone != SCOUTFS_LOG_MERGE_RANGE_ZONE) {
ret = -ENOENT;
} else if (iref.val_len != sizeof(struct scoutfs_log_merge_range)) {
ret = -EIO;
} else {
next = iref.val;
if (scoutfs_key_compare(&next->start, &rng->start) < 0 &&
scoutfs_key_compare(&next->start, start) >= 0)
*rng = *next;
key = *iref.key;
scoutfs_key_inc(&key);
}
scoutfs_btree_put_iref(&iref);
}
} while (ret == 0);
if (ret == -ENOENT && !scoutfs_key_is_ones(&rng->start))
ret = 0;
return ret;
}
static int next_log_merge_item(struct super_block *sb,
struct scoutfs_btree_root *root,
u8 zone, u64 first, u64 second,
@@ -2720,10 +2766,7 @@ restart:
/* find the next range, always checking for splicing */
for (;;) {
key = stat.next_range_key;
key.sk_zone = SCOUTFS_LOG_MERGE_RANGE_ZONE;
ret = next_log_merge_item_key(sb, &super->log_merge, SCOUTFS_LOG_MERGE_RANGE_ZONE,
&key, &rng, sizeof(rng));
ret = next_log_merge_range(sb, &super->log_merge, &stat.next_range_key, &rng);
if (ret < 0 && ret != -ENOENT) {
err_str = "finding merge range item";
goto out;