From add1da10dc5cc4866df809377f386c31bdadef1c Mon Sep 17 00:00:00 2001 From: Auke Kok Date: Tue, 17 Mar 2026 12:27:57 -0700 Subject: [PATCH 1/6] Add test for stale seq in merge delta combining. merge_read_item() fails to update found->seq when combining delta items from multiple finalized log trees. Add a test case to replicate the conditions of this issue. Each of 5 mounts sets totl value 1 on 2500 shared keys, giving an expected total of 5 per key. Any total > 5 proves double-counting from a stale seq. The log_merge_force_partial trigger forces many partial merges per cycle, creating the conditions where stale-seq items get spliced into fs_root while finalized logs still exist. Parallel readers on all mounts race against this window to detect double-counted values. Signed-off-by: Auke Kok --- tests/golden/totl-merge-read | 3 ++ tests/sequence | 1 + tests/tests/totl-merge-read.sh | 50 ++++++++++++++++++++++++++++++++++ 3 files changed, 54 insertions(+) create mode 100644 tests/golden/totl-merge-read create mode 100644 tests/tests/totl-merge-read.sh diff --git a/tests/golden/totl-merge-read b/tests/golden/totl-merge-read new file mode 100644 index 00000000..931671e6 --- /dev/null +++ b/tests/golden/totl-merge-read @@ -0,0 +1,3 @@ +== setup +expected 4681 +== cleanup diff --git a/tests/sequence b/tests/sequence index e296553a..bdd8aba7 100644 --- a/tests/sequence +++ b/tests/sequence @@ -27,6 +27,7 @@ simple-xattr-unit.sh retention-basic.sh totl-xattr-tag.sh quota.sh +totl-merge-read.sh lock-refleak.sh lock-shrink-consistency.sh lock-shrink-read-race.sh diff --git a/tests/tests/totl-merge-read.sh b/tests/tests/totl-merge-read.sh new file mode 100644 index 00000000..8c37c93a --- /dev/null +++ b/tests/tests/totl-merge-read.sh @@ -0,0 +1,50 @@ +# +# Test that merge_read_item() correctly updates the sequence number when +# combining delta items from multiple finalized log trees. Each mount +# sets a totl value in its own 3-bit lane (powers of 8) so that any +# double-counting overflows the lane and is caught by: or(v, exp) != exp. +# + +t_require_commands setfattr scoutfs +t_require_mounts 5 + +echo "== setup" +for nr in $(t_fs_nrs); do + d=$(eval echo \$T_D$nr) + for i in $(seq 1 2500); do : > "$d/f$nr$i"; done +done +sync +t_force_log_merge + +vals=(1 8 64 512 4096) +expected=4681 +n=0 +for nr in $(t_fs_nrs); do + d=$(eval echo \$T_D$nr) + v=${vals[$((n++))]} + for i in $(seq 1 2500); do + setfattr -n "scoutfs.totl.t.$i.0.0" -v $v "$d/f$nr$i" + done +done + +t_trigger_arm_silent log_merge_force_partial $(t_server_nr) + +bad="$T_TMPDIR/bad" +for nr in $(t_fs_nrs); do + ( while true; do + echo 1 > "$(t_debugfs_path $nr)/drop_weak_item_cache" + scoutfs read-xattr-totals -p "$(eval echo \$T_M$nr)" | \ + awk -F'[ =,]+' -v e=$expected 'or($2+0,e) != e' + done ) >> "$bad" & +done + +echo "expected $expected" +t_force_log_merge +t_silent_kill $(jobs -p) +test -s "$bad" && echo "double-counted:" && cat "$bad" + +echo "== cleanup" +for nr in $(t_fs_nrs); do + find "$(eval echo \$T_D$nr)" -name "f$nr*" -delete +done +t_pass From b724567b2a68e510af38042713798aba3c0a40ab Mon Sep 17 00:00:00 2001 From: Auke Kok Date: Tue, 17 Mar 2026 12:01:27 -0700 Subject: [PATCH 2/6] Add log_merge_force_partial trigger for testing partial merges. Add a trigger that forces btree_merge() to return -ERANGE after modifying a leaf's worth of items, causing many small partial merges per merge cycle. This is used by tests to reliably reproduce races that depend on partial merges splicing items into fs_root while finalized logs still exist. The trigger check lives inside btree_merge() where it can observe actual item modification progress, rather than overriding the caller's dirty byte limit argument which applies to the whole writer context. Signed-off-by: Auke Kok --- kmod/src/btree.c | 8 ++++++++ kmod/src/triggers.c | 1 + kmod/src/triggers.h | 1 + 3 files changed, 10 insertions(+) diff --git a/kmod/src/btree.c b/kmod/src/btree.c index 7a21afc7..caabc065 100644 --- a/kmod/src/btree.c +++ b/kmod/src/btree.c @@ -2486,6 +2486,14 @@ int scoutfs_btree_merge(struct super_block *sb, mitem = next_mitem(mitem); free_mitem(&rng, tmp); } + + if (mitem && walk_val_len == 0 && + !(walk_flags & (BTW_INSERT | BTW_DELETE)) && + scoutfs_trigger(sb, LOG_MERGE_FORCE_PARTIAL)) { + ret = -ERANGE; + *next_ret = mitem->key; + goto out; + } } ret = 0; diff --git a/kmod/src/triggers.c b/kmod/src/triggers.c index 4616d013..15ddf907 100644 --- a/kmod/src/triggers.c +++ b/kmod/src/triggers.c @@ -46,6 +46,7 @@ static char *names[] = { [SCOUTFS_TRIGGER_SRCH_MERGE_STOP_SAFE] = "srch_merge_stop_safe", [SCOUTFS_TRIGGER_STATFS_LOCK_PURGE] = "statfs_lock_purge", [SCOUTFS_TRIGGER_RECLAIM_SKIP_FINALIZE] = "reclaim_skip_finalize", + [SCOUTFS_TRIGGER_LOG_MERGE_FORCE_PARTIAL] = "log_merge_force_partial", }; bool scoutfs_trigger_test_and_clear(struct super_block *sb, unsigned int t) diff --git a/kmod/src/triggers.h b/kmod/src/triggers.h index 64a798b7..6d70cbea 100644 --- a/kmod/src/triggers.h +++ b/kmod/src/triggers.h @@ -9,6 +9,7 @@ enum scoutfs_trigger { SCOUTFS_TRIGGER_SRCH_MERGE_STOP_SAFE, SCOUTFS_TRIGGER_STATFS_LOCK_PURGE, SCOUTFS_TRIGGER_RECLAIM_SKIP_FINALIZE, + SCOUTFS_TRIGGER_LOG_MERGE_FORCE_PARTIAL, SCOUTFS_TRIGGER_NR, }; From 38d36c9f5ca3a75956d582c2085ea4fc705b90c7 Mon Sep 17 00:00:00 2001 From: Auke Kok Date: Tue, 17 Mar 2026 12:28:11 -0700 Subject: [PATCH 3/6] Update seq when merging deltas from partial log merge. Two different clients can write delta's for totl indexes at the same time, recording their changes. When merged, a reader should apply both in order, and only once. To do so, the seq determines whether the delta has been applied already. The code fails to update the seq while walking the trees for deltas to apply. Subsequently, when processing subsequent trees, it could re-process deltas already applied. In case of a large negative delta (e.g. removal of large amounts of files), the totl value could become negative, resulting in quota lockout. The fix is simple: advance the seq when reading partial delta merges to avoid double counting. Signed-off-by: Auke Kok --- kmod/src/btree.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kmod/src/btree.c b/kmod/src/btree.c index caabc065..d8a25f0f 100644 --- a/kmod/src/btree.c +++ b/kmod/src/btree.c @@ -2183,6 +2183,8 @@ static int merge_read_item(struct super_block *sb, struct scoutfs_key *key, u64 if (ret > 0) { if (ret == SCOUTFS_DELTA_COMBINED) { scoutfs_inc_counter(sb, btree_merge_delta_combined); + if (seq > found->seq) + found->seq = seq; } else if (ret == SCOUTFS_DELTA_COMBINED_NULL) { scoutfs_inc_counter(sb, btree_merge_delta_null); free_mitem(rng, found); From 857a39579ec54b7a7727b3bc04af9f5ecba1db49 Mon Sep 17 00:00:00 2001 From: Auke Kok Date: Thu, 2 Apr 2026 10:49:00 -0700 Subject: [PATCH 4/6] Clear roots when retrying due to stale btree blocks. Before deltas were added this code path was correct, but with deltas we can't just retry this without clearing &root, since it would potentially double count. The condition where this could happen is when there are deltas in several finalized log trees, and we've made progress towards reading some of them, and then encounter a stale btree block. The retry would not clear the collected trees, apply the same delta as was already applied before the retry, and thus double count. Signed-off-by: Auke Kok --- kmod/src/wkic.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kmod/src/wkic.c b/kmod/src/wkic.c index 44eef494..58737c62 100644 --- a/kmod/src/wkic.c +++ b/kmod/src/wkic.c @@ -886,8 +886,12 @@ retry_stale: trace_scoutfs_wkic_read_items(sb, key, &start, &end); ret = scoutfs_block_check_stale(sb, ret, &saved, &roots.fs_root.ref, &roots.logs_root.ref); if (ret < 0) { - if (ret == -ESTALE) + if (ret == -ESTALE) { + /* not safe to retry due to delta items, must restart clean */ + free_item_tree(&root); + root = RB_ROOT; goto retry_stale; + } goto out; } From 8bdc20af21d47e2b12bfa0efd4e88c604cb96ffd Mon Sep 17 00:00:00 2001 From: Auke Kok Date: Tue, 7 Apr 2026 10:33:46 -0700 Subject: [PATCH 5/6] Rename/reword FINALIZED to MERGE_INPUT. These mislabeled members and enums were clearly not describing the actual data being handled and obfuscating the intent of avoiding mixing merge input items with non-merge input items. Signed-off-by: Auke Kok --- kmod/src/forest.c | 4 ++-- kmod/src/forest.h | 2 +- kmod/src/totl.c | 42 +++++++++++++++++++++++++----------------- kmod/src/totl.h | 6 +++--- 4 files changed, 31 insertions(+), 23 deletions(-) diff --git a/kmod/src/forest.c b/kmod/src/forest.c index 306f4841..fee22785 100644 --- a/kmod/src/forest.c +++ b/kmod/src/forest.c @@ -318,14 +318,14 @@ int scoutfs_forest_read_items_roots(struct super_block *sb, struct scoutfs_net_r scoutfs_inc_counter(sb, forest_bloom_pass); if ((le64_to_cpu(lt.flags) & SCOUTFS_LOG_TREES_FINALIZED)) - rid.fic |= FIC_FINALIZED; + rid.fic |= FIC_MERGE_INPUT; ret = scoutfs_btree_read_items(sb, <.item_root, key, start, end, forest_read_items, &rid); if (ret < 0) goto out; - rid.fic &= ~FIC_FINALIZED; + rid.fic &= ~FIC_MERGE_INPUT; } ret = 0; diff --git a/kmod/src/forest.h b/kmod/src/forest.h index 151affb8..0a11a14b 100644 --- a/kmod/src/forest.h +++ b/kmod/src/forest.h @@ -11,7 +11,7 @@ struct scoutfs_lock; /* caller gives an item to the callback */ enum { FIC_FS_ROOT = (1 << 0), - FIC_FINALIZED = (1 << 1), + FIC_MERGE_INPUT = (1 << 1), }; typedef int (*scoutfs_forest_item_cb)(struct super_block *sb, struct scoutfs_key *key, u64 seq, u8 flags, void *val, int val_len, int fic, void *arg); diff --git a/kmod/src/totl.c b/kmod/src/totl.c index cfa9b31a..50c6daca 100644 --- a/kmod/src/totl.c +++ b/kmod/src/totl.c @@ -30,6 +30,11 @@ void scoutfs_totl_merge_init(struct scoutfs_totl_merging *merg) memset(merg, 0, sizeof(struct scoutfs_totl_merging)); } +/* + * bin the incoming merge inputs so that we can resolve delta items + * properly. Finalized logs that are merge inputs are kept separately + * from those that are not. + */ void scoutfs_totl_merge_contribute(struct scoutfs_totl_merging *merg, u64 seq, u8 flags, void *val, int val_len, int fic) { @@ -39,10 +44,10 @@ void scoutfs_totl_merge_contribute(struct scoutfs_totl_merging *merg, merg->fs_seq = seq; merg->fs_total = le64_to_cpu(tval->total); merg->fs_count = le64_to_cpu(tval->count); - } else if (fic & FIC_FINALIZED) { - merg->fin_seq = seq; - merg->fin_total += le64_to_cpu(tval->total); - merg->fin_count += le64_to_cpu(tval->count); + } else if (fic & FIC_MERGE_INPUT) { + merg->inp_seq = seq; + merg->inp_total += le64_to_cpu(tval->total); + merg->inp_count += le64_to_cpu(tval->count); } else { merg->log_seq = seq; merg->log_total += le64_to_cpu(tval->total); @@ -53,15 +58,18 @@ void scoutfs_totl_merge_contribute(struct scoutfs_totl_merging *merg, /* * .totl. item merging has to be careful because the log btree merging * code can write partial results to the fs_root. This means that a - * reader can see both cases where new finalized logs should be applied - * to the old fs items and where old finalized logs have already been - * applied to the partially merged fs items. Currently active logged - * items are always applied on top of all cases. + * reader can see both cases where merge input deltas should be applied + * to the old fs items and where they have already been applied to the + * partially merged fs items. + * + * Only finalized log trees that are inputs to the current merge cycle + * are tracked in the inp_ bucket. Finalized trees that aren't merge + * inputs and active log trees are always applied unconditionally since + * they cannot be in fs_root. * * These cases are differentiated with a combination of sequence numbers - * in items, the count of contributing xattrs, and a flag - * differentiating finalized and active logged items. This lets us - * recognize all cases, including when finalized logs were merged and + * in items and the count of contributing xattrs. This lets us + * recognize all cases, including when merge inputs were merged and * deleted the fs item. */ void scoutfs_totl_merge_resolve(struct scoutfs_totl_merging *merg, __u64 *total, __u64 *count) @@ -75,14 +83,14 @@ void scoutfs_totl_merge_resolve(struct scoutfs_totl_merging *merg, __u64 *total, *count = merg->fs_count; } - /* apply finalized logs if they're newer or creating */ - if (((merg->fs_seq != 0) && (merg->fin_seq > merg->fs_seq)) || - ((merg->fs_seq == 0) && (merg->fin_count > 0))) { - *total += merg->fin_total; - *count += merg->fin_count; + /* apply merge input deltas if they're newer or creating */ + if (((merg->fs_seq != 0) && (merg->inp_seq > merg->fs_seq)) || + ((merg->fs_seq == 0) && (merg->inp_count > 0))) { + *total += merg->inp_total; + *count += merg->inp_count; } - /* always apply active logs which must be newer than fs and finalized */ + /* always apply non-input finalized and active logs */ if (merg->log_seq > 0) { *total += merg->log_total; *count += merg->log_count; diff --git a/kmod/src/totl.h b/kmod/src/totl.h index 52de1fd6..abcb685e 100644 --- a/kmod/src/totl.h +++ b/kmod/src/totl.h @@ -7,9 +7,9 @@ struct scoutfs_totl_merging { u64 fs_seq; u64 fs_total; u64 fs_count; - u64 fin_seq; - u64 fin_total; - s64 fin_count; + u64 inp_seq; + u64 inp_total; + s64 inp_count; u64 log_seq; u64 log_total; s64 log_count; From 5e2009f939240ebadb29292ea0e3b7ff452341a1 Mon Sep 17 00:00:00 2001 From: Auke Kok Date: Wed, 8 Apr 2026 13:27:42 -0700 Subject: [PATCH 6/6] Avoid double counting deltas from non-input finalized log trees. Readers currently accumulate all finalized log tree deltas into a single bucket for deciding whether they are already in fs_root or not, but, finalized trees that aren't inputs to a current merge will have higher seqs, and thus we may be double applying deltas already merged into fs_root. To distinguish, scoutfs_totl_merge_contribute() needs to know the merge status item seq. We change wkic's get_roots() from using the SCOUTFS_NET_CMD_GET_ROOTS RPC to reading the superblock directly. This is needed because totl merge resolution has to use the same data as the btree roots it is operating on, thus we can't grab it from a SCOUTFS_NET_CMD_GET_ROOTS packet - it likely is different. Signed-off-by: Auke Kok --- kmod/src/forest.c | 12 ++++--- kmod/src/forest.h | 6 ++-- kmod/src/wkic.c | 83 ++++++++++++++++++++++++++++++++++++++--------- 3 files changed, 78 insertions(+), 23 deletions(-) diff --git a/kmod/src/forest.c b/kmod/src/forest.c index fee22785..38681273 100644 --- a/kmod/src/forest.c +++ b/kmod/src/forest.c @@ -239,9 +239,9 @@ static int forest_read_items(struct super_block *sb, struct scoutfs_key *key, u6 * to reset their state and retry with a newer version of the btrees. */ int scoutfs_forest_read_items_roots(struct super_block *sb, struct scoutfs_net_roots *roots, - struct scoutfs_key *key, struct scoutfs_key *bloom_key, - struct scoutfs_key *start, struct scoutfs_key *end, - scoutfs_forest_item_cb cb, void *arg) + u64 merge_input_seq, struct scoutfs_key *key, + struct scoutfs_key *bloom_key, struct scoutfs_key *start, + struct scoutfs_key *end, scoutfs_forest_item_cb cb, void *arg) { struct forest_read_items_data rid = { .cb = cb, @@ -317,7 +317,9 @@ int scoutfs_forest_read_items_roots(struct super_block *sb, struct scoutfs_net_r scoutfs_inc_counter(sb, forest_bloom_pass); - if ((le64_to_cpu(lt.flags) & SCOUTFS_LOG_TREES_FINALIZED)) + if ((le64_to_cpu(lt.flags) & SCOUTFS_LOG_TREES_FINALIZED) && + (merge_input_seq == 0 || + le64_to_cpu(lt.finalize_seq) < merge_input_seq)) rid.fic |= FIC_MERGE_INPUT; ret = scoutfs_btree_read_items(sb, <.item_root, key, start, @@ -345,7 +347,7 @@ int scoutfs_forest_read_items(struct super_block *sb, ret = scoutfs_client_get_roots(sb, &roots); if (ret == 0) - ret = scoutfs_forest_read_items_roots(sb, &roots, key, bloom_key, start, end, + ret = scoutfs_forest_read_items_roots(sb, &roots, 0, key, bloom_key, start, end, cb, arg); return ret; } diff --git a/kmod/src/forest.h b/kmod/src/forest.h index 0a11a14b..a3d5e4b2 100644 --- a/kmod/src/forest.h +++ b/kmod/src/forest.h @@ -25,9 +25,9 @@ int scoutfs_forest_read_items(struct super_block *sb, struct scoutfs_key *end, scoutfs_forest_item_cb cb, void *arg); int scoutfs_forest_read_items_roots(struct super_block *sb, struct scoutfs_net_roots *roots, - struct scoutfs_key *key, struct scoutfs_key *bloom_key, - struct scoutfs_key *start, struct scoutfs_key *end, - scoutfs_forest_item_cb cb, void *arg); + u64 merge_input_seq, struct scoutfs_key *key, + struct scoutfs_key *bloom_key, struct scoutfs_key *start, + struct scoutfs_key *end, scoutfs_forest_item_cb cb, void *arg); int scoutfs_forest_set_bloom_bits(struct super_block *sb, struct scoutfs_lock *lock); void scoutfs_forest_set_max_seq(struct super_block *sb, u64 max_seq); diff --git a/kmod/src/wkic.c b/kmod/src/wkic.c index 58737c62..eee77f95 100644 --- a/kmod/src/wkic.c +++ b/kmod/src/wkic.c @@ -95,6 +95,7 @@ struct wkic_info { /* block reading slow path */ struct mutex roots_mutex; struct scoutfs_net_roots roots; + u64 merge_input_seq; u64 roots_read_seq; ktime_t roots_expire; @@ -805,29 +806,79 @@ static void free_page_list(struct super_block *sb, struct list_head *list) * read_seq number so that we can compare the age of the items in cached * pages. Only one request to refresh the roots is in progress at a * time. This is the slow path that's only used when the cache isn't - * populated and the roots aren't cached. The root request is fast - * enough, especially compared to the resulting item reading IO, that we - * don't mind hiding it behind a trivial mutex. + * populated and the roots aren't cached. + * + * We read roots directly from the on-disk superblock rather than + * requesting them from the server so that we can also read the + * log_merge btree from the same superblock. The merge status item + * seq tells us which finalized log trees are inputs to the current + * merge, which is needed to correctly resolve totl delta items. */ -static int get_roots(struct super_block *sb, struct wkic_info *winf, - struct scoutfs_net_roots *roots_ret, u64 *read_seq, bool force_new) +static int refresh_roots(struct super_block *sb, struct wkic_info *winf) +{ + struct scoutfs_super_block *super; + struct scoutfs_log_merge_status *stat; + SCOUTFS_BTREE_ITEM_REF(iref); + struct scoutfs_key key; + int ret; + + super = kmalloc(sizeof(*super), GFP_NOFS); + if (!super) + return -ENOMEM; + + ret = scoutfs_read_super(sb, super); + if (ret < 0) + goto out; + + winf->roots = (struct scoutfs_net_roots){ + .fs_root = super->fs_root, + .logs_root = super->logs_root, + .srch_root = super->srch_root, + }; + + winf->merge_input_seq = 0; + if (super->log_merge.ref.blkno) { + scoutfs_key_set_zeros(&key); + key.sk_zone = SCOUTFS_LOG_MERGE_STATUS_ZONE; + ret = scoutfs_btree_lookup(sb, &super->log_merge, &key, &iref); + if (ret == 0) { + if (iref.val_len == sizeof(*stat)) { + stat = iref.val; + winf->merge_input_seq = le64_to_cpu(stat->seq); + } else { + ret = -EUCLEAN; + } + scoutfs_btree_put_iref(&iref); + } else if (ret == -ENOENT) { + ret = 0; + } + if (ret < 0) + goto out; + } + + winf->roots_read_seq++; + winf->roots_expire = ktime_add_ms(ktime_get_raw(), WKIC_CACHE_LIFETIME_MS); +out: + kfree(super); + return ret; +} + +static int get_roots(struct super_block *sb, struct wkic_info *winf, + struct scoutfs_net_roots *roots_ret, u64 *merge_input_seq, + u64 *read_seq, bool force_new) { - struct scoutfs_net_roots roots; int ret; mutex_lock(&winf->roots_mutex); if (force_new || ktime_before(winf->roots_expire, ktime_get_raw())) { - ret = scoutfs_client_get_roots(sb, &roots); + ret = refresh_roots(sb, winf); if (ret < 0) goto out; - - winf->roots = roots; - winf->roots_read_seq++; - winf->roots_expire = ktime_add_ms(ktime_get_raw(), WKIC_CACHE_LIFETIME_MS); } *roots_ret = winf->roots; + *merge_input_seq = winf->merge_input_seq; *read_seq = winf->roots_read_seq; ret = 0; out: @@ -870,20 +921,22 @@ static int insert_read_pages(struct super_block *sb, struct wkic_info *winf, struct scoutfs_key end; struct wkic_page *wpage; LIST_HEAD(pages); + u64 merge_input_seq; u64 read_seq; int ret; ret = 0; retry_stale: - ret = get_roots(sb, winf, &roots, &read_seq, ret == -ESTALE); + ret = get_roots(sb, winf, &roots, &merge_input_seq, &read_seq, ret == -ESTALE); if (ret < 0) - goto out; + goto check_stale; start = *range_start; end = *range_end; - ret = scoutfs_forest_read_items_roots(sb, &roots, key, range_start, &start, &end, - read_items_cb, &root); + ret = scoutfs_forest_read_items_roots(sb, &roots, merge_input_seq, key, range_start, + &start, &end, read_items_cb, &root); trace_scoutfs_wkic_read_items(sb, key, &start, &end); +check_stale: ret = scoutfs_block_check_stale(sb, ret, &saved, &roots.fs_root.ref, &roots.logs_root.ref); if (ret < 0) { if (ret == -ESTALE) {