diff --git a/kmod/src/btree.c b/kmod/src/btree.c index 7a21afc7..d8a25f0f 100644 --- a/kmod/src/btree.c +++ b/kmod/src/btree.c @@ -2183,6 +2183,8 @@ static int merge_read_item(struct super_block *sb, struct scoutfs_key *key, u64 if (ret > 0) { if (ret == SCOUTFS_DELTA_COMBINED) { scoutfs_inc_counter(sb, btree_merge_delta_combined); + if (seq > found->seq) + found->seq = seq; } else if (ret == SCOUTFS_DELTA_COMBINED_NULL) { scoutfs_inc_counter(sb, btree_merge_delta_null); free_mitem(rng, found); @@ -2486,6 +2488,14 @@ int scoutfs_btree_merge(struct super_block *sb, mitem = next_mitem(mitem); free_mitem(&rng, tmp); } + + if (mitem && walk_val_len == 0 && + !(walk_flags & (BTW_INSERT | BTW_DELETE)) && + scoutfs_trigger(sb, LOG_MERGE_FORCE_PARTIAL)) { + ret = -ERANGE; + *next_ret = mitem->key; + goto out; + } } ret = 0; diff --git a/kmod/src/forest.c b/kmod/src/forest.c index 306f4841..38681273 100644 --- a/kmod/src/forest.c +++ b/kmod/src/forest.c @@ -239,9 +239,9 @@ static int forest_read_items(struct super_block *sb, struct scoutfs_key *key, u6 * to reset their state and retry with a newer version of the btrees. */ int scoutfs_forest_read_items_roots(struct super_block *sb, struct scoutfs_net_roots *roots, - struct scoutfs_key *key, struct scoutfs_key *bloom_key, - struct scoutfs_key *start, struct scoutfs_key *end, - scoutfs_forest_item_cb cb, void *arg) + u64 merge_input_seq, struct scoutfs_key *key, + struct scoutfs_key *bloom_key, struct scoutfs_key *start, + struct scoutfs_key *end, scoutfs_forest_item_cb cb, void *arg) { struct forest_read_items_data rid = { .cb = cb, @@ -317,15 +317,17 @@ int scoutfs_forest_read_items_roots(struct super_block *sb, struct scoutfs_net_r scoutfs_inc_counter(sb, forest_bloom_pass); - if ((le64_to_cpu(lt.flags) & SCOUTFS_LOG_TREES_FINALIZED)) - rid.fic |= FIC_FINALIZED; + if ((le64_to_cpu(lt.flags) & SCOUTFS_LOG_TREES_FINALIZED) && + (merge_input_seq == 0 || + le64_to_cpu(lt.finalize_seq) < merge_input_seq)) + rid.fic |= FIC_MERGE_INPUT; ret = scoutfs_btree_read_items(sb, <.item_root, key, start, end, forest_read_items, &rid); if (ret < 0) goto out; - rid.fic &= ~FIC_FINALIZED; + rid.fic &= ~FIC_MERGE_INPUT; } ret = 0; @@ -345,7 +347,7 @@ int scoutfs_forest_read_items(struct super_block *sb, ret = scoutfs_client_get_roots(sb, &roots); if (ret == 0) - ret = scoutfs_forest_read_items_roots(sb, &roots, key, bloom_key, start, end, + ret = scoutfs_forest_read_items_roots(sb, &roots, 0, key, bloom_key, start, end, cb, arg); return ret; } diff --git a/kmod/src/forest.h b/kmod/src/forest.h index 151affb8..a3d5e4b2 100644 --- a/kmod/src/forest.h +++ b/kmod/src/forest.h @@ -11,7 +11,7 @@ struct scoutfs_lock; /* caller gives an item to the callback */ enum { FIC_FS_ROOT = (1 << 0), - FIC_FINALIZED = (1 << 1), + FIC_MERGE_INPUT = (1 << 1), }; typedef int (*scoutfs_forest_item_cb)(struct super_block *sb, struct scoutfs_key *key, u64 seq, u8 flags, void *val, int val_len, int fic, void *arg); @@ -25,9 +25,9 @@ int scoutfs_forest_read_items(struct super_block *sb, struct scoutfs_key *end, scoutfs_forest_item_cb cb, void *arg); int scoutfs_forest_read_items_roots(struct super_block *sb, struct scoutfs_net_roots *roots, - struct scoutfs_key *key, struct scoutfs_key *bloom_key, - struct scoutfs_key *start, struct scoutfs_key *end, - scoutfs_forest_item_cb cb, void *arg); + u64 merge_input_seq, struct scoutfs_key *key, + struct scoutfs_key *bloom_key, struct scoutfs_key *start, + struct scoutfs_key *end, scoutfs_forest_item_cb cb, void *arg); int scoutfs_forest_set_bloom_bits(struct super_block *sb, struct scoutfs_lock *lock); void scoutfs_forest_set_max_seq(struct super_block *sb, u64 max_seq); diff --git a/kmod/src/totl.c b/kmod/src/totl.c index cfa9b31a..50c6daca 100644 --- a/kmod/src/totl.c +++ b/kmod/src/totl.c @@ -30,6 +30,11 @@ void scoutfs_totl_merge_init(struct scoutfs_totl_merging *merg) memset(merg, 0, sizeof(struct scoutfs_totl_merging)); } +/* + * bin the incoming merge inputs so that we can resolve delta items + * properly. Finalized logs that are merge inputs are kept separately + * from those that are not. + */ void scoutfs_totl_merge_contribute(struct scoutfs_totl_merging *merg, u64 seq, u8 flags, void *val, int val_len, int fic) { @@ -39,10 +44,10 @@ void scoutfs_totl_merge_contribute(struct scoutfs_totl_merging *merg, merg->fs_seq = seq; merg->fs_total = le64_to_cpu(tval->total); merg->fs_count = le64_to_cpu(tval->count); - } else if (fic & FIC_FINALIZED) { - merg->fin_seq = seq; - merg->fin_total += le64_to_cpu(tval->total); - merg->fin_count += le64_to_cpu(tval->count); + } else if (fic & FIC_MERGE_INPUT) { + merg->inp_seq = seq; + merg->inp_total += le64_to_cpu(tval->total); + merg->inp_count += le64_to_cpu(tval->count); } else { merg->log_seq = seq; merg->log_total += le64_to_cpu(tval->total); @@ -53,15 +58,18 @@ void scoutfs_totl_merge_contribute(struct scoutfs_totl_merging *merg, /* * .totl. item merging has to be careful because the log btree merging * code can write partial results to the fs_root. This means that a - * reader can see both cases where new finalized logs should be applied - * to the old fs items and where old finalized logs have already been - * applied to the partially merged fs items. Currently active logged - * items are always applied on top of all cases. + * reader can see both cases where merge input deltas should be applied + * to the old fs items and where they have already been applied to the + * partially merged fs items. + * + * Only finalized log trees that are inputs to the current merge cycle + * are tracked in the inp_ bucket. Finalized trees that aren't merge + * inputs and active log trees are always applied unconditionally since + * they cannot be in fs_root. * * These cases are differentiated with a combination of sequence numbers - * in items, the count of contributing xattrs, and a flag - * differentiating finalized and active logged items. This lets us - * recognize all cases, including when finalized logs were merged and + * in items and the count of contributing xattrs. This lets us + * recognize all cases, including when merge inputs were merged and * deleted the fs item. */ void scoutfs_totl_merge_resolve(struct scoutfs_totl_merging *merg, __u64 *total, __u64 *count) @@ -75,14 +83,14 @@ void scoutfs_totl_merge_resolve(struct scoutfs_totl_merging *merg, __u64 *total, *count = merg->fs_count; } - /* apply finalized logs if they're newer or creating */ - if (((merg->fs_seq != 0) && (merg->fin_seq > merg->fs_seq)) || - ((merg->fs_seq == 0) && (merg->fin_count > 0))) { - *total += merg->fin_total; - *count += merg->fin_count; + /* apply merge input deltas if they're newer or creating */ + if (((merg->fs_seq != 0) && (merg->inp_seq > merg->fs_seq)) || + ((merg->fs_seq == 0) && (merg->inp_count > 0))) { + *total += merg->inp_total; + *count += merg->inp_count; } - /* always apply active logs which must be newer than fs and finalized */ + /* always apply non-input finalized and active logs */ if (merg->log_seq > 0) { *total += merg->log_total; *count += merg->log_count; diff --git a/kmod/src/totl.h b/kmod/src/totl.h index 52de1fd6..abcb685e 100644 --- a/kmod/src/totl.h +++ b/kmod/src/totl.h @@ -7,9 +7,9 @@ struct scoutfs_totl_merging { u64 fs_seq; u64 fs_total; u64 fs_count; - u64 fin_seq; - u64 fin_total; - s64 fin_count; + u64 inp_seq; + u64 inp_total; + s64 inp_count; u64 log_seq; u64 log_total; s64 log_count; diff --git a/kmod/src/triggers.c b/kmod/src/triggers.c index 4616d013..15ddf907 100644 --- a/kmod/src/triggers.c +++ b/kmod/src/triggers.c @@ -46,6 +46,7 @@ static char *names[] = { [SCOUTFS_TRIGGER_SRCH_MERGE_STOP_SAFE] = "srch_merge_stop_safe", [SCOUTFS_TRIGGER_STATFS_LOCK_PURGE] = "statfs_lock_purge", [SCOUTFS_TRIGGER_RECLAIM_SKIP_FINALIZE] = "reclaim_skip_finalize", + [SCOUTFS_TRIGGER_LOG_MERGE_FORCE_PARTIAL] = "log_merge_force_partial", }; bool scoutfs_trigger_test_and_clear(struct super_block *sb, unsigned int t) diff --git a/kmod/src/triggers.h b/kmod/src/triggers.h index 64a798b7..6d70cbea 100644 --- a/kmod/src/triggers.h +++ b/kmod/src/triggers.h @@ -9,6 +9,7 @@ enum scoutfs_trigger { SCOUTFS_TRIGGER_SRCH_MERGE_STOP_SAFE, SCOUTFS_TRIGGER_STATFS_LOCK_PURGE, SCOUTFS_TRIGGER_RECLAIM_SKIP_FINALIZE, + SCOUTFS_TRIGGER_LOG_MERGE_FORCE_PARTIAL, SCOUTFS_TRIGGER_NR, }; diff --git a/kmod/src/wkic.c b/kmod/src/wkic.c index 44eef494..eee77f95 100644 --- a/kmod/src/wkic.c +++ b/kmod/src/wkic.c @@ -95,6 +95,7 @@ struct wkic_info { /* block reading slow path */ struct mutex roots_mutex; struct scoutfs_net_roots roots; + u64 merge_input_seq; u64 roots_read_seq; ktime_t roots_expire; @@ -805,29 +806,79 @@ static void free_page_list(struct super_block *sb, struct list_head *list) * read_seq number so that we can compare the age of the items in cached * pages. Only one request to refresh the roots is in progress at a * time. This is the slow path that's only used when the cache isn't - * populated and the roots aren't cached. The root request is fast - * enough, especially compared to the resulting item reading IO, that we - * don't mind hiding it behind a trivial mutex. + * populated and the roots aren't cached. + * + * We read roots directly from the on-disk superblock rather than + * requesting them from the server so that we can also read the + * log_merge btree from the same superblock. The merge status item + * seq tells us which finalized log trees are inputs to the current + * merge, which is needed to correctly resolve totl delta items. */ -static int get_roots(struct super_block *sb, struct wkic_info *winf, - struct scoutfs_net_roots *roots_ret, u64 *read_seq, bool force_new) +static int refresh_roots(struct super_block *sb, struct wkic_info *winf) +{ + struct scoutfs_super_block *super; + struct scoutfs_log_merge_status *stat; + SCOUTFS_BTREE_ITEM_REF(iref); + struct scoutfs_key key; + int ret; + + super = kmalloc(sizeof(*super), GFP_NOFS); + if (!super) + return -ENOMEM; + + ret = scoutfs_read_super(sb, super); + if (ret < 0) + goto out; + + winf->roots = (struct scoutfs_net_roots){ + .fs_root = super->fs_root, + .logs_root = super->logs_root, + .srch_root = super->srch_root, + }; + + winf->merge_input_seq = 0; + if (super->log_merge.ref.blkno) { + scoutfs_key_set_zeros(&key); + key.sk_zone = SCOUTFS_LOG_MERGE_STATUS_ZONE; + ret = scoutfs_btree_lookup(sb, &super->log_merge, &key, &iref); + if (ret == 0) { + if (iref.val_len == sizeof(*stat)) { + stat = iref.val; + winf->merge_input_seq = le64_to_cpu(stat->seq); + } else { + ret = -EUCLEAN; + } + scoutfs_btree_put_iref(&iref); + } else if (ret == -ENOENT) { + ret = 0; + } + if (ret < 0) + goto out; + } + + winf->roots_read_seq++; + winf->roots_expire = ktime_add_ms(ktime_get_raw(), WKIC_CACHE_LIFETIME_MS); +out: + kfree(super); + return ret; +} + +static int get_roots(struct super_block *sb, struct wkic_info *winf, + struct scoutfs_net_roots *roots_ret, u64 *merge_input_seq, + u64 *read_seq, bool force_new) { - struct scoutfs_net_roots roots; int ret; mutex_lock(&winf->roots_mutex); if (force_new || ktime_before(winf->roots_expire, ktime_get_raw())) { - ret = scoutfs_client_get_roots(sb, &roots); + ret = refresh_roots(sb, winf); if (ret < 0) goto out; - - winf->roots = roots; - winf->roots_read_seq++; - winf->roots_expire = ktime_add_ms(ktime_get_raw(), WKIC_CACHE_LIFETIME_MS); } *roots_ret = winf->roots; + *merge_input_seq = winf->merge_input_seq; *read_seq = winf->roots_read_seq; ret = 0; out: @@ -870,24 +921,30 @@ static int insert_read_pages(struct super_block *sb, struct wkic_info *winf, struct scoutfs_key end; struct wkic_page *wpage; LIST_HEAD(pages); + u64 merge_input_seq; u64 read_seq; int ret; ret = 0; retry_stale: - ret = get_roots(sb, winf, &roots, &read_seq, ret == -ESTALE); + ret = get_roots(sb, winf, &roots, &merge_input_seq, &read_seq, ret == -ESTALE); if (ret < 0) - goto out; + goto check_stale; start = *range_start; end = *range_end; - ret = scoutfs_forest_read_items_roots(sb, &roots, key, range_start, &start, &end, - read_items_cb, &root); + ret = scoutfs_forest_read_items_roots(sb, &roots, merge_input_seq, key, range_start, + &start, &end, read_items_cb, &root); trace_scoutfs_wkic_read_items(sb, key, &start, &end); +check_stale: ret = scoutfs_block_check_stale(sb, ret, &saved, &roots.fs_root.ref, &roots.logs_root.ref); if (ret < 0) { - if (ret == -ESTALE) + if (ret == -ESTALE) { + /* not safe to retry due to delta items, must restart clean */ + free_item_tree(&root); + root = RB_ROOT; goto retry_stale; + } goto out; } diff --git a/tests/golden/totl-merge-read b/tests/golden/totl-merge-read new file mode 100644 index 00000000..931671e6 --- /dev/null +++ b/tests/golden/totl-merge-read @@ -0,0 +1,3 @@ +== setup +expected 4681 +== cleanup diff --git a/tests/sequence b/tests/sequence index e296553a..bdd8aba7 100644 --- a/tests/sequence +++ b/tests/sequence @@ -27,6 +27,7 @@ simple-xattr-unit.sh retention-basic.sh totl-xattr-tag.sh quota.sh +totl-merge-read.sh lock-refleak.sh lock-shrink-consistency.sh lock-shrink-read-race.sh diff --git a/tests/tests/totl-merge-read.sh b/tests/tests/totl-merge-read.sh new file mode 100644 index 00000000..8c37c93a --- /dev/null +++ b/tests/tests/totl-merge-read.sh @@ -0,0 +1,50 @@ +# +# Test that merge_read_item() correctly updates the sequence number when +# combining delta items from multiple finalized log trees. Each mount +# sets a totl value in its own 3-bit lane (powers of 8) so that any +# double-counting overflows the lane and is caught by: or(v, exp) != exp. +# + +t_require_commands setfattr scoutfs +t_require_mounts 5 + +echo "== setup" +for nr in $(t_fs_nrs); do + d=$(eval echo \$T_D$nr) + for i in $(seq 1 2500); do : > "$d/f$nr$i"; done +done +sync +t_force_log_merge + +vals=(1 8 64 512 4096) +expected=4681 +n=0 +for nr in $(t_fs_nrs); do + d=$(eval echo \$T_D$nr) + v=${vals[$((n++))]} + for i in $(seq 1 2500); do + setfattr -n "scoutfs.totl.t.$i.0.0" -v $v "$d/f$nr$i" + done +done + +t_trigger_arm_silent log_merge_force_partial $(t_server_nr) + +bad="$T_TMPDIR/bad" +for nr in $(t_fs_nrs); do + ( while true; do + echo 1 > "$(t_debugfs_path $nr)/drop_weak_item_cache" + scoutfs read-xattr-totals -p "$(eval echo \$T_M$nr)" | \ + awk -F'[ =,]+' -v e=$expected 'or($2+0,e) != e' + done ) >> "$bad" & +done + +echo "expected $expected" +t_force_log_merge +t_silent_kill $(jobs -p) +test -s "$bad" && echo "double-counted:" && cat "$bad" + +echo "== cleanup" +for nr in $(t_fs_nrs); do + find "$(eval echo \$T_D$nr)" -name "f$nr*" -delete +done +t_pass