Merge pull request #289 from versity/auke/merge_read_item_stale_seq

Update seq when merging deltas from partial log merge.
This commit is contained in:
Zach Brown
2026-04-13 14:10:37 -07:00
committed by GitHub
11 changed files with 180 additions and 47 deletions

View File

@@ -2183,6 +2183,8 @@ static int merge_read_item(struct super_block *sb, struct scoutfs_key *key, u64
if (ret > 0) {
if (ret == SCOUTFS_DELTA_COMBINED) {
scoutfs_inc_counter(sb, btree_merge_delta_combined);
if (seq > found->seq)
found->seq = seq;
} else if (ret == SCOUTFS_DELTA_COMBINED_NULL) {
scoutfs_inc_counter(sb, btree_merge_delta_null);
free_mitem(rng, found);
@@ -2486,6 +2488,14 @@ int scoutfs_btree_merge(struct super_block *sb,
mitem = next_mitem(mitem);
free_mitem(&rng, tmp);
}
if (mitem && walk_val_len == 0 &&
!(walk_flags & (BTW_INSERT | BTW_DELETE)) &&
scoutfs_trigger(sb, LOG_MERGE_FORCE_PARTIAL)) {
ret = -ERANGE;
*next_ret = mitem->key;
goto out;
}
}
ret = 0;

View File

@@ -239,9 +239,9 @@ static int forest_read_items(struct super_block *sb, struct scoutfs_key *key, u6
* to reset their state and retry with a newer version of the btrees.
*/
int scoutfs_forest_read_items_roots(struct super_block *sb, struct scoutfs_net_roots *roots,
struct scoutfs_key *key, struct scoutfs_key *bloom_key,
struct scoutfs_key *start, struct scoutfs_key *end,
scoutfs_forest_item_cb cb, void *arg)
u64 merge_input_seq, struct scoutfs_key *key,
struct scoutfs_key *bloom_key, struct scoutfs_key *start,
struct scoutfs_key *end, scoutfs_forest_item_cb cb, void *arg)
{
struct forest_read_items_data rid = {
.cb = cb,
@@ -317,15 +317,17 @@ int scoutfs_forest_read_items_roots(struct super_block *sb, struct scoutfs_net_r
scoutfs_inc_counter(sb, forest_bloom_pass);
if ((le64_to_cpu(lt.flags) & SCOUTFS_LOG_TREES_FINALIZED))
rid.fic |= FIC_FINALIZED;
if ((le64_to_cpu(lt.flags) & SCOUTFS_LOG_TREES_FINALIZED) &&
(merge_input_seq == 0 ||
le64_to_cpu(lt.finalize_seq) < merge_input_seq))
rid.fic |= FIC_MERGE_INPUT;
ret = scoutfs_btree_read_items(sb, &lt.item_root, key, start,
end, forest_read_items, &rid);
if (ret < 0)
goto out;
rid.fic &= ~FIC_FINALIZED;
rid.fic &= ~FIC_MERGE_INPUT;
}
ret = 0;
@@ -345,7 +347,7 @@ int scoutfs_forest_read_items(struct super_block *sb,
ret = scoutfs_client_get_roots(sb, &roots);
if (ret == 0)
ret = scoutfs_forest_read_items_roots(sb, &roots, key, bloom_key, start, end,
ret = scoutfs_forest_read_items_roots(sb, &roots, 0, key, bloom_key, start, end,
cb, arg);
return ret;
}

View File

@@ -11,7 +11,7 @@ struct scoutfs_lock;
/* caller gives an item to the callback */
enum {
FIC_FS_ROOT = (1 << 0),
FIC_FINALIZED = (1 << 1),
FIC_MERGE_INPUT = (1 << 1),
};
typedef int (*scoutfs_forest_item_cb)(struct super_block *sb, struct scoutfs_key *key, u64 seq,
u8 flags, void *val, int val_len, int fic, void *arg);
@@ -25,9 +25,9 @@ int scoutfs_forest_read_items(struct super_block *sb,
struct scoutfs_key *end,
scoutfs_forest_item_cb cb, void *arg);
int scoutfs_forest_read_items_roots(struct super_block *sb, struct scoutfs_net_roots *roots,
struct scoutfs_key *key, struct scoutfs_key *bloom_key,
struct scoutfs_key *start, struct scoutfs_key *end,
scoutfs_forest_item_cb cb, void *arg);
u64 merge_input_seq, struct scoutfs_key *key,
struct scoutfs_key *bloom_key, struct scoutfs_key *start,
struct scoutfs_key *end, scoutfs_forest_item_cb cb, void *arg);
int scoutfs_forest_set_bloom_bits(struct super_block *sb,
struct scoutfs_lock *lock);
void scoutfs_forest_set_max_seq(struct super_block *sb, u64 max_seq);

View File

@@ -30,6 +30,11 @@ void scoutfs_totl_merge_init(struct scoutfs_totl_merging *merg)
memset(merg, 0, sizeof(struct scoutfs_totl_merging));
}
/*
* bin the incoming merge inputs so that we can resolve delta items
* properly. Finalized logs that are merge inputs are kept separately
* from those that are not.
*/
void scoutfs_totl_merge_contribute(struct scoutfs_totl_merging *merg,
u64 seq, u8 flags, void *val, int val_len, int fic)
{
@@ -39,10 +44,10 @@ void scoutfs_totl_merge_contribute(struct scoutfs_totl_merging *merg,
merg->fs_seq = seq;
merg->fs_total = le64_to_cpu(tval->total);
merg->fs_count = le64_to_cpu(tval->count);
} else if (fic & FIC_FINALIZED) {
merg->fin_seq = seq;
merg->fin_total += le64_to_cpu(tval->total);
merg->fin_count += le64_to_cpu(tval->count);
} else if (fic & FIC_MERGE_INPUT) {
merg->inp_seq = seq;
merg->inp_total += le64_to_cpu(tval->total);
merg->inp_count += le64_to_cpu(tval->count);
} else {
merg->log_seq = seq;
merg->log_total += le64_to_cpu(tval->total);
@@ -53,15 +58,18 @@ void scoutfs_totl_merge_contribute(struct scoutfs_totl_merging *merg,
/*
* .totl. item merging has to be careful because the log btree merging
* code can write partial results to the fs_root. This means that a
* reader can see both cases where new finalized logs should be applied
* to the old fs items and where old finalized logs have already been
* applied to the partially merged fs items. Currently active logged
* items are always applied on top of all cases.
* reader can see both cases where merge input deltas should be applied
* to the old fs items and where they have already been applied to the
* partially merged fs items.
*
* Only finalized log trees that are inputs to the current merge cycle
* are tracked in the inp_ bucket. Finalized trees that aren't merge
* inputs and active log trees are always applied unconditionally since
* they cannot be in fs_root.
*
* These cases are differentiated with a combination of sequence numbers
* in items, the count of contributing xattrs, and a flag
* differentiating finalized and active logged items. This lets us
* recognize all cases, including when finalized logs were merged and
* in items and the count of contributing xattrs. This lets us
* recognize all cases, including when merge inputs were merged and
* deleted the fs item.
*/
void scoutfs_totl_merge_resolve(struct scoutfs_totl_merging *merg, __u64 *total, __u64 *count)
@@ -75,14 +83,14 @@ void scoutfs_totl_merge_resolve(struct scoutfs_totl_merging *merg, __u64 *total,
*count = merg->fs_count;
}
/* apply finalized logs if they're newer or creating */
if (((merg->fs_seq != 0) && (merg->fin_seq > merg->fs_seq)) ||
((merg->fs_seq == 0) && (merg->fin_count > 0))) {
*total += merg->fin_total;
*count += merg->fin_count;
/* apply merge input deltas if they're newer or creating */
if (((merg->fs_seq != 0) && (merg->inp_seq > merg->fs_seq)) ||
((merg->fs_seq == 0) && (merg->inp_count > 0))) {
*total += merg->inp_total;
*count += merg->inp_count;
}
/* always apply active logs which must be newer than fs and finalized */
/* always apply non-input finalized and active logs */
if (merg->log_seq > 0) {
*total += merg->log_total;
*count += merg->log_count;

View File

@@ -7,9 +7,9 @@ struct scoutfs_totl_merging {
u64 fs_seq;
u64 fs_total;
u64 fs_count;
u64 fin_seq;
u64 fin_total;
s64 fin_count;
u64 inp_seq;
u64 inp_total;
s64 inp_count;
u64 log_seq;
u64 log_total;
s64 log_count;

View File

@@ -46,6 +46,7 @@ static char *names[] = {
[SCOUTFS_TRIGGER_SRCH_MERGE_STOP_SAFE] = "srch_merge_stop_safe",
[SCOUTFS_TRIGGER_STATFS_LOCK_PURGE] = "statfs_lock_purge",
[SCOUTFS_TRIGGER_RECLAIM_SKIP_FINALIZE] = "reclaim_skip_finalize",
[SCOUTFS_TRIGGER_LOG_MERGE_FORCE_PARTIAL] = "log_merge_force_partial",
};
bool scoutfs_trigger_test_and_clear(struct super_block *sb, unsigned int t)

View File

@@ -9,6 +9,7 @@ enum scoutfs_trigger {
SCOUTFS_TRIGGER_SRCH_MERGE_STOP_SAFE,
SCOUTFS_TRIGGER_STATFS_LOCK_PURGE,
SCOUTFS_TRIGGER_RECLAIM_SKIP_FINALIZE,
SCOUTFS_TRIGGER_LOG_MERGE_FORCE_PARTIAL,
SCOUTFS_TRIGGER_NR,
};

View File

@@ -95,6 +95,7 @@ struct wkic_info {
/* block reading slow path */
struct mutex roots_mutex;
struct scoutfs_net_roots roots;
u64 merge_input_seq;
u64 roots_read_seq;
ktime_t roots_expire;
@@ -805,29 +806,79 @@ static void free_page_list(struct super_block *sb, struct list_head *list)
* read_seq number so that we can compare the age of the items in cached
* pages. Only one request to refresh the roots is in progress at a
* time. This is the slow path that's only used when the cache isn't
* populated and the roots aren't cached. The root request is fast
* enough, especially compared to the resulting item reading IO, that we
* don't mind hiding it behind a trivial mutex.
* populated and the roots aren't cached.
*
* We read roots directly from the on-disk superblock rather than
* requesting them from the server so that we can also read the
* log_merge btree from the same superblock. The merge status item
* seq tells us which finalized log trees are inputs to the current
* merge, which is needed to correctly resolve totl delta items.
*/
static int get_roots(struct super_block *sb, struct wkic_info *winf,
struct scoutfs_net_roots *roots_ret, u64 *read_seq, bool force_new)
static int refresh_roots(struct super_block *sb, struct wkic_info *winf)
{
struct scoutfs_super_block *super;
struct scoutfs_log_merge_status *stat;
SCOUTFS_BTREE_ITEM_REF(iref);
struct scoutfs_key key;
int ret;
super = kmalloc(sizeof(*super), GFP_NOFS);
if (!super)
return -ENOMEM;
ret = scoutfs_read_super(sb, super);
if (ret < 0)
goto out;
winf->roots = (struct scoutfs_net_roots){
.fs_root = super->fs_root,
.logs_root = super->logs_root,
.srch_root = super->srch_root,
};
winf->merge_input_seq = 0;
if (super->log_merge.ref.blkno) {
scoutfs_key_set_zeros(&key);
key.sk_zone = SCOUTFS_LOG_MERGE_STATUS_ZONE;
ret = scoutfs_btree_lookup(sb, &super->log_merge, &key, &iref);
if (ret == 0) {
if (iref.val_len == sizeof(*stat)) {
stat = iref.val;
winf->merge_input_seq = le64_to_cpu(stat->seq);
} else {
ret = -EUCLEAN;
}
scoutfs_btree_put_iref(&iref);
} else if (ret == -ENOENT) {
ret = 0;
}
if (ret < 0)
goto out;
}
winf->roots_read_seq++;
winf->roots_expire = ktime_add_ms(ktime_get_raw(), WKIC_CACHE_LIFETIME_MS);
out:
kfree(super);
return ret;
}
static int get_roots(struct super_block *sb, struct wkic_info *winf,
struct scoutfs_net_roots *roots_ret, u64 *merge_input_seq,
u64 *read_seq, bool force_new)
{
struct scoutfs_net_roots roots;
int ret;
mutex_lock(&winf->roots_mutex);
if (force_new || ktime_before(winf->roots_expire, ktime_get_raw())) {
ret = scoutfs_client_get_roots(sb, &roots);
ret = refresh_roots(sb, winf);
if (ret < 0)
goto out;
winf->roots = roots;
winf->roots_read_seq++;
winf->roots_expire = ktime_add_ms(ktime_get_raw(), WKIC_CACHE_LIFETIME_MS);
}
*roots_ret = winf->roots;
*merge_input_seq = winf->merge_input_seq;
*read_seq = winf->roots_read_seq;
ret = 0;
out:
@@ -870,24 +921,30 @@ static int insert_read_pages(struct super_block *sb, struct wkic_info *winf,
struct scoutfs_key end;
struct wkic_page *wpage;
LIST_HEAD(pages);
u64 merge_input_seq;
u64 read_seq;
int ret;
ret = 0;
retry_stale:
ret = get_roots(sb, winf, &roots, &read_seq, ret == -ESTALE);
ret = get_roots(sb, winf, &roots, &merge_input_seq, &read_seq, ret == -ESTALE);
if (ret < 0)
goto out;
goto check_stale;
start = *range_start;
end = *range_end;
ret = scoutfs_forest_read_items_roots(sb, &roots, key, range_start, &start, &end,
read_items_cb, &root);
ret = scoutfs_forest_read_items_roots(sb, &roots, merge_input_seq, key, range_start,
&start, &end, read_items_cb, &root);
trace_scoutfs_wkic_read_items(sb, key, &start, &end);
check_stale:
ret = scoutfs_block_check_stale(sb, ret, &saved, &roots.fs_root.ref, &roots.logs_root.ref);
if (ret < 0) {
if (ret == -ESTALE)
if (ret == -ESTALE) {
/* not safe to retry due to delta items, must restart clean */
free_item_tree(&root);
root = RB_ROOT;
goto retry_stale;
}
goto out;
}

View File

@@ -0,0 +1,3 @@
== setup
expected 4681
== cleanup

View File

@@ -27,6 +27,7 @@ simple-xattr-unit.sh
retention-basic.sh
totl-xattr-tag.sh
quota.sh
totl-merge-read.sh
lock-refleak.sh
lock-shrink-consistency.sh
lock-shrink-read-race.sh

View File

@@ -0,0 +1,50 @@
#
# Test that merge_read_item() correctly updates the sequence number when
# combining delta items from multiple finalized log trees. Each mount
# sets a totl value in its own 3-bit lane (powers of 8) so that any
# double-counting overflows the lane and is caught by: or(v, exp) != exp.
#
t_require_commands setfattr scoutfs
t_require_mounts 5
echo "== setup"
for nr in $(t_fs_nrs); do
d=$(eval echo \$T_D$nr)
for i in $(seq 1 2500); do : > "$d/f$nr$i"; done
done
sync
t_force_log_merge
vals=(1 8 64 512 4096)
expected=4681
n=0
for nr in $(t_fs_nrs); do
d=$(eval echo \$T_D$nr)
v=${vals[$((n++))]}
for i in $(seq 1 2500); do
setfattr -n "scoutfs.totl.t.$i.0.0" -v $v "$d/f$nr$i"
done
done
t_trigger_arm_silent log_merge_force_partial $(t_server_nr)
bad="$T_TMPDIR/bad"
for nr in $(t_fs_nrs); do
( while true; do
echo 1 > "$(t_debugfs_path $nr)/drop_weak_item_cache"
scoutfs read-xattr-totals -p "$(eval echo \$T_M$nr)" | \
awk -F'[ =,]+' -v e=$expected 'or($2+0,e) != e'
done ) >> "$bad" &
done
echo "expected $expected"
t_force_log_merge
t_silent_kill $(jobs -p)
test -s "$bad" && echo "double-counted:" && cat "$bad"
echo "== cleanup"
for nr in $(t_fs_nrs); do
find "$(eval echo \$T_D$nr)" -name "f$nr*" -delete
done
t_pass