mirror of
https://github.com/versity/scoutfs.git
synced 2026-04-15 11:26:58 +00:00
Merge pull request #289 from versity/auke/merge_read_item_stale_seq
Update seq when merging deltas from partial log merge.
This commit is contained in:
@@ -2183,6 +2183,8 @@ static int merge_read_item(struct super_block *sb, struct scoutfs_key *key, u64
|
||||
if (ret > 0) {
|
||||
if (ret == SCOUTFS_DELTA_COMBINED) {
|
||||
scoutfs_inc_counter(sb, btree_merge_delta_combined);
|
||||
if (seq > found->seq)
|
||||
found->seq = seq;
|
||||
} else if (ret == SCOUTFS_DELTA_COMBINED_NULL) {
|
||||
scoutfs_inc_counter(sb, btree_merge_delta_null);
|
||||
free_mitem(rng, found);
|
||||
@@ -2486,6 +2488,14 @@ int scoutfs_btree_merge(struct super_block *sb,
|
||||
mitem = next_mitem(mitem);
|
||||
free_mitem(&rng, tmp);
|
||||
}
|
||||
|
||||
if (mitem && walk_val_len == 0 &&
|
||||
!(walk_flags & (BTW_INSERT | BTW_DELETE)) &&
|
||||
scoutfs_trigger(sb, LOG_MERGE_FORCE_PARTIAL)) {
|
||||
ret = -ERANGE;
|
||||
*next_ret = mitem->key;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
ret = 0;
|
||||
|
||||
@@ -239,9 +239,9 @@ static int forest_read_items(struct super_block *sb, struct scoutfs_key *key, u6
|
||||
* to reset their state and retry with a newer version of the btrees.
|
||||
*/
|
||||
int scoutfs_forest_read_items_roots(struct super_block *sb, struct scoutfs_net_roots *roots,
|
||||
struct scoutfs_key *key, struct scoutfs_key *bloom_key,
|
||||
struct scoutfs_key *start, struct scoutfs_key *end,
|
||||
scoutfs_forest_item_cb cb, void *arg)
|
||||
u64 merge_input_seq, struct scoutfs_key *key,
|
||||
struct scoutfs_key *bloom_key, struct scoutfs_key *start,
|
||||
struct scoutfs_key *end, scoutfs_forest_item_cb cb, void *arg)
|
||||
{
|
||||
struct forest_read_items_data rid = {
|
||||
.cb = cb,
|
||||
@@ -317,15 +317,17 @@ int scoutfs_forest_read_items_roots(struct super_block *sb, struct scoutfs_net_r
|
||||
|
||||
scoutfs_inc_counter(sb, forest_bloom_pass);
|
||||
|
||||
if ((le64_to_cpu(lt.flags) & SCOUTFS_LOG_TREES_FINALIZED))
|
||||
rid.fic |= FIC_FINALIZED;
|
||||
if ((le64_to_cpu(lt.flags) & SCOUTFS_LOG_TREES_FINALIZED) &&
|
||||
(merge_input_seq == 0 ||
|
||||
le64_to_cpu(lt.finalize_seq) < merge_input_seq))
|
||||
rid.fic |= FIC_MERGE_INPUT;
|
||||
|
||||
ret = scoutfs_btree_read_items(sb, <.item_root, key, start,
|
||||
end, forest_read_items, &rid);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
rid.fic &= ~FIC_FINALIZED;
|
||||
rid.fic &= ~FIC_MERGE_INPUT;
|
||||
}
|
||||
|
||||
ret = 0;
|
||||
@@ -345,7 +347,7 @@ int scoutfs_forest_read_items(struct super_block *sb,
|
||||
|
||||
ret = scoutfs_client_get_roots(sb, &roots);
|
||||
if (ret == 0)
|
||||
ret = scoutfs_forest_read_items_roots(sb, &roots, key, bloom_key, start, end,
|
||||
ret = scoutfs_forest_read_items_roots(sb, &roots, 0, key, bloom_key, start, end,
|
||||
cb, arg);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -11,7 +11,7 @@ struct scoutfs_lock;
|
||||
/* caller gives an item to the callback */
|
||||
enum {
|
||||
FIC_FS_ROOT = (1 << 0),
|
||||
FIC_FINALIZED = (1 << 1),
|
||||
FIC_MERGE_INPUT = (1 << 1),
|
||||
};
|
||||
typedef int (*scoutfs_forest_item_cb)(struct super_block *sb, struct scoutfs_key *key, u64 seq,
|
||||
u8 flags, void *val, int val_len, int fic, void *arg);
|
||||
@@ -25,9 +25,9 @@ int scoutfs_forest_read_items(struct super_block *sb,
|
||||
struct scoutfs_key *end,
|
||||
scoutfs_forest_item_cb cb, void *arg);
|
||||
int scoutfs_forest_read_items_roots(struct super_block *sb, struct scoutfs_net_roots *roots,
|
||||
struct scoutfs_key *key, struct scoutfs_key *bloom_key,
|
||||
struct scoutfs_key *start, struct scoutfs_key *end,
|
||||
scoutfs_forest_item_cb cb, void *arg);
|
||||
u64 merge_input_seq, struct scoutfs_key *key,
|
||||
struct scoutfs_key *bloom_key, struct scoutfs_key *start,
|
||||
struct scoutfs_key *end, scoutfs_forest_item_cb cb, void *arg);
|
||||
int scoutfs_forest_set_bloom_bits(struct super_block *sb,
|
||||
struct scoutfs_lock *lock);
|
||||
void scoutfs_forest_set_max_seq(struct super_block *sb, u64 max_seq);
|
||||
|
||||
@@ -30,6 +30,11 @@ void scoutfs_totl_merge_init(struct scoutfs_totl_merging *merg)
|
||||
memset(merg, 0, sizeof(struct scoutfs_totl_merging));
|
||||
}
|
||||
|
||||
/*
|
||||
* bin the incoming merge inputs so that we can resolve delta items
|
||||
* properly. Finalized logs that are merge inputs are kept separately
|
||||
* from those that are not.
|
||||
*/
|
||||
void scoutfs_totl_merge_contribute(struct scoutfs_totl_merging *merg,
|
||||
u64 seq, u8 flags, void *val, int val_len, int fic)
|
||||
{
|
||||
@@ -39,10 +44,10 @@ void scoutfs_totl_merge_contribute(struct scoutfs_totl_merging *merg,
|
||||
merg->fs_seq = seq;
|
||||
merg->fs_total = le64_to_cpu(tval->total);
|
||||
merg->fs_count = le64_to_cpu(tval->count);
|
||||
} else if (fic & FIC_FINALIZED) {
|
||||
merg->fin_seq = seq;
|
||||
merg->fin_total += le64_to_cpu(tval->total);
|
||||
merg->fin_count += le64_to_cpu(tval->count);
|
||||
} else if (fic & FIC_MERGE_INPUT) {
|
||||
merg->inp_seq = seq;
|
||||
merg->inp_total += le64_to_cpu(tval->total);
|
||||
merg->inp_count += le64_to_cpu(tval->count);
|
||||
} else {
|
||||
merg->log_seq = seq;
|
||||
merg->log_total += le64_to_cpu(tval->total);
|
||||
@@ -53,15 +58,18 @@ void scoutfs_totl_merge_contribute(struct scoutfs_totl_merging *merg,
|
||||
/*
|
||||
* .totl. item merging has to be careful because the log btree merging
|
||||
* code can write partial results to the fs_root. This means that a
|
||||
* reader can see both cases where new finalized logs should be applied
|
||||
* to the old fs items and where old finalized logs have already been
|
||||
* applied to the partially merged fs items. Currently active logged
|
||||
* items are always applied on top of all cases.
|
||||
* reader can see both cases where merge input deltas should be applied
|
||||
* to the old fs items and where they have already been applied to the
|
||||
* partially merged fs items.
|
||||
*
|
||||
* Only finalized log trees that are inputs to the current merge cycle
|
||||
* are tracked in the inp_ bucket. Finalized trees that aren't merge
|
||||
* inputs and active log trees are always applied unconditionally since
|
||||
* they cannot be in fs_root.
|
||||
*
|
||||
* These cases are differentiated with a combination of sequence numbers
|
||||
* in items, the count of contributing xattrs, and a flag
|
||||
* differentiating finalized and active logged items. This lets us
|
||||
* recognize all cases, including when finalized logs were merged and
|
||||
* in items and the count of contributing xattrs. This lets us
|
||||
* recognize all cases, including when merge inputs were merged and
|
||||
* deleted the fs item.
|
||||
*/
|
||||
void scoutfs_totl_merge_resolve(struct scoutfs_totl_merging *merg, __u64 *total, __u64 *count)
|
||||
@@ -75,14 +83,14 @@ void scoutfs_totl_merge_resolve(struct scoutfs_totl_merging *merg, __u64 *total,
|
||||
*count = merg->fs_count;
|
||||
}
|
||||
|
||||
/* apply finalized logs if they're newer or creating */
|
||||
if (((merg->fs_seq != 0) && (merg->fin_seq > merg->fs_seq)) ||
|
||||
((merg->fs_seq == 0) && (merg->fin_count > 0))) {
|
||||
*total += merg->fin_total;
|
||||
*count += merg->fin_count;
|
||||
/* apply merge input deltas if they're newer or creating */
|
||||
if (((merg->fs_seq != 0) && (merg->inp_seq > merg->fs_seq)) ||
|
||||
((merg->fs_seq == 0) && (merg->inp_count > 0))) {
|
||||
*total += merg->inp_total;
|
||||
*count += merg->inp_count;
|
||||
}
|
||||
|
||||
/* always apply active logs which must be newer than fs and finalized */
|
||||
/* always apply non-input finalized and active logs */
|
||||
if (merg->log_seq > 0) {
|
||||
*total += merg->log_total;
|
||||
*count += merg->log_count;
|
||||
|
||||
@@ -7,9 +7,9 @@ struct scoutfs_totl_merging {
|
||||
u64 fs_seq;
|
||||
u64 fs_total;
|
||||
u64 fs_count;
|
||||
u64 fin_seq;
|
||||
u64 fin_total;
|
||||
s64 fin_count;
|
||||
u64 inp_seq;
|
||||
u64 inp_total;
|
||||
s64 inp_count;
|
||||
u64 log_seq;
|
||||
u64 log_total;
|
||||
s64 log_count;
|
||||
|
||||
@@ -46,6 +46,7 @@ static char *names[] = {
|
||||
[SCOUTFS_TRIGGER_SRCH_MERGE_STOP_SAFE] = "srch_merge_stop_safe",
|
||||
[SCOUTFS_TRIGGER_STATFS_LOCK_PURGE] = "statfs_lock_purge",
|
||||
[SCOUTFS_TRIGGER_RECLAIM_SKIP_FINALIZE] = "reclaim_skip_finalize",
|
||||
[SCOUTFS_TRIGGER_LOG_MERGE_FORCE_PARTIAL] = "log_merge_force_partial",
|
||||
};
|
||||
|
||||
bool scoutfs_trigger_test_and_clear(struct super_block *sb, unsigned int t)
|
||||
|
||||
@@ -9,6 +9,7 @@ enum scoutfs_trigger {
|
||||
SCOUTFS_TRIGGER_SRCH_MERGE_STOP_SAFE,
|
||||
SCOUTFS_TRIGGER_STATFS_LOCK_PURGE,
|
||||
SCOUTFS_TRIGGER_RECLAIM_SKIP_FINALIZE,
|
||||
SCOUTFS_TRIGGER_LOG_MERGE_FORCE_PARTIAL,
|
||||
SCOUTFS_TRIGGER_NR,
|
||||
};
|
||||
|
||||
|
||||
@@ -95,6 +95,7 @@ struct wkic_info {
|
||||
/* block reading slow path */
|
||||
struct mutex roots_mutex;
|
||||
struct scoutfs_net_roots roots;
|
||||
u64 merge_input_seq;
|
||||
u64 roots_read_seq;
|
||||
ktime_t roots_expire;
|
||||
|
||||
@@ -805,29 +806,79 @@ static void free_page_list(struct super_block *sb, struct list_head *list)
|
||||
* read_seq number so that we can compare the age of the items in cached
|
||||
* pages. Only one request to refresh the roots is in progress at a
|
||||
* time. This is the slow path that's only used when the cache isn't
|
||||
* populated and the roots aren't cached. The root request is fast
|
||||
* enough, especially compared to the resulting item reading IO, that we
|
||||
* don't mind hiding it behind a trivial mutex.
|
||||
* populated and the roots aren't cached.
|
||||
*
|
||||
* We read roots directly from the on-disk superblock rather than
|
||||
* requesting them from the server so that we can also read the
|
||||
* log_merge btree from the same superblock. The merge status item
|
||||
* seq tells us which finalized log trees are inputs to the current
|
||||
* merge, which is needed to correctly resolve totl delta items.
|
||||
*/
|
||||
static int get_roots(struct super_block *sb, struct wkic_info *winf,
|
||||
struct scoutfs_net_roots *roots_ret, u64 *read_seq, bool force_new)
|
||||
static int refresh_roots(struct super_block *sb, struct wkic_info *winf)
|
||||
{
|
||||
struct scoutfs_super_block *super;
|
||||
struct scoutfs_log_merge_status *stat;
|
||||
SCOUTFS_BTREE_ITEM_REF(iref);
|
||||
struct scoutfs_key key;
|
||||
int ret;
|
||||
|
||||
super = kmalloc(sizeof(*super), GFP_NOFS);
|
||||
if (!super)
|
||||
return -ENOMEM;
|
||||
|
||||
ret = scoutfs_read_super(sb, super);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
winf->roots = (struct scoutfs_net_roots){
|
||||
.fs_root = super->fs_root,
|
||||
.logs_root = super->logs_root,
|
||||
.srch_root = super->srch_root,
|
||||
};
|
||||
|
||||
winf->merge_input_seq = 0;
|
||||
if (super->log_merge.ref.blkno) {
|
||||
scoutfs_key_set_zeros(&key);
|
||||
key.sk_zone = SCOUTFS_LOG_MERGE_STATUS_ZONE;
|
||||
ret = scoutfs_btree_lookup(sb, &super->log_merge, &key, &iref);
|
||||
if (ret == 0) {
|
||||
if (iref.val_len == sizeof(*stat)) {
|
||||
stat = iref.val;
|
||||
winf->merge_input_seq = le64_to_cpu(stat->seq);
|
||||
} else {
|
||||
ret = -EUCLEAN;
|
||||
}
|
||||
scoutfs_btree_put_iref(&iref);
|
||||
} else if (ret == -ENOENT) {
|
||||
ret = 0;
|
||||
}
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
}
|
||||
|
||||
winf->roots_read_seq++;
|
||||
winf->roots_expire = ktime_add_ms(ktime_get_raw(), WKIC_CACHE_LIFETIME_MS);
|
||||
out:
|
||||
kfree(super);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int get_roots(struct super_block *sb, struct wkic_info *winf,
|
||||
struct scoutfs_net_roots *roots_ret, u64 *merge_input_seq,
|
||||
u64 *read_seq, bool force_new)
|
||||
{
|
||||
struct scoutfs_net_roots roots;
|
||||
int ret;
|
||||
|
||||
mutex_lock(&winf->roots_mutex);
|
||||
|
||||
if (force_new || ktime_before(winf->roots_expire, ktime_get_raw())) {
|
||||
ret = scoutfs_client_get_roots(sb, &roots);
|
||||
ret = refresh_roots(sb, winf);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
winf->roots = roots;
|
||||
winf->roots_read_seq++;
|
||||
winf->roots_expire = ktime_add_ms(ktime_get_raw(), WKIC_CACHE_LIFETIME_MS);
|
||||
}
|
||||
|
||||
*roots_ret = winf->roots;
|
||||
*merge_input_seq = winf->merge_input_seq;
|
||||
*read_seq = winf->roots_read_seq;
|
||||
ret = 0;
|
||||
out:
|
||||
@@ -870,24 +921,30 @@ static int insert_read_pages(struct super_block *sb, struct wkic_info *winf,
|
||||
struct scoutfs_key end;
|
||||
struct wkic_page *wpage;
|
||||
LIST_HEAD(pages);
|
||||
u64 merge_input_seq;
|
||||
u64 read_seq;
|
||||
int ret;
|
||||
|
||||
ret = 0;
|
||||
retry_stale:
|
||||
ret = get_roots(sb, winf, &roots, &read_seq, ret == -ESTALE);
|
||||
ret = get_roots(sb, winf, &roots, &merge_input_seq, &read_seq, ret == -ESTALE);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
goto check_stale;
|
||||
|
||||
start = *range_start;
|
||||
end = *range_end;
|
||||
ret = scoutfs_forest_read_items_roots(sb, &roots, key, range_start, &start, &end,
|
||||
read_items_cb, &root);
|
||||
ret = scoutfs_forest_read_items_roots(sb, &roots, merge_input_seq, key, range_start,
|
||||
&start, &end, read_items_cb, &root);
|
||||
trace_scoutfs_wkic_read_items(sb, key, &start, &end);
|
||||
check_stale:
|
||||
ret = scoutfs_block_check_stale(sb, ret, &saved, &roots.fs_root.ref, &roots.logs_root.ref);
|
||||
if (ret < 0) {
|
||||
if (ret == -ESTALE)
|
||||
if (ret == -ESTALE) {
|
||||
/* not safe to retry due to delta items, must restart clean */
|
||||
free_item_tree(&root);
|
||||
root = RB_ROOT;
|
||||
goto retry_stale;
|
||||
}
|
||||
goto out;
|
||||
}
|
||||
|
||||
|
||||
3
tests/golden/totl-merge-read
Normal file
3
tests/golden/totl-merge-read
Normal file
@@ -0,0 +1,3 @@
|
||||
== setup
|
||||
expected 4681
|
||||
== cleanup
|
||||
@@ -27,6 +27,7 @@ simple-xattr-unit.sh
|
||||
retention-basic.sh
|
||||
totl-xattr-tag.sh
|
||||
quota.sh
|
||||
totl-merge-read.sh
|
||||
lock-refleak.sh
|
||||
lock-shrink-consistency.sh
|
||||
lock-shrink-read-race.sh
|
||||
|
||||
50
tests/tests/totl-merge-read.sh
Normal file
50
tests/tests/totl-merge-read.sh
Normal file
@@ -0,0 +1,50 @@
|
||||
#
|
||||
# Test that merge_read_item() correctly updates the sequence number when
|
||||
# combining delta items from multiple finalized log trees. Each mount
|
||||
# sets a totl value in its own 3-bit lane (powers of 8) so that any
|
||||
# double-counting overflows the lane and is caught by: or(v, exp) != exp.
|
||||
#
|
||||
|
||||
t_require_commands setfattr scoutfs
|
||||
t_require_mounts 5
|
||||
|
||||
echo "== setup"
|
||||
for nr in $(t_fs_nrs); do
|
||||
d=$(eval echo \$T_D$nr)
|
||||
for i in $(seq 1 2500); do : > "$d/f$nr$i"; done
|
||||
done
|
||||
sync
|
||||
t_force_log_merge
|
||||
|
||||
vals=(1 8 64 512 4096)
|
||||
expected=4681
|
||||
n=0
|
||||
for nr in $(t_fs_nrs); do
|
||||
d=$(eval echo \$T_D$nr)
|
||||
v=${vals[$((n++))]}
|
||||
for i in $(seq 1 2500); do
|
||||
setfattr -n "scoutfs.totl.t.$i.0.0" -v $v "$d/f$nr$i"
|
||||
done
|
||||
done
|
||||
|
||||
t_trigger_arm_silent log_merge_force_partial $(t_server_nr)
|
||||
|
||||
bad="$T_TMPDIR/bad"
|
||||
for nr in $(t_fs_nrs); do
|
||||
( while true; do
|
||||
echo 1 > "$(t_debugfs_path $nr)/drop_weak_item_cache"
|
||||
scoutfs read-xattr-totals -p "$(eval echo \$T_M$nr)" | \
|
||||
awk -F'[ =,]+' -v e=$expected 'or($2+0,e) != e'
|
||||
done ) >> "$bad" &
|
||||
done
|
||||
|
||||
echo "expected $expected"
|
||||
t_force_log_merge
|
||||
t_silent_kill $(jobs -p)
|
||||
test -s "$bad" && echo "double-counted:" && cat "$bad"
|
||||
|
||||
echo "== cleanup"
|
||||
for nr in $(t_fs_nrs); do
|
||||
find "$(eval echo \$T_D$nr)" -name "f$nr*" -delete
|
||||
done
|
||||
t_pass
|
||||
Reference in New Issue
Block a user