diff --git a/kmod/src/btree.c b/kmod/src/btree.c
index 34d4434a..9e5effd0 100644
--- a/kmod/src/btree.c
+++ b/kmod/src/btree.c
@@ -604,11 +604,12 @@ static bool valid_referenced_block(struct scoutfs_super_block *super,
  * dirtying, and allocate new blocks.
  *
  * Btree blocks don't have rigid cache consistency.  We can be following
- * a new root to read refs into previously stale cached blocks.  If we
- * hit a cached block that doesn't match the ref (or indeed a corrupt
- * block) we return -ESTALE which tells the caller to deal with this
- * error: either find a new root or return a hard error if the block is
- * really corrupt.
+ * block references into cached blocks that are now stale, or we can be
+ * following a stale root into blocks that have been overwritten.  If we
+ * hit a block that looks stale we first invalidate the cache and retry,
+ * returning -ESTALE if it still looks wrong.  The caller can retry the
+ * read from a more current root or decide that this is a persistent
+ * error.
  *
  * btree callers serialize concurrent writers in a btree but not between
  * btrees.  We have to lock around the shared btree_info.  Callers do
@@ -626,6 +627,7 @@ static int get_ref_block(struct super_block *sb, int flags,
 	struct scoutfs_btree_block *bt = NULL;
 	struct scoutfs_btree_block *new;
 	struct buffer_head *bh;
+	bool retried = false;
 	u64 blkno;
 	u64 seq;
 	int ret;
@@ -633,6 +635,7 @@ static int get_ref_block(struct super_block *sb, int flags,
 
 	/* always get the current block, either to return or cow from */
 	if (ref && ref->blkno) {
+retry:
 		bh = sb_bread(sb, le64_to_cpu(ref->blkno));
 		if (!bh) {
 			ret = -EIO;
@@ -643,13 +646,19 @@ static int get_ref_block(struct super_block *sb, int flags,
 
 		if (!valid_referenced_block(super, ref, bt, bh) ||
 		    scoutfs_trigger(sb, BTREE_STALE_READ)) {
+			scoutfs_inc_counter(sb, btree_stale_read);
+
 			lock_buffer(bh);
 			clear_buffer_uptodate(bh);
 			unlock_buffer(bh);
 			put_bh(bh);
 			bt = NULL;
-			scoutfs_inc_counter(sb, btree_stale_read);
 
+			if (!retried) {
+				retried = true;
+				goto retry;
+			}
+
 			ret = -ESTALE;
 			goto out;
 		}
diff --git a/kmod/src/manifest.c b/kmod/src/manifest.c
index d667f339..fcf404a2 100644
--- a/kmod/src/manifest.c
+++ b/kmod/src/manifest.c
@@ -76,6 +76,7 @@ struct manifest_ref {
 	int found_ctr;
 	int off;
 	u8 level;
+	bool retried;
 
 	struct scoutfs_key first;
 	struct scoutfs_key last;
@@ -469,9 +470,11 @@ static int get_nonzero_refs(struct super_block *sb,
 }
 
 /*
- * See if the caller is a remote btree reader who has read a stale btree
- * block and should keep trying.  If they see repeated errors on the
- * same root then we assume that it's persistent corruption.
+ * If we saw stale btree blocks or stale segment reads while walking
+ * the manifest then we might be reading through an old root that has
+ * been overwritten.  We can ask for a new root and try again.  If we
+ * don't get a new root and the errors persist then we've hit
+ * corruption.
  */
 static int handle_stale_btree(struct super_block *sb,
 			      struct scoutfs_btree_root *root,
@@ -605,8 +608,12 @@ retry_stale:
 
 	/* sort by segment to issue advancing reads */
 	list_sort(NULL, &ref_list, cmp_ment_ref_segno);
 
+resubmit:
 	/* submit reads for all the segments */
 	list_for_each_entry(ref, &ref_list, entry) {
+		/* don't resubmit segments we've already read */
+		if (ref->seg)
+			continue;
 		trace_scoutfs_read_item_segment(sb, ref->level, ref->segno,
 						ref->seq, &ref->first,
@@ -624,9 +631,16 @@ retry_stale:
 	/* always wait for submitted segments */
 	list_for_each_entry(ref, &ref_list, entry) {
 		if (!ref->seg)
-			break;
+			continue;
 
 		err = scoutfs_seg_wait(sb, ref->seg, ref->segno, ref->seq);
+		if (err == -ESTALE && !ref->retried) {
+			ref->retried = true;
+			err = 0;
+			scoutfs_seg_put(ref->seg);
+			ref->seg = NULL;
+			goto resubmit;
+		}
 		if (err && !ret)
 			ret = err;
 	}
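
For review context, here is a minimal userspace sketch of the invalidate-and-retry-once
pattern the btree.c hunk adds to get_ref_block().  read_block(), invalidate_cache(), and
get_checked_block() are hypothetical toy stand-ins for sb_bread(), the buffer_head
invalidation, and get_ref_block() itself; they are not scoutfs or kernel APIs.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/*
 * Toy block cache: the first read returns a stale cached copy, and
 * invalidating makes the next read return the current on-"disk" copy.
 */
static unsigned long long cached_seq = 1;	/* stale cached copy */
static const unsigned long long disk_seq = 2;	/* current copy on "disk" */

static unsigned long long read_block(void)
{
	return cached_seq;
}

static void invalidate_cache(void)
{
	cached_seq = disk_seq;	/* the re-read will come from "disk" */
}

/*
 * Mirror of the get_ref_block() change: on a mismatch, invalidate and
 * re-read once before concluding the reference itself is bad.
 */
static int get_checked_block(unsigned long long want_seq,
			     unsigned long long *seq_ret)
{
	bool retried = false;
	unsigned long long seq;

retry:
	seq = read_block();
	if (seq != want_seq) {
		invalidate_cache();
		if (!retried) {
			retried = true;
			goto retry;
		}
		return -ESTALE;	/* still stale after a fresh read */
	}

	*seq_ret = seq;
	return 0;
}

int main(void)
{
	unsigned long long seq = 0;
	int ret = get_checked_block(disk_seq, &seq);

	printf("ret %d seq %llu\n", ret, seq);	/* prints: ret 0 seq 2 */
	return 0;
}

The point is that a first mismatch only proves the cached copy is suspect; only a
mismatch on a freshly read copy justifies handing -ESTALE back to the caller.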
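
Similarly, a sketch of the manifest.c resubmit loop, assuming a toy_wait() that fails
once with -ESTALE: each ref carries its own retried flag so a single stale segment read
is resubmitted once without disturbing reads that already completed.  struct toy_ref,
toy_wait(), and read_refs() are hypothetical illustrations, not the real scoutfs
structures or functions.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

#define NREFS 3

/* Toy version of manifest_ref: one pending read per segment. */
struct toy_ref {
	bool submitted;		/* stands in for ref->seg != NULL */
	bool retried;
	int fail_once;		/* first wait returns -ESTALE */
};

static int toy_wait(struct toy_ref *ref)
{
	if (ref->fail_once) {
		ref->fail_once = 0;
		return -ESTALE;
	}
	return 0;
}

/*
 * Submit-and-wait with one resubmission per ref: a ref whose wait
 * returns -ESTALE is cleared and resubmitted once before its error is
 * allowed to stick, mirroring the resubmit label in manifest.c.
 */
static int read_refs(struct toy_ref *refs, int n)
{
	int ret = 0;
	int err;
	int i;

resubmit:
	for (i = 0; i < n; i++) {
		if (refs[i].submitted)
			continue;	/* don't resubmit completed reads */
		refs[i].submitted = true;
	}

	for (i = 0; i < n; i++) {
		if (!refs[i].submitted)
			continue;
		err = toy_wait(&refs[i]);
		if (err == -ESTALE && !refs[i].retried) {
			refs[i].retried = true;
			refs[i].submitted = false;
			goto resubmit;
		}
		if (err && !ret)
			ret = err;
	}
	return ret;
}

int main(void)
{
	struct toy_ref refs[NREFS] = { { 0 }, { .fail_once = 1 }, { 0 } };

	printf("ret %d\n", read_refs(refs, NREFS));	/* prints: ret 0 */
	return 0;
}

This also shows why the patch changes break to continue in the wait loop: after a
resubmit, completed and not-yet-submitted refs are interleaved in the list, so neither
loop can assume that submitted refs form a prefix.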