scoutfs: fix spurious hard stale block errors
The stale block handling code only handled the case where we read through a stale root into blocks that have been overwritten in the persistent store. In this case you'll get a new root and the read will be OK. It didn't handle the case where we have stale blocks cached at the blocks of the legitimate current root. In this case we get ESTALE from each stale block and, because the root doesn't change when we retry, we assume the persistent structure is corrupt.

This case can happen when the btree ring wraps and there are still blocks cached at the head of the ring. It became much more likely when we moved to small fixed size keys.

The fix is to retry reading individual blocks or segments before returning -ESTALE and expecting the caller to get a new root and try again. In the stale cache case this allows the more recent, correct blocks to be read.

Signed-off-by: Zach Brown <zab@versity.com>
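The shape of the fix is easy to show outside the kernel. Below is a minimal userspace C sketch of the retry pattern described above, assuming a toy single-block cache; it is illustrative only, not the scoutfs code, and every name in it (read_block, block_matches_ref, the fake disk and cache) is hypothetical.

/*
 * Sketch only: invalidate a stale cached block and re-read it once
 * before returning -ESTALE to the caller.
 */
#include <stdio.h>
#include <errno.h>
#include <stdbool.h>
#include <stdint.h>

struct block { uint64_t blkno; uint64_t seq; };
struct ref   { uint64_t blkno; uint64_t seq; };

/* pretend persistent store: always holds the current contents */
static struct block disk  = { .blkno = 7, .seq = 42 };
/* pretend cache: starts out holding an older, stale copy of block 7 */
static struct block cache = { .blkno = 7, .seq = 17 };
static bool cache_valid = true;

static void read_block(struct block *bt, uint64_t blkno)
{
	if (cache_valid && cache.blkno == blkno) {
		*bt = cache;		/* cache hit, possibly stale */
	} else {
		*bt = disk;		/* "media" read refreshes the cache */
		cache = disk;
		cache_valid = true;
	}
}

static bool block_matches_ref(struct block *bt, struct ref *ref)
{
	return bt->blkno == ref->blkno && bt->seq == ref->seq;
}

/*
 * Read the block a reference points to.  On a mismatch, drop the cached
 * copy and retry once; only if the fresh read still disagrees do we
 * return -ESTALE so the caller can fetch a newer root or decide that
 * the structure really is corrupt.
 */
static int get_ref_block(struct ref *ref, struct block *bt)
{
	bool retried = false;

retry:
	read_block(bt, ref->blkno);
	if (!block_matches_ref(bt, ref)) {
		cache_valid = false;	/* invalidate the stale cached copy */
		if (!retried) {
			retried = true;
			goto retry;
		}
		return -ESTALE;
	}
	return 0;
}

int main(void)
{
	struct ref ref = { .blkno = 7, .seq = 42 };	/* current root's ref */
	struct block bt;
	int ret = get_ref_block(&ref, &bt);

	printf("ret=%d seq=%llu\n", ret, (unsigned long long)bt.seq);
	return 0;
}

The same pattern appears twice in the patch below: get_ref_block() invalidates the stale buffer_head and jumps back to retry once before returning -ESTALE, and the manifest reader resubmits individual segment reads before asking the caller for a new root.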
@@ -604,11 +604,12 @@ static bool valid_referenced_block(struct scoutfs_super_block *super,
  * dirtying, and allocate new blocks.
  *
  * Btree blocks don't have rigid cache consistency. We can be following
- * a new root to read refs into previously stale cached blocks. If we
- * hit a cached block that doesn't match the ref (or indeed a corrupt
- * block) we return -ESTALE which tells the caller to deal with this
- * error: either find a new root or return a hard error if the block is
- * really corrupt.
+ * block references into cached blocks that are now stale or can be
+ * following a stale root into blocks that have been overwritten. If we
+ * hit a block that looks stale we first invalidate the cache and retry,
+ * returning -ESTALE if it still looks wrong. The caller can retry the
+ * read from a more current root or decide that this is a persistent
+ * error.
  *
  * btree callers serialize concurrent writers in a btree but not between
  * btrees. We have to lock around the shared btree_info. Callers do
@@ -626,6 +627,7 @@ static int get_ref_block(struct super_block *sb, int flags,
 	struct scoutfs_btree_block *bt = NULL;
 	struct scoutfs_btree_block *new;
 	struct buffer_head *bh;
+	bool retried = false;
 	u64 blkno;
 	u64 seq;
 	int ret;
@@ -633,6 +635,7 @@ static int get_ref_block(struct super_block *sb, int flags,
 
 	/* always get the current block, either to return or cow from */
 	if (ref && ref->blkno) {
+retry:
 		bh = sb_bread(sb, le64_to_cpu(ref->blkno));
 		if (!bh) {
 			ret = -EIO;
@@ -643,13 +646,19 @@ static int get_ref_block(struct super_block *sb, int flags,
 	if (!valid_referenced_block(super, ref, bt, bh) ||
 	    scoutfs_trigger(sb, BTREE_STALE_READ)) {
 
-		scoutfs_inc_counter(sb, btree_stale_read);
-
 		lock_buffer(bh);
 		clear_buffer_uptodate(bh);
 		unlock_buffer(bh);
 		put_bh(bh);
 		bt = NULL;
 
+		scoutfs_inc_counter(sb, btree_stale_read);
+		if (!retried) {
+			retried = true;
+			goto retry;
+		}
+
 		ret = -ESTALE;
 		goto out;
 	}
@@ -76,6 +76,7 @@ struct manifest_ref {
 	int found_ctr;
 	int off;
 	u8 level;
+	bool retried;
 
 	struct scoutfs_key first;
 	struct scoutfs_key last;
@@ -469,9 +470,11 @@ static int get_nonzero_refs(struct super_block *sb,
 }
 
 /*
- * See if the caller is a remote btree reader who has read a stale btree
- * block and should keep trying. If they see repeated errors on the
- * same root then we assume that it's persistent corruption.
+ * If we saw persistent stale blocks or segment reads while walking the
+ * manifest then we might be trying to read through an old stale root
+ * that has been overwritten. We can ask for a new root and try again.
+ * If we don't get a new root and the errors persist then we've hit
+ * corruption.
 */
 static int handle_stale_btree(struct super_block *sb,
 			      struct scoutfs_btree_root *root,
@@ -605,8 +608,12 @@ retry_stale:
 	/* sort by segment to issue advancing reads */
 	list_sort(NULL, &ref_list, cmp_ment_ref_segno);
 
+resubmit:
 	/* submit reads for all the segments */
 	list_for_each_entry(ref, &ref_list, entry) {
+		/* don't resubmit if we've read */
+		if (ref->seg)
+			continue;
 
 		trace_scoutfs_read_item_segment(sb, ref->level, ref->segno,
 						ref->seq, &ref->first,
@@ -624,9 +631,16 @@ retry_stale:
 	/* always wait for submitted segments */
 	list_for_each_entry(ref, &ref_list, entry) {
 		if (!ref->seg)
-			break;
+			continue;
 
 		err = scoutfs_seg_wait(sb, ref->seg, ref->segno, ref->seq);
+		if (err == -ESTALE && !ref->retried) {
+			ref->retried = true;
+			err = 0;
+			scoutfs_seg_put(ref->seg);
+			ref->seg = NULL;
+			goto resubmit;
+		}
 		if (err && !ret)
 			ret = err;
 	}