diff --git a/kmod/src/btree.c b/kmod/src/btree.c index 83b82894..34d4434a 100644 --- a/kmod/src/btree.c +++ b/kmod/src/btree.c @@ -24,6 +24,8 @@ #include "key.h" #include "btree.h" #include "sort_priv.h" +#include "counters.h" +#include "triggers.h" #include "scoutfs_trace.h" @@ -603,12 +605,10 @@ static bool valid_referenced_block(struct scoutfs_super_block *super, * * Btree blocks don't have rigid cache consistency. We can be following * a new root to read refs into previously stale cached blocks. If we - * see that the block metadata doesn't match we first assume that we - * just have a stale block and try and re-read it. If it still doesn't - * match we assume that we're an reader racing with a writer overwriting - * old blocks in the ring. We return an error that tells the caller to - * deal with this error: either find a new root or return a hard error - * if the block is really corrupt. + * hit a cached block that doesn't match the ref (or indeed a corrupt + * block) we return -ESTALE which tells the caller to deal with this + * error: either find a new root or return a hard error if the block is + * really corrupt. * * btree callers serialize concurrent writers in a btree but not between * btrees. We have to lock around the shared btree_info. Callers do @@ -626,13 +626,11 @@ static int get_ref_block(struct super_block *sb, int flags, struct scoutfs_btree_block *bt = NULL; struct scoutfs_btree_block *new; struct buffer_head *bh; - int retries = 1; u64 blkno; u64 seq; int ret; int i; -retry: /* always get the current block, either to return or cow from */ if (ref && ref->blkno) { bh = sb_bread(sb, le64_to_cpu(ref->blkno)); @@ -642,17 +640,17 @@ retry: } bt = (void *)bh->b_data; - if (!valid_referenced_block(super, ref, bt, bh)) { - if (retries-- > 0) { - lock_buffer(bh); - clear_buffer_uptodate(bh); - unlock_buffer(bh); - put_bh(bh); - bt = NULL; - goto retry; - } - /* XXX let us know when we eventually hit this */ - ret = WARN_ON_ONCE(-ESTALE); + if (!valid_referenced_block(super, ref, bt, bh) || + scoutfs_trigger(sb, BTREE_STALE_READ)) { + + lock_buffer(bh); + clear_buffer_uptodate(bh); + unlock_buffer(bh); + put_bh(bh); + bt = NULL; + + scoutfs_inc_counter(sb, btree_stale_read); + ret = -ESTALE; goto out; } diff --git a/kmod/src/compact.c b/kmod/src/compact.c index e7fdd551..428346a0 100644 --- a/kmod/src/compact.c +++ b/kmod/src/compact.c @@ -619,7 +619,10 @@ static void scoutfs_compact_func(struct work_struct *work) free_cseg_list(sb, &curs.csegs); free_cseg_list(sb, &results); - WARN_ON_ONCE(ret); + if (ret == -ESTALE) + scoutfs_inc_counter(sb, compact_stale_error); + + WARN_ON_ONCE(ret && ret != -ESTALE); trace_scoutfs_compact_func(sb, ret); } diff --git a/kmod/src/counters.h b/kmod/src/counters.h index 67ceba60..c2d3a453 100644 --- a/kmod/src/counters.h +++ b/kmod/src/counters.h @@ -14,11 +14,13 @@ #define EXPAND_EACH_COUNTER \ EXPAND_COUNTER(alloc_alloc) \ EXPAND_COUNTER(alloc_free) \ + EXPAND_COUNTER(btree_stale_read) \ EXPAND_COUNTER(compact_operations) \ EXPAND_COUNTER(compact_segment_moved) \ EXPAND_COUNTER(compact_segment_read) \ EXPAND_COUNTER(compact_segment_write_bytes) \ EXPAND_COUNTER(compact_segment_writes) \ + EXPAND_COUNTER(compact_stale_error) \ EXPAND_COUNTER(compact_sticky_upper) \ EXPAND_COUNTER(compact_sticky_written) \ EXPAND_COUNTER(data_end_writeback_page) \ @@ -59,9 +61,11 @@ EXPAND_COUNTER(lock_free) \ EXPAND_COUNTER(lock_incompat_wait) \ EXPAND_COUNTER(manifest_compact_migrate) \ + EXPAND_COUNTER(manifest_hard_stale_error) \ EXPAND_COUNTER(seg_alloc) \ EXPAND_COUNTER(seg_free) \ EXPAND_COUNTER(seg_shrink) \ + EXPAND_COUNTER(seg_stale_read) \ EXPAND_COUNTER(trans_commit_fsync) \ EXPAND_COUNTER(trans_commit_full) \ EXPAND_COUNTER(trans_commit_item_flush) \ diff --git a/kmod/src/manifest.c b/kmod/src/manifest.c index 2b3d59b0..f7cd99e6 100644 --- a/kmod/src/manifest.c +++ b/kmod/src/manifest.c @@ -26,6 +26,7 @@ #include "manifest.h" #include "trans.h" #include "counters.h" +#include "triggers.h" #include "client.h" #include "scoutfs_trace.h" @@ -527,7 +528,6 @@ out: scoutfs_btree_put_iref(&iref); scoutfs_btree_put_iref(&prev); kfree(mkey); - BUG_ON(ret == -ESTALE); /* XXX caller needs to retry or return error */ return ret; } @@ -575,8 +575,10 @@ static int read_items(struct super_block *sb, struct scoutfs_key_buf *key, struct scoutfs_segment *seg; struct manifest_ref *ref; struct manifest_ref *tmp; + __le64 last_root_seq; LIST_HEAD(ref_list); LIST_HEAD(batch); + bool force_hard; u8 found_flags = 0; u8 item_flags; int found_ctr; @@ -604,6 +606,8 @@ static int read_items(struct super_block *sb, struct scoutfs_key_buf *key, * either get a manifest ref in the lvb of their lock or they'll * ask the server the first time the system sees the lock. */ + last_root_seq = 0; +retry_stale: ret = scoutfs_client_get_manifest_root(sb, &root); if (ret) goto out; @@ -771,6 +775,25 @@ out: free_ref(sb, ref); } + /* + * Resample the root and retry reads as long as we see + * inconsistent blocks/segments and new roots to read through. + * Persistent inconsistency in the same root is seen as corrupt + * structures instead. + */ + force_hard = scoutfs_trigger(sb, HARD_STALE_ERROR); + if (ret == -ESTALE || force_hard) { + /* keep trying as long as the root changes */ + if ((last_root_seq != root.ref.seq) && !force_hard) { + last_root_seq = root.ref.seq; + goto retry_stale; + } + + /* persistent error */ + scoutfs_inc_counter(sb, manifest_hard_stale_error); + ret = -EIO; + } + return ret; } diff --git a/kmod/src/seg.c b/kmod/src/seg.c index 95624332..cdb79b2b 100644 --- a/kmod/src/seg.c +++ b/kmod/src/seg.c @@ -26,6 +26,7 @@ #include "alloc.h" #include "key.h" #include "counters.h" +#include "triggers.h" #include "scoutfs_trace.h" /* @@ -377,6 +378,8 @@ int scoutfs_seg_wait(struct super_block *sb, struct scoutfs_segment *seg, struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); struct segment_cache *cac = sbi->segment_cache; struct scoutfs_segment_block *sblk = off_ptr(seg, 0); + unsigned long flags; + bool erased; int ret; ret = wait_event_interruptible(cac->waitq, @@ -392,12 +395,17 @@ int scoutfs_seg_wait(struct super_block *sb, struct scoutfs_segment *seg, sblk = off_ptr(seg, 0); if (WARN_ON_ONCE(segno != le64_to_cpu(sblk->segno)) || - WARN_ON_ONCE(seq != le64_to_cpu(sblk->seq))) { - ret = -ESTALE; - goto out; - } + WARN_ON_ONCE(seq != le64_to_cpu(sblk->seq)) || + scoutfs_trigger(sb, SEG_STALE_READ)) { + spin_lock_irqsave(&cac->lock, flags); + erased = erase_seg(cac, seg); + spin_unlock_irqrestore(&cac->lock, flags); + if (erased) + scoutfs_seg_put(seg); - ret = 0; + scoutfs_inc_counter(sb, seg_stale_read); + ret = -ESTALE; + } out: return ret; } diff --git a/kmod/src/triggers.c b/kmod/src/triggers.c index 64f83941..200d29e7 100644 --- a/kmod/src/triggers.c +++ b/kmod/src/triggers.c @@ -38,7 +38,9 @@ struct scoutfs_triggers { struct scoutfs_triggers *name = SCOUTFS_SB(sb)->triggers static char *names[] = { - [SCOUTFS_TRIGGER_SOMETHING] = "something", + [SCOUTFS_TRIGGER_BTREE_STALE_READ] = "btree_stale_read", + [SCOUTFS_TRIGGER_HARD_STALE_ERROR] = "hard_stale_error", + [SCOUTFS_TRIGGER_SEG_STALE_READ] = "seg_stale_read", }; bool scoutfs_trigger_test_and_clear(struct super_block *sb, unsigned int t) diff --git a/kmod/src/triggers.h b/kmod/src/triggers.h index 48920844..1b802c5e 100644 --- a/kmod/src/triggers.h +++ b/kmod/src/triggers.h @@ -2,7 +2,9 @@ #define _SCOUTFS_TRIGGERS_H_ enum { - SCOUTFS_TRIGGER_SOMETHING, + SCOUTFS_TRIGGER_BTREE_STALE_READ, + SCOUTFS_TRIGGER_HARD_STALE_ERROR, + SCOUTFS_TRIGGER_SEG_STALE_READ, SCOUTFS_TRIGGER_NR, };