From 829126790bbbf5ce67ebb7275f17449c84baa349 Mon Sep 17 00:00:00 2001
From: Zach Brown
Date: Mon, 18 Dec 2017 09:46:08 -0800
Subject: [PATCH] scoutfs: retry stale btree and segment reads

We don't have strict consistency protocols protecting the "physical"
caches that hold btree blocks and segments.  We have metadata that
tells a reader that it's hit a stale cached entry and needs to
invalidate it and read the current version from the media.  This patch
implements the retrying.

If we get stale sequence numbers in segments or btree blocks we
invalidate them from the cache and return -ESTALE.  This can only
happen when reading structures that could have been modified remotely:
btree reads in the clients and segment reads for everyone.  btree
reads on the server are always consistent because the server is the
only writer.  Adding retrying to item reading and compaction catches
all of these cases.
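The invalidation itself is the stock buffer_head idiom.  Roughly, as a
hypothetical helper (the patch open-codes this in get_ref_block()
below):

    #include <linux/buffer_head.h>

    /*
     * Drop a cached block so the next sb_bread() goes back to the
     * media.  Clearing the uptodate bit under the buffer lock is
     * what forces the reread; put_bh() drops our reference.
     */
    static void invalidate_cached_block(struct buffer_head *bh)
    {
            lock_buffer(bh);
            clear_buffer_uptodate(bh);
            unlock_buffer(bh);
            put_bh(bh);
    }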
Stale reads are triggered by cache inconsistency, but the same symptom
can also come from corruption in persistent media.  Callers need to be
careful to turn their retries into hard errors when the staleness
persists.  Item reading can do this because it knows the btree root
seq that anchored the walk.  Compaction doesn't do this today; that
gets addressed in a big sweep of error handling in the not too distant
future.
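The retry policy that read_items() adopts boils down to: retry an
-ESTALE read as long as resampling yields a root with a new seq, and
turn persistent staleness under one root into -EIO.  A compilable
userspace model of just that loop (every name in it is a stand-in,
not the kernel code):

    #include <errno.h>
    #include <stdio.h>

    struct root { unsigned long long seq; };

    /* stub: pretend the server hands out root seq 7 forever */
    static int get_manifest_root(struct root *root)
    {
            root->seq = 7;
            return 0;
    }

    /* stub: pretend the blocks under every root read as stale */
    static int read_through_root(struct root *root)
    {
            return -ESTALE;
    }

    static int read_items_model(void)
    {
            unsigned long long last_root_seq = 0;
            struct root root;
            int ret;

    retry_stale:
            ret = get_manifest_root(&root);
            if (ret)
                    return ret;

            ret = read_through_root(&root);
            if (ret == -ESTALE) {
                    /* keep trying as long as the root changes */
                    if (last_root_seq != root.seq) {
                            last_root_seq = root.seq;
                            goto retry_stale;
                    }
                    /* stale again under the same root: hard error */
                    ret = -EIO;
            }
            return ret;
    }

    int main(void)
    {
            /* stale reads of an unchanging root degrade to -EIO */
            printf("ret = %d\n", read_items_model());
            return 0;
    }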
Signed-off-by: Zach Brown
---
 kmod/src/btree.c    | 36 +++++++++++++++++-------------------
 kmod/src/compact.c  |  5 ++++-
 kmod/src/counters.h |  4 ++++
 kmod/src/manifest.c | 25 ++++++++++++++++++++++++-
 kmod/src/seg.c      | 18 +++++++++++++-----
 kmod/src/triggers.c |  4 +++-
 kmod/src/triggers.h |  4 +++-
 7 files changed, 68 insertions(+), 28 deletions(-)

diff --git a/kmod/src/btree.c b/kmod/src/btree.c
index 83b82894..34d4434a 100644
--- a/kmod/src/btree.c
+++ b/kmod/src/btree.c
@@ -24,6 +24,8 @@
 #include "key.h"
 #include "btree.h"
 #include "sort_priv.h"
+#include "counters.h"
+#include "triggers.h"
 
 #include "scoutfs_trace.h"
 
@@ -603,12 +605,10 @@ static bool valid_referenced_block(struct scoutfs_super_block *super,
  *
  * Btree blocks don't have rigid cache consistency. We can be following
  * a new root to read refs into previously stale cached blocks. If we
- * see that the block metadata doesn't match we first assume that we
- * just have a stale block and try and re-read it. If it still doesn't
- * match we assume that we're an reader racing with a writer overwriting
- * old blocks in the ring. We return an error that tells the caller to
- * deal with this error: either find a new root or return a hard error
- * if the block is really corrupt.
+ * hit a cached block that doesn't match the ref (or indeed a corrupt
+ * block) we return -ESTALE which tells the caller to deal with this
+ * error: either find a new root or return a hard error if the block is
+ * really corrupt.
  *
  * btree callers serialize concurrent writers in a btree but not between
  * btrees. We have to lock around the shared btree_info. Callers do
@@ -626,13 +626,11 @@ static int get_ref_block(struct super_block *sb, int flags,
 	struct scoutfs_btree_block *bt = NULL;
 	struct scoutfs_btree_block *new;
 	struct buffer_head *bh;
-	int retries = 1;
 	u64 blkno;
 	u64 seq;
 	int ret;
 	int i;
 
-retry:
 	/* always get the current block, either to return or cow from */
 	if (ref && ref->blkno) {
 		bh = sb_bread(sb, le64_to_cpu(ref->blkno));
@@ -642,17 +640,17 @@ retry:
 	}
 	bt = (void *)bh->b_data;
 
-	if (!valid_referenced_block(super, ref, bt, bh)) {
-		if (retries-- > 0) {
-			lock_buffer(bh);
-			clear_buffer_uptodate(bh);
-			unlock_buffer(bh);
-			put_bh(bh);
-			bt = NULL;
-			goto retry;
-		}
-		/* XXX let us know when we eventually hit this */
-		ret = WARN_ON_ONCE(-ESTALE);
+	if (!valid_referenced_block(super, ref, bt, bh) ||
+	    scoutfs_trigger(sb, BTREE_STALE_READ)) {
+
+		lock_buffer(bh);
+		clear_buffer_uptodate(bh);
+		unlock_buffer(bh);
+		put_bh(bh);
+		bt = NULL;
+
+		scoutfs_inc_counter(sb, btree_stale_read);
+		ret = -ESTALE;
 		goto out;
 	}
 
diff --git a/kmod/src/compact.c b/kmod/src/compact.c
index e7fdd551..428346a0 100644
--- a/kmod/src/compact.c
+++ b/kmod/src/compact.c
@@ -619,7 +619,10 @@ static void scoutfs_compact_func(struct work_struct *work)
 	free_cseg_list(sb, &curs.csegs);
 	free_cseg_list(sb, &results);
 
-	WARN_ON_ONCE(ret);
+	if (ret == -ESTALE)
+		scoutfs_inc_counter(sb, compact_stale_error);
+
+	WARN_ON_ONCE(ret && ret != -ESTALE);
 
 	trace_scoutfs_compact_func(sb, ret);
 }
diff --git a/kmod/src/counters.h b/kmod/src/counters.h
index 67ceba60..c2d3a453 100644
--- a/kmod/src/counters.h
+++ b/kmod/src/counters.h
@@ -14,11 +14,13 @@
 #define EXPAND_EACH_COUNTER \
 	EXPAND_COUNTER(alloc_alloc) \
 	EXPAND_COUNTER(alloc_free) \
+	EXPAND_COUNTER(btree_stale_read) \
 	EXPAND_COUNTER(compact_operations) \
 	EXPAND_COUNTER(compact_segment_moved) \
 	EXPAND_COUNTER(compact_segment_read) \
 	EXPAND_COUNTER(compact_segment_write_bytes) \
 	EXPAND_COUNTER(compact_segment_writes) \
+	EXPAND_COUNTER(compact_stale_error) \
 	EXPAND_COUNTER(compact_sticky_upper) \
 	EXPAND_COUNTER(compact_sticky_written) \
 	EXPAND_COUNTER(data_end_writeback_page) \
@@ -59,9 +61,11 @@
 	EXPAND_COUNTER(lock_free) \
 	EXPAND_COUNTER(lock_incompat_wait) \
 	EXPAND_COUNTER(manifest_compact_migrate) \
+	EXPAND_COUNTER(manifest_hard_stale_error) \
 	EXPAND_COUNTER(seg_alloc) \
 	EXPAND_COUNTER(seg_free) \
 	EXPAND_COUNTER(seg_shrink) \
+	EXPAND_COUNTER(seg_stale_read) \
 	EXPAND_COUNTER(trans_commit_fsync) \
 	EXPAND_COUNTER(trans_commit_full) \
 	EXPAND_COUNTER(trans_commit_item_flush) \
diff --git a/kmod/src/manifest.c b/kmod/src/manifest.c
index 2b3d59b0..f7cd99e6 100644
--- a/kmod/src/manifest.c
+++ b/kmod/src/manifest.c
@@ -26,6 +26,7 @@
 #include "manifest.h"
 #include "trans.h"
 #include "counters.h"
+#include "triggers.h"
 #include "client.h"
 
 #include "scoutfs_trace.h"
@@ -527,7 +528,6 @@ out:
 	scoutfs_btree_put_iref(&iref);
 	scoutfs_btree_put_iref(&prev);
 	kfree(mkey);
-	BUG_ON(ret == -ESTALE); /* XXX caller needs to retry or return error */
 
 	return ret;
 }
@@ -575,8 +575,10 @@ static int read_items(struct super_block *sb, struct scoutfs_key_buf *key,
 	struct scoutfs_segment *seg;
 	struct manifest_ref *ref;
 	struct manifest_ref *tmp;
+	__le64 last_root_seq;
 	LIST_HEAD(ref_list);
 	LIST_HEAD(batch);
+	bool force_hard;
 	u8 found_flags = 0;
 	u8 item_flags;
 	int found_ctr;
@@ -604,6 +606,8 @@ static int read_items(struct super_block *sb, struct scoutfs_key_buf *key,
 	 * either get a manifest ref in the lvb of their lock or they'll
 	 * ask the server the first time the system sees the lock.
 	 */
+	last_root_seq = 0;
+retry_stale:
 	ret = scoutfs_client_get_manifest_root(sb, &root);
 	if (ret)
 		goto out;
@@ -771,6 +775,25 @@ out:
 		free_ref(sb, ref);
 	}
 
+	/*
+	 * Resample the root and retry reads as long as we see
+	 * inconsistent blocks/segments and new roots to read through.
+	 * Persistent inconsistency in the same root is seen as corrupt
+	 * structures instead.
+	 */
+	force_hard = scoutfs_trigger(sb, HARD_STALE_ERROR);
+	if (ret == -ESTALE || force_hard) {
+		/* keep trying as long as the root changes */
+		if ((last_root_seq != root.ref.seq) && !force_hard) {
+			last_root_seq = root.ref.seq;
+			goto retry_stale;
+		}
+
+		/* persistent error */
+		scoutfs_inc_counter(sb, manifest_hard_stale_error);
+		ret = -EIO;
+	}
+
 	return ret;
 }
 
diff --git a/kmod/src/seg.c b/kmod/src/seg.c
index 95624332..cdb79b2b 100644
--- a/kmod/src/seg.c
+++ b/kmod/src/seg.c
@@ -26,6 +26,7 @@
 #include "alloc.h"
 #include "key.h"
 #include "counters.h"
+#include "triggers.h"
 #include "scoutfs_trace.h"
 
 /*
@@ -377,6 +378,8 @@ int scoutfs_seg_wait(struct super_block *sb, struct scoutfs_segment *seg,
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct segment_cache *cac = sbi->segment_cache;
 	struct scoutfs_segment_block *sblk = off_ptr(seg, 0);
+	unsigned long flags;
+	bool erased;
 	int ret;
 
 	ret = wait_event_interruptible(cac->waitq,
@@ -392,12 +395,17 @@ int scoutfs_seg_wait(struct super_block *sb, struct scoutfs_segment *seg,
 	sblk = off_ptr(seg, 0);
 
 	if (WARN_ON_ONCE(segno != le64_to_cpu(sblk->segno)) ||
-	    WARN_ON_ONCE(seq != le64_to_cpu(sblk->seq))) {
-		ret = -ESTALE;
-		goto out;
-	}
+	    WARN_ON_ONCE(seq != le64_to_cpu(sblk->seq)) ||
+	    scoutfs_trigger(sb, SEG_STALE_READ)) {
+		spin_lock_irqsave(&cac->lock, flags);
+		erased = erase_seg(cac, seg);
+		spin_unlock_irqrestore(&cac->lock, flags);
+		if (erased)
+			scoutfs_seg_put(seg);
 
-	ret = 0;
+		scoutfs_inc_counter(sb, seg_stale_read);
+		ret = -ESTALE;
+	}
 out:
 	return ret;
 }
diff --git a/kmod/src/triggers.c b/kmod/src/triggers.c
index 64f83941..200d29e7 100644
--- a/kmod/src/triggers.c
+++ b/kmod/src/triggers.c
@@ -38,7 +38,9 @@ struct scoutfs_triggers {
 	struct scoutfs_triggers *name = SCOUTFS_SB(sb)->triggers
 
 static char *names[] = {
-	[SCOUTFS_TRIGGER_SOMETHING] = "something",
+	[SCOUTFS_TRIGGER_BTREE_STALE_READ] = "btree_stale_read",
+	[SCOUTFS_TRIGGER_HARD_STALE_ERROR] = "hard_stale_error",
+	[SCOUTFS_TRIGGER_SEG_STALE_READ] = "seg_stale_read",
 };
 
 bool scoutfs_trigger_test_and_clear(struct super_block *sb, unsigned int t)
diff --git a/kmod/src/triggers.h b/kmod/src/triggers.h
index 48920844..1b802c5e 100644
--- a/kmod/src/triggers.h
+++ b/kmod/src/triggers.h
@@ -2,7 +2,9 @@
 #define _SCOUTFS_TRIGGERS_H_
 
 enum {
-	SCOUTFS_TRIGGER_SOMETHING,
+	SCOUTFS_TRIGGER_BTREE_STALE_READ,
+	SCOUTFS_TRIGGER_HARD_STALE_ERROR,
+	SCOUTFS_TRIGGER_SEG_STALE_READ,
 	SCOUTFS_TRIGGER_NR,
 };
 