scoutfs: retry stale btree and segment reads

We don't have strict consistency protocols protecting the "physical"
caches that hold btree blocks and segments.  We have metadata that tells
a reader that it's hit a stale cached entry and needs to invalidate and
read the current version from the media.

This implements the retrying.  If we get stale sequence numbers in
segments or btree blocks we invalidate them from the cache and return
-ESTALE.
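
As a rough illustration of the check being described, here is a
simplified stand-in for the valid_referenced_block() test in the btree
diff below; the struct and field names are approximations, not taken
from this commit.  A cached block is only trusted if it still matches
the ref that led to it:

/*
 * Sketch only: a cached btree block is usable iff it still carries the
 * blkno and seq recorded in the ref we followed to reach it.  The
 * hdr.blkno/hdr.seq fields are assumptions about the block header.
 */
static bool cached_block_matches_ref(struct scoutfs_btree_ref *ref,
				     struct scoutfs_btree_block *bt)
{
	return bt->hdr.blkno == ref->blkno &&
	       bt->hdr.seq == ref->seq;
}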

This can only happen when reading structures that could have been
modified remotely.  This means btree reads on the clients and segment
reads for everyone.  Btree reads on the server are always consistent
because the server is the only writer.

Adding retrying to item reading and compaction catches all of these
cases.

Stale reads are usually triggered by transient inconsistency, but the
same symptom can come from real corruption on the media.  Callers need
to be careful to turn their retries into hard errors when the staleness
persists.  Item reading can do this because it knows the btree root seq
that anchored the walk: if the root hasn't changed and a read is still
stale then the structures are corrupt.  Compaction doesn't do this
today; that gets addressed in a big sweep of error handling at some
point in the not too distant future.
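
A minimal sketch of that retry policy, mirroring the read_items()
change below.  Only scoutfs_client_get_manifest_root() and root.ref.seq
appear in this commit; the root type and the read_under_root() helper
are hypothetical stand-ins:

/*
 * Sketch only: retry -ESTALE reads while the manifest root keeps
 * changing.  Seeing -ESTALE again under the same root seq means we
 * aren't racing a writer, so it becomes a hard error.
 */
static int read_with_stale_retry(struct super_block *sb,
				 struct scoutfs_key_buf *key)
{
	struct scoutfs_btree_root root;
	__le64 last_root_seq = 0;
	int ret;

	for (;;) {
		ret = scoutfs_client_get_manifest_root(sb, &root);
		if (ret)
			return ret;

		ret = read_under_root(sb, &root, key);	/* hypothetical */
		if (ret != -ESTALE)
			return ret;

		if (last_root_seq == root.ref.seq)
			return -EIO;	/* same root, still stale */
		last_root_seq = root.ref.seq;
	}
}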

Signed-off-by: Zach Brown <zab@versity.com>
Author: Zach Brown
Date: 2017-12-18 09:46:08 -08:00
Committer: Mark Fasheh
Parent: e354fd18b1
Commit: 829126790b
7 changed files with 68 additions and 28 deletions

@@ -24,6 +24,8 @@
#include "key.h"
#include "btree.h"
#include "sort_priv.h"
#include "counters.h"
#include "triggers.h"
#include "scoutfs_trace.h"
@@ -603,12 +605,10 @@ static bool valid_referenced_block(struct scoutfs_super_block *super,
*
* Btree blocks don't have rigid cache consistency. We can be following
* a new root to read refs into previously stale cached blocks. If we
* see that the block metadata doesn't match we first assume that we
* just have a stale block and try and re-read it. If it still doesn't
* match we assume that we're an reader racing with a writer overwriting
* old blocks in the ring. We return an error that tells the caller to
* deal with this error: either find a new root or return a hard error
* if the block is really corrupt.
* hit a cached block that doesn't match the ref (or indeed a corrupt
* block) we return -ESTALE which tells the caller to deal with this
* error: either find a new root or return a hard error if the block is
* really corrupt.
*
* btree callers serialize concurrent writers in a btree but not between
* btrees. We have to lock around the shared btree_info. Callers do
@@ -626,13 +626,11 @@ static int get_ref_block(struct super_block *sb, int flags,
struct scoutfs_btree_block *bt = NULL;
struct scoutfs_btree_block *new;
struct buffer_head *bh;
int retries = 1;
u64 blkno;
u64 seq;
int ret;
int i;
retry:
/* always get the current block, either to return or cow from */
if (ref && ref->blkno) {
bh = sb_bread(sb, le64_to_cpu(ref->blkno));
@@ -642,17 +640,17 @@ retry:
}
bt = (void *)bh->b_data;
if (!valid_referenced_block(super, ref, bt, bh)) {
if (retries-- > 0) {
lock_buffer(bh);
clear_buffer_uptodate(bh);
unlock_buffer(bh);
put_bh(bh);
bt = NULL;
goto retry;
}
/* XXX let us know when we eventually hit this */
ret = WARN_ON_ONCE(-ESTALE);
if (!valid_referenced_block(super, ref, bt, bh) ||
scoutfs_trigger(sb, BTREE_STALE_READ)) {
lock_buffer(bh);
clear_buffer_uptodate(bh);
unlock_buffer(bh);
put_bh(bh);
bt = NULL;
scoutfs_inc_counter(sb, btree_stale_read);
ret = -ESTALE;
goto out;
}
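
The invalidation sequence above is the stock buffer_head idiom for
forcing a re-read: clearing the uptodate bit means the next sb_bread()
of that blkno goes back to the media instead of returning the cached
contents.  Factored out as a sketch:

/*
 * Sketch: drop a cached metadata buffer so the next sb_bread() of this
 * blkno reads from the media again, mirroring the inline code above.
 */
static void invalidate_stale_bh(struct buffer_head *bh)
{
	lock_buffer(bh);
	clear_buffer_uptodate(bh);
	unlock_buffer(bh);
	put_bh(bh);
}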

@@ -619,7 +619,10 @@ static void scoutfs_compact_func(struct work_struct *work)
free_cseg_list(sb, &curs.csegs);
free_cseg_list(sb, &results);
WARN_ON_ONCE(ret);
if (ret == -ESTALE)
scoutfs_inc_counter(sb, compact_stale_error);
WARN_ON_ONCE(ret && ret != -ESTALE);
trace_scoutfs_compact_func(sb, ret);
}

@@ -14,11 +14,13 @@
#define EXPAND_EACH_COUNTER \
EXPAND_COUNTER(alloc_alloc) \
EXPAND_COUNTER(alloc_free) \
EXPAND_COUNTER(btree_stale_read) \
EXPAND_COUNTER(compact_operations) \
EXPAND_COUNTER(compact_segment_moved) \
EXPAND_COUNTER(compact_segment_read) \
EXPAND_COUNTER(compact_segment_write_bytes) \
EXPAND_COUNTER(compact_segment_writes) \
EXPAND_COUNTER(compact_stale_error) \
EXPAND_COUNTER(compact_sticky_upper) \
EXPAND_COUNTER(compact_sticky_written) \
EXPAND_COUNTER(data_end_writeback_page) \
@@ -59,9 +61,11 @@
EXPAND_COUNTER(lock_free) \
EXPAND_COUNTER(lock_incompat_wait) \
EXPAND_COUNTER(manifest_compact_migrate) \
EXPAND_COUNTER(manifest_hard_stale_error) \
EXPAND_COUNTER(seg_alloc) \
EXPAND_COUNTER(seg_free) \
EXPAND_COUNTER(seg_shrink) \
EXPAND_COUNTER(seg_stale_read) \
EXPAND_COUNTER(trans_commit_fsync) \
EXPAND_COUNTER(trans_commit_full) \
EXPAND_COUNTER(trans_commit_item_flush) \

@@ -26,6 +26,7 @@
#include "manifest.h"
#include "trans.h"
#include "counters.h"
#include "triggers.h"
#include "client.h"
#include "scoutfs_trace.h"
@@ -527,7 +528,6 @@ out:
scoutfs_btree_put_iref(&iref);
scoutfs_btree_put_iref(&prev);
kfree(mkey);
BUG_ON(ret == -ESTALE); /* XXX caller needs to retry or return error */
return ret;
}
@@ -575,8 +575,10 @@ static int read_items(struct super_block *sb, struct scoutfs_key_buf *key,
struct scoutfs_segment *seg;
struct manifest_ref *ref;
struct manifest_ref *tmp;
__le64 last_root_seq;
LIST_HEAD(ref_list);
LIST_HEAD(batch);
bool force_hard;
u8 found_flags = 0;
u8 item_flags;
int found_ctr;
@@ -604,6 +606,8 @@ static int read_items(struct super_block *sb, struct scoutfs_key_buf *key,
* either get a manifest ref in the lvb of their lock or they'll
* ask the server the first time the system sees the lock.
*/
last_root_seq = 0;
retry_stale:
ret = scoutfs_client_get_manifest_root(sb, &root);
if (ret)
goto out;
@@ -771,6 +775,25 @@ out:
free_ref(sb, ref);
}
/*
* Resample the root and retry reads as long as we see
* inconsistent blocks/segments and new roots to read through.
* Persistent inconsistency in the same root is seen as corrupt
* structures instead.
*/
force_hard = scoutfs_trigger(sb, HARD_STALE_ERROR);
if (ret == -ESTALE || force_hard) {
/* keep trying as long as the root changes */
if ((last_root_seq != root.ref.seq) && !force_hard) {
last_root_seq = root.ref.seq;
goto retry_stale;
}
/* persistent error */
scoutfs_inc_counter(sb, manifest_hard_stale_error);
ret = -EIO;
}
return ret;
}

@@ -26,6 +26,7 @@
#include "alloc.h"
#include "key.h"
#include "counters.h"
#include "triggers.h"
#include "scoutfs_trace.h"
/*
@@ -377,6 +378,8 @@ int scoutfs_seg_wait(struct super_block *sb, struct scoutfs_segment *seg,
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct segment_cache *cac = sbi->segment_cache;
struct scoutfs_segment_block *sblk = off_ptr(seg, 0);
unsigned long flags;
bool erased;
int ret;
ret = wait_event_interruptible(cac->waitq,
@@ -392,12 +395,17 @@ int scoutfs_seg_wait(struct super_block *sb, struct scoutfs_segment *seg,
sblk = off_ptr(seg, 0);
if (WARN_ON_ONCE(segno != le64_to_cpu(sblk->segno)) ||
WARN_ON_ONCE(seq != le64_to_cpu(sblk->seq))) {
ret = -ESTALE;
goto out;
}
WARN_ON_ONCE(seq != le64_to_cpu(sblk->seq)) ||
scoutfs_trigger(sb, SEG_STALE_READ)) {
spin_lock_irqsave(&cac->lock, flags);
erased = erase_seg(cac, seg);
spin_unlock_irqrestore(&cac->lock, flags);
if (erased)
scoutfs_seg_put(seg);
ret = 0;
scoutfs_inc_counter(sb, seg_stale_read);
ret = -ESTALE;
}
out:
return ret;
}

@@ -38,7 +38,9 @@ struct scoutfs_triggers {
struct scoutfs_triggers *name = SCOUTFS_SB(sb)->triggers
static char *names[] = {
[SCOUTFS_TRIGGER_SOMETHING] = "something",
[SCOUTFS_TRIGGER_BTREE_STALE_READ] = "btree_stale_read",
[SCOUTFS_TRIGGER_HARD_STALE_ERROR] = "hard_stale_error",
[SCOUTFS_TRIGGER_SEG_STALE_READ] = "seg_stale_read",
};
bool scoutfs_trigger_test_and_clear(struct super_block *sb, unsigned int t)

@@ -2,7 +2,9 @@
#define _SCOUTFS_TRIGGERS_H_
enum {
SCOUTFS_TRIGGER_SOMETHING,
SCOUTFS_TRIGGER_BTREE_STALE_READ,
SCOUTFS_TRIGGER_HARD_STALE_ERROR,
SCOUTFS_TRIGGER_SEG_STALE_READ,
SCOUTFS_TRIGGER_NR,
};
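
The btree and segment hunks call scoutfs_trigger(sb, BTREE_STALE_READ)
and scoutfs_trigger(sb, SEG_STALE_READ) while the trigger plumbing
shown here only exposes scoutfs_trigger_test_and_clear(), so presumably
triggers.h also carries a small wrapper along these lines (an
assumption, not part of this diff):

/* assumed convenience wrapper: paste the short trigger name onto the
 * enum prefix and consume the trigger if it was armed */
#define scoutfs_trigger(sb, which) \
	scoutfs_trigger_test_and_clear(sb, SCOUTFS_TRIGGER_##which)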