From 045380ca55b316f286e56978c33e8bcb1c71bb21 Mon Sep 17 00:00:00 2001
From: Zach Brown
Date: Wed, 28 Mar 2018 08:58:07 -0700
Subject: [PATCH] scoutfs: don't negatively cache unread segments

Previously we changed item reading to try to read from the start of its
locked range instead of from the key that wasn't found in the cache.
This greatly improved the performance of access patterns that didn't
proceed in key order.

We rightly shrank the range of items that we'd claim to cache based on
the segments that we read.  But we missed the case where our search key
falls between two segments and we chose to read the next segment
instead of the previous.  If the previous segment in this case
overlapped with the lock range then we were claiming to cache the
segment's contents but weren't reading it.  This would result in bad
negative caching of items that existed.

scoutfs/500 was tripping over this as it tried to rename a file created
by another node.  The local renaming node would look up a key that only
existed in level 0 and would negatively cache the items in the previous
level 1 segment without reading it.

We fix this by shrinking the caching range down where we're considering
manifest entries instead of up where we process each segment read,
because we have to shrink based on the segments in the manifest, not
the ones we chose to read.  With this fixed the rename can see those
items in the level 1 segment again.

Signed-off-by: Zach Brown
---
 kmod/src/manifest.c | 98 ++++++++++++++++++++++++++++-----------------
 1 file changed, 61 insertions(+), 37 deletions(-)

diff --git a/kmod/src/manifest.c b/kmod/src/manifest.c
index a41ba62a..d667f339 100644
--- a/kmod/src/manifest.c
+++ b/kmod/src/manifest.c
@@ -285,33 +285,53 @@ static int alloc_manifest_ref(struct super_block *sb, struct list_head *ref_list
 }
 
 /*
- * Return the previous entry if it's in the right level and it overlaps
- * with the start key by having a last key that's >=.  If no such entry
- * exists it just returns the next entry after the key and doesn't test
- * it at all.  If this returns 0 then the caller has to put the iref.
+ * Give the caller the next entry that overlaps with the given key at
+ * the given level.  We first check the previous entry before the key to
+ * see if it overlaps.  If it does then we return it.  If it doesn't
+ * then we return the raw next entry after the key.  The caller has to
+ * test it.
+ *
+ * If a start key is provided then the caller is working with cache
+ * ranges.  If we find a previous entry that doesn't contain the key
+ * then we see if we should shrink the range to make sure that it
+ * doesn't include this segment whose items we're not using.
+ *
+ * Returns 0 with the iref pointing to the btree item with the entry,
+ * the caller has to put the iref when they're done.
  */
 static int btree_prev_overlap_or_next(struct super_block *sb,
                                       struct scoutfs_btree_root *root,
-                                      void *key, unsigned key_len,
+                                      void *bkey, unsigned bkey_len,
+                                      struct scoutfs_key *key,
                                       struct scoutfs_key *start, u8 level,
                                       struct scoutfs_btree_item_ref *iref)
 {
         struct scoutfs_manifest_entry ment;
         int ret;
 
-        ret = scoutfs_btree_prev(sb, root, key, key_len, iref);
+        ret = scoutfs_btree_prev(sb, root, bkey, bkey_len, iref);
         if (ret < 0 && ret != -ENOENT)
                 return ret;
 
         if (ret == 0) {
                 init_ment_iref(&ment, iref);
+
+                /* shrink range so it doesn't cover skipped prev */
+                if (start && ment.level == level &&
+                    scoutfs_key_compare(&ment.last, key) < 0 &&
+                    scoutfs_key_compare(&ment.last, start) >= 0) {
+                        *start = ment.last;
+                        scoutfs_key_inc(start);
+                }
+
+                /* skip prev that doesn't contain the key */
                 if (ment.level != level ||
-                    scoutfs_key_compare(&ment.last, start) < 0)
+                    scoutfs_key_compare(&ment.last, key) < 0)
                         ret = -ENOENT;
         }
         if (ret == -ENOENT) {
                 scoutfs_btree_put_iref(iref);
-                ret = scoutfs_btree_next(sb, root, key, key_len, iref);
+                ret = scoutfs_btree_next(sb, root, bkey, bkey_len, iref);
         }
 
         return ret;
@@ -381,19 +401,26 @@ static int get_zero_refs(struct super_block *sb,
 
 /*
  * Get references to all segments in non-zero levels that contain the
- * caller's search key.  The item ranges of segments at each non-zero
- * level don't overlap so we can iterate through the key space in each
- * segment starting with the search key.  In each level we need the
- * first existing segment that intersects with the range, even if it
- * doesn't contain the key.  The key might fall between segments at that
- * level.  If a segment is entirely outside of the caller's range then
- * we can't trust its contents.
+ * caller's key.  The item ranges of segments at each non-zero level
+ * don't overlap so we can iterate through the key space in each segment
+ * starting with the search key.  In each level we need the first
+ * existing segment that intersects with the range, even if it doesn't
+ * contain the key.  The key might fall between segments at that level.
+ *
+ * The caller can provide the range of items that they're going to
+ * consider authoritative for the range of segments that we give them.
+ * We have to shrink this range if we give them segments that don't
+ * cover the range.  This includes implicitly negative cached space
+ * that's created by using the segment after the hole between segments.
+ * If a segment is entirely outside of the caller's range then we can't
+ * trust its contents.
  *
  * This can return -ESTALE if it reads through stale btree blocks.
  */
 static int get_nonzero_refs(struct super_block *sb,
                             struct scoutfs_btree_root *root,
                             struct scoutfs_key *key,
+                            struct scoutfs_key *start,
                             struct scoutfs_key *end,
                             struct list_head *ref_list)
 {
@@ -403,11 +430,15 @@ static int get_nonzero_refs(struct super_block *sb,
         int ret;
         int i;
 
+        if (WARN_ON_ONCE(!!start != !!end) ||
+            WARN_ON_ONCE(start && scoutfs_key_compare(start, end) > 0))
+                return -EINVAL;
+
         for (i = 1; ; i++) {
                 init_btree_key(&mkey, i, 0, key);
 
                 ret = btree_prev_overlap_or_next(sb, root, &mkey, sizeof(mkey),
-                                                 key, i, &iref);
+                                                 key, start, i, &iref);
                 if (ret < 0) {
                         if (ret == -ENOENT)
                                 ret = 0;
@@ -418,12 +449,20 @@ static int get_nonzero_refs(struct super_block *sb,
                 scoutfs_btree_put_iref(&iref);
 
                 if (ment.level != i ||
-                    scoutfs_key_compare(&ment.first, end) > 0)
+                    (end && scoutfs_key_compare(&ment.first, end) > 0))
                         continue;
 
                 ret = alloc_manifest_ref(sb, ref_list, &ment);
                 if (ret)
                         break;
+
+                if (start && scoutfs_key_compare(&ment.first, start) > 0 &&
+                    scoutfs_key_compare(&ment.first, key) <= 0)
+                        *start = ment.first;
+
+                if (end && scoutfs_key_compare(&ment.last, end) < 0 &&
+                    scoutfs_key_compare(&ment.last, key) >= 0)
+                        *end = ment.last;
         }
 
         return ret;
@@ -551,23 +590,11 @@ retry_stale:
         if (ret)
                 goto out;
 
-        /* get non-zero segments that intersect with the missed key */
-        ret = get_nonzero_refs(sb, &root, key, &seg_end, &ref_list);
+        /* get non-zero segments that intersect with the key, shrinks range */
+        ret = get_nonzero_refs(sb, &root, key, &seg_start, &seg_end, &ref_list);
         if (ret)
                 goto out;
 
-        /* clamp start and end to the segment boundaries, including key */
-        list_for_each_entry(ref, &ref_list, entry) {
-
-                if (scoutfs_key_compare(&ref->first, &seg_start) > 0 &&
-                    scoutfs_key_compare(&ref->first, key) <= 0)
-                        seg_start = ref->first;
-
-                if (scoutfs_key_compare(&ref->last, &seg_end) < 0 &&
-                    scoutfs_key_compare(&ref->last, key) >= 0)
-                        seg_end = ref->last;
-        }
-
         trace_scoutfs_read_item_keys(sb, key, start, end, &seg_start, &seg_end);
 
         /* then get level 0s that intersect with our search range */
@@ -747,7 +774,6 @@ int scoutfs_manifest_next_key(struct super_block *sb, struct scoutfs_key *key,
                               struct scoutfs_key *next_key)
 {
         struct scoutfs_key item_key;
-        struct scoutfs_key end;
         struct scoutfs_btree_root root;
         struct scoutfs_segment *seg;
         struct manifest_ref *ref;
@@ -764,10 +790,8 @@ retry_stale:
         if (ret)
                 goto out;
 
-        scoutfs_key_set_ones(&end);
-
-        ret = get_zero_refs(sb, &root, key, &end, &ref_list) ?:
-              get_nonzero_refs(sb, &root, key, &end, &ref_list);
+        ret = get_zero_refs(sb, &root, key, key, &ref_list) ?:
+              get_nonzero_refs(sb, &root, key, NULL, NULL, &ref_list);
         if (ret)
                 goto out;
 
@@ -975,7 +999,7 @@ again:
         init_btree_key(&mkey, level + 1, 0, &ment.first);
         ret = btree_prev_overlap_or_next(sb, &super->manifest.root,
                                          &mkey, sizeof(mkey), &ment.first,
-                                         level + 1, &over_iref);
+                                         NULL, level + 1, &over_iref);
         sticky = false;
         for (i = 0; ret == 0 && i < SCOUTFS_MANIFEST_FANOUT + 1; i++) {
                 init_ment_iref(&over, &over_iref);
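
For illustration, here is a minimal standalone sketch (not part of the patch)
of the range-shrinking rule the commit message describes.  It uses plain u64
keys and hypothetical helper names (shrink_start_for_skipped_prev(),
clamp_range_to_segment()) in place of struct scoutfs_key and the manifest
code; it only shows why a previous segment that we skip must be excluded from
the cached range, while segments we do read clamp the range to their bounds.

#include <stdint.h>
#include <stdio.h>

typedef uint64_t u64;

/* simplified stand-in for a manifest entry's key range */
struct seg_range {
        u64 first;
        u64 last;
};

/*
 * The previous segment at this level ends before the search key, so we
 * won't read it.  If the cached range would otherwise cover it, pull
 * *start up past its last key so its items aren't negatively cached.
 */
static void shrink_start_for_skipped_prev(const struct seg_range *prev,
                                          u64 key, u64 *start)
{
        if (prev->last < key && prev->last >= *start)
                *start = prev->last + 1;        /* scoutfs_key_inc() analogue */
}

/*
 * A segment we will read covers the key on one side: clamp the cached
 * range to its bounds so we never claim keys it doesn't cover.
 */
static void clamp_range_to_segment(const struct seg_range *seg,
                                   u64 key, u64 *start, u64 *end)
{
        if (seg->first > *start && seg->first <= key)
                *start = seg->first;
        if (seg->last < *end && seg->last >= key)
                *end = seg->last;
}

int main(void)
{
        struct seg_range prev = { .first = 10, .last = 40 };    /* skipped */
        struct seg_range next = { .first = 60, .last = 90 };    /* read */
        u64 key = 50, start = 0, end = 100;

        shrink_start_for_skipped_prev(&prev, key, &start);
        clamp_range_to_segment(&next, key, &start, &end);

        /* prints "cache range: [41, 90]"; the skipped segment is excluded */
        printf("cache range: [%llu, %llu]\n",
               (unsigned long long)start, (unsigned long long)end);
        return 0;
}

Without the shrink_start_for_skipped_prev() step the range would stay at
[0, 90], and keys 10 through 40, which exist in the unread previous segment,
would be negatively cached; that is the bad negative caching the patch fixes.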