From ca526e2bc0d7b2d1e4b48e460492c9a9003476cd Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Wed, 27 Apr 2022 11:25:31 -0700 Subject: [PATCH 1/2] Lock recovery uses old mode while invalidating When a new server starts up it rebuilds its view of all the granted locks with lock recovery messages. Clients give the server their granted lock modes which the server then uses to process all the resent lock requests from clients. The lock invalidation work in the client is responsible for transitioning an old granted mode to a new invalidated mode from an unsolicited message from the server. It has to process any client state that'd be incompatible with the new mode (write dirty data, drop caches). While it is doing this work, as an implementation short cut, it sets the granted lock mode to the new mode so that users that are compatible with the new invalidated mode can use the lock while it's being invalidated. Picture readers reading data while a write lock is invalidating and writing dirty data. A problem arises when a lock recover request is processed during lock invalidation. The client lock recover request handler sends a response with the current granted mode. The server takes this to mean that the invalidation is done but the client invalidation worker might still be writing data, dropping caches, etc. The server will allow the state machine to advance which can send grants to pending client requests which believed that the invalidation was done. All of this can lead to a grant response handler in the client tripping the assertion that there can not be cached items that were incompatible with the old mode in a grant from the server. Invalidation might still be invalidating caches. Hitting this bug is very rare and requires a new server starting up while a client has both a request outstanding and an invalidation being processed when the lock recover request arrives. The fix is to record the old mode during invalidation and send that in lock recover responses. 
This can lead the lock server to resend invalidation requests to the client. The client already safely handles duplicate invalidation requests from other failover cases. Signed-off-by: Zach Brown --- kmod/src/lock.c | 20 ++++++++++++++++---- kmod/src/lock.h | 1 + 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/kmod/src/lock.c b/kmod/src/lock.c index b04e18ee..9275e3a4 100644 --- a/kmod/src/lock.c +++ b/kmod/src/lock.c @@ -289,6 +289,7 @@ static struct scoutfs_lock *lock_alloc(struct super_block *sb, lock->sb = sb; init_waitqueue_head(&lock->waitq); lock->mode = SCOUTFS_LOCK_NULL; + lock->invalidating_mode = SCOUTFS_LOCK_NULL; atomic64_set(&lock->forest_bloom_nr, 0); @@ -666,7 +667,9 @@ struct inv_req { * * Before we start invalidating the lock we set the lock to the new * mode, preventing further incompatible users of the old mode from - * using the lock while we're invalidating. + * using the lock while we're invalidating. We record the previously + * granted mode so that we can send lock recover responses with the old + * granted mode during invalidation. 
*/ static void lock_invalidate_worker(struct work_struct *work) { @@ -691,7 +694,8 @@ static void lock_invalidate_worker(struct work_struct *work) if (!lock_counts_match(nl->new_mode, lock->users)) continue; - /* set the new mode, no incompatible users during inval */ + /* set the new mode, no incompatible users during inval, recov needs old */ + lock->invalidating_mode = lock->mode; lock->mode = nl->new_mode; /* move everyone that's ready to our private list */ @@ -734,6 +738,8 @@ static void lock_invalidate_worker(struct work_struct *work) list_del(&ireq->head); kfree(ireq); + lock->invalidating_mode = SCOUTFS_LOCK_NULL; + if (list_empty(&lock->inv_list)) { /* finish if another request didn't arrive */ list_del_init(&lock->inv_head); @@ -824,6 +830,7 @@ int scoutfs_lock_recover_request(struct super_block *sb, u64 net_id, { DECLARE_LOCK_INFO(sb, linfo); struct scoutfs_net_lock_recover *nlr; + enum scoutfs_lock_mode mode; struct scoutfs_lock *lock; struct scoutfs_lock *next; struct rb_node *node; @@ -844,10 +851,15 @@ int scoutfs_lock_recover_request(struct super_block *sb, u64 net_id, for (i = 0; lock && i < SCOUTFS_NET_LOCK_MAX_RECOVER_NR; i++) { + if (lock->invalidating_mode != SCOUTFS_LOCK_NULL) + mode = lock->invalidating_mode; + else + mode = lock->mode; + nlr->locks[i].key = lock->start; nlr->locks[i].write_seq = cpu_to_le64(lock->write_seq); - nlr->locks[i].old_mode = lock->mode; - nlr->locks[i].new_mode = lock->mode; + nlr->locks[i].old_mode = mode; + nlr->locks[i].new_mode = mode; node = rb_next(&lock->node); if (node) diff --git a/kmod/src/lock.h b/kmod/src/lock.h index a1d9b903..204ee888 100644 --- a/kmod/src/lock.h +++ b/kmod/src/lock.h @@ -39,6 +39,7 @@ struct scoutfs_lock { struct list_head cov_list; enum scoutfs_lock_mode mode; + enum scoutfs_lock_mode invalidating_mode; unsigned int waiters[SCOUTFS_LOCK_NR_MODES]; unsigned int users[SCOUTFS_LOCK_NR_MODES]; From dff366e1a442e336f6a144e389c6388fede2fe51 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: 
Wed, 27 Apr 2022 11:55:06 -0700 Subject: [PATCH 2/2] Add lock invalidation and recovery test Add a test which tries to have lock recovery processed during lock invalidation on clients. Signed-off-by: Zach Brown --- tests/golden/lock-recover-invalidate | 3 ++ tests/golden/lock-rever-invalidate | 0 tests/sequence | 1 + tests/tests/lock-recover-invalidate.sh | 43 ++++++++++++++++++++++++++ 4 files changed, 47 insertions(+) create mode 100644 tests/golden/lock-recover-invalidate create mode 100644 tests/golden/lock-rever-invalidate create mode 100644 tests/tests/lock-recover-invalidate.sh diff --git a/tests/golden/lock-recover-invalidate b/tests/golden/lock-recover-invalidate new file mode 100644 index 00000000..af208381 --- /dev/null +++ b/tests/golden/lock-recover-invalidate @@ -0,0 +1,3 @@ +== starting background invalidating read/write load +== 60s of lock recovery during invalidating load +== stopping background load diff --git a/tests/golden/lock-rever-invalidate b/tests/golden/lock-rever-invalidate new file mode 100644 index 00000000..e69de29b diff --git a/tests/sequence b/tests/sequence index 40c82e52..37f30647 100644 --- a/tests/sequence +++ b/tests/sequence @@ -17,6 +17,7 @@ lock-refleak.sh lock-shrink-consistency.sh lock-pr-cw-conflict.sh lock-revoke-getcwd.sh +lock-recover-invalidate.sh export-lookup-evict-race.sh createmany-parallel.sh createmany-large-names.sh diff --git a/tests/tests/lock-recover-invalidate.sh b/tests/tests/lock-recover-invalidate.sh new file mode 100644 index 00000000..8f06b58c --- /dev/null +++ b/tests/tests/lock-recover-invalidate.sh @@ -0,0 +1,43 @@ +# +# trigger server failover and lock recovery during heavy invalidating +# load on multiple mounts +# + +majority_nr=$(t_majority_count) +quorum_nr=$T_QUORUM + +test "$quorum_nr" == "$majority_nr" && \ + t_skip "need remaining majority when leader unmounted" + +test "$T_NR_MOUNTS" -lt "$((quorum_nr + 2))" && \ + t_skip "need at least 2 non-quorum load mounts" + +echo "== starting 
background invalidating read/write load" +touch "$T_D0/file" +load_pids="" +for i in $(t_fs_nrs); do + if [ "$i" -ge "$quorum_nr" ]; then + eval path="\$T_D${i}/file" + + (while true; do touch $path > /dev/null 2>&1; done) & + load_pids="$load_pids $!" + (while true; do stat $path > /dev/null 2>&1; done) & + load_pids="$load_pids $!" + fi +done + +# had it reproduce in ~40s on wimpy debug kernel guests +LENGTH=60 +echo "== ${LENGTH}s of lock recovery during invalidating load" +END=$((SECONDS + LENGTH)) +while [ "$SECONDS" -lt "$END" ]; do + sv=$(t_server_nr) + t_umount $sv + t_mount $sv + # new server had to process greeting for mount to finish +done + +echo "== stopping background load" +kill $load_pids + +t_pass