From ca526e2bc0d7b2d1e4b48e460492c9a9003476cd Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Wed, 27 Apr 2022 11:25:31 -0700 Subject: [PATCH 1/2] Lock recovery uses old mode while invalidating When a new server starts up it rebuilds its view of all the granted locks with lock recovery messages. Clients give the server their granted lock modes which the server then uses to process all the resent lock requests from clients. The lock invalidation work in the client is responsible for transitioning an old granted mode to a new invalidated mode from an unsolicited message from the server. It has to process any client state that'd be incompatible with the new mode (write dirty data, drop caches). While it is doing this work, as an implementation short cut, it sets the granted lock mode to the new mode so that users that are compatible with the new invalidated mode can use the lock while it's being invalidated. Picture readers reading data while a write lock is invalidating and writing dirty data. A problem arises when a lock recover request is processed during lock invalidation. The client lock recover request handler sends a response with the current granted mode. The server takes this to mean that the invalidation is done but the client invalidation worker might still be writing data, dropping caches, etc. The server will allow the state machine to advance which can send grants to pending client requests which believed that the invalidation was done. All of this can lead to a grant response handler in the client tripping the assertion that there can not be cached items that were incompatible with the old mode in a grant from the server. Invalidation might still be invalidating caches. Hitting this bug is very rare and requires a new server starting up while a client has both a request outstanding and an invalidation being processed when the lock recover request arrives. The fix is to record the old mode during invalidation and send that in lock recover responses. 
This can lead the lock server to resend invalidation requests to the client. The client already safely handles duplicate invalidation requests from other failover cases. Signed-off-by: Zach Brown --- kmod/src/lock.c | 20 ++++++++++++++++---- kmod/src/lock.h | 1 + 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/kmod/src/lock.c b/kmod/src/lock.c index b04e18ee..9275e3a4 100644 --- a/kmod/src/lock.c +++ b/kmod/src/lock.c @@ -289,6 +289,7 @@ static struct scoutfs_lock *lock_alloc(struct super_block *sb, lock->sb = sb; init_waitqueue_head(&lock->waitq); lock->mode = SCOUTFS_LOCK_NULL; + lock->invalidating_mode = SCOUTFS_LOCK_NULL; atomic64_set(&lock->forest_bloom_nr, 0); @@ -666,7 +667,9 @@ struct inv_req { * * Before we start invalidating the lock we set the lock to the new * mode, preventing further incompatible users of the old mode from - * using the lock while we're invalidating. + * using the lock while we're invalidating. We record the previously + * granted mode so that we can send lock recover responses with the old + * granted mode during invalidation. 
*/ static void lock_invalidate_worker(struct work_struct *work) { @@ -691,7 +694,8 @@ static void lock_invalidate_worker(struct work_struct *work) if (!lock_counts_match(nl->new_mode, lock->users)) continue; - /* set the new mode, no incompatible users during inval */ + /* set the new mode, no incompatible users during inval, recov needs old */ + lock->invalidating_mode = lock->mode; lock->mode = nl->new_mode; /* move everyone that's ready to our private list */ @@ -734,6 +738,8 @@ static void lock_invalidate_worker(struct work_struct *work) list_del(&ireq->head); kfree(ireq); + lock->invalidating_mode = SCOUTFS_LOCK_NULL; + if (list_empty(&lock->inv_list)) { /* finish if another request didn't arrive */ list_del_init(&lock->inv_head); @@ -824,6 +830,7 @@ int scoutfs_lock_recover_request(struct super_block *sb, u64 net_id, { DECLARE_LOCK_INFO(sb, linfo); struct scoutfs_net_lock_recover *nlr; + enum scoutfs_lock_mode mode; struct scoutfs_lock *lock; struct scoutfs_lock *next; struct rb_node *node; @@ -844,10 +851,15 @@ int scoutfs_lock_recover_request(struct super_block *sb, u64 net_id, for (i = 0; lock && i < SCOUTFS_NET_LOCK_MAX_RECOVER_NR; i++) { + if (lock->invalidating_mode != SCOUTFS_LOCK_NULL) + mode = lock->invalidating_mode; + else + mode = lock->mode; + nlr->locks[i].key = lock->start; nlr->locks[i].write_seq = cpu_to_le64(lock->write_seq); - nlr->locks[i].old_mode = lock->mode; - nlr->locks[i].new_mode = lock->mode; + nlr->locks[i].old_mode = mode; + nlr->locks[i].new_mode = mode; node = rb_next(&lock->node); if (node) diff --git a/kmod/src/lock.h b/kmod/src/lock.h index a1d9b903..204ee888 100644 --- a/kmod/src/lock.h +++ b/kmod/src/lock.h @@ -39,6 +39,7 @@ struct scoutfs_lock { struct list_head cov_list; enum scoutfs_lock_mode mode; + enum scoutfs_lock_mode invalidating_mode; unsigned int waiters[SCOUTFS_LOCK_NR_MODES]; unsigned int users[SCOUTFS_LOCK_NR_MODES]; From dff366e1a442e336f6a144e389c6388fede2fe51 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: 
Wed, 27 Apr 2022 11:55:06 -0700 Subject: [PATCH 2/2] Add lock invalidation and recovery test Add a test which tries to have lock recovery processed during lock invalidation on clients. Signed-off-by: Zach Brown --- tests/golden/lock-recover-invalidate | 3 ++ tests/golden/lock-rever-invalidate | 0 tests/sequence | 1 + tests/tests/lock-recover-invalidate.sh | 43 ++++++++++++++++++++++++++ 4 files changed, 47 insertions(+) create mode 100644 tests/golden/lock-recover-invalidate create mode 100644 tests/golden/lock-rever-invalidate create mode 100644 tests/tests/lock-recover-invalidate.sh diff --git a/tests/golden/lock-recover-invalidate b/tests/golden/lock-recover-invalidate new file mode 100644 index 00000000..af208381 --- /dev/null +++ b/tests/golden/lock-recover-invalidate @@ -0,0 +1,3 @@ +== starting background invalidating read/write load +== 60s of lock recovery during invalidating load +== stopping background load diff --git a/tests/golden/lock-rever-invalidate b/tests/golden/lock-rever-invalidate new file mode 100644 index 00000000..e69de29b diff --git a/tests/sequence b/tests/sequence index 40c82e52..37f30647 100644 --- a/tests/sequence +++ b/tests/sequence @@ -17,6 +17,7 @@ lock-refleak.sh lock-shrink-consistency.sh lock-pr-cw-conflict.sh lock-revoke-getcwd.sh +lock-recover-invalidate.sh export-lookup-evict-race.sh createmany-parallel.sh createmany-large-names.sh diff --git a/tests/tests/lock-recover-invalidate.sh b/tests/tests/lock-recover-invalidate.sh new file mode 100644 index 00000000..8f06b58c --- /dev/null +++ b/tests/tests/lock-recover-invalidate.sh @@ -0,0 +1,43 @@ +# +# trigger server failover and lock recovery during heavy invalidating +# load on multiple mounts +# + +majority_nr=$(t_majority_count) +quorum_nr=$T_QUORUM + +test "$quorum_nr" == "$majority_nr" && \ + t_skip "need remaining majority when leader unmounted" + +test "$T_NR_MOUNTS" -lt "$((quorum_nr + 2))" && \ + t_skip "need at least 2 non-quorum load mounts" + +echo "== starting 
background invalidating read/write load" +touch "$T_D0/file" +load_pids="" +for i in $(t_fs_nrs); do + if [ "$i" -ge "$quorum_nr" ]; then + eval path="\$T_D${i}/file" + + (while true; do touch $path > /dev/null 2>&1; done) & + load_pids="$load_pids $!" + (while true; do stat $path > /dev/null 2>&1; done) & + load_pids="$load_pids $!" + fi +done + +# had it reproduce in ~40s on wimpy debug kernel guests +LENGTH=60 +echo "== ${LENGTH}s of lock recovery during invalidating load" +END=$((SECONDS + LENGTH)) +while [ "$SECONDS" -lt "$END" ]; do + sv=$(t_server_nr) + t_umount $sv + t_mount $sv + # new server had to process greeting for mount to finish +done + +echo "== stopping background load" +kill $load_pids + +t_pass