From 69d9040e6834ef633c6af21f408fd548ee145856 Mon Sep 17 00:00:00 2001
From: Zach Brown
Date: Wed, 30 Oct 2024 13:04:13 -0700
Subject: [PATCH 1/5] Close lock server use-after-free race

Lock object lifetimes in the lock server are protected by reference
counts.  References are acquired while holding a lock on an rbtree.
Unfortunately, the decision to free lock objects wasn't tested while
also holding that lock on the rbtree.  A caller putting their object
would test the refcount, then wait to get the rbtree lock to remove it
from the tree.

There's a possible race where the decision is made to remove the object
but another reference is added before the object is removed.  This was
seen in testing and manifested as an incoming request handling path
adding a request message to the object just before it was freed, losing
the message.  Clients would then hang on a lock that never saw a
response because their request was freed with the lock object.

The fix is to hold the rbtree lock when testing the refcount and
deciding to free.  It adds a bit more contention but not significantly
so, given the wild existing contention on a per-fs spinlocked rbtree.

Signed-off-by: Zach Brown
---
 kmod/src/lock_server.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/kmod/src/lock_server.c b/kmod/src/lock_server.c
index 56d20338..edb9f9a7 100644
--- a/kmod/src/lock_server.c
+++ b/kmod/src/lock_server.c
@@ -317,16 +317,18 @@ static void put_server_lock(struct lock_server_info *inf,
 
 	BUG_ON(!mutex_is_locked(&snode->mutex));
 
+	spin_lock(&inf->lock);
+
 	if (atomic_dec_and_test(&snode->refcount) &&
 	    list_empty(&snode->granted) &&
 	    list_empty(&snode->requested) &&
 	    list_empty(&snode->invalidated)) {
-		spin_lock(&inf->lock);
 		rb_erase(&snode->node, &inf->locks_root);
-		spin_unlock(&inf->lock);
 		should_free = true;
 	}
 
+	spin_unlock(&inf->lock);
+
 	mutex_unlock(&snode->mutex);
 
 	if (should_free) {
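A minimal sketch of the locking pattern the fix relies on, separate from
the patch itself: when an object is found through a locked lookup
structure, the decision to free it has to be made while that same lock
is held, otherwise a concurrent lookup can take a new reference on an
object that has already been chosen for freeing.  The kernel's
atomic_dec_and_lock() helper captures the simplest form of this; the
scoutfs path above also has to test its pending-message lists, so it
takes the spinlock unconditionally instead.  The cache and object types
here are hypothetical:

#include <linux/atomic.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

/* hypothetical object cache: lookups take references under cache->lock */
struct obj_cache {
	spinlock_t lock;
	struct rb_root root;
};

struct obj {
	struct rb_node node;
	atomic_t refcount;
};

/*
 * atomic_dec_and_lock() only returns true, with the lock held, when the
 * count reaches zero.  The zero test and the tree removal are therefore
 * atomic with respect to lookups that take new references under the
 * same lock, so a lookup can't revive an object we've decided to free.
 */
static void obj_put(struct obj_cache *cache, struct obj *obj)
{
	if (atomic_dec_and_lock(&obj->refcount, &cache->lock)) {
		rb_erase(&obj->node, &cache->root);
		spin_unlock(&cache->lock);
		kfree(obj);
	}
}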
From 5a6eb569f3295619f26a607da2676dd903aedf1c Mon Sep 17 00:00:00 2001
From: Zach Brown
Date: Wed, 30 Oct 2024 13:16:04 -0700
Subject: [PATCH 2/5] Add some lock debugging trace fields

Over time some fields have been added to the lock struct which haven't
been added to the lock tracing output.  Add some of the more relevant
lock fields to tracing.

Signed-off-by: Zach Brown
---
 kmod/src/scoutfs_trace.h | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/kmod/src/scoutfs_trace.h b/kmod/src/scoutfs_trace.h
index ceb538ba..e9c09750 100644
--- a/kmod/src/scoutfs_trace.h
+++ b/kmod/src/scoutfs_trace.h
@@ -1046,9 +1046,12 @@ DECLARE_EVENT_CLASS(scoutfs_lock_class,
 		sk_trace_define(start)
 		sk_trace_define(end)
 		__field(u64, refresh_gen)
+		__field(u64, write_seq)
+		__field(u64, dirty_trans_seq)
 		__field(unsigned char, request_pending)
 		__field(unsigned char, invalidate_pending)
 		__field(int, mode)
+		__field(int, invalidating_mode)
 		__field(unsigned int, waiters_cw)
 		__field(unsigned int, waiters_pr)
 		__field(unsigned int, waiters_ex)
@@ -1061,9 +1064,12 @@ DECLARE_EVENT_CLASS(scoutfs_lock_class,
 		sk_trace_assign(start, &lck->start);
 		sk_trace_assign(end, &lck->end);
 		__entry->refresh_gen = lck->refresh_gen;
+		__entry->write_seq = lck->write_seq;
+		__entry->dirty_trans_seq = lck->dirty_trans_seq;
 		__entry->request_pending = lck->request_pending;
 		__entry->invalidate_pending = lck->invalidate_pending;
 		__entry->mode = lck->mode;
+		__entry->invalidating_mode = lck->invalidating_mode;
 		__entry->waiters_pr = lck->waiters[SCOUTFS_LOCK_READ];
 		__entry->waiters_ex = lck->waiters[SCOUTFS_LOCK_WRITE];
 		__entry->waiters_cw = lck->waiters[SCOUTFS_LOCK_WRITE_ONLY];
@@ -1071,10 +1077,11 @@ DECLARE_EVENT_CLASS(scoutfs_lock_class,
 		__entry->users_ex = lck->users[SCOUTFS_LOCK_WRITE];
 		__entry->users_cw = lck->users[SCOUTFS_LOCK_WRITE_ONLY];
 	),
-	TP_printk(SCSBF" start "SK_FMT" end "SK_FMT" mode %u reqpnd %u invpnd %u rfrgen %llu waiters: pr %u ex %u cw %u users: pr %u ex %u cw %u",
+	TP_printk(SCSBF" start "SK_FMT" end "SK_FMT" mode %u invmd %u reqp %u invp %u refg %llu wris %llu dts %llu waiters: pr %u ex %u cw %u users: pr %u ex %u cw %u",
 		  SCSB_TRACE_ARGS, sk_trace_args(start), sk_trace_args(end),
-		  __entry->mode, __entry->request_pending,
-		  __entry->invalidate_pending, __entry->refresh_gen,
+		  __entry->mode, __entry->invalidating_mode, __entry->request_pending,
+		  __entry->invalidate_pending, __entry->refresh_gen, __entry->write_seq,
+		  __entry->dirty_trans_seq,
 		  __entry->waiters_pr, __entry->waiters_ex, __entry->waiters_cw,
 		  __entry->users_pr, __entry->users_ex, __entry->users_cw)
 );
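As a usage note, separate from the patch: the new fields show up in the
tracepoint output, which can be watched through tracefs in the usual
way.  This assumes the trace system is named scoutfs and that the lock
events share a scoutfs_lock prefix -- the class name above suggests
both, but check the events directory on a running system:

# enable the module's tracepoints and watch the lock events stream by
echo 1 > /sys/kernel/tracing/events/scoutfs/enable
grep scoutfs_lock /sys/kernel/tracing/trace_pipe

# turn them back off when done
echo 0 > /sys/kernel/tracing/events/scoutfs/enable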
From 8c1a45c9f53574394432737558d60e1eb3f83021 Mon Sep 17 00:00:00 2001
From: Zach Brown
Date: Wed, 30 Oct 2024 13:19:40 -0700
Subject: [PATCH 3/5] Use bools instead of weird addition as or in net

When freeing acked responses in the net layer we sweep the send and
resend queues looking for queued responses up to the sequence number
we've had acked.  The code that did this used a weird pattern of
returning ints and adding them, which gave me pause.  Clean it up to
use bools and a non-short-circuiting or (| rather than ||) to more
obviously communicate what's going on.

Signed-off-by: Zach Brown
---
 kmod/src/net.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/kmod/src/net.c b/kmod/src/net.c
index d7c68743..613cf169 100644
--- a/kmod/src/net.c
+++ b/kmod/src/net.c
@@ -502,12 +502,12 @@ static void scoutfs_net_proc_worker(struct work_struct *work)
  * Free live responses up to and including the seq by marking them dead
  * and moving them to the send queue to be freed.
  */
-static int move_acked_responses(struct scoutfs_net_connection *conn,
-				struct list_head *list, u64 seq)
+static bool move_acked_responses(struct scoutfs_net_connection *conn,
+				 struct list_head *list, u64 seq)
 {
 	struct message_send *msend;
 	struct message_send *tmp;
-	int ret = 0;
+	bool moved = false;
 
 	assert_spin_locked(&conn->lock);
 
@@ -519,20 +519,20 @@ static int move_acked_responses(struct scoutfs_net_connection *conn,
 
 		msend->dead = 1;
 		list_move(&msend->head, &conn->send_queue);
-		ret = 1;
+		moved = true;
 	}
 
-	return ret;
+	return moved;
 }
 
 /* acks are processed inline in the recv worker */
 static void free_acked_responses(struct scoutfs_net_connection *conn, u64 seq)
 {
-	int moved;
+	bool moved;
 
 	spin_lock(&conn->lock);
 
-	moved = move_acked_responses(conn, &conn->send_queue, seq) +
+	moved = move_acked_responses(conn, &conn->send_queue, seq) |
 		move_acked_responses(conn, &conn->resend_queue, seq);
 
 	spin_unlock(&conn->lock);
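A small standalone illustration of why the non-short-circuiting | matters
here, separate from the patch (plain userspace C, hypothetical helper
names): with ||, the second sweep would never run once the first one
reported that it moved something, so acked responses could be left
sitting on the resend queue.  Both operands of | are always evaluated,
which is exactly what the addition in the old code was emulating.

#include <stdbool.h>
#include <stdio.h>

static int sweeps;

/* stand-in for move_acked_responses(): pretend something was moved */
static bool sweep(const char *queue)
{
	sweeps++;
	printf("swept %s\n", queue);
	return true;
}

int main(void)
{
	bool moved;

	sweeps = 0;
	moved = sweep("send_queue") || sweep("resend_queue");
	printf("||: moved=%d, sweeps=%d\n", moved, sweeps);	/* sweeps=1 */

	sweeps = 0;
	moved = sweep("send_queue") | sweep("resend_queue");
	printf("| : moved=%d, sweeps=%d\n", moved, sweeps);	/* sweeps=2 */

	return 0;
}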
From 19e78c32fce34487af6cb662c6514d0ddb845f26 Mon Sep 17 00:00:00 2001
From: Zach Brown
Date: Thu, 31 Oct 2024 09:48:07 -0700
Subject: [PATCH 4/5] Allow null lock compatibility between nodes

Right now a client requesting a null mode for a lock will cause
invalidations of all existing granted modes of the lock across the
cluster.  This is unnecessarily broad.

The absolute requirement is that a null request invalidates other
existing granted modes on the client.  That's how the client safely
resolves shrinking's desire to free locks while the locks are in use:
it relies on turning shrinking into the race between use and remote
invalidation.  But that only requires invalidating existing grants from
the requesting client, not all clients.  It is always safe for null
grants to coexist with all grants on other clients.

Consider the existing mechanics involving null modes.  First, null
locks are instantiated on the client before sending any requests at
all.  At any given time newly allocated null locks are coexisting with
all existing locks across the cluster.  Second, the server frees the
client entry tracking struct the moment it sends a null grant to the
client.  From that point on the client's null lock cannot have any
impact on the rest of the lock holders because the server has forgotten
about it.

So we add this case to the server's test that two client lock modes are
compatible.  We take the opportunity to comment the heck out of this
function instead of making it a dense boolean composition.  The only
functional change is the addition of this case; the existing cases are
refactored but unchanged.

Signed-off-by: Zach Brown
---
 kmod/src/lock_server.c | 49 ++++++++++++++++++++++++++++++++----------
 1 file changed, 38 insertions(+), 11 deletions(-)

diff --git a/kmod/src/lock_server.c b/kmod/src/lock_server.c
index edb9f9a7..d5e55255 100644
--- a/kmod/src/lock_server.c
+++ b/kmod/src/lock_server.c
@@ -202,21 +202,48 @@ static u8 invalidation_mode(u8 granted, u8 requested)
 
 /*
  * Return true of the client lock instances described by the entries can
- * be granted at the same time.  Typically this only means they're both
- * modes that are compatible between nodes.  In addition there's the
- * special case where a read lock on a client is compatible with a write
- * lock on the same client because the client's cache covered by the
- * read lock is still valid if they get a write lock.
+ * be granted at the same time.  There are only three cases where this is
+ * true.
+ *
+ * First, the two locks are both of the same mode that allows full
+ * sharing -- read and write only.  The only point of these modes is
+ * that everyone can share them.
+ *
+ * Second, a write lock gives the client permission to read as well.
+ * This means that a client can upgrade its read lock to a write lock
+ * without having to invalidate the existing read and drop caches.
+ *
+ * Third, null locks are always compatible between clients.  It's as
+ * though the client with the null lock has no lock at all.  But it's
+ * never compatible with other locks on the client requesting null.
+ * Sending invalidations for existing locks on a client when we get a
+ * null request is how we resolve races in shrinking locks -- we turn it
+ * into the unsolicited remote invalidation case.
+ *
+ * All other mode and client combinations cannot be shared, most
+ * typically a write lock invalidating all other non-write holders to
+ * drop caches and force a read after the write has completed.
  */
 static bool client_entries_compatible(struct client_lock_entry *granted,
 				      struct client_lock_entry *requested)
 {
-	return (granted->mode == requested->mode &&
-		(granted->mode == SCOUTFS_LOCK_READ ||
-		 granted->mode == SCOUTFS_LOCK_WRITE_ONLY)) ||
-	       (granted->rid == requested->rid &&
-		granted->mode == SCOUTFS_LOCK_READ &&
-		requested->mode == SCOUTFS_LOCK_WRITE);
+	/* only read and write_only can be fully shared */
+	if ((granted->mode == requested->mode) &&
+	    (granted->mode == SCOUTFS_LOCK_READ || granted->mode == SCOUTFS_LOCK_WRITE_ONLY))
+		return true;
+
+	/* _write includes reading, so a client can upgrade its read to write */
+	if (granted->rid == requested->rid &&
+	    granted->mode == SCOUTFS_LOCK_READ &&
+	    requested->mode == SCOUTFS_LOCK_WRITE)
+		return true;
+
+	/* null is always compatible across clients, never within a client */
+	if ((granted->rid != requested->rid) &&
+	    (granted->mode == SCOUTFS_LOCK_NULL || requested->mode == SCOUTFS_LOCK_NULL))
+		return true;
+
+	return false;
 }
 
 /*
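To make the three cases concrete, here is a small standalone sketch that
mirrors the rules described in the comment above, checked against the
combinations the commit message calls out.  This is not the scoutfs
code; the enum, struct, and client ids are made up for illustration:

#include <assert.h>
#include <stdbool.h>

/* hypothetical stand-ins for the scoutfs lock modes and client entries */
enum mode { MODE_NULL, MODE_READ, MODE_WRITE, MODE_WRITE_ONLY };

struct entry {
	int rid;	/* client id */
	enum mode mode;
};

/* same three cases as the comment above, in the same order */
static bool compatible(struct entry granted, struct entry requested)
{
	if (granted.mode == requested.mode &&
	    (granted.mode == MODE_READ || granted.mode == MODE_WRITE_ONLY))
		return true;

	if (granted.rid == requested.rid &&
	    granted.mode == MODE_READ && requested.mode == MODE_WRITE)
		return true;

	if (granted.rid != requested.rid &&
	    (granted.mode == MODE_NULL || requested.mode == MODE_NULL))
		return true;

	return false;
}

int main(void)
{
	struct entry r1 = {1, MODE_READ};
	struct entry r2 = {2, MODE_READ};
	struct entry w1 = {1, MODE_WRITE};
	struct entry w2 = {2, MODE_WRITE};
	struct entry n1 = {1, MODE_NULL};

	assert(compatible(r1, r2));	/* read shares with read across clients */
	assert(compatible(r1, w1));	/* client 1 upgrades its own read to write */
	assert(compatible(r2, n1));	/* client 1's null coexists with client 2's read */
	assert(!compatible(r1, n1));	/* but client 1's own read must be invalidated */
	assert(!compatible(r1, w2));	/* a write from another client still conflicts */
	return 0;
}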
We +# were able to reproduce this by having all clients try and produce a +# lot of read and null requests. +# +# This will manfiest as a hung lock and timed out test runs, probably +# with hung task messages on the console. Depending on how the race +# turns out, it can trigger KASAN warnings in +# process_waiting_requests(). +# + +READERS_PER=3 +SECS=30 + +echo "=== setup" +touch "$T_D0/file" + +echo "=== spin reading and shrinking" +END=$((SECONDS + SECS)) +for m in $(t_fs_nrs); do + eval file="\$T_D${m}/file" + + # lots of tasks reading as fast as they can + for t in $(seq 1 $READERS_PER); do + (while [ $SECONDS -lt $END ]; do + stat $file > /dev/null + done) & + done + # one task shrinking (triggering null requests) and reading + (while [ $SECONDS -lt $END ]; do + stat $file > /dev/null + t_trigger_arm_silent statfs_lock_purge $m + stat -f "$file" > /dev/null + done) & +done + +wait + +t_pass