Merge pull request #193 from versity/zab/hung_lock_fixes

Zab/hung lock fixes
This commit is contained in:
Zach Brown
2024-10-31 16:56:51 -07:00
committed by GitHub
6 changed files with 102 additions and 23 deletions

View File

@@ -202,21 +202,48 @@ static u8 invalidation_mode(u8 granted, u8 requested)
/*
* Return true if the client lock instances described by the entries can
* be granted at the same time. Typically this only means they're both
* modes that are compatible between nodes. In addition there's the
* special case where a read lock on a client is compatible with a write
* lock on the same client because the client's cache covered by the
* read lock is still valid if they get a write lock.
* be granted at the same time. There are only three cases where this is
* true.
*
* First, the two locks are both of the same mode that allows full
* sharing -- read and write_only. The only point of these modes is
* that everyone can share them.
*
* Second, a write lock gives the client permission to read as well.
* This means that a client can upgrade its read lock to a write lock
* without having to invalidate the existing read and drop caches.
*
* Third, null locks are always compatible between clients. It's as
* though the client with the null lock has no lock at all. But a null
* request is never compatible with the requesting client's own locks.
* Sending invalidations for existing locks on a client when we get a
* null request is how we resolve races in shrinking locks -- we turn it
* into the unsolicited remote invalidation case.
*
* All other mode and client combinations cannot be shared, most
* typically a write lock invalidating all other non-write holders to
* drop caches and force a read after the write has completed.
*/
static bool client_entries_compatible(struct client_lock_entry *granted,
struct client_lock_entry *requested)
{
return (granted->mode == requested->mode &&
(granted->mode == SCOUTFS_LOCK_READ ||
granted->mode == SCOUTFS_LOCK_WRITE_ONLY)) ||
(granted->rid == requested->rid &&
granted->mode == SCOUTFS_LOCK_READ &&
requested->mode == SCOUTFS_LOCK_WRITE);
/* only read and write_only can be fully shared */
if ((granted->mode == requested->mode) &&
(granted->mode == SCOUTFS_LOCK_READ || granted->mode == SCOUTFS_LOCK_WRITE_ONLY))
return true;
/* _write includes reading, so a client can upgrade its read to write */
if (granted->rid == requested->rid &&
granted->mode == SCOUTFS_LOCK_READ &&
requested->mode == SCOUTFS_LOCK_WRITE)
return true;
/* null is always compatible across clients, never within a client */
if ((granted->rid != requested->rid) &&
(granted->mode == SCOUTFS_LOCK_NULL || requested->mode == SCOUTFS_LOCK_NULL))
return true;
return false;
}
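To make the three rules concrete, here is a small sketch that is not part of the patch: it exercises the check with hypothetical entries and assumes a pared-down client_lock_entry carrying only the rid and mode fields the check actually reads.

struct client_lock_entry {
	u64 rid;	/* client identifier */
	u8 mode;	/* SCOUTFS_LOCK_* mode */
};

static void compatibility_examples(void)
{
	struct client_lock_entry rd_a = { .rid = 1, .mode = SCOUTFS_LOCK_READ };
	struct client_lock_entry rd_b = { .rid = 2, .mode = SCOUTFS_LOCK_READ };
	struct client_lock_entry wr_a = { .rid = 1, .mode = SCOUTFS_LOCK_WRITE };
	struct client_lock_entry nl_a = { .rid = 1, .mode = SCOUTFS_LOCK_NULL };
	struct client_lock_entry nl_b = { .rid = 2, .mode = SCOUTFS_LOCK_NULL };

	client_entries_compatible(&rd_a, &rd_b); /* true: read is fully shareable */
	client_entries_compatible(&rd_a, &wr_a); /* true: same client upgrading read to write */
	client_entries_compatible(&rd_b, &wr_a); /* false: write conflicts with another client's read */
	client_entries_compatible(&rd_a, &nl_b); /* true: null is compatible across clients */
	client_entries_compatible(&rd_a, &nl_a); /* false: null within a client forces invalidation */
}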
/*
@@ -317,16 +344,18 @@ static void put_server_lock(struct lock_server_info *inf,
BUG_ON(!mutex_is_locked(&snode->mutex));
spin_lock(&inf->lock);
if (atomic_dec_and_test(&snode->refcount) &&
list_empty(&snode->granted) &&
list_empty(&snode->requested) &&
list_empty(&snode->invalidated)) {
spin_lock(&inf->lock);
rb_erase(&snode->node, &inf->locks_root);
spin_unlock(&inf->lock);
should_free = true;
}
spin_unlock(&inf->lock);
mutex_unlock(&snode->mutex);
if (should_free) {
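The point of this hunk is lock ordering: the refcount decrement and the rbtree erase now happen under the same inf->lock that lookups hold, so another thread can no longer find the snode after its count has reached zero but before it is unlinked. Below is a minimal sketch of that pattern with simplified hypothetical types; the real snode also carries the granted, requested, and invalidated lists and is only freed once they are empty.

#include <linux/atomic.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct snode {
	struct rb_node node;
	atomic_t refcount;
};

static void put_snode(spinlock_t *lock, struct rb_root *root, struct snode *sn)
{
	bool should_free = false;

	spin_lock(lock);
	if (atomic_dec_and_test(&sn->refcount)) {
		/* lookups also take 'lock', so nothing can re-reference sn here */
		rb_erase(&sn->node, root);
		should_free = true;
	}
	spin_unlock(lock);

	if (should_free)
		kfree(sn);
}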

View File

@@ -502,12 +502,12 @@ static void scoutfs_net_proc_worker(struct work_struct *work)
* Free live responses up to and including the seq by marking them dead
* and moving them to the send queue to be freed.
*/
static int move_acked_responses(struct scoutfs_net_connection *conn,
struct list_head *list, u64 seq)
static bool move_acked_responses(struct scoutfs_net_connection *conn,
struct list_head *list, u64 seq)
{
struct message_send *msend;
struct message_send *tmp;
int ret = 0;
bool moved = false;
assert_spin_locked(&conn->lock);
@@ -519,20 +519,20 @@ static int move_acked_responses(struct scoutfs_net_connection *conn,
msend->dead = 1;
list_move(&msend->head, &conn->send_queue);
ret = 1;
moved = true;
}
return ret;
return moved;
}
/* acks are processed inline in the recv worker */
static void free_acked_responses(struct scoutfs_net_connection *conn, u64 seq)
{
int moved;
bool moved;
spin_lock(&conn->lock);
moved = move_acked_responses(conn, &conn->send_queue, seq) +
moved = move_acked_responses(conn, &conn->send_queue, seq) |
move_acked_responses(conn, &conn->resend_queue, seq);
spin_unlock(&conn->lock);
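With the return type changed to bool, the choice of operator matters: bitwise | evaluates both calls, while logical || would short-circuit and skip scanning the resend queue whenever the send queue already had acked responses to move. A standalone userspace illustration of the difference, using hypothetical stand-in functions rather than the driver code:

#include <stdbool.h>
#include <stdio.h>

/* stand-ins for the two move_acked_responses() calls: each has a side effect
 * (a counter here, moving messages in the driver) and reports whether it did
 * anything, so both must run on every pass */
static int scans;

static bool scan_send_queue(void)   { scans++; return true;  }
static bool scan_resend_queue(void) { scans++; return false; }

int main(void)
{
	bool moved;

	scans = 0;
	moved = scan_send_queue() | scan_resend_queue();	/* both run: scans == 2 */
	printf("|  : moved=%d scans=%d\n", moved, scans);

	scans = 0;
	moved = scan_send_queue() || scan_resend_queue();	/* short-circuits: scans == 1 */
	printf("|| : moved=%d scans=%d\n", moved, scans);

	return 0;
}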

View File

@@ -1046,9 +1046,12 @@ DECLARE_EVENT_CLASS(scoutfs_lock_class,
sk_trace_define(start)
sk_trace_define(end)
__field(u64, refresh_gen)
__field(u64, write_seq)
__field(u64, dirty_trans_seq)
__field(unsigned char, request_pending)
__field(unsigned char, invalidate_pending)
__field(int, mode)
__field(int, invalidating_mode)
__field(unsigned int, waiters_cw)
__field(unsigned int, waiters_pr)
__field(unsigned int, waiters_ex)
@@ -1061,9 +1064,12 @@ DECLARE_EVENT_CLASS(scoutfs_lock_class,
sk_trace_assign(start, &lck->start);
sk_trace_assign(end, &lck->end);
__entry->refresh_gen = lck->refresh_gen;
__entry->write_seq = lck->write_seq;
__entry->dirty_trans_seq = lck->dirty_trans_seq;
__entry->request_pending = lck->request_pending;
__entry->invalidate_pending = lck->invalidate_pending;
__entry->mode = lck->mode;
__entry->invalidating_mode = lck->invalidating_mode;
__entry->waiters_pr = lck->waiters[SCOUTFS_LOCK_READ];
__entry->waiters_ex = lck->waiters[SCOUTFS_LOCK_WRITE];
__entry->waiters_cw = lck->waiters[SCOUTFS_LOCK_WRITE_ONLY];
@@ -1071,10 +1077,11 @@ DECLARE_EVENT_CLASS(scoutfs_lock_class,
__entry->users_ex = lck->users[SCOUTFS_LOCK_WRITE];
__entry->users_cw = lck->users[SCOUTFS_LOCK_WRITE_ONLY];
),
TP_printk(SCSBF" start "SK_FMT" end "SK_FMT" mode %u reqpnd %u invpnd %u rfrgen %llu waiters: pr %u ex %u cw %u users: pr %u ex %u cw %u",
TP_printk(SCSBF" start "SK_FMT" end "SK_FMT" mode %u invmd %u reqp %u invp %u refg %llu wris %llu dts %llu waiters: pr %u ex %u cw %u users: pr %u ex %u cw %u",
SCSB_TRACE_ARGS, sk_trace_args(start), sk_trace_args(end),
__entry->mode, __entry->request_pending,
__entry->invalidate_pending, __entry->refresh_gen,
__entry->mode, __entry->invalidating_mode, __entry->request_pending,
__entry->invalidate_pending, __entry->refresh_gen, __entry->write_seq,
__entry->dirty_trans_seq,
__entry->waiters_pr, __entry->waiters_ex, __entry->waiters_cw,
__entry->users_pr, __entry->users_ex, __entry->users_cw)
);

View File

@@ -0,0 +1,2 @@
=== setup
=== spin reading and shrinking

View File

@@ -25,6 +25,7 @@ totl-xattr-tag.sh
quota.sh
lock-refleak.sh
lock-shrink-consistency.sh
lock-shrink-read-race.sh
lock-pr-cw-conflict.sh
lock-revoke-getcwd.sh
lock-recover-invalidate.sh

View File

@@ -0,0 +1,40 @@
#
# We had a lock server refcounting bug that could let one thread get a
# reference on a lock struct that was being freed by another thread. We
# were able to reproduce this by having all clients try and produce a
# lot of read and null requests.
#
# This will manifest as a hung lock and timed-out test runs, probably
# with hung task messages on the console. Depending on how the race
# turns out, it can trigger KASAN warnings in
# process_waiting_requests().
#
READERS_PER=3
SECS=30
echo "=== setup"
touch "$T_D0/file"
echo "=== spin reading and shrinking"
END=$((SECONDS + SECS))
for m in $(t_fs_nrs); do
	eval file="\$T_D${m}/file"
	# lots of tasks reading as fast as they can
	for t in $(seq 1 $READERS_PER); do
		(while [ $SECONDS -lt $END ]; do
			stat $file > /dev/null
		done) &
	done
	# one task shrinking (triggering null requests) and reading
	(while [ $SECONDS -lt $END ]; do
		stat $file > /dev/null
		t_trigger_arm_silent statfs_lock_purge $m
		stat -f "$file" > /dev/null
	done) &
done
wait
t_pass