mirror of https://github.com/versity/scoutfs.git
Merge pull request #193 from versity/zab/hung_lock_fixes
Zab/hung lock fixes
@@ -202,21 +202,48 @@ static u8 invalidation_mode(u8 granted, u8 requested)
 
 /*
  * Return true if the client lock instances described by the entries can
- * be granted at the same time. Typically this only means they're both
- * modes that are compatible between nodes. In addition there's the
- * special case where a read lock on a client is compatible with a write
- * lock on the same client because the client's cache covered by the
- * read lock is still valid if they get a write lock.
+ * be granted at the same time. There's only three cases where this is
+ * true.
+ *
+ * First, the two locks are both of the same mode that allows full
+ * sharing -- read and write only. The only point of these modes is
+ * that everyone can share them.
+ *
+ * Second, a write lock gives the client permission to read as well.
+ * This means that a client can upgrade its read lock to a write lock
+ * without having to invalidate the existing read and drop caches.
+ *
+ * Third, null locks are always compatible between clients. It's as
+ * though the client with the null lock has no lock at all. But it's
+ * never compatible with all locks on the client requesting null.
+ * Sending invalidations for existing locks on a client when we get a
+ * null request is how we resolve races in shrinking locks -- we turn it
+ * into the unsolicited remote invalidation case.
+ *
+ * All other mode and client combinations can not be shared, most
+ * typically a write lock invalidating all other non-write holders to
+ * drop caches and force a read after the write has completed.
  */
 static bool client_entries_compatible(struct client_lock_entry *granted,
 				      struct client_lock_entry *requested)
 {
-	return (granted->mode == requested->mode &&
-		(granted->mode == SCOUTFS_LOCK_READ ||
-		 granted->mode == SCOUTFS_LOCK_WRITE_ONLY)) ||
-	       (granted->rid == requested->rid &&
-		granted->mode == SCOUTFS_LOCK_READ &&
-		requested->mode == SCOUTFS_LOCK_WRITE);
+	/* only read and write_only can be full shared */
+	if ((granted->mode == requested->mode) &&
+	    (granted->mode == SCOUTFS_LOCK_READ || granted->mode == SCOUTFS_LOCK_WRITE_ONLY))
+		return true;
+
+	/* _write includes reading, so a client can upgrade its read to write */
+	if (granted->rid == requested->rid &&
+	    granted->mode == SCOUTFS_LOCK_READ &&
+	    requested->mode == SCOUTFS_LOCK_WRITE)
+		return true;
+
+	/* null is always compatible across clients, never within a client */
+	if ((granted->rid != requested->rid) &&
+	    (granted->mode == SCOUTFS_LOCK_NULL || requested->mode == SCOUTFS_LOCK_NULL))
+		return true;
+
+	return false;
 }
 
 /*
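
To make the three sharing rules in the new comment concrete, here is a minimal userspace sketch; the entry struct, the mode names, and compatible() below are illustrative stand-ins, not the scoutfs definitions:

/* toy model of the three sharing rules; names and modes are stand-ins */
#include <stdbool.h>
#include <stdio.h>

enum mode { NUL, READ, WRITE, WRITE_ONLY };

struct entry {
	int rid;	/* which client holds or requests the lock */
	enum mode mode;
};

static bool compatible(struct entry g, struct entry r)
{
	/* first: both hold the same fully-shared mode */
	if (g.mode == r.mode && (g.mode == READ || g.mode == WRITE_ONLY))
		return true;
	/* second: the same client may upgrade its read to a write */
	if (g.rid == r.rid && g.mode == READ && r.mode == WRITE)
		return true;
	/* third: null is compatible across clients, never within one */
	if (g.rid != r.rid && (g.mode == NUL || r.mode == NUL))
		return true;
	return false;
}

int main(void)
{
	struct entry rd1 = { 1, READ }, wr1 = { 1, WRITE }, nul2 = { 2, NUL };

	printf("read(1) vs write(1): %d\n", compatible(rd1, wr1));	/* 1: same-client upgrade */
	printf("read(1) vs null(2):  %d\n", compatible(rd1, nul2));	/* 1: null across clients */
	printf("write(1) vs read(2): %d\n",
	       compatible(wr1, (struct entry){ 2, READ }));		/* 0: not shareable */
	return 0;
}
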
@@ -317,16 +344,18 @@ static void put_server_lock(struct lock_server_info *inf,
 
 	BUG_ON(!mutex_is_locked(&snode->mutex));
 
+	spin_lock(&inf->lock);
+
 	if (atomic_dec_and_test(&snode->refcount) &&
 	    list_empty(&snode->granted) &&
 	    list_empty(&snode->requested) &&
 	    list_empty(&snode->invalidated)) {
-		spin_lock(&inf->lock);
 		rb_erase(&snode->node, &inf->locks_root);
-		spin_unlock(&inf->lock);
 		should_free = true;
 	}
 
+	spin_unlock(&inf->lock);
+
 	mutex_unlock(&snode->mutex);
 
 	if (should_free) {
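
The put_server_lock() change above widens inf->lock so the refcount test, the list checks, and the rb_erase() all run under the same lock that lookups hold while taking references. As a rough userspace sketch of that pattern only (the mutex, the node, and every name here are stand-ins, not scoutfs code):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* stand-in for a server lock node linked into a shared structure */
struct snode {
	int refcount;
};

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;
static struct snode *the_node;

/* lookups take their reference while holding tree_lock */
static struct snode *get_node(void)
{
	struct snode *sn;

	pthread_mutex_lock(&tree_lock);
	sn = the_node;
	if (sn)
		sn->refcount++;
	pthread_mutex_unlock(&tree_lock);
	return sn;
}

/*
 * put also holds tree_lock across the dec-and-test and the unlink, so a
 * concurrent get_node() can never find the node and take a reference
 * after put has decided to free it.
 */
static void put_node(struct snode *sn)
{
	bool should_free;

	pthread_mutex_lock(&tree_lock);
	should_free = (--sn->refcount == 0);
	if (should_free)
		the_node = NULL;	/* unlink, like rb_erase() in the real code */
	pthread_mutex_unlock(&tree_lock);

	if (should_free)
		free(sn);
}

int main(void)
{
	struct snode *sn = calloc(1, sizeof(*sn));

	sn->refcount = 1;	/* the insertion's reference */
	the_node = sn;

	put_node(get_node());	/* a lookup comes and goes */
	put_node(sn);		/* final put unlinks and frees */
	printf("freed\n");
	return 0;
}
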
@@ -502,12 +502,12 @@ static void scoutfs_net_proc_worker(struct work_struct *work)
  * Free live responses up to and including the seq by marking them dead
  * and moving them to the send queue to be freed.
  */
-static int move_acked_responses(struct scoutfs_net_connection *conn,
-				struct list_head *list, u64 seq)
+static bool move_acked_responses(struct scoutfs_net_connection *conn,
+				 struct list_head *list, u64 seq)
 {
 	struct message_send *msend;
 	struct message_send *tmp;
-	int ret = 0;
+	bool moved = false;
 
 	assert_spin_locked(&conn->lock);
 
@@ -519,20 +519,20 @@ static int move_acked_responses(struct scoutfs_net_connection *conn,
 
 		msend->dead = 1;
 		list_move(&msend->head, &conn->send_queue);
-		ret = 1;
+		moved = true;
 	}
 
-	return ret;
+	return moved;
 }
 
 /* acks are processed inline in the recv worker */
 static void free_acked_responses(struct scoutfs_net_connection *conn, u64 seq)
 {
-	int moved;
+	bool moved;
 
 	spin_lock(&conn->lock);
 
-	moved = move_acked_responses(conn, &conn->send_queue, seq) +
+	moved = move_acked_responses(conn, &conn->send_queue, seq) |
 		move_acked_responses(conn, &conn->resend_queue, seq);
 
 	spin_unlock(&conn->lock);
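
A note on the free_acked_responses() change above: now that move_acked_responses() returns bool, the two calls are combined with bitwise '|' rather than logical '||' because both queues must always be scanned. The old integer '+' also evaluated both operands; '||' would short-circuit and skip the resend queue whenever the send queue had already moved something. A small standalone illustration of that difference (toy function, not scoutfs code):

#include <stdbool.h>
#include <stdio.h>

static int calls;

/* stand-in for a queue scan with a side effect */
static bool scan(bool hit)
{
	calls++;
	return hit;
}

int main(void)
{
	bool moved;

	calls = 0;
	moved = scan(true) | scan(true);	/* both sides evaluated */
	printf("'|'  calls=%d moved=%d\n", calls, moved);

	calls = 0;
	moved = scan(true) || scan(true);	/* second call skipped */
	printf("'||' calls=%d moved=%d\n", calls, moved);

	return 0;
}
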
@@ -1046,9 +1046,12 @@ DECLARE_EVENT_CLASS(scoutfs_lock_class,
 		sk_trace_define(start)
 		sk_trace_define(end)
 		__field(u64, refresh_gen)
+		__field(u64, write_seq)
+		__field(u64, dirty_trans_seq)
 		__field(unsigned char, request_pending)
 		__field(unsigned char, invalidate_pending)
 		__field(int, mode)
+		__field(int, invalidating_mode)
 		__field(unsigned int, waiters_cw)
 		__field(unsigned int, waiters_pr)
 		__field(unsigned int, waiters_ex)
@@ -1061,9 +1064,12 @@ DECLARE_EVENT_CLASS(scoutfs_lock_class,
 		sk_trace_assign(start, &lck->start);
 		sk_trace_assign(end, &lck->end);
 		__entry->refresh_gen = lck->refresh_gen;
+		__entry->write_seq = lck->write_seq;
+		__entry->dirty_trans_seq = lck->dirty_trans_seq;
 		__entry->request_pending = lck->request_pending;
 		__entry->invalidate_pending = lck->invalidate_pending;
 		__entry->mode = lck->mode;
+		__entry->invalidating_mode = lck->invalidating_mode;
 		__entry->waiters_pr = lck->waiters[SCOUTFS_LOCK_READ];
 		__entry->waiters_ex = lck->waiters[SCOUTFS_LOCK_WRITE];
 		__entry->waiters_cw = lck->waiters[SCOUTFS_LOCK_WRITE_ONLY];
@@ -1071,10 +1077,11 @@ DECLARE_EVENT_CLASS(scoutfs_lock_class,
 		__entry->users_ex = lck->users[SCOUTFS_LOCK_WRITE];
 		__entry->users_cw = lck->users[SCOUTFS_LOCK_WRITE_ONLY];
 	),
-	TP_printk(SCSBF" start "SK_FMT" end "SK_FMT" mode %u reqpnd %u invpnd %u rfrgen %llu waiters: pr %u ex %u cw %u users: pr %u ex %u cw %u",
+	TP_printk(SCSBF" start "SK_FMT" end "SK_FMT" mode %u invmd %u reqp %u invp %u refg %llu wris %llu dts %llu waiters: pr %u ex %u cw %u users: pr %u ex %u cw %u",
 		  SCSB_TRACE_ARGS, sk_trace_args(start), sk_trace_args(end),
-		  __entry->mode, __entry->request_pending,
-		  __entry->invalidate_pending, __entry->refresh_gen,
+		  __entry->mode, __entry->invalidating_mode, __entry->request_pending,
+		  __entry->invalidate_pending, __entry->refresh_gen, __entry->write_seq,
+		  __entry->dirty_trans_seq,
 		  __entry->waiters_pr, __entry->waiters_ex, __entry->waiters_cw,
 		  __entry->users_pr, __entry->users_ex, __entry->users_cw)
 );
tests/golden/lock-shrink-read-race (new file, 2 lines)
@@ -0,0 +1,2 @@
+=== setup
+=== spin reading and shrinking
@@ -25,6 +25,7 @@ totl-xattr-tag.sh
 quota.sh
 lock-refleak.sh
 lock-shrink-consistency.sh
+lock-shrink-read-race.sh
 lock-pr-cw-conflict.sh
 lock-revoke-getcwd.sh
 lock-recover-invalidate.sh
tests/tests/lock-shrink-read-race.sh (new file, 40 lines)
@@ -0,0 +1,40 @@
+#
+# We had a lock server refcounting bug that could let one thread get a
+# reference on a lock struct that was being freed by another thread. We
+# were able to reproduce this by having all clients try and produce a
+# lot of read and null requests.
+#
+# This will manifest as a hung lock and timed out test runs, probably
+# with hung task messages on the console. Depending on how the race
+# turns out, it can trigger KASAN warnings in
+# process_waiting_requests().
+#
+
+READERS_PER=3
+SECS=30
+
+echo "=== setup"
+touch "$T_D0/file"
+
+echo "=== spin reading and shrinking"
+END=$((SECONDS + SECS))
+for m in $(t_fs_nrs); do
+	eval file="\$T_D${m}/file"
+
+	# lots of tasks reading as fast as they can
+	for t in $(seq 1 $READERS_PER); do
+		(while [ $SECONDS -lt $END ]; do
+			stat $file > /dev/null
+		done) &
+	done
+	# one task shrinking (triggering null requests) and reading
+	(while [ $SECONDS -lt $END ]; do
+		stat $file > /dev/null
+		t_trigger_arm_silent statfs_lock_purge $m
+		stat -f "$file" > /dev/null
+	done) &
+done
+
+wait
+
+t_pass