From 2d91e51218dd4423c4526b00ddf18e3e5ddfb0fd Mon Sep 17 00:00:00 2001 From: Auke Kok Date: Tue, 12 May 2026 09:59:08 -0700 Subject: [PATCH] Drop pending greetings from resend_queue in set_valid_greeting. There's a race possible where set_valid_greeting() runs just as another greeting reply sits on the resend_queue from a handshake: 1. server_greeting() calls scoutfs_net_response() to queue the server's greeting reply on send_queue and kicks send_work. 2. server_greeting() falls through to scoutfs_net_server_greeting() which calls set_valid_greeting() under conn->lock. 3. If send_work already ran between steps 1 and 2 it moved the reply onto resend_queue. 4. set_valid_greeting() splices resend_queue back into send_queue, re-queueing the greeting reply for a second send on the same socket. The receiver treats a duplicate greeting as fatal: the receive path's invalid_message check on the saw_greeting flag tears the connection down silently. The client never sees a normal disconnect, just a stalled connection, and the server only notices once the 30s recovery timeout fires and fences the client. Fix by walking resend_queue in set_valid_greeting() and dropping any queued greeting before splicing them in. Dropping them should be safe and mimics what scoutfs_net_shutdown_worker() does. 
Signed-off-by: Auke Kok --- kmod/src/net.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/kmod/src/net.c b/kmod/src/net.c index b028d7c8..bb180898 100644 --- a/kmod/src/net.c +++ b/kmod/src/net.c @@ -1730,12 +1730,30 @@ int scoutfs_net_connect(struct super_block *sb, static void set_valid_greeting(struct scoutfs_net_connection *conn) { + struct net_info *ninf = SCOUTFS_SB(conn->sb)->net_info; + struct message_send *msend; + struct message_send *tmp; + assert_spin_locked(&conn->lock); /* recv should have dropped invalid duplicate greeting messages */ BUG_ON(test_conn_fl(conn, valid_greeting)); set_conn_fl(conn, valid_greeting); + + /* + * Drop greetings from the resend_queue before splicing it into + * the send_queue. send_work may already have sent our greeting + * and moved it there; sending it again gives the peer a duplicate + * greeting, which it treats as fatal, causing a stall and fence. + */ + list_for_each_entry_safe(msend, tmp, &conn->resend_queue, head) { + if (msend->nh.cmd == SCOUTFS_NET_CMD_GREETING) { + msend->dead = 1; + free_msend(ninf, conn, msend); + } + } + list_splice_tail_init(&conn->resend_queue, &conn->send_queue); queue_work(conn->workq, &conn->send_work); }