Compare commits

...

3 Commits

Author SHA1 Message Date
Auke Kok 063899286c Drop pending greetings from resend_queue in set_valid_greeting.
There's a race possible where set_valid_greeting() runs just as
another greeting reply sits on the resend_queue from a handshake:

  1. server_greeting() calls scoutfs_net_response() to queue the
     server's greeting reply on send_queue and kicks send_work.
  2. server_greeting() falls through to scoutfs_net_server_greeting()
     which calls set_valid_greeting() under conn->lock.
  3. If send_work already ran between steps 1 and 2 it moved the
     reply onto resend_queue.
  4. set_valid_greeting() splices resend_queue back into send_queue,
     re-queueing the greeting reply for a second send on the same
     socket.

The receiver treats a duplicate greeting as fatal: the receive path
invalid_message check on the saw_greeting flag tears the connection
down silently. The client never sees a normal disconnect, just a
stalled connection, and the server only notices once the 30s
recovery timeout fires and fences the client.

Fix by walking resend_queue in set_valid_greeting() and dropping any
queued greeting before splicing them in. Dropping them should be safe
and mimics what scoutfs_net_shutdown_worker() does.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2026-06-05 10:13:03 -07:00
Auke Kok ec24e86f2f Add recovery and lock-recover diagnostic trace events.
Add trace events to aid debugging recovery stalls:

  - scoutfs_recov_prepare(sb, rid, which):
    Server registered that a client needs to recover something.

  - scoutfs_recov_finish(sb, rid, which, remaining):
    Client finished one recovery item, and lists remaining count.

  - scoutfs_recov_timeout_fire(sb, nr_pending):
    The 30s recovery window expired, with count of clients that
    failed to recover within the timeout.

  - scoutfs_recov_fence_rid(sb, rid):
    One event per client about to be fenced.

  - scoutfs_lock_server_greeting(sb, rid, recov_pending):
    Shows which rid we expect to recover locks.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2026-06-05 10:12:59 -07:00
Auke Kok 0a06de249c Use spin_lock_bh on recinf->lock to prevent softirq deadlock
The trace events added in the subsequent change take recinf->lock
while in timer_callback() to iterate recinf->pending, but it does
so in softirq context. To do this safely, it has to use the _bh()
spin_lock variant. Convert the other lock takers, too.

```
	================================
	WARNING: inconsistent lock state
	5.14.0-427.35.1.el9_4.x86_64+debug #1 Tainted: G           OE     -------  ---
	--------------------------------
	inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage.
	swapper/2/0 [HC0[0]:SC1[1]:HE1:SE0] takes:
	ffff88813cdd9c20 (&recinf->lock){+.?.}-{2:2}, at: timer_callback+0x26/0x380 [scoutfs]
	{SOFTIRQ-ON-W} state was registered at:
	  __lock_acquire+0x7d0/0x1900
	  lock_acquire+0x1da/0x640
	  _raw_spin_lock+0x34/0x80
	  scoutfs_recov_finish+0x80/0x830 [scoutfs]
	  server_greeting+0x244/0xe60 [scoutfs]
	  scoutfs_net_proc_worker+0x28a/0xce0 [scoutfs]
	  recv_one_message+0x7e3/0xd10 [scoutfs]
	  scoutfs_net_recv_worker+0x441/0xe00 [scoutfs]
	  process_one_work+0x8e5/0x1530
	  worker_thread+0x598/0xf70
	  kthread+0x2a4/0x350
	  ret_from_fork+0x29/0x50
	irq event stamp: 549813370
	hardirqs last  enabled at (549813370): [<ffffffffabe25cb4>] _raw_spin_unlock_irq+0x24/0x50
	hardirqs last disabled at (549813369): [<ffffffffabe2594e>] _raw_spin_lock_irq+0x5e/0x90
	softirqs last  enabled at (549813356): [<ffffffffabe28c91>] __do_softirq+0x621/0x9c2
	softirqs last disabled at (549813363): [<ffffffffa9a44665>] __irq_exit_rcu+0x185/0x230

	other info that might help us debug this:
	 Possible unsafe locking scenario:
	       CPU0
	       ----
	  lock(&recinf->lock);
	  <Interrupt>
	    lock(&recinf->lock);

	 *** DEADLOCK ***
```

Signed-off-by: Auke Kok <auke.kok@versity.com>
2026-06-05 10:12:22 -07:00
5 changed files with 162 additions and 13 deletions
+6 -1
View File
@@ -626,9 +626,14 @@ out:
int scoutfs_lock_server_greeting(struct super_block *sb, u64 rid)
{
struct scoutfs_key key;
bool pending;
int ret;
if (scoutfs_recov_is_pending(sb, rid, SCOUTFS_RECOV_LOCKS)) {
pending = scoutfs_recov_is_pending(sb, rid, SCOUTFS_RECOV_LOCKS);
trace_scoutfs_lock_server_greeting(sb, rid, pending);
if (pending) {
scoutfs_key_set_zeros(&key);
ret = scoutfs_server_lock_recover_request(sb, rid, &key);
} else {
+18
View File
@@ -1730,12 +1730,30 @@ int scoutfs_net_connect(struct super_block *sb,
static void set_valid_greeting(struct scoutfs_net_connection *conn)
{
struct net_info *ninf = SCOUTFS_SB(conn->sb)->net_info;
struct message_send *msend;
struct message_send *tmp;
assert_spin_locked(&conn->lock);
/* recv should have dropped invalid duplicate greeting messages */
BUG_ON(test_conn_fl(conn, valid_greeting));
set_conn_fl(conn, valid_greeting);
/*
* Drop greetings from the resend_queue before splicing it into
* the send_queue. We might have a greeting left in the resend
* queue at the moment that we reach this point. A duplicate
* greeting is treated as fatal and causes a stall and fence.
*/
list_for_each_entry_safe(msend, tmp, &conn->resend_queue, head) {
if (msend->nh.cmd == SCOUTFS_NET_CMD_GREETING) {
msend->dead = 1;
free_msend(ninf, conn, msend);
}
}
list_splice_tail_init(&conn->resend_queue, &conn->send_queue);
queue_work(conn->workq, &conn->send_work);
}
+32 -12
View File
@@ -21,6 +21,7 @@
#include "super.h"
#include "recov.h"
#include "cmp.h"
#include "scoutfs_trace.h"
/*
* There are a few server messages which can't be processed until they
@@ -103,7 +104,7 @@ int scoutfs_recov_prepare(struct super_block *sb, u64 rid, int which)
if (!alloc)
return -ENOMEM;
spin_lock(&recinf->lock);
spin_lock_bh(&recinf->lock);
pend = lookup_pending(recinf, rid, SCOUTFS_RECOV_ALL);
if (pend) {
@@ -116,9 +117,12 @@ int scoutfs_recov_prepare(struct super_block *sb, u64 rid, int which)
list_sort(NULL, &recinf->pending, cmp_pending_rid);
}
spin_unlock(&recinf->lock);
spin_unlock_bh(&recinf->lock);
kfree(alloc);
trace_scoutfs_recov_prepare(sb, rid, which);
return 0;
}
@@ -135,6 +139,15 @@ static int recov_finished(struct recov_info *recinf)
static void timer_callback(struct timer_list *timer)
{
struct recov_info *recinf = timer_container_of(recinf, timer, timer);
struct recov_pending *pend;
int nr = 0;
spin_lock(&recinf->lock);
list_for_each_entry(pend, &recinf->pending, head)
nr++;
spin_unlock(&recinf->lock);
trace_scoutfs_recov_timeout_fire(recinf->sb, nr);
recinf->timeout_fn(recinf->sb);
}
@@ -153,7 +166,7 @@ int scoutfs_recov_begin(struct super_block *sb, void (*timeout_fn)(struct super_
DECLARE_RECOV_INFO(sb, recinf);
int ret;
spin_lock(&recinf->lock);
spin_lock_bh(&recinf->lock);
recinf->timeout_fn = timeout_fn;
recinf->timer.expires = jiffies + msecs_to_jiffies(timeout_ms);
@@ -161,7 +174,7 @@ int scoutfs_recov_begin(struct super_block *sb, void (*timeout_fn)(struct super_
ret = recov_finished(recinf);
spin_unlock(&recinf->lock);
spin_unlock_bh(&recinf->lock);
if (ret > 0)
del_timer_sync(&recinf->timer);
@@ -181,9 +194,11 @@ int scoutfs_recov_finish(struct super_block *sb, u64 rid, int which)
{
DECLARE_RECOV_INFO(sb, recinf);
struct recov_pending *pend;
struct recov_pending *iter;
int remaining = 0;
int ret = 0;
spin_lock(&recinf->lock);
spin_lock_bh(&recinf->lock);
pend = lookup_pending(recinf, rid, which);
if (pend) {
@@ -196,7 +211,12 @@ int scoutfs_recov_finish(struct super_block *sb, u64 rid, int which)
}
}
spin_unlock(&recinf->lock);
list_for_each_entry(iter, &recinf->pending, head)
remaining++;
spin_unlock_bh(&recinf->lock);
trace_scoutfs_recov_finish(sb, rid, which, remaining);
if (ret > 0)
del_timer_sync(&recinf->timer);
@@ -215,9 +235,9 @@ bool scoutfs_recov_is_pending(struct super_block *sb, u64 rid, int which)
DECLARE_RECOV_INFO(sb, recinf);
bool is_pending;
spin_lock(&recinf->lock);
spin_lock_bh(&recinf->lock);
is_pending = lookup_pending(recinf, rid, which) != NULL;
spin_unlock(&recinf->lock);
spin_unlock_bh(&recinf->lock);
return is_pending;
}
@@ -236,10 +256,10 @@ u64 scoutfs_recov_next_pending(struct super_block *sb, u64 rid, int which)
DECLARE_RECOV_INFO(sb, recinf);
struct recov_pending *pend;
spin_lock(&recinf->lock);
spin_lock_bh(&recinf->lock);
pend = next_pending(recinf, rid, which);
rid = pend ? pend->rid : 0;
spin_unlock(&recinf->lock);
spin_unlock_bh(&recinf->lock);
return rid;
}
@@ -257,10 +277,10 @@ void scoutfs_recov_shutdown(struct super_block *sb)
del_timer_sync(&recinf->timer);
spin_lock(&recinf->lock);
spin_lock_bh(&recinf->lock);
list_splice_init(&recinf->pending, &list);
recinf->timeout_fn = NULL;
spin_unlock(&recinf->lock);
spin_unlock_bh(&recinf->lock);
list_for_each_entry_safe(pend, tmp, &list, head) {
list_del(&pend->head);
+104
View File
@@ -2121,6 +2121,110 @@ DEFINE_EVENT(scoutfs_server_client_count_class, scoutfs_server_client_down,
TP_ARGS(sb, rid, nr_clients)
);
TRACE_EVENT(scoutfs_recov_prepare,
TP_PROTO(struct super_block *sb, u64 rid, int which),
TP_ARGS(sb, rid, which),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, c_rid)
__field(int, which)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->c_rid = rid;
__entry->which = which;
),
TP_printk(SCSBF" rid %016llx which 0x%x",
SCSB_TRACE_ARGS, __entry->c_rid, __entry->which)
);
TRACE_EVENT(scoutfs_recov_finish,
TP_PROTO(struct super_block *sb, u64 rid, int which, int remaining),
TP_ARGS(sb, rid, which, remaining),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, c_rid)
__field(int, which)
__field(int, remaining)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->c_rid = rid;
__entry->which = which;
__entry->remaining = remaining;
),
TP_printk(SCSBF" rid %016llx which 0x%x remaining %d",
SCSB_TRACE_ARGS, __entry->c_rid, __entry->which,
__entry->remaining)
);
TRACE_EVENT(scoutfs_recov_timeout_fire,
TP_PROTO(struct super_block *sb, int nr_pending),
TP_ARGS(sb, nr_pending),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(int, nr_pending)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->nr_pending = nr_pending;
),
TP_printk(SCSBF" nr_pending %d",
SCSB_TRACE_ARGS, __entry->nr_pending)
);
TRACE_EVENT(scoutfs_recov_fence_rid,
TP_PROTO(struct super_block *sb, u64 rid),
TP_ARGS(sb, rid),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, c_rid)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->c_rid = rid;
),
TP_printk(SCSBF" rid %016llx",
SCSB_TRACE_ARGS, __entry->c_rid)
);
TRACE_EVENT(scoutfs_lock_server_greeting,
TP_PROTO(struct super_block *sb, u64 rid, bool recov_pending),
TP_ARGS(sb, rid, recov_pending),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, c_rid)
__field(bool, recov_pending)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->c_rid = rid;
__entry->recov_pending = recov_pending;
),
TP_printk(SCSBF" rid %016llx recov_pending %d",
SCSB_TRACE_ARGS, __entry->c_rid, __entry->recov_pending)
);
DECLARE_EVENT_CLASS(scoutfs_server_commit_users_class,
TP_PROTO(struct super_block *sb, int holding, int applying,
int nr_holders, u32 budget,
+2
View File
@@ -4386,6 +4386,8 @@ static void fence_pending_recov_worker(struct work_struct *work)
scoutfs_err(sb, "%lu ms recovery timeout expired for client rid %016llx, fencing",
SERVER_RECOV_TIMEOUT_MS, rid);
trace_scoutfs_recov_fence_rid(sb, rid);
ret = lookup_mounted_client_addr(sb, rid, &addr);
if (ret < 0) {
scoutfs_err(sb, "client rid addr lookup err %d, shutting down server", ret);