mirror of https://github.com/versity/scoutfs.git
synced 2026-01-09 21:27:25 +00:00

Compare commits: main...zab/avoid_ (3 commits)

Commits:
3641ee91f1
e82e127435
f0c7996612
@@ -125,7 +125,6 @@
	EXPAND_COUNTER(item_update) \
	EXPAND_COUNTER(item_write_dirty) \
	EXPAND_COUNTER(lock_alloc) \
	EXPAND_COUNTER(lock_count_objects) \
	EXPAND_COUNTER(lock_free) \
	EXPAND_COUNTER(lock_grant_request) \
	EXPAND_COUNTER(lock_grant_response) \
@@ -139,10 +138,8 @@
	EXPAND_COUNTER(lock_lock_error) \
	EXPAND_COUNTER(lock_nonblock_eagain) \
	EXPAND_COUNTER(lock_recover_request) \
	EXPAND_COUNTER(lock_scan_objects) \
	EXPAND_COUNTER(lock_shrink_attempted) \
	EXPAND_COUNTER(lock_shrink_aborted) \
	EXPAND_COUNTER(lock_shrink_work) \
	EXPAND_COUNTER(lock_shrink_request_failed) \
	EXPAND_COUNTER(lock_unlock) \
	EXPAND_COUNTER(lock_wait) \
	EXPAND_COUNTER(log_merge_complete) \

kmod/src/lock.c (248 changed lines)
@@ -53,8 +53,10 @@
 * all access to the lock (by revoking it down to a null mode) then the
 * lock is freed.
 *
 * Memory pressure on the client can cause the client to request a null
 * mode from the server so that once its granted the lock can be freed.
 * Each client has a configurable number of locks that are allowed to
 * remain idle after being granted, for use by future tasks. Past the
 * limit locks are freed by requesting a null mode from the server,
 * governed by a LRU.
 *
 * So far we've only needed a minimal trylock. We return -EAGAIN if a
 * lock attempt can't immediately match an existing granted lock. This
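Illustrative aside, not part of the patch: a minimal userspace C sketch of the idle-lock policy the revised comment describes -- idle locks go to the tail of an LRU instead of being freed, and once the count passes a limit the oldest is freed. The names, the tiny fixed limit standing in for lock_idle_count, and the printf standing in for the null-mode request to the server are all hypothetical.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct idle_lock {
	int id;
	struct idle_lock *next;	/* singly linked LRU: head oldest, tail newest */
};

static struct idle_lock *lru_head;
static struct idle_lock *lru_tail;
static unsigned long nr_locks;
static const unsigned long lock_idle_count = 4;	/* stand-in for the mount option */

/* an idle lock isn't freed immediately; it goes to the LRU tail for reuse */
static void put_idle(struct idle_lock *lock)
{
	lock->next = NULL;
	if (lru_tail)
		lru_tail->next = lock;
	else
		lru_head = lock;
	lru_tail = lock;
	nr_locks++;
}

/* past the limit, free the oldest idle lock ("send a null request" for it) */
static bool shrink_one(void)
{
	struct idle_lock *lock = lru_head;

	if (!lock || nr_locks <= lock_idle_count)
		return false;

	lru_head = lock->next;
	if (!lru_head)
		lru_tail = NULL;
	nr_locks--;
	printf("freeing idle lock %d\n", lock->id);
	free(lock);
	return true;
}

int main(void)
{
	/* each "lock call" both adds an idle lock and makes one shrink attempt */
	for (int i = 0; i < 8; i++) {
		struct idle_lock *lock = calloc(1, sizeof(*lock));

		if (!lock)
			return 1;
		lock->id = i;
		put_idle(lock);
		shrink_one();
	}
	return 0;
}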
@@ -79,14 +81,11 @@ struct lock_info {
	bool unmounting;
	struct rb_root lock_tree;
	struct rb_root lock_range_tree;
	KC_DEFINE_SHRINKER(shrinker);
	u64 nr_locks;
	struct list_head lru_list;
	unsigned long long lru_nr;
	struct workqueue_struct *workq;
	struct work_struct inv_work;
	struct list_head inv_list;
	struct work_struct shrink_work;
	struct list_head shrink_list;
	atomic64_t next_refresh_gen;

	struct dentry *tseq_dentry;
@@ -249,7 +248,6 @@ static void lock_free(struct lock_info *linfo, struct scoutfs_lock *lock)
	BUG_ON(!RB_EMPTY_NODE(&lock->range_node));
	BUG_ON(!list_empty(&lock->lru_head));
	BUG_ON(!list_empty(&lock->inv_head));
	BUG_ON(!list_empty(&lock->shrink_head));
	BUG_ON(!list_empty(&lock->cov_list));

	kfree(lock->inode_deletion_data);
@@ -277,7 +275,6 @@ static struct scoutfs_lock *lock_alloc(struct super_block *sb,
	INIT_LIST_HEAD(&lock->lru_head);
	INIT_LIST_HEAD(&lock->inv_head);
	INIT_LIST_HEAD(&lock->inv_list);
	INIT_LIST_HEAD(&lock->shrink_head);
	spin_lock_init(&lock->cov_list_lock);
	INIT_LIST_HEAD(&lock->cov_list);

@@ -410,6 +407,7 @@ static bool lock_insert(struct super_block *sb, struct scoutfs_lock *ins)
	rb_link_node(&ins->node, parent, node);
	rb_insert_color(&ins->node, &linfo->lock_tree);

	linfo->nr_locks++;
	scoutfs_tseq_add(&linfo->tseq_tree, &ins->tseq_entry);

	return true;
@@ -424,6 +422,7 @@ static void lock_remove(struct lock_info *linfo, struct scoutfs_lock *lock)
	rb_erase(&lock->range_node, &linfo->lock_range_tree);
	RB_CLEAR_NODE(&lock->range_node);

	linfo->nr_locks--;
	scoutfs_tseq_del(&linfo->tseq_tree, &lock->tseq_entry);
}

@@ -463,10 +462,8 @@ static void __lock_del_lru(struct lock_info *linfo, struct scoutfs_lock *lock)
{
	assert_spin_locked(&linfo->lock);

	if (!list_empty(&lock->lru_head)) {
	if (!list_empty(&lock->lru_head))
		list_del_init(&lock->lru_head);
		linfo->lru_nr--;
	}
}

/*
@@ -525,14 +522,16 @@ static struct scoutfs_lock *create_lock(struct super_block *sb,
 * indicate that the lock wasn't idle. If it really is idle then we
 * either free it if it's null or put it back on the lru.
 */
static void put_lock(struct lock_info *linfo, struct scoutfs_lock *lock)
static void __put_lock(struct lock_info *linfo, struct scoutfs_lock *lock, bool tail)
{
	assert_spin_locked(&linfo->lock);

	if (lock_idle(lock)) {
		if (lock->mode != SCOUTFS_LOCK_NULL) {
			list_add_tail(&lock->lru_head, &linfo->lru_list);
			linfo->lru_nr++;
			if (tail)
				list_add_tail(&lock->lru_head, &linfo->lru_list);
			else
				list_add(&lock->lru_head, &linfo->lru_list);
		} else {
			lock_remove(linfo, lock);
			lock_free(linfo, lock);
@@ -540,6 +539,11 @@ static void put_lock(struct lock_info *linfo, struct scoutfs_lock *lock)
	}
}

static inline void put_lock(struct lock_info *linfo, struct scoutfs_lock *lock)
{
	__put_lock(linfo, lock, true);
}

/*
 * The caller has made a change (set a lock mode) which can let one of the
 * invalidating locks make forward progress.
@@ -713,14 +717,14 @@ static void lock_invalidate_worker(struct work_struct *work)
	/* only lock protocol, inv can't call subsystems after shutdown */
	if (!linfo->shutdown) {
		ret = lock_invalidate(sb, lock, nl->old_mode, nl->new_mode);
		BUG_ON(ret);
		BUG_ON(ret < 0 && ret != -ENOLINK);
	}

	/* respond with the key and modes from the request, server might have died */
	ret = scoutfs_client_lock_response(sb, ireq->net_id, nl);
	if (ret == -ENOTCONN)
		ret = 0;
	BUG_ON(ret);
	BUG_ON(ret < 0 && ret != -ENOLINK);

	scoutfs_inc_counter(sb, lock_invalidate_response);
}
@@ -875,6 +879,69 @@ int scoutfs_lock_recover_request(struct super_block *sb, u64 net_id,
	return ret;
}

/*
 * This is called on every _lock call to try and keep the number of
 * locks under the idle count. We're intentionally trying to throttle
 * shrinking bursts by tying its frequency to lock use. It will only
 * send requests to free unused locks, though, so it's always possible
 * to exceed the high water mark under heavy load.
 *
 * We send a null request and the lock will be freed by the response
 * once all users drain. If this races with invalidation then the
 * server will only send the grant response once the invalidation is
 * finished.
 */
static bool try_shrink_lock(struct super_block *sb, struct lock_info *linfo, bool force)
{
	struct scoutfs_mount_options opts;
	struct scoutfs_lock *lock = NULL;
	struct scoutfs_net_lock nl;
	int ret = 0;

	scoutfs_options_read(sb, &opts);

	/* avoiding lock contention with unsynchronized test, don't mind temp false results */
	if (!force && (list_empty(&linfo->lru_list) ||
		       READ_ONCE(linfo->nr_locks) <= opts.lock_idle_count))
		return false;

	spin_lock(&linfo->lock);

	lock = list_first_entry_or_null(&linfo->lru_list, struct scoutfs_lock, lru_head);
	if (lock && (force || (linfo->nr_locks > opts.lock_idle_count))) {
		__lock_del_lru(linfo, lock);
		lock->request_pending = 1;

		nl.key = lock->start;
		nl.old_mode = lock->mode;
		nl.new_mode = SCOUTFS_LOCK_NULL;
	} else {
		lock = NULL;
	}

	spin_unlock(&linfo->lock);

	if (lock) {
		ret = scoutfs_client_lock_request(sb, &nl);
		if (ret < 0) {
			scoutfs_inc_counter(sb, lock_shrink_request_failed);

			spin_lock(&linfo->lock);

			lock->request_pending = 0;
			wake_up(&lock->waitq);
			__put_lock(linfo, lock, false);

			spin_unlock(&linfo->lock);
		} else {
			scoutfs_inc_counter(sb, lock_shrink_attempted);
			trace_scoutfs_lock_shrink(sb, lock);
		}
	}

	return lock && ret == 0;
}

static bool lock_wait_cond(struct super_block *sb, struct scoutfs_lock *lock,
			   enum scoutfs_lock_mode mode)
{
@@ -937,6 +1004,8 @@ static int lock_key_range(struct super_block *sb, enum scoutfs_lock_mode mode, i
	if (WARN_ON_ONCE(scoutfs_trans_held()))
		return -EDEADLK;

	try_shrink_lock(sb, linfo, false);

	spin_lock(&linfo->lock);

	/* drops and re-acquires lock if it allocates */
@@ -1380,134 +1449,12 @@ bool scoutfs_lock_protected(struct scoutfs_lock *lock, struct scoutfs_key *key,
		&lock->start, &lock->end) == 0;
}

/*
 * The shrink callback got the lock, marked it request_pending, and put
 * it on the shrink list. We send a null request and the lock will be
 * freed by the response once all users drain. If this races with
 * invalidation then the server will only send the grant response once
 * the invalidation is finished.
 */
static void lock_shrink_worker(struct work_struct *work)
{
	struct lock_info *linfo = container_of(work, struct lock_info,
					       shrink_work);
	struct super_block *sb = linfo->sb;
	struct scoutfs_net_lock nl;
	struct scoutfs_lock *lock;
	struct scoutfs_lock *tmp;
	LIST_HEAD(list);
	int ret;

	scoutfs_inc_counter(sb, lock_shrink_work);

	spin_lock(&linfo->lock);
	list_splice_init(&linfo->shrink_list, &list);
	spin_unlock(&linfo->lock);

	list_for_each_entry_safe(lock, tmp, &list, shrink_head) {
		list_del_init(&lock->shrink_head);

		/* unlocked lock access, but should be stable since we queued */
		nl.key = lock->start;
		nl.old_mode = lock->mode;
		nl.new_mode = SCOUTFS_LOCK_NULL;

		ret = scoutfs_client_lock_request(sb, &nl);
		if (ret) {
			/* oh well, not freeing */
			scoutfs_inc_counter(sb, lock_shrink_aborted);

			spin_lock(&linfo->lock);

			lock->request_pending = 0;
			wake_up(&lock->waitq);
			put_lock(linfo, lock);

			spin_unlock(&linfo->lock);
		}
	}
}

static unsigned long lock_count_objects(struct shrinker *shrink,
					struct shrink_control *sc)
{
	struct lock_info *linfo = KC_SHRINKER_CONTAINER_OF(shrink, struct lock_info);
	struct super_block *sb = linfo->sb;

	scoutfs_inc_counter(sb, lock_count_objects);

	return shrinker_min_long(linfo->lru_nr);
}

/*
 * Start the shrinking process for locks on the lru. If a lock is on
 * the lru then it can't have any active users. We don't want to block
 * or allocate here so all we do is get the lock, mark it request
 * pending, and kick off the work. The work sends a null request and
 * eventually the lock is freed by its response.
 *
 * Only a racing lock attempt that isn't matched can prevent the lock
 * from being freed. It'll block waiting to send its request for its
 * mode which will prevent the lock from being freed when the null
 * response arrives.
 */
static unsigned long lock_scan_objects(struct shrinker *shrink,
				       struct shrink_control *sc)
{
	struct lock_info *linfo = KC_SHRINKER_CONTAINER_OF(shrink, struct lock_info);
	struct super_block *sb = linfo->sb;
	struct scoutfs_lock *lock;
	struct scoutfs_lock *tmp;
	unsigned long freed = 0;
	unsigned long nr = sc->nr_to_scan;
	bool added = false;

	scoutfs_inc_counter(sb, lock_scan_objects);

	spin_lock(&linfo->lock);

restart:
	list_for_each_entry_safe(lock, tmp, &linfo->lru_list, lru_head) {

		BUG_ON(!lock_idle(lock));
		BUG_ON(lock->mode == SCOUTFS_LOCK_NULL);
		BUG_ON(!list_empty(&lock->shrink_head));

		if (nr-- == 0)
			break;

		__lock_del_lru(linfo, lock);
		lock->request_pending = 1;
		list_add_tail(&lock->shrink_head, &linfo->shrink_list);
		added = true;
		freed++;

		scoutfs_inc_counter(sb, lock_shrink_attempted);
		trace_scoutfs_lock_shrink(sb, lock);

		/* could have bazillions of idle locks */
		if (cond_resched_lock(&linfo->lock))
			goto restart;
	}

	spin_unlock(&linfo->lock);

	if (added)
		queue_work(linfo->workq, &linfo->shrink_work);

	trace_scoutfs_lock_shrink_exit(sb, sc->nr_to_scan, freed);
	return freed;
}

void scoutfs_free_unused_locks(struct super_block *sb)
{
	struct lock_info *linfo = SCOUTFS_SB(sb)->lock_info;
	struct shrink_control sc = {
		.gfp_mask = GFP_NOFS,
		.nr_to_scan = INT_MAX,
	};
	DECLARE_LOCK_INFO(sb, linfo);

	lock_scan_objects(KC_SHRINKER_FN(&linfo->shrinker), &sc);
	while (try_shrink_lock(sb, linfo, true))
		cond_resched();
}

static void lock_tseq_show(struct seq_file *m, struct scoutfs_tseq_entry *ent)
@@ -1590,10 +1537,10 @@ u64 scoutfs_lock_ino_refresh_gen(struct super_block *sb, u64 ino)
 * transitions and sending requests. We set the shutdown flag to catch
 * anyone who breaks this rule.
 *
 * We unregister the shrinker so that we won't try and send null
 * requests in response to memory pressure. The locks will all be
 * unceremoniously dropped once we get a farewell response from the
 * server which indicates that they destroyed our locking state.
 * With no more lock callers, we'll no longer try to shrink the pool of
 * granted locks. We'll free all of them as _destroy() is called after
 * the farewell response indicates that the server tore down all our
 * lock state.
 *
 * We will still respond to invalidation requests that have to be
 * processed to let unmount in other mounts acquire locks and make
@@ -1613,10 +1560,6 @@ void scoutfs_lock_shutdown(struct super_block *sb)

	trace_scoutfs_lock_shutdown(sb, linfo);

	/* stop the shrinker from queueing work */
	KC_UNREGISTER_SHRINKER(&linfo->shrinker);
	flush_work(&linfo->shrink_work);

	/* cause current and future lock calls to return errors */
	spin_lock(&linfo->lock);
	linfo->shutdown = true;
@@ -1707,8 +1650,6 @@ void scoutfs_lock_destroy(struct super_block *sb)
			list_del_init(&lock->inv_head);
			lock->invalidate_pending = 0;
		}
		if (!list_empty(&lock->shrink_head))
			list_del_init(&lock->shrink_head);
		lock_remove(linfo, lock);
		lock_free(linfo, lock);
	}
@@ -1733,14 +1674,9 @@ int scoutfs_lock_setup(struct super_block *sb)
	spin_lock_init(&linfo->lock);
	linfo->lock_tree = RB_ROOT;
	linfo->lock_range_tree = RB_ROOT;
	KC_INIT_SHRINKER_FUNCS(&linfo->shrinker, lock_count_objects,
			       lock_scan_objects);
	KC_REGISTER_SHRINKER(&linfo->shrinker, "scoutfs-lock:" SCSBF, SCSB_ARGS(sb));
	INIT_LIST_HEAD(&linfo->lru_list);
	INIT_WORK(&linfo->inv_work, lock_invalidate_worker);
	INIT_LIST_HEAD(&linfo->inv_list);
	INIT_WORK(&linfo->shrink_work, lock_shrink_worker);
	INIT_LIST_HEAD(&linfo->shrink_list);
	atomic64_set(&linfo->next_refresh_gen, 0);
	scoutfs_tseq_tree_init(&linfo->tseq_tree, lock_tseq_show);

kmod/src/net.c (230 changed lines)
@@ -21,6 +21,7 @@
#include <net/tcp.h>
#include <linux/log2.h>
#include <linux/jhash.h>
#include <linux/rbtree.h>

#include "format.h"
#include "counters.h"
@@ -122,9 +123,10 @@ enum spin_lock_subtype {
 */
struct message_send {
	struct scoutfs_tseq_entry tseq_entry;
	unsigned long dead:1;
	struct list_head head;
	struct rb_node node;
	scoutfs_net_response_t resp_func;
	struct list_head head;
	struct list_head dead_head; /* o/~ playin' in the band o/~ */
	void *resp_data;
	struct scoutfs_net_header nh;
};
@@ -161,58 +163,133 @@ static bool nh_is_request(struct scoutfs_net_header *nh)
	return !nh_is_response(nh);
}

static bool msend_is_dead(struct message_send *msend)
{
	return !list_empty(&msend->dead_head);
}

static int cmp_sorted_msend(u64 pos, struct message_send *msend)
{
	if (nh_is_request(&msend->nh))
		return pos < le64_to_cpu(msend->nh.id) ? -1 :
		       pos > le64_to_cpu(msend->nh.id) ? 1 : 0;
	else
		return pos < le64_to_cpu(msend->nh.seq) ? -1 :
		       pos > le64_to_cpu(msend->nh.seq) ? 1 : 0;
}

static struct message_send *search_sorted_msends(struct rb_root *root, u64 pos, struct rb_node *ins)
{
	struct rb_node **node = &root->rb_node;
	struct rb_node *parent = NULL;
	struct message_send *msend = NULL;
	struct message_send *next = NULL;
	int cmp = -1;

	while (*node) {
		parent = *node;
		msend = container_of(*node, struct message_send, node);

		cmp = cmp_sorted_msend(pos, msend);
		if (cmp < 0) {
			next = msend;
			node = &(*node)->rb_left;
		} else if (cmp > 0) {
			node = &(*node)->rb_right;
		} else {
			next = msend;
			break;
		}
	}

	BUG_ON(cmp == 0 && ins);

	if (ins) {
		rb_link_node(ins, parent, node);
		rb_insert_color(ins, root);
	}

	return next;
}

static struct message_send *next_sorted_msend(struct message_send *msend)
{
	struct rb_node *node = rb_next(&msend->node);

	return node ? rb_entry(node, struct message_send, node) : NULL;
}

#define for_each_sorted_msend(MSEND_, TMP_, ROOT_, POS_) \
	for (MSEND_ = search_sorted_msends(ROOT_, POS_, NULL); \
	     MSEND_ != NULL && ({ TMP_ = next_sorted_msend(MSEND_); true; }); \
	     MSEND_ = TMP_)

static void insert_sorted_msend(struct scoutfs_net_connection *conn, struct message_send *msend)
{
	BUG_ON(!RB_EMPTY_NODE(&msend->node));

	if (nh_is_request(&msend->nh))
		search_sorted_msends(&conn->req_root, le64_to_cpu(msend->nh.id), &msend->node);
	else
		search_sorted_msends(&conn->resp_root, le64_to_cpu(msend->nh.seq), &msend->node);
}

static void erase_sorted_msend(struct scoutfs_net_connection *conn, struct message_send *msend)
{
	if (!RB_EMPTY_NODE(&msend->node)) {
		if (nh_is_request(&msend->nh))
			rb_erase(&msend->node, &conn->req_root);
		else
			rb_erase(&msend->node, &conn->resp_root);
		RB_CLEAR_NODE(&msend->node);
	}
}

static void move_sorted_msends(struct scoutfs_net_connection *dst_conn, struct rb_root *dst_root,
			       struct scoutfs_net_connection *src_conn, struct rb_root *src_root)
{
	struct message_send *msend;
	struct message_send *tmp;

	for_each_sorted_msend(msend, tmp, src_root, 0) {
		erase_sorted_msend(src_conn, msend);
		insert_sorted_msend(dst_conn, msend);
	}
}

/*
 * We return dead requests so that the caller can stop searching other
 * lists for the dead request that we found.
 * Pending requests are uniquely identified by the id they were assigned
 * as they were first put on the send queue.
 */
static struct message_send *search_list(struct scoutfs_net_connection *conn,
					struct list_head *list,
					u8 cmd, u64 id)
static struct message_send *find_request(struct scoutfs_net_connection *conn, u64 id)
{
	struct message_send *msend;

	assert_spin_locked(&conn->lock);

	list_for_each_entry(msend, list, head) {
		if (nh_is_request(&msend->nh) && msend->nh.cmd == cmd &&
		    le64_to_cpu(msend->nh.id) == id)
			return msend;
	}

	return NULL;
}

/*
 * Find an active send request on the lists. It's almost certainly
 * waiting on the resend queue but it could be actively being sent.
 */
static struct message_send *find_request(struct scoutfs_net_connection *conn,
					 u8 cmd, u64 id)
{
	struct message_send *msend;

	msend = search_list(conn, &conn->resend_queue, cmd, id) ?:
		search_list(conn, &conn->send_queue, cmd, id);
	if (msend && msend->dead)
	msend = search_sorted_msends(&conn->req_root, id, NULL);
	if (msend && le64_to_cpu(msend->nh.id) != id)
		msend = NULL;

	return msend;
}

/*
 * Complete a send message by moving it to the send queue and marking it
 * to be freed. It won't be visible to callers trying to find sends.
 * Free a message by moving it to the dead list and queueing the send
 * work to free it. Once considered dead the message won't be sent or
 * visible as a request for response processing.
 */
static void complete_send(struct scoutfs_net_connection *conn,
			  struct message_send *msend)
static void queue_dead_free(struct scoutfs_net_connection *conn, struct message_send *msend)
{
	assert_spin_locked(&conn->lock);

	if (WARN_ON_ONCE(msend->dead) ||
	/* callers look up the message in the sorted roots, which dead messages are removed from */
	if (WARN_ON_ONCE(msend_is_dead(msend)) ||
	    WARN_ON_ONCE(list_empty(&msend->head)))
		return;

	msend->dead = 1;
	list_move(&msend->head, &conn->send_queue);
	erase_sorted_msend(conn, msend);
	list_add_tail(&msend->dead_head, &conn->dead_list);
	queue_work(conn->workq, &conn->send_work);
}

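Illustrative aside, not part of the patch: a small userspace C sketch of what the new sorted indexes provide -- find the entry whose key is at or after a position, check for an exact id match the way find_request() now does, and walk the remaining entries in order the way for_each_sorted_msend() does. The sorted array and the sample ids are hypothetical stand-ins for the rbtree and real message ids.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* request ids kept in sorted order, as the rbtree does for msend->nh.id */
static const uint64_t req_ids[] = { 2, 5, 9, 14, 21 };
#define NR (sizeof(req_ids) / sizeof(req_ids[0]))

/* return the index of the first id at or after pos, or NR if none */
static size_t first_at_or_after(uint64_t pos)
{
	size_t lo = 0, hi = NR;

	while (lo < hi) {
		size_t mid = lo + (hi - lo) / 2;

		if (req_ids[mid] < pos)
			lo = mid + 1;
		else
			hi = mid;
	}
	return lo;
}

int main(void)
{
	size_t i;

	/* exact lookup, like resolving a response's request id */
	i = first_at_or_after(9);
	if (i < NR && req_ids[i] == 9)
		printf("found request id 9\n");

	/* resumable in-order walk, starting from position 6 */
	for (i = first_at_or_after(6); i < NR; i++)
		printf("pending id %llu\n", (unsigned long long)req_ids[i]);

	return 0;
}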
@@ -369,7 +446,8 @@ static int submit_send(struct super_block *sb,

	msend->resp_func = resp_func;
	msend->resp_data = resp_data;
	msend->dead = 0;
	INIT_LIST_HEAD(&msend->dead_head);
	RB_CLEAR_NODE(&msend->node);

	msend->nh.seq = cpu_to_le64(seq);
	msend->nh.recv_seq = 0; /* set when sent, not when queued */
@@ -390,6 +468,7 @@ static int submit_send(struct super_block *sb,
	} else {
		list_add_tail(&msend->head, &conn->resend_queue);
	}
	insert_sorted_msend(conn, msend);

	if (id_ret)
		*id_ret = le64_to_cpu(msend->nh.id);
@@ -455,11 +534,11 @@ static int process_response(struct scoutfs_net_connection *conn,

	spin_lock(&conn->lock);

	msend = find_request(conn, mrecv->nh.cmd, le64_to_cpu(mrecv->nh.id));
	msend = find_request(conn, le64_to_cpu(mrecv->nh.id));
	if (msend) {
		resp_func = msend->resp_func;
		resp_data = msend->resp_data;
		complete_send(conn, msend);
		queue_dead_free(conn, msend);
	} else {
		scoutfs_inc_counter(sb, net_dropped_response);
	}
@@ -546,47 +625,22 @@ static void queue_ordered_proc(struct scoutfs_net_connection *conn, struct messa
	queue_work(conn->workq, &wlist->work);
}

/*
 * Free live responses up to and including the seq by marking them dead
 * and moving them to the send queue to be freed.
 */
static bool move_acked_responses(struct scoutfs_net_connection *conn,
				 struct list_head *list, u64 seq)
{
	struct message_send *msend;
	struct message_send *tmp;
	bool moved = false;

	assert_spin_locked(&conn->lock);

	list_for_each_entry_safe(msend, tmp, list, head) {
		if (le64_to_cpu(msend->nh.seq) > seq)
			break;
		if (!nh_is_response(&msend->nh) || msend->dead)
			continue;

		msend->dead = 1;
		list_move(&msend->head, &conn->send_queue);
		moved = true;
	}

	return moved;
}

/* acks are processed inline in the recv worker */
static void free_acked_responses(struct scoutfs_net_connection *conn, u64 seq)
{
	bool moved;
	struct message_send *msend;
	struct message_send *tmp;

	spin_lock(&conn->lock);

	moved = move_acked_responses(conn, &conn->send_queue, seq) |
		move_acked_responses(conn, &conn->resend_queue, seq);
	for_each_sorted_msend(msend, tmp, &conn->resp_root, 0) {
		if (le64_to_cpu(msend->nh.seq) > seq)
			break;

		queue_dead_free(conn, msend);
	}

	spin_unlock(&conn->lock);

	if (moved)
		queue_work(conn->workq, &conn->send_work);
}

static int k_recvmsg(struct socket *sock, void *buf, unsigned len)
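Illustrative aside, not part of the patch: a minimal userspace sketch of the ack handling that free_acked_responses() implements above -- every response whose sequence number is at or below the acked sequence is freed, and later responses stay queued for resend. The array of sequence numbers and the printf are hypothetical stand-ins for the resp_root entries and queue_dead_free().

#include <stdint.h>
#include <stdio.h>

/* sent responses, ordered by sequence number */
static uint64_t resp_seqs[] = { 11, 12, 13, 17, 18 };
static int nr_resps = 5;

/* the peer's ack covers every response up to and including acked_seq */
static void free_acked(uint64_t acked_seq)
{
	int kept = 0;

	for (int i = 0; i < nr_resps; i++) {
		if (resp_seqs[i] <= acked_seq)
			printf("freeing response seq %llu\n",
			       (unsigned long long)resp_seqs[i]);
		else
			resp_seqs[kept++] = resp_seqs[i];
	}
	nr_resps = kept;
}

int main(void)
{
	free_acked(13);	/* frees 11, 12, 13; 17 and 18 remain unacked */
	printf("%d responses still pending\n", nr_resps);
	return 0;
}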
@@ -824,9 +878,13 @@ static int k_sendmsg_full(struct socket *sock, struct kvec *kv, unsigned long nr
	return ret;
}

static void free_msend(struct net_info *ninf, struct message_send *msend)
static void free_msend(struct net_info *ninf, struct scoutfs_net_connection *conn,
		       struct message_send *msend)
{
	list_del_init(&msend->head);
	if (!list_empty(&msend->dead_head))
		list_del_init(&msend->dead_head);
	erase_sorted_msend(conn, msend);
	scoutfs_tseq_del(&ninf->msg_tseq_tree, &msend->tseq_entry);
	kfree(msend);
}
@@ -866,12 +924,11 @@ static void scoutfs_net_send_worker(struct work_struct *work)
	count = 0;

	spin_lock(&conn->lock);
	list_for_each_entry_safe(msend, _msend_, &conn->send_queue, head) {
		if (msend->dead) {
			free_msend(ninf, msend);
			continue;
		}

	list_for_each_entry_safe(msend, _msend_, &conn->dead_list, dead_head)
		free_msend(ninf, conn, msend);

	list_for_each_entry_safe(msend, _msend_, &conn->send_queue, head) {
		len = nh_bytes(le16_to_cpu(msend->nh.data_len));

		if ((msend->nh.cmd == SCOUTFS_NET_CMD_FAREWELL) &&
@@ -909,7 +966,7 @@ static void scoutfs_net_send_worker(struct work_struct *work)
		msend->nh.recv_seq = 0;

		/* resend if it wasn't freed while we sent */
		if (!msend->dead)
		if (!msend_is_dead(msend))
			list_move_tail(&msend->head, &conn->resend_queue);

		if (--nr_segs == 0)
@@ -957,7 +1014,7 @@ static void scoutfs_net_destroy_worker(struct work_struct *work)

	list_splice_init(&conn->resend_queue, &conn->send_queue);
	list_for_each_entry_safe(msend, tmp, &conn->send_queue, head)
		free_msend(ninf, msend);
		free_msend(ninf, conn, msend);

	/* accepted sockets are removed from their listener's list */
	if (conn->listening_conn) {
@@ -1303,7 +1360,7 @@ static void scoutfs_net_shutdown_worker(struct work_struct *work)
				    struct message_send, head))) {
		resp_func = msend->resp_func;
		resp_data = msend->resp_data;
		free_msend(ninf, msend);
		free_msend(ninf, conn, msend);
		spin_unlock(&conn->lock);

		call_resp_func(sb, conn, resp_func, resp_data, NULL, 0, -ECONNABORTED);
@@ -1319,7 +1376,7 @@ static void scoutfs_net_shutdown_worker(struct work_struct *work)
	list_splice_tail_init(&conn->send_queue, &conn->resend_queue);
	list_for_each_entry_safe(msend, tmp, &conn->resend_queue, head) {
		if (msend->nh.cmd == SCOUTFS_NET_CMD_GREETING)
			free_msend(ninf, msend);
			free_msend(ninf, conn, msend);
	}

	clear_conn_fl(conn, saw_greeting);
@@ -1493,6 +1550,9 @@ scoutfs_net_alloc_conn(struct super_block *sb,
	atomic64_set(&conn->recv_seq, 0);
	INIT_LIST_HEAD(&conn->send_queue);
	INIT_LIST_HEAD(&conn->resend_queue);
	INIT_LIST_HEAD(&conn->dead_list);
	conn->req_root = RB_ROOT;
	conn->resp_root = RB_ROOT;
	INIT_WORK(&conn->listen_work, scoutfs_net_listen_worker);
	INIT_WORK(&conn->connect_work, scoutfs_net_connect_worker);
	INIT_WORK(&conn->send_work, scoutfs_net_send_worker);
@@ -1703,10 +1763,8 @@ void scoutfs_net_client_greeting(struct super_block *sb,

	if (new_server) {
		atomic64_set(&conn->recv_seq, 0);
		list_for_each_entry_safe(msend, tmp, &conn->resend_queue, head) {
			if (nh_is_response(&msend->nh))
				free_msend(ninf, msend);
		}
		for_each_sorted_msend(msend, tmp, &conn->resp_root, 0)
			free_msend(ninf, conn, msend);
	}

	set_valid_greeting(conn);
@@ -1808,6 +1866,8 @@ restart:
	BUG_ON(!list_empty(&reconn->send_queue));
	/* queued greeting response is racing, can be in send or resend queue */
	list_splice_tail_init(&reconn->resend_queue, &conn->resend_queue);
	move_sorted_msends(conn, &conn->req_root, reconn, &reconn->req_root);
	move_sorted_msends(conn, &conn->resp_root, reconn, &reconn->resp_root);

	/* new conn info is unused, swap, old won't call down */
	swap(conn->info, reconn->info);

@@ -67,6 +67,9 @@ struct scoutfs_net_connection {
	u64 next_send_id;
	struct list_head send_queue;
	struct list_head resend_queue;
	struct list_head dead_list;
	struct rb_root req_root;
	struct rb_root resp_root;

	atomic64_t recv_seq;
	unsigned int ordered_proc_nr;

@@ -34,6 +34,7 @@ enum {
	Opt_data_prealloc_blocks,
	Opt_data_prealloc_contig_only,
	Opt_ino_alloc_per_lock,
	Opt_lock_idle_count,
	Opt_log_merge_wait_timeout_ms,
	Opt_metadev_path,
	Opt_noacl,
@@ -49,6 +50,7 @@ static const match_table_t tokens = {
	{Opt_data_prealloc_blocks, "data_prealloc_blocks=%s"},
	{Opt_data_prealloc_contig_only, "data_prealloc_contig_only=%s"},
	{Opt_ino_alloc_per_lock, "ino_alloc_per_lock=%s"},
	{Opt_lock_idle_count, "lock_idle_count=%s"},
	{Opt_log_merge_wait_timeout_ms, "log_merge_wait_timeout_ms=%s"},
	{Opt_metadev_path, "metadev_path=%s"},
	{Opt_noacl, "noacl"},
@@ -119,6 +121,10 @@ static void free_options(struct scoutfs_mount_options *opts)
	kfree(opts->metadev_path);
}

#define MIN_LOCK_IDLE_COUNT 32
#define DEFAULT_LOCK_IDLE_COUNT (10 * 1000)
#define MAX_LOCK_IDLE_COUNT (100 * 1000)

#define MIN_LOG_MERGE_WAIT_TIMEOUT_MS 100UL
#define DEFAULT_LOG_MERGE_WAIT_TIMEOUT_MS 500
#define MAX_LOG_MERGE_WAIT_TIMEOUT_MS (60 * MSEC_PER_SEC)
@@ -139,6 +145,7 @@ static void init_default_options(struct scoutfs_mount_options *opts)
	opts->data_prealloc_blocks = SCOUTFS_DATA_PREALLOC_DEFAULT_BLOCKS;
	opts->data_prealloc_contig_only = 1;
	opts->ino_alloc_per_lock = SCOUTFS_LOCK_INODE_GROUP_NR;
	opts->lock_idle_count = DEFAULT_LOCK_IDLE_COUNT;
	opts->log_merge_wait_timeout_ms = DEFAULT_LOG_MERGE_WAIT_TIMEOUT_MS;
	opts->orphan_scan_delay_ms = -1;
	opts->quorum_heartbeat_timeout_ms = SCOUTFS_QUORUM_DEF_HB_TIMEO_MS;
@@ -146,6 +153,21 @@ static void init_default_options(struct scoutfs_mount_options *opts)
	opts->tcp_keepalive_timeout_ms = DEFAULT_TCP_KEEPALIVE_TIMEOUT_MS;
}

static int verify_lock_idle_count(struct super_block *sb, int ret, int val)
{
	if (ret < 0) {
		scoutfs_err(sb, "failed to parse lock_idle_count value");
		return -EINVAL;
	}
	if (val < MIN_LOCK_IDLE_COUNT || val > MAX_LOCK_IDLE_COUNT) {
		scoutfs_err(sb, "invalid lock_idle_count value %d, must be between %u and %u",
			    val, MIN_LOCK_IDLE_COUNT, MAX_LOCK_IDLE_COUNT);
		return -EINVAL;
	}

	return 0;
}

static int verify_log_merge_wait_timeout_ms(struct super_block *sb, int ret, int val)
{
	if (ret < 0) {
@@ -261,6 +283,14 @@ static int parse_options(struct super_block *sb, char *options, struct scoutfs_m
			opts->tcp_keepalive_timeout_ms = nr;
			break;

		case Opt_lock_idle_count:
			ret = match_int(args, &nr);
			ret = verify_lock_idle_count(sb, ret, nr);
			if (ret < 0)
				return ret;
			opts->lock_idle_count = nr;
			break;

		case Opt_log_merge_wait_timeout_ms:
			ret = match_int(args, &nr);
			ret = verify_log_merge_wait_timeout_ms(sb, ret, nr);
@@ -536,6 +566,43 @@ static ssize_t ino_alloc_per_lock_store(struct kobject *kobj, struct kobj_attrib
}
SCOUTFS_ATTR_RW(ino_alloc_per_lock);

static ssize_t lock_idle_count_show(struct kobject *kobj, struct kobj_attribute *attr,
				    char *buf)
{
	struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
	struct scoutfs_mount_options opts;

	scoutfs_options_read(sb, &opts);

	return snprintf(buf, PAGE_SIZE, "%u", opts.lock_idle_count);
}
static ssize_t lock_idle_count_store(struct kobject *kobj, struct kobj_attribute *attr,
				     const char *buf, size_t count)
{
	struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
	DECLARE_OPTIONS_INFO(sb, optinf);
	char nullterm[30]; /* more than enough for octal -U64_MAX */
	int val;
	int len;
	int ret;

	len = min(count, sizeof(nullterm) - 1);
	memcpy(nullterm, buf, len);
	nullterm[len] = '\0';

	ret = kstrtoint(nullterm, 0, &val);
	ret = verify_lock_idle_count(sb, ret, val);
	if (ret == 0) {
		write_seqlock(&optinf->seqlock);
		optinf->opts.lock_idle_count = val;
		write_sequnlock(&optinf->seqlock);
		ret = count;
	}

	return ret;
}
SCOUTFS_ATTR_RW(lock_idle_count);

static ssize_t log_merge_wait_timeout_ms_show(struct kobject *kobj, struct kobj_attribute *attr,
					      char *buf)
{
@@ -677,6 +744,7 @@ static struct attribute *options_attrs[] = {
	SCOUTFS_ATTR_PTR(data_prealloc_blocks),
	SCOUTFS_ATTR_PTR(data_prealloc_contig_only),
	SCOUTFS_ATTR_PTR(ino_alloc_per_lock),
	SCOUTFS_ATTR_PTR(lock_idle_count),
	SCOUTFS_ATTR_PTR(log_merge_wait_timeout_ms),
	SCOUTFS_ATTR_PTR(metadev_path),
	SCOUTFS_ATTR_PTR(orphan_scan_delay_ms),

@@ -9,6 +9,7 @@ struct scoutfs_mount_options {
	u64 data_prealloc_blocks;
	bool data_prealloc_contig_only;
	unsigned int ino_alloc_per_lock;
	int lock_idle_count;
	unsigned int log_merge_wait_timeout_ms;
	char *metadev_path;
	unsigned int orphan_scan_delay_ms;

@@ -1619,28 +1619,6 @@ DEFINE_EVENT(scoutfs_work_class, scoutfs_data_return_server_extents_exit,
	TP_ARGS(sb, data, ret)
);

DECLARE_EVENT_CLASS(scoutfs_shrink_exit_class,
	TP_PROTO(struct super_block *sb, unsigned long nr_to_scan, int ret),
	TP_ARGS(sb, nr_to_scan, ret),
	TP_STRUCT__entry(
		__field(void *, sb)
		__field(unsigned long, nr_to_scan)
		__field(int, ret)
	),
	TP_fast_assign(
		__entry->sb = sb;
		__entry->nr_to_scan = nr_to_scan;
		__entry->ret = ret;
	),
	TP_printk("sb %p nr_to_scan %lu ret %d",
		  __entry->sb, __entry->nr_to_scan, __entry->ret)
);

DEFINE_EVENT(scoutfs_shrink_exit_class, scoutfs_lock_shrink_exit,
	TP_PROTO(struct super_block *sb, unsigned long nr_to_scan, int ret),
	TP_ARGS(sb, nr_to_scan, ret)
);

TRACE_EVENT(scoutfs_rename,
	TP_PROTO(struct super_block *sb, struct inode *old_dir,
		 struct dentry *old_dentry, struct inode *new_dir,

@@ -63,6 +63,22 @@ mounts because there are more locks that cover the same number of
created files. This can be helpful when working with smaller numbers of
large files.
.TP
.B lock_idle_count=<number>
This option sets the number of locks that the client will allow to
remain idle after being granted. If the number of locks exceeds this
count then the client will try to free the oldest locks. This setting
is per-mount and only changes the behavior of that mount.
.sp
Idle locks are not reclaimed by memory pressure so this option
determines the limit of how much memory is likely to be pinned by
allocated idle locks. Setting this too low can increase latency of
operations as repeated use of a working set of locks has to request the
locks from the network rather than using granted idle locks.
.sp
The count is not strictly enforced. Operations are allowed to use locks
while over the limit to avoid deadlocks under heavy concurrent load.
Exceeding the count only attempts freeing of idle locks.
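Illustrative aside, not part of the patch: with the defaults and limits added in the option parsing changes above, mounting with lock_idle_count=50000 would let up to 50,000 granted locks sit idle before the client starts sending null requests to free the oldest; the accepted range is 32 to 100,000 and the default is 10,000, with out-of-range values rejected.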
.TP
.B log_merge_wait_timeout_ms=<number>
This option sets the amount of time, in milliseconds, that log merge
creation can wait before timing out. This setting is per-mount, only