Compare commits

...

19 Commits

Author SHA1 Message Date
Auke Kok
732637d372 Resolve merge conflict from zab/shrink cleanup 2025-10-07 12:22:53 -07:00
Auke Kok
963591cc9a Fix a sparse warning in net.c 2025-10-07 12:22:40 -07:00
Auke Kok
ad79ee94f9 Add tcp_keepalive_timeout_ms option.
The default TCP keepalive value is currently 10s, resulting in clients
being disconnected after 10 seconds of not replying to a TCP keepalive
packet. These keepalive values are reasonable most of the time, but
we've seen client disconnects where this timeout was exceeded,
resulting in fencing. The cause for this is unknown at this time, but
it is suspected that brief network interruptions are happening.

This change adds a configurable value for this specific client socket
timeout. It enforces that its value is above UNRESPONSIVE_PROBES, whose
value remains unchanged.

The default value of 10000ms (10s) remains the trusted value. It is
entirely unclear and untested which values are reasonable and which
are not. Since this setting can and will interact with other timeout
values, care must be taken not to exceed certain other timeouts. I've
tested this only briefly with values of 5000 and 25000; values outside
that range are likely problematic.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2025-10-07 12:16:23 -07:00
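A rough sketch of how the option is expected to feed the socket settings, mirroring the sock_opts_and_names() hunk further down; apply_keepalive_opt() is an illustrative name, not a function in the tree:

static int apply_keepalive_opt(struct super_block *sb, struct socket *sock)
{
        struct scoutfs_mount_options opts;
        int optval;
        int ret;

        scoutfs_options_read(sb, &opts);

        /* start probing after the configured timeout, less a second per probe */
        optval = (opts.tcp_keepalive_timeout_ms / MSEC_PER_SEC) - UNRESPONSIVE_PROBES;
        ret = kc_tcp_sock_set_keepidle(sock, optval);
        if (ret)
                return ret;

        /* unacked sent data also gives up after the full configured timeout */
        optval = opts.tcp_keepalive_timeout_ms;
        return kc_tcp_sock_set_user_timeout(sock, optval);
}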
Zach Brown
65ea250de9 Remove msghdr iov_iter kernelcompat
This removes the KC_MSGHDR_STRUCT_IOV_ITER kernel compat.
kernel_{send,recv}msg() initializes either msg_iov or msg_iter.

This isn't a clean revert of "69068ae2 Initialize msg.msg_iter from
iovec." because previous patches fixed the order of arguments, and the
net send caller was removed.

Signed-off-by: Zach Brown <zab@versity.com>
2025-10-07 12:15:59 -07:00
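A minimal sketch of the pattern the compat removal relies on: kernel_recvmsg() builds the iterator from the passed kvec itself, so callers only need a zeroed msghdr with flags. recv_exact() is an illustrative name; the equivalent helper in the diff below is k_recvmsg().

static int recv_exact(struct socket *sock, void *buf, unsigned int len)
{
        struct kvec kv = { .iov_base = buf, .iov_len = len };
        struct msghdr msg = { .msg_flags = MSG_NOSIGNAL };

        /* no iov_iter compat needed; kernel_recvmsg() sets up the iterator */
        return kernel_recvmsg(sock, &msg, &kv, 1, len, msg.msg_flags);
}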
Zach Brown
86ca09ed7d Send messages in batches
Previous work had the receiver try to receive multiple messages in bulk.
This does the same for the sender.

We walk the send queue and initialize a vector that we then send with
one call.  This is intentionally similar to the single message sending
pattern to avoid unintended changes.

Along with the changes to receive in bulk, this ended up increasing the
message processing rate by about 6x when both send and receive were
going full throttle.

Signed-off-by: Zach Brown <zab@versity.com>
2025-10-07 12:15:51 -07:00
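A condensed sketch of the batching pattern, following the scoutfs_net_send_worker() hunk further down; locking and the resend bookkeeping are omitted here:

struct kvec kv[16];
unsigned long nr_segs = 0;
size_t count = 0;
struct message_send *msend;
int len;

/* gather queued messages into one vector */
list_for_each_entry(msend, &conn->send_queue, head) {
        len = nh_bytes(le16_to_cpu(msend->nh.data_len));
        kv[nr_segs].iov_base = &msend->nh;
        kv[nr_segs].iov_len = len;
        count += len;
        if (++nr_segs == ARRAY_SIZE(kv))
                break;
}

/* one sendmsg call for the whole batch */
if (nr_segs)
        ret = k_sendmsg_full(conn->sock, kv, nr_segs, count);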
Zach Brown
5681920bfe Fix swapped sendmsg nr_segs/count
When the msg_iter compat was added the iter was initialized with nr_segs
and count swapped.  I'm not convinced this had any effect because the
kernel_{send,recv}msg() call would initialize msg_iter again with the
correct arguments.

Signed-off-by: Zach Brown <zab@versity.com>
2025-10-07 12:15:43 -07:00
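For reference, with the iov_iter_init() signature those kernels had, nr_segs comes before count; the compat path had them reversed:

/* swapped (old):  iov_iter_init(&msg.msg_iter, WRITE, (struct iovec *)&kv, len, 1); */
iov_iter_init(&msg.msg_iter, WRITE, (struct iovec *)&kv, 1, len);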
Zach Brown
6c2ccf75ea Receive incoming messages in bulk
Our messaging layer is used for small control messages, not large data
payloads.  By calling recvmsg twice for every incoming message we're
hitting the socket lock reasonably hard.  With senders doing the same,
and a lot of messages flowing in each direction, the contention is
non-trivial.

This changes the receiver to copy as much of the incoming stream into a
page that is then framed and copied again into individual allocated
messages that can be processed concurrently.  We're avoiding contention
with the sender on the socket at the cost of additional copies of our
small messages.

Signed-off-by: Zach Brown <zab@versity.com>
2025-10-07 12:15:34 -07:00
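A simplified sketch of the framing loop (see scoutfs_net_recv_worker() in the diff below); error handling is trimmed and the partial message is slid to the front of the page on every pass rather than only when space runs low:

struct scoutfs_net_header *nh;
unsigned int data_len;
int hdr_off = 0;
int rx_off = 0;
int ret;

for (;;) {
        ret = k_recvmsg(sock, page_address(page) + rx_off, PAGE_SIZE - rx_off);
        if (ret <= 0)
                break;
        rx_off += ret;

        /* carve out every complete header + payload in the page */
        while (rx_off - hdr_off >= sizeof(struct scoutfs_net_header)) {
                nh = page_address(page) + hdr_off;
                data_len = le16_to_cpu(nh->data_len);
                if (sizeof(struct scoutfs_net_header) + data_len > rx_off - hdr_off)
                        break;
                recv_one_message(sb, ninf, conn, nh, data_len);
                hdr_off += sizeof(struct scoutfs_net_header) + data_len;
        }

        /* keep any partial message at the front of the page */
        memmove(page_address(page), page_address(page) + hdr_off, rx_off - hdr_off);
        rx_off -= hdr_off;
        hdr_off = 0;
}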
Zach Brown
a818b9e461 Process client lock messages in ordered work
The lock client can't handle some messages being processed out of
order. Previously it detected message ordering itself, but missed some
cases. Receive processing was then changed to always call lock message
processing from the recv work to globally order all lock messages.

This inline processing was contributing to excessive latencies in making
our way through the incoming receive queue, delaying work that would
otherwise be parallel once we got it off the recv queue.

This was seen in practice when a giant flood of lock shrink messages
arrived at the client. It processed each in turn, starving a statfs
response long enough to trigger the hung task warning.

This fix does two things.

First, it moves ordered recv processing out of the recv work.  It lets
the recv work drain the socket quickly and turn it into a list that the
ordered work is consuming.  Other messages will have a chance to be
received and queued to their processing work without having to wait for
the ordered work to be processed.

Second, it adds parallelism to the ordered processing. The incoming
lock messages don't need global ordering; they need ordering within
each lock. We add an arbitrary but reasonable number of ordered workers
and hash lock messages to each worker based on the lock's key.

Signed-off-by: Zach Brown <zab@versity.com>
2025-10-07 12:15:20 -07:00
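The per-lock hashing is small enough to quote as a sketch (it matches queue_ordered_proc() in the diff below): the lock key picks one of a fixed set of ordered work lists, so messages for the same lock stay in arrival order while different locks proceed in parallel.

struct scoutfs_work_list *wlist;
struct scoutfs_net_lock *nl;
u32 h;

/* hash the lock key to one ordered work list */
nl = (void *)mrecv->nh.data;
h = jhash(&nl->key, sizeof(struct scoutfs_key), 0x6fdd3cd5);
wlist = &conn->ordered_proc_wlists[h % conn->ordered_proc_nr];

spin_lock(&wlist->lock);
list_add_tail(&mrecv->ordered_head, &wlist->list);
spin_unlock(&wlist->lock);
queue_work(conn->workq, &wlist->work);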
Zach Brown
b9f8eee59e Use list_lru for block cache shrinking
The block cache had a bizarre cache eviction policy that was trying to
avoid precise LRU updates at each block.  It had pretty bad behaviour,
including only allowing reclaim of maybe 20% of the blocks that were
visited by the shrinker.

We can use the existing list_lru facility in the kernel to do a better
job.  Blocks only exhibit contention as they're allocated and added to
per-node lists.  From then on we only set accessed bits and the private
list walkers move blocks around on the list as we see the accessed bits.
(It looks more like a FIFO with lazy promotion than an "LRU" that is
actively moving list items around as they're accessed.)

Using the facility means changing how we remove blocks from the cache
and hide them from lookup.  We clean up the refcount inserted flag a bit
to be expressed more as a base refcount that can be acquired by
whoever's removing from the cache.  It seems a lot clearer.

Signed-off-by: Zach Brown <zab@versity.com>
2025-10-07 12:14:25 -07:00
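A condensed sketch of the list_lru lifecycle the cache moves to, using the calls that appear in the block.c hunks below; setup/teardown error handling is trimmed:

list_lru_init(&binf->lru);                               /* scoutfs_block_setup() */
list_lru_add_obj(&binf->lru, &bp->lru_head);             /* block_insert(): visible to the walker */
test_and_set_bit(BLOCK_BIT_ACCESSED, &bp->bits);         /* block_accessed(): lazy promotion hint */
list_lru_del_obj(&binf->lru, &bp->lru_head);             /* block_remove(): pulled from the cache */
list_lru_shrink_count(&binf->lru, sc);                   /* shrinker ->count_objects */
kc_list_lru_shrink_walk(&binf->lru, sc, isolate_lru_block, &ia);  /* ->scan_objects */
list_lru_destroy(&binf->lru);                            /* scoutfs_block_destroy() */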
Zach Brown
d8fcbb9564 Add kernelcompat for list_lru
Add kernelcompat helpers for initial use of list_lru for shrinking.  The
most complicated part is the walk callback type changing.

Signed-off-by: Zach Brown <zab@versity.com>
2025-10-07 12:14:15 -07:00
Zach Brown
4d58252e1a Retry stale item reads instead of stopping reclaim
Readers can read a set of items that is stale with respect to items that
were dirtied and written under a local cluster lock after the read
started.

The active reader mechanism addressed this by refusing to shrink pages
that could contain items that were dirtied while any readers were in
flight.  Under the right circumstances this can result in refusing to
shrink quite a lot of pages indeed.

This changes the mechanism to allow pages to be reclaimed, and instead
forces stale readers to retry.  The gamble is that reads are much faster
than writes.  A small fraction should have to retry, and when they do
they can be satisfied by the block cache.

Signed-off-by: Zach Brown <zab@versity.com>
2025-10-07 12:12:29 -07:00
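A sketch of the barrier that replaces the active reader list (see scoutfs_item_write_done() and read_pages() in the item.c hunks below); rdbar is the reader's sampled u64 value:

/* writer: after dirty items have been written and are about to be reclaimed */
atomic64_inc(&cinf->read_dirty_barrier);
smp_mb__after_atomic();

/* reader: sample the barrier before reading persistent roots */
rdbar = atomic64_read(&cinf->read_dirty_barrier);

/* reader: under the cache write_lock, before inserting read pages */
if (atomic64_read(&cinf->read_dirty_barrier) != rdbar)
        return -ESTALE;         /* caller retries against newer roots */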
Chris Kirby
293df47589 Fix race condition in orphan-inodes test
Make sure that the orphan scanners can see deletions after forced
unmounts by waiting for reclaim_open_log_tree() to run on each mount
and for finalize_and_start_log_merge() to run without finding any
finalized trees.

Do this by adding two new counters, reclaimed_open_logs and
log_merge_no_finalized, and fixing the orphan-inodes test to check them
before waiting for the orphan scanners to complete.

Signed-off-by: Chris Kirby <ckirby@versity.com>
2025-10-06 16:55:47 -05:00
Chris Kirby
2a58e4c147 Use ENOLINK as a special error code during forced unmount
Tests such as quorum-heartbeat-timeout were failing with EIO messages in dmesg output due to expected errors during forced unmount. Use ENOLINK instead, and filter all errors from dmesg with this errno (67).

Signed-off-by: Chris Kirby <ckirby@versity.com>
2025-10-06 15:57:42 -05:00
Auke Kok
1b7917e063 Don't run format-version-forward-back on el8, either
This test compiles an earlier commit from the tree, which is starting
to fail due to various OS-level changes, most recently sparse issues
with newer kernel headers. This problem will likely get worse as we
add more supported releases.

We opt to run this test only on el7 for now. While we could have made
it skip the sparse checks that fail on el8, it suffices at this point
if the test works on one of the supported OS versions during testing.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2025-10-06 12:27:25 -05:00
Zach Brown
4f9c3503c8 Add cond_resched to iput worker
The iput worker can accumulate quite a bit of pending work to do.  We've
seen hung task warnings while it's doing its work (admittedly in debug
kernels).  There's no harm in throwing in a cond_resched so other tasks
get a chance to do work.

Signed-off-by: Zach Brown <zab@versity.com>
2025-10-06 12:27:25 -05:00
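The change itself is tiny; a minimal sketch of the resulting loop shape (matching the inode.c hunk below, which re-takes inf->iput_lock between batches):

while (count-- > 0)
        iput(inode);
cond_resched();         /* yield between batches so other tasks can run */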
Chris Kirby
541cb47af0 Add tracing for get_file_block() and scoutfs_ioc_search_xattrs().
Signed-off-by: Chris Kirby <ckirby@versity.com>
2025-10-06 12:27:25 -05:00
Chris Kirby
d537365d0a Fix several cases in srch.c where the return value of EIO should have been -EIO.
Signed-off-by: Chris Kirby <ckirby@versity.com>
2025-10-06 12:27:25 -05:00
Chris Kirby
7375627861 Add the inode number to scoutfs_xattr_set traces.
Signed-off-by: Chris Kirby <ckirby@versity.com>
2025-10-06 12:27:25 -05:00
Chris Kirby
48d849e2f4 Only start new quorum election after a receive failure
It's possible for the quorum worker to be preempted for a long period,
especially on debug kernels. Since we only check for how much time
has passed, it's possible for a clean receive to inadvertently
trigger an election. This can cause the quorum-heartbeat-timeout
test to fail due to observed delays outside of the expected bounds.

Instead, make sure we had a receive failure before comparing timestamps.

Signed-off-by: Chris Kirby <ckirby@versity.com>
2025-10-06 12:27:25 -05:00
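A hypothetical sketch of the described check; recv_failed, last_recv_ts, heartbeat_timeout_ms, and start_election() are illustrative names, not the actual quorum code:

/* only consider the elapsed time if a receive actually failed */
if (recv_failed &&
    ktime_after(ktime_get(), ktime_add_ms(last_recv_ts, heartbeat_timeout_ms)))
        start_election();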
22 changed files with 885 additions and 599 deletions

View File

@@ -158,15 +158,6 @@ ifneq (,$(shell grep 'sock_create_kern.*struct net' include/linux/net.h))
ccflags-y += -DKC_SOCK_CREATE_KERN_NET=1
endif
#
# v3.18-rc6-1619-gc0371da6047a
#
# iov_iter is now part of struct msghdr
#
ifneq (,$(shell grep 'struct iov_iter.*msg_iter' include/linux/socket.h))
ccflags-y += -DKC_MSGHDR_STRUCT_IOV_ITER=1
endif
#
# v4.17-rc6-7-g95582b008388
#
@@ -434,3 +425,40 @@ endif
ifneq (,$(shell grep 'int ..remap_pages..struct vm_area_struct' include/linux/mm.h))
ccflags-y += -DKC_MM_REMAP_PAGES
endif
#
# v3.19-4742-g503c358cf192
#
# list_lru_shrink_count() and list_lru_shrink_walk() introduced
#
ifneq (,$(shell grep 'list_lru_shrink_count.*struct list_lru' include/linux/list_lru.h))
ccflags-y += -DKC_LIST_LRU_SHRINK_COUNT_WALK
endif
#
# v3.19-4757-g3f97b163207c
#
# lru_list_walk_cb lru arg added
#
ifneq (,$(shell grep 'struct list_head \*item, spinlock_t \*lock, void \*cb_arg' include/linux/list_lru.h))
ccflags-y += -DKC_LIST_LRU_WALK_CB_ITEM_LOCK
endif
#
# v6.7-rc4-153-g0a97c01cd20b
#
# list_lru_{add,del} -> list_lru_{add,del}_obj
#
ifneq (,$(shell grep '^bool list_lru_add_obj' include/linux/list_lru.h))
ccflags-y += -DKC_LIST_LRU_ADD_OBJ
endif
#
# v6.12-rc6-227-gda0c02516c50
#
# lru_list_walk_cb lock arg removed
#
ifneq (,$(shell grep 'struct list_lru_one \*list, spinlock_t \*lock, void \*cb_arg' include/linux/list_lru.h))
ccflags-y += -DKC_LIST_LRU_WALK_CB_LIST_LOCK
endif

View File

@@ -22,6 +22,7 @@
#include <linux/rhashtable.h>
#include <linux/random.h>
#include <linux/sched/mm.h>
#include <linux/list_lru.h>
#include "format.h"
#include "super.h"
@@ -38,26 +39,12 @@
* than the page size. Callers can have their own contexts for tracking
* dirty blocks that are written together. We pin dirty blocks in
* memory and only checksum them all as they're all written.
*
* Memory reclaim is driven by maintaining two very coarse groups of
* blocks. As we access blocks we mark them with an increasing counter
* to discourage them from being reclaimed. We then define a threshold
* at the current counter minus half the population. Recent blocks have
* a counter greater than the threshold, and all other blocks with
* counters less than it are considered older and are candidates for
* reclaim. This results in access updates rarely modifying an atomic
* counter as blocks need to be moved into the recent group, and shrink
* can randomly scan blocks looking for the half of the population that
* will be in the old group. It's reasonably effective, but is
* particularly efficient and avoids contention between concurrent
* accesses and shrinking.
*/
struct block_info {
struct super_block *sb;
atomic_t total_inserted;
atomic64_t access_counter;
struct rhashtable ht;
struct list_lru lru;
wait_queue_head_t waitq;
KC_DEFINE_SHRINKER(shrinker);
struct work_struct free_work;
@@ -76,28 +63,15 @@ enum block_status_bits {
BLOCK_BIT_PAGE_ALLOC, /* page (possibly high order) allocation */
BLOCK_BIT_VIRT, /* mapped virt allocation */
BLOCK_BIT_CRC_VALID, /* crc has been verified */
BLOCK_BIT_ACCESSED, /* seen by lookup since last lru add/walk */
};
/*
* We want to tie atomic changes in refcounts to whether or not the
* block is still visible in the hash table, so we store the hash
* table's reference up at a known high bit. We could naturally set the
* inserted bit through excessive refcount increments. We don't do
* anything about that but at least warn if we get close.
*
* We're avoiding the high byte for no real good reason, just out of a
* historical fear of implementations that don't provide the full
* precision.
*/
#define BLOCK_REF_INSERTED (1U << 23)
#define BLOCK_REF_FULL (BLOCK_REF_INSERTED >> 1)
struct block_private {
struct scoutfs_block bl;
struct super_block *sb;
atomic_t refcount;
u64 accessed;
struct rhash_head ht_head;
struct list_head lru_head;
struct list_head dirty_entry;
struct llist_node free_node;
unsigned long bits;
@@ -112,7 +86,7 @@ struct block_private {
do { \
__typeof__(bp) _bp = (bp); \
trace_scoutfs_block_##which(_bp->sb, _bp, _bp->bl.blkno, atomic_read(&_bp->refcount), \
atomic_read(&_bp->io_count), _bp->bits, _bp->accessed); \
atomic_read(&_bp->io_count), _bp->bits); \
} while (0)
#define BLOCK_PRIVATE(_bl) \
@@ -176,6 +150,7 @@ static struct block_private *block_alloc(struct super_block *sb, u64 blkno)
bp->bl.blkno = blkno;
bp->sb = sb;
atomic_set(&bp->refcount, 1);
INIT_LIST_HEAD(&bp->lru_head);
INIT_LIST_HEAD(&bp->dirty_entry);
set_bit(BLOCK_BIT_NEW, &bp->bits);
atomic_set(&bp->io_count, 0);
@@ -233,32 +208,85 @@ static void block_free_work(struct work_struct *work)
}
/*
* Get a reference to a block while holding an existing reference.
* Users of blocks hold a refcount. If putting a refcount drops to zero
* then the block is freed.
*
* Acquiring new references and claiming the exclusive right to tear
* down a block is built around this LIVE_REFCOUNT_BASE refcount value.
* As blocks are initially cached they have the live base added to their
* refcount. Lookups will only increment the refcount and return blocks
* for reference holders while the refcount is >= than the base.
*
* To remove a block from the cache and eventually free it, either by
* the lru walk in the shrinker, or by reference holders, the live base
* is removed and turned into a normal refcount increment that will be
* put by the caller. This can only be done once for a block, and once
* its done lookup will not return any more references.
*/
#define LIVE_REFCOUNT_BASE (INT_MAX ^ (INT_MAX >> 1))
/*
* Inc the refcount while holding an incremented refcount. We can't
* have so many individual reference holders that they pass the live
* base.
*/
static void block_get(struct block_private *bp)
{
WARN_ON_ONCE((atomic_read(&bp->refcount) & ~BLOCK_REF_INSERTED) <= 0);
int now = atomic_inc_return(&bp->refcount);
atomic_inc(&bp->refcount);
BUG_ON(now <= 1);
BUG_ON(now == LIVE_REFCOUNT_BASE);
}
/*
* Get a reference to a block as long as it's been inserted in the hash
* table and hasn't been removed.
*/
static struct block_private *block_get_if_inserted(struct block_private *bp)
* if (*v >= u) {
* *v += a;
* return true;
* }
*/
static bool atomic_add_unless_less(atomic_t *v, int a, int u)
{
int cnt;
int c;
do {
cnt = atomic_read(&bp->refcount);
WARN_ON_ONCE(cnt & BLOCK_REF_FULL);
if (!(cnt & BLOCK_REF_INSERTED))
return NULL;
c = atomic_read(v);
if (c < u)
return false;
} while (atomic_cmpxchg(v, c, c + a) != c);
} while (atomic_cmpxchg(&bp->refcount, cnt, cnt + 1) != cnt);
return true;
}
return bp;
static bool block_get_if_live(struct block_private *bp)
{
return atomic_add_unless_less(&bp->refcount, 1, LIVE_REFCOUNT_BASE);
}
/*
* If the refcount still has the live base, subtract it and increment
* the callers refcount that they'll put.
*/
static bool block_get_remove_live(struct block_private *bp)
{
return atomic_add_unless_less(&bp->refcount, (1 - LIVE_REFCOUNT_BASE), LIVE_REFCOUNT_BASE);
}
/*
* Only get the live base refcount if it is the only refcount remaining.
* This means that there are no active refcount holders and the block
* can't be dirty or under IO, which both hold references.
*/
static bool block_get_remove_live_only(struct block_private *bp)
{
int c;
do {
c = atomic_read(&bp->refcount);
if (c != LIVE_REFCOUNT_BASE)
return false;
} while (atomic_cmpxchg(&bp->refcount, c, c - LIVE_REFCOUNT_BASE + 1) != c);
return true;
}
/*
@@ -290,104 +318,73 @@ static const struct rhashtable_params block_ht_params = {
};
/*
* Insert a new block into the hash table. Once it is inserted in the
* hash table readers can start getting references. The caller may have
* multiple refs but the block can't already be inserted.
* Insert the block into the cache so that it's visible for lookups.
* The caller can hold references (including for a dirty block).
*
* We make sure the base is added and the block is in the lru once it's
* in the hash. If hash table insertion fails it'll be briefly visible
* in the lru, but won't be isolated/evicted because we hold an
* incremented refcount in addition to the live base.
*/
static int block_insert(struct super_block *sb, struct block_private *bp)
{
DECLARE_BLOCK_INFO(sb, binf);
int ret;
WARN_ON_ONCE(atomic_read(&bp->refcount) & BLOCK_REF_INSERTED);
BUG_ON(atomic_read(&bp->refcount) >= LIVE_REFCOUNT_BASE);
atomic_add(LIVE_REFCOUNT_BASE, &bp->refcount);
smp_mb__after_atomic(); /* make sure live base is visible to list_lru walk */
list_lru_add_obj(&binf->lru, &bp->lru_head);
retry:
atomic_add(BLOCK_REF_INSERTED, &bp->refcount);
ret = rhashtable_lookup_insert_fast(&binf->ht, &bp->ht_head, block_ht_params);
if (ret < 0) {
atomic_sub(BLOCK_REF_INSERTED, &bp->refcount);
if (ret == -EBUSY) {
/* wait for pending rebalance to finish */
synchronize_rcu();
goto retry;
} else {
atomic_sub(LIVE_REFCOUNT_BASE, &bp->refcount);
BUG_ON(atomic_read(&bp->refcount) >= LIVE_REFCOUNT_BASE);
list_lru_del_obj(&binf->lru, &bp->lru_head);
}
} else {
atomic_inc(&binf->total_inserted);
TRACE_BLOCK(insert, bp);
}
return ret;
}
static u64 accessed_recently(struct block_info *binf)
{
return atomic64_read(&binf->access_counter) - (atomic_read(&binf->total_inserted) >> 1);
}
/*
* Make sure that a block that is being accessed is less likely to be
* reclaimed if it is seen by the shrinker. If the block hasn't been
* accessed recently we update its accessed value.
* Indicate to the lru walker that this block has been accessed since it
* was added or last walked.
*/
static void block_accessed(struct super_block *sb, struct block_private *bp)
{
DECLARE_BLOCK_INFO(sb, binf);
if (bp->accessed == 0 || bp->accessed < accessed_recently(binf)) {
if (!test_and_set_bit(BLOCK_BIT_ACCESSED, &bp->bits))
scoutfs_inc_counter(sb, block_cache_access_update);
bp->accessed = atomic64_inc_return(&binf->access_counter);
}
}
/*
* The caller wants to remove the block from the hash table and has an
* idea what the refcount should be. If the refcount does still
* indicate that the block is hashed, and we're able to clear that bit,
* then we can remove it from the hash table.
* Remove the block from the cache. When this returns the block won't
* be visible for additional references from lookup.
*
* The caller makes sure that it's safe to be referencing this block,
* either with their own held reference (most everything) or by being in
* an rcu grace period (shrink).
*/
static bool block_remove_cnt(struct super_block *sb, struct block_private *bp, int cnt)
{
DECLARE_BLOCK_INFO(sb, binf);
int ret;
if ((cnt & BLOCK_REF_INSERTED) &&
(atomic_cmpxchg(&bp->refcount, cnt, cnt & ~BLOCK_REF_INSERTED) == cnt)) {
TRACE_BLOCK(remove, bp);
ret = rhashtable_remove_fast(&binf->ht, &bp->ht_head, block_ht_params);
WARN_ON_ONCE(ret); /* must have been inserted */
atomic_dec(&binf->total_inserted);
return true;
}
return false;
}
/*
* Try to remove the block from the hash table as long as the refcount
* indicates that it is still in the hash table. This can be racing
* with normal refcount changes so it might have to retry.
* We always try and remove from the hash table. It's safe to remove a
* block that isn't hashed, it just returns -ENOENT.
*
* This is racing with the lru walk in the shrinker also trying to
* remove idle blocks from the cache. They both try to remove the live
* refcount base and perform their removal and put if they get it.
*/
static void block_remove(struct super_block *sb, struct block_private *bp)
{
int cnt;
DECLARE_BLOCK_INFO(sb, binf);
do {
cnt = atomic_read(&bp->refcount);
} while ((cnt & BLOCK_REF_INSERTED) && !block_remove_cnt(sb, bp, cnt));
}
rhashtable_remove_fast(&binf->ht, &bp->ht_head, block_ht_params);
/*
* Take one shot at removing the block from the hash table if it's still
* in the hash table and the caller has the only other reference.
*/
static bool block_remove_solo(struct super_block *sb, struct block_private *bp)
{
return block_remove_cnt(sb, bp, BLOCK_REF_INSERTED | 1);
if (block_get_remove_live(bp)) {
list_lru_del_obj(&binf->lru, &bp->lru_head);
block_put(sb, bp);
}
}
static bool io_busy(struct block_private *bp)
@@ -396,37 +393,6 @@ static bool io_busy(struct block_private *bp)
return test_bit(BLOCK_BIT_IO_BUSY, &bp->bits);
}
/*
* Called during shutdown with no other users.
*/
static void block_remove_all(struct super_block *sb)
{
DECLARE_BLOCK_INFO(sb, binf);
struct rhashtable_iter iter;
struct block_private *bp;
rhashtable_walk_enter(&binf->ht, &iter);
rhashtable_walk_start(&iter);
for (;;) {
bp = rhashtable_walk_next(&iter);
if (bp == NULL)
break;
if (bp == ERR_PTR(-EAGAIN))
continue;
if (block_get_if_inserted(bp)) {
block_remove(sb, bp);
WARN_ON_ONCE(atomic_read(&bp->refcount) != 1);
block_put(sb, bp);
}
}
rhashtable_walk_stop(&iter);
rhashtable_walk_exit(&iter);
WARN_ON_ONCE(atomic_read(&binf->total_inserted) != 0);
}
/*
* XXX The io_count and sb fields in the block_private are only used
@@ -488,7 +454,7 @@ static int block_submit_bio(struct super_block *sb, struct block_private *bp,
int ret = 0;
if (scoutfs_forcing_unmount(sb))
return -EIO;
return -ENOLINK;
sector = bp->bl.blkno << (SCOUTFS_BLOCK_LG_SHIFT - 9);
@@ -543,6 +509,10 @@ static int block_submit_bio(struct super_block *sb, struct block_private *bp,
return ret;
}
/*
* Return a block with an elevated refcount if it was present in the
* hash table and its refcount didn't indicate that it was being freed.
*/
static struct block_private *block_lookup(struct super_block *sb, u64 blkno)
{
DECLARE_BLOCK_INFO(sb, binf);
@@ -550,8 +520,8 @@ static struct block_private *block_lookup(struct super_block *sb, u64 blkno)
rcu_read_lock();
bp = rhashtable_lookup(&binf->ht, &blkno, block_ht_params);
if (bp)
bp = block_get_if_inserted(bp);
if (bp && !block_get_if_live(bp))
bp = NULL;
rcu_read_unlock();
return bp;
@@ -1078,100 +1048,85 @@ static unsigned long block_count_objects(struct shrinker *shrink, struct shrink_
struct super_block *sb = binf->sb;
scoutfs_inc_counter(sb, block_cache_count_objects);
return shrinker_min_long(atomic_read(&binf->total_inserted));
return list_lru_shrink_count(&binf->lru, sc);
}
struct isolate_args {
struct super_block *sb;
struct list_head dispose;
};
#define DECLARE_ISOLATE_ARGS(sb_, name_) \
struct isolate_args name_ = { \
.sb = sb_, \
.dispose = LIST_HEAD_INIT(name_.dispose), \
}
static enum lru_status isolate_lru_block(struct list_head *item, struct list_lru_one *list,
void *cb_arg)
{
struct block_private *bp = container_of(item, struct block_private, lru_head);
struct isolate_args *ia = cb_arg;
TRACE_BLOCK(isolate, bp);
/* rotate accessed blocks to the tail of the list (lazy promotion) */
if (test_and_clear_bit(BLOCK_BIT_ACCESSED, &bp->bits)) {
scoutfs_inc_counter(ia->sb, block_cache_isolate_rotate);
return LRU_ROTATE;
}
/* any refs, including dirty/io, stop us from acquiring lru refcount */
if (!block_get_remove_live_only(bp)) {
scoutfs_inc_counter(ia->sb, block_cache_isolate_skip);
return LRU_SKIP;
}
scoutfs_inc_counter(ia->sb, block_cache_isolate_removed);
list_lru_isolate_move(list, &bp->lru_head, &ia->dispose);
return LRU_REMOVED;
}
static void shrink_dispose_blocks(struct super_block *sb, struct list_head *dispose)
{
struct block_private *bp;
struct block_private *bp__;
list_for_each_entry_safe(bp, bp__, dispose, lru_head) {
list_del_init(&bp->lru_head);
block_remove(sb, bp);
block_put(sb, bp);
}
}
/*
* Remove a number of cached blocks that haven't been used recently.
*
* We don't maintain a strictly ordered LRU to avoid the contention of
* accesses always moving blocks around in some precise global
* structure.
*
* Instead we use counters to divide the blocks into two roughly equal
* groups by how recently they were accessed. We randomly walk all
* inserted blocks looking for any blocks in the older half to remove
* and free. The random walk and there being two groups means that we
* typically only walk a small multiple of the number we're looking for
* before we find them all.
*
* Our rcu walk of blocks can see blocks in all stages of their life
* cycle, from dirty blocks to those with 0 references that are queued
* for freeing. We only want to free idle inserted blocks so we
* atomically remove blocks when the only references are ours and the
* hash table.
*/
static unsigned long block_scan_objects(struct shrinker *shrink, struct shrink_control *sc)
{
struct block_info *binf = KC_SHRINKER_CONTAINER_OF(shrink, struct block_info);
struct super_block *sb = binf->sb;
struct rhashtable_iter iter;
struct block_private *bp;
bool stop = false;
unsigned long freed = 0;
unsigned long nr = sc->nr_to_scan;
u64 recently;
DECLARE_ISOLATE_ARGS(sb, ia);
unsigned long freed;
scoutfs_inc_counter(sb, block_cache_scan_objects);
recently = accessed_recently(binf);
rhashtable_walk_enter(&binf->ht, &iter);
rhashtable_walk_start(&iter);
freed = kc_list_lru_shrink_walk(&binf->lru, sc, isolate_lru_block, &ia);
shrink_dispose_blocks(sb, &ia.dispose);
return freed;
}
/*
* This isn't great but I don't see a better way. We want to
* walk the hash from a random point so that we're not
* constantly walking over the same region that we've already
* freed old blocks within. The interface doesn't let us do
* this explicitly, but this seems to work? The difference this
* makes is enormous, around a few orders of magnitude fewer
* _nexts per shrink.
*/
if (iter.walker.tbl)
iter.slot = prandom_u32_max(iter.walker.tbl->size);
/*
* Called during shutdown with no other users. The isolating walk must
* find blocks on the lru that only have references for presence on the
* lru and in the hash table.
*/
static void block_shrink_all(struct super_block *sb)
{
DECLARE_BLOCK_INFO(sb, binf);
DECLARE_ISOLATE_ARGS(sb, ia);
while (nr > 0) {
bp = rhashtable_walk_next(&iter);
if (bp == NULL)
break;
if (bp == ERR_PTR(-EAGAIN)) {
/*
* We can be called from reclaim in the allocation
* to resize the hash table itself. We have to
* return so that the caller can proceed and
* enable hash table iteration again.
*/
scoutfs_inc_counter(sb, block_cache_shrink_stop);
stop = true;
break;
}
scoutfs_inc_counter(sb, block_cache_shrink_next);
if (bp->accessed >= recently) {
scoutfs_inc_counter(sb, block_cache_shrink_recent);
continue;
}
if (block_get_if_inserted(bp)) {
if (block_remove_solo(sb, bp)) {
scoutfs_inc_counter(sb, block_cache_shrink_remove);
TRACE_BLOCK(shrink, bp);
freed++;
nr--;
}
block_put(sb, bp);
}
}
rhashtable_walk_stop(&iter);
rhashtable_walk_exit(&iter);
if (stop)
return SHRINK_STOP;
else
return freed;
do {
kc_list_lru_walk(&binf->lru, isolate_lru_block, &ia, 128);
shrink_dispose_blocks(sb, &ia.dispose);
} while (list_lru_count(&binf->lru) > 0);
}
struct sm_block_completion {
@@ -1210,7 +1165,7 @@ static int sm_block_io(struct super_block *sb, struct block_device *bdev, blk_op
BUILD_BUG_ON(PAGE_SIZE < SCOUTFS_BLOCK_SM_SIZE);
if (scoutfs_forcing_unmount(sb))
return -EIO;
return -ENOLINK;
if (WARN_ON_ONCE(len > SCOUTFS_BLOCK_SM_SIZE) ||
WARN_ON_ONCE(!op_is_write(opf) && !blk_crc))
@@ -1276,7 +1231,7 @@ int scoutfs_block_write_sm(struct super_block *sb,
int scoutfs_block_setup(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct block_info *binf;
struct block_info *binf = NULL;
int ret;
binf = kzalloc(sizeof(struct block_info), GFP_KERNEL);
@@ -1285,15 +1240,15 @@ int scoutfs_block_setup(struct super_block *sb)
goto out;
}
ret = rhashtable_init(&binf->ht, &block_ht_params);
if (ret < 0) {
kfree(binf);
ret = list_lru_init(&binf->lru);
if (ret < 0)
goto out;
ret = rhashtable_init(&binf->ht, &block_ht_params);
if (ret < 0)
goto out;
}
binf->sb = sb;
atomic_set(&binf->total_inserted, 0);
atomic64_set(&binf->access_counter, 0);
init_waitqueue_head(&binf->waitq);
KC_INIT_SHRINKER_FUNCS(&binf->shrinker, block_count_objects,
block_scan_objects);
@@ -1305,8 +1260,10 @@ int scoutfs_block_setup(struct super_block *sb)
ret = 0;
out:
if (ret)
scoutfs_block_destroy(sb);
if (ret < 0 && binf) {
list_lru_destroy(&binf->lru);
kfree(binf);
}
return ret;
}
@@ -1318,9 +1275,10 @@ void scoutfs_block_destroy(struct super_block *sb)
if (binf) {
KC_UNREGISTER_SHRINKER(&binf->shrinker);
block_remove_all(sb);
block_shrink_all(sb);
flush_work(&binf->free_work);
rhashtable_destroy(&binf->ht);
list_lru_destroy(&binf->lru);
kfree(binf);
sbi->block_info = NULL;

View File

@@ -26,17 +26,15 @@
EXPAND_COUNTER(block_cache_alloc_page_order) \
EXPAND_COUNTER(block_cache_alloc_virt) \
EXPAND_COUNTER(block_cache_end_io_error) \
EXPAND_COUNTER(block_cache_isolate_removed) \
EXPAND_COUNTER(block_cache_isolate_rotate) \
EXPAND_COUNTER(block_cache_isolate_skip) \
EXPAND_COUNTER(block_cache_forget) \
EXPAND_COUNTER(block_cache_free) \
EXPAND_COUNTER(block_cache_free_work) \
EXPAND_COUNTER(block_cache_remove_stale) \
EXPAND_COUNTER(block_cache_count_objects) \
EXPAND_COUNTER(block_cache_scan_objects) \
EXPAND_COUNTER(block_cache_shrink) \
EXPAND_COUNTER(block_cache_shrink_next) \
EXPAND_COUNTER(block_cache_shrink_recent) \
EXPAND_COUNTER(block_cache_shrink_remove) \
EXPAND_COUNTER(block_cache_shrink_stop) \
EXPAND_COUNTER(btree_compact_values) \
EXPAND_COUNTER(btree_compact_values_enomem) \
EXPAND_COUNTER(btree_delete) \
@@ -118,10 +116,11 @@
EXPAND_COUNTER(item_pcpu_page_hit) \
EXPAND_COUNTER(item_pcpu_page_miss) \
EXPAND_COUNTER(item_pcpu_page_miss_keys) \
EXPAND_COUNTER(item_read_pages_barrier) \
EXPAND_COUNTER(item_read_pages_retry) \
EXPAND_COUNTER(item_read_pages_split) \
EXPAND_COUNTER(item_shrink_page) \
EXPAND_COUNTER(item_shrink_page_dirty) \
EXPAND_COUNTER(item_shrink_page_reader) \
EXPAND_COUNTER(item_shrink_page_trylock) \
EXPAND_COUNTER(item_update) \
EXPAND_COUNTER(item_write_dirty) \
@@ -146,6 +145,7 @@
EXPAND_COUNTER(lock_shrink_work) \
EXPAND_COUNTER(lock_unlock) \
EXPAND_COUNTER(lock_wait) \
EXPAND_COUNTER(log_merge_no_finalized) \
EXPAND_COUNTER(log_merge_wait_timeout) \
EXPAND_COUNTER(net_dropped_response) \
EXPAND_COUNTER(net_send_bytes) \
@@ -182,6 +182,7 @@
EXPAND_COUNTER(quorum_send_vote) \
EXPAND_COUNTER(quorum_server_shutdown) \
EXPAND_COUNTER(quorum_term_follower) \
EXPAND_COUNTER(reclaimed_open_logs) \
EXPAND_COUNTER(server_commit_hold) \
EXPAND_COUNTER(server_commit_queue) \
EXPAND_COUNTER(server_commit_worker) \

View File

@@ -1091,7 +1091,8 @@ enum scoutfs_net_cmd {
EXPAND_NET_ERRNO(ENOMEM) \
EXPAND_NET_ERRNO(EIO) \
EXPAND_NET_ERRNO(ENOSPC) \
EXPAND_NET_ERRNO(EINVAL)
EXPAND_NET_ERRNO(EINVAL) \
EXPAND_NET_ERRNO(ENOLINK)
#undef EXPAND_NET_ERRNO
#define EXPAND_NET_ERRNO(which) SCOUTFS_NET_ERR_##which,

View File

@@ -1965,6 +1965,8 @@ static void iput_worker(struct work_struct *work)
while (count-- > 0)
iput(inode);
cond_resched();
/* can't touch inode after final iput */
spin_lock(&inf->iput_lock);

View File

@@ -97,9 +97,8 @@ struct item_cache_info {
struct list_head lru_list;
unsigned long lru_pages;
/* written by page readers, read by shrink */
spinlock_t active_lock;
struct list_head active_list;
/* stop readers from caching stale items behind reclaimed cleaned written items */
atomic64_t read_dirty_barrier;
};
#define DECLARE_ITEM_CACHE_INFO(sb, name) \
@@ -1285,78 +1284,6 @@ static int cache_empty_page(struct super_block *sb,
return 0;
}
/*
* Readers operate independently from dirty items and transactions.
* They read a set of persistent items and insert them into the cache
* when there aren't already pages whose key range contains the items.
* This naturally prefers cached dirty items over stale read items.
*
* We have to deal with the case where dirty items are written and
* invalidated while a read is in flight. The reader won't have seen
* the items that were dirty in their persistent roots as they started
* reading. By the time they insert their read pages the previously
* dirty items have been reclaimed and are not in the cache. The old
* stale items will be inserted in their place, effectively corrupting
* by having the dirty items disappear.
*
* We fix this by tracking the max seq of items in pages. As readers
* start they record the current transaction seq. Invalidation skips
* pages with a max seq greater than the first reader seq because the
* items in the page have to stick around to prevent the readers stale
* items from being inserted.
*
* This naturally only affects a small set of pages with items that were
* written relatively recently. If we're in memory pressure then we
* probably have a lot of pages and they'll naturally have items that
* were visible to any raders. We don't bother with the complicated and
* expensive further refinement of tracking the ranges that are being
* read and comparing those with pages to invalidate.
*/
struct active_reader {
struct list_head head;
u64 seq;
};
#define INIT_ACTIVE_READER(rdr) \
struct active_reader rdr = { .head = LIST_HEAD_INIT(rdr.head) }
static void add_active_reader(struct super_block *sb, struct active_reader *active)
{
DECLARE_ITEM_CACHE_INFO(sb, cinf);
BUG_ON(!list_empty(&active->head));
active->seq = scoutfs_trans_sample_seq(sb);
spin_lock(&cinf->active_lock);
list_add_tail(&active->head, &cinf->active_list);
spin_unlock(&cinf->active_lock);
}
static u64 first_active_reader_seq(struct item_cache_info *cinf)
{
struct active_reader *active;
u64 first;
/* only the calling task adds or deletes this active */
spin_lock(&cinf->active_lock);
active = list_first_entry_or_null(&cinf->active_list, struct active_reader, head);
first = active ? active->seq : U64_MAX;
spin_unlock(&cinf->active_lock);
return first;
}
static void del_active_reader(struct item_cache_info *cinf, struct active_reader *active)
{
/* only the calling task adds or deletes this active */
if (!list_empty(&active->head)) {
spin_lock(&cinf->active_lock);
list_del_init(&active->head);
spin_unlock(&cinf->active_lock);
}
}
/*
* Add a newly read item to the pages that we're assembling for
* insertion into the cache. These pages are private, they only exist
@@ -1450,24 +1377,34 @@ static int read_page_item(struct super_block *sb, struct scoutfs_key *key, u64 s
* and duplicates, we insert any resulting pages which don't overlap
* with existing cached pages.
*
* We only insert uncached regions because this is called with cluster
* locks held, but without locking the cache. The regions we read can
* be stale with respect to the current cache, which can be read and
* dirtied by other cluster lock holders on our node, but the cluster
* locks protect the stable items we read. Invalidation is careful not
* to drop pages that have items that we couldn't see because they were
* dirty when we started reading.
*
* The forest item reader is reading stable trees that could be
* overwritten. It can return -ESTALE which we return to the caller who
* will retry the operation and work with a new set of more recent
* btrees.
*
* We only insert uncached regions because this is called with cluster
* locks held, but without locking the cache. The regions we read can
* be stale with respect to the current cache, which can be read and
* dirtied by other cluster lock holders on our node, but the cluster
* locks protect the stable items we read.
*
* Using the presence of locally written dirty pages to override stale
* read pages only works if, well, the more recent locally written pages
* are still present. Readers are totally decoupled from writers and
* can have a set of items that is very old indeed. In the mean time
* more recent items would have been dirtied locally, committed,
* cleaned, and reclaimed. We have a coarse barrier which ensures that
* readers can't insert items read from old roots from before local data
* was written. If a write completes while a read is in progress the
* read will have to retry. The retried read can use cached blocks so
* we're relying on reads being much faster than writes to reduce the
* overhead to mostly cpu work of recollecting the items from cached
* blocks via a more recent root from the server.
*/
static int read_pages(struct super_block *sb, struct item_cache_info *cinf,
struct scoutfs_key *key, struct scoutfs_lock *lock)
{
struct rb_root root = RB_ROOT;
INIT_ACTIVE_READER(active);
struct cached_page *right = NULL;
struct cached_page *pg;
struct cached_page *rd;
@@ -1480,6 +1417,7 @@ static int read_pages(struct super_block *sb, struct item_cache_info *cinf,
struct rb_node *par;
struct rb_node *pg_tmp;
struct rb_node *item_tmp;
u64 rdbar;
int pgi;
int ret;
@@ -1493,8 +1431,7 @@ static int read_pages(struct super_block *sb, struct item_cache_info *cinf,
pg->end = lock->end;
rbtree_insert(&pg->node, NULL, &root.rb_node, &root);
/* set active reader seq before reading persistent roots */
add_active_reader(sb, &active);
rdbar = atomic64_read(&cinf->read_dirty_barrier);
start = lock->start;
end = lock->end;
@@ -1533,11 +1470,19 @@ static int read_pages(struct super_block *sb, struct item_cache_info *cinf,
retry:
write_lock(&cinf->rwlock);
ret = 0;
while ((rd = first_page(&root))) {
pg = page_rbtree_walk(sb, &cinf->pg_root, &rd->start, &rd->end,
NULL, NULL, &par, &pnode);
if (!pg) {
/* can't insert if write is cleaning (write_lock is read barrier) */
if (atomic64_read(&cinf->read_dirty_barrier) != rdbar) {
scoutfs_inc_counter(sb, item_read_pages_barrier);
ret = -ESTALE;
break;
}
/* insert read pages that don't intersect */
rbtree_erase(&rd->node, &root);
rbtree_insert(&rd->node, par, pnode, &cinf->pg_root);
@@ -1572,10 +1517,7 @@ retry:
write_unlock(&cinf->rwlock);
ret = 0;
out:
del_active_reader(cinf, &active);
/* free any pages we left dangling on error */
for_each_page_safe(&root, rd, pg_tmp) {
rbtree_erase(&rd->node, &root);
@@ -1635,6 +1577,7 @@ retry:
ret = read_pages(sb, cinf, key, lock);
if (ret < 0 && ret != -ESTALE)
goto out;
scoutfs_inc_counter(sb, item_read_pages_retry);
goto retry;
}
@@ -2415,6 +2358,10 @@ int scoutfs_item_write_done(struct super_block *sb)
struct cached_item *tmp;
struct cached_page *pg;
/* don't let read_pages insert possibly stale items */
atomic64_inc(&cinf->read_dirty_barrier);
smp_mb__after_atomic();
spin_lock(&cinf->dirty_lock);
while ((pg = list_first_entry_or_null(&cinf->dirty_list, struct cached_page, dirty_head))) {
if (write_trylock(&pg->rwlock)) {
@@ -2593,24 +2540,15 @@ static unsigned long item_cache_scan_objects(struct shrinker *shrink,
struct cached_page *tmp;
struct cached_page *pg;
unsigned long freed = 0;
u64 first_reader_seq;
int nr = sc->nr_to_scan;
scoutfs_inc_counter(sb, item_cache_scan_objects);
/* can't invalidate pages with items that weren't visible to first reader */
first_reader_seq = first_active_reader_seq(cinf);
write_lock(&cinf->rwlock);
spin_lock(&cinf->lru_lock);
list_for_each_entry_safe(pg, tmp, &cinf->lru_list, lru_head) {
if (first_reader_seq <= pg->max_seq) {
scoutfs_inc_counter(sb, item_shrink_page_reader);
continue;
}
if (!write_trylock(&pg->rwlock)) {
scoutfs_inc_counter(sb, item_shrink_page_trylock);
continue;
@@ -2677,8 +2615,7 @@ int scoutfs_item_setup(struct super_block *sb)
atomic_set(&cinf->dirty_pages, 0);
spin_lock_init(&cinf->lru_lock);
INIT_LIST_HEAD(&cinf->lru_list);
spin_lock_init(&cinf->active_lock);
INIT_LIST_HEAD(&cinf->active_list);
atomic64_set(&cinf->read_dirty_barrier, 0);
cinf->pcpu_pages = alloc_percpu(struct item_percpu_pages);
if (!cinf->pcpu_pages)
@@ -2711,8 +2648,6 @@ void scoutfs_item_destroy(struct super_block *sb)
int cpu;
if (cinf) {
BUG_ON(!list_empty(&cinf->active_list));
#ifdef KC_CPU_NOTIFIER
unregister_hotcpu_notifier(&cinf->notifier);
#endif

View File

@@ -81,3 +81,69 @@ kc_generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
return written ? written : status;
}
#endif
#include <linux/list_lru.h>
#ifdef KC_LIST_LRU_WALK_CB_ITEM_LOCK
static enum lru_status kc_isolate(struct list_head *item, spinlock_t *lock, void *cb_arg)
{
struct kc_isolate_args *args = cb_arg;
/* isolate doesn't use list, nr_items updated in caller */
return args->isolate(item, NULL, args->cb_arg);
}
unsigned long kc_list_lru_walk(struct list_lru *lru, kc_list_lru_walk_cb_t isolate, void *cb_arg,
unsigned long nr_to_walk)
{
struct kc_isolate_args args = {
.isolate = isolate,
.cb_arg = cb_arg,
};
return list_lru_walk(lru, kc_isolate, &args, nr_to_walk);
}
unsigned long kc_list_lru_shrink_walk(struct list_lru *lru, struct shrink_control *sc,
kc_list_lru_walk_cb_t isolate, void *cb_arg)
{
struct kc_isolate_args args = {
.isolate = isolate,
.cb_arg = cb_arg,
};
return list_lru_shrink_walk(lru, sc, kc_isolate, &args);
}
#endif
#ifdef KC_LIST_LRU_WALK_CB_LIST_LOCK
static enum lru_status kc_isolate(struct list_head *item, struct list_lru_one *list,
spinlock_t *lock, void *cb_arg)
{
struct kc_isolate_args *args = cb_arg;
return args->isolate(item, list, args->cb_arg);
}
unsigned long kc_list_lru_walk(struct list_lru *lru, kc_list_lru_walk_cb_t isolate, void *cb_arg,
unsigned long nr_to_walk)
{
struct kc_isolate_args args = {
.isolate = isolate,
.cb_arg = cb_arg,
};
return list_lru_walk(lru, kc_isolate, &args, nr_to_walk);
}
unsigned long kc_list_lru_shrink_walk(struct list_lru *lru, struct shrink_control *sc,
kc_list_lru_walk_cb_t isolate, void *cb_arg)
{
struct kc_isolate_args args = {
.isolate = isolate,
.cb_arg = cb_arg,
};
return list_lru_shrink_walk(lru, sc, kc_isolate, &args);
}
#endif

View File

@@ -410,4 +410,51 @@ static inline vm_fault_t vmf_error(int err)
}
#endif
#include <linux/list_lru.h>
#ifndef KC_LIST_LRU_SHRINK_COUNT_WALK
/* we don't bother with sc->{nid,memcg} (which doesn't exist in oldest kernels) */
static inline unsigned long list_lru_shrink_count(struct list_lru *lru,
struct shrink_control *sc)
{
return list_lru_count(lru);
}
static inline unsigned long
list_lru_shrink_walk(struct list_lru *lru, struct shrink_control *sc,
list_lru_walk_cb isolate, void *cb_arg)
{
return list_lru_walk(lru, isolate, cb_arg, sc->nr_to_scan);
}
#endif
#ifndef KC_LIST_LRU_ADD_OBJ
#define list_lru_add_obj list_lru_add
#define list_lru_del_obj list_lru_del
#endif
#if defined(KC_LIST_LRU_WALK_CB_LIST_LOCK) || defined(KC_LIST_LRU_WALK_CB_ITEM_LOCK)
struct list_lru_one;
typedef enum lru_status (*kc_list_lru_walk_cb_t)(struct list_head *item, struct list_lru_one *list,
void *cb_arg);
struct kc_isolate_args {
kc_list_lru_walk_cb_t isolate;
void *cb_arg;
};
unsigned long kc_list_lru_walk(struct list_lru *lru, kc_list_lru_walk_cb_t isolate, void *cb_arg,
unsigned long nr_to_walk);
unsigned long kc_list_lru_shrink_walk(struct list_lru *lru, struct shrink_control *sc,
kc_list_lru_walk_cb_t isolate, void *cb_arg);
#else
#define kc_list_lru_shrink_walk list_lru_shrink_walk
#endif
#if defined(KC_LIST_LRU_WALK_CB_ITEM_LOCK)
/* isolate moved by hand, nr_items updated in walk as _REMOVE returned */
static inline void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item,
struct list_head *head)
{
list_move(item, head);
}
#endif
#endif

View File

@@ -20,6 +20,7 @@
#include <net/sock.h>
#include <net/tcp.h>
#include <linux/log2.h>
#include <linux/jhash.h>
#include "format.h"
#include "counters.h"
@@ -31,6 +32,7 @@
#include "endian_swap.h"
#include "tseq.h"
#include "fence.h"
#include "options.h"
/*
* scoutfs networking delivers requests and responses between nodes.
@@ -134,6 +136,7 @@ struct message_send {
struct message_recv {
struct scoutfs_tseq_entry tseq_entry;
struct work_struct proc_work;
struct list_head ordered_head;
struct scoutfs_net_connection *conn;
struct scoutfs_net_header nh;
};
@@ -332,7 +335,7 @@ static int submit_send(struct super_block *sb,
return -EINVAL;
if (scoutfs_forcing_unmount(sb))
return -EIO;
return -ENOLINK;
msend = kmalloc(offsetof(struct message_send,
nh.data[data_len]), GFP_NOFS);
@@ -498,6 +501,51 @@ static void scoutfs_net_proc_worker(struct work_struct *work)
trace_scoutfs_net_proc_work_exit(sb, 0, ret);
}
static void scoutfs_net_ordered_proc_worker(struct work_struct *work)
{
struct scoutfs_work_list *wlist = container_of(work, struct scoutfs_work_list, work);
struct message_recv *mrecv;
struct message_recv *mrecv__;
LIST_HEAD(list);
spin_lock(&wlist->lock);
list_splice_init(&wlist->list, &list);
spin_unlock(&wlist->lock);
list_for_each_entry_safe(mrecv, mrecv__, &list, ordered_head) {
list_del_init(&mrecv->ordered_head);
scoutfs_net_proc_worker(&mrecv->proc_work);
}
}
/*
* Some messages require in-order processing. But the scope of the
* ordering isn't global. In the case of lock messages, it's per lock.
* So for these messages we hash them to a number of ordered workers who
* walk a list and call the usual work function in order. This replaced
* first the proc work detecting OOO and re-ordering, and then only
* calling proc from the one recv work context.
*/
static void queue_ordered_proc(struct scoutfs_net_connection *conn, struct message_recv *mrecv)
{
struct scoutfs_work_list *wlist;
struct scoutfs_net_lock *nl;
u32 h;
if (WARN_ON_ONCE(mrecv->nh.cmd != SCOUTFS_NET_CMD_LOCK ||
le16_to_cpu(mrecv->nh.data_len) != sizeof(struct scoutfs_net_lock)))
return scoutfs_net_proc_worker(&mrecv->proc_work);
nl = (void *)mrecv->nh.data;
h = jhash(&nl->key, sizeof(struct scoutfs_key), 0x6fdd3cd5);
wlist = &conn->ordered_proc_wlists[h % conn->ordered_proc_nr];
spin_lock(&wlist->lock);
list_add_tail(&mrecv->ordered_head, &wlist->list);
spin_unlock(&wlist->lock);
queue_work(conn->workq, &wlist->work);
}
/*
* Free live responses up to and including the seq by marking them dead
* and moving them to the send queue to be freed.
@@ -541,33 +589,17 @@ static void free_acked_responses(struct scoutfs_net_connection *conn, u64 seq)
queue_work(conn->workq, &conn->send_work);
}
static int recvmsg_full(struct socket *sock, void *buf, unsigned len)
static int k_recvmsg(struct socket *sock, void *buf, unsigned len)
{
struct msghdr msg;
struct kvec kv;
int ret;
struct kvec kv = {
.iov_base = buf,
.iov_len = len,
};
struct msghdr msg = {
.msg_flags = MSG_NOSIGNAL,
};
while (len) {
memset(&msg, 0, sizeof(msg));
msg.msg_flags = MSG_NOSIGNAL;
kv.iov_base = buf;
kv.iov_len = len;
#ifndef KC_MSGHDR_STRUCT_IOV_ITER
msg.msg_iov = (struct iovec *)&kv;
msg.msg_iovlen = 1;
#else
iov_iter_init(&msg.msg_iter, READ, (struct iovec *)&kv, len, 1);
#endif
ret = kernel_recvmsg(sock, &msg, &kv, 1, len, msg.msg_flags);
if (ret <= 0)
return -ECONNABORTED;
len -= ret;
buf += ret;
}
return 0;
return kernel_recvmsg(sock, &msg, &kv, 1, len, msg.msg_flags);
}
static bool invalid_message(struct scoutfs_net_connection *conn,
@@ -604,6 +636,72 @@ static bool invalid_message(struct scoutfs_net_connection *conn,
return false;
}
static int recv_one_message(struct super_block *sb, struct net_info *ninf,
struct scoutfs_net_connection *conn, struct scoutfs_net_header *nh,
unsigned int data_len)
{
struct message_recv *mrecv;
int ret;
scoutfs_inc_counter(sb, net_recv_messages);
scoutfs_add_counter(sb, net_recv_bytes, nh_bytes(data_len));
trace_scoutfs_net_recv_message(sb, &conn->sockname, &conn->peername, nh);
/* caller's invalid message checked data len */
mrecv = kmalloc(offsetof(struct message_recv, nh.data[data_len]), GFP_NOFS);
if (!mrecv) {
ret = -ENOMEM;
goto out;
}
mrecv->conn = conn;
INIT_WORK(&mrecv->proc_work, scoutfs_net_proc_worker);
INIT_LIST_HEAD(&mrecv->ordered_head);
mrecv->nh = *nh;
if (data_len)
memcpy(mrecv->nh.data, (nh + 1), data_len);
if (nh->cmd == SCOUTFS_NET_CMD_GREETING) {
/* greetings are out of band, no seq mechanics */
set_conn_fl(conn, saw_greeting);
} else if (le64_to_cpu(nh->seq) <=
atomic64_read(&conn->recv_seq)) {
/* drop any resent duplicated messages */
scoutfs_inc_counter(sb, net_recv_dropped_duplicate);
kfree(mrecv);
ret = 0;
goto out;
} else {
/* record that we've received sender's seq */
atomic64_set(&conn->recv_seq, le64_to_cpu(nh->seq));
/* and free our responses that sender has received */
free_acked_responses(conn, le64_to_cpu(nh->recv_seq));
}
scoutfs_tseq_add(&ninf->msg_tseq_tree, &mrecv->tseq_entry);
/*
* Initial received greetings are processed inline
* before any other incoming messages.
*
* Incoming requests or responses to the lock client
* can't handle re-ordering, so they're queued to
* ordered receive processing work.
*/
if (nh->cmd == SCOUTFS_NET_CMD_GREETING)
scoutfs_net_proc_worker(&mrecv->proc_work);
else if (nh->cmd == SCOUTFS_NET_CMD_LOCK && !conn->listening_conn)
queue_ordered_proc(conn, mrecv);
else
queue_work(conn->workq, &mrecv->proc_work);
ret = 0;
out:
return ret;
}
/*
* Always block receiving from the socket. Errors trigger shutting down
* the connection.
@@ -614,86 +712,72 @@ static void scoutfs_net_recv_worker(struct work_struct *work)
struct super_block *sb = conn->sb;
struct net_info *ninf = SCOUTFS_SB(sb)->net_info;
struct socket *sock = conn->sock;
struct scoutfs_net_header nh;
struct message_recv *mrecv;
struct scoutfs_net_header *nh;
struct page *page = NULL;
unsigned int data_len;
int hdr_off;
int rx_off;
int size;
int ret;
trace_scoutfs_net_recv_work_enter(sb, 0, 0);
page = alloc_page(GFP_NOFS);
if (!page) {
ret = -ENOMEM;
goto out;
}
hdr_off = 0;
rx_off = 0;
for (;;) {
/* receive the header */
ret = recvmsg_full(sock, &nh, sizeof(nh));
if (ret)
break;
/* receiving an invalid message breaks the connection */
if (invalid_message(conn, &nh)) {
scoutfs_inc_counter(sb, net_recv_invalid_message);
ret = -EBADMSG;
break;
ret = k_recvmsg(sock, page_address(page) + rx_off, PAGE_SIZE - rx_off);
if (ret <= 0) {
ret = -ECONNABORTED;
goto out;
}
data_len = le16_to_cpu(nh.data_len);
rx_off += ret;
scoutfs_inc_counter(sb, net_recv_messages);
scoutfs_add_counter(sb, net_recv_bytes, nh_bytes(data_len));
trace_scoutfs_net_recv_message(sb, &conn->sockname,
&conn->peername, &nh);
for (;;) {
size = rx_off - hdr_off;
if (size < sizeof(struct scoutfs_net_header))
break;
/* invalid message checked data len */
mrecv = kmalloc(offsetof(struct message_recv,
nh.data[data_len]), GFP_NOFS);
if (!mrecv) {
ret = -ENOMEM;
break;
nh = page_address(page) + hdr_off;
/* receiving an invalid message breaks the connection */
if (invalid_message(conn, nh)) {
scoutfs_inc_counter(sb, net_recv_invalid_message);
ret = -EBADMSG;
break;
}
data_len = le16_to_cpu(nh->data_len);
if (sizeof(struct scoutfs_net_header) + data_len > size)
break;
ret = recv_one_message(sb, ninf, conn, nh, data_len);
if (ret < 0)
goto out;
hdr_off += sizeof(struct scoutfs_net_header) + data_len;
}
mrecv->conn = conn;
INIT_WORK(&mrecv->proc_work, scoutfs_net_proc_worker);
mrecv->nh = nh;
/* receive the data payload */
ret = recvmsg_full(sock, mrecv->nh.data, data_len);
if (ret) {
kfree(mrecv);
break;
if ((PAGE_SIZE - rx_off) <
(sizeof(struct scoutfs_net_header) + SCOUTFS_NET_MAX_DATA_LEN)) {
if (size)
memmove(page_address(page), page_address(page) + hdr_off, size);
hdr_off = 0;
rx_off = size;
}
if (nh.cmd == SCOUTFS_NET_CMD_GREETING) {
/* greetings are out of band, no seq mechanics */
set_conn_fl(conn, saw_greeting);
} else if (le64_to_cpu(nh.seq) <=
atomic64_read(&conn->recv_seq)) {
/* drop any resent duplicated messages */
scoutfs_inc_counter(sb, net_recv_dropped_duplicate);
kfree(mrecv);
continue;
} else {
/* record that we've received sender's seq */
atomic64_set(&conn->recv_seq, le64_to_cpu(nh.seq));
/* and free our responses that sender has received */
free_acked_responses(conn, le64_to_cpu(nh.recv_seq));
}
scoutfs_tseq_add(&ninf->msg_tseq_tree, &mrecv->tseq_entry);
/*
* Initial received greetings are processed
* synchronously before any other incoming messages.
*
* Incoming requests or responses to the lock client are
* called synchronously to avoid reordering.
*/
if (nh.cmd == SCOUTFS_NET_CMD_GREETING ||
(nh.cmd == SCOUTFS_NET_CMD_LOCK && !conn->listening_conn))
scoutfs_net_proc_worker(&mrecv->proc_work);
else
queue_work(conn->workq, &mrecv->proc_work);
}
out:
__free_page(page);
if (ret)
scoutfs_inc_counter(sb, net_recv_error);
@@ -703,33 +787,41 @@ static void scoutfs_net_recv_worker(struct work_struct *work)
trace_scoutfs_net_recv_work_exit(sb, 0, ret);
}
static int sendmsg_full(struct socket *sock, void *buf, unsigned len)
/*
* This consumes the kvec.
*/
static int k_sendmsg_full(struct socket *sock, struct kvec *kv, unsigned long nr_segs, size_t count)
{
struct msghdr msg;
struct kvec kv;
int ret;
int ret = 0;
while (len) {
memset(&msg, 0, sizeof(msg));
msg.msg_flags = MSG_NOSIGNAL;
kv.iov_base = buf;
kv.iov_len = len;
while (count > 0) {
struct msghdr msg = {
.msg_flags = MSG_NOSIGNAL,
};
#ifndef KC_MSGHDR_STRUCT_IOV_ITER
msg.msg_iov = (struct iovec *)&kv;
msg.msg_iovlen = 1;
#else
iov_iter_init(&msg.msg_iter, WRITE, (struct iovec *)&kv, len, 1);
#endif
ret = kernel_sendmsg(sock, &msg, &kv, 1, len);
if (ret <= 0)
return -ECONNABORTED;
ret = kernel_sendmsg(sock, &msg, kv, nr_segs, count);
if (ret <= 0) {
ret = -ECONNABORTED;
break;
}
len -= ret;
buf += ret;
count -= ret;
if (count) {
while (nr_segs > 0 && ret >= kv->iov_len) {
ret -= kv->iov_len;
kv++;
nr_segs--;
}
if (nr_segs > 0 && ret > 0) {
kv->iov_base += ret;
kv->iov_len -= ret;
}
BUG_ON(nr_segs == 0);
}
ret = 0;
}
return 0;
return ret;
}
static void free_msend(struct net_info *ninf, struct message_send *msend)
@@ -760,54 +852,73 @@ static void scoutfs_net_send_worker(struct work_struct *work)
struct super_block *sb = conn->sb;
struct net_info *ninf = SCOUTFS_SB(sb)->net_info;
struct message_send *msend;
int ret = 0;
struct message_send *_msend_;
struct kvec kv[16];
unsigned long nr_segs;
size_t count;
int len;
int ret;
trace_scoutfs_net_send_work_enter(sb, 0, 0);
spin_lock(&conn->lock);
while ((msend = list_first_entry_or_null(&conn->send_queue,
struct message_send, head))) {
if (msend->dead) {
free_msend(ninf, msend);
continue;
}
if ((msend->nh.cmd == SCOUTFS_NET_CMD_FAREWELL) &&
nh_is_response(&msend->nh)) {
set_conn_fl(conn, saw_farewell);
}
msend->nh.recv_seq =
cpu_to_le64(atomic64_read(&conn->recv_seq));
spin_unlock(&conn->lock);
len = nh_bytes(le16_to_cpu(msend->nh.data_len));
scoutfs_inc_counter(sb, net_send_messages);
scoutfs_add_counter(sb, net_send_bytes, len);
trace_scoutfs_net_send_message(sb, &conn->sockname,
&conn->peername, &msend->nh);
ret = sendmsg_full(conn->sock, &msend->nh, len);
for (;;) {
nr_segs = 0;
count = 0;
spin_lock(&conn->lock);
list_for_each_entry_safe(msend, _msend_, &conn->send_queue, head) {
if (msend->dead) {
free_msend(ninf, msend);
continue;
}
msend->nh.recv_seq = 0;
len = nh_bytes(le16_to_cpu(msend->nh.data_len));
if (ret)
break;
if ((msend->nh.cmd == SCOUTFS_NET_CMD_FAREWELL) &&
nh_is_response(&msend->nh)) {
set_conn_fl(conn, saw_farewell);
}
/* resend if it wasn't freed while we sent */
if (!msend->dead)
list_move_tail(&msend->head, &conn->resend_queue);
msend->nh.recv_seq = cpu_to_le64(atomic64_read(&conn->recv_seq));
scoutfs_inc_counter(sb, net_send_messages);
scoutfs_add_counter(sb, net_send_bytes, len);
trace_scoutfs_net_send_message(sb, &conn->sockname,
&conn->peername, &msend->nh);
count += len;
kv[nr_segs].iov_base = &msend->nh;
kv[nr_segs].iov_len = len;
if (++nr_segs == ARRAY_SIZE(kv))
break;
}
spin_unlock(&conn->lock);
if (nr_segs == 0) {
ret = 0;
goto out;
}
ret = k_sendmsg_full(conn->sock, kv, nr_segs, count);
if (ret < 0)
goto out;
spin_lock(&conn->lock);
list_for_each_entry_safe(msend, _msend_, &conn->send_queue, head) {
msend->nh.recv_seq = 0;
/* resend if it wasn't freed while we sent */
if (!msend->dead)
list_move_tail(&msend->head, &conn->resend_queue);
if (--nr_segs == 0)
break;
}
spin_unlock(&conn->lock);
}
spin_unlock(&conn->lock);
out:
if (ret) {
scoutfs_inc_counter(sb, net_send_error);
shutdown_conn(conn);
@@ -862,6 +973,7 @@ static void scoutfs_net_destroy_worker(struct work_struct *work)
destroy_workqueue(conn->workq);
scoutfs_tseq_del(&ninf->conn_tseq_tree, &conn->tseq_entry);
kfree(conn->info);
kfree(conn->ordered_proc_wlists);
trace_scoutfs_conn_destroy_free(conn);
kfree(conn);
@@ -887,7 +999,7 @@ static void destroy_conn(struct scoutfs_net_connection *conn)
* The TCP_KEEP* and TCP_USER_TIMEOUT option interaction is subtle.
* TCP_USER_TIMEOUT only applies if there is unacked written data in the
* send queue. It doesn't work if the connection is idle. Adding
* keepalice probes with user_timeout set changes how the keepalive
* keepalive probes with user_timeout set changes how the keepalive
* timeout is calculated. CNT no longer matters. Each time
* additional probes (not the first) are sent the user timeout is
* checked against the last time data was received. If none of the
@@ -899,14 +1011,16 @@ static void destroy_conn(struct scoutfs_net_connection *conn)
* elapses during the probe timer processing after the unsuccessful
* probes.
*/
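/*
* As a rough worked example of the arithmetic performed by
* sock_opts_and_names() below: with the default tcp_keepalive_timeout_ms
* of 10000 and UNRESPONSIVE_PROBES of 3, the socket gets
* keepidle = (10000 / MSEC_PER_SEC) - 3 = 7 seconds and a user timeout
* of 10000ms, so an unresponsive peer is declared dead roughly ten
* seconds after data was last received.
*/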
#define UNRESPONSIVE_TIMEOUT_SECS 10
#define UNRESPONSIVE_PROBES 3
static int sock_opts_and_names(struct scoutfs_net_connection *conn,
static int sock_opts_and_names(struct super_block *sb,
struct scoutfs_net_connection *conn,
struct socket *sock)
{
struct scoutfs_mount_options opts;
int optval;
int ret;
scoutfs_options_read(sb, &opts);
/* we use a keepalive timeout instead of send timeout */
ret = kc_sock_set_sndtimeo(sock, 0);
if (ret)
@@ -919,8 +1033,7 @@ static int sock_opts_and_names(struct scoutfs_net_connection *conn,
if (ret)
goto out;
BUILD_BUG_ON(UNRESPONSIVE_PROBES >= UNRESPONSIVE_TIMEOUT_SECS);
optval = UNRESPONSIVE_TIMEOUT_SECS - (UNRESPONSIVE_PROBES);
optval = (opts.tcp_keepalive_timeout_ms / MSEC_PER_SEC) - UNRESPONSIVE_PROBES;
ret = kc_tcp_sock_set_keepidle(sock, optval);
if (ret)
goto out;
@@ -930,7 +1043,7 @@ static int sock_opts_and_names(struct scoutfs_net_connection *conn,
if (ret)
goto out;
optval = UNRESPONSIVE_TIMEOUT_SECS * MSEC_PER_SEC;
optval = opts.tcp_keepalive_timeout_ms;
ret = kc_tcp_sock_set_user_timeout(sock, optval);
if (ret)
goto out;
@@ -998,7 +1111,7 @@ static void scoutfs_net_listen_worker(struct work_struct *work)
continue;
}
ret = sock_opts_and_names(acc_conn, acc_sock);
ret = sock_opts_and_names(sb, acc_conn, acc_sock);
if (ret) {
sock_release(acc_sock);
destroy_conn(acc_conn);
@@ -1069,7 +1182,7 @@ static void scoutfs_net_connect_worker(struct work_struct *work)
if (ret)
goto out;
ret = sock_opts_and_names(conn, sock);
ret = sock_opts_and_names(sb, conn, sock);
if (ret)
goto out;
@@ -1330,25 +1443,30 @@ scoutfs_net_alloc_conn(struct super_block *sb,
{
struct net_info *ninf = SCOUTFS_SB(sb)->net_info;
struct scoutfs_net_connection *conn;
unsigned int nr;
unsigned int i;
nr = min_t(unsigned int, num_possible_cpus(),
PAGE_SIZE / sizeof(struct scoutfs_work_list));
conn = kzalloc(sizeof(struct scoutfs_net_connection), GFP_NOFS);
if (!conn)
return NULL;
if (info_size) {
conn->info = kzalloc(info_size, GFP_NOFS);
if (!conn->info) {
kfree(conn);
return NULL;
}
if (conn) {
if (info_size)
conn->info = kzalloc(info_size, GFP_NOFS);
conn->ordered_proc_wlists = kmalloc_array(nr, sizeof(struct scoutfs_work_list),
GFP_NOFS);
conn->workq = alloc_workqueue("scoutfs_net_%s",
WQ_UNBOUND | WQ_NON_REENTRANT, 0,
name_suffix);
}
conn->workq = alloc_workqueue("scoutfs_net_%s",
WQ_UNBOUND | WQ_NON_REENTRANT, 0,
name_suffix);
if (!conn->workq) {
kfree(conn->info);
kfree(conn);
if (!conn || (info_size && !conn->info) || !conn->workq || !conn->ordered_proc_wlists) {
if (conn) {
kfree(conn->info);
kfree(conn->ordered_proc_wlists);
if (conn->workq)
destroy_workqueue(conn->workq);
kfree(conn);
}
return NULL;
}
@@ -1378,6 +1496,13 @@ scoutfs_net_alloc_conn(struct super_block *sb,
INIT_DELAYED_WORK(&conn->reconn_free_dwork,
scoutfs_net_reconn_free_worker);
conn->ordered_proc_nr = nr;
for (i = 0; i < nr; i++) {
INIT_WORK(&conn->ordered_proc_wlists[i].work, scoutfs_net_ordered_proc_worker);
spin_lock_init(&conn->ordered_proc_wlists[i].lock);
INIT_LIST_HEAD(&conn->ordered_proc_wlists[i].list);
}
scoutfs_tseq_add(&ninf->conn_tseq_tree, &conn->tseq_entry);
trace_scoutfs_conn_alloc(conn);

View File

@@ -1,10 +1,18 @@
#ifndef _SCOUTFS_NET_H_
#define _SCOUTFS_NET_H_
#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/in.h>
#include "endian_swap.h"
#include "tseq.h"
struct scoutfs_work_list {
struct work_struct work;
spinlock_t lock;
struct list_head list;
};
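/*
* Connections keep an array of these (ordered_proc_wlists below), sized
* in scoutfs_net_alloc_conn(); each entry's work item runs
* scoutfs_net_ordered_proc_worker() over the list protected by its lock.
*/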
struct scoutfs_net_connection;
/* These are called in their own blocking context */
@@ -61,6 +69,8 @@ struct scoutfs_net_connection {
struct list_head resend_queue;
atomic64_t recv_seq;
unsigned int ordered_proc_nr;
struct scoutfs_work_list *ordered_proc_wlists;
struct workqueue_struct *workq;
struct work_struct listen_work;

View File

@@ -39,6 +39,7 @@ enum {
Opt_orphan_scan_delay_ms,
Opt_quorum_heartbeat_timeout_ms,
Opt_quorum_slot_nr,
Opt_tcp_keepalive_timeout_ms,
Opt_err,
};
@@ -52,6 +53,7 @@ static const match_table_t tokens = {
{Opt_orphan_scan_delay_ms, "orphan_scan_delay_ms=%s"},
{Opt_quorum_heartbeat_timeout_ms, "quorum_heartbeat_timeout_ms=%s"},
{Opt_quorum_slot_nr, "quorum_slot_nr=%s"},
{Opt_tcp_keepalive_timeout_ms, "tcp_keepalive_timeout_ms=%s"},
{Opt_err, NULL}
};
@@ -126,6 +128,8 @@ static void free_options(struct scoutfs_mount_options *opts)
#define MIN_DATA_PREALLOC_BLOCKS 1ULL
#define MAX_DATA_PREALLOC_BLOCKS ((unsigned long long)SCOUTFS_BLOCK_SM_MAX)
#define DEFAULT_TCP_KEEPALIVE_TIMEOUT_MS (10 * MSEC_PER_SEC)
static void init_default_options(struct scoutfs_mount_options *opts)
{
memset(opts, 0, sizeof(*opts));
@@ -136,6 +140,7 @@ static void init_default_options(struct scoutfs_mount_options *opts)
opts->orphan_scan_delay_ms = -1;
opts->quorum_heartbeat_timeout_ms = SCOUTFS_QUORUM_DEF_HB_TIMEO_MS;
opts->quorum_slot_nr = -1;
opts->tcp_keepalive_timeout_ms = DEFAULT_TCP_KEEPALIVE_TIMEOUT_MS;
}
static int verify_log_merge_wait_timeout_ms(struct super_block *sb, int ret, int val)
@@ -168,6 +173,21 @@ static int verify_quorum_heartbeat_timeout_ms(struct super_block *sb, int ret, u
return 0;
}
static int verify_tcp_keepalive_timeout_ms(struct super_block *sb, int ret, int val)
{
if (ret < 0) {
scoutfs_err(sb, "failed to parse tcp_keepalive_timeout_ms value");
return -EINVAL;
}
if (val <= (UNRESPONSIVE_PROBES * MSEC_PER_SEC)) {
scoutfs_err(sb, "invalid tcp_keepalive_timeout_ms value %d, must be larger than %lu",
val, (UNRESPONSIVE_PROBES * MSEC_PER_SEC));
return -EINVAL;
}
return 0;
}
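/*
* For example, with UNRESPONSIVE_PROBES of 3 this rejects 3000 and below
* and accepts 3001 and above; the man page additionally advises against
* values above 30000 (30s).
*/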
/*
* Parse the option string into our options struct. This can allocate
* memory in the struct. The caller is responsible for always calling
@@ -218,6 +238,14 @@ static int parse_options(struct super_block *sb, char *options, struct scoutfs_m
opts->data_prealloc_contig_only = nr;
break;
case Opt_tcp_keepalive_timeout_ms:
ret = match_int(args, &nr);
ret = verify_tcp_keepalive_timeout_ms(sb, ret, nr);
if (ret < 0)
return ret;
opts->tcp_keepalive_timeout_ms = nr;
break;
case Opt_log_merge_wait_timeout_ms:
ret = match_int(args, &nr);
ret = verify_log_merge_wait_timeout_ms(sb, ret, nr);
@@ -371,6 +399,7 @@ int scoutfs_options_show(struct seq_file *seq, struct dentry *root)
seq_printf(seq, ",orphan_scan_delay_ms=%u", opts.orphan_scan_delay_ms);
if (opts.quorum_slot_nr >= 0)
seq_printf(seq, ",quorum_slot_nr=%d", opts.quorum_slot_nr);
seq_printf(seq, ",tcp_keepalive_timeout_ms=%d", opts.tcp_keepalive_timeout_ms);
return 0;
}

View File

@@ -13,8 +13,11 @@ struct scoutfs_mount_options {
unsigned int orphan_scan_delay_ms;
int quorum_slot_nr;
u64 quorum_heartbeat_timeout_ms;
int tcp_keepalive_timeout_ms;
};
#define UNRESPONSIVE_PROBES 3
void scoutfs_options_read(struct super_block *sb, struct scoutfs_mount_options *opts);
int scoutfs_options_show(struct seq_file *seq, struct dentry *root);

View File

@@ -243,10 +243,6 @@ static int send_msg_members(struct super_block *sb, int type, u64 term, int only
};
struct sockaddr_in sin;
struct msghdr mh = {
#ifndef KC_MSGHDR_STRUCT_IOV_ITER
.msg_iov = (struct iovec *)&kv,
.msg_iovlen = 1,
#endif
.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL,
.msg_name = &sin,
.msg_namelen = sizeof(sin),
@@ -268,9 +264,7 @@ static int send_msg_members(struct super_block *sb, int type, u64 term, int only
scoutfs_quorum_slot_sin(&qinf->qconf, i, &sin);
now = ktime_get();
#ifdef KC_MSGHDR_STRUCT_IOV_ITER
iov_iter_init(&mh.msg_iter, WRITE, (struct iovec *)&kv, sizeof(qmes), 1);
#endif
ret = kernel_sendmsg(qinf->sock, &mh, &kv, 1, kv.iov_len);
if (ret != kv.iov_len)
failed++;
@@ -312,10 +306,6 @@ static int recv_msg(struct super_block *sb, struct quorum_host_msg *msg,
.iov_len = sizeof(struct scoutfs_quorum_message),
};
struct msghdr mh = {
#ifndef KC_MSGHDR_STRUCT_IOV_ITER
.msg_iov = (struct iovec *)&kv,
.msg_iovlen = 1,
#endif
.msg_flags = MSG_NOSIGNAL,
};
@@ -333,9 +323,6 @@ static int recv_msg(struct super_block *sb, struct quorum_host_msg *msg,
ret = kc_tcp_sock_set_rcvtimeo(qinf->sock, rel_to);
}
#ifdef KC_MSGHDR_STRUCT_IOV_ITER
iov_iter_init(&mh.msg_iter, READ, (struct iovec *)&kv, sizeof(struct scoutfs_quorum_message), 1);
#endif
ret = kernel_recvmsg(qinf->sock, &mh, &kv, 1, kv.iov_len, mh.msg_flags);
if (ret < 0)
return ret;
@@ -726,6 +713,8 @@ static void scoutfs_quorum_worker(struct work_struct *work)
struct quorum_status qst = {0,};
struct hb_recording hbr;
bool record_hb;
bool recv_failed;
bool initializing = true;
int ret;
int err;
@@ -758,6 +747,8 @@ static void scoutfs_quorum_worker(struct work_struct *work)
update_show_status(qinf, &qst);
recv_failed = false;
ret = recv_msg(sb, &msg, qst.timeout);
if (ret < 0) {
if (ret != -ETIMEDOUT && ret != -EAGAIN) {
@@ -765,6 +756,9 @@ static void scoutfs_quorum_worker(struct work_struct *work)
scoutfs_inc_counter(sb, quorum_recv_error);
goto out;
}
recv_failed = true;
msg.type = SCOUTFS_QUORUM_MSG_INVALID;
ret = 0;
}
@@ -822,12 +816,13 @@ static void scoutfs_quorum_worker(struct work_struct *work)
/* followers and candidates start new election on timeout */
if (qst.role != LEADER &&
(initializing || recv_failed) &&
ktime_after(ktime_get(), qst.timeout)) {
/* .. but only if their server has stopped */
if (!scoutfs_server_is_down(sb)) {
qst.timeout = election_timeout();
scoutfs_inc_counter(sb, quorum_candidate_server_stopping);
continue;
goto again;
}
qst.role = CANDIDATE;
@@ -964,6 +959,9 @@ static void scoutfs_quorum_worker(struct work_struct *work)
}
record_hb_delay(sb, qinf, &hbr, record_hb, qst.role);
again:
initializing = false;
}
update_show_status(qinf, &qst);

View File

@@ -823,13 +823,14 @@ DEFINE_EVENT(scoutfs_lock_info_class, scoutfs_lock_destroy,
);
TRACE_EVENT(scoutfs_xattr_set,
TP_PROTO(struct super_block *sb, size_t name_len, const void *value,
size_t size, int flags),
TP_PROTO(struct super_block *sb, __u64 ino, size_t name_len,
const void *value, size_t size, int flags),
TP_ARGS(sb, name_len, value, size, flags),
TP_ARGS(sb, ino, name_len, value, size, flags),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, ino)
__field(size_t, name_len)
__field(const void *, value)
__field(size_t, size)
@@ -838,15 +839,16 @@ TRACE_EVENT(scoutfs_xattr_set,
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->ino = ino;
__entry->name_len = name_len;
__entry->value = value;
__entry->size = size;
__entry->flags = flags;
),
TP_printk(SCSBF" name_len %zu value %p size %zu flags 0x%x",
SCSB_TRACE_ARGS, __entry->name_len, __entry->value,
__entry->size, __entry->flags)
TP_printk(SCSBF" ino %llu name_len %zu value %p size %zu flags 0x%x",
SCSB_TRACE_ARGS, __entry->ino, __entry->name_len,
__entry->value, __entry->size, __entry->flags)
);
TRACE_EVENT(scoutfs_advance_dirty_super,
@@ -2463,6 +2465,27 @@ TRACE_EVENT(scoutfs_block_dirty_ref,
__entry->block_blkno, __entry->block_seq)
);
TRACE_EVENT(scoutfs_get_file_block,
TP_PROTO(struct super_block *sb, u64 blkno, int flags),
TP_ARGS(sb, blkno, flags),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, blkno)
__field(int, flags)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->blkno = blkno;
__entry->flags = flags;
),
TP_printk(SCSBF" blkno %llu flags 0x%x",
SCSB_TRACE_ARGS, __entry->blkno, __entry->flags)
);
TRACE_EVENT(scoutfs_block_stale,
TP_PROTO(struct super_block *sb, struct scoutfs_block_ref *ref,
struct scoutfs_block_header *hdr, u32 magic, u32 crc),
@@ -2503,8 +2526,8 @@ TRACE_EVENT(scoutfs_block_stale,
DECLARE_EVENT_CLASS(scoutfs_block_class,
TP_PROTO(struct super_block *sb, void *bp, u64 blkno, int refcount, int io_count,
unsigned long bits, __u64 accessed),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed),
unsigned long bits),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(void *, bp)
@@ -2512,7 +2535,6 @@ DECLARE_EVENT_CLASS(scoutfs_block_class,
__field(int, refcount)
__field(int, io_count)
__field(long, bits)
__field(__u64, accessed)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
@@ -2521,71 +2543,65 @@ DECLARE_EVENT_CLASS(scoutfs_block_class,
__entry->refcount = refcount;
__entry->io_count = io_count;
__entry->bits = bits;
__entry->accessed = accessed;
),
TP_printk(SCSBF" bp %p blkno %llu refcount %d io_count %d bits 0x%lx accessed %llu",
TP_printk(SCSBF" bp %p blkno %llu refcount %x io_count %d bits 0x%lx",
SCSB_TRACE_ARGS, __entry->bp, __entry->blkno, __entry->refcount,
__entry->io_count, __entry->bits, __entry->accessed)
__entry->io_count, __entry->bits)
);
DEFINE_EVENT(scoutfs_block_class, scoutfs_block_allocate,
TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
int refcount, int io_count, unsigned long bits,
__u64 accessed),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
int refcount, int io_count, unsigned long bits),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
);
DEFINE_EVENT(scoutfs_block_class, scoutfs_block_free,
TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
int refcount, int io_count, unsigned long bits,
__u64 accessed),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
int refcount, int io_count, unsigned long bits),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
);
DEFINE_EVENT(scoutfs_block_class, scoutfs_block_insert,
TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
int refcount, int io_count, unsigned long bits,
__u64 accessed),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
int refcount, int io_count, unsigned long bits),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
);
DEFINE_EVENT(scoutfs_block_class, scoutfs_block_remove,
TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
int refcount, int io_count, unsigned long bits,
__u64 accessed),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
int refcount, int io_count, unsigned long bits),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
);
DEFINE_EVENT(scoutfs_block_class, scoutfs_block_end_io,
TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
int refcount, int io_count, unsigned long bits,
__u64 accessed),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
int refcount, int io_count, unsigned long bits),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
);
DEFINE_EVENT(scoutfs_block_class, scoutfs_block_submit,
TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
int refcount, int io_count, unsigned long bits,
__u64 accessed),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
int refcount, int io_count, unsigned long bits),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
);
DEFINE_EVENT(scoutfs_block_class, scoutfs_block_invalidate,
TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
int refcount, int io_count, unsigned long bits,
__u64 accessed),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
int refcount, int io_count, unsigned long bits),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
);
DEFINE_EVENT(scoutfs_block_class, scoutfs_block_mark_dirty,
TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
int refcount, int io_count, unsigned long bits,
__u64 accessed),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
int refcount, int io_count, unsigned long bits),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
);
DEFINE_EVENT(scoutfs_block_class, scoutfs_block_forget,
TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
int refcount, int io_count, unsigned long bits,
__u64 accessed),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
int refcount, int io_count, unsigned long bits),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
);
DEFINE_EVENT(scoutfs_block_class, scoutfs_block_shrink,
TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
int refcount, int io_count, unsigned long bits,
__u64 accessed),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
int refcount, int io_count, unsigned long bits),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
);
DEFINE_EVENT(scoutfs_block_class, scoutfs_block_isolate,
TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
int refcount, int io_count, unsigned long bits),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
);
DECLARE_EVENT_CLASS(scoutfs_ext_next_class,
@@ -3060,6 +3076,27 @@ DEFINE_EVENT(scoutfs_srch_compact_class, scoutfs_srch_compact_client_recv,
TP_ARGS(sb, sc)
);
TRACE_EVENT(scoutfs_ioc_search_xattrs,
TP_PROTO(struct super_block *sb, u64 ino, u64 last_ino),
TP_ARGS(sb, ino, last_ino),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(u64, ino)
__field(u64, last_ino)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->ino = ino;
__entry->last_ino = last_ino;
),
TP_printk(SCSBF" ino %llu last_ino %llu", SCSB_TRACE_ARGS,
__entry->ino, __entry->last_ino)
);
#endif /* _TRACE_SCOUTFS_H */
/* This part must be outside protection */

View File

@@ -610,7 +610,7 @@ static void scoutfs_server_commit_func(struct work_struct *work)
goto out;
if (scoutfs_forcing_unmount(sb)) {
ret = -EIO;
ret = -ENOLINK;
goto out;
}
@@ -1256,6 +1256,7 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
/* done if we're not finalizing and there's no finalized */
if (!finalize_ours && !saw_finalized) {
ret = 0;
scoutfs_inc_counter(sb, log_merge_no_finalized);
break;
}
@@ -1889,6 +1890,9 @@ static int reclaim_open_log_tree(struct super_block *sb, u64 rid)
out:
mutex_unlock(&server->logs_mutex);
if (ret == 0)
scoutfs_inc_counter(sb, reclaimed_open_logs);
if (ret < 0 && ret != -EINPROGRESS)
scoutfs_err(sb, "server error %d reclaiming log trees for rid %016llx: %s",
ret, rid, err_str);

View File

@@ -62,7 +62,7 @@
* re-allocated and re-written. Search can restart by checking the
* btree for the current set of files. Compaction reads log files which
* are protected from other compactions by the persistent busy items
* created by the server. Compaction won't see it's blocks reused out
* created by the server. Compaction won't see its blocks reused out
* from under it, but it can encounter stale cached blocks that need to
* be invalidated.
*/
@@ -442,6 +442,10 @@ out:
if (ret == 0 && (flags & GFB_INSERT) && blk >= le64_to_cpu(sfl->blocks))
sfl->blocks = cpu_to_le64(blk + 1);
if (bl) {
trace_scoutfs_get_file_block(sb, bl->blkno, flags);
}
*bl_ret = bl;
return ret;
}
@@ -749,14 +753,14 @@ static int search_log_file(struct super_block *sb,
for (i = 0; i < le32_to_cpu(srb->entry_nr); i++) {
if (pos > SCOUTFS_SRCH_BLOCK_SAFE_BYTES) {
/* can only be inconsistency :/ */
ret = EIO;
ret = -EIO;
break;
}
ret = decode_entry(srb->entries + pos, &sre, &prev);
if (ret <= 0) {
/* can only be inconsistency :/ */
ret = EIO;
ret = -EIO;
break;
}
pos += ret;
@@ -859,14 +863,14 @@ static int search_sorted_file(struct super_block *sb,
if (pos > SCOUTFS_SRCH_BLOCK_SAFE_BYTES) {
/* can only be inconsistency :/ */
ret = EIO;
ret = -EIO;
break;
}
ret = decode_entry(srb->entries + pos, &sre, &prev);
if (ret <= 0) {
/* can only be inconsistency :/ */
ret = EIO;
ret = -EIO;
break;
}
pos += ret;
@@ -972,6 +976,8 @@ int scoutfs_srch_search_xattrs(struct super_block *sb,
scoutfs_inc_counter(sb, srch_search_xattrs);
trace_scoutfs_ioc_search_xattrs(sb, ino, last_ino);
*done = false;
srch_init_rb_root(sroot);
@@ -1802,7 +1808,7 @@ static void swap_page_sre(void *A, void *B, int size)
* typically, ~10x worst case).
*
* Because we read and sort all the input files we must perform the full
* compaction in one operation. The server must have given us a
* compaction in one operation. The server must have given us
* sufficiently large avail/freed lists, otherwise we'll return ENOSPC.
*/
static int compact_logs(struct super_block *sb,
@@ -1866,14 +1872,14 @@ static int compact_logs(struct super_block *sb,
if (pos > SCOUTFS_SRCH_BLOCK_SAFE_BYTES) {
/* can only be inconsistency :/ */
ret = EIO;
ret = -EIO;
break;
}
ret = decode_entry(srb->entries + pos, sre, &prev);
if (ret <= 0) {
/* can only be inconsistency :/ */
ret = EIO;
ret = -EIO;
goto out;
}
prev = *sre;

View File

@@ -196,7 +196,7 @@ static int retry_forever(struct super_block *sb, int (*func)(struct super_block
}
if (scoutfs_forcing_unmount(sb)) {
ret = -EIO;
ret = -ENOLINK;
break;
}
@@ -252,7 +252,7 @@ void scoutfs_trans_write_func(struct work_struct *work)
}
if (scoutfs_forcing_unmount(sb)) {
ret = -EIO;
ret = -ENOLINK;
goto out;
}

View File

@@ -742,7 +742,7 @@ int scoutfs_xattr_set_locked(struct inode *inode, const char *name, size_t name_
int ret;
int err;
trace_scoutfs_xattr_set(sb, name_len, value, size, flags);
trace_scoutfs_xattr_set(sb, ino, name_len, value, size, flags);
if (WARN_ON_ONCE(tgs->totl && tgs->indx) ||
WARN_ON_ONCE((tgs->totl | tgs->indx) && !tag_lock))

View File

@@ -140,6 +140,9 @@ t_filter_dmesg()
re="$re|scoutfs .* error.*server failed to bind to.*"
re="$re|scoutfs .* critical transaction commit failure.*"
# ENOLINK (-67) indicates an expected forced unmount error
re="$re|scoutfs .* error -67 .*"
# change-devices causes loop device resizing
re="$re|loop: module loaded"
re="$re|loop[0-9].* detected capacity change from.*"

View File

@@ -11,8 +11,8 @@
# format version.
#
# not supported on el9!
if [ $(source /etc/os-release ; echo ${VERSION_ID:0:1}) -gt 8 ]; then
# not supported on el8 or higher
if [ $(source /etc/os-release ; echo ${VERSION_ID:0:1}) -gt 7 ]; then
t_skip_permitted "Unsupported OS version"
fi

View File

@@ -67,6 +67,21 @@ t_mount_all
while test -d $(echo /sys/fs/scoutfs/*/fence/* | cut -d " " -f 1); do
sleep .5
done
sv=$(t_server_nr)
# wait for reclaim_open_log_tree() to complete for each mount
while [ $(t_counter reclaimed_open_logs $sv) -lt $T_NR_MOUNTS ]; do
sleep 1
done
# wait for finalize_and_start_log_merge() to find no active merges in flight
# and not find any finalized trees
while [ $(t_counter log_merge_no_finalized $sv) -lt 1 ]; do
sleep 1
done
# wait for orphan scans to run
t_set_all_sysfs_mount_options orphan_scan_delay_ms 1000
# wait until we see two consecutive orphan scan attempts without

View File

@@ -130,6 +130,24 @@ the server for the filesystem if it is elected leader.
The assigned number must match one of the slots defined with \-Q options
when the filesystem was created with mkfs. If the number assigned
doesn't match a number created during mkfs then the mount will fail.
.TP
.B tcp_keepalive_timeout_ms=<number>
This option sets how long, in milliseconds, a client connection may go
without receiving data or keepalive responses from its peer before the
connection is declared dead. The setting is per-mount and only changes
the behavior of that mount.
.sp
The default value is 10000ms (10s). Precision finer than a whole second
is unlikely to matter because the kernel's keepalive probe timers are
configured in whole seconds. Any value higher than 3000 (3s) is
accepted; values above 30000ms (30s) will likely interfere with other
internal timeout values.
.sp
The TCP keepalive mechanism is subtle, and detecting a lost connection
quickly is important for cluster stability. If the local network
suffers intermittent outages, raising this value may let mounts ride
out those outages without the cluster becoming desynchronized.
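.sp
For example, a mount on a network with short intermittent outages could
raise the timeout (15000 is only an illustrative value within the
accepted range, and the device, mount point, and remaining options are
placeholders):
.sp
.nf
mount \-t scoutfs \-o tcp_keepalive_timeout_ms=15000,<other options> <device> <mountpoint>
.fi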
.SH VOLUME OPTIONS
Volume options are persistent options which are stored in the super
block in the metadata device and which apply to all mounts of the volume.