Update tracing with cluster lock changes

Signed-off-by: Zach Brown <zab@versity.com>
Directly queue cluster lock work
2026-06-09 21:22:36 +00:00 · 2025-10-31 15:38:31 -05:00 · 2025-10-31 15:38:31 -05:00 · 2025-10-31 15:38:31 -05:00 · 2025-10-31 15:38:31 -05:00 · 2025-10-31 15:38:31 -05:00
45 changed files with 1883 additions and 1103 deletions
@@ -5,13 +5,6 @@ ifeq ($(SK_KSRC),)
 SK_KSRC := $(shell echo /lib/modules/`uname -r`/build)
 endif

-# fail if sparse fails if we find it
-ifeq ($(shell sparse && echo found),found)
-SP =
-else
-SP = @:
-endif
-
 SCOUTFS_GIT_DESCRIBE ?= \
 	$(shell git describe --all --abbrev=6 --long 2>/dev/null || \
 		echo no-git)
@@ -36,9 +29,7 @@ TARFILE = scoutfs-kmod-$(RPM_VERSION).tar
 all: module

 module:
-	$(MAKE) $(SCOUTFS_ARGS)
-	$(SP) $(MAKE) C=2 CF="-D__CHECK_ENDIAN__" $(SCOUTFS_ARGS)
-
+	$(MAKE) CHECK=$(CURDIR)/src/sparse-filtered.sh C=1 CF="-D__CHECK_ENDIAN__" $(SCOUTFS_ARGS)

 modules_install:
 	$(MAKE) $(SCOUTFS_ARGS) modules_install
@@ -158,15 +158,6 @@ ifneq (,$(shell grep 'sock_create_kern.*struct net' include/linux/net.h))
 ccflags-y += -DKC_SOCK_CREATE_KERN_NET=1
 endif

-#
-# v3.18-rc6-1619-gc0371da6047a
-#
-# iov_iter is now part of struct msghdr
-#
-ifneq (,$(shell grep 'struct iov_iter.*msg_iter' include/linux/socket.h))
-ccflags-y += -DKC_MSGHDR_STRUCT_IOV_ITER=1
-endif
-
 #
 # v4.17-rc6-7-g95582b008388
 #
@@ -287,6 +278,14 @@ ifneq (,$(shell grep 'int ..mknod. .struct user_namespace' include/linux/fs.h))
 ccflags-y += -DKC_VFS_METHOD_USER_NAMESPACE_ARG
 endif

+#
+# v6.2-rc1-2-gabf08576afe3
+#
+# fs: vfs methods use struct mnt_idmap instead of struct user_namespace
+ifneq (,$(shell grep 'int vfs_mknod.struct mnt_idmap' include/linux/fs.h))
+ccflags-y += -DKC_VFS_METHOD_MNT_IDMAP_ARG
+endif
+
 #
 # v5.17-rc2-21-g07888c665b40
 #
@@ -434,3 +433,56 @@ endif
 ifneq (,$(shell grep 'int ..remap_pages..struct vm_area_struct' include/linux/mm.h))
 ccflags-y += -DKC_MM_REMAP_PAGES
 endif
+
+#
+# v3.19-4742-g503c358cf192
+#
+# list_lru_shrink_count() and list_lru_shrink_walk() introduced
+#
+ifneq (,$(shell grep 'list_lru_shrink_count.*struct list_lru' include/linux/list_lru.h))
+ccflags-y += -DKC_LIST_LRU_SHRINK_COUNT_WALK
+endif
+
+#
+# v3.19-4757-g3f97b163207c
+#
+# list_lru_walk_cb lru arg added
+#
+ifneq (,$(shell grep 'struct list_head \*item, spinlock_t \*lock, void \*cb_arg' include/linux/list_lru.h))
+ccflags-y += -DKC_LIST_LRU_WALK_CB_ITEM_LOCK
+endif
+
+#
+# v6.7-rc4-153-g0a97c01cd20b
+#
+# list_lru_{add,del} -> list_lru_{add,del}_obj
+#
+ifneq (,$(shell grep '^bool list_lru_add_obj' include/linux/list_lru.h))
+ccflags-y += -DKC_LIST_LRU_ADD_OBJ
+endif
+
+#
+# v6.12-rc6-227-gda0c02516c50
+#
+# list_lru_walk_cb lock arg removed
+#
+ifneq (,$(shell grep 'struct list_lru_one \*list, spinlock_t \*lock, void \*cb_arg' include/linux/list_lru.h))
+ccflags-y += -DKC_LIST_LRU_WALK_CB_LIST_LOCK
+endif
+
+#
+# v5.1-rc4-273-ge9b98e162aa5
+#
+# introduce stack trace helpers
+#
+ifneq (,$(shell grep '^unsigned int stack_trace_save' include/linux/stacktrace.h))
+ccflags-y += -DKC_STACK_TRACE_SAVE
+endif
+
+# v6.1-rc1-4-g7420332a6ff4
+#
+# .get_acl() method now has dentry arg (and mnt_idmap). The old get_acl has been renamed
+# to get_inode_acl() and is still available as well, but has an extra rcu param.
+ifneq (,$(shell grep 'struct posix_acl ...get_acl..struct mnt_idmap ., struct dentry' include/linux/fs.h))
+ccflags-y += -DKC_GET_ACL_DENTRY
+endif
@@ -107,8 +107,15 @@ struct posix_acl *scoutfs_get_acl_locked(struct inode *inode, int type, struct s
 	return acl;
 }

+#ifdef KC_GET_ACL_DENTRY
+struct posix_acl *scoutfs_get_acl(KC_VFS_NS_DEF
+				  struct dentry *dentry, int type)
+{
+	struct inode *inode = dentry->d_inode;
+#else
 struct posix_acl *scoutfs_get_acl(struct inode *inode, int type)
 {
+#endif
 	struct super_block *sb = inode->i_sb;
 	struct scoutfs_lock *lock = NULL;
 	struct posix_acl *acl;
@@ -201,8 +208,15 @@ out:
 	return ret;
 }

+#ifdef KC_GET_ACL_DENTRY
+int scoutfs_set_acl(KC_VFS_NS_DEF
+		    struct dentry *dentry, struct posix_acl *acl, int type)
+{
+	struct inode *inode = dentry->d_inode;
+#else
 int scoutfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 {
+#endif
 	struct super_block *sb = inode->i_sb;
 	struct scoutfs_lock *lock = NULL;
 	LIST_HEAD(ind_locks);
@@ -240,7 +254,12 @@ int scoutfs_acl_get_xattr(struct dentry *dentry, const char *name, void *value,
 	if (!IS_POSIXACL(dentry->d_inode))
 		return -EOPNOTSUPP;

+#ifdef KC_GET_ACL_DENTRY
+	acl = scoutfs_get_acl(KC_VFS_INIT_NS
+			      dentry, type);
+#else
 	acl = scoutfs_get_acl(dentry->d_inode, type);
+#endif
 	if (IS_ERR(acl))
 		return PTR_ERR(acl);
 	if (acl == NULL)
@@ -286,7 +305,11 @@ int scoutfs_acl_set_xattr(struct dentry *dentry, const char *name, const void *v
 		}
 	}

+#ifdef KC_GET_ACL_DENTRY
+	ret = scoutfs_set_acl(KC_VFS_INIT_NS dentry, acl, type);
+#else
 	ret = scoutfs_set_acl(dentry->d_inode, acl, type);
+#endif
 out:
 	posix_acl_release(acl);

@@ -1,9 +1,14 @@
 #ifndef _SCOUTFS_ACL_H_
 #define _SCOUTFS_ACL_H_

+#ifdef KC_GET_ACL_DENTRY
+struct posix_acl *scoutfs_get_acl(KC_VFS_NS_DEF struct dentry *dentry, int type);
+int scoutfs_set_acl(KC_VFS_NS_DEF struct dentry *dentry, struct posix_acl *acl, int type);
+#else
 struct posix_acl *scoutfs_get_acl(struct inode *inode, int type);
-struct posix_acl *scoutfs_get_acl_locked(struct inode *inode, int type, struct scoutfs_lock *lock);
 int scoutfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+#endif
+struct posix_acl *scoutfs_get_acl_locked(struct inode *inode, int type, struct scoutfs_lock *lock);
 int scoutfs_set_acl_locked(struct inode *inode, struct posix_acl *acl, int type,
 			   struct scoutfs_lock *lock, struct list_head *ind_locks);
 #ifdef KC_XATTR_STRUCT_XATTR_HANDLER
@@ -857,7 +857,7 @@ static int find_zone_extent(struct super_block *sb, struct scoutfs_alloc_root *r
 		.zone = SCOUTFS_FREE_EXTENT_ORDER_ZONE,
 	};
 	struct scoutfs_extent found;
-	struct scoutfs_extent ext;
+	struct scoutfs_extent ext = {0,};
 	u64 start;
 	u64 len;
 	int nr;
@@ -22,6 +22,8 @@
 #include <linux/rhashtable.h>
 #include <linux/random.h>
 #include <linux/sched/mm.h>
+#include <linux/list_lru.h>
+#include <linux/stacktrace.h>

 #include "format.h"
 #include "super.h"
@@ -38,26 +40,12 @@
 * than the page size.  Callers can have their own contexts for tracking
 * dirty blocks that are written together.  We pin dirty blocks in
 * memory and only checksum them all as they're all written.
- *
- * Memory reclaim is driven by maintaining two very coarse groups of
- * blocks.  As we access blocks we mark them with an increasing counter
- * to discourage them from being reclaimed.  We then define a threshold
- * at the current counter minus half the population.  Recent blocks have
- * a counter greater than the threshold, and all other blocks with
- * counters less than it are considered older and are candidates for
- * reclaim.  This results in access updates rarely modifying an atomic
- * counter as blocks need to be moved into the recent group, and shrink
- * can randomly scan blocks looking for the half of the population that
- * will be in the old group.  It's reasonably effective, but is
- * particularly efficient and avoids contention between concurrent
- * accesses and shrinking.
 */

 struct block_info {
 	struct super_block *sb;
-	atomic_t total_inserted;
-	atomic64_t access_counter;
 	struct rhashtable ht;
+	struct list_lru lru;
 	wait_queue_head_t waitq;
 	KC_DEFINE_SHRINKER(shrinker);
 	struct work_struct free_work;
@@ -76,28 +64,15 @@ enum block_status_bits {
 	BLOCK_BIT_PAGE_ALLOC,	/* page (possibly high order) allocation */
 	BLOCK_BIT_VIRT,		/* mapped virt allocation */
 	BLOCK_BIT_CRC_VALID,	/* crc has been verified */
+	BLOCK_BIT_ACCESSED,	/* seen by lookup since last lru add/walk */
 };

-/*
- * We want to tie atomic changes in refcounts to whether or not the
- * block is still visible in the hash table, so we store the hash
- * table's reference up at a known high bit.  We could naturally set the
- * inserted bit through excessive refcount increments.  We don't do
- * anything about that but at least warn if we get close.
- *
- * We're avoiding the high byte for no real good reason, just out of a
- * historical fear of implementations that don't provide the full
- * precision.
- */
-#define BLOCK_REF_INSERTED	(1U << 23)
-#define BLOCK_REF_FULL		(BLOCK_REF_INSERTED >> 1)
-
 struct block_private {
 	struct scoutfs_block bl;
 	struct super_block *sb;
 	atomic_t refcount;
-	u64 accessed;
 	struct rhash_head ht_head;
+	struct list_head lru_head;
 	struct list_head dirty_entry;
 	struct llist_node free_node;
 	unsigned long bits;
@@ -106,13 +81,15 @@ struct block_private {
 		struct page *page;
 		void *virt;
 	};
+	unsigned int stack_len;
+	unsigned long stack[10];
 };

 #define TRACE_BLOCK(which, bp)									\
 do {												\
 	__typeof__(bp) _bp = (bp);								\
 	trace_scoutfs_block_##which(_bp->sb, _bp, _bp->bl.blkno, atomic_read(&_bp->refcount),	\
-				    atomic_read(&_bp->io_count), _bp->bits, _bp->accessed);	\
+				    atomic_read(&_bp->io_count), _bp->bits);	\
 } while (0)

 #define BLOCK_PRIVATE(_bl) \
@@ -126,7 +103,17 @@ static __le32 block_calc_crc(struct scoutfs_block_header *hdr, u32 size)
 	return cpu_to_le32(calc);
 }

-static struct block_private *block_alloc(struct super_block *sb, u64 blkno)
+static noinline void save_block_stack(struct block_private *bp)
+{
+	bp->stack_len = stack_trace_save(bp->stack, ARRAY_SIZE(bp->stack), 2);
+}
+
+static void print_block_stack(struct block_private *bp)
+{
+	stack_trace_print(bp->stack, bp->stack_len, 1);
+}
+
+static noinline struct block_private *block_alloc(struct super_block *sb, u64 blkno)
 {
 	struct block_private *bp;
 	unsigned int nofs_flags;
@@ -176,11 +163,13 @@ static struct block_private *block_alloc(struct super_block *sb, u64 blkno)
 	bp->bl.blkno = blkno;
 	bp->sb = sb;
 	atomic_set(&bp->refcount, 1);
+	INIT_LIST_HEAD(&bp->lru_head);
 	INIT_LIST_HEAD(&bp->dirty_entry);
 	set_bit(BLOCK_BIT_NEW, &bp->bits);
 	atomic_set(&bp->io_count, 0);

 	TRACE_BLOCK(allocate, bp);
+	save_block_stack(bp);

 out:
 	if (!bp)
@@ -233,32 +222,85 @@ static void block_free_work(struct work_struct *work)
 }

 /*
- * Get a reference to a block while holding an existing reference.
+ * Users of blocks hold a refcount.  If putting a refcount drops to zero
+ * then the block is freed.
+ *
+ * Acquiring new references and claiming the exclusive right to tear
+ * down a block is built around this LIVE_REFCOUNT_BASE refcount value.
+ * As blocks are initially cached they have the live base added to their
+ * refcount.  Lookups will only increment the refcount and return blocks
+ * for reference holders while the refcount is >= the base.
+ *
+ * To remove a block from the cache and eventually free it, either by
+ * the lru walk in the shrinker, or by reference holders, the live base
+ * is removed and turned into a normal refcount increment that will be
+ * put by the caller.  This can only be done once for a block, and once
+ * it's done lookup will not return any more references.
+ */
+#define LIVE_REFCOUNT_BASE (INT_MAX ^ (INT_MAX >> 1))
+
+/*
+ * Inc the refcount while holding an incremented refcount.  We can't
+ * have so many individual reference holders that they pass the live
+ * base.
 */
 static void block_get(struct block_private *bp)
 {
-	WARN_ON_ONCE((atomic_read(&bp->refcount) & ~BLOCK_REF_INSERTED) <= 0);
+	int now = atomic_inc_return(&bp->refcount);

-	atomic_inc(&bp->refcount);
+	BUG_ON(now <= 1);
+	BUG_ON(now == LIVE_REFCOUNT_BASE);
 }

 /*
- * Get a reference to a block as long as it's been inserted in the hash
- * table and hasn't been removed.
- */ 
-static struct block_private *block_get_if_inserted(struct block_private *bp)
+ * if (*v >= u) {
+ * 	*v += a;
+ * 	return true;
+ * }
+ */
+static bool atomic_add_unless_less(atomic_t *v, int a, int u)
 {
-	int cnt;
+	int c;

 	do {
-		cnt = atomic_read(&bp->refcount);
-		WARN_ON_ONCE(cnt & BLOCK_REF_FULL);
-		if (!(cnt & BLOCK_REF_INSERTED))
-			return NULL;
+		c = atomic_read(v);
+		if (c < u)
+			return false;
+	} while (atomic_cmpxchg(v, c, c + a) != c);

-	} while (atomic_cmpxchg(&bp->refcount, cnt, cnt + 1) != cnt);
+	return true;
+}

-	return bp;
+static bool block_get_if_live(struct block_private *bp)
+{
+	return atomic_add_unless_less(&bp->refcount, 1, LIVE_REFCOUNT_BASE);
+}
+
+/*
+ * If the refcount still has the live base, subtract it and increment
+ * the caller's refcount that they'll put.
+ */
+static bool block_get_remove_live(struct block_private *bp)
+{
+	return atomic_add_unless_less(&bp->refcount, (1 - LIVE_REFCOUNT_BASE), LIVE_REFCOUNT_BASE);
+}
+
+/*
+ * Only get the live base refcount if it is the only refcount remaining.
+ * This means that there are no active refcount holders and the block
+ * can't be dirty or under IO, which both hold references.
+ */
+static bool block_get_remove_live_only(struct block_private *bp)
+{
+	int c;
+
+	do {
+		c = atomic_read(&bp->refcount);
+		if (c != LIVE_REFCOUNT_BASE)
+			return false;
+	} while (atomic_cmpxchg(&bp->refcount, c, c - LIVE_REFCOUNT_BASE + 1) != c);
+
+	return true;
 }

 /*
@@ -290,104 +332,73 @@ static const struct rhashtable_params block_ht_params = {
 };

 /*
- * Insert a new block into the hash table.  Once it is inserted in the
- * hash table readers can start getting references.  The caller may have
- * multiple refs but the block can't already be inserted.
+ * Insert the block into the cache so that it's visible for lookups.
+ * The caller can hold references (including for a dirty block).
+ *
+ * We make sure the base is added and the block is in the lru once it's
+ * in the hash.  If hash table insertion fails it'll be briefly visible
+ * in the lru, but won't be isolated/evicted because we hold an
+ * incremented refcount in addition to the live base.
 */
 static int block_insert(struct super_block *sb, struct block_private *bp)
 {
 	DECLARE_BLOCK_INFO(sb, binf);
 	int ret;

-	WARN_ON_ONCE(atomic_read(&bp->refcount) & BLOCK_REF_INSERTED);
-
+	BUG_ON(atomic_read(&bp->refcount) >= LIVE_REFCOUNT_BASE);
+	atomic_add(LIVE_REFCOUNT_BASE, &bp->refcount);
+	smp_mb__after_atomic(); /* make sure live base is visible to list_lru walk */
+	list_lru_add_obj(&binf->lru, &bp->lru_head);
 retry:
-	atomic_add(BLOCK_REF_INSERTED, &bp->refcount);
 	ret = rhashtable_lookup_insert_fast(&binf->ht, &bp->ht_head, block_ht_params);
 	if (ret < 0) {
-		atomic_sub(BLOCK_REF_INSERTED, &bp->refcount);
 		if (ret == -EBUSY) {
 			/* wait for pending rebalance to finish */
 			synchronize_rcu();
 			goto retry;
+		} else {
+			atomic_sub(LIVE_REFCOUNT_BASE, &bp->refcount);
+			BUG_ON(atomic_read(&bp->refcount) >= LIVE_REFCOUNT_BASE);
+			list_lru_del_obj(&binf->lru, &bp->lru_head);
 		}
 	} else {
-		atomic_inc(&binf->total_inserted);
 		TRACE_BLOCK(insert, bp);
 	}

 	return ret;
 }

-static u64 accessed_recently(struct block_info *binf)
-{
-	return atomic64_read(&binf->access_counter) - (atomic_read(&binf->total_inserted) >> 1);
-}
-
 /*
- * Make sure that a block that is being accessed is less likely to be
- * reclaimed if it is seen by the shrinker.   If the block hasn't been
- * accessed recently we update its accessed value.
+ * Indicate to the lru walker that this block has been accessed since it
+ * was added or last walked.
 */
 static void block_accessed(struct super_block *sb, struct block_private *bp)
 {
-	DECLARE_BLOCK_INFO(sb, binf);
-
-	if (bp->accessed == 0 || bp->accessed < accessed_recently(binf)) {
+	if (!test_and_set_bit(BLOCK_BIT_ACCESSED, &bp->bits))
 		scoutfs_inc_counter(sb, block_cache_access_update);
-		bp->accessed = atomic64_inc_return(&binf->access_counter);
-	}
 }

 /*
- * The caller wants to remove the block from the hash table and has an
- * idea what the refcount should be.  If the refcount does still
- * indicate that the block is hashed, and we're able to clear that bit,
- * then we can remove it from the hash table.
+ * Remove the block from the cache.  When this returns the block won't
+ * be visible for additional references from lookup.
 *
- * The caller makes sure that it's safe to be referencing this block,
- * either with their own held reference (most everything) or by being in
- * an rcu grace period (shrink).
- */
-static bool block_remove_cnt(struct super_block *sb, struct block_private *bp, int cnt)
-{
-	DECLARE_BLOCK_INFO(sb, binf);
-	int ret;
-
-	if ((cnt & BLOCK_REF_INSERTED) &&
-	    (atomic_cmpxchg(&bp->refcount, cnt, cnt & ~BLOCK_REF_INSERTED) == cnt)) {
-
-		TRACE_BLOCK(remove, bp);
-		ret = rhashtable_remove_fast(&binf->ht, &bp->ht_head, block_ht_params);
-		WARN_ON_ONCE(ret); /* must have been inserted */
-		atomic_dec(&binf->total_inserted);
-		return true;
-	}
-
-	return false;
-}
-
-/*
- * Try to remove the block from the hash table as long as the refcount
- * indicates that it is still in the hash table.  This can be racing
- * with normal refcount changes so it might have to retry.
+ * We always try and remove from the hash table.  It's safe to remove a
+ * block that isn't hashed, it just returns -ENOENT.
+ *
+ * This is racing with the lru walk in the shrinker also trying to
+ * remove idle blocks from the cache.  They both try to remove the live
+ * refcount base and perform their removal and put if they get it.
 */
 static void block_remove(struct super_block *sb, struct block_private *bp)
 {
-	int cnt;
+	DECLARE_BLOCK_INFO(sb, binf);

-	do {
-		cnt = atomic_read(&bp->refcount);
-	} while ((cnt & BLOCK_REF_INSERTED) && !block_remove_cnt(sb, bp, cnt));
-}
+	rhashtable_remove_fast(&binf->ht, &bp->ht_head, block_ht_params);

-/*
- * Take one shot at removing the block from the hash table if it's still
- * in the hash table and the caller has the only other reference.
- */
-static bool block_remove_solo(struct super_block *sb, struct block_private *bp)
-{
-	return block_remove_cnt(sb, bp, BLOCK_REF_INSERTED | 1);
+	if (block_get_remove_live(bp)) {
+		list_lru_del_obj(&binf->lru, &bp->lru_head);
+		block_put(sb, bp);
+	}
 }

 static bool io_busy(struct block_private *bp)
@@ -396,37 +407,6 @@ static bool io_busy(struct block_private *bp)
 	return test_bit(BLOCK_BIT_IO_BUSY, &bp->bits);
 }

-/*
- * Called during shutdown with no other users.
- */
-static void block_remove_all(struct super_block *sb)
-{
-	DECLARE_BLOCK_INFO(sb, binf);
-	struct rhashtable_iter iter;
-	struct block_private *bp;
-
-	rhashtable_walk_enter(&binf->ht, &iter);
-	rhashtable_walk_start(&iter);
-
-	for (;;) {
-		bp = rhashtable_walk_next(&iter);
-		if (bp == NULL)
-			break;
-		if (bp == ERR_PTR(-EAGAIN))
-			continue;
-
-		if (block_get_if_inserted(bp)) {
-			block_remove(sb, bp);
-			WARN_ON_ONCE(atomic_read(&bp->refcount) != 1);
-			block_put(sb, bp);
-		}
-	}
-
-	rhashtable_walk_stop(&iter);
-	rhashtable_walk_exit(&iter);
-
-	WARN_ON_ONCE(atomic_read(&binf->total_inserted) != 0);
-}

 /*
 * XXX The io_count and sb fields in the block_private are only used
@@ -488,7 +468,7 @@ static int block_submit_bio(struct super_block *sb, struct block_private *bp,
 	int ret = 0;

 	if (scoutfs_forcing_unmount(sb))
-		return -EIO;
+		return -ENOLINK;

 	sector = bp->bl.blkno << (SCOUTFS_BLOCK_LG_SHIFT - 9);

@@ -543,6 +523,10 @@ static int block_submit_bio(struct super_block *sb, struct block_private *bp,
 	return ret;
 }

+/*
+ * Return a block with an elevated refcount if it was present in the
+ * hash table and its refcount didn't indicate that it was being freed.
+ */
 static struct block_private *block_lookup(struct super_block *sb, u64 blkno)
 {
 	DECLARE_BLOCK_INFO(sb, binf);
@@ -550,8 +534,8 @@ static struct block_private *block_lookup(struct super_block *sb, u64 blkno)

 	rcu_read_lock();
 	bp = rhashtable_lookup(&binf->ht, &blkno, block_ht_params);
-	if (bp)
-		bp = block_get_if_inserted(bp);
+	if (bp && !block_get_if_live(bp))
+		bp = NULL;
 	rcu_read_unlock();

 	return bp;
@@ -712,8 +696,8 @@ retry:

 	ret = 0;
 out:
-	if ((ret == -ESTALE || scoutfs_trigger(sb, BLOCK_REMOVE_STALE)) &&
-	    !retried && !block_is_dirty(bp)) {
+	if (!retried && !IS_ERR_OR_NULL(bp) && !block_is_dirty(bp) &&
+	    (ret == -ESTALE || scoutfs_trigger(sb, BLOCK_REMOVE_STALE))) {
 		retried = true;
 		scoutfs_inc_counter(sb, block_cache_remove_stale);
 		block_remove(sb, bp);
@@ -1078,100 +1062,106 @@ static unsigned long block_count_objects(struct shrinker *shrink, struct shrink_
 	struct super_block *sb = binf->sb;

 	scoutfs_inc_counter(sb, block_cache_count_objects);
-
-	return shrinker_min_long(atomic_read(&binf->total_inserted));
+	return list_lru_shrink_count(&binf->lru, sc);
+}
+
+struct isolate_args {
+	struct super_block *sb;
+	struct list_head dispose;
+};
+
+#define DECLARE_ISOLATE_ARGS(sb_, name_) \
+	struct isolate_args name_ = { \
+		.sb = sb_, \
+		.dispose = LIST_HEAD_INIT(name_.dispose), \
+	}
+
+static enum lru_status isolate_lru_block(struct list_head *item, struct list_lru_one *list,
+					 void *cb_arg)
+{
+	struct block_private *bp = container_of(item, struct block_private, lru_head);
+	struct isolate_args *ia = cb_arg;
+
+	TRACE_BLOCK(isolate, bp);
+
+	/* rotate accessed blocks to the tail of the list (lazy promotion) */
+	if (test_and_clear_bit(BLOCK_BIT_ACCESSED, &bp->bits)) {
+		scoutfs_inc_counter(ia->sb, block_cache_isolate_rotate);
+		return LRU_ROTATE;
+	}
+
+	/* any refs, including dirty/io, stop us from acquiring lru refcount */
+	if (!block_get_remove_live_only(bp)) {
+		scoutfs_inc_counter(ia->sb, block_cache_isolate_skip);
+		return LRU_SKIP;
+	}
+
+	scoutfs_inc_counter(ia->sb, block_cache_isolate_removed);
+	list_lru_isolate_move(list, &bp->lru_head, &ia->dispose);
+	return LRU_REMOVED;
+}
+
+static void shrink_dispose_blocks(struct super_block *sb, struct list_head *dispose)
+{
+	struct block_private *bp;
+	struct block_private *bp__;
+
+	list_for_each_entry_safe(bp, bp__, dispose, lru_head) {
+		list_del_init(&bp->lru_head);
+		block_remove(sb, bp);
+		block_put(sb, bp);
+	}
 }

-/*
- * Remove a number of cached blocks that haven't been used recently.
- *
- * We don't maintain a strictly ordered LRU to avoid the contention of
- * accesses always moving blocks around in some precise global
- * structure.
- *
- * Instead we use counters to divide the blocks into two roughly equal
- * groups by how recently they were accessed.  We randomly walk all
- * inserted blocks looking for any blocks in the older half to remove
- * and free.  The random walk and there being two groups means that we
- * typically only walk a small multiple of the number we're looking for
- * before we find them all.
- *
- * Our rcu walk of blocks can see blocks in all stages of their life
- * cycle, from dirty blocks to those with 0 references that are queued
- * for freeing.  We only want to free idle inserted blocks so we
- * atomically remove blocks when the only references are ours and the
- * hash table.
- */
 static unsigned long block_scan_objects(struct shrinker *shrink, struct shrink_control *sc)
 {
 	struct block_info *binf = KC_SHRINKER_CONTAINER_OF(shrink, struct block_info);
 	struct super_block *sb = binf->sb;
-	struct rhashtable_iter iter;
-	struct block_private *bp;
-	bool stop = false;
-	unsigned long freed = 0;
-	unsigned long nr = sc->nr_to_scan;
-	u64 recently;
+	DECLARE_ISOLATE_ARGS(sb, ia);
+	unsigned long freed;

 	scoutfs_inc_counter(sb, block_cache_scan_objects);

-	recently = accessed_recently(binf);
-	rhashtable_walk_enter(&binf->ht, &iter);
-	rhashtable_walk_start(&iter);
+	freed = kc_list_lru_shrink_walk(&binf->lru, sc, isolate_lru_block, &ia);
+	shrink_dispose_blocks(sb, &ia.dispose);
+	return freed;
+}

-	/*
-	 * This isn't great but I don't see a better way.  We want to
-	 * walk the hash from a random point so that we're not
-	 * constantly walking over the same region that we've already
-	 * freed old blocks within.  The interface doesn't let us do
-	 * this explicitly, but this seems to work?  The difference this
-	 * makes is enormous, around a few orders of magnitude fewer
-	 * _nexts per shrink.
-	 */
-	if (iter.walker.tbl)
-		iter.slot = prandom_u32_max(iter.walker.tbl->size);
+static enum lru_status dump_lru_block(struct list_head *item, struct list_lru_one *list,
+					 void *cb_arg)
+{
+	struct block_private *bp = container_of(item, struct block_private, lru_head);

-	while (nr > 0) {
-		bp = rhashtable_walk_next(&iter);
-		if (bp == NULL)
-			break;
-		if (bp == ERR_PTR(-EAGAIN)) {
-			/*
-			 * We can be called from reclaim in the allocation
-			 * to resize the hash table itself.  We have to
-			 * return so that the caller can proceed and
-			 * enable hash table iteration again.
-			 */
-			scoutfs_inc_counter(sb, block_cache_shrink_stop);
-			stop = true;
-			break;
-		}
+	printk("blkno %llu refcount 0x%x io_count %d bits 0x%lx\n",
+		bp->bl.blkno, atomic_read(&bp->refcount), atomic_read(&bp->io_count),
+		bp->bits);
+	print_block_stack(bp);

-		scoutfs_inc_counter(sb, block_cache_shrink_next);
+	return LRU_SKIP;
+}

-		if (bp->accessed >= recently) {
-			scoutfs_inc_counter(sb, block_cache_shrink_recent);
-			continue;
-		}
+/*
+ * Called during shutdown with no other users.  The isolating walk must
+ * find blocks on the lru that only have references for presence on the
+ * lru and in the hash table.
+ */
+static void block_shrink_all(struct super_block *sb)
+{
+	DECLARE_BLOCK_INFO(sb, binf);
+	DECLARE_ISOLATE_ARGS(sb, ia);
+	long count;

-		if (block_get_if_inserted(bp)) {
-			if (block_remove_solo(sb, bp)) {
-				scoutfs_inc_counter(sb, block_cache_shrink_remove);
-				TRACE_BLOCK(shrink, bp);
-				freed++;
-				nr--;
-			}
-			block_put(sb, bp);
-		}
+	count = DIV_ROUND_UP(list_lru_count(&binf->lru), 128) * 2;
+	do {
+		kc_list_lru_walk(&binf->lru, isolate_lru_block, &ia, 128);
+		shrink_dispose_blocks(sb, &ia.dispose);
+	} while (list_lru_count(&binf->lru) > 0 && --count > 0);
+
+	count = list_lru_count(&binf->lru);
+	if (count > 0) {
+		scoutfs_err(sb, "failed to isolate/dispose %ld blocks", count);
+		kc_list_lru_walk(&binf->lru, dump_lru_block, sb, count);
 	}
-
-	rhashtable_walk_stop(&iter);
-	rhashtable_walk_exit(&iter);
-
-	if (stop)
-		return SHRINK_STOP;
-	else
-		return freed;
 }

 struct sm_block_completion {
@@ -1210,7 +1200,7 @@ static int sm_block_io(struct super_block *sb, struct block_device *bdev, blk_op
 	BUILD_BUG_ON(PAGE_SIZE < SCOUTFS_BLOCK_SM_SIZE);

 	if (scoutfs_forcing_unmount(sb))
-		return -EIO;
+		return -ENOLINK;

 	if (WARN_ON_ONCE(len > SCOUTFS_BLOCK_SM_SIZE) ||
 	    WARN_ON_ONCE(!op_is_write(opf) && !blk_crc))
@@ -1276,7 +1266,7 @@ int scoutfs_block_write_sm(struct super_block *sb,
 int scoutfs_block_setup(struct super_block *sb)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
-	struct block_info *binf;
+	struct block_info *binf = NULL;
 	int ret;

 	binf = kzalloc(sizeof(struct block_info), GFP_KERNEL);
@@ -1285,15 +1275,15 @@ int scoutfs_block_setup(struct super_block *sb)
 		goto out;
 	}

-	ret = rhashtable_init(&binf->ht, &block_ht_params);
-	if (ret < 0) {
-		kfree(binf);
+	ret = list_lru_init(&binf->lru);
+	if (ret < 0)
+		goto out;
+
+	ret = rhashtable_init(&binf->ht, &block_ht_params);
+	if (ret < 0)
 		goto out;
-	}

 	binf->sb = sb;
-	atomic_set(&binf->total_inserted, 0);
-	atomic64_set(&binf->access_counter, 0);
 	init_waitqueue_head(&binf->waitq);
 	KC_INIT_SHRINKER_FUNCS(&binf->shrinker, block_count_objects,
 			       block_scan_objects);
@@ -1305,8 +1295,10 @@ int scoutfs_block_setup(struct super_block *sb)

 	ret = 0;
 out:
-	if (ret)
-		scoutfs_block_destroy(sb);
+	if (ret < 0 && binf) {
+		list_lru_destroy(&binf->lru);
+		kfree(binf);
+	}

 	return ret;
 }
@@ -1318,9 +1310,10 @@ void scoutfs_block_destroy(struct super_block *sb)

 	if (binf) {
 		KC_UNREGISTER_SHRINKER(&binf->shrinker);
-		block_remove_all(sb);
+		block_shrink_all(sb);
 		flush_work(&binf->free_work);
 		rhashtable_destroy(&binf->ht);
+		list_lru_destroy(&binf->lru);

 		kfree(binf);
 		sbi->block_info = NULL;
@@ -435,8 +435,8 @@ static int lookup_mounted_client_item(struct super_block *sb, u64 rid)
 	if (ret == -ENOENT)
 		ret = 0;

-	kfree(super);
 out:
+	kfree(super);
 	return ret;
 }

@@ -26,17 +26,15 @@
 	EXPAND_COUNTER(block_cache_alloc_page_order)		\
 	EXPAND_COUNTER(block_cache_alloc_virt)			\
 	EXPAND_COUNTER(block_cache_end_io_error)		\
+	EXPAND_COUNTER(block_cache_isolate_removed)		\
+	EXPAND_COUNTER(block_cache_isolate_rotate)		\
+	EXPAND_COUNTER(block_cache_isolate_skip)		\
 	EXPAND_COUNTER(block_cache_forget)			\
 	EXPAND_COUNTER(block_cache_free)			\
 	EXPAND_COUNTER(block_cache_free_work)			\
 	EXPAND_COUNTER(block_cache_remove_stale)		\
 	EXPAND_COUNTER(block_cache_count_objects)		\
 	EXPAND_COUNTER(block_cache_scan_objects)		\
-	EXPAND_COUNTER(block_cache_shrink)			\
-	EXPAND_COUNTER(block_cache_shrink_next)			\
-	EXPAND_COUNTER(block_cache_shrink_recent)		\
-	EXPAND_COUNTER(block_cache_shrink_remove)		\
-	EXPAND_COUNTER(block_cache_shrink_stop)			\
 	EXPAND_COUNTER(btree_compact_values)			\
 	EXPAND_COUNTER(btree_compact_values_enomem)		\
 	EXPAND_COUNTER(btree_delete)				\
@@ -90,6 +88,7 @@
 	EXPAND_COUNTER(forest_read_items)			\
 	EXPAND_COUNTER(forest_roots_next_hint)			\
 	EXPAND_COUNTER(forest_set_bloom_bits)			\
+	EXPAND_COUNTER(inode_deleted)				\
 	EXPAND_COUNTER(item_cache_count_objects)		\
 	EXPAND_COUNTER(item_cache_scan_objects)			\
 	EXPAND_COUNTER(item_clear_dirty)			\
@@ -117,10 +116,11 @@
 	EXPAND_COUNTER(item_pcpu_page_hit)			\
 	EXPAND_COUNTER(item_pcpu_page_miss)			\
 	EXPAND_COUNTER(item_pcpu_page_miss_keys)		\
+	EXPAND_COUNTER(item_read_pages_barrier)			\
+	EXPAND_COUNTER(item_read_pages_retry)			\
 	EXPAND_COUNTER(item_read_pages_split)			\
 	EXPAND_COUNTER(item_shrink_page)			\
 	EXPAND_COUNTER(item_shrink_page_dirty)			\
-	EXPAND_COUNTER(item_shrink_page_reader)			\
 	EXPAND_COUNTER(item_shrink_page_trylock)		\
 	EXPAND_COUNTER(item_update)				\
 	EXPAND_COUNTER(item_write_dirty)			\
@@ -145,6 +145,7 @@
 	EXPAND_COUNTER(lock_shrink_work)			\
 	EXPAND_COUNTER(lock_unlock)				\
 	EXPAND_COUNTER(lock_wait)				\
+	EXPAND_COUNTER(log_merge_no_finalized)			\
 	EXPAND_COUNTER(log_merge_wait_timeout)			\
 	EXPAND_COUNTER(net_dropped_response)			\
 	EXPAND_COUNTER(net_send_bytes)				\
@@ -181,6 +182,7 @@
 	EXPAND_COUNTER(quorum_send_vote)			\
 	EXPAND_COUNTER(quorum_server_shutdown)			\
 	EXPAND_COUNTER(quorum_term_follower)			\
+	EXPAND_COUNTER(reclaimed_open_logs)			\
 	EXPAND_COUNTER(server_commit_hold)			\
 	EXPAND_COUNTER(server_commit_queue)			\
 	EXPAND_COUNTER(server_commit_worker)			\
@@ -2053,6 +2053,9 @@ const struct inode_operations scoutfs_dir_iops = {
 #endif
 	.listxattr	= scoutfs_listxattr,
 	.get_acl	= scoutfs_get_acl,
+#ifdef KC_GET_ACL_DENTRY
+	.set_acl	= scoutfs_set_acl,
+#endif
 	.symlink	= scoutfs_symlink,
 	.permission	= scoutfs_permission,
 #ifdef KC_LINUX_HAVE_RHEL_IOPS_WRAPPER
@@ -470,7 +470,7 @@ struct scoutfs_srch_compact {
 * @get_trans_seq, @commit_trans_seq: These pair of sequence numbers
 * determine if a transaction is currently open for the mount that owns
 * the log_trees struct.  get_trans_seq is advanced by the server as the
- * transaction is opened.   The server sets comimt_trans_seq equal to
+ * transaction is opened.   The server sets commit_trans_seq equal to
 * get_ as the transaction is committed.
 */
 struct scoutfs_log_trees {
@@ -1091,7 +1091,8 @@ enum scoutfs_net_cmd {
 	EXPAND_NET_ERRNO(ENOMEM)	\
 	EXPAND_NET_ERRNO(EIO)		\
 	EXPAND_NET_ERRNO(ENOSPC)	\
-	EXPAND_NET_ERRNO(EINVAL)
+	EXPAND_NET_ERRNO(EINVAL)	\
+	EXPAND_NET_ERRNO(ENOLINK)

 #undef EXPAND_NET_ERRNO
 #define EXPAND_NET_ERRNO(which) SCOUTFS_NET_ERR_##which,
@@ -150,6 +150,9 @@ static const struct inode_operations scoutfs_file_iops = {
 #endif
 	.listxattr	= scoutfs_listxattr,
 	.get_acl	= scoutfs_get_acl,
+#ifdef KC_GET_ACL_DENTRY
+	.set_acl	= scoutfs_set_acl,
+#endif
 	.fiemap		= scoutfs_data_fiemap,
 };

@@ -163,6 +166,9 @@ static const struct inode_operations scoutfs_special_iops = {
 #endif
 	.listxattr	= scoutfs_listxattr,
 	.get_acl	= scoutfs_get_acl,
+#ifdef KC_GET_ACL_DENTRY
+	.set_acl	= scoutfs_set_acl,
+#endif
 };

 /*
@@ -476,7 +482,7 @@ int scoutfs_complete_truncate(struct inode *inode, struct scoutfs_lock *lock)
 }

 /*
- * If we're changing the file size than the contents of the file are
+ * If we're changing the file size then the contents of the file are
 * changing and we increment the data_version.  This would prevent
 * staging because the data_version is per-inode today, not per-extent.
 * So if there are any offline extents within the new size then we need
@@ -1854,6 +1860,9 @@ static int try_delete_inode_items(struct super_block *sb, u64 ino)
 		goto out;

 	ret = delete_inode_items(sb, ino, &sinode, lock, orph_lock);
+	if (ret == 0)
+		scoutfs_inc_counter(sb, inode_deleted);
+
 out:
 	if (clear_trying)
 		clear_bit(bit_nr, ldata->trying);
@@ -1962,6 +1971,8 @@ static void iput_worker(struct work_struct *work)
 		while (count-- > 0)
 			iput(inode);

+		cond_resched();
+
 		/* can't touch inode after final iput */

 		spin_lock(&inf->iput_lock);
@@ -2183,7 +2194,7 @@ int scoutfs_inode_walk_writeback(struct super_block *sb, bool write)
 	struct scoutfs_inode_info *si;
 	struct scoutfs_inode_info *tmp;
 	struct inode *inode;
-	int ret;
+	int ret = 0;

 	spin_lock(&inf->writeback_lock);

@@ -954,6 +954,9 @@ static int copy_alloc_detail_to_user(struct super_block *sb, void *arg,
 	if (args->copied == args->nr)
 		return -EOVERFLOW;

+	/* .type and .pad need clearing */
+	memset(&ade, 0, sizeof(struct scoutfs_ioctl_alloc_detail_entry));
+
 	ade.blocks = blocks;
 	ade.id = id;
 	ade.meta = !!meta;
@@ -1369,7 +1372,7 @@ static long scoutfs_ioc_get_referring_entries(struct file *file, unsigned long a
 			ent.d_type = bref->d_type;
 			ent.name_len = name_len;

-			if (copy_to_user(uent, &ent, sizeof(struct scoutfs_ioctl_dirent)) ||
+			if (copy_to_user(uent, &ent, offsetof(struct scoutfs_ioctl_dirent, name[0])) ||
 			    copy_to_user(&uent->name[0], bref->dent.name, name_len) ||
 			    put_user('\0', &uent->name[name_len])) {
 				ret = -EFAULT;
@@ -86,6 +86,8 @@ struct item_cache_info {
 	/* often walked, but per-cpu refs are fast path */
 	rwlock_t rwlock;
 	struct rb_root pg_root;
+	/* stop readers from caching stale items behind reclaimed cleaned written items */
+	u64 read_dirty_barrier;

 	/* page-granular modification by writers, then exclusive to commit */
 	spinlock_t dirty_lock;
@@ -96,10 +98,6 @@ struct item_cache_info {
 	spinlock_t lru_lock;
 	struct list_head lru_list;
 	unsigned long lru_pages;
-
-	/* written by page readers, read by shrink */
-	spinlock_t active_lock;
-	struct list_head active_list;
 };

 #define DECLARE_ITEM_CACHE_INFO(sb, name) \
@@ -1285,78 +1283,6 @@ static int cache_empty_page(struct super_block *sb,
 	return 0;
 }

-/*
- * Readers operate independently from dirty items and transactions.
- * They read a set of persistent items and insert them into the cache
- * when there aren't already pages whose key range contains the items.
- * This naturally prefers cached dirty items over stale read items.
- *
- * We have to deal with the case where dirty items are written and
- * invalidated while a read is in flight.   The reader won't have seen
- * the items that were dirty in their persistent roots as they started
- * reading.  By the time they insert their read pages the previously
- * dirty items have been reclaimed and are not in the cache.  The old
- * stale items will be inserted in their place, effectively corrupting
- * by having the dirty items disappear.
- *
- * We fix this by tracking the max seq of items in pages.  As readers
- * start they record the current transaction seq.  Invalidation skips
- * pages with a max seq greater than the first reader seq because the
- * items in the page have to stick around to prevent the readers stale
- * items from being inserted.
- *
- * This naturally only affects a small set of pages with items that were
- * written relatively recently.  If we're in memory pressure then we
- * probably have a lot of pages and they'll naturally have items that
- * were visible to any raders.  We don't bother with the complicated and
- * expensive further refinement of tracking the ranges that are being
- * read and comparing those with pages to invalidate.
- */
-struct active_reader {
-	struct list_head head;
-	u64 seq;
-};
-
-#define INIT_ACTIVE_READER(rdr) \
-	struct active_reader rdr = { .head = LIST_HEAD_INIT(rdr.head) }
-
-static void add_active_reader(struct super_block *sb, struct active_reader *active)
-{
-	DECLARE_ITEM_CACHE_INFO(sb, cinf);
-
-	BUG_ON(!list_empty(&active->head));
-
-	active->seq = scoutfs_trans_sample_seq(sb);
-
-	spin_lock(&cinf->active_lock);
-	list_add_tail(&active->head, &cinf->active_list);
-	spin_unlock(&cinf->active_lock);
-}
-
-static u64 first_active_reader_seq(struct item_cache_info *cinf)
-{
-	struct active_reader *active;
-	u64 first;
-
-	/* only the calling task adds or deletes this active */
-	spin_lock(&cinf->active_lock);
-	active = list_first_entry_or_null(&cinf->active_list, struct active_reader, head);
-	first = active ? active->seq : U64_MAX;
-	spin_unlock(&cinf->active_lock);
-
-	return first;
-}
-
-static void del_active_reader(struct item_cache_info *cinf, struct active_reader *active)
-{
-	/* only the calling task adds or deletes this active */
-	if (!list_empty(&active->head)) {
-		spin_lock(&cinf->active_lock);
-		list_del_init(&active->head);
-		spin_unlock(&cinf->active_lock);
-	}
-}
-
 /*
 * Add a newly read item to the pages that we're assembling for
 * insertion into the cache.   These pages are private, they only exist
@@ -1450,24 +1376,34 @@ static int read_page_item(struct super_block *sb, struct scoutfs_key *key, u64 s
 * and duplicates, we insert any resulting pages which don't overlap
 * with existing cached pages.
 *
- * We only insert uncached regions because this is called with cluster
- * locks held, but without locking the cache.  The regions we read can
- * be stale with respect to the current cache, which can be read and
- * dirtied by other cluster lock holders on our node, but the cluster
- * locks protect the stable items we read.  Invalidation is careful not
- * to drop pages that have items that we couldn't see because they were
- * dirty when we started reading.
- *
 * The forest item reader is reading stable trees that could be
 * overwritten.  It can return -ESTALE which we return to the caller who
 * will retry the operation and work with a new set of more recent
 * btrees.
+ *
+ * We only insert uncached regions because this is called with cluster
+ * locks held, but without locking the cache.  The regions we read can
+ * be stale with respect to the current cache, which can be read and
+ * dirtied by other cluster lock holders on our node, but the cluster
+ * locks protect the stable items we read.
+ *
+ * Using the presence of locally written dirty pages to override stale
+ * read pages only works if, well, the more recent locally written pages
+ * are still present.  Readers are totally decoupled from writers and
+ * can have a set of items that is very old indeed.  In the mean time
+ * more recent items would have been dirtied locally, committed,
+ * cleaned, and reclaimed.  We have a coarse barrier which ensures that
+ * readers can't insert items read from old roots from before local data
+ * was written.  If a write completes while a read is in progress the
+ * read will have to retry.  The retried read can use cached blocks so
+ * we're relying on reads being much faster than writes to reduce the
+ * overhead to mostly cpu work of recollecting the items from cached
+ * blocks via a more recent root from the server.
 */
 static int read_pages(struct super_block *sb, struct item_cache_info *cinf,
 		      struct scoutfs_key *key, struct scoutfs_lock *lock)
 {
 	struct rb_root root = RB_ROOT;
-	INIT_ACTIVE_READER(active);
 	struct cached_page *right = NULL;
 	struct cached_page *pg;
 	struct cached_page *rd;
@@ -1480,6 +1416,7 @@ static int read_pages(struct super_block *sb, struct item_cache_info *cinf,
 	struct rb_node *par;
 	struct rb_node *pg_tmp;
 	struct rb_node *item_tmp;
+	u64 rdbar;
 	int pgi;
 	int ret;

@@ -1493,8 +1430,9 @@ static int read_pages(struct super_block *sb, struct item_cache_info *cinf,
 	pg->end = lock->end;
 	rbtree_insert(&pg->node, NULL, &root.rb_node, &root);

-	/* set active reader seq before reading persistent roots */
-	add_active_reader(sb, &active);
+	read_lock(&cinf->rwlock);
+	rdbar = cinf->read_dirty_barrier;
+	read_unlock(&cinf->rwlock);

 	start = lock->start;
 	end = lock->end;
@@ -1533,6 +1471,13 @@ static int read_pages(struct super_block *sb, struct item_cache_info *cinf,
 retry:
 	write_lock(&cinf->rwlock);

+	/* can't insert if write has cleaned since we read */
+	if (cinf->read_dirty_barrier != rdbar) {
+		scoutfs_inc_counter(sb, item_read_pages_barrier);
+		ret = -ESTALE;
+		goto unlock;
+	}
+
 	while ((rd = first_page(&root))) {

 		pg = page_rbtree_walk(sb, &cinf->pg_root, &rd->start, &rd->end,
@@ -1570,12 +1515,12 @@ retry:
 		}
 	}

+	ret = 0;
+
+unlock:
 	write_unlock(&cinf->rwlock);

-	ret = 0;
 out:
-	del_active_reader(cinf, &active);
-
 	/* free any pages we left dangling on error */
 	for_each_page_safe(&root, rd, pg_tmp) {
 		rbtree_erase(&rd->node, &root);
@@ -1635,6 +1580,7 @@ retry:
 			ret = read_pages(sb, cinf, key, lock);
 		if (ret < 0 && ret != -ESTALE)
 			goto out;
+		scoutfs_inc_counter(sb, item_read_pages_retry);
 		goto retry;
 	}

@@ -2401,6 +2347,12 @@ out:
 * The caller has successfully committed all the dirty btree blocks that
 * contained the currently dirty items.  Clear all the dirty items and
 * pages.
+ *
+ * This strange lock/trylock loop comes from sparse issuing spurious
+ * mismatched context warnings if we do anything (like unlock and relax)
+ * in the else branch of the failed trylock.  We're jumping through
+ * hoops to not use the else but still drop and reacquire the dirty_lock
+ * if the trylock fails.
 */
 int scoutfs_item_write_done(struct super_block *sb)
 {
@@ -2409,40 +2361,35 @@ int scoutfs_item_write_done(struct super_block *sb)
 	struct cached_item *tmp;
 	struct cached_page *pg;

-retry:
+	/* don't let read_pages miss written+cleaned items */
+	write_lock(&cinf->rwlock);
+	cinf->read_dirty_barrier++;
+	write_unlock(&cinf->rwlock);
+
 	spin_lock(&cinf->dirty_lock);
-
-	while ((pg = list_first_entry_or_null(&cinf->dirty_list,
-					      struct cached_page,
-					      dirty_head))) {
-
-		if (!write_trylock(&pg->rwlock)) {
+	while ((pg = list_first_entry_or_null(&cinf->dirty_list, struct cached_page, dirty_head))) {
+		if (write_trylock(&pg->rwlock)) {
 			spin_unlock(&cinf->dirty_lock);
-			cpu_relax();
-			goto retry;
-		}
+			list_for_each_entry_safe(item, tmp, &pg->dirty_list,
+						 dirty_head) {
+				clear_item_dirty(sb, cinf, pg, item);

+				if (item->delta)
+					scoutfs_inc_counter(sb, item_delta_written);
+
+				/* free deletion and delta items */
+				if (item->deletion || item->delta)
+					erase_item(pg, item);
+				else
+					item->persistent = 1;
+			}
+
+			write_unlock(&pg->rwlock);
+			spin_lock(&cinf->dirty_lock);
+		}
 		spin_unlock(&cinf->dirty_lock);
-
-		list_for_each_entry_safe(item, tmp, &pg->dirty_list,
-					 dirty_head) {
-			clear_item_dirty(sb, cinf, pg, item);
-
-			if (item->delta)
-				scoutfs_inc_counter(sb, item_delta_written);
-
-			/* free deletion items */
-			if (item->deletion || item->delta)
-				erase_item(pg, item);
-			else
-				item->persistent = 1;
-		}
-
-		write_unlock(&pg->rwlock);
-
 		spin_lock(&cinf->dirty_lock);
-	}
-
+	}
 	spin_unlock(&cinf->dirty_lock);

 	return 0;
@@ -2597,24 +2544,15 @@ static unsigned long item_cache_scan_objects(struct shrinker *shrink,
 	struct cached_page *tmp;
 	struct cached_page *pg;
 	unsigned long freed = 0;
-	u64 first_reader_seq;
 	int nr = sc->nr_to_scan;

 	scoutfs_inc_counter(sb, item_cache_scan_objects);

-	/* can't invalidate pages with items that weren't visible to first reader */
-	first_reader_seq = first_active_reader_seq(cinf);
-
 	write_lock(&cinf->rwlock);
 	spin_lock(&cinf->lru_lock);

 	list_for_each_entry_safe(pg, tmp, &cinf->lru_list, lru_head) {

-		if (first_reader_seq <= pg->max_seq) {
-			scoutfs_inc_counter(sb, item_shrink_page_reader);
-			continue;
-		}
-
 		if (!write_trylock(&pg->rwlock)) {
 			scoutfs_inc_counter(sb, item_shrink_page_trylock);
 			continue;
@@ -2681,8 +2619,6 @@ int scoutfs_item_setup(struct super_block *sb)
 	atomic_set(&cinf->dirty_pages, 0);
 	spin_lock_init(&cinf->lru_lock);
 	INIT_LIST_HEAD(&cinf->lru_list);
-	spin_lock_init(&cinf->active_lock);
-	INIT_LIST_HEAD(&cinf->active_list);

 	cinf->pcpu_pages = alloc_percpu(struct item_percpu_pages);
 	if (!cinf->pcpu_pages)
@@ -2715,8 +2651,6 @@ void scoutfs_item_destroy(struct super_block *sb)
 	int cpu;

 	if (cinf) {
-		BUG_ON(!list_empty(&cinf->active_list));
-
 #ifdef KC_CPU_NOTIFIER
 		unregister_hotcpu_notifier(&cinf->notifier);
 #endif
@@ -81,3 +81,69 @@ kc_generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 	return written ? written : status;
 }
 #endif
+
+#include <linux/list_lru.h>
+
+#ifdef KC_LIST_LRU_WALK_CB_ITEM_LOCK
+static enum lru_status kc_isolate(struct list_head *item, spinlock_t *lock, void *cb_arg)
+{
+	struct kc_isolate_args *args = cb_arg;
+
+	/* isolate doesn't use list, nr_items updated in caller */
+	return args->isolate(item, NULL, args->cb_arg);
+}
+
+unsigned long kc_list_lru_walk(struct list_lru *lru, kc_list_lru_walk_cb_t isolate, void *cb_arg,
+				      unsigned long nr_to_walk)
+{
+	struct kc_isolate_args args = {
+		.isolate = isolate,
+		.cb_arg = cb_arg,
+	};
+
+	return list_lru_walk(lru, kc_isolate, &args, nr_to_walk);
+}
+
+unsigned long kc_list_lru_shrink_walk(struct list_lru *lru, struct shrink_control *sc,
+				      kc_list_lru_walk_cb_t isolate, void *cb_arg)
+{
+	struct kc_isolate_args args = {
+		.isolate = isolate,
+		.cb_arg = cb_arg,
+	};
+
+	return list_lru_shrink_walk(lru, sc, kc_isolate, &args);
+}
+#endif
+
+#ifdef KC_LIST_LRU_WALK_CB_LIST_LOCK
+static enum lru_status kc_isolate(struct list_head *item, struct list_lru_one *list,
+				  spinlock_t *lock, void *cb_arg)
+{
+	struct kc_isolate_args *args = cb_arg;
+
+	return args->isolate(item, list, args->cb_arg);
+}
+
+unsigned long kc_list_lru_walk(struct list_lru *lru, kc_list_lru_walk_cb_t isolate, void *cb_arg,
+				      unsigned long nr_to_walk)
+{
+	struct kc_isolate_args args = {
+		.isolate = isolate,
+		.cb_arg = cb_arg,
+	};
+
+	return list_lru_walk(lru, kc_isolate, &args, nr_to_walk);
+}
+unsigned long kc_list_lru_shrink_walk(struct list_lru *lru, struct shrink_control *sc,
+				      kc_list_lru_walk_cb_t isolate, void *cb_arg)
+{
+	struct kc_isolate_args args = {
+		.isolate = isolate,
+		.cb_arg = cb_arg,
+	};
+
+	return list_lru_shrink_walk(lru, sc, kc_isolate, &args);
+}
+
+#endif
@@ -263,6 +263,11 @@ typedef unsigned int blk_opf_t;
 #define kc__vmalloc __vmalloc
 #endif

+#ifdef KC_VFS_METHOD_MNT_IDMAP_ARG
+#define KC_VFS_NS_DEF struct mnt_idmap *mnt_idmap,
+#define KC_VFS_NS mnt_idmap,
+#define KC_VFS_INIT_NS &nop_mnt_idmap,
+#else
 #ifdef KC_VFS_METHOD_USER_NAMESPACE_ARG
 #define KC_VFS_NS_DEF struct user_namespace *mnt_user_ns,
 #define KC_VFS_NS mnt_user_ns,
@@ -272,6 +277,7 @@ typedef unsigned int blk_opf_t;
 #define KC_VFS_NS
 #define KC_VFS_INIT_NS
 #endif
+#endif /* KC_VFS_METHOD_MNT_IDMAP_ARG */

 #ifdef KC_BIO_ALLOC_DEV_OPF_ARGS
 #define kc_bio_alloc bio_alloc
@@ -410,4 +416,77 @@ static inline vm_fault_t vmf_error(int err)
 }
 #endif

+#include <linux/list_lru.h>
+
+#ifndef KC_LIST_LRU_SHRINK_COUNT_WALK
+/* we don't bother with sc->{nid,memcg} (which don't exist in the oldest kernels) */
+static inline unsigned long list_lru_shrink_count(struct list_lru *lru,
+                                                  struct shrink_control *sc)
+{
+        return list_lru_count(lru);
+}
+static inline unsigned long
+list_lru_shrink_walk(struct list_lru *lru, struct shrink_control *sc,
+		     list_lru_walk_cb isolate, void *cb_arg)
+{
+	return list_lru_walk(lru, isolate, cb_arg, sc->nr_to_scan);
+}
+#endif
+
+#ifndef KC_LIST_LRU_ADD_OBJ
+#define list_lru_add_obj list_lru_add
+#define list_lru_del_obj list_lru_del
+#endif
+
+#if defined(KC_LIST_LRU_WALK_CB_LIST_LOCK) || defined(KC_LIST_LRU_WALK_CB_ITEM_LOCK)
+struct list_lru_one;
+typedef enum lru_status (*kc_list_lru_walk_cb_t)(struct list_head *item, struct list_lru_one *list,
+						 void *cb_arg);
+struct kc_isolate_args {
+	kc_list_lru_walk_cb_t isolate;
+	void *cb_arg;
+};
+unsigned long kc_list_lru_walk(struct list_lru *lru, kc_list_lru_walk_cb_t isolate, void *cb_arg,
+			       unsigned long nr_to_walk);
+unsigned long kc_list_lru_shrink_walk(struct list_lru *lru, struct shrink_control *sc,
+				      kc_list_lru_walk_cb_t isolate, void *cb_arg);
+#else
+#define kc_list_lru_shrink_walk list_lru_shrink_walk
+#endif
+
+#if defined(KC_LIST_LRU_WALK_CB_ITEM_LOCK)
+/* isolate moved by hand, nr_items updated in walk as _REMOVE returned */
+static inline void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item,
+					 struct list_head *head)
+{
+        list_move(item, head);
+}
+#endif
+
+#ifndef KC_STACK_TRACE_SAVE
+#include <linux/stacktrace.h>
+static inline unsigned int stack_trace_save(unsigned long *store, unsigned int size,
+					    unsigned int skipnr)
+{
+        struct stack_trace trace = {
+                .entries        = store,
+                .max_entries    = size,
+                .skip           = skipnr,
+        };
+
+        save_stack_trace(&trace);
+        return trace.nr_entries;
+}
+
+static inline void stack_trace_print(unsigned long *entries, unsigned int nr_entries, int spaces)
+{
+        struct stack_trace trace = {
+                .entries        = entries,
+                .nr_entries     = nr_entries,
+        };
+
+	print_stack_trace(&trace, spaces);
+}
+#endif
+
 #endif
@@ -1,6 +1,8 @@
 #ifndef _SCOUTFS_LOCK_H_
 #define _SCOUTFS_LOCK_H_

+#include <linux/rhashtable.h>
+
 #include "key.h"
 #include "tseq.h"

@@ -19,20 +21,24 @@ struct inode_deletion_lock_data;
 */
 struct scoutfs_lock {
 	struct super_block *sb;
+	atomic_t refcount;
+	spinlock_t lock;
+	struct rcu_head rcu_head;
 	struct scoutfs_key start;
 	struct scoutfs_key end;
-	struct rb_node node;
+	struct rhash_head ht_head;
 	struct rb_node range_node;
 	u64 refresh_gen;
 	u64 write_seq;
 	u64 dirty_trans_seq;
 	struct list_head lru_head;
+	int lru_on_list;
 	wait_queue_head_t waitq;
 	unsigned long request_pending:1,
 		      invalidate_pending:1;

 	struct list_head inv_head;  /* entry in linfo's list of locks with invalidations */
-	struct list_head inv_list;  /* list of lock's invalidation requests */
+	struct list_head inv_req_list;  /* list of lock's invalidation requests */
 	struct list_head shrink_head;

 	spinlock_t cov_list_lock;
@@ -20,6 +20,7 @@
 #include <net/sock.h>
 #include <net/tcp.h>
 #include <linux/log2.h>
+#include <linux/jhash.h>

 #include "format.h"
 #include "counters.h"
@@ -31,6 +32,7 @@
 #include "endian_swap.h"
 #include "tseq.h"
 #include "fence.h"
+#include "options.h"

 /*
 * scoutfs networking delivers requests and responses between nodes.
@@ -134,6 +136,7 @@ struct message_send {
 struct message_recv {
 	struct scoutfs_tseq_entry tseq_entry;
 	struct work_struct proc_work;
+	struct list_head ordered_head;
 	struct scoutfs_net_connection *conn;
 	struct scoutfs_net_header nh;
 };
@@ -332,7 +335,7 @@ static int submit_send(struct super_block *sb,
 		return -EINVAL;

 	if (scoutfs_forcing_unmount(sb))
-		return -EIO;
+		return -ENOLINK;

 	msend = kmalloc(offsetof(struct message_send,
 				 nh.data[data_len]), GFP_NOFS);
@@ -498,6 +501,51 @@ static void scoutfs_net_proc_worker(struct work_struct *work)
 	trace_scoutfs_net_proc_work_exit(sb, 0, ret);
 }

+static void scoutfs_net_ordered_proc_worker(struct work_struct *work)
+{
+	struct scoutfs_work_list *wlist = container_of(work, struct scoutfs_work_list, work);
+	struct message_recv *mrecv;
+	struct message_recv *mrecv__;
+	LIST_HEAD(list);
+
+	spin_lock(&wlist->lock);
+	list_splice_init(&wlist->list, &list);
+	spin_unlock(&wlist->lock);
+
+	list_for_each_entry_safe(mrecv, mrecv__, &list, ordered_head) {
+		list_del_init(&mrecv->ordered_head);
+		scoutfs_net_proc_worker(&mrecv->proc_work);
+	}
+}
+
+/*
+ * Some messages require in-order processing.  But the scope of the
+ * ordering isn't global.  In the case of lock messages, it's per lock.
+ * So for these messages we hash them to a number of ordered workers who
+ * walk a list and call the usual work function in order.  This replaces
+ * two earlier schemes: first proc work detected out-of-order messages
+ * and re-ordered them, then proc was only called from the recv work.
+ */
+static void queue_ordered_proc(struct scoutfs_net_connection *conn, struct message_recv *mrecv)
+{
+	struct scoutfs_work_list *wlist;
+	struct scoutfs_net_lock *nl;
+	u32 h;
+
+	if (WARN_ON_ONCE(mrecv->nh.cmd != SCOUTFS_NET_CMD_LOCK ||
+		         le16_to_cpu(mrecv->nh.data_len) != sizeof(struct scoutfs_net_lock)))
+		return scoutfs_net_proc_worker(&mrecv->proc_work);
+
+	nl = (void *)mrecv->nh.data;
+	h = jhash(&nl->key, sizeof(struct scoutfs_key), 0x6fdd3cd5);
+	wlist = &conn->ordered_proc_wlists[h % conn->ordered_proc_nr];
+
+	spin_lock(&wlist->lock);
+	list_add_tail(&mrecv->ordered_head, &wlist->list);
+	spin_unlock(&wlist->lock);
+	queue_work(conn->workq, &wlist->work);
+}
+
 /*
 * Free live responses up to and including the seq by marking them dead
 * and moving them to the send queue to be freed.
@@ -541,33 +589,17 @@ static void free_acked_responses(struct scoutfs_net_connection *conn, u64 seq)
 		queue_work(conn->workq, &conn->send_work);
 }

-static int recvmsg_full(struct socket *sock, void *buf, unsigned len)
+static int k_recvmsg(struct socket *sock, void *buf, unsigned len)
 {
-	struct msghdr msg;
-	struct kvec kv;
-	int ret;
+	struct kvec kv = {
+		.iov_base = buf,
+		.iov_len = len,
+	};
+	struct msghdr msg = {
+		.msg_flags = MSG_NOSIGNAL,
+	};

-	while (len) {
-		memset(&msg, 0, sizeof(msg));
-		msg.msg_flags = MSG_NOSIGNAL;
-		kv.iov_base = buf;
-		kv.iov_len = len;
-
-#ifndef KC_MSGHDR_STRUCT_IOV_ITER
-		msg.msg_iov = (struct iovec *)&kv;
-		msg.msg_iovlen = 1;
-#else
-		iov_iter_init(&msg.msg_iter, READ, (struct iovec *)&kv, len, 1);
-#endif
-		ret = kernel_recvmsg(sock, &msg, &kv, 1, len, msg.msg_flags);
-		if (ret <= 0)
-			return -ECONNABORTED;
-
-		len -= ret;
-		buf += ret;
-	}
-
-	return 0;
+	return kernel_recvmsg(sock, &msg, &kv, 1, len, msg.msg_flags);
 }

 static bool invalid_message(struct scoutfs_net_connection *conn,
@@ -604,6 +636,72 @@ static bool invalid_message(struct scoutfs_net_connection *conn,
 	return false;
 }

+static int recv_one_message(struct super_block *sb, struct net_info *ninf,
+			    struct scoutfs_net_connection *conn, struct scoutfs_net_header *nh,
+			    unsigned int data_len)
+{
+	struct message_recv *mrecv;
+	int ret;
+
+	scoutfs_inc_counter(sb, net_recv_messages);
+	scoutfs_add_counter(sb, net_recv_bytes, nh_bytes(data_len));
+	trace_scoutfs_net_recv_message(sb, &conn->sockname, &conn->peername, nh);
+
+	/* the caller's invalid_message() check already validated data_len */
+	mrecv = kmalloc(offsetof(struct message_recv, nh.data[data_len]), GFP_NOFS);
+	if (!mrecv) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	mrecv->conn = conn;
+	INIT_WORK(&mrecv->proc_work, scoutfs_net_proc_worker);
+	INIT_LIST_HEAD(&mrecv->ordered_head);
+	mrecv->nh = *nh;
+	if (data_len)
+		memcpy(mrecv->nh.data, (nh + 1), data_len);
+
+	if (nh->cmd == SCOUTFS_NET_CMD_GREETING) {
+		/* greetings are out of band, no seq mechanics */
+		set_conn_fl(conn, saw_greeting);
+
+	} else if (le64_to_cpu(nh->seq) <=
+		   atomic64_read(&conn->recv_seq)) {
+		/* drop any resent duplicated messages */
+		scoutfs_inc_counter(sb, net_recv_dropped_duplicate);
+		kfree(mrecv);
+		ret = 0;
+		goto out;
+
+	} else {
+		/* record that we've received sender's seq */
+		atomic64_set(&conn->recv_seq, le64_to_cpu(nh->seq));
+		/* and free our responses that sender has received */
+		free_acked_responses(conn, le64_to_cpu(nh->recv_seq));
+	}
+
+	scoutfs_tseq_add(&ninf->msg_tseq_tree, &mrecv->tseq_entry);
+
+	/*
+	 * Initial received greetings are processed inline
+	 * before any other incoming messages.
+	 *
+	 * Incoming requests or responses to the lock client
+	 * can't handle re-ordering, so they're queued to
+	 * ordered receive processing work.
+	 */
+	if (nh->cmd == SCOUTFS_NET_CMD_GREETING)
+		scoutfs_net_proc_worker(&mrecv->proc_work);
+	else if (nh->cmd == SCOUTFS_NET_CMD_LOCK && !conn->listening_conn)
+		queue_ordered_proc(conn, mrecv);
+	else
+		queue_work(conn->workq, &mrecv->proc_work);
+	ret = 0;
+
+out:
+	return ret;
+}
+
 /*
 * Always block receiving from the socket.  Errors trigger shutting down
 * the connection.
@@ -614,86 +712,72 @@ static void scoutfs_net_recv_worker(struct work_struct *work)
 	struct super_block *sb = conn->sb;
 	struct net_info *ninf = SCOUTFS_SB(sb)->net_info;
 	struct socket *sock = conn->sock;
-	struct scoutfs_net_header nh;
-	struct message_recv *mrecv;
+	struct scoutfs_net_header *nh;
+	struct page *page = NULL;
 	unsigned int data_len;
+	int hdr_off;
+	int rx_off;
+	int size;
 	int ret;

 	trace_scoutfs_net_recv_work_enter(sb, 0, 0);

+	page = alloc_page(GFP_NOFS);
+	if (!page) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	hdr_off = 0;
+	rx_off = 0;
+
 	for (;;) {
 		/* receive the header */
-		ret = recvmsg_full(sock, &nh, sizeof(nh));
-		if (ret)
-			break;
-
-		/* receiving an invalid message breaks the connection */
-		if (invalid_message(conn, &nh)) {
-			scoutfs_inc_counter(sb, net_recv_invalid_message);
-			ret = -EBADMSG;
-			break;
+		ret = k_recvmsg(sock, page_address(page) + rx_off, PAGE_SIZE - rx_off);
+		if (ret <= 0) {
+			ret = -ECONNABORTED;
+			goto out;
 		}

-		data_len = le16_to_cpu(nh.data_len);
+		rx_off += ret;

-		scoutfs_inc_counter(sb, net_recv_messages);
-		scoutfs_add_counter(sb, net_recv_bytes, nh_bytes(data_len));
-		trace_scoutfs_net_recv_message(sb, &conn->sockname,
-					       &conn->peername, &nh);
+		for (;;) {
+			size = rx_off - hdr_off;
+			if (size < sizeof(struct scoutfs_net_header))
+				break;

-		/* invalid message checked data len */
-		mrecv = kmalloc(offsetof(struct message_recv,
-					 nh.data[data_len]), GFP_NOFS);
-		if (!mrecv) {
-			ret = -ENOMEM;
-			break;
+			nh = page_address(page) + hdr_off;
+
+			/* an invalid message breaks the connection: leave both loops */
+			if (invalid_message(conn, nh)) {
+				scoutfs_inc_counter(sb, net_recv_invalid_message);
+				ret = -EBADMSG;
+				goto out;
+			}
+
+			data_len = le16_to_cpu(nh->data_len);
+			if (sizeof(struct scoutfs_net_header) + data_len > size)
+				break;
+
+			ret = recv_one_message(sb, ninf, conn, nh, data_len);
+			if (ret < 0)
+				goto out;
+
+			hdr_off += sizeof(struct scoutfs_net_header) + data_len;
 		}

-		mrecv->conn = conn;
-		INIT_WORK(&mrecv->proc_work, scoutfs_net_proc_worker);
-		mrecv->nh = nh;
-
-		/* receive the data payload */
-		ret = recvmsg_full(sock, mrecv->nh.data, data_len);
-		if (ret) {
-			kfree(mrecv);
-			break;
+		if ((PAGE_SIZE - rx_off) <
+		    (sizeof(struct scoutfs_net_header) + SCOUTFS_NET_MAX_DATA_LEN)) {
+			if (size)
+				memmove(page_address(page), page_address(page) + hdr_off, size);
+			hdr_off = 0;
+			rx_off = size;
 		}
-
-		if (nh.cmd == SCOUTFS_NET_CMD_GREETING) {
-			/* greetings are out of band, no seq mechanics */
-			set_conn_fl(conn, saw_greeting);
-
-		} else if (le64_to_cpu(nh.seq) <=
-			   atomic64_read(&conn->recv_seq)) {
-			/* drop any resent duplicated messages */
-			scoutfs_inc_counter(sb, net_recv_dropped_duplicate);
-			kfree(mrecv);
-			continue;
-
-		} else {
-			/* record that we've received sender's seq */
-			atomic64_set(&conn->recv_seq, le64_to_cpu(nh.seq));
-			/* and free our responses that sender has received */
-			free_acked_responses(conn, le64_to_cpu(nh.recv_seq));
-		}
-
-		scoutfs_tseq_add(&ninf->msg_tseq_tree, &mrecv->tseq_entry);
-
-		/*
-		 * Initial received greetings are processed
-		 * synchronously before any other incoming messages.
-		 *
-		 * Incoming requests or responses to the lock client are
-		 * called synchronously to avoid reordering.
-		 */
-		if (nh.cmd == SCOUTFS_NET_CMD_GREETING ||
-		    (nh.cmd == SCOUTFS_NET_CMD_LOCK && !conn->listening_conn))
-			scoutfs_net_proc_worker(&mrecv->proc_work);
-		else
-			queue_work(conn->workq, &mrecv->proc_work);
 	}

+out:
+	if (page)	/* NULL when alloc_page() failed; __free_page() can't take NULL */
+		__free_page(page);
 	if (ret)
 		scoutfs_inc_counter(sb, net_recv_error);

@@ -703,33 +787,41 @@ static void scoutfs_net_recv_worker(struct work_struct *work)
 	trace_scoutfs_net_recv_work_exit(sb, 0, ret);
 }

-static int sendmsg_full(struct socket *sock, void *buf, unsigned len)
+/*
+ * This consumes the kvec.
+ */
+static int k_sendmsg_full(struct socket *sock, struct kvec *kv, unsigned long nr_segs, size_t count)
 {
-	struct msghdr msg;
-	struct kvec kv;
-	int ret;
+	int ret = 0;

-	while (len) {
-		memset(&msg, 0, sizeof(msg));
-		msg.msg_flags = MSG_NOSIGNAL;
-		kv.iov_base = buf;
-		kv.iov_len = len;
+	while (count > 0) {
+		struct msghdr msg = {
+			.msg_flags = MSG_NOSIGNAL,
+		};

-#ifndef KC_MSGHDR_STRUCT_IOV_ITER
-		msg.msg_iov = (struct iovec *)&kv;
-		msg.msg_iovlen = 1;
-#else
-		iov_iter_init(&msg.msg_iter, WRITE, (struct iovec *)&kv, len, 1);
-#endif
-		ret = kernel_sendmsg(sock, &msg, &kv, 1, len);
-		if (ret <= 0)
-			return -ECONNABORTED;
+		ret = kernel_sendmsg(sock, &msg, kv, nr_segs, count);
+		if (ret <= 0) {
+			ret = -ECONNABORTED;
+			break;
+		}

-		len -= ret;
-		buf += ret;
+		count -= ret;
+		if (count) {
+			while (nr_segs > 0 && ret >= kv->iov_len) {
+				ret -= kv->iov_len;
+				kv++;
+				nr_segs--;
+			}
+			if (nr_segs > 0 && ret > 0) {
+				kv->iov_base += ret;
+				kv->iov_len -= ret;
+			}
+			BUG_ON(nr_segs == 0);
+		}
+		ret = 0;
 	}
-
-	return 0;
+	
+	return ret;
 }

 static void free_msend(struct net_info *ninf, struct message_send *msend)
@@ -760,54 +852,73 @@ static void scoutfs_net_send_worker(struct work_struct *work)
 	struct super_block *sb = conn->sb;
 	struct net_info *ninf = SCOUTFS_SB(sb)->net_info;
 	struct message_send *msend;
-	int ret = 0;
+	struct message_send *_msend_;
+	struct kvec kv[16];
+	unsigned long nr_segs;
+	size_t count;
 	int len;
+	int ret;

 	trace_scoutfs_net_send_work_enter(sb, 0, 0);

-	spin_lock(&conn->lock);
-
-	while ((msend = list_first_entry_or_null(&conn->send_queue,
-						 struct message_send, head))) {
-
-		if (msend->dead) {
-			free_msend(ninf, msend);
-			continue;
-		}
-
-		if ((msend->nh.cmd == SCOUTFS_NET_CMD_FAREWELL) &&
-		    nh_is_response(&msend->nh)) {
-			set_conn_fl(conn, saw_farewell);
-		}
-
-		msend->nh.recv_seq =
-			cpu_to_le64(atomic64_read(&conn->recv_seq));
-
-		spin_unlock(&conn->lock);
-
-		len = nh_bytes(le16_to_cpu(msend->nh.data_len));
-
-		scoutfs_inc_counter(sb, net_send_messages);
-		scoutfs_add_counter(sb, net_send_bytes, len);
-		trace_scoutfs_net_send_message(sb, &conn->sockname,
-					       &conn->peername, &msend->nh);
-
-		ret = sendmsg_full(conn->sock, &msend->nh, len);
+	for (;;) {
+		nr_segs = 0;
+		count = 0;

 		spin_lock(&conn->lock);
+		list_for_each_entry_safe(msend, _msend_, &conn->send_queue, head) {
+			if (msend->dead) {
+				free_msend(ninf, msend);
+				continue;
+			}

-		msend->nh.recv_seq = 0;
+			len = nh_bytes(le16_to_cpu(msend->nh.data_len));

-		if (ret)
-			break;
+			if ((msend->nh.cmd == SCOUTFS_NET_CMD_FAREWELL) &&
+			    nh_is_response(&msend->nh)) {
+				set_conn_fl(conn, saw_farewell);
+			}

-		/* resend if it wasn't freed while we sent */
-		if (!msend->dead)
-			list_move_tail(&msend->head, &conn->resend_queue);
+			msend->nh.recv_seq = cpu_to_le64(atomic64_read(&conn->recv_seq));
+
+			scoutfs_inc_counter(sb, net_send_messages);
+			scoutfs_add_counter(sb, net_send_bytes, len);
+			trace_scoutfs_net_send_message(sb, &conn->sockname,
+						       &conn->peername, &msend->nh);
+
+			count += len;
+			kv[nr_segs].iov_base = &msend->nh;
+			kv[nr_segs].iov_len = len;
+			if (++nr_segs == ARRAY_SIZE(kv))
+				break;
+
+		}
+		spin_unlock(&conn->lock);
+
+		if (nr_segs == 0) {
+			ret = 0;
+			goto out;
+		}
+
+		ret = k_sendmsg_full(conn->sock, kv, nr_segs, count);
+		if (ret < 0)
+			goto out;
+
+		spin_lock(&conn->lock);
+		list_for_each_entry_safe(msend, _msend_, &conn->send_queue, head) {
+			msend->nh.recv_seq = 0;
+
+			/* resend if it wasn't freed while we sent */
+			if (!msend->dead)
+				list_move_tail(&msend->head, &conn->resend_queue);
+
+			if (--nr_segs == 0)
+				break;
+		}
+		spin_unlock(&conn->lock);
 	}

-	spin_unlock(&conn->lock);
-
+out:
 	if (ret) {
 		scoutfs_inc_counter(sb, net_send_error);
 		shutdown_conn(conn);
@@ -862,6 +973,7 @@ static void scoutfs_net_destroy_worker(struct work_struct *work)
 	destroy_workqueue(conn->workq);
 	scoutfs_tseq_del(&ninf->conn_tseq_tree, &conn->tseq_entry);
 	kfree(conn->info);
+	kfree(conn->ordered_proc_wlists);
 	trace_scoutfs_conn_destroy_free(conn);
 	kfree(conn);

@@ -887,7 +999,7 @@ static void destroy_conn(struct scoutfs_net_connection *conn)
 * The TCP_KEEP* and TCP_USER_TIMEOUT option interaction is subtle.
 * TCP_USER_TIMEOUT only applies if there is unacked written data in the
 * send queue.  It doesn't work if the connection is idle.  Adding
- * keepalice probes with user_timeout set changes how the keepalive
+ * keepalive probes with user_timeout set changes how the keepalive
 * timeout is calculated.   CNT no longer matters.   Each time
 * additional probes (not the first) are sent the user timeout is
 * checked against the last time data was received.  If none of the
@@ -899,14 +1011,16 @@ static void destroy_conn(struct scoutfs_net_connection *conn)
 * elapses during the probe timer processing after the unsuccessful
 * probes.
 */
-#define UNRESPONSIVE_TIMEOUT_SECS 10
-#define UNRESPONSIVE_PROBES 3
-static int sock_opts_and_names(struct scoutfs_net_connection *conn,
+static int sock_opts_and_names(struct super_block *sb,
+			       struct scoutfs_net_connection *conn,
 			       struct socket *sock)
 {
+	struct scoutfs_mount_options opts;
 	int optval;
 	int ret;

+	scoutfs_options_read(sb, &opts);
+
 	/* we use a keepalive timeout instead of send timeout */
 	ret = kc_sock_set_sndtimeo(sock, 0);
 	if (ret)
@@ -919,8 +1033,7 @@ static int sock_opts_and_names(struct scoutfs_net_connection *conn,
 	if (ret)
 		goto out;

-	BUILD_BUG_ON(UNRESPONSIVE_PROBES >= UNRESPONSIVE_TIMEOUT_SECS);
-	optval = UNRESPONSIVE_TIMEOUT_SECS - (UNRESPONSIVE_PROBES);
+	optval = (opts.tcp_keepalive_timeout_ms / MSEC_PER_SEC) - UNRESPONSIVE_PROBES;
 	ret = kc_tcp_sock_set_keepidle(sock, optval);
 	if (ret)
 		goto out;
@@ -930,7 +1043,7 @@ static int sock_opts_and_names(struct scoutfs_net_connection *conn,
 	if (ret)
 		goto out;

-	optval = UNRESPONSIVE_TIMEOUT_SECS * MSEC_PER_SEC;
+	optval = opts.tcp_keepalive_timeout_ms;
 	ret = kc_tcp_sock_set_user_timeout(sock, optval);
 	if (ret)
 		goto out;
@@ -992,13 +1105,19 @@ static void scoutfs_net_listen_worker(struct work_struct *work)
 						  conn->notify_down,
 						  conn->info_size,
 						  conn->req_funcs, "accepted");
+		/*
+		 * scoutfs_net_alloc_conn() can fail due to ENOMEM. If this
+		 * is the only thing that does so, there's no harm in trying
+		 * to see if kernel_accept() can get enough memory to try accepting
+		 * a new connection again. If that then fails with ENOMEM, it'll
+		 * shut down the conn anyway. So just retry here.
+		 */
 		if (!acc_conn) {
 			sock_release(acc_sock);
-			ret = -ENOMEM;
 			continue;
 		}

-		ret = sock_opts_and_names(acc_conn, acc_sock);
+		ret = sock_opts_and_names(sb, acc_conn, acc_sock);
 		if (ret) {
 			sock_release(acc_sock);
 			destroy_conn(acc_conn);
@@ -1069,7 +1188,7 @@ static void scoutfs_net_connect_worker(struct work_struct *work)
 	if (ret)
 		goto out;

-	ret = sock_opts_and_names(conn, sock);
+	ret = sock_opts_and_names(sb, conn, sock);
 	if (ret)
 		goto out;

@@ -1330,25 +1449,30 @@ scoutfs_net_alloc_conn(struct super_block *sb,
 {
 	struct net_info *ninf = SCOUTFS_SB(sb)->net_info;
 	struct scoutfs_net_connection *conn;
+	unsigned int nr;
+	unsigned int i;
+
+	nr = min_t(unsigned int, num_possible_cpus(),
+		   PAGE_SIZE / sizeof(struct scoutfs_work_list));

 	conn = kzalloc(sizeof(struct scoutfs_net_connection), GFP_NOFS);
-	if (!conn)
-		return NULL;
-
-	if (info_size) {
-		conn->info = kzalloc(info_size, GFP_NOFS);
-		if (!conn->info) {
-			kfree(conn);
-			return NULL;
-		}
+	if (conn) {
+		if (info_size)
+			conn->info = kzalloc(info_size, GFP_NOFS);
+		conn->ordered_proc_wlists = kmalloc_array(nr, sizeof(struct scoutfs_work_list),
+							  GFP_NOFS);
+		conn->workq = alloc_workqueue("scoutfs_net_%s",
+					      WQ_UNBOUND | WQ_NON_REENTRANT, 0,
+					      name_suffix);
 	}
-
-	conn->workq = alloc_workqueue("scoutfs_net_%s",
-				      WQ_UNBOUND | WQ_NON_REENTRANT, 0,
-				      name_suffix);
-	if (!conn->workq) {
-		kfree(conn->info);
-		kfree(conn);
+	if (!conn || (info_size && !conn->info) || !conn->workq || !conn->ordered_proc_wlists) {
+		if (conn) {
+			kfree(conn->info);
+			kfree(conn->ordered_proc_wlists);
+			if (conn->workq)
+				destroy_workqueue(conn->workq);
+			kfree(conn);
+		}
 		return NULL;
 	}

@@ -1378,6 +1502,13 @@ scoutfs_net_alloc_conn(struct super_block *sb,
 	INIT_DELAYED_WORK(&conn->reconn_free_dwork,
 			  scoutfs_net_reconn_free_worker);

+	conn->ordered_proc_nr = nr;
+	for (i = 0; i < nr; i++) {
+		INIT_WORK(&conn->ordered_proc_wlists[i].work, scoutfs_net_ordered_proc_worker);
+		spin_lock_init(&conn->ordered_proc_wlists[i].lock);
+		INIT_LIST_HEAD(&conn->ordered_proc_wlists[i].list);
+	}
+
 	scoutfs_tseq_add(&ninf->conn_tseq_tree, &conn->tseq_entry);
 	trace_scoutfs_conn_alloc(conn);

@@ -1,10 +1,18 @@
 #ifndef _SCOUTFS_NET_H_
 #define _SCOUTFS_NET_H_

+#include <linux/spinlock.h>
+#include <linux/list.h>
 #include <linux/in.h>
 #include "endian_swap.h"
 #include "tseq.h"

+struct scoutfs_work_list {	/* a work item paired with a spinlock and a list head */
+	struct work_struct work;	/* scoutfs_net_alloc_conn() sets this to scoutfs_net_ordered_proc_worker */
+	spinlock_t lock;		/* presumably protects list -- TODO confirm against the users */
+	struct list_head list;		/* initialized empty by scoutfs_net_alloc_conn() */
+};
+
 struct scoutfs_net_connection;

 /* These are called in their own blocking context */
@@ -61,6 +69,8 @@ struct scoutfs_net_connection {
 	struct list_head resend_queue;

 	atomic64_t recv_seq;
+	unsigned int ordered_proc_nr;
+	struct scoutfs_work_list *ordered_proc_wlists;

 	struct workqueue_struct *workq;
 	struct work_struct listen_work;
@@ -592,7 +592,7 @@ static int handle_request(struct super_block *sb, struct omap_request *req)
 	ret = 0;
 out:
 	free_rids(&priv_rids);
-	if (ret < 0) {
+	if ((ret < 0) && (req != NULL)) {
 		ret = scoutfs_server_send_omap_response(sb, req->client_rid, req->client_id,
 							NULL, ret);
 		free_req(req);
@@ -39,6 +39,7 @@ enum {
 	Opt_orphan_scan_delay_ms,
 	Opt_quorum_heartbeat_timeout_ms,
 	Opt_quorum_slot_nr,
+	Opt_tcp_keepalive_timeout_ms,
 	Opt_err,
 };

@@ -52,6 +53,7 @@ static const match_table_t tokens = {
 	{Opt_orphan_scan_delay_ms, "orphan_scan_delay_ms=%s"},
 	{Opt_quorum_heartbeat_timeout_ms, "quorum_heartbeat_timeout_ms=%s"},
 	{Opt_quorum_slot_nr, "quorum_slot_nr=%s"},
+	{Opt_tcp_keepalive_timeout_ms, "tcp_keepalive_timeout_ms=%s"},
 	{Opt_err, NULL}
 };

@@ -126,6 +128,8 @@ static void free_options(struct scoutfs_mount_options *opts)
 #define MIN_DATA_PREALLOC_BLOCKS	1ULL
 #define MAX_DATA_PREALLOC_BLOCKS	((unsigned long long)SCOUTFS_BLOCK_SM_MAX)

+#define DEFAULT_TCP_KEEPALIVE_TIMEOUT_MS	(60 * MSEC_PER_SEC)
+
 static void init_default_options(struct scoutfs_mount_options *opts)
 {
 	memset(opts, 0, sizeof(*opts));
@@ -136,6 +140,7 @@ static void init_default_options(struct scoutfs_mount_options *opts)
 	opts->orphan_scan_delay_ms = -1;
 	opts->quorum_heartbeat_timeout_ms = SCOUTFS_QUORUM_DEF_HB_TIMEO_MS;
 	opts->quorum_slot_nr = -1;
+	opts->tcp_keepalive_timeout_ms = DEFAULT_TCP_KEEPALIVE_TIMEOUT_MS;
 }

 static int verify_log_merge_wait_timeout_ms(struct super_block *sb, int ret, int val)
@@ -168,6 +173,38 @@ static int verify_quorum_heartbeat_timeout_ms(struct super_block *sb, int ret, u
 	return 0;
 }
 
+/*
+ * Validate a parsed tcp_keepalive_timeout_ms mount option.
+ *
+ * @ret is the result of parsing the option string and @val is the
+ * parsed value in milliseconds.  Returns 0 if the value can be used or
+ * -EINVAL, after logging why, if parsing failed or the value is too
+ * small.
+ *
+ * sock_opts_and_names() sets the keepalive idle time to
+ * (val / MSEC_PER_SEC) - UNRESPONSIVE_PROBES seconds, and TCP rejects
+ * an idle time below one second.  Accepting values that are merely
+ * larger than UNRESPONSIVE_PROBES seconds would let e.g. 3500ms mount
+ * successfully and then fail every socket setup, so require at least
+ * one whole second more than the probes.
+ */
+static int verify_tcp_keepalive_timeout_ms(struct super_block *sb, int ret, int val)
+{
+	const long min_ms = (UNRESPONSIVE_PROBES + 1) * MSEC_PER_SEC;
+
+	if (ret < 0) {
+		scoutfs_err(sb, "failed to parse tcp_keepalive_timeout_ms value");
+		return -EINVAL;
+	}
+	if (val < min_ms) {
+		scoutfs_err(sb, "invalid tcp_keepalive_timeout_ms value %d, must be at least %ld",
+			    val, min_ms);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 /*
 * Parse the option string into our options struct.   This can allocate
 * memory in the struct.  The caller is responsible for always calling
@@ -218,6 +238,14 @@ static int parse_options(struct super_block *sb, char *options, struct scoutfs_m
 			opts->data_prealloc_contig_only = nr;
 			break;

+		case Opt_tcp_keepalive_timeout_ms:
+			ret = match_int(args, &nr);
+			ret = verify_tcp_keepalive_timeout_ms(sb, ret, nr);
+			if (ret < 0)
+				return ret;
+			opts->tcp_keepalive_timeout_ms = nr;
+			break;
+
 		case Opt_log_merge_wait_timeout_ms:
 			ret = match_int(args, &nr);
 			ret = verify_log_merge_wait_timeout_ms(sb, ret, nr);
@@ -371,6 +399,7 @@ int scoutfs_options_show(struct seq_file *seq, struct dentry *root)
 	seq_printf(seq, ",orphan_scan_delay_ms=%u", opts.orphan_scan_delay_ms);
 	if (opts.quorum_slot_nr >= 0)
 		seq_printf(seq, ",quorum_slot_nr=%d", opts.quorum_slot_nr);
+	seq_printf(seq, ",tcp_keepalive_timeout_ms=%d", opts.tcp_keepalive_timeout_ms);

 	return 0;
 }
@@ -13,8 +13,11 @@ struct scoutfs_mount_options {
 	unsigned int orphan_scan_delay_ms;
 	int quorum_slot_nr;
 	u64 quorum_heartbeat_timeout_ms;
+	int tcp_keepalive_timeout_ms;
 };

+#define UNRESPONSIVE_PROBES	3
+
 void scoutfs_options_read(struct super_block *sb, struct scoutfs_mount_options *opts);
 int scoutfs_options_show(struct seq_file *seq, struct dentry *root);

@@ -243,10 +243,6 @@ static int send_msg_members(struct super_block *sb, int type, u64 term, int only
 	};
 	struct sockaddr_in sin;
 	struct msghdr mh = {
-#ifndef KC_MSGHDR_STRUCT_IOV_ITER
-		.msg_iov = (struct iovec *)&kv,
-		.msg_iovlen = 1,
-#endif
 		.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL,
 		.msg_name = &sin,
 		.msg_namelen = sizeof(sin),
@@ -268,9 +264,7 @@ static int send_msg_members(struct super_block *sb, int type, u64 term, int only

 		scoutfs_quorum_slot_sin(&qinf->qconf, i, &sin);
 		now = ktime_get();
-#ifdef KC_MSGHDR_STRUCT_IOV_ITER
-		iov_iter_init(&mh.msg_iter, WRITE, (struct iovec *)&kv, sizeof(qmes), 1);
-#endif
+
 		ret = kernel_sendmsg(qinf->sock, &mh, &kv, 1, kv.iov_len);
 		if (ret != kv.iov_len)
 			failed++;
@@ -312,10 +306,6 @@ static int recv_msg(struct super_block *sb, struct quorum_host_msg *msg,
 		.iov_len = sizeof(struct scoutfs_quorum_message),
 	};
 	struct msghdr mh = {
-#ifndef KC_MSGHDR_STRUCT_IOV_ITER
-		.msg_iov = (struct iovec *)&kv,
-		.msg_iovlen = 1,
-#endif
 		.msg_flags = MSG_NOSIGNAL,
 	};

@@ -333,9 +323,6 @@ static int recv_msg(struct super_block *sb, struct quorum_host_msg *msg,
 		ret = kc_tcp_sock_set_rcvtimeo(qinf->sock, rel_to);
 	}

-#ifdef KC_MSGHDR_STRUCT_IOV_ITER
-	iov_iter_init(&mh.msg_iter, READ, (struct iovec *)&kv, sizeof(struct scoutfs_quorum_message), 1);
-#endif
 	ret = kernel_recvmsg(qinf->sock, &mh, &kv, 1, kv.iov_len, mh.msg_flags);
 	if (ret < 0)
 		return ret;
@@ -520,10 +507,10 @@ static int update_quorum_block(struct super_block *sb, int event, u64 term, bool
 		set_quorum_block_event(sb, &blk, event, term);
 		ret = write_quorum_block(sb, blkno, &blk);
 		if (ret < 0)
-			scoutfs_err(sb, "error %d reading quorum block %llu to update event %d term %llu",
+			scoutfs_err(sb, "error %d writing quorum block %llu after updating event %d term %llu",
 				    ret, blkno, event, term);
 	} else {
-		scoutfs_err(sb, "error %d writing quorum block %llu after updating event %d term %llu",
+		scoutfs_err(sb, "error %d reading quorum block %llu to update event %d term %llu",
 			    ret, blkno, event, term);
 	}

@@ -822,6 +809,7 @@ static void scoutfs_quorum_worker(struct work_struct *work)

 		/* followers and candidates start new election on timeout */
 		if (qst.role != LEADER &&
+		    msg.type == SCOUTFS_QUORUM_MSG_INVALID &&
 		    ktime_after(ktime_get(), qst.timeout)) {
 			/* .. but only if their server has stopped */
 			if (!scoutfs_server_is_down(sb)) {
@@ -982,7 +970,10 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 	}

 	/* record that this slot no longer has an active quorum */
-	update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_END, qst.term, true);
+	err = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_END, qst.term, true);
+	if (err < 0 && ret == 0)
+		ret = err;
+
 out:
 	if (ret < 0) {
 		scoutfs_err(sb, "quorum service saw error %d, shutting down.  This mount is no longer participating in quorum.  It should be remounted to restore service.",
@@ -1071,7 +1062,7 @@ static char *role_str(int role)
 		[LEADER] = "leader",
 	};

-	if (role < 0 || role > ARRAY_SIZE(roles) || !roles[role])
+	if (role < 0 || role >= ARRAY_SIZE(roles) || !roles[role])
 		return "invalid";

 	return roles[role];
@@ -823,13 +823,14 @@ DEFINE_EVENT(scoutfs_lock_info_class, scoutfs_lock_destroy,
 );

 TRACE_EVENT(scoutfs_xattr_set,
-	TP_PROTO(struct super_block *sb, size_t name_len, const void *value,
-		 size_t size, int flags),
+	TP_PROTO(struct super_block *sb, __u64 ino, size_t name_len,
+		 const void *value, size_t size, int flags),

-	TP_ARGS(sb, name_len, value, size, flags),
+	TP_ARGS(sb, ino, name_len, value, size, flags),

 	TP_STRUCT__entry(
 		SCSB_TRACE_FIELDS
+		__field(__u64, ino)
 		__field(size_t, name_len)
 		__field(const void *, value)
 		__field(size_t, size)
@@ -838,15 +839,16 @@ TRACE_EVENT(scoutfs_xattr_set,

 	TP_fast_assign(
 		SCSB_TRACE_ASSIGN(sb);
+		__entry->ino = ino;
 		__entry->name_len = name_len;
 		__entry->value = value;
 		__entry->size = size;
 		__entry->flags = flags;
 	),

-	TP_printk(SCSBF" name_len %zu value %p size %zu flags 0x%x",
-		  SCSB_TRACE_ARGS, __entry->name_len, __entry->value,
-		  __entry->size, __entry->flags)
+	TP_printk(SCSBF" ino %llu name_len %zu value %p size %zu flags 0x%x",
+		  SCSB_TRACE_ARGS, __entry->ino,  __entry->name_len,
+		  __entry->value, __entry->size, __entry->flags)
 );

 TRACE_EVENT(scoutfs_advance_dirty_super,
@@ -1098,6 +1100,7 @@ DECLARE_EVENT_CLASS(scoutfs_lock_class,
 		__field(unsigned char, invalidate_pending)
 		__field(int, mode)
 		__field(int, invalidating_mode)
+		__field(unsigned int, refcount)
 		__field(unsigned int, waiters_cw)
 		__field(unsigned int, waiters_pr)
 		__field(unsigned int, waiters_ex)
@@ -1116,6 +1119,7 @@ DECLARE_EVENT_CLASS(scoutfs_lock_class,
 		__entry->invalidate_pending = lck->invalidate_pending;
 		__entry->mode = lck->mode;
 		__entry->invalidating_mode = lck->invalidating_mode;
+		__entry->refcount = atomic_read(&lck->refcount);
 		__entry->waiters_pr = lck->waiters[SCOUTFS_LOCK_READ];
 		__entry->waiters_ex = lck->waiters[SCOUTFS_LOCK_WRITE];
 		__entry->waiters_cw = lck->waiters[SCOUTFS_LOCK_WRITE_ONLY];
@@ -1123,11 +1127,11 @@ DECLARE_EVENT_CLASS(scoutfs_lock_class,
 		__entry->users_ex = lck->users[SCOUTFS_LOCK_WRITE];
 		__entry->users_cw = lck->users[SCOUTFS_LOCK_WRITE_ONLY];
        ),
-        TP_printk(SCSBF" start "SK_FMT" end "SK_FMT" mode %u invmd %u reqp %u invp %u refg %llu wris %llu dts %llu waiters: pr %u ex %u cw %u users: pr %u ex %u cw %u",
+        TP_printk(SCSBF" start "SK_FMT" end "SK_FMT" mode %u invmd %u reqp %u invp %u refg %llu rfcnt %d wris %llu dts %llu waiters: pr %u ex %u cw %u users: pr %u ex %u cw %u",
 		  SCSB_TRACE_ARGS, sk_trace_args(start), sk_trace_args(end),
 		  __entry->mode, __entry->invalidating_mode, __entry->request_pending,
-		  __entry->invalidate_pending, __entry->refresh_gen, __entry->write_seq,
-		  __entry->dirty_trans_seq,
+		  __entry->invalidate_pending, __entry->refresh_gen, __entry->refcount,
+		  __entry->write_seq, __entry->dirty_trans_seq,
 		  __entry->waiters_pr, __entry->waiters_ex, __entry->waiters_cw,
 		  __entry->users_pr, __entry->users_ex, __entry->users_cw)
 );
@@ -2463,6 +2467,27 @@ TRACE_EVENT(scoutfs_block_dirty_ref,
 		  __entry->block_blkno, __entry->block_seq)
 );

+TRACE_EVENT(scoutfs_get_file_block,	/* records the blkno and flags handed in by the caller */
+	TP_PROTO(struct super_block *sb, u64 blkno, int flags),
+
+	TP_ARGS(sb, blkno, flags),
+
+	TP_STRUCT__entry(
+		SCSB_TRACE_FIELDS
+		__field(__u64, blkno)
+		__field(int, flags)
+	),
+
+	TP_fast_assign(
+		SCSB_TRACE_ASSIGN(sb);
+		__entry->blkno = blkno;
+		__entry->flags = flags;
+	),
+
+	TP_printk(SCSBF" blkno %llu flags 0x%x",	/* flags are printed in hex */
+		  SCSB_TRACE_ARGS, __entry->blkno, __entry->flags)
+);
+
 TRACE_EVENT(scoutfs_block_stale,
 	TP_PROTO(struct super_block *sb, struct scoutfs_block_ref *ref,
 		 struct scoutfs_block_header *hdr, u32 magic, u32 crc),
@@ -2503,8 +2528,8 @@ TRACE_EVENT(scoutfs_block_stale,

 DECLARE_EVENT_CLASS(scoutfs_block_class,
 	TP_PROTO(struct super_block *sb, void *bp, u64 blkno, int refcount, int io_count,
-		 unsigned long bits, __u64 accessed),
-	TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed),
+		 unsigned long bits),
+	TP_ARGS(sb, bp, blkno, refcount, io_count, bits),
 	TP_STRUCT__entry(
 		SCSB_TRACE_FIELDS
 		__field(void *, bp)
@@ -2512,7 +2537,6 @@ DECLARE_EVENT_CLASS(scoutfs_block_class,
 		__field(int, refcount)
 		__field(int, io_count)
 		__field(long, bits)
-		__field(__u64, accessed)
 	),
 	TP_fast_assign(
 		SCSB_TRACE_ASSIGN(sb);
@@ -2521,71 +2545,65 @@ DECLARE_EVENT_CLASS(scoutfs_block_class,
 		__entry->refcount = refcount;
 		__entry->io_count = io_count;
 		__entry->bits = bits;
-		__entry->accessed = accessed;
 	),
-	TP_printk(SCSBF" bp %p blkno %llu refcount %d io_count %d bits 0x%lx accessed %llu",
+	TP_printk(SCSBF" bp %p blkno %llu refcount %x io_count %d bits 0x%lx",
 		  SCSB_TRACE_ARGS, __entry->bp, __entry->blkno, __entry->refcount,
-		  __entry->io_count, __entry->bits, __entry->accessed)
+		  __entry->io_count, __entry->bits)
 );
 DEFINE_EVENT(scoutfs_block_class, scoutfs_block_allocate,
 	TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
-		 int refcount, int io_count, unsigned long bits,
-		 __u64 accessed),
-	TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
+		 int refcount, int io_count, unsigned long bits),
+	TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
 );
 DEFINE_EVENT(scoutfs_block_class, scoutfs_block_free,
 	TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
-		 int refcount, int io_count, unsigned long bits,
-		 __u64 accessed),
-	TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
+		 int refcount, int io_count, unsigned long bits),
+	TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
 );
 DEFINE_EVENT(scoutfs_block_class, scoutfs_block_insert,
 	TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
-		 int refcount, int io_count, unsigned long bits,
-		 __u64 accessed),
-	TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
+		 int refcount, int io_count, unsigned long bits),
+	TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
 );
 DEFINE_EVENT(scoutfs_block_class, scoutfs_block_remove,
 	TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
-		 int refcount, int io_count, unsigned long bits,
-		 __u64 accessed),
-	TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
+		 int refcount, int io_count, unsigned long bits),
+	TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
 );
 DEFINE_EVENT(scoutfs_block_class, scoutfs_block_end_io,
 	TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
-		 int refcount, int io_count, unsigned long bits,
-		 __u64 accessed),
-	TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
+		 int refcount, int io_count, unsigned long bits),
+	TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
 );
 DEFINE_EVENT(scoutfs_block_class, scoutfs_block_submit,
 	TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
-		 int refcount, int io_count, unsigned long bits,
-		 __u64 accessed),
-	TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
+		 int refcount, int io_count, unsigned long bits),
+	TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
 );
 DEFINE_EVENT(scoutfs_block_class, scoutfs_block_invalidate,
 	TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
-		 int refcount, int io_count, unsigned long bits,
-		 __u64 accessed),
-	TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
+		 int refcount, int io_count, unsigned long bits),
+	TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
 );
 DEFINE_EVENT(scoutfs_block_class, scoutfs_block_mark_dirty,
 	TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
-		 int refcount, int io_count, unsigned long bits,
-		 __u64 accessed),
-	TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
+		 int refcount, int io_count, unsigned long bits),
+	TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
 );
 DEFINE_EVENT(scoutfs_block_class, scoutfs_block_forget,
 	TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
-		 int refcount, int io_count, unsigned long bits,
-		 __u64 accessed),
-	TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
+		 int refcount, int io_count, unsigned long bits),
+	TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
 );
 DEFINE_EVENT(scoutfs_block_class, scoutfs_block_shrink,
 	TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
-		 int refcount, int io_count, unsigned long bits,
-		 __u64 accessed),
-	TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
+		 int refcount, int io_count, unsigned long bits),
+	TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
+);
+DEFINE_EVENT(scoutfs_block_class, scoutfs_block_isolate,
+	TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
+		 int refcount, int io_count, unsigned long bits),
+	TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
 );

 DECLARE_EVENT_CLASS(scoutfs_ext_next_class,
@@ -3060,6 +3078,27 @@ DEFINE_EVENT(scoutfs_srch_compact_class, scoutfs_srch_compact_client_recv,
 	TP_ARGS(sb, sc)
 );

+TRACE_EVENT(scoutfs_ioc_search_xattrs,	/* records ino and last_ino; presumably the search bounds -- confirm at the call site */
+	TP_PROTO(struct super_block *sb, u64 ino, u64 last_ino),
+
+	TP_ARGS(sb, ino, last_ino),
+
+	TP_STRUCT__entry(
+		SCSB_TRACE_FIELDS
+		__field(u64, ino)
+		__field(u64, last_ino)
+	),
+
+	TP_fast_assign(
+		SCSB_TRACE_ASSIGN(sb);
+		__entry->ino = ino;
+		__entry->last_ino = last_ino;
+	),
+
+	TP_printk(SCSBF" ino %llu last_ino %llu", SCSB_TRACE_ARGS,
+		  __entry->ino, __entry->last_ino)
+);
+
 #endif /* _TRACE_SCOUTFS_H */

 /* This part must be outside protection */
@@ -610,7 +610,7 @@ static void scoutfs_server_commit_func(struct work_struct *work)
 		goto out;

 	if (scoutfs_forcing_unmount(sb)) {
-		ret = -EIO;
+		ret = -ENOLINK;
 		goto out;
 	}

@@ -1040,6 +1040,129 @@ static int next_log_merge_item(struct super_block *sb,
 	return next_log_merge_item_key(sb, root, zone, &key, val, val_len);
 }
 
+/*
+ * Finalize the caller's open log_trees item so that it can be merged.
+ *
+ * The srch log is rotated and then the item's allocators are emptied:
+ * meta_freed and meta_avail are spliced into the server's other_freed
+ * list and data_avail and data_freed are moved into the super's
+ * data_alloc.  Emptying can return -EINPROGRESS after making partial
+ * progress.  When it does the commit is applied and held again, with
+ * logs_mutex dropped around that, before emptying continues.  Once the
+ * allocators are empty the stored item is flagged as finalized and @lt
+ * is cleared and advanced to the next nr.
+ *
+ * This unlocks and relocks logs_mutex and applies and re-holds @hold,
+ * so it expects to be called with both held and returns with both
+ * held.  Returns 0 on success or a negative errno after logging an
+ * error.
+ */
+static int do_finalize_ours(struct super_block *sb,
+			    struct scoutfs_log_trees *lt,
+			    struct commit_hold *hold)
+{
+	struct server_info *server = SCOUTFS_SB(sb)->server_info;
+	struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
+	struct scoutfs_key key;
+	char *err_str = NULL;
+	u64 rid = le64_to_cpu(lt->rid);
+	bool more;
+	int ret;
+	int err;
+
+	mutex_lock(&server->srch_mutex);
+	ret = scoutfs_srch_rotate_log(sb, &server->alloc, &server->wri,
+				      &super->srch_root, &lt->srch_file, true);
+	mutex_unlock(&server->srch_mutex);
+	if (ret < 0) {
+		scoutfs_err(sb, "error rotating srch log for rid %016llx: %d",
+			    rid, ret);
+		return ret;
+	}
+
+	do {
+		more = false;
+
+		/*
+		 * All of these can return errors, perhaps indicating successful
+		 * partial progress, after having modified the allocator trees.
+		 * We always have to update the roots in the log item.
+		 *
+		 * The chain stops at the first step that returns non-zero and
+		 * err_str is left naming that step.
+		 */
+		mutex_lock(&server->alloc_mutex);
+		ret = (err_str = "splice meta_freed to other_freed",
+				scoutfs_alloc_splice_list(sb, &server->alloc,
+					&server->wri, server->other_freed,
+					&lt->meta_freed)) ?:
+			(err_str = "splice meta_avail",
+			 scoutfs_alloc_splice_list(sb, &server->alloc,
+					&server->wri, server->other_freed,
+					&lt->meta_avail)) ?:
+			(err_str = "empty data_avail",
+			 alloc_move_empty(sb, &super->data_alloc,
+					  &lt->data_avail,
+					  COMMIT_HOLD_ALLOC_BUDGET / 2)) ?:
+			(err_str = "empty data_freed",
+			 alloc_move_empty(sb, &super->data_alloc,
+					  &lt->data_freed,
+					  COMMIT_HOLD_ALLOC_BUDGET / 2));
+		mutex_unlock(&server->alloc_mutex);
+
+		/*
+		 * only finalize, allowing merging, once the allocators are
+		 * fully freed
+		 */
+		if (ret == 0) {
+			/* the transaction is no longer open */
+			le64_add_cpu(&lt->flags, SCOUTFS_LOG_TREES_FINALIZED);
+			lt->finalize_seq = cpu_to_le64(scoutfs_server_next_seq(sb));
+		}
+
+		scoutfs_key_init_log_trees(&key, rid, le64_to_cpu(lt->nr));
+
+		err = scoutfs_btree_update(sb, &server->alloc, &server->wri,
+					   &super->logs_root, &key, lt,
+					   sizeof(*lt));
+		BUG_ON(err != 0); /* alloc, log, srch items out of sync */
+
+		if (ret == -EINPROGRESS) {
+			/*
+			 * Partial progress.  Commit what has been moved so
+			 * far and take a new hold before emptying more.
+			 *
+			 * NOTE(review): a failed commit only warns here, ret
+			 * is overwritten by the next pass through the loop.
+			 * Confirm that continuing is intended.
+			 */
+			more = true;
+			mutex_unlock(&server->logs_mutex);
+			ret = server_apply_commit(sb, hold, 0);
+			WARN_ON_ONCE(ret < 0);
+			server_hold_commit(sb, hold);
+			mutex_lock(&server->logs_mutex);
+		} else if (ret == 0) {
+			/* the finalized item is stored, reuse lt for the next nr */
+			memset(&lt->item_root, 0, sizeof(lt->item_root));
+			memset(&lt->bloom_ref, 0, sizeof(lt->bloom_ref));
+			lt->inode_count_delta = 0;
+			lt->max_item_seq = 0;
+			lt->finalize_seq = 0;
+			le64_add_cpu(&lt->nr, 1);
+			lt->flags = 0;
+		}
+	} while (more);
+
+	if (ret < 0) {
+		scoutfs_err(sb,
+			    "error %d finalizing log trees for rid %016llx: %s",
+			    ret, rid, err_str);
+	}
+
+	return ret;
+}
+
 /*
 * Finalizing the log btrees for merging needs to be done carefully so
 * that items don't appear to go backwards in time.
@@ -1091,7 +1186,6 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
 	struct scoutfs_log_merge_range rng;
 	struct scoutfs_mount_options opts;
 	struct scoutfs_log_trees each_lt;
-	struct scoutfs_log_trees fin;
 	unsigned int delay_ms;
 	unsigned long timeo;
 	bool saw_finalized;
@@ -1162,6 +1256,7 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
 		/* done if we're not finalizing and there's no finalized */
 		if (!finalize_ours && !saw_finalized) {
 			ret = 0;
+			scoutfs_inc_counter(sb, log_merge_no_finalized);
 			break;
 		}

@@ -1196,32 +1291,11 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l

 		/* Finalize ours if it's visible to others */
 		if (ours_visible) {
-			fin = *lt;
-			memset(&fin.meta_avail, 0, sizeof(fin.meta_avail));
-			memset(&fin.meta_freed, 0, sizeof(fin.meta_freed));
-			memset(&fin.data_avail, 0, sizeof(fin.data_avail));
-			memset(&fin.data_freed, 0, sizeof(fin.data_freed));
-			memset(&fin.srch_file, 0, sizeof(fin.srch_file));
-			le64_add_cpu(&fin.flags, SCOUTFS_LOG_TREES_FINALIZED);
-			fin.finalize_seq = cpu_to_le64(scoutfs_server_next_seq(sb));
-
-			scoutfs_key_init_log_trees(&key, le64_to_cpu(fin.rid),
-						   le64_to_cpu(fin.nr));
-			ret = scoutfs_btree_update(sb, &server->alloc, &server->wri,
-						   &super->logs_root, &key, &fin,
-						   sizeof(fin));
+			ret = do_finalize_ours(sb, lt, hold);
 			if (ret < 0) {
-				err_str = "updating finalized log_trees";
+				err_str = "finalizing ours";
 				break;
 			}
-
-			memset(&lt->item_root, 0, sizeof(lt->item_root));
-			memset(&lt->bloom_ref, 0, sizeof(lt->bloom_ref));
-			lt->inode_count_delta = 0;
-			lt->max_item_seq = 0;
-			lt->finalize_seq = 0;
-			le64_add_cpu(&lt->nr, 1);
-			lt->flags = 0;
 		}

 		/* wait a bit for mounts to arrive */
@@ -1680,8 +1754,8 @@ unlock:

 	ret = server_apply_commit(sb, &hold, ret);
 	if (ret < 0)
-		scoutfs_err(sb, "server error %d committing client logs for rid %016llx: %s",
-			    ret, rid, err_str);
+		scoutfs_err(sb, "server error %d committing client logs for rid %016llx, nr %llu: %s",
+			    ret, rid, le64_to_cpu(lt.nr), err_str);
 out:
 	WARN_ON_ONCE(ret < 0);
 	return scoutfs_net_response(sb, conn, cmd, id, ret, NULL, 0);
@@ -1816,6 +1890,9 @@ static int reclaim_open_log_tree(struct super_block *sb, u64 rid)
 out:
 	mutex_unlock(&server->logs_mutex);

+	if (ret == 0)
+		scoutfs_inc_counter(sb, reclaimed_open_logs);
+
 	if (ret < 0 && ret != -EINPROGRESS)
 		scoutfs_err(sb, "server error %d reclaiming log trees for rid %016llx: %s",
 			    ret, rid, err_str);
@@ -2057,7 +2134,7 @@ static int server_srch_commit_compact(struct super_block *sb,
 					  &super->srch_root, rid, sc,
 					  &av, &fr);
 	mutex_unlock(&server->srch_mutex);
-	if (ret < 0) /* XXX very bad, leaks allocators */
+	if (ret < 0)
 		goto apply;

 	/* reclaim allocators if they were set by _srch_commit_ */
@@ -2067,10 +2144,10 @@ static int server_srch_commit_compact(struct super_block *sb,
 	      scoutfs_alloc_splice_list(sb, &server->alloc, &server->wri,
 					server->other_freed, &fr);
 	mutex_unlock(&server->alloc_mutex);
+	WARN_ON(ret < 0); /* XXX leaks allocators */
 apply:
 	ret = server_apply_commit(sb, &hold, ret);
 out:
-	WARN_ON(ret < 0); /* XXX leaks allocators */
 	return scoutfs_net_response(sb, conn, cmd, id, ret, NULL, 0);
 }

@@ -2533,7 +2610,7 @@ static void server_log_merge_free_work(struct work_struct *work)

 		ret = scoutfs_btree_free_blocks(sb, &server->alloc,
 						&server->wri, &fr.key,
-						&fr.root, COMMIT_HOLD_ALLOC_BUDGET / 2);
+						&fr.root, COMMIT_HOLD_ALLOC_BUDGET / 8);
 		if (ret < 0) {
 			err_str = "freeing log btree";
 			break;
@@ -2552,7 +2629,7 @@ static void server_log_merge_free_work(struct work_struct *work)
 		/* freed blocks are in allocator, we *have* to update fr */
 		BUG_ON(ret < 0);

-		if (server_hold_alloc_used_since(sb, &hold) >= COMMIT_HOLD_ALLOC_BUDGET / 2) {
+		if (server_hold_alloc_used_since(sb, &hold) >= (COMMIT_HOLD_ALLOC_BUDGET * 3) / 4) {
 			mutex_unlock(&server->logs_mutex);
 			ret = server_apply_commit(sb, &hold, ret);
 			commit = false;
@@ -0,0 +1,45 @@
+#!/bin/bash
+
+#
+# Unfortunately, kernels can ship which contain sparse errors that are
+# unrelated to us.
+#
+# The exit status of this filtering wrapper will indicate an error if
+# sparse wasn't found or if there were any unfiltered output lines.  It
+# can hide error exit status from sparse or grep if they don't produce
+# output that makes it past the filters.
+#
+
+# must have sparse.  Only which's stdout (the found path) is hidden, any error it prints stays visible.
+which sparse > /dev/null || exit 1
+
+# initial unmatchable, additional added as RE+="|..."
+RE="$^"
+
+#
+# Darn.  sparse has multi-line error messages, and I'd rather not bother
+# with multi-line filters.  So we'll just drop this context.
+#
+# command-line: note: in included file (through include/linux/netlink.h, include/linux/ethtool.h, include/linux/netdevice.h, include/net/sock.h, /root/scoutfs/kmod/src/kernelcompat.h, builtin):
+#         fprintf(stderr, "%s: note: in included file%s:\n",
+#
+RE+="|: note: in included file"
+
+# 3.10.0-1160.119.1.el7.x86_64.debug
+# include/linux/posix_acl.h:138:9: warning: incorrect type in assignment (different address spaces)
+# include/linux/posix_acl.h:138:9:    expected struct posix_acl *<noident>
+# include/linux/posix_acl.h:138:9:    got struct posix_acl [noderef] <asn:4>*<noident>
+RE+="|include/linux/posix_acl.h:"
+
+# 3.10.0-1160.119.1.el7.x86_64.debug
+# include/uapi/linux/perf_event.h:146:56: warning: cast truncates bits from constant value (8000000000000000 becomes 0)
+RE+="|include/uapi/linux/perf_event.h:"
+
+# 4.18.0-513.24.1.el8_9.x86_64+debug
+# ./include/linux/skbuff.h:824:1: warning: directive in macro's argument list
+RE+="|include/linux/skbuff.h:"
+
+sparse "$@" |& \
+	grep -E -v "($RE)" |& \
+	awk '{ print $0 } END { exit NR > 0 }'	# print what survived the filter, exit 1 if anything did
+exit $?	# status of the pipeline above
@@ -62,7 +62,7 @@
 * re-allocated and re-written.  Search can restart by checking the
 * btree for the current set of files.  Compaction reads log files which
 * are protected from other compactions by the persistent busy items
- * created by the server.  Compaction won't see it's blocks reused out
+ * created by the server.  Compaction won't see its blocks reused out
 * from under it, but it can encounter stale cached blocks that need to
 * be invalidated.
 */
@@ -442,6 +442,10 @@ out:
 	if (ret == 0 && (flags & GFB_INSERT) && blk >= le64_to_cpu(sfl->blocks))
 		sfl->blocks = cpu_to_le64(blk + 1);

+	if (bl) {
+		trace_scoutfs_get_file_block(sb, bl->blkno, flags);
+	}
+
 	*bl_ret = bl;
 	return ret;
 }
@@ -533,23 +537,35 @@ out:
 * the pairs cancel each other out by all readers (the second encoding
 * looks like deletion) so they aren't visible to the first/last bounds of
 * the block or file.
+ *
+ * We use the same entry repeatedly, so the diff between them will be empty.
+ * This lets us just emit the two-byte count word, leaving the other bytes
+ * as zero.
+ *
+ * Split the desired total len into two pieces, adding any remainder to the
+ * first four-bit value.
 */
-static int append_padded_entry(struct scoutfs_srch_file *sfl, u64 blk,
-			       struct scoutfs_srch_block *srb, struct scoutfs_srch_entry *sre)
+static void append_padded_entry(struct scoutfs_srch_file *sfl,
+				struct scoutfs_srch_block *srb,
+				int len)
 {
-	int ret;
+	int each;
+	int rem;
+	u16 lengths = 0;
+	u8 *buf = srb->entries + le32_to_cpu(srb->entry_bytes);

-	ret = encode_entry(srb->entries + le32_to_cpu(srb->entry_bytes),
-			   sre, &srb->tail);
-	if (ret > 0) {
-		srb->tail = *sre;
-		le32_add_cpu(&srb->entry_nr, 1);
-		le32_add_cpu(&srb->entry_bytes, ret);
-		le64_add_cpu(&sfl->entries, 1);
-		ret = 0;
-	}
+	each = (len - 2) >> 1;
+	rem = (len - 2) & 1;

-	return ret;
+	lengths |= each + rem;
+	lengths |= each << 4;
+
+	memset(buf, 0, len);
+	put_unaligned_le16(lengths, buf);
+
+	le32_add_cpu(&srb->entry_nr, 1);
+	le32_add_cpu(&srb->entry_bytes, len);
+	le64_add_cpu(&sfl->entries, 1);
 }

 /*
@@ -560,61 +576,41 @@ static int append_padded_entry(struct scoutfs_srch_file *sfl, u64 blk,
 * This is called when there is a single existing entry in the block.
 * We have the entire block to work with.  We encode pairs of matching
 * entries.  This hides them from readers (both searches and merging) as
- * they're interpreted as creation and deletion and are deleted.  We use
- * the existing hash value of the first entry in the block but then set
- * the inode to an impossibly large number so it doesn't interfere with
- * anything.
+ * they're interpreted as creation and deletion and are deleted.
 *
- * To hit the specific offset we very carefully manage the amount of
- * bytes of change between fields in the entry.  We know that if we
- * change all the byte of the ino and id we end up with a 20 byte
- * (2+8+8,2) encoding of the pair of entries.  To have the last entry
- * start at the _SAFE_POS offset we know that the final 20 byte pair
- * encoding needs to end at 2 bytes (second entry encoding) after the
- * _SAFE_POS offset.
+ * For simplicity and to maintain sort ordering within the block, we reuse
+ * the existing entry. This lets us skip the encoding step, because we know
+ * the diff will be zero. We can zero-pad the resulting entries to hit the
+ * target offset exactly.
 *
- * So as we encode pairs we watch the delta of our current offset from
- * that desired final offset of 2 past _SAFE_POS.  If we're a multiple
- * of 20 away then we encode the full 20 byte pairs.  If we're not, then
- * we drop a byte to encode 19 bytes.  That'll slowly change the offset
- * to be a multiple of 20 again while encoding large entries.
+ * Because we can't predict the exact number of entry_bytes when we start,
+ * we adjust the byte count of subsequent entries until we wind up at a
+ * multiple of 20 bytes away from our goal and then use that length for
+ * the remaining entries.
+ *
+ * We could just use a single pair of unnaturally large entries to consume
+ * the needed space, adjusting for an odd number of entry_bytes if necessary.
+ * The use of 19 or 20 bytes for the entry pair matches what we would see with
+ * real (non-zero) entries that vary from the existing entry.
 */
-static void pad_entries_at_safe(struct scoutfs_srch_file *sfl, u64 blk,
+static void pad_entries_at_safe(struct scoutfs_srch_file *sfl,
 				struct scoutfs_srch_block *srb)
 {
-	struct scoutfs_srch_entry sre;
 	u32 target;
 	s32 diff;
-	u64 hash;
-	u64 ino;
-	u64 id;
-	int ret;
-
-	hash = le64_to_cpu(srb->tail.hash);
-	ino = le64_to_cpu(srb->tail.ino) | (1ULL << 62);
-	id = le64_to_cpu(srb->tail.id);

 	target = SCOUTFS_SRCH_BLOCK_SAFE_BYTES + 2;

 	while ((diff = target - le32_to_cpu(srb->entry_bytes)) > 0) {
-		ino ^= 1ULL << (7 * 8);
+		append_padded_entry(sfl, srb, 10);
 		if (diff % 20 == 0) {
-			id ^= 1ULL << (7 * 8);
+			append_padded_entry(sfl, srb, 10);
 		} else {
-			id ^= 1ULL << (6 * 8);
+			append_padded_entry(sfl, srb, 9);
 		}
-
-		sre.hash = cpu_to_le64(hash);
-		sre.ino = cpu_to_le64(ino);
-		sre.id = cpu_to_le64(id);
-
-		ret = append_padded_entry(sfl, blk, srb, &sre);
-		if (ret == 0)
-			ret = append_padded_entry(sfl, blk, srb, &sre);
-		BUG_ON(ret != 0);
-
-		diff = target - le32_to_cpu(srb->entry_bytes);
 	}
+
+	WARN_ON_ONCE(diff != 0);
 }

 /*
@@ -749,14 +745,14 @@ static int search_log_file(struct super_block *sb,
 		for (i = 0; i < le32_to_cpu(srb->entry_nr); i++) {
 			if (pos > SCOUTFS_SRCH_BLOCK_SAFE_BYTES) {
 				/* can only be inconsistency :/ */
-				ret = EIO;
+				ret = -EIO;
 				break;
 			}

 			ret = decode_entry(srb->entries + pos, &sre, &prev);
 			if (ret <= 0) {
 				/* can only be inconsistency :/ */
-				ret = EIO;
+				ret = -EIO;
 				break;
 			}
 			pos += ret;
@@ -859,15 +855,15 @@ static int search_sorted_file(struct super_block *sb,

 		if (pos > SCOUTFS_SRCH_BLOCK_SAFE_BYTES) {
 			/* can only be inconsistency :/ */
-			ret = EIO;
-			break;
+			ret = -EIO;
+			goto out;
 		}

 		ret = decode_entry(srb->entries + pos, &sre, &prev);
 		if (ret <= 0) {
 			/* can only be inconsistency :/ */
-			ret = EIO;
-			break;
+			ret = -EIO;
+			goto out;
 		}
 		pos += ret;
 		prev = sre;
@@ -972,6 +968,8 @@ int scoutfs_srch_search_xattrs(struct super_block *sb,

 	scoutfs_inc_counter(sb, srch_search_xattrs);

+	trace_scoutfs_ioc_search_xattrs(sb, ino, last_ino);
+
 	*done = false;
 	srch_init_rb_root(sroot);

@@ -1408,7 +1406,7 @@ int scoutfs_srch_commit_compact(struct super_block *sb,
 			ret = -EIO;
 		scoutfs_btree_put_iref(&iref);
 	}
-	if (ret < 0) /* XXX leaks allocators */
+	if (ret < 0)
 		goto out;

 	/* restore busy to pending if the operation failed */
@@ -1428,10 +1426,8 @@ int scoutfs_srch_commit_compact(struct super_block *sb,
 	/* update file references if we finished compaction (!deleting) */
 	if (!(res->flags & SCOUTFS_SRCH_COMPACT_FLAG_DELETE)) {
 		ret = commit_files(sb, alloc, wri, root, res);
-		if (ret < 0) {
-			/* XXX we can't commit, shutdown? */
+		if (ret < 0)
 			goto out;
-		}

 		/* transition flags for deleting input files */
 		for (i = 0; i < res->nr; i++) {
@@ -1458,7 +1454,7 @@ update:
 			      le64_to_cpu(pending->id), 0);
 		ret = scoutfs_btree_insert(sb, alloc, wri, root, &key,
 					   pending, sizeof(*pending));
-		if (ret < 0)
+		if (WARN_ON_ONCE(ret < 0)) /* XXX inconsistency */
 			goto out;
 	}

@@ -1471,7 +1467,6 @@ update:
 		BUG_ON(err); /* both busy and pending present */
 	}
 out:
-	WARN_ON_ONCE(ret < 0); /* XXX inconsistency */
 	kfree(busy);
 	return ret;
 }
@@ -1669,7 +1664,7 @@ static int kway_merge(struct super_block *sb,
 			/* end sorted block on _SAFE offset for testing */
 			if (bl && le32_to_cpu(srb->entry_nr) == 1 && logs_input &&
 			    scoutfs_trigger(sb, SRCH_COMPACT_LOGS_PAD_SAFE)) {
-				pad_entries_at_safe(sfl, blk, srb);
+				pad_entries_at_safe(sfl, srb);
 				scoutfs_block_put(sb, bl);
 				bl = NULL;
 				blk++;
@@ -1802,7 +1797,7 @@ static void swap_page_sre(void *A, void *B, int size)
 * typically, ~10x worst case).
 *
 * Because we read and sort all the input files we must perform the full
- * compaction in one operation.  The server must have given us a
+ * compaction in one operation.  The server must have given us
 * sufficiently large avail/freed lists, otherwise we'll return ENOSPC.
 */
 static int compact_logs(struct super_block *sb,
@@ -1866,14 +1861,14 @@ static int compact_logs(struct super_block *sb,

 		if (pos > SCOUTFS_SRCH_BLOCK_SAFE_BYTES) {
 			/* can only be inconsistency :/ */
-			ret = EIO;
-			break;
+			ret = -EIO;
+			goto out;
 		}

 		ret = decode_entry(srb->entries + pos, sre, &prev);
 		if (ret <= 0) {
 			/* can only be inconsistency :/ */
-			ret = EIO;
+			ret = -EIO;
 			goto out;
 		}
 		prev = *sre;
@@ -2281,12 +2276,11 @@ static void scoutfs_srch_compact_worker(struct work_struct *work)
 	} else {
 		ret = -EINVAL;
 	}
-	if (ret < 0)
-		goto commit;

-	ret = scoutfs_alloc_prepare_commit(sb, &alloc, &wri) ?:
-	      scoutfs_block_writer_write(sb, &wri);
-commit:
+	scoutfs_alloc_prepare_commit(sb, &alloc, &wri);
+	if (ret == 0)
+		ret = scoutfs_block_writer_write(sb, &wri);
+
 	/* the server won't use our partial compact if _ERROR is set */
 	sc->meta_avail = alloc.avail;
 	sc->meta_freed = alloc.freed;
@@ -2303,7 +2297,7 @@ out:
 		scoutfs_inc_counter(sb, srch_compact_error);

 	scoutfs_block_writer_forget_all(sb, &wri);
-	queue_compact_work(srinf, sc->nr > 0 && ret == 0);
+	queue_compact_work(srinf, sc != NULL && sc->nr > 0 && ret == 0);

 	kfree(sc);
 }
@@ -512,9 +512,9 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)

 	sbi = kzalloc(sizeof(struct scoutfs_sb_info), GFP_KERNEL);
 	sb->s_fs_info = sbi;
-	sbi->sb = sb;
 	if (!sbi)
 		return -ENOMEM;
+	sbi->sb = sb;

 	ret = assign_random_id(sbi);
 	if (ret < 0)
@@ -196,7 +196,7 @@ static int retry_forever(struct super_block *sb, int (*func)(struct super_block
 			}

 			if (scoutfs_forcing_unmount(sb)) {
-				ret = -EIO;
+				ret = -ENOLINK;
 				break;
 			}

@@ -252,7 +252,7 @@ void scoutfs_trans_write_func(struct work_struct *work)
 	}

 	if (scoutfs_forcing_unmount(sb)) {
-		ret = -EIO;
+		ret = -ENOLINK;
 		goto out;
 	}

@@ -742,7 +742,7 @@ int scoutfs_xattr_set_locked(struct inode *inode, const char *name, size_t name_
 	int ret;
 	int err;

-	trace_scoutfs_xattr_set(sb, name_len, value, size, flags);
+	trace_scoutfs_xattr_set(sb, ino, name_len, value, size, flags);

 	if (WARN_ON_ONCE(tgs->totl && tgs->indx) ||
 	    WARN_ON_ONCE((tgs->totl | tgs->indx) && !tag_lock))
@@ -140,6 +140,9 @@ t_filter_dmesg()
 	re="$re|scoutfs .* error.*server failed to bind to.*"
 	re="$re|scoutfs .* critical transaction commit failure.*"

+	# ENOLINK (-67) indicates an expected forced unmount error
+	re="$re|scoutfs .* error -67 .*"
+
 	# change-devices causes loop device resizing
 	re="$re|loop: module loaded"
 	re="$re|loop[0-9].* detected capacity change from.*"
@@ -1,4 +1,3 @@
-== setting longer hung task timeout
 == creating fragmented extents
 == unlink file with moved extents to free extents per block
 == cleanup
@@ -49,7 +49,7 @@ offline wating should be empty:
 0
 == truncating does wait
 truncate should be waiting for first block:
-trunate should no longer be waiting:
+truncate should no longer be waiting:
 0
 == writing waits
 should be waiting for write
@@ -69,6 +69,7 @@ $(basename $0) options:
    -r <dir>  | Specify the directory in which to store results of
              | test runs.  The directory will be created if it doesn't
              | exist.  Previous results will be deleted as each test runs.
+    -R        | shuffle the test order randomly using shuf
    -s        | Skip git repo checkouts.
    -t        | Enabled trace events that match the given glob argument.
              | Multiple options enable multiple globbed events.
@@ -89,6 +90,7 @@ done
 # set some T_ defaults
 T_TRACE_DUMP="0"
 T_TRACE_PRINTK="0"
+T_PORT_START="19700"

 # array declarations to be able to use array ops
 declare -a T_TRACE_GLOB
@@ -164,6 +166,9 @@ while true; do
 		T_RESULTS="$2"
 		shift
 		;;
+	-R)
+		T_SHUF="1"
+		;;
 	-s)
 	        T_SKIP_CHECKOUT="1"
 		;;
@@ -261,13 +266,37 @@ for e in T_META_DEVICE T_DATA_DEVICE T_EX_META_DEV T_EX_DATA_DEV T_KMOD T_RESULT
 	eval $e=\"$(readlink -f "${!e}")\"
 done

+# best-effort check that our listening ports don't overlap the local dynamic port range
+T_TEST_PORT="$T_PORT_START"
+T_SCRATCH_PORT="$((T_PORT_START + 100))"
+T_DEV_PORT="$((T_PORT_START + 200))"
+read local_start local_end < /proc/sys/net/ipv4/ip_local_port_range
+if [ -n "$local_start" -a -n "$local_end" -a "$local_start" -lt "$local_end" ]; then
+	if [ ! "$T_DEV_PORT" -lt "$local_start" -a ! "$T_TEST_PORT" -gt "$local_end" ]; then
+		die "listening port range $T_TEST_PORT - $T_DEV_PORT is within local dynamic port range $local_start - $local_end in /proc/sys/net/ipv4/ip_local_port_range"
+	fi
+fi
+
+# permute sequence?
+T_SEQUENCE=sequence
+if [ -n "$T_SHUF" ]; then
+	msg "shuffling test order"
+	shuf sequence -o sequence.shuf
+	# keep xfstests at the end
+	if grep -q 'xfstests.sh' sequence.shuf ; then
+		sed -i '/xfstests.sh/d' sequence.shuf
+		echo "xfstests.sh" >> sequence.shuf
+	fi
+	T_SEQUENCE=sequence.shuf
+fi
+
 # include everything by default
 test -z "$T_INCLUDE" && T_INCLUDE="-e '.*'"
 # (quickly) exclude nothing by default
 test -z "$T_EXCLUDE" && T_EXCLUDE="-e '\Zx'"

 # eval to strip re ticks but not expand
-tests=$(grep -v "^#" sequence |
+tests=$(grep -v "^#" $T_SEQUENCE |
 	eval grep "$T_INCLUDE" | eval grep -v "$T_EXCLUDE")
 test -z "$tests" && \
 	die "no tests found by including $T_INCLUDE and excluding $T_EXCLUDE"
@@ -346,7 +375,7 @@ fi
 quo=""
 if [ -n "$T_MKFS" ]; then
 	for i in $(seq -0 $((T_QUORUM - 1))); do
-		quo="$quo -Q $i,127.0.0.1,$((42000 + i))"
+		quo="$quo -Q $i,127.0.0.1,$((T_TEST_PORT + i))"
 	done

 	msg "making new filesystem with $T_QUORUM quorum members"
@@ -15,7 +15,7 @@ echo "== prepare devices, mount point, and logs"
 SCR="$T_TMPDIR/mnt.scratch"
 mkdir -p "$SCR"
 > $T_TMP.mount.out
-scoutfs mkfs -f -Q 0,127.0.0.1,53000 "$T_EX_META_DEV" "$T_EX_DATA_DEV" > $T_TMP.mkfs.out 2>&1 \
+scoutfs mkfs -f -Q 0,127.0.0.1,$T_SCRATCH_PORT "$T_EX_META_DEV" "$T_EX_DATA_DEV" > $T_TMP.mkfs.out 2>&1 \
 	|| t_fail "mkfs failed"

 echo "== bad devices, bad options"
@@ -11,7 +11,7 @@ truncate -s $sz "$T_TMP.equal"
 truncate -s $large_sz "$T_TMP.large"

 echo "== make scratch fs"
-t_quiet scoutfs mkfs -f -Q 0,127.0.0.1,53000 "$T_EX_META_DEV" "$T_EX_DATA_DEV"
+t_quiet scoutfs mkfs -f -Q 0,127.0.0.1,$T_SCRATCH_PORT "$T_EX_META_DEV" "$T_EX_DATA_DEV"
 SCR="$T_TMPDIR/mnt.scratch"
 mkdir -p "$SCR"

@@ -57,7 +57,7 @@ test "$before" == "$after" || \
 # XXX this is all pretty manual, would be nice to have helpers
 echo "== make small meta fs"
 # meta device just big enough for reserves and the metadata we'll fill
-scoutfs mkfs -A -f -Q 0,127.0.0.1,53000 -m 10G "$T_EX_META_DEV" "$T_EX_DATA_DEV" > $T_TMP.mkfs.out 2>&1 || \
+scoutfs mkfs -A -f -Q 0,127.0.0.1,$T_SCRATCH_PORT -m 10G "$T_EX_META_DEV" "$T_EX_DATA_DEV" > $T_TMP.mkfs.out 2>&1 || \
 	t_fail "mkfs failed"
 SCR="$T_TMPDIR/mnt.scratch"
 mkdir -p "$SCR"
@@ -11,8 +11,8 @@
 # format version.
 #

-# not supported on el9!
-if [ $(source /etc/os-release ; echo ${VERSION_ID:0:1}) -gt 8 ]; then
+# not supported on el8 or higher
+if [ $(source /etc/os-release ; echo ${VERSION_ID:0:1}) -gt 7 ]; then
 	t_skip_permitted "Unsupported OS version"
 fi

@@ -89,7 +89,7 @@ for vers in $(seq $MIN $((MAX - 1))); do
 	old_module="$builds/$vers/scoutfs.ko"

 	echo "mkfs $vers" >> "$T_TMP.log"
-	t_quiet $old_scoutfs mkfs -f -Q 0,127.0.0.1,53000 "$T_EX_META_DEV" "$T_EX_DATA_DEV" \
+	t_quiet $old_scoutfs mkfs -f -Q 0,127.0.0.1,$T_SCRATCH_PORT "$T_EX_META_DEV" "$T_EX_DATA_DEV" \
 		|| t_fail "mkfs $vers failed"

 	echo "mount $vers with $vers" >> "$T_TMP.log"
@@ -10,30 +10,6 @@ EXTENTS_PER_BTREE_BLOCK=600
 EXTENTS_PER_LIST_BLOCK=8192
 FREED_EXTENTS=$((EXTENTS_PER_BTREE_BLOCK * EXTENTS_PER_LIST_BLOCK))

-#
-# This test specifically creates a pathologically sparse file that will
-# be as expensive as possible to free.  This is usually fine on
-# dedicated or reasonable hardware, but trying to run this in
-# virtualized debug kernels can take a very long time.  This test is
-# about making sure that the server doesn't fail, not that the platform
-# can handle the scale of work that our btree formats happen to require
-# while execution is bogged down with use-after-free memory reference
-# tracking.  So we give the test a lot more breathing room before
-# deciding that its hung.
-#
-echo "== setting longer hung task timeout"
-if [ -w /proc/sys/kernel/hung_task_timeout_secs ]; then
-	secs=$(cat /proc/sys/kernel/hung_task_timeout_secs)
-	test "$secs" -gt 0 || \
-		t_fail "confusing value '$secs' from /proc/sys/kernel/hung_task_timeout_secs"
-	restore_hung_task_timeout()
-	{
-		echo "$secs" > /proc/sys/kernel/hung_task_timeout_secs
-	}
-	trap restore_hung_task_timeout EXIT
-	echo "$((secs * 5))" > /proc/sys/kernel/hung_task_timeout_secs
-fi
-
 echo "== creating fragmented extents"
 fragmented_data_extents $FREED_EXTENTS $EXTENTS_PER_BTREE_BLOCK "$T_D0/alloc" "$T_D0/move"

@@ -157,7 +157,7 @@ echo "truncate should be waiting for first block:"
 expect_wait "$DIR/file" "change_size" $ino 0
 scoutfs stage "$DIR/golden" "$DIR/file" -V "$vers" -o 0 -l $BYTES
 sleep .1
-echo "trunate should no longer be waiting:"
+echo "truncate should no longer be waiting:"
 scoutfs data-waiting -B 0 -I 0 -p "$DIR" | wc -l
 cat "$DIR/golden" > "$DIR/file"
 vers=$(scoutfs stat -s data_version "$DIR/file")
@@ -168,10 +168,13 @@ scoutfs release "$DIR/file" -V "$vers" -o 0 -l $BYTES
 # overwrite, not truncate+write
 dd if="$DIR/other" of="$DIR/file" \
 	bs=$BS count=$BLOCKS conv=notrunc status=none &
+pid="$!"
 sleep .1
 echo "should be waiting for write"
 expect_wait "$DIR/file" "write" $ino 0
 scoutfs stage "$DIR/golden" "$DIR/file" -V "$vers" -o 0 -l $BYTES
+# wait for the background dd to complete
+wait "$pid" 2> /dev/null
 cmp "$DIR/file" "$DIR/other"

 echo "== cleanup"
@@ -67,18 +67,49 @@ t_mount_all
 while test -d $(echo /sys/fs/scoutfs/*/fence/* | cut -d " " -f 1); do
 	sleep .5
 done
-# wait for orphan scans to run
-t_set_all_sysfs_mount_options orphan_scan_delay_ms 1000
-# also have to wait for delayed log merge work from mount
-C=120
-while (( C-- )); do
-	brk=1
-	for ino in $inos; do
-		inode_exists $ino && brk=0
-	done
-	test $brk -eq 1 && break
+
+
+sv=$(t_server_nr)
+
+# wait for reclaim_open_log_tree() to complete for each mount
+while [ $(t_counter reclaimed_open_logs $sv) -lt $T_NR_MOUNTS ]; do
 	sleep 1
 done
+
+# wait for finalize_and_start_log_merge() to find no active merges in flight
+# and not find any finalized trees
+while [ $(t_counter log_merge_no_finalized $sv) -lt 1 ]; do
+	sleep 1
+done
+
+# wait for orphan scans to run
+t_set_all_sysfs_mount_options orphan_scan_delay_ms 1000
+# wait until we see two consecutive orphan scan attempts without
+# any inode deletion forward progress in each mount
+for nr in $(t_fs_nrs); do
+	C=0
+	LOSA=$(t_counter orphan_scan_attempts $nr)
+	LDOP=$(t_counter inode_deleted $nr)
+
+	while [ $C -lt 2 ]; do
+		sleep 1
+
+		OSA=$(t_counter orphan_scan_attempts $nr)
+		DOP=$(t_counter inode_deleted $nr)
+
+		if [ $OSA != $LOSA ]; then
+			if [ $DOP == $LDOP ]; then
+				(( C++ ))
+			else
+				C=0
+			fi
+		fi
+
+		LOSA=$OSA
+		LDOP=$DOP
+	done
+done
+
 for ino in $inos; do
 	inode_exists $ino && echo "$ino still exists"
 done
@@ -72,7 +72,7 @@ quarter_data=$(echo "$size_data / 4" | bc)

 # XXX this is all pretty manual, would be nice to have helpers
 echo "== make initial small fs"
-scoutfs mkfs -A -f -Q 0,127.0.0.1,53000 -m $quarter_meta -d $quarter_data \
+scoutfs mkfs -A -f -Q 0,127.0.0.1,$T_SCRATCH_PORT -m $quarter_meta -d $quarter_data \
 	"$T_EX_META_DEV" "$T_EX_DATA_DEV" > $T_TMP.mkfs.out 2>&1 || \
 		t_fail "mkfs failed"
 SCR="$T_TMPDIR/mnt.scratch"
@@ -50,9 +50,9 @@ t_quiet sync
 cat << EOF > local.config
 export FSTYP=scoutfs
 export MKFS_OPTIONS="-f"
-export MKFS_TEST_OPTIONS="-Q 0,127.0.0.1,42000"
-export MKFS_SCRATCH_OPTIONS="-Q 0,127.0.0.1,43000"
-export MKFS_DEV_OPTIONS="-Q 0,127.0.0.1,44000"
+export MKFS_TEST_OPTIONS="-Q 0,127.0.0.1,$T_TEST_PORT"
+export MKFS_SCRATCH_OPTIONS="-Q 0,127.0.0.1,$T_SCRATCH_PORT"
+export MKFS_DEV_OPTIONS="-Q 0,127.0.0.1,$T_DEV_PORT"
 export TEST_DEV=$T_DB0
 export TEST_DIR=$T_M0
 export SCRATCH_META_DEV=$T_EX_META_DEV
@@ -130,6 +130,23 @@ the server for the filesystem if it is elected leader.
 The assigned number must match one of the slots defined with \-Q options
 when the filesystem was created with mkfs.  If the number assigned
 doesn't match a number created during mkfs then the mount will fail.
+.TP
+.B tcp_keepalive_timeout_ms=<number>
+This option sets the amount of time, in milliseconds, that a client
+connection will wait for active TCP packets before deciding that
+the connection is dead. This setting is per-mount and only changes
+the behavior of that mount.
+.sp
+The default value of this setting is 60000msec (60s). Any precision
+beyond a whole second is likely unrealistic due to the nature of
+TCP keepalive mechanisms in the Linux kernel. Valid values are any
+value higher than 3000 (3s).
+.sp
+The TCP keepalive mechanism is complex and observing a lost connection
+quickly is important to maintain cluster stability. If the local
+network suffers from intermittent outages this option may provide
+some respite to overcome these outages without the cluster becoming
+desynchronized.
 .SH VOLUME OPTIONS
 Volume options are persistent options which are stored in the super
 block in the metadata device and which apply to all mounts of the volume.
@@ -1,7 +1,7 @@
 #!/bin/bash

-# can we find sparse?  If not, we're done.
-which sparse > /dev/null 2>&1 || exit 0
+# must have sparse.  Fail with error message, mask success path.
+which sparse > /dev/null || exit 1

 # 
 # one of the problems with using sparse in userspace is that it picks up
@@ -22,6 +22,11 @@ RE="$RE|warning: memset with byte count of 4194304"
 # some sparse versions don't know about some builtins
 RE="$RE|error: undefined identifier '__builtin_fpclassify'"

+# on el8, sparse can't handle __has_include for some reason when _GNU_SOURCE
+# is defined, and we need that for O_DIRECT.
+RE="$RE|note: in included file .through /usr/include/sys/stat.h.:"
+RE="$RE|/usr/include/bits/statx.h:30:6: error: "
+
 #
 # don't filter out 'too many errors' here, it can signify that
 # sparse doesn't understand something and is throwing a *ton*