Swap meta allocators with much more reserve.

We increase the reserve from 2x to 3x the minimum number of blocks needed for our reserve, and change the algorithm that determines when to swap them. The old algorithm swaps them if _avail is just larger than _freed. While the simplest algorithm, it suffers from the problem that in practice, when we hit ENOSPC conditions, it will almost always swap on every iteration when we hit our low water mark. In our testing, we regularly see failures because what is effectively happening is that we starve both allocators by slowly draining them block by block, trying to do work. The work of course requires us to drain more blocks to commit changes. This cycle doesn't end until both allocators are almost completely drained and not enough blocks remain to do any real work anymore. The new algorithm will not swap allocators unless _freed is 50% larger than _avail. The outcome is that, during meta space pressure, we're allowing _avail to slowly drain down to the same levels as before, effectively. However, we then swap to _freed which is now 1.5x larger. This results in us being able to do a whole chunk of work without needing to swap. While draining _avail for longer, we allow work to commit and recycle blocks back into _freed much more effectively. Signed-off-by: Auke Kok <auke.kok@versity.com>
2026-01-28 22:32:02 +00:00 · 2025-02-11 17:04:28 -05:00
53 changed files with 1081 additions and 2665 deletions
--- a/ReleaseNotes.md
+++ b/ReleaseNotes.md
@@ -1,43 +1,6 @@
 Versity ScoutFS Release Notes
 =============================

---
-v1.25
-\
-*Jun 3, 2025*
-
-Fix a bug that could cause indefinite retries of failed client commits.
-Under specific error conditions the client and server's understanding of
-the current client commit could get out of sync.  The client would retry
-commits indefinitely that could never succeed.  This manifested as
-infinite "critical transaction commit failure" messages in the kernel
-log on the client and matching "error <nr> committing client logs" on
-the server.
-
-Fix a bug in a specific case of server error handling that could result
-in sending references to unwritten blocks to the client.  The client
-would try to read blocks that hadn't been written and return spurious
-errors.  This was seen under low free space conditions on the server and
-resulted in error messages with error code 116 (The errno enum for
-ESTALE, the client's indication that it couldn't read the blocks that it
-expected.)
-
---
-v1.24
-\
-*Mar 14, 2025*
-
-Add support for coherent read and write mmap() mappings of regular file
-data between mounts.
-
-Fix a bug that was causing scoutfs utilities to parse and change some
-file names before passing them on to the kernel for processing.  This
-fixes spurious scoutfs command errors for files with the offending
-patterns in their names.
-
-Fix a bug where rename wasn't updating the ctime of the inode at the
-destination name if it existed.
-
 ---
 v1.23
 \
--- a/kmod/Makefile
+++ b/kmod/Makefile
@@ -5,6 +5,13 @@ ifeq ($(SK_KSRC),)
 SK_KSRC := $(shell echo /lib/modules/`uname -r`/build)
 endif

+# run the sparse pass, failing the build on its errors, only if sparse is installed
+ifeq ($(shell sparse && echo found),found)
+SP =
+else
+SP = @:
+endif
+
 SCOUTFS_GIT_DESCRIBE ?= \
 	$(shell git describe --all --abbrev=6 --long 2>/dev/null || \
 		echo no-git)
@@ -29,7 +36,9 @@ TARFILE = scoutfs-kmod-$(RPM_VERSION).tar
 all: module

 module:
-	$(MAKE) CHECK=$(CURDIR)/src/sparse-filtered.sh C=1 CF="-D__CHECK_ENDIAN__" $(SCOUTFS_ARGS)
+	$(MAKE) $(SCOUTFS_ARGS)
+	$(SP) $(MAKE) C=2 CF="-D__CHECK_ENDIAN__" $(SCOUTFS_ARGS)
+

 modules_install:
 	$(MAKE) $(SCOUTFS_ARGS) modules_install
--- a/kmod/src/Makefile.kernelcompat
+++ b/kmod/src/Makefile.kernelcompat
@@ -6,6 +6,26 @@

 ccflags-y += -include $(src)/kernelcompat.h

+#
+# v3.10-rc6-21-gbb6f619b3a49
+#
+# _readdir changes from fop->readdir() to fop->iterate() and from
+# filldir(dirent) to dir_emit(ctx).
+#
+ifneq (,$(shell grep 'iterate.*dir_context' include/linux/fs.h))
+ccflags-y += -DKC_ITERATE_DIR_CONTEXT
+endif
+
+#
+# v3.10-rc6-23-g5f99f4e79abc
+#
+# Helpers including dir_emit_dots() are added in the process of
+# switching dcache_readdir() from fop->readdir() to fop->iterate()
+#
+ifneq (,$(shell grep 'dir_emit_dots' include/linux/fs.h))
+ccflags-y += -DKC_DIR_EMIT_DOTS
+endif
+
 #
 # v3.18-rc2-19-gb5ae6b15bd73
 # 
@@ -158,6 +178,15 @@ ifneq (,$(shell grep 'sock_create_kern.*struct net' include/linux/net.h))
 ccflags-y += -DKC_SOCK_CREATE_KERN_NET=1
 endif

+#
+# v3.18-rc6-1619-gc0371da6047a
+#
+# iov_iter is now part of struct msghdr
+#
+ifneq (,$(shell grep 'struct iov_iter.*msg_iter' include/linux/socket.h))
+ccflags-y += -DKC_MSGHDR_STRUCT_IOV_ITER=1
+endif
+
 #
 # v4.17-rc6-7-g95582b008388
 #
@@ -402,63 +431,3 @@ endif
 ifneq (,$(shell grep 'struct file.*bdev_file_open_by_path.const char.*path' include/linux/blkdev.h))
 ccflags-y += -DKC_BDEV_FILE_OPEN_BY_PATH
 endif
-
-# v4.0-rc7-1796-gfe0f07d08ee3
-#
-# direct-io changes modify inode_dio_done to now be called inode_dio_end
-ifneq (,$(shell grep 'void inode_dio_end.struct inode' include/linux/fs.h))
-ccflags-y += -DKC_INODE_DIO_END
-endif
-
-#
-# v5.0-6476-g3d3539018d2c
-#
-# page fault handlers return a bitmask vm_fault_t instead
-# Note: el8's header has a slightly modified prefix here
-ifneq (,$(shell grep 'typedef.*__bitwise unsigned.*int vm_fault_t' include/linux/mm_types.h))
-ccflags-y += -DKC_MM_VM_FAULT_T
-endif
-
-# v3.19-499-gd83a08db5ba6
-#
-# .remap pages becomes obsolete
-ifneq (,$(shell grep 'int ..remap_pages..struct vm_area_struct' include/linux/mm.h))
-ccflags-y += -DKC_MM_REMAP_PAGES
-endif
-
-#
-# v3.19-4742-g503c358cf192
-#
-# list_lru_shrink_count() and list_lru_shrink_walk() introduced
-#
-ifneq (,$(shell grep 'list_lru_shrink_count.*struct list_lru' include/linux/list_lru.h))
-ccflags-y += -DKC_LIST_LRU_SHRINK_COUNT_WALK
-endif
-
-#
-# v3.19-4757-g3f97b163207c
-#
-# lru_list_walk_cb lru arg added
-#
-ifneq (,$(shell grep 'struct list_head \*item, spinlock_t \*lock, void \*cb_arg' include/linux/list_lru.h))
-ccflags-y += -DKC_LIST_LRU_WALK_CB_ITEM_LOCK
-endif
-
-#
-# v6.7-rc4-153-g0a97c01cd20b
-#
-# list_lru_{add,del} -> list_lru_{add,del}_obj
-#
-ifneq (,$(shell grep '^bool list_lru_add_obj' include/linux/list_lru.h))
-ccflags-y += -DKC_LIST_LRU_ADD_OBJ
-endif
-
-#
-# v6.12-rc6-227-gda0c02516c50
-#
-# lru_list_walk_cb lock arg removed
-#
-ifneq (,$(shell grep 'struct list_lru_one \*list, spinlock_t \*lock, void \*cb_arg' include/linux/list_lru.h))
-ccflags-y += -DKC_LIST_LRU_WALK_CB_LIST_LOCK
-endif
-
--- a/kmod/src/alloc.c
+++ b/kmod/src/alloc.c
@@ -86,47 +86,18 @@ static u64 smallest_order_length(u64 len)
 }

 /*
- * Moving an extent between trees can dirty blocks in several ways. This
- * function calculates worst case number of blocks across these scenarions.
- * We treat the alloc and free counts independently, so the values below are
- * max(allocated, freed), not the sum.
- *
- * We track extents with two separate btree items: by block number and by size.
- *
- * If we're removing an extent from the btree (allocating), we can dirty
- * two blocks if the keys are in different leaves. If we wind up merging
- * leaves because we fall below the low water mark, we can wind up freeing
- * three leaves.
- *
- * That sequence is as follows, assuming the original keys are removed from
- * blocks A and B:
- *
- * Allocate new dirty A' and B'
- * Free old stable A and B
- * B' has fallen below the low water mark, so copy B' into A'
- * Free B'
- *
- * An extent insertion (freeing an extent) can dirty up to five distinct items
- * in the btree as it adds and removes the blkno and size sorted items for the
- * old and new lengths of the extent:
- *
- * In the by-blkno portion of the btree, we can dirty (allocate for COW) up
- * to two blocks- either by merging adjacent extents, which can cause us to
- * join leaf blocks; or by an insertion that causes a split.
- *
- * In the by-size portion, we never merge extents, so normally we just dirty
- * a single item with a size insertion. But if we merged adjacent extents in
- * the by-blkno portion of the tree, we might be working with three by-sizex
- * items: removing the two old ones that were combined in the merge; and
- * adding the new one for the larger, merged size.
- *
- * Finally, dirtying the paths to these leaves can grow the tree and grow/shrink
- * neighbours at each level, so we multiply by the height of the tree after
- * accounting for a possible new level.
+ * An extent modification dirties three distinct leaves of an allocator
+ * btree as it adds and removes the blkno and size sorted items for the
+ * old and new lengths of the extent.  Dirtying the paths to these
+ * leaves can grow the tree and grow/shrink neighbours at each level.
+ * We over-estimate the number of blocks allocated and freed (the paths
+ * share a root, growth doesn't free) to err on the simpler and safer
+ * side.  The overhead is minimal given the relatively large list blocks
+ * and relatively short allocator trees.
 */
 static u32 extent_mod_blocks(u32 height)
 {
-	return ((1 + height) * 3) * 5;
+	return ((1 + height) * 2) * 3;
 }

 /*
--- a/kmod/src/block.c
+++ b/kmod/src/block.c
@@ -22,7 +22,6 @@
 #include <linux/rhashtable.h>
 #include <linux/random.h>
 #include <linux/sched/mm.h>
-#include <linux/list_lru.h>

 #include "format.h"
 #include "super.h"
@@ -39,12 +38,26 @@
 * than the page size.  Callers can have their own contexts for tracking
 * dirty blocks that are written together.  We pin dirty blocks in
 * memory and only checksum them all as they're all written.
+ *
+ * Memory reclaim is driven by maintaining two very coarse groups of
+ * blocks.  As we access blocks we mark them with an increasing counter
+ * to discourage them from being reclaimed.  We then define a threshold
+ * at the current counter minus half the population.  Recent blocks have
+ * a counter greater than the threshold, and all other blocks with
+ * counters less than it are considered older and are candidates for
+ * reclaim.  This results in access updates rarely modifying an atomic
+ * counter as blocks need to be moved into the recent group, and shrink
+ * can randomly scan blocks looking for the half of the population that
+ * will be in the old group.  It's reasonably effective, but is
+ * particularly efficient and avoids contention between concurrent
+ * accesses and shrinking.
 */

 struct block_info {
 	struct super_block *sb;
+	atomic_t total_inserted;
+	atomic64_t access_counter;
 	struct rhashtable ht;
-	struct list_lru lru;
 	wait_queue_head_t waitq;
 	KC_DEFINE_SHRINKER(shrinker);
 	struct work_struct free_work;
@@ -63,15 +76,28 @@ enum block_status_bits {
 	BLOCK_BIT_PAGE_ALLOC,	/* page (possibly high order) allocation */
 	BLOCK_BIT_VIRT,		/* mapped virt allocation */
 	BLOCK_BIT_CRC_VALID,	/* crc has been verified */
-	BLOCK_BIT_ACCESSED,	/* seen by lookup since last lru add/walk */
 };

+/*
+ * We want to tie atomic changes in refcounts to whether or not the
+ * block is still visible in the hash table, so we store the hash
+ * table's reference up at a known high bit.  We could naturally set the
+ * inserted bit through excessive refcount increments.  We don't do
+ * anything about that but at least warn if we get close.
+ *
+ * We're avoiding the high byte for no real good reason, just out of a
+ * historical fear of implementations that don't provide the full
+ * precision.
+ */
+#define BLOCK_REF_INSERTED	(1U << 23)
+#define BLOCK_REF_FULL		(BLOCK_REF_INSERTED >> 1)
+
 struct block_private {
 	struct scoutfs_block bl;
 	struct super_block *sb;
 	atomic_t refcount;
+	u64 accessed;
 	struct rhash_head ht_head;
-	struct list_head lru_head;
 	struct list_head dirty_entry;
 	struct llist_node free_node;
 	unsigned long bits;
@@ -86,7 +112,7 @@ struct block_private {
 do {												\
 	__typeof__(bp) _bp = (bp);								\
 	trace_scoutfs_block_##which(_bp->sb, _bp, _bp->bl.blkno, atomic_read(&_bp->refcount),	\
-				    atomic_read(&_bp->io_count), _bp->bits);	\
+				    atomic_read(&_bp->io_count), _bp->bits, _bp->accessed);	\
 } while (0)

 #define BLOCK_PRIVATE(_bl) \
@@ -150,7 +176,6 @@ static struct block_private *block_alloc(struct super_block *sb, u64 blkno)
 	bp->bl.blkno = blkno;
 	bp->sb = sb;
 	atomic_set(&bp->refcount, 1);
-	INIT_LIST_HEAD(&bp->lru_head);
 	INIT_LIST_HEAD(&bp->dirty_entry);
 	set_bit(BLOCK_BIT_NEW, &bp->bits);
 	atomic_set(&bp->io_count, 0);
@@ -208,85 +233,32 @@ static void block_free_work(struct work_struct *work)
 }

 /*
- * Users of blocks hold a refcount.  If putting a refcount drops to zero
- * then the block is freed.
- *
- * Acquiring new references and claiming the exclusive right to tear
- * down a block is built around this LIVE_REFCOUNT_BASE refcount value.
- * As blocks are initially cached they have the live base added to their
- * refcount.  Lookups will only increment the refcount and return blocks
- * for reference holders while the refcount is >= than the base.
- *
- * To remove a block from the cache and eventually free it, either by
- * the lru walk in the shrinker, or by reference holders, the live base
- * is removed and turned into a normal refcount increment that will be
- * put by the caller.  This can only be done once for a block, and once
- * its done lookup will not return any more references.
- */
-#define LIVE_REFCOUNT_BASE (INT_MAX ^ (INT_MAX >> 1))
-
-/*
- * Inc the refcount while holding an incremented refcount.  We can't
- * have so many individual reference holders that they pass the live
- * base.
+ * Get a reference to a block while holding an existing reference.
 */
 static void block_get(struct block_private *bp)
 {
-	int now = atomic_inc_return(&bp->refcount);
+	WARN_ON_ONCE((atomic_read(&bp->refcount) & ~BLOCK_REF_INSERTED) <= 0);

-	BUG_ON(now <= 1);
-	BUG_ON(now == LIVE_REFCOUNT_BASE);
+	atomic_inc(&bp->refcount);
 }

 /*
- * if (*v >= u) {
- * 	*v += a;
- * 	return true;
- * }
- */
-static bool atomic_add_unless_less(atomic_t *v, int a, int u)
+ * Get a reference to a block as long as it's been inserted in the hash
+ * table and hasn't been removed.
+ */ 
+static struct block_private *block_get_if_inserted(struct block_private *bp)
 {
-	int c;
+	int cnt;

 	do {
-		c = atomic_read(v);
-		if (c < u)
-			return false;
-	} while (atomic_cmpxchg(v, c, c + a) != c);
+		cnt = atomic_read(&bp->refcount);
+		WARN_ON_ONCE(cnt & BLOCK_REF_FULL);
+		if (!(cnt & BLOCK_REF_INSERTED))
+			return NULL;

-	return true;
-}
+	} while (atomic_cmpxchg(&bp->refcount, cnt, cnt + 1) != cnt);

-static bool block_get_if_live(struct block_private *bp)
-{
-	return atomic_add_unless_less(&bp->refcount, 1, LIVE_REFCOUNT_BASE);
-}
-
-/*
- * If the refcount still has the live base, subtract it and increment
- * the callers refcount that they'll put.
- */
-static bool block_get_remove_live(struct block_private *bp)
-{
-	return atomic_add_unless_less(&bp->refcount, (1 - LIVE_REFCOUNT_BASE), LIVE_REFCOUNT_BASE);
-}
-
-/*
- * Only get the live base refcount if it is the only refcount remaining.
- * This means that there are no active refcount holders and the block
- * can't be dirty or under IO, which both hold references.
- */
-static bool block_get_remove_live_only(struct block_private *bp)
-{
-	int c;
-
-	do {
-		c = atomic_read(&bp->refcount);
-		if (c != LIVE_REFCOUNT_BASE)
-			return false;
-	} while (atomic_cmpxchg(&bp->refcount, c, c - LIVE_REFCOUNT_BASE + 1) != c);
-
-	return true;
+	return bp;
 }

 /*
@@ -318,81 +290,143 @@ static const struct rhashtable_params block_ht_params = {
 };

 /*
- * Insert the block into the cache so that it's visible for lookups.
- * The caller can hold references (including for a dirty block).
- *
- * We make sure the base is added and the block is in the lru once it's
- * in the hash.  If hash table insertion fails it'll be briefly visible
- * in the lru, but won't be isolated/evicted because we hold an
- * incremented refcount in addition to the live base.
+ * Insert a new block into the hash table.  Once it is inserted in the
+ * hash table readers can start getting references.  The caller may have
+ * multiple refs but the block can't already be inserted.
 */
 static int block_insert(struct super_block *sb, struct block_private *bp)
 {
 	DECLARE_BLOCK_INFO(sb, binf);
 	int ret;

-	BUG_ON(atomic_read(&bp->refcount) >= LIVE_REFCOUNT_BASE);
-	atomic_add(LIVE_REFCOUNT_BASE, &bp->refcount);
-	smp_mb__after_atomic(); /* make sure live base is visible to list_lru walk */
-	list_lru_add_obj(&binf->lru, &bp->lru_head);
+	WARN_ON_ONCE(atomic_read(&bp->refcount) & BLOCK_REF_INSERTED);
+
 retry:
+	atomic_add(BLOCK_REF_INSERTED, &bp->refcount);
 	ret = rhashtable_lookup_insert_fast(&binf->ht, &bp->ht_head, block_ht_params);
 	if (ret < 0) {
+		atomic_sub(BLOCK_REF_INSERTED, &bp->refcount);
 		if (ret == -EBUSY) {
 			/* wait for pending rebalance to finish */
 			synchronize_rcu();
 			goto retry;
-		} else {
-			atomic_sub(LIVE_REFCOUNT_BASE, &bp->refcount);
-			BUG_ON(atomic_read(&bp->refcount) >= LIVE_REFCOUNT_BASE);
-			list_lru_del_obj(&binf->lru, &bp->lru_head);
 		}
 	} else {
+		atomic_inc(&binf->total_inserted);
 		TRACE_BLOCK(insert, bp);
 	}

 	return ret;
 }

-/*
- * Indicate to the lru walker that this block has been accessed since it
- * was added or last walked.
- */
-static void block_accessed(struct super_block *sb, struct block_private *bp)
+static u64 accessed_recently(struct block_info *binf)
 {
-	if (!test_and_set_bit(BLOCK_BIT_ACCESSED, &bp->bits))
-		scoutfs_inc_counter(sb, block_cache_access_update);
+	return atomic64_read(&binf->access_counter) - (atomic_read(&binf->total_inserted) >> 1);
 }

 /*
- * Remove the block from the cache.  When this returns the block won't
- * be visible for additional references from lookup.
- *
- * We always try and remove from the hash table.  It's safe to remove a
- * block that isn't hashed, it just returns -ENOENT.
- *
- * This is racing with the lru walk in the shrinker also trying to
- * remove idle blocks from the cache.  They both try to remove the live
- * refcount base and perform their removal and put if they get it.
+ * Make sure that a block that is being accessed is less likely to be
+ * reclaimed if it is seen by the shrinker.   If the block hasn't been
+ * accessed recently we update its accessed value.
 */
-static void block_remove(struct super_block *sb, struct block_private *bp)
+static void block_accessed(struct super_block *sb, struct block_private *bp)
 {
 	DECLARE_BLOCK_INFO(sb, binf);

-	rhashtable_remove_fast(&binf->ht, &bp->ht_head, block_ht_params);
-
-	if (block_get_remove_live(bp)) {
-		list_lru_del_obj(&binf->lru, &bp->lru_head);
-		block_put(sb, bp);
+	if (bp->accessed == 0 || bp->accessed < accessed_recently(binf)) {
+		scoutfs_inc_counter(sb, block_cache_access_update);
+		bp->accessed = atomic64_inc_return(&binf->access_counter);
 	}
 }

+/*
+ * The caller wants to remove the block from the hash table and has an
+ * idea what the refcount should be.  If the refcount does still
+ * indicate that the block is hashed, and we're able to clear that bit,
+ * then we can remove it from the hash table.
+ *
+ * The caller makes sure that it's safe to be referencing this block,
+ * either with their own held reference (most everything) or by being in
+ * an rcu grace period (shrink).
+ */
+static bool block_remove_cnt(struct super_block *sb, struct block_private *bp, int cnt)
+{
+	DECLARE_BLOCK_INFO(sb, binf);
+	int ret;
+
+	if ((cnt & BLOCK_REF_INSERTED) &&
+	    (atomic_cmpxchg(&bp->refcount, cnt, cnt & ~BLOCK_REF_INSERTED) == cnt)) {
+
+		TRACE_BLOCK(remove, bp);
+		ret = rhashtable_remove_fast(&binf->ht, &bp->ht_head, block_ht_params);
+		WARN_ON_ONCE(ret); /* must have been inserted */
+		atomic_dec(&binf->total_inserted);
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Try to remove the block from the hash table as long as the refcount
+ * indicates that it is still in the hash table.  This can be racing
+ * with normal refcount changes so it might have to retry.
+ */
+static void block_remove(struct super_block *sb, struct block_private *bp)
+{
+	int cnt;
+
+	do {
+		cnt = atomic_read(&bp->refcount);
+	} while ((cnt & BLOCK_REF_INSERTED) && !block_remove_cnt(sb, bp, cnt));
+}
+
+/*
+ * Take one shot at removing the block from the hash table if it's still
+ * in the hash table and the caller has the only other reference.
+ */
+static bool block_remove_solo(struct super_block *sb, struct block_private *bp)
+{
+	return block_remove_cnt(sb, bp, BLOCK_REF_INSERTED | 1);
+}
+
 static bool io_busy(struct block_private *bp)
 {
 	smp_rmb(); /* test after adding to wait queue */
 	return test_bit(BLOCK_BIT_IO_BUSY, &bp->bits);
 }

+/*
+ * Called during shutdown with no other users.
+ */
+static void block_remove_all(struct super_block *sb)
+{
+	DECLARE_BLOCK_INFO(sb, binf);
+	struct rhashtable_iter iter;
+	struct block_private *bp;
+
+	rhashtable_walk_enter(&binf->ht, &iter);
+	rhashtable_walk_start(&iter);
+
+	for (;;) {
+		bp = rhashtable_walk_next(&iter);
+		if (bp == NULL)
+			break;
+		if (bp == ERR_PTR(-EAGAIN))
+			continue;
+
+		if (block_get_if_inserted(bp)) {
+			block_remove(sb, bp);
+			WARN_ON_ONCE(atomic_read(&bp->refcount) != 1);
+			block_put(sb, bp);
+		}
+	}
+
+	rhashtable_walk_stop(&iter);
+	rhashtable_walk_exit(&iter);
+
+	WARN_ON_ONCE(atomic_read(&binf->total_inserted) != 0);
+}

 /*
 * XXX The io_count and sb fields in the block_private are only used
@@ -454,7 +488,7 @@ static int block_submit_bio(struct super_block *sb, struct block_private *bp,
 	int ret = 0;

 	if (scoutfs_forcing_unmount(sb))
-		return -ENOLINK;
+		return -EIO;

 	sector = bp->bl.blkno << (SCOUTFS_BLOCK_LG_SHIFT - 9);

@@ -509,10 +543,6 @@ static int block_submit_bio(struct super_block *sb, struct block_private *bp,
 	return ret;
 }

-/*
- * Return a block with an elevated refcount if it was present in the
- * hash table and its refcount didn't indicate that it was being freed.
- */
 static struct block_private *block_lookup(struct super_block *sb, u64 blkno)
 {
 	DECLARE_BLOCK_INFO(sb, binf);
@@ -520,8 +550,8 @@ static struct block_private *block_lookup(struct super_block *sb, u64 blkno)

 	rcu_read_lock();
 	bp = rhashtable_lookup(&binf->ht, &blkno, block_ht_params);
-	if (bp && !block_get_if_live(bp))
-		bp = NULL;
+	if (bp)
+		bp = block_get_if_inserted(bp);
 	rcu_read_unlock();

 	return bp;
@@ -682,8 +712,8 @@ retry:

 	ret = 0;
 out:
-	if (!retried && !IS_ERR_OR_NULL(bp) && !block_is_dirty(bp) &&
-	    (ret == -ESTALE || scoutfs_trigger(sb, BLOCK_REMOVE_STALE))) {
+	if ((ret == -ESTALE || scoutfs_trigger(sb, BLOCK_REMOVE_STALE)) &&
+	    !retried && !block_is_dirty(bp)) {
 		retried = true;
 		scoutfs_inc_counter(sb, block_cache_remove_stale);
 		block_remove(sb, bp);
@@ -1048,85 +1078,100 @@ static unsigned long block_count_objects(struct shrinker *shrink, struct shrink_
 	struct super_block *sb = binf->sb;

 	scoutfs_inc_counter(sb, block_cache_count_objects);
-	return list_lru_shrink_count(&binf->lru, sc);
-}
-
-struct isolate_args {
-	struct super_block *sb;
-	struct list_head dispose;
-};
-
-#define DECLARE_ISOLATE_ARGS(sb_, name_) \
-	struct isolate_args name_ = { \
-		.sb = sb_, \
-		.dispose = LIST_HEAD_INIT(name_.dispose), \
-	}
-
-static enum lru_status isolate_lru_block(struct list_head *item, struct list_lru_one *list,
-					 void *cb_arg)
-{
-	struct block_private *bp = container_of(item, struct block_private, lru_head);
-	struct isolate_args *ia = cb_arg;
-
-	TRACE_BLOCK(isolate, bp);
-
-	/* rotate accessed blocks to the tail of the list (lazy promotion) */
-	if (test_and_clear_bit(BLOCK_BIT_ACCESSED, &bp->bits)) {
-		scoutfs_inc_counter(ia->sb, block_cache_isolate_rotate);
-		return LRU_ROTATE;
-	}
-
-	/* any refs, including dirty/io, stop us from acquiring lru refcount */
-	if (!block_get_remove_live_only(bp)) {
-		scoutfs_inc_counter(ia->sb, block_cache_isolate_skip);
-		return LRU_SKIP;
-	}
-
-	scoutfs_inc_counter(ia->sb, block_cache_isolate_removed);
-	list_lru_isolate_move(list, &bp->lru_head, &ia->dispose);
-	return LRU_REMOVED;
-}
-
-static void shrink_dispose_blocks(struct super_block *sb, struct list_head *dispose)
-{
-	struct block_private *bp;
-	struct block_private *bp__;
-
-	list_for_each_entry_safe(bp, bp__, dispose, lru_head) {
-		list_del_init(&bp->lru_head);
-		block_remove(sb, bp);
-		block_put(sb, bp);
-	}
+
+	return shrinker_min_long(atomic_read(&binf->total_inserted));
 }

+/*
+ * Remove a number of cached blocks that haven't been used recently.
+ *
+ * We don't maintain a strictly ordered LRU to avoid the contention of
+ * accesses always moving blocks around in some precise global
+ * structure.
+ *
+ * Instead we use counters to divide the blocks into two roughly equal
+ * groups by how recently they were accessed.  We randomly walk all
+ * inserted blocks looking for any blocks in the older half to remove
+ * and free.  The random walk and there being two groups means that we
+ * typically only walk a small multiple of the number we're looking for
+ * before we find them all.
+ *
+ * Our rcu walk of blocks can see blocks in all stages of their life
+ * cycle, from dirty blocks to those with 0 references that are queued
+ * for freeing.  We only want to free idle inserted blocks so we
+ * atomically remove blocks when the only references are ours and the
+ * hash table.
+ */
 static unsigned long block_scan_objects(struct shrinker *shrink, struct shrink_control *sc)
 {
 	struct block_info *binf = KC_SHRINKER_CONTAINER_OF(shrink, struct block_info);
 	struct super_block *sb = binf->sb;
-	DECLARE_ISOLATE_ARGS(sb, ia);
-	unsigned long freed;
+	struct rhashtable_iter iter;
+	struct block_private *bp;
+	bool stop = false;
+	unsigned long freed = 0;
+	unsigned long nr = sc->nr_to_scan;
+	u64 recently;

 	scoutfs_inc_counter(sb, block_cache_scan_objects);

-	freed = kc_list_lru_shrink_walk(&binf->lru, sc, isolate_lru_block, &ia);
-	shrink_dispose_blocks(sb, &ia.dispose);
-	return freed;
-}
+	recently = accessed_recently(binf);
+	rhashtable_walk_enter(&binf->ht, &iter);
+	rhashtable_walk_start(&iter);

-/*
- * Called during shutdown with no other users.  The isolating walk must
- * find blocks on the lru that only have references for presence on the
- * lru and in the hash table.
- */
-static void block_shrink_all(struct super_block *sb)
-{
-	DECLARE_BLOCK_INFO(sb, binf);
-	DECLARE_ISOLATE_ARGS(sb, ia);
+	/*
+	 * This isn't great but I don't see a better way.  We want to
+	 * walk the hash from a random point so that we're not
+	 * constantly walking over the same region that we've already
+	 * freed old blocks within.  The interface doesn't let us do
+	 * this explicitly, but this seems to work?  The difference this
+	 * makes is enormous, around a few orders of magnitude fewer
+	 * _nexts per shrink.
+	 */
+	if (iter.walker.tbl)
+		iter.slot = prandom_u32_max(iter.walker.tbl->size);

-	do {
-		kc_list_lru_walk(&binf->lru, isolate_lru_block, &ia, 128);
-		shrink_dispose_blocks(sb, &ia.dispose);
-        } while (list_lru_count(&binf->lru) > 0);
+	while (nr > 0) {
+		bp = rhashtable_walk_next(&iter);
+		if (bp == NULL)
+			break;
+		if (bp == ERR_PTR(-EAGAIN)) {
+			/*
+			 * We can be called from reclaim in the allocation
+			 * to resize the hash table itself.  We have to
+			 * return so that the caller can proceed and
+			 * enable hash table iteration again.
+			 */
+			scoutfs_inc_counter(sb, block_cache_shrink_stop);
+			stop = true;
+			break;
+		}
+
+		scoutfs_inc_counter(sb, block_cache_shrink_next);
+
+		if (bp->accessed >= recently) {
+			scoutfs_inc_counter(sb, block_cache_shrink_recent);
+			continue;
+		}
+
+		if (block_get_if_inserted(bp)) {
+			if (block_remove_solo(sb, bp)) {
+				scoutfs_inc_counter(sb, block_cache_shrink_remove);
+				TRACE_BLOCK(shrink, bp);
+				freed++;
+				nr--;
+			}
+			block_put(sb, bp);
+		}
+	}
+
+	rhashtable_walk_stop(&iter);
+	rhashtable_walk_exit(&iter);
+
+	if (stop)
+		return SHRINK_STOP;
+	else
+		return freed;
 }

 struct sm_block_completion {
@@ -1165,7 +1210,7 @@ static int sm_block_io(struct super_block *sb, struct block_device *bdev, blk_op
 	BUILD_BUG_ON(PAGE_SIZE < SCOUTFS_BLOCK_SM_SIZE);

 	if (scoutfs_forcing_unmount(sb))
-		return -ENOLINK;
+		return -EIO;

 	if (WARN_ON_ONCE(len > SCOUTFS_BLOCK_SM_SIZE) ||
 	    WARN_ON_ONCE(!op_is_write(opf) && !blk_crc))
@@ -1231,7 +1276,7 @@ int scoutfs_block_write_sm(struct super_block *sb,
 int scoutfs_block_setup(struct super_block *sb)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
-	struct block_info *binf = NULL;
+	struct block_info *binf;
 	int ret;

 	binf = kzalloc(sizeof(struct block_info), GFP_KERNEL);
@@ -1240,15 +1285,15 @@ int scoutfs_block_setup(struct super_block *sb)
 		goto out;
 	}

-	ret = list_lru_init(&binf->lru);
-	if (ret < 0)
-		goto out;
-
 	ret = rhashtable_init(&binf->ht, &block_ht_params);
-	if (ret < 0)
+	if (ret < 0) {
+		kfree(binf);
 		goto out;
+	}

 	binf->sb = sb;
+	atomic_set(&binf->total_inserted, 0);
+	atomic64_set(&binf->access_counter, 0);
 	init_waitqueue_head(&binf->waitq);
 	KC_INIT_SHRINKER_FUNCS(&binf->shrinker, block_count_objects,
 			       block_scan_objects);
@@ -1260,10 +1305,8 @@ int scoutfs_block_setup(struct super_block *sb)

 	ret = 0;
 out:
-	if (ret < 0 && binf) {
-		list_lru_destroy(&binf->lru);
-		kfree(binf);
-	}
+	if (ret)
+		scoutfs_block_destroy(sb);

 	return ret;
 }
@@ -1275,10 +1318,9 @@ void scoutfs_block_destroy(struct super_block *sb)

 	if (binf) {
 		KC_UNREGISTER_SHRINKER(&binf->shrinker);
-		block_shrink_all(sb);
+		block_remove_all(sb);
 		flush_work(&binf->free_work);
 		rhashtable_destroy(&binf->ht);
-		list_lru_destroy(&binf->lru);

 		kfree(binf);
 		sbi->block_info = NULL;
--- a/kmod/src/counters.h
+++ b/kmod/src/counters.h
@@ -26,15 +26,17 @@
 	EXPAND_COUNTER(block_cache_alloc_page_order)		\
 	EXPAND_COUNTER(block_cache_alloc_virt)			\
 	EXPAND_COUNTER(block_cache_end_io_error)		\
-	EXPAND_COUNTER(block_cache_isolate_removed)		\
-	EXPAND_COUNTER(block_cache_isolate_rotate)		\
-	EXPAND_COUNTER(block_cache_isolate_skip)		\
 	EXPAND_COUNTER(block_cache_forget)			\
 	EXPAND_COUNTER(block_cache_free)			\
 	EXPAND_COUNTER(block_cache_free_work)			\
 	EXPAND_COUNTER(block_cache_remove_stale)		\
 	EXPAND_COUNTER(block_cache_count_objects)		\
 	EXPAND_COUNTER(block_cache_scan_objects)		\
+	EXPAND_COUNTER(block_cache_shrink)			\
+	EXPAND_COUNTER(block_cache_shrink_next)			\
+	EXPAND_COUNTER(block_cache_shrink_recent)		\
+	EXPAND_COUNTER(block_cache_shrink_remove)		\
+	EXPAND_COUNTER(block_cache_shrink_stop)			\
 	EXPAND_COUNTER(btree_compact_values)			\
 	EXPAND_COUNTER(btree_compact_values_enomem)		\
 	EXPAND_COUNTER(btree_delete)				\
@@ -88,7 +90,6 @@
 	EXPAND_COUNTER(forest_read_items)			\
 	EXPAND_COUNTER(forest_roots_next_hint)			\
 	EXPAND_COUNTER(forest_set_bloom_bits)			\
-	EXPAND_COUNTER(inode_deleted)				\
 	EXPAND_COUNTER(item_cache_count_objects)		\
 	EXPAND_COUNTER(item_cache_scan_objects)			\
 	EXPAND_COUNTER(item_clear_dirty)			\
@@ -116,11 +117,10 @@
 	EXPAND_COUNTER(item_pcpu_page_hit)			\
 	EXPAND_COUNTER(item_pcpu_page_miss)			\
 	EXPAND_COUNTER(item_pcpu_page_miss_keys)		\
-	EXPAND_COUNTER(item_read_pages_barrier)			\
-	EXPAND_COUNTER(item_read_pages_retry)			\
 	EXPAND_COUNTER(item_read_pages_split)			\
 	EXPAND_COUNTER(item_shrink_page)			\
 	EXPAND_COUNTER(item_shrink_page_dirty)			\
+	EXPAND_COUNTER(item_shrink_page_reader)			\
 	EXPAND_COUNTER(item_shrink_page_trylock)		\
 	EXPAND_COUNTER(item_update)				\
 	EXPAND_COUNTER(item_write_dirty)			\
@@ -145,7 +145,6 @@
 	EXPAND_COUNTER(lock_shrink_work)			\
 	EXPAND_COUNTER(lock_unlock)				\
 	EXPAND_COUNTER(lock_wait)				\
-	EXPAND_COUNTER(log_merge_no_finalized)			\
 	EXPAND_COUNTER(log_merge_wait_timeout)			\
 	EXPAND_COUNTER(net_dropped_response)			\
 	EXPAND_COUNTER(net_send_bytes)				\
@@ -182,7 +181,6 @@
 	EXPAND_COUNTER(quorum_send_vote)			\
 	EXPAND_COUNTER(quorum_server_shutdown)			\
 	EXPAND_COUNTER(quorum_term_follower)			\
-	EXPAND_COUNTER(reclaimed_open_logs)			\
 	EXPAND_COUNTER(server_commit_hold)			\
 	EXPAND_COUNTER(server_commit_queue)			\
 	EXPAND_COUNTER(server_commit_worker)			\
--- a/kmod/src/data.c
+++ b/kmod/src/data.c
@@ -560,7 +560,7 @@ static int scoutfs_get_block(struct inode *inode, sector_t iblock,
 	u64 offset;
 	int ret;

-	WARN_ON_ONCE(create && !rwsem_is_locked(&si->extent_sem));
+	WARN_ON_ONCE(create && !inode_is_locked(inode));

 	/* make sure caller holds a cluster lock */
 	lock = scoutfs_per_task_get(&si->pt_data_lock);
@@ -1551,17 +1551,13 @@ int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 	struct super_block *sb = inode->i_sb;
 	const u64 ino = scoutfs_ino(inode);
 	struct scoutfs_lock *lock = NULL;
-	struct scoutfs_extent *info = NULL;
-	struct page *page = NULL;
 	struct scoutfs_extent ext;
 	struct scoutfs_extent cur;
 	struct data_ext_args args;
 	u32 last_flags;
 	u64 iblock;
 	u64 last;
-	int entries = 0;
 	int ret;
-	int complete = 0;

 	if (len == 0) {
 		ret = 0;
@@ -1572,11 +1568,16 @@ int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 	if (ret)
 		goto out;

-	page = alloc_page(GFP_KERNEL);
-	if (!page) {
-		ret = -ENOMEM;
-		goto out;
-	}
+	inode_lock(inode);
+	down_read(&si->extent_sem);
+
+	ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ, 0, inode, &lock);
+	if (ret)
+		goto unlock;
+
+	args.ino = ino;
+	args.inode = inode;
+	args.lock = lock;

 	/* use a dummy extent to track */
 	memset(&cur, 0, sizeof(cur));
@@ -1585,93 +1586,48 @@ int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 	iblock = start >> SCOUTFS_BLOCK_SM_SHIFT;
 	last = (start + len - 1) >> SCOUTFS_BLOCK_SM_SHIFT;

-	args.ino = ino;
-	args.inode = inode;
-
-	/* outer loop */
 	while (iblock <= last) {
-		/* lock */
-		inode_lock(inode);
-		down_read(&si->extent_sem);
-
-		ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ, 0, inode, &lock);
-		if (ret) {
-			up_read(&si->extent_sem);
-			inode_unlock(inode);
-			break;
-		}
-
-		args.lock = lock;
-
-		/* collect entries */
-		info = page_address(page);
-		memset(info, 0, PAGE_SIZE);
-		while (entries < (PAGE_SIZE / sizeof(struct fiemap_extent)) - 1) {
-			ret = scoutfs_ext_next(sb, &data_ext_ops, &args,
-					       iblock, 1, &ext);
-			if (ret < 0) {
-				if (ret == -ENOENT)
-					ret = 0;
-				complete = 1;
-				last_flags = FIEMAP_EXTENT_LAST;
-				break;
-			}
-
-			trace_scoutfs_data_fiemap_extent(sb, ino, &ext);
-
-			if (ext.start > last) {
-				/* not setting _LAST, it's for end of file */
+		ret = scoutfs_ext_next(sb, &data_ext_ops, &args,
+				       iblock, 1, &ext);
+		if (ret < 0) {
+			if (ret == -ENOENT)
 				ret = 0;
-				complete = 1;
-				break;
-			}
-
-			if (scoutfs_ext_can_merge(&cur, &ext)) {
-				/* merged extents could be greater than input len */
-				cur.len += ext.len;
-			} else {
-				/* fill it */
-				memcpy(info, &cur, sizeof(cur));
-
-				entries++;
-				info++;
-
-				cur = ext;
-			}
-
-			iblock = ext.start + ext.len;
+			last_flags = FIEMAP_EXTENT_LAST;
+			break;
 		}

-		/* unlock */
-		scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ);
-		up_read(&si->extent_sem);
-		inode_unlock(inode);
+		trace_scoutfs_data_fiemap_extent(sb, ino, &ext);

-		if (ret)
+		if (ext.start > last) {
+			/* not setting _LAST, it's for end of file */
+			ret = 0;
 			break;
+		}

-		/* emit entries */
-		info = page_address(page);
-		for (; entries > 0; entries--) {
-			ret = fill_extent(fieinfo, info, 0);
+		if (scoutfs_ext_can_merge(&cur, &ext)) {
+			/* merged extents could be greater than input len */
+			cur.len += ext.len;
+		} else {
+			ret = fill_extent(fieinfo, &cur, 0);
 			if (ret != 0)
-				goto out;
-			info++;
+				goto unlock;
+			cur = ext;
 		}

-		if (complete)
-			break;
+		iblock = ext.start + ext.len;
 	}

-	/* still one left, it's in cur */
 	if (cur.len)
 		ret = fill_extent(fieinfo, &cur, last_flags);
+unlock:
+	scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ);
+	up_read(&si->extent_sem);
+	inode_unlock(inode);

 out:
 	if (ret == 1)
 		ret = 0;
-	if (page)
-		__free_page(page);
+
 	trace_scoutfs_data_fiemap(sb, start, len, ret);

 	return ret;
@@ -1958,236 +1914,6 @@ int scoutfs_data_waiting(struct super_block *sb, u64 ino, u64 iblock,
 	return ret;
 }

-#ifdef KC_MM_VM_FAULT_T
-static vm_fault_t scoutfs_data_page_mkwrite(struct vm_fault *vmf)
-{
-	struct vm_area_struct *vma = vmf->vma;
-#else
-static int scoutfs_data_page_mkwrite(struct vm_area_struct *vma,
-				     struct vm_fault *vmf)
-{
-#endif
-	struct page *page = vmf->page;
-	struct file *file = vma->vm_file;
-	struct inode *inode = file_inode(file);
-	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
-	struct super_block *sb = inode->i_sb;
-	struct scoutfs_lock *lock = NULL;
-	SCOUTFS_DECLARE_PER_TASK_ENTRY(pt_ent);
-	DECLARE_DATA_WAIT(dw);
-	struct write_begin_data wbd;
-	u64 ind_seq;
-	loff_t pos;
-	loff_t size;
-	unsigned int len = PAGE_SIZE;
-	vm_fault_t ret = VM_FAULT_SIGBUS;
-	int err;
-
-	pos = vmf->pgoff << PAGE_SHIFT;
-
-	sb_start_pagefault(sb);
-
-	err = scoutfs_lock_inode(sb, SCOUTFS_LOCK_WRITE,
-				 SCOUTFS_LKF_REFRESH_INODE, inode, &lock);
-	if (err) {
-		ret = vmf_error(err);
-		goto out;
-	}
-
-	size = i_size_read(inode);
-
-	if (scoutfs_per_task_add_excl(&si->pt_data_lock, &pt_ent, lock)) {
-		/* data_version is per inode, whole file must be online */
-		err = scoutfs_data_wait_check(inode, 0, size,
-					      SEF_OFFLINE,
-					      SCOUTFS_IOC_DWO_WRITE,
-					      &dw, lock);
-		if (err != 0) {
-			if (err < 0)
-				ret = vmf_error(err);
-			goto out_unlock;
-		}
-	}
-
-
-	/* scoutfs_write_begin */
-	memset(&wbd, 0, sizeof(wbd));
-	INIT_LIST_HEAD(&wbd.ind_locks);
-	wbd.lock = lock;
-
-	/*
-	 * Start transaction before taking page locks - we want to make sure we're
-	 * not locking a page, then waiting for trans, because writeback might race
-	 * against it and cause a lock inversion hang - as demonstrated by both
-	 * holetest and fsstress tests in xfstests.
-	 */
-	do {
-		err = scoutfs_inode_index_start(sb, &ind_seq) ?:
-			scoutfs_inode_index_prepare(sb, &wbd.ind_locks, inode,
-						    true) ?:
-			scoutfs_inode_index_try_lock_hold(sb, &wbd.ind_locks,
-							  ind_seq, false);
-	} while (err > 0);
-	if (err < 0) {
-		ret = vmf_error(err);
-		goto out_trans;
-	}
-
-	down_write(&si->extent_sem);
-
-	if (!trylock_page(page)) {
-		ret = VM_FAULT_NOPAGE;
-		goto out_sem;
-	}
-	ret = VM_FAULT_LOCKED;
-
-	if ((page->mapping != inode->i_mapping) ||
-	    (!PageUptodate(page)) ||
-	    (page_offset(page) > size))	 {
-		unlock_page(page);
-		ret = VM_FAULT_NOPAGE;
-		goto out_sem;
-	}
-
-	if (page->index == (size - 1) >> PAGE_SHIFT)
-		len = ((size - 1) & ~PAGE_MASK) + 1;
-
-	err = __block_write_begin(page, pos, PAGE_SIZE, scoutfs_get_block);
-	if (err) {
-		ret = vmf_error(err);
-		unlock_page(page);
-		goto out_sem;
-	}
-	/* end scoutfs_write_begin */
-
-	/*
-	 * We mark the page dirty already here so that when freeze is in
-	 * progress, we are guaranteed that writeback during freezing will
-	 * see the dirty page and writeprotect it again.
-	 */
-	set_page_dirty(page);
-	wait_for_stable_page(page);
-
-	/* scoutfs_write_end */
-	scoutfs_inode_set_data_seq(inode);
-	scoutfs_inode_inc_data_version(inode);
-
-	file_update_time(vma->vm_file);
-
-	scoutfs_update_inode_item(inode, wbd.lock, &wbd.ind_locks);
-	scoutfs_inode_queue_writeback(inode);
-
-out_sem:
-	up_write(&si->extent_sem);
-out_trans:
-	scoutfs_release_trans(sb);
-	scoutfs_inode_index_unlock(sb, &wbd.ind_locks);
-	/* end scoutfs_write_end */
-
-out_unlock:
-	scoutfs_per_task_del(&si->pt_data_lock, &pt_ent);
-	scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
-
-out:
-	sb_end_pagefault(sb);
-
-	if (scoutfs_data_wait_found(&dw)) {
-		/*
-		 * It'd be really nice to not hold the mmap_sem lock here
-		 * before waiting for data, and then return VM_FAULT_RETRY
-		 */
-		err = scoutfs_data_wait(inode, &dw);
-		if (err == 0)
-			ret = VM_FAULT_NOPAGE;
-		else
-			ret = vmf_error(err);
-	}
-
-	trace_scoutfs_data_page_mkwrite(sb, scoutfs_ino(inode), pos, (__force u32)ret);
-
-	return ret;
-}
-
-#ifdef KC_MM_VM_FAULT_T
-static vm_fault_t scoutfs_data_filemap_fault(struct vm_fault *vmf)
-{
-	struct vm_area_struct *vma = vmf->vma;
-#else
-static int scoutfs_data_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
-#endif
-	struct file *file = vma->vm_file;
-	struct inode *inode = file_inode(file);
-	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
-	struct super_block *sb = inode->i_sb;
-	struct scoutfs_lock *inode_lock = NULL;
-	SCOUTFS_DECLARE_PER_TASK_ENTRY(pt_ent);
-	DECLARE_DATA_WAIT(dw);
-	loff_t pos;
-	int err;
-	vm_fault_t ret = VM_FAULT_SIGBUS;
-
-	pos = vmf->pgoff;
-	pos <<= PAGE_SHIFT;
-
-retry:
-	err = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ,
-				 SCOUTFS_LKF_REFRESH_INODE, inode, &inode_lock);
-	if (err < 0)
-		return vmf_error(err);
-
-	if (scoutfs_per_task_add_excl(&si->pt_data_lock, &pt_ent, inode_lock)) {
-		/* protect checked extents from stage/release */
-		atomic_inc(&inode->i_dio_count);
-
-		err = scoutfs_data_wait_check(inode, pos, PAGE_SIZE,
-					      SEF_OFFLINE, SCOUTFS_IOC_DWO_READ,
-					      &dw, inode_lock);
-		if (err != 0) {
-			if (err < 0)
-				ret = vmf_error(err);
-			goto out;
-		}
-	}
-
-#ifdef KC_MM_VM_FAULT_T
-	ret = filemap_fault(vmf);
-#else
-	ret = filemap_fault(vma, vmf);
-#endif
-
-out:
-	if (scoutfs_per_task_del(&si->pt_data_lock, &pt_ent))
-		kc_inode_dio_end(inode);
-	scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_READ);
-	if (scoutfs_data_wait_found(&dw)) {
-		err = scoutfs_data_wait(inode, &dw);
-		if (err == 0)
-			goto retry;
-
-		ret = VM_FAULT_RETRY;
-	}
-
-	trace_scoutfs_data_filemap_fault(sb, scoutfs_ino(inode), pos, (__force u32)ret);
-
-	return ret;
-}
-
-static const struct vm_operations_struct scoutfs_data_file_vm_ops = {
-	.fault		= scoutfs_data_filemap_fault,
-	.page_mkwrite	= scoutfs_data_page_mkwrite,
-#ifdef KC_MM_REMAP_PAGES
-	.remap_pages	= generic_file_remap_pages,
-#endif
-};
-
-static int scoutfs_file_mmap(struct file *file, struct vm_area_struct *vma)
-{
-	file_accessed(file);
-	vma->vm_ops = &scoutfs_data_file_vm_ops;
-	return 0;
-}
-
 const struct address_space_operations scoutfs_file_aops = {
 #ifdef KC_MPAGE_READ_FOLIO
 	.dirty_folio		= block_dirty_folio,
@@ -2219,7 +1945,6 @@ const struct file_operations scoutfs_file_fops = {
 	.splice_read	= generic_file_splice_read,
 	.splice_write	= iter_file_splice_write,
 #endif
-	.mmap		= scoutfs_file_mmap,
 	.unlocked_ioctl	= scoutfs_ioctl,
 	.fsync		= scoutfs_file_fsync,
 	.llseek		= scoutfs_file_llseek,
--- a/kmod/src/dir.c
+++ b/kmod/src/dir.c
@@ -11,13 +11,11 @@
 * General Public License for more details.
 */
 #include <linux/kernel.h>
-#include <linux/stddef.h>
 #include <linux/fs.h>
 #include <linux/slab.h>
 #include <linux/uio.h>
 #include <linux/xattr.h>
 #include <linux/namei.h>
-#include <linux/mm.h>

 #include "format.h"
 #include "file.h"
@@ -436,15 +434,6 @@ out:
 		return d_splice_alias(inode, dentry);
 }

-/*
- * Helper to make iterating through dirent ptrs aligned
- */
-static inline struct scoutfs_dirent *next_aligned_dirent(struct scoutfs_dirent *dent, u8 len)
-{
-	return (void *)dent +
-		ALIGN(offsetof(struct scoutfs_dirent, name[len]), __alignof__(struct scoutfs_dirent));
-}
-
 /*
 * readdir simply iterates over the dirent items for the dir inode and
 * uses their offset as the readdir position.
@@ -452,112 +441,76 @@ static inline struct scoutfs_dirent *next_aligned_dirent(struct scoutfs_dirent *
 * It will need to be careful not to read past the region of the dirent
 * hash offset keys that it has access to.
 */
-static int scoutfs_readdir(struct file *file, struct dir_context *ctx)
+static int KC_DECLARE_READDIR(scoutfs_readdir, struct file *file,
+			      void *dirent, kc_readdir_ctx_t ctx)
 {
 	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
 	struct scoutfs_lock *dir_lock = NULL;
 	struct scoutfs_dirent *dent = NULL;
-/* we'll store name_len in dent->__pad[0] */
-#define hacky_name_len __pad[0]
 	struct scoutfs_key last_key;
 	struct scoutfs_key key;
-	struct page *page = NULL;
 	int name_len;
 	u64 pos;
-	int entries = 0;
 	int ret;
-	int complete = 0;
-	struct scoutfs_dirent *end;

-	if (!dir_emit_dots(file, ctx))
+	if (!kc_dir_emit_dots(file, dirent, ctx))
 		return 0;

-	page = alloc_page(GFP_KERNEL);
-	if (!page)
+	dent = alloc_dirent(SCOUTFS_NAME_LEN);
+	if (!dent) {
 		return -ENOMEM;
-
-	end = page_address(page) + PAGE_SIZE;
+	}

 	init_dirent_key(&last_key, SCOUTFS_READDIR_TYPE, scoutfs_ino(inode),
 			SCOUTFS_DIRENT_LAST_POS, 0);

-	/*
-	 * lock and fetch dirent items, until the page no longer fits
-	 * a max size dirent (288b). Then unlock and dir_emit the ones
-	 * we stored in the page.
-	 */
+	ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ, 0, inode, &dir_lock);
+	if (ret)
+		goto out;
+
 	for (;;) {
-		/* lock */
-		ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ, 0, inode, &dir_lock);
-		if (ret)
-			break;
+		init_dirent_key(&key, SCOUTFS_READDIR_TYPE, scoutfs_ino(inode),
+				kc_readdir_pos(file, ctx), 0);

-		dent = page_address(page);
-		pos = ctx->pos;
-		while (next_aligned_dirent(dent, SCOUTFS_NAME_LEN) < end) {
-			init_dirent_key(&key, SCOUTFS_READDIR_TYPE, scoutfs_ino(inode),
-					pos, 0);
-
-			ret = scoutfs_item_next(sb, &key, &last_key, dent,
-						dirent_bytes(SCOUTFS_NAME_LEN),
-						dir_lock);
-			if (ret < 0) {
-				if (ret == -ENOENT) {
-					ret = 0;
-					complete = 1;
-				}
-				break;
-			}
-
-			name_len = ret - sizeof(struct scoutfs_dirent);
-			dent->hacky_name_len = name_len;
-			if (name_len < 1 || name_len > SCOUTFS_NAME_LEN) {
-				scoutfs_corruption(sb, SC_DIRENT_READDIR_NAME_LEN,
-						   corrupt_dirent_readdir_name_len,
-						   "dir_ino %llu pos %llu key "SK_FMT" len %d",
-						   scoutfs_ino(inode),
-						   pos,
-						   SK_ARG(&key), name_len);
-				ret = -EIO;
-				break;
-			}
-
-			pos = le64_to_cpu(dent->pos) + 1;
-
-			dent = next_aligned_dirent(dent, name_len);
-			entries++;
-		}
-
-		/* unlock */
-		scoutfs_unlock(sb, dir_lock, SCOUTFS_LOCK_READ);
-
-		if (ret < 0)
-			break;
-
-		dent = page_address(page);
-		for (; entries > 0; entries--) {
-			ctx->pos = le64_to_cpu(dent->pos);
-			if (!dir_emit(ctx, dent->name, dent->hacky_name_len,
-					le64_to_cpu(dent->ino),
-					dentry_type(dent->type))) {
+		ret = scoutfs_item_next(sb, &key, &last_key, dent,
+					dirent_bytes(SCOUTFS_NAME_LEN),
+					dir_lock);
+		if (ret < 0) {
+			if (ret == -ENOENT)
 				ret = 0;
-				goto out;
-			}
-
-			dent = next_aligned_dirent(dent, dent->hacky_name_len);
-
-			/* always advance ctx->pos past */
-			ctx->pos++;
+			break;
 		}

-		if (complete)
+		name_len = ret - sizeof(struct scoutfs_dirent);
+		if (name_len < 1 || name_len > SCOUTFS_NAME_LEN) {
+			scoutfs_corruption(sb, SC_DIRENT_READDIR_NAME_LEN,
+					   corrupt_dirent_readdir_name_len,
+					   "dir_ino %llu pos %llu key "SK_FMT" len %d",
+					   scoutfs_ino(inode),
+					   kc_readdir_pos(file, ctx),
+					   SK_ARG(&key), name_len);
+			ret = -EIO;
+			goto out;
+		}
+
+		pos = le64_to_cpu(key.skd_major);
+		kc_readdir_pos(file, ctx) = pos;
+
+		if (!kc_dir_emit(ctx, dirent, dent->name, name_len, pos,
+				le64_to_cpu(dent->ino),
+				dentry_type(dent->type))) {
+			ret = 0;
 			break;
+		}
+
+		kc_readdir_pos(file, ctx) = pos + 1;
 	}

 out:
-	if (page)
-		__free_page(page);
+	scoutfs_unlock(sb, dir_lock, SCOUTFS_LOCK_READ);
+
+	kfree(dent);
 	return ret;
 }

@@ -1812,7 +1765,7 @@ retry:
 	}
 	old_inode->i_ctime = now;
 	if (new_inode)
-		new_inode->i_ctime = now;
+		new_inode->i_ctime = now;

 	inode_inc_iversion(old_dir);
 	inode_inc_iversion(old_inode);
@@ -2020,7 +1973,7 @@ const struct inode_operations scoutfs_symlink_iops = {
 };

 const struct file_operations scoutfs_dir_fops = {
-	.iterate	= scoutfs_readdir,
+	.KC_FOP_READDIR	= scoutfs_readdir,
 #ifdef KC_FMODE_KABI_ITERATE
 	.open		= scoutfs_dir_open,
 #endif
--- a/kmod/src/format.h
+++ b/kmod/src/format.h
@@ -470,7 +470,7 @@ struct scoutfs_srch_compact {
 * @get_trans_seq, @commit_trans_seq: These pair of sequence numbers
 * determine if a transaction is currently open for the mount that owns
 * the log_trees struct.  get_trans_seq is advanced by the server as the
- * transaction is opened.   The server sets commit_trans_seq equal to
+ * transaction is opened.   The server sets commit_trans_seq equal to
 * get_ as the transaction is committed.
 */
 struct scoutfs_log_trees {
@@ -1091,8 +1091,7 @@ enum scoutfs_net_cmd {
 	EXPAND_NET_ERRNO(ENOMEM)	\
 	EXPAND_NET_ERRNO(EIO)		\
 	EXPAND_NET_ERRNO(ENOSPC)	\
-	EXPAND_NET_ERRNO(EINVAL)	\
-	EXPAND_NET_ERRNO(ENOLINK)
+	EXPAND_NET_ERRNO(EINVAL)

 #undef EXPAND_NET_ERRNO
 #define EXPAND_NET_ERRNO(which) SCOUTFS_NET_ERR_##which,
--- a/kmod/src/inode.c
+++ b/kmod/src/inode.c
@@ -1854,9 +1854,6 @@ static int try_delete_inode_items(struct super_block *sb, u64 ino)
 		goto out;

 	ret = delete_inode_items(sb, ino, &sinode, lock, orph_lock);
-	if (ret == 0)
-		scoutfs_inc_counter(sb, inode_deleted);
-
 out:
 	if (clear_trying)
 		clear_bit(bit_nr, ldata->trying);
@@ -1965,8 +1962,6 @@ static void iput_worker(struct work_struct *work)
 		while (count-- > 0)
 			iput(inode);

-		cond_resched();
-
 		/* can't touch inode after final iput */

 		spin_lock(&inf->iput_lock);
--- a/kmod/src/ioctl.c
+++ b/kmod/src/ioctl.c
@@ -58,23 +58,25 @@
 * key space after we find no items in a given lock region.  This is
 * relatively cheap because reading is going to check the segments
 * anyway.
+ *
+ * This is copying to userspace while holding a read lock.  This is safe
+ * because faulting can send a request for a write lock while the read
+ * lock is being used.  The cluster locks don't block tasks in a node,
+ * they match and the tasks fall back to local locking.  In this case
+ * the spin locks around the item cache.
 */
 static long scoutfs_ioc_walk_inodes(struct file *file, unsigned long arg)
 {
 	struct super_block *sb = file_inode(file)->i_sb;
 	struct scoutfs_ioctl_walk_inodes __user *uwalk = (void __user *)arg;
 	struct scoutfs_ioctl_walk_inodes walk;
-	struct scoutfs_ioctl_walk_inodes_entry *ent = NULL;
-	struct scoutfs_ioctl_walk_inodes_entry *end;
+	struct scoutfs_ioctl_walk_inodes_entry ent;
 	struct scoutfs_key next_key;
 	struct scoutfs_key last_key;
 	struct scoutfs_key key;
 	struct scoutfs_lock *lock;
-	struct page *page = NULL;
 	u64 last_seq;
-	u64 entries = 0;
 	int ret = 0;
-	int complete = 0;
 	u32 nr = 0;
 	u8 type;

@@ -105,10 +107,6 @@ static long scoutfs_ioc_walk_inodes(struct file *file, unsigned long arg)
 		}
 	}

-	page = alloc_page(GFP_KERNEL);
-	if (!page)
-		return -ENOMEM;
-
 	scoutfs_inode_init_index_key(&key, type, walk.first.major,
 				     walk.first.minor, walk.first.ino);
 	scoutfs_inode_init_index_key(&last_key, type, walk.last.major,
@@ -117,107 +115,77 @@ static long scoutfs_ioc_walk_inodes(struct file *file, unsigned long arg)
 	/* cap nr to the max the ioctl can return to a compat task */
 	walk.nr_entries = min_t(u64, walk.nr_entries, INT_MAX);

-	end = page_address(page) + PAGE_SIZE;
+	ret = scoutfs_lock_inode_index(sb, SCOUTFS_LOCK_READ, type,
+				       walk.first.major, walk.first.ino,
+				       &lock);
+	if (ret < 0)
+		goto out;

-	/* outer loop */
-	for (nr = 0;;) {
-		ent = page_address(page);
-		/* make sure _pad and minor are zeroed */
-		memset(ent, 0, PAGE_SIZE);
+	for (nr = 0; nr < walk.nr_entries; ) {

-		ret = scoutfs_lock_inode_index(sb, SCOUTFS_LOCK_READ, type,
-					       le64_to_cpu(key.skii_major),
-					       le64_to_cpu(key.skii_ino),
-					       &lock);
-		if (ret)
+		ret = scoutfs_item_next(sb, &key, &last_key, NULL, 0, lock);
+		if (ret < 0 && ret != -ENOENT)
 			break;

-		/* inner loop 1 */
-		while (ent + 1 < end) {
-			ret = scoutfs_item_next(sb, &key, &last_key, NULL, 0, lock);
-			if (ret < 0 && ret != -ENOENT)
+		if (ret == -ENOENT) {
+
+			/* done if lock covers last iteration key */
+			if (scoutfs_key_compare(&last_key, &lock->end) <= 0) {
+				ret = 0;
 				break;
-
-			if (ret == -ENOENT) {
-				/* done if lock covers last iteration key */
-				if (scoutfs_key_compare(&last_key, &lock->end) <= 0) {
-					ret = 0;
-					complete = 1;
-					break;
-				}
-
-				/* continue iterating after locked empty region */
-				key = lock->end;
-				scoutfs_key_inc(&key);
-
-				scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ);
-				/* avoid double-unlocking here after break */
-				lock = NULL;
-
-				ret = scoutfs_forest_next_hint(sb, &key, &next_key);
-				if (ret < 0 && ret != -ENOENT)
-					break;
-
-				if (ret == -ENOENT ||
-				    scoutfs_key_compare(&next_key, &last_key) > 0) {
-					ret = 0;
-					complete = 1;
-					break;
-				}
-
-				key = next_key;
-
-				ret = scoutfs_lock_inode_index(sb, SCOUTFS_LOCK_READ,
-							type,
-							le64_to_cpu(key.skii_major),
-							le64_to_cpu(key.skii_ino),
-							&lock);
-				if (ret)
-					break;
-
-				continue;
 			}

-			ent->major = le64_to_cpu(key.skii_major);
-			ent->ino = le64_to_cpu(key.skii_ino);
-
+			/* continue iterating after locked empty region */
+			key = lock->end;
 			scoutfs_key_inc(&key);

-			ent++;
-			entries++;
+			scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ);

-			if (nr + entries >= walk.nr_entries) {
-				complete = 1;
-				break;
-			}
-		}
+			ret = scoutfs_forest_next_hint(sb, &key, &next_key);
+			if (ret < 0 && ret != -ENOENT)
+				goto out;

-		scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ);
-		if (ret < 0)
-			break;
-
-		/* inner loop 2 */
-		ent = page_address(page);
-		for (; entries > 0; entries--) {
-			if (copy_to_user((void __user *)walk.entries_ptr, ent,
-					 sizeof(struct scoutfs_ioctl_walk_inodes_entry))) {
-				ret = -EFAULT;
+			if (ret == -ENOENT ||
+			    scoutfs_key_compare(&next_key, &last_key) > 0) {
+				ret = 0;
 				goto out;
 			}
-			walk.entries_ptr += sizeof(struct scoutfs_ioctl_walk_inodes_entry);
-			ent++;
-			nr++;
+
+			key = next_key;
+
+			ret = scoutfs_lock_inode_index(sb, SCOUTFS_LOCK_READ,
+						key.sk_type,
+						le64_to_cpu(key.skii_major),
+						le64_to_cpu(key.skii_ino),
+						&lock);
+			if (ret < 0)
+				goto out;
+
+			continue;
 		}

-		if (complete)
+		ent.major = le64_to_cpu(key.skii_major);
+		ent.minor = 0;
+		ent.ino = le64_to_cpu(key.skii_ino);
+
+		if (copy_to_user((void __user *)walk.entries_ptr, &ent,
+				 sizeof(ent))) {
+			ret = -EFAULT;
 			break;
+		}
+
+		nr++;
+		walk.entries_ptr += sizeof(ent);
+
+		scoutfs_key_inc(&key);
 	}

+	scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ);
+
 out:
-	if (page)
-		__free_page(page);
 	if (nr > 0)
 		ret = nr;
+
 	return ret;
 }

@@ -1195,15 +1163,11 @@ static long scoutfs_ioc_get_allocated_inos(struct file *file, unsigned long arg)
 	struct scoutfs_lock *lock = NULL;
 	struct scoutfs_key key;
 	struct scoutfs_key end;
-	struct page *page = NULL;
 	u64 __user *uinos;
 	u64 bytes;
-	u64 *ino;
-	u64 *ino_end;
-	int entries = 0;
+	u64 ino;
 	int nr;
 	int ret;
-	int complete = 0;

 	if (!(file->f_mode & FMODE_READ)) {
 		ret = -EBADF;
@@ -1225,83 +1189,47 @@ static long scoutfs_ioc_get_allocated_inos(struct file *file, unsigned long arg)
 		goto out;
 	}

-	page = alloc_page(GFP_KERNEL);
-	if (!page) {
-		ret = -ENOMEM;
-		goto out;
-	}
-	ino_end = page_address(page) + PAGE_SIZE;
-
 	scoutfs_inode_init_key(&key, gai.start_ino);
 	scoutfs_inode_init_key(&end, gai.start_ino | SCOUTFS_LOCK_INODE_GROUP_MASK);
 	uinos = (void __user *)gai.inos_ptr;
 	bytes = gai.inos_bytes;
 	nr = 0;

-	for (;;) {
+	ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_READ, 0, gai.start_ino, &lock);
+	if (ret < 0)
+		goto out;

-		ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_READ, 0, gai.start_ino, &lock);
-		if (ret < 0)
-			goto out;
+	while (bytes >= sizeof(*uinos)) {

-		ino = page_address(page);
-		while (ino < ino_end) {
-
-			ret = scoutfs_item_next(sb, &key, &end, NULL, 0, lock);
-			if (ret < 0) {
-				if (ret == -ENOENT) {
-					ret = 0;
-					complete = 1;
-				}
-				break;
-			}
-
-			if (key.sk_zone != SCOUTFS_FS_ZONE) {
+		ret = scoutfs_item_next(sb, &key, &end, NULL, 0, lock);
+		if (ret < 0) {
+			if (ret == -ENOENT)
 				ret = 0;
-				complete = 1;
-				break;
-			}
-
-			/* all fs items are owned by allocated inodes, and _first is always ino */
-			*ino = le64_to_cpu(key._sk_first);
-			scoutfs_inode_init_key(&key, *ino + 1);
-
-			ino++;
-			entries++;
-			nr++;
-
-			bytes -= sizeof(*uinos);
-			if (bytes < sizeof(*uinos)) {
-				complete = 1;
-				break;
-			}
-
-			if (nr == INT_MAX) {
-				complete = 1;
-				break;
-			}
+			break;
 		}

-		scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ);
-
-		if (ret < 0)
+		if (key.sk_zone != SCOUTFS_FS_ZONE) {
+			ret = 0;
 			break;
+		}

-		ino = page_address(page);
-		if (copy_to_user(uinos, ino, entries * sizeof(*uinos))) {
+		/* all fs items are owned by allocated inodes, and _first is always ino */
+		ino = le64_to_cpu(key._sk_first);
+		if (put_user(ino, uinos)) {
 			ret = -EFAULT;
-			goto out;
+			break;
 		}

-		uinos += entries;
-		entries = 0;
-
-		if (complete)
+		uinos++;
+		bytes -= sizeof(*uinos);
+		if (++nr == INT_MAX)
 			break;
+
+		scoutfs_inode_init_key(&key, ino + 1);
 	}
+
+	scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ);
 out:
-	if (page)
-		__free_page(page);
 	return ret ?: nr;
 }

--- a/kmod/src/item.c
+++ b/kmod/src/item.c
@@ -97,8 +97,9 @@ struct item_cache_info {
 	struct list_head lru_list;
 	unsigned long lru_pages;

-	/* stop readers from caching stale items behind reclaimed cleaned written items */
-	atomic64_t read_dirty_barrier;
+	/* written by page readers, read by shrink */
+	spinlock_t active_lock;
+	struct list_head active_list;
 };

 #define DECLARE_ITEM_CACHE_INFO(sb, name) \
@@ -1284,6 +1285,78 @@ static int cache_empty_page(struct super_block *sb,
 	return 0;
 }

+/*
+ * Readers operate independently from dirty items and transactions.
+ * They read a set of persistent items and insert them into the cache
+ * when there aren't already pages whose key range contains the items.
+ * This naturally prefers cached dirty items over stale read items.
+ *
+ * We have to deal with the case where dirty items are written and
+ * invalidated while a read is in flight.   The reader won't have seen
+ * the items that were dirty in their persistent roots as they started
+ * reading.  By the time they insert their read pages the previously
+ * dirty items have been reclaimed and are not in the cache.  The old
+ * stale items will be inserted in their place, effectively corrupting
+ * by having the dirty items disappear.
+ *
+ * We fix this by tracking the max seq of items in pages.  As readers
+ * start they record the current transaction seq.  Invalidation skips
+ * pages with a max seq greater than the first reader seq because the
+ * items in the page have to stick around to prevent the readers' stale
+ * items from being inserted.
+ *
+ * This naturally only affects a small set of pages with items that were
+ * written relatively recently.  If we're in memory pressure then we
+ * probably have a lot of pages and they'll naturally have items that
+ * were visible to any readers.  We don't bother with the complicated and
+ * expensive further refinement of tracking the ranges that are being
+ * read and comparing those with pages to invalidate.
+ */
+struct active_reader {
+	struct list_head head;
+	u64 seq;
+};
+
+#define INIT_ACTIVE_READER(rdr) \
+	struct active_reader rdr = { .head = LIST_HEAD_INIT(rdr.head) }
+
+static void add_active_reader(struct super_block *sb, struct active_reader *active)
+{
+	DECLARE_ITEM_CACHE_INFO(sb, cinf);
+
+	BUG_ON(!list_empty(&active->head));
+
+	active->seq = scoutfs_trans_sample_seq(sb);
+
+	spin_lock(&cinf->active_lock);
+	list_add_tail(&active->head, &cinf->active_list);
+	spin_unlock(&cinf->active_lock);
+}
+
+static u64 first_active_reader_seq(struct item_cache_info *cinf)
+{
+	struct active_reader *active;
+	u64 first;
+
+	/* readers add and delete list entries under active_lock, sample the head under it */
+	spin_lock(&cinf->active_lock);
+	active = list_first_entry_or_null(&cinf->active_list, struct active_reader, head);
+	first = active ? active->seq : U64_MAX;
+	spin_unlock(&cinf->active_lock);
+
+	return first;
+}
+
+static void del_active_reader(struct item_cache_info *cinf, struct active_reader *active)
+{
+	/* only the calling task adds or deletes this active */
+	if (!list_empty(&active->head)) {
+		spin_lock(&cinf->active_lock);
+		list_del_init(&active->head);
+		spin_unlock(&cinf->active_lock);
+	}
+}
+
 /*
 * Add a newly read item to the pages that we're assembling for
 * insertion into the cache.   These pages are private, they only exist
@@ -1377,34 +1450,24 @@ static int read_page_item(struct super_block *sb, struct scoutfs_key *key, u64 s
 * and duplicates, we insert any resulting pages which don't overlap
 * with existing cached pages.
 *
- * The forest item reader is reading stable trees that could be
- * overwritten.  It can return -ESTALE which we return to the caller who
- * will retry the operation and work with a new set of more recent
- * btrees.
- *
 * We only insert uncached regions because this is called with cluster
 * locks held, but without locking the cache.  The regions we read can
 * be stale with respect to the current cache, which can be read and
 * dirtied by other cluster lock holders on our node, but the cluster
- * locks protect the stable items we read.
+ * locks protect the stable items we read.  Invalidation is careful not
+ * to drop pages that have items that we couldn't see because they were
+ * dirty when we started reading.
 *
- * Using the presence of locally written dirty pages to override stale
- * read pages only works if, well, the more recent locally written pages
- * are still present.  Readers are totally decoupled from writers and
- * can have a set of items that is very old indeed.  In the mean time
- * more recent items would have been dirtied locally, committed,
- * cleaned, and reclaimed.  We have a coarse barrier which ensures that
- * readers can't insert items read from old roots from before local data
- * was written.  If a write completes while a read is in progress the
- * read will have to retry.  The retried read can use cached blocks so
- * we're relying on reads being much faster than writes to reduce the
- * overhead to mostly cpu work of recollecting the items from cached
- * blocks via a more recent root from the server.
+ * The forest item reader is reading stable trees that could be
+ * overwritten.  It can return -ESTALE which we return to the caller who
+ * will retry the operation and work with a new set of more recent
+ * btrees.
 */
 static int read_pages(struct super_block *sb, struct item_cache_info *cinf,
 		      struct scoutfs_key *key, struct scoutfs_lock *lock)
 {
 	struct rb_root root = RB_ROOT;
+	INIT_ACTIVE_READER(active);
 	struct cached_page *right = NULL;
 	struct cached_page *pg;
 	struct cached_page *rd;
@@ -1417,7 +1480,6 @@ static int read_pages(struct super_block *sb, struct item_cache_info *cinf,
 	struct rb_node *par;
 	struct rb_node *pg_tmp;
 	struct rb_node *item_tmp;
-	u64 rdbar;
 	int pgi;
 	int ret;

@@ -1431,7 +1493,8 @@ static int read_pages(struct super_block *sb, struct item_cache_info *cinf,
 	pg->end = lock->end;
 	rbtree_insert(&pg->node, NULL, &root.rb_node, &root);

-	rdbar = atomic64_read(&cinf->read_dirty_barrier);
+	/* set active reader seq before reading persistent roots */
+	add_active_reader(sb, &active);

 	start = lock->start;
 	end = lock->end;
@@ -1470,19 +1533,11 @@ static int read_pages(struct super_block *sb, struct item_cache_info *cinf,
 retry:
 	write_lock(&cinf->rwlock);

-	ret = 0;
 	while ((rd = first_page(&root))) {

 		pg = page_rbtree_walk(sb, &cinf->pg_root, &rd->start, &rd->end,
 				      NULL, NULL, &par, &pnode);
 		if (!pg) {
-			/* can't insert if write is cleaning (write_lock is read barrier) */
-			if (atomic64_read(&cinf->read_dirty_barrier) != rdbar) {
-				scoutfs_inc_counter(sb, item_read_pages_barrier);
-				ret = -ESTALE;
-				break;
-			}
-
 			/* insert read pages that don't intersect */
 			rbtree_erase(&rd->node, &root);
 			rbtree_insert(&rd->node, par, pnode, &cinf->pg_root);
@@ -1517,7 +1572,10 @@ retry:

 	write_unlock(&cinf->rwlock);

+	ret = 0;
 out:
+	del_active_reader(cinf, &active);
+
 	/* free any pages we left dangling on error */
 	for_each_page_safe(&root, rd, pg_tmp) {
 		rbtree_erase(&rd->node, &root);
@@ -1577,7 +1635,6 @@ retry:
 			ret = read_pages(sb, cinf, key, lock);
 		if (ret < 0 && ret != -ESTALE)
 			goto out;
-		scoutfs_inc_counter(sb, item_read_pages_retry);
 		goto retry;
 	}

@@ -2344,12 +2401,6 @@ out:
 * The caller has successfully committed all the dirty btree blocks that
 * contained the currently dirty items.  Clear all the dirty items and
 * pages.
- *
- * This strange lock/trylock loop comes from sparse issuing spurious
- * mismatched context warnings if we do anything (like unlock and relax)
- * in the else branch of the failed trylock.  We're jumping through
- * hoops to not use the else but still drop and reacquire the dirty_lock
- * if the trylock fails.
 */
 int scoutfs_item_write_done(struct super_block *sb)
 {
@@ -2358,34 +2409,40 @@ int scoutfs_item_write_done(struct super_block *sb)
 	struct cached_item *tmp;
 	struct cached_page *pg;

-	/* don't let read_pages insert possibly stale items */
-	atomic64_inc(&cinf->read_dirty_barrier);
-	smp_mb__after_atomic();
-
+retry:
 	spin_lock(&cinf->dirty_lock);
-	while ((pg = list_first_entry_or_null(&cinf->dirty_list, struct cached_page, dirty_head))) {
-		if (write_trylock(&pg->rwlock)) {
+
+	while ((pg = list_first_entry_or_null(&cinf->dirty_list,
+					      struct cached_page,
+					      dirty_head))) {
+
+		if (!write_trylock(&pg->rwlock)) {
 			spin_unlock(&cinf->dirty_lock);
-			list_for_each_entry_safe(item, tmp, &pg->dirty_list,
-						 dirty_head) {
-				clear_item_dirty(sb, cinf, pg, item);
-
-				if (item->delta)
-					scoutfs_inc_counter(sb, item_delta_written);
-
-				/* free deletion items */
-				if (item->deletion || item->delta)
-					erase_item(pg, item);
-				else
-					item->persistent = 1;
-			}
-
-			write_unlock(&pg->rwlock);
-			spin_lock(&cinf->dirty_lock);
+			cpu_relax();
+			goto retry;
 		}
+
 		spin_unlock(&cinf->dirty_lock);
+
+		list_for_each_entry_safe(item, tmp, &pg->dirty_list,
+					 dirty_head) {
+			clear_item_dirty(sb, cinf, pg, item);
+
+			if (item->delta)
+				scoutfs_inc_counter(sb, item_delta_written);
+
+			/* free deletion items */
+			if (item->deletion || item->delta)
+				erase_item(pg, item);
+			else
+				item->persistent = 1;
+		}
+
+		write_unlock(&pg->rwlock);
+
 		spin_lock(&cinf->dirty_lock);
-	} while (pg);
+	}
+
 	spin_unlock(&cinf->dirty_lock);

 	return 0;
@@ -2540,15 +2597,24 @@ static unsigned long item_cache_scan_objects(struct shrinker *shrink,
 	struct cached_page *tmp;
 	struct cached_page *pg;
 	unsigned long freed = 0;
+	u64 first_reader_seq;
 	int nr = sc->nr_to_scan;

 	scoutfs_inc_counter(sb, item_cache_scan_objects);

+	/* can't invalidate pages with items that weren't visible to first reader */
+	first_reader_seq = first_active_reader_seq(cinf);
+
 	write_lock(&cinf->rwlock);
 	spin_lock(&cinf->lru_lock);

 	list_for_each_entry_safe(pg, tmp, &cinf->lru_list, lru_head) {

+		if (first_reader_seq <= pg->max_seq) {
+			scoutfs_inc_counter(sb, item_shrink_page_reader);
+			continue;
+		}
+
 		if (!write_trylock(&pg->rwlock)) {
 			scoutfs_inc_counter(sb, item_shrink_page_trylock);
 			continue;
@@ -2615,7 +2681,8 @@ int scoutfs_item_setup(struct super_block *sb)
 	atomic_set(&cinf->dirty_pages, 0);
 	spin_lock_init(&cinf->lru_lock);
 	INIT_LIST_HEAD(&cinf->lru_list);
-	atomic64_set(&cinf->read_dirty_barrier, 0);
+	spin_lock_init(&cinf->active_lock);
+	INIT_LIST_HEAD(&cinf->active_list);

 	cinf->pcpu_pages = alloc_percpu(struct item_percpu_pages);
 	if (!cinf->pcpu_pages)
@@ -2648,6 +2715,8 @@ void scoutfs_item_destroy(struct super_block *sb)
 	int cpu;

 	if (cinf) {
+		BUG_ON(!list_empty(&cinf->active_list));
+
 #ifdef KC_CPU_NOTIFIER
 		unregister_hotcpu_notifier(&cinf->notifier);
 #endif
--- a/kmod/src/kernelcompat.c
+++ b/kmod/src/kernelcompat.c
@@ -81,69 +81,3 @@ kc_generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 	return written ? written : status;
 }
 #endif
-
-#include <linux/list_lru.h>
-
-#ifdef KC_LIST_LRU_WALK_CB_ITEM_LOCK
-static enum lru_status kc_isolate(struct list_head *item, spinlock_t *lock, void *cb_arg)
-{
-	struct kc_isolate_args *args = cb_arg;
-
-	/* isolate doesn't use list, nr_items updated in caller */
-	return args->isolate(item, NULL, args->cb_arg);
-}
-
-unsigned long kc_list_lru_walk(struct list_lru *lru, kc_list_lru_walk_cb_t isolate, void *cb_arg,
-				      unsigned long nr_to_walk)
-{
-	struct kc_isolate_args args = {
-		.isolate = isolate,
-		.cb_arg = cb_arg,
-	};
-
-	return list_lru_walk(lru, kc_isolate, &args, nr_to_walk);
-}
-
-unsigned long kc_list_lru_shrink_walk(struct list_lru *lru, struct shrink_control *sc,
-				      kc_list_lru_walk_cb_t isolate, void *cb_arg)
-{
-	struct kc_isolate_args args = {
-		.isolate = isolate,
-		.cb_arg = cb_arg,
-	};
-
-	return list_lru_shrink_walk(lru, sc, kc_isolate, &args);
-}
-#endif
-
-#ifdef KC_LIST_LRU_WALK_CB_LIST_LOCK
-static enum lru_status kc_isolate(struct list_head *item, struct list_lru_one *list,
-				  spinlock_t *lock, void *cb_arg)
-{
-	struct kc_isolate_args *args = cb_arg;
-
-	return args->isolate(item, list, args->cb_arg);
-}
-
-unsigned long kc_list_lru_walk(struct list_lru *lru, kc_list_lru_walk_cb_t isolate, void *cb_arg,
-				      unsigned long nr_to_walk)
-{
-	struct kc_isolate_args args = {
-		.isolate = isolate,
-		.cb_arg = cb_arg,
-	};
-
-	return list_lru_walk(lru, kc_isolate, &args, nr_to_walk);
-}
-unsigned long kc_list_lru_shrink_walk(struct list_lru *lru, struct shrink_control *sc,
-				      kc_list_lru_walk_cb_t isolate, void *cb_arg)
-{
-	struct kc_isolate_args args = {
-		.isolate = isolate,
-		.cb_arg = cb_arg,
-	};
-
-	return list_lru_shrink_walk(lru, sc, kc_isolate, &args);
-}
-
-#endif
--- a/kmod/src/kernelcompat.h
+++ b/kmod/src/kernelcompat.h
@@ -29,6 +29,50 @@ do {						\
 })
 #endif

+#ifndef KC_ITERATE_DIR_CONTEXT
+typedef filldir_t kc_readdir_ctx_t;
+#define KC_DECLARE_READDIR(name, file, dirent, ctx) name(file, dirent, ctx)
+#define KC_FOP_READDIR readdir
+#define kc_readdir_pos(filp, ctx) (filp)->f_pos
+#define kc_dir_emit_dots(file, dirent, ctx) dir_emit_dots(file, dirent, ctx)
+#define kc_dir_emit(ctx, dirent, name, name_len, pos, ino, dt) \
+	(ctx(dirent, name, name_len, pos, ino, dt) == 0)
+#else
+typedef struct dir_context * kc_readdir_ctx_t;
+#define KC_DECLARE_READDIR(name, file, dirent, ctx) name(file, ctx)
+#define KC_FOP_READDIR iterate
+#define kc_readdir_pos(filp, ctx) (ctx)->pos
+#define kc_dir_emit_dots(file, dirent, ctx) dir_emit_dots(file, ctx)
+#define kc_dir_emit(ctx, dirent, name, name_len, pos, ino, dt) \
+	dir_emit(ctx, name, name_len, ino, dt)
+#endif
+
+#ifndef KC_DIR_EMIT_DOTS
+/*
+ * Kernels from before ->iterate don't have dir_emit_dots, so we give
+ * them one that works with the ->readdir() filldir() method.
+ */
+static inline int dir_emit_dots(struct file *file, void *dirent,
+				filldir_t filldir)
+{
+	if (file->f_pos == 0) {
+		if (filldir(dirent, ".", 1, 1,
+			    file->f_path.dentry->d_inode->i_ino, DT_DIR))
+			return 0;
+		file->f_pos = 1;
+	}
+
+	if (file->f_pos == 1) {
+		if (filldir(dirent, "..", 2, 1,
+			    parent_ino(file->f_path.dentry), DT_DIR))
+			return 0;
+		file->f_pos = 2;
+	}
+
+	return 1;
+}
+#endif
+
 #ifdef KC_POSIX_ACL_VALID_USER_NS
 #define kc_posix_acl_valid(user_ns, acl) posix_acl_valid(user_ns, acl)
 #else
@@ -394,67 +438,4 @@ static inline int kc_tcp_sock_set_nodelay(struct socket *sock)
 }
 #endif

-#ifdef KC_INODE_DIO_END
-#define kc_inode_dio_end inode_dio_end
-#else
-#define kc_inode_dio_end inode_dio_done
-#endif
-
-#ifndef KC_MM_VM_FAULT_T
-typedef unsigned int vm_fault_t;
-static inline vm_fault_t vmf_error(int err)
-{
-	if (err == -ENOMEM)
-		return VM_FAULT_OOM;
-	return VM_FAULT_SIGBUS;
-}
-#endif
-
-#include <linux/list_lru.h>
-
-#ifndef KC_LIST_LRU_SHRINK_COUNT_WALK
-/* we don't bother with sc->{nid,memcg} (which doesn't exist in oldest kernels) */
-static inline unsigned long list_lru_shrink_count(struct list_lru *lru,
-                                                  struct shrink_control *sc)
-{
-        return list_lru_count(lru);
-}
-static inline unsigned long
-list_lru_shrink_walk(struct list_lru *lru, struct shrink_control *sc,
-		     list_lru_walk_cb isolate, void *cb_arg)
-{
-	return list_lru_walk(lru, isolate, cb_arg, sc->nr_to_scan);
-}
-#endif
-
-#ifndef KC_LIST_LRU_ADD_OBJ
-#define list_lru_add_obj list_lru_add
-#define list_lru_del_obj list_lru_del
-#endif
-
-#if defined(KC_LIST_LRU_WALK_CB_LIST_LOCK) || defined(KC_LIST_LRU_WALK_CB_ITEM_LOCK)
-struct list_lru_one;
-typedef enum lru_status (*kc_list_lru_walk_cb_t)(struct list_head *item, struct list_lru_one *list,
-						 void *cb_arg);
-struct kc_isolate_args {
-	kc_list_lru_walk_cb_t isolate;
-	void *cb_arg;
-};
-unsigned long kc_list_lru_walk(struct list_lru *lru, kc_list_lru_walk_cb_t isolate, void *cb_arg,
-			       unsigned long nr_to_walk);
-unsigned long kc_list_lru_shrink_walk(struct list_lru *lru, struct shrink_control *sc,
-				      kc_list_lru_walk_cb_t isolate, void *cb_arg);
-#else
-#define kc_list_lru_shrink_walk list_lru_shrink_walk
-#endif
-
-#if defined(KC_LIST_LRU_WALK_CB_ITEM_LOCK)
-/* isolate moved by hand, nr_items updated in walk as _REMOVE returned */
-static inline void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item,
-					 struct list_head *head)
-{
-        list_move(item, head);
-}
-#endif
-
 #endif
--- a/kmod/src/lock.c
+++ b/kmod/src/lock.c
@@ -168,6 +168,7 @@ static int lock_invalidate(struct super_block *sb, struct scoutfs_lock *lock,
 			   enum scoutfs_lock_mode prev, enum scoutfs_lock_mode mode)
 {
 	struct scoutfs_lock_coverage *cov;
+	struct scoutfs_lock_coverage *tmp;
 	u64 ino, last;
 	int ret = 0;

@@ -191,22 +192,19 @@ static int lock_invalidate(struct super_block *sb, struct scoutfs_lock *lock,

 	/* have to invalidate if we're not in the only usable case */
 	if (!(prev == SCOUTFS_LOCK_WRITE && mode == SCOUTFS_LOCK_READ)) {
-		/*
-		 * Remove cov items to tell users that their cache is
-		 * stale.  The unlock pattern comes from avoiding bad
-		 * sparse warnings when taking else in a failed trylock.
-		 */
+retry:
+		/* remove cov items to tell users that their cache is stale */
 		spin_lock(&lock->cov_list_lock);
-		while ((cov = list_first_entry_or_null(&lock->cov_list,
-						       struct scoutfs_lock_coverage, head))) {
-			if (spin_trylock(&cov->cov_lock)) {
-				list_del_init(&cov->head);
-				cov->lock = NULL;
-				spin_unlock(&cov->cov_lock);
-				scoutfs_inc_counter(sb, lock_invalidate_coverage);
+		list_for_each_entry_safe(cov, tmp, &lock->cov_list, head) {
+			if (!spin_trylock(&cov->cov_lock)) {
+				spin_unlock(&lock->cov_list_lock);
+				cpu_relax();
+				goto retry;
 			}
-			spin_unlock(&lock->cov_list_lock);
-			spin_lock(&lock->cov_list_lock);
+			list_del_init(&cov->head);
+			cov->lock = NULL;
+			spin_unlock(&cov->cov_lock);
+			scoutfs_inc_counter(sb, lock_invalidate_coverage);
 		}
 		spin_unlock(&lock->cov_list_lock);

@@ -304,7 +302,6 @@ static void lock_inc_count(unsigned int *counts, enum scoutfs_lock_mode mode)
 static void lock_dec_count(unsigned int *counts, enum scoutfs_lock_mode mode)
 {
 	BUG_ON(mode < 0 || mode >= SCOUTFS_LOCK_NR_MODES);
-	BUG_ON(counts[mode] == 0);
 	counts[mode]--;
 }

--- a/kmod/src/net.c
+++ b/kmod/src/net.c
@@ -20,7 +20,6 @@
 #include <net/sock.h>
 #include <net/tcp.h>
 #include <linux/log2.h>
-#include <linux/jhash.h>

 #include "format.h"
 #include "counters.h"
@@ -32,7 +31,6 @@
 #include "endian_swap.h"
 #include "tseq.h"
 #include "fence.h"
-#include "options.h"

 /*
 * scoutfs networking delivers requests and responses between nodes.
@@ -136,7 +134,6 @@ struct message_send {
 struct message_recv {
 	struct scoutfs_tseq_entry tseq_entry;
 	struct work_struct proc_work;
-	struct list_head ordered_head;
 	struct scoutfs_net_connection *conn;
 	struct scoutfs_net_header nh;
 };
@@ -335,7 +332,7 @@ static int submit_send(struct super_block *sb,
 		return -EINVAL;

 	if (scoutfs_forcing_unmount(sb))
-		return -ENOLINK;
+		return -EIO;

 	msend = kmalloc(offsetof(struct message_send,
 				 nh.data[data_len]), GFP_NOFS);
@@ -501,51 +498,6 @@ static void scoutfs_net_proc_worker(struct work_struct *work)
 	trace_scoutfs_net_proc_work_exit(sb, 0, ret);
 }

-static void scoutfs_net_ordered_proc_worker(struct work_struct *work)
-{
-	struct scoutfs_work_list *wlist = container_of(work, struct scoutfs_work_list, work);
-	struct message_recv *mrecv;
-	struct message_recv *mrecv__;
-	LIST_HEAD(list);
-
-	spin_lock(&wlist->lock);
-	list_splice_init(&wlist->list, &list);
-	spin_unlock(&wlist->lock);
-
-	list_for_each_entry_safe(mrecv, mrecv__, &list, ordered_head) {
-		list_del_init(&mrecv->ordered_head);
-		scoutfs_net_proc_worker(&mrecv->proc_work);
-	}
-}
-
-/*
- * Some messages require in-order processing.  But the scope of the
- * ordering isn't global.  In the case of lock messages, it's per lock.
- * So for these messages we hash them to a number of ordered workers who
- * walk a list and call the usual work function in order.  This replaced
- * first the proc work detecting OOO and re-ordering, and then only
- * calling proc from the one recv work context.
- */
-static void queue_ordered_proc(struct scoutfs_net_connection *conn, struct message_recv *mrecv)
-{
-	struct scoutfs_work_list *wlist;
-	struct scoutfs_net_lock *nl;
-	u32 h;
-
-	if (WARN_ON_ONCE(mrecv->nh.cmd != SCOUTFS_NET_CMD_LOCK ||
-		         le16_to_cpu(mrecv->nh.data_len) != sizeof(struct scoutfs_net_lock)))
-		return scoutfs_net_proc_worker(&mrecv->proc_work);
-
-	nl = (void *)mrecv->nh.data;
-	h = jhash(&nl->key, sizeof(struct scoutfs_key), 0x6fdd3cd5);
-	wlist = &conn->ordered_proc_wlists[h % conn->ordered_proc_nr];
-
-	spin_lock(&wlist->lock);
-	list_add_tail(&mrecv->ordered_head, &wlist->list);
-	spin_unlock(&wlist->lock);
-	queue_work(conn->workq, &wlist->work);
-}
-
 /*
 * Free live responses up to and including the seq by marking them dead
 * and moving them to the send queue to be freed.
@@ -589,17 +541,33 @@ static void free_acked_responses(struct scoutfs_net_connection *conn, u64 seq)
 		queue_work(conn->workq, &conn->send_work);
 }

-static int k_recvmsg(struct socket *sock, void *buf, unsigned len)
+static int recvmsg_full(struct socket *sock, void *buf, unsigned len)
 {
-	struct kvec kv = {
-		.iov_base = buf,
-		.iov_len = len,
-	};
-	struct msghdr msg = {
-		.msg_flags = MSG_NOSIGNAL,
-	};
+	struct msghdr msg;
+	struct kvec kv;
+	int ret;

-	return kernel_recvmsg(sock, &msg, &kv, 1, len, msg.msg_flags);
+	while (len) {
+		memset(&msg, 0, sizeof(msg));
+		msg.msg_flags = MSG_NOSIGNAL;
+		kv.iov_base = buf;
+		kv.iov_len = len;
+
+#ifndef KC_MSGHDR_STRUCT_IOV_ITER
+		msg.msg_iov = (struct iovec *)&kv;
+		msg.msg_iovlen = 1;
+#else
+		iov_iter_init(&msg.msg_iter, READ, (struct iovec *)&kv, len, 1);
+#endif
+		ret = kernel_recvmsg(sock, &msg, &kv, 1, len, msg.msg_flags);
+		if (ret <= 0)
+			return -ECONNABORTED;
+
+		len -= ret;
+		buf += ret;
+	}
+
+	return 0;
 }

 static bool invalid_message(struct scoutfs_net_connection *conn,
@@ -636,72 +604,6 @@ static bool invalid_message(struct scoutfs_net_connection *conn,
 	return false;
 }

-static int recv_one_message(struct super_block *sb, struct net_info *ninf,
-			    struct scoutfs_net_connection *conn, struct scoutfs_net_header *nh,
-			    unsigned int data_len)
-{
-	struct message_recv *mrecv;
-	int ret;
-
-	scoutfs_inc_counter(sb, net_recv_messages);
-	scoutfs_add_counter(sb, net_recv_bytes, nh_bytes(data_len));
-	trace_scoutfs_net_recv_message(sb, &conn->sockname, &conn->peername, nh);
-
-	/* caller's invalid message checked data len */
-	mrecv = kmalloc(offsetof(struct message_recv, nh.data[data_len]), GFP_NOFS);
-	if (!mrecv) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	mrecv->conn = conn;
-	INIT_WORK(&mrecv->proc_work, scoutfs_net_proc_worker);
-	INIT_LIST_HEAD(&mrecv->ordered_head);
-	mrecv->nh = *nh;
-	if (data_len)
-		memcpy(mrecv->nh.data, (nh + 1), data_len);
-
-	if (nh->cmd == SCOUTFS_NET_CMD_GREETING) {
-		/* greetings are out of band, no seq mechanics */
-		set_conn_fl(conn, saw_greeting);
-
-	} else if (le64_to_cpu(nh->seq) <=
-		   atomic64_read(&conn->recv_seq)) {
-		/* drop any resent duplicated messages */
-		scoutfs_inc_counter(sb, net_recv_dropped_duplicate);
-		kfree(mrecv);
-		ret = 0;
-		goto out;
-
-	} else {
-		/* record that we've received sender's seq */
-		atomic64_set(&conn->recv_seq, le64_to_cpu(nh->seq));
-		/* and free our responses that sender has received */
-		free_acked_responses(conn, le64_to_cpu(nh->recv_seq));
-	}
-
-	scoutfs_tseq_add(&ninf->msg_tseq_tree, &mrecv->tseq_entry);
-
-	/*
-	 * Initial received greetings are processed inline
-	 * before any other incoming messages.
-	 *
-	 * Incoming requests or responses to the lock client
-	 * can't handle re-ordering, so they're queued to
-	 * ordered receive processing work.
-	 */
-	if (nh->cmd == SCOUTFS_NET_CMD_GREETING)
-		scoutfs_net_proc_worker(&mrecv->proc_work);
-	else if (nh->cmd == SCOUTFS_NET_CMD_LOCK && !conn->listening_conn)
-		queue_ordered_proc(conn, mrecv);
-	else
-		queue_work(conn->workq, &mrecv->proc_work);
-	ret = 0;
-
-out:
-	return ret;
-}
-
 /*
 * Always block receiving from the socket.  Errors trigger shutting down
 * the connection.
@@ -712,72 +614,86 @@ static void scoutfs_net_recv_worker(struct work_struct *work)
 	struct super_block *sb = conn->sb;
 	struct net_info *ninf = SCOUTFS_SB(sb)->net_info;
 	struct socket *sock = conn->sock;
-	struct scoutfs_net_header *nh;
-	struct page *page = NULL;
+	struct scoutfs_net_header nh;
+	struct message_recv *mrecv;
 	unsigned int data_len;
-	int hdr_off;
-	int rx_off;
-	int size;
 	int ret;

 	trace_scoutfs_net_recv_work_enter(sb, 0, 0);

-	page = alloc_page(GFP_NOFS);
-	if (!page) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	hdr_off = 0;
-	rx_off = 0;
-
 	for (;;) {
 		/* receive the header */
-		ret = k_recvmsg(sock, page_address(page) + rx_off, PAGE_SIZE - rx_off);
-		if (ret <= 0) {
-			ret = -ECONNABORTED;
-			goto out;
+		ret = recvmsg_full(sock, &nh, sizeof(nh));
+		if (ret)
+			break;
+
+		/* receiving an invalid message breaks the connection */
+		if (invalid_message(conn, &nh)) {
+			scoutfs_inc_counter(sb, net_recv_invalid_message);
+			ret = -EBADMSG;
+			break;
 		}

-		rx_off += ret;
+		data_len = le16_to_cpu(nh.data_len);

-		for (;;) {
-			size = rx_off - hdr_off;
-			if (size < sizeof(struct scoutfs_net_header))
-				break;
+		scoutfs_inc_counter(sb, net_recv_messages);
+		scoutfs_add_counter(sb, net_recv_bytes, nh_bytes(data_len));
+		trace_scoutfs_net_recv_message(sb, &conn->sockname,
+					       &conn->peername, &nh);

-			nh = page_address(page) + hdr_off;
-
-			/* receiving an invalid message breaks the connection */
-			if (invalid_message(conn, nh)) {
-				scoutfs_inc_counter(sb, net_recv_invalid_message);
-				ret = -EBADMSG;
-				break;
-			}
-
-			data_len = le16_to_cpu(nh->data_len);
-			if (sizeof(struct scoutfs_net_header) + data_len > size)
-				break;
-
-			ret = recv_one_message(sb, ninf, conn, nh, data_len);
-			if (ret < 0)
-				goto out;
-
-			hdr_off += sizeof(struct scoutfs_net_header) + data_len;
+		/* invalid_message() above already checked data_len */
+		mrecv = kmalloc(offsetof(struct message_recv,
+					 nh.data[data_len]), GFP_NOFS);
+		if (!mrecv) {
+			ret = -ENOMEM;
+			break;
 		}

-		if ((PAGE_SIZE - rx_off) <
-		    (sizeof(struct scoutfs_net_header) + SCOUTFS_NET_MAX_DATA_LEN)) {
-			if (size)
-				memmove(page_address(page), page_address(page) + hdr_off, size);
-			hdr_off = 0;
-			rx_off = size;
+		mrecv->conn = conn;
+		INIT_WORK(&mrecv->proc_work, scoutfs_net_proc_worker);
+		mrecv->nh = nh;
+
+		/* receive the data payload */
+		ret = recvmsg_full(sock, mrecv->nh.data, data_len);
+		if (ret) {
+			kfree(mrecv);
+			break;
 		}
+
+		if (nh.cmd == SCOUTFS_NET_CMD_GREETING) {
+			/* greetings are out of band, no seq mechanics */
+			set_conn_fl(conn, saw_greeting);
+
+		} else if (le64_to_cpu(nh.seq) <=
+			   atomic64_read(&conn->recv_seq)) {
+			/* drop any resent duplicated messages */
+			scoutfs_inc_counter(sb, net_recv_dropped_duplicate);
+			kfree(mrecv);
+			continue;
+
+		} else {
+			/* record that we've received sender's seq */
+			atomic64_set(&conn->recv_seq, le64_to_cpu(nh.seq));
+			/* and free our responses that sender has received */
+			free_acked_responses(conn, le64_to_cpu(nh.recv_seq));
+		}
+
+		scoutfs_tseq_add(&ninf->msg_tseq_tree, &mrecv->tseq_entry);
+
+		/*
+		 * Initial received greetings are processed
+		 * synchronously before any other incoming messages.
+		 *
+		 * Incoming requests or responses to the lock client are
+		 * processed synchronously to avoid reordering.
+		 */
+		if (nh.cmd == SCOUTFS_NET_CMD_GREETING ||
+		    (nh.cmd == SCOUTFS_NET_CMD_LOCK && !conn->listening_conn))
+			scoutfs_net_proc_worker(&mrecv->proc_work);
+		else
+			queue_work(conn->workq, &mrecv->proc_work);
 	}

-out:
-	__free_page(page);
-
 	if (ret)
 		scoutfs_inc_counter(sb, net_recv_error);

@@ -787,41 +703,33 @@ out:
 	trace_scoutfs_net_recv_work_exit(sb, 0, ret);
 }

-/*
- * This consumes the kvec.
- */
-static int k_sendmsg_full(struct socket *sock, struct kvec *kv, unsigned long nr_segs, size_t count)
+static int sendmsg_full(struct socket *sock, void *buf, unsigned len)
 {
-	int ret = 0;
+	struct msghdr msg;
+	struct kvec kv;
+	int ret;

-	while (count > 0) {
-		struct msghdr msg = {
-			.msg_flags = MSG_NOSIGNAL,
-		};
+	while (len) {
+		memset(&msg, 0, sizeof(msg));
+		msg.msg_flags = MSG_NOSIGNAL;
+		kv.iov_base = buf;
+		kv.iov_len = len;

-		ret = kernel_sendmsg(sock, &msg, kv, nr_segs, count);
-		if (ret <= 0) {
-			ret = -ECONNABORTED;
-			break;
-		}
+#ifndef KC_MSGHDR_STRUCT_IOV_ITER
+		msg.msg_iov = (struct iovec *)&kv;
+		msg.msg_iovlen = 1;
+#else
+		iov_iter_init(&msg.msg_iter, WRITE, (struct iovec *)&kv, len, 1);
+#endif
+		ret = kernel_sendmsg(sock, &msg, &kv, 1, len);
+		if (ret <= 0)
+			return -ECONNABORTED;

-		count -= ret;
-		if (count) {
-			while (nr_segs > 0 && ret >= kv->iov_len) {
-				ret -= kv->iov_len;
-				kv++;
-				nr_segs--;
-			}
-			if (nr_segs > 0 && ret > 0) {
-				kv->iov_base += ret;
-				kv->iov_len -= ret;
-			}
-			BUG_ON(nr_segs == 0);
-		}
-		ret = 0;
+		len -= ret;
+		buf += ret;
 	}
-	
-	return ret;
+
+	return 0;
 }

 static void free_msend(struct net_info *ninf, struct message_send *msend)
@@ -852,73 +760,54 @@ static void scoutfs_net_send_worker(struct work_struct *work)
 	struct super_block *sb = conn->sb;
 	struct net_info *ninf = SCOUTFS_SB(sb)->net_info;
 	struct message_send *msend;
-	struct message_send *_msend_;
-	struct kvec kv[16];
-	unsigned long nr_segs;
-	size_t count;
+	int ret = 0;
 	int len;
-	int ret;

 	trace_scoutfs_net_send_work_enter(sb, 0, 0);

-	for (;;) {
-		nr_segs = 0;
-		count = 0;
+	spin_lock(&conn->lock);

-		spin_lock(&conn->lock);
-		list_for_each_entry_safe(msend, _msend_, &conn->send_queue, head) {
-			if (msend->dead) {
-				free_msend(ninf, msend);
-				continue;
-			}
-
-			len = nh_bytes(le16_to_cpu(msend->nh.data_len));
-
-			if ((msend->nh.cmd == SCOUTFS_NET_CMD_FAREWELL) &&
-			    nh_is_response(&msend->nh)) {
-				set_conn_fl(conn, saw_farewell);
-			}
-
-			msend->nh.recv_seq = cpu_to_le64(atomic64_read(&conn->recv_seq));
-
-			scoutfs_inc_counter(sb, net_send_messages);
-			scoutfs_add_counter(sb, net_send_bytes, len);
-			trace_scoutfs_net_send_message(sb, &conn->sockname,
-						       &conn->peername, &msend->nh);
-
-			count += len;
-			kv[nr_segs].iov_base = &msend->nh;
-			kv[nr_segs].iov_len = len;
-			if (++nr_segs == ARRAY_SIZE(kv))
-				break;
+	while ((msend = list_first_entry_or_null(&conn->send_queue,
+						 struct message_send, head))) {

+		if (msend->dead) {
+			free_msend(ninf, msend);
+			continue;
 		}
+
+		if ((msend->nh.cmd == SCOUTFS_NET_CMD_FAREWELL) &&
+		    nh_is_response(&msend->nh)) {
+			set_conn_fl(conn, saw_farewell);
+		}
+
+		msend->nh.recv_seq =
+			cpu_to_le64(atomic64_read(&conn->recv_seq));
+
 		spin_unlock(&conn->lock);

-		if (nr_segs == 0) {
-			ret = 0;
-			goto out;
-		}
+		len = nh_bytes(le16_to_cpu(msend->nh.data_len));

-		ret = k_sendmsg_full(conn->sock, kv, nr_segs, count);
-		if (ret < 0)
-			goto out;
+		scoutfs_inc_counter(sb, net_send_messages);
+		scoutfs_add_counter(sb, net_send_bytes, len);
+		trace_scoutfs_net_send_message(sb, &conn->sockname,
+					       &conn->peername, &msend->nh);
+
+		ret = sendmsg_full(conn->sock, &msend->nh, len);

 		spin_lock(&conn->lock);
-		list_for_each_entry_safe(msend, _msend_, &conn->send_queue, head) {
-			msend->nh.recv_seq = 0;

-			/* resend if it wasn't freed while we sent */
-			if (!msend->dead)
-				list_move_tail(&msend->head, &conn->resend_queue);
+		msend->nh.recv_seq = 0;

-			if (--nr_segs == 0)
-				break;
-		}
-		spin_unlock(&conn->lock);
+		if (ret)
+			break;
+
+		/* resend if it wasn't freed while we sent */
+		if (!msend->dead)
+			list_move_tail(&msend->head, &conn->resend_queue);
 	}

-out:
+	spin_unlock(&conn->lock);
+
 	if (ret) {
 		scoutfs_inc_counter(sb, net_send_error);
 		shutdown_conn(conn);
@@ -973,7 +862,6 @@ static void scoutfs_net_destroy_worker(struct work_struct *work)
 	destroy_workqueue(conn->workq);
 	scoutfs_tseq_del(&ninf->conn_tseq_tree, &conn->tseq_entry);
 	kfree(conn->info);
-	kfree(conn->ordered_proc_wlists);
 	trace_scoutfs_conn_destroy_free(conn);
 	kfree(conn);

@@ -999,7 +887,7 @@ static void destroy_conn(struct scoutfs_net_connection *conn)
 * The TCP_KEEP* and TCP_USER_TIMEOUT option interaction is subtle.
 * TCP_USER_TIMEOUT only applies if there is unacked written data in the
 * send queue.  It doesn't work if the connection is idle.  Adding
- * keepalive probes with user_timeout set changes how the keepalive
+ * keepalive probes with user_timeout set changes how the keepalive
 * timeout is calculated.   CNT no longer matters.   Each time
 * additional probes (not the first) are sent the user timeout is
 * checked against the last time data was received.  If none of the
@@ -1011,16 +899,14 @@ static void destroy_conn(struct scoutfs_net_connection *conn)
 * elapses during the probe timer processing after the unsuccessful
 * probes.
 */
-static int sock_opts_and_names(struct super_block *sb,
-			       struct scoutfs_net_connection *conn,
+#define UNRESPONSIVE_TIMEOUT_SECS 10
+#define UNRESPONSIVE_PROBES 3
+static int sock_opts_and_names(struct scoutfs_net_connection *conn,
 			       struct socket *sock)
 {
-	struct scoutfs_mount_options opts;
 	int optval;
 	int ret;

-	scoutfs_options_read(sb, &opts);
-
 	/* we use a keepalive timeout instead of send timeout */
 	ret = kc_sock_set_sndtimeo(sock, 0);
 	if (ret)
@@ -1033,7 +919,8 @@ static int sock_opts_and_names(struct super_block *sb,
 	if (ret)
 		goto out;

-	optval = (opts.tcp_keepalive_timeout_ms / MSEC_PER_SEC) - UNRESPONSIVE_PROBES;
+	BUILD_BUG_ON(UNRESPONSIVE_PROBES >= UNRESPONSIVE_TIMEOUT_SECS);
+	optval = UNRESPONSIVE_TIMEOUT_SECS - (UNRESPONSIVE_PROBES);
 	ret = kc_tcp_sock_set_keepidle(sock, optval);
 	if (ret)
 		goto out;
@@ -1043,7 +930,7 @@ static int sock_opts_and_names(struct super_block *sb,
 	if (ret)
 		goto out;

-	optval = opts.tcp_keepalive_timeout_ms;
+	optval = UNRESPONSIVE_TIMEOUT_SECS * MSEC_PER_SEC;
 	ret = kc_tcp_sock_set_user_timeout(sock, optval);
 	if (ret)
 		goto out;
@@ -1111,7 +998,7 @@ static void scoutfs_net_listen_worker(struct work_struct *work)
 			continue;
 		}

-		ret = sock_opts_and_names(sb, acc_conn, acc_sock);
+		ret = sock_opts_and_names(acc_conn, acc_sock);
 		if (ret) {
 			sock_release(acc_sock);
 			destroy_conn(acc_conn);
@@ -1182,7 +1069,7 @@ static void scoutfs_net_connect_worker(struct work_struct *work)
 	if (ret)
 		goto out;

-	ret = sock_opts_and_names(sb, conn, sock);
+	ret = sock_opts_and_names(conn, sock);
 	if (ret)
 		goto out;

@@ -1443,30 +1330,25 @@ scoutfs_net_alloc_conn(struct super_block *sb,
 {
 	struct net_info *ninf = SCOUTFS_SB(sb)->net_info;
 	struct scoutfs_net_connection *conn;
-	unsigned int nr;
-	unsigned int i;
-
-	nr = min_t(unsigned int, num_possible_cpus(),
-		   PAGE_SIZE / sizeof(struct scoutfs_work_list));

 	conn = kzalloc(sizeof(struct scoutfs_net_connection), GFP_NOFS);
-	if (conn) {
-		if (info_size)
-			conn->info = kzalloc(info_size, GFP_NOFS);
-		conn->ordered_proc_wlists = kmalloc_array(nr, sizeof(struct scoutfs_work_list),
-							  GFP_NOFS);
-		conn->workq = alloc_workqueue("scoutfs_net_%s",
-					      WQ_UNBOUND | WQ_NON_REENTRANT, 0,
-					      name_suffix);
-	}
-	if (!conn || (info_size && !conn->info) || !conn->workq || !conn->ordered_proc_wlists) {
-		if (conn) {
-			kfree(conn->info);
-			kfree(conn->ordered_proc_wlists);
-			if (conn->workq)
-				destroy_workqueue(conn->workq);
+	if (!conn)
+		return NULL;
+
+	if (info_size) {
+		conn->info = kzalloc(info_size, GFP_NOFS);
+		if (!conn->info) {
 			kfree(conn);
+			return NULL;
 		}
+	}
+
+	conn->workq = alloc_workqueue("scoutfs_net_%s",
+				      WQ_UNBOUND | WQ_NON_REENTRANT, 0,
+				      name_suffix);
+	if (!conn->workq) {
+		kfree(conn->info);
+		kfree(conn);
 		return NULL;
 	}

@@ -1496,13 +1378,6 @@ scoutfs_net_alloc_conn(struct super_block *sb,
 	INIT_DELAYED_WORK(&conn->reconn_free_dwork,
 			  scoutfs_net_reconn_free_worker);

-	conn->ordered_proc_nr = nr;
-	for (i = 0; i < nr; i++) {
-		INIT_WORK(&conn->ordered_proc_wlists[i].work, scoutfs_net_ordered_proc_worker);
-		spin_lock_init(&conn->ordered_proc_wlists[i].lock);
-		INIT_LIST_HEAD(&conn->ordered_proc_wlists[i].list);
-	}
-
 	scoutfs_tseq_add(&ninf->conn_tseq_tree, &conn->tseq_entry);
 	trace_scoutfs_conn_alloc(conn);

--- a/kmod/src/net.h
+++ b/kmod/src/net.h
@@ -1,18 +1,10 @@
 #ifndef _SCOUTFS_NET_H_
 #define _SCOUTFS_NET_H_

-#include <linux/spinlock.h>
-#include <linux/list.h>
 #include <linux/in.h>
 #include "endian_swap.h"
 #include "tseq.h"

-struct scoutfs_work_list {
-	struct work_struct work;
-	spinlock_t lock;
-	struct list_head list;
-};
-
 struct scoutfs_net_connection;

 /* These are called in their own blocking context */
@@ -69,8 +61,6 @@ struct scoutfs_net_connection {
 	struct list_head resend_queue;

 	atomic64_t recv_seq;
-	unsigned int ordered_proc_nr;
-	struct scoutfs_work_list *ordered_proc_wlists;

 	struct workqueue_struct *workq;
 	struct work_struct listen_work;
--- a/kmod/src/options.c
+++ b/kmod/src/options.c
@@ -39,7 +39,6 @@ enum {
 	Opt_orphan_scan_delay_ms,
 	Opt_quorum_heartbeat_timeout_ms,
 	Opt_quorum_slot_nr,
-	Opt_tcp_keepalive_timeout_ms,
 	Opt_err,
 };

@@ -53,7 +52,6 @@ static const match_table_t tokens = {
 	{Opt_orphan_scan_delay_ms, "orphan_scan_delay_ms=%s"},
 	{Opt_quorum_heartbeat_timeout_ms, "quorum_heartbeat_timeout_ms=%s"},
 	{Opt_quorum_slot_nr, "quorum_slot_nr=%s"},
-	{Opt_tcp_keepalive_timeout_ms, "tcp_keepalive_timeout_ms=%s"},
 	{Opt_err, NULL}
 };

@@ -128,8 +126,6 @@ static void free_options(struct scoutfs_mount_options *opts)
 #define MIN_DATA_PREALLOC_BLOCKS	1ULL
 #define MAX_DATA_PREALLOC_BLOCKS	((unsigned long long)SCOUTFS_BLOCK_SM_MAX)

-#define DEFAULT_TCP_KEEPALIVE_TIMEOUT_MS	(10 * MSEC_PER_SEC)
-
 static void init_default_options(struct scoutfs_mount_options *opts)
 {
 	memset(opts, 0, sizeof(*opts));
@@ -140,7 +136,6 @@ static void init_default_options(struct scoutfs_mount_options *opts)
 	opts->orphan_scan_delay_ms = -1;
 	opts->quorum_heartbeat_timeout_ms = SCOUTFS_QUORUM_DEF_HB_TIMEO_MS;
 	opts->quorum_slot_nr = -1;
-	opts->tcp_keepalive_timeout_ms = DEFAULT_TCP_KEEPALIVE_TIMEOUT_MS;
 }

 static int verify_log_merge_wait_timeout_ms(struct super_block *sb, int ret, int val)
@@ -173,21 +168,6 @@ static int verify_quorum_heartbeat_timeout_ms(struct super_block *sb, int ret, u
 	return 0;
 }

-static int verify_tcp_keepalive_timeout_ms(struct super_block *sb, int ret, int val)
-{
-	if (ret < 0) {
-		scoutfs_err(sb, "failed to parse tcp_keepalive_timeout_ms value");
-		return -EINVAL;
-	}
-	if (val <= (UNRESPONSIVE_PROBES * MSEC_PER_SEC)) {
-		scoutfs_err(sb, "invalid tcp_keepalive_timeout_ms value %d, must be larger than %lu",
-			    val, (UNRESPONSIVE_PROBES * MSEC_PER_SEC));
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
 /*
 * Parse the option string into our options struct.   This can allocate
 * memory in the struct.  The caller is responsible for always calling
@@ -238,14 +218,6 @@ static int parse_options(struct super_block *sb, char *options, struct scoutfs_m
 			opts->data_prealloc_contig_only = nr;
 			break;

-		case Opt_tcp_keepalive_timeout_ms:
-			ret = match_int(args, &nr);
-			ret = verify_tcp_keepalive_timeout_ms(sb, ret, nr);
-			if (ret < 0)
-				return ret;
-			opts->tcp_keepalive_timeout_ms = nr;
-			break;
-
 		case Opt_log_merge_wait_timeout_ms:
 			ret = match_int(args, &nr);
 			ret = verify_log_merge_wait_timeout_ms(sb, ret, nr);
@@ -399,7 +371,6 @@ int scoutfs_options_show(struct seq_file *seq, struct dentry *root)
 	seq_printf(seq, ",orphan_scan_delay_ms=%u", opts.orphan_scan_delay_ms);
 	if (opts.quorum_slot_nr >= 0)
 		seq_printf(seq, ",quorum_slot_nr=%d", opts.quorum_slot_nr);
-	seq_printf(seq, ",tcp_keepalive_timeout_ms=%d", opts.tcp_keepalive_timeout_ms);

 	return 0;
 }
--- a/kmod/src/options.h
+++ b/kmod/src/options.h
@@ -13,11 +13,8 @@ struct scoutfs_mount_options {
 	unsigned int orphan_scan_delay_ms;
 	int quorum_slot_nr;
 	u64 quorum_heartbeat_timeout_ms;
-	int tcp_keepalive_timeout_ms;
 };

-#define UNRESPONSIVE_PROBES	3
-
 void scoutfs_options_read(struct super_block *sb, struct scoutfs_mount_options *opts);
 int scoutfs_options_show(struct seq_file *seq, struct dentry *root);

--- a/kmod/src/quorum.c
+++ b/kmod/src/quorum.c
@@ -243,6 +243,10 @@ static int send_msg_members(struct super_block *sb, int type, u64 term, int only
 	};
 	struct sockaddr_in sin;
 	struct msghdr mh = {
+#ifndef KC_MSGHDR_STRUCT_IOV_ITER
+		.msg_iov = (struct iovec *)&kv,
+		.msg_iovlen = 1,
+#endif
 		.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL,
 		.msg_name = &sin,
 		.msg_namelen = sizeof(sin),
@@ -264,7 +268,9 @@ static int send_msg_members(struct super_block *sb, int type, u64 term, int only

 		scoutfs_quorum_slot_sin(&qinf->qconf, i, &sin);
 		now = ktime_get();
-
+#ifdef KC_MSGHDR_STRUCT_IOV_ITER
+		iov_iter_init(&mh.msg_iter, WRITE, (struct iovec *)&kv, 1, sizeof(qmes));
+#endif
 		ret = kernel_sendmsg(qinf->sock, &mh, &kv, 1, kv.iov_len);
 		if (ret != kv.iov_len)
 			failed++;
@@ -306,6 +312,10 @@ static int recv_msg(struct super_block *sb, struct quorum_host_msg *msg,
 		.iov_len = sizeof(struct scoutfs_quorum_message),
 	};
 	struct msghdr mh = {
+#ifndef KC_MSGHDR_STRUCT_IOV_ITER
+		.msg_iov = (struct iovec *)&kv,
+		.msg_iovlen = 1,
+#endif
 		.msg_flags = MSG_NOSIGNAL,
 	};

@@ -323,6 +333,9 @@ static int recv_msg(struct super_block *sb, struct quorum_host_msg *msg,
 		ret = kc_tcp_sock_set_rcvtimeo(qinf->sock, rel_to);
 	}

+#ifdef KC_MSGHDR_STRUCT_IOV_ITER
+	iov_iter_init(&mh.msg_iter, READ, (struct iovec *)&kv, 1, sizeof(struct scoutfs_quorum_message));
+#endif
 	ret = kernel_recvmsg(qinf->sock, &mh, &kv, 1, kv.iov_len, mh.msg_flags);
 	if (ret < 0)
 		return ret;
@@ -713,8 +726,6 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 	struct quorum_status qst = {0,};
 	struct hb_recording hbr;
 	bool record_hb;
-	bool recv_failed;
-	bool initializing = true;
 	int ret;
 	int err;

@@ -747,8 +758,6 @@ static void scoutfs_quorum_worker(struct work_struct *work)

 		update_show_status(qinf, &qst);

-		recv_failed = false;
-
 		ret = recv_msg(sb, &msg, qst.timeout);
 		if (ret < 0) {
 			if (ret != -ETIMEDOUT && ret != -EAGAIN) {
@@ -756,9 +765,6 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 				scoutfs_inc_counter(sb, quorum_recv_error);
 				goto out;
 			}
-
-			recv_failed = true;
-
 			msg.type = SCOUTFS_QUORUM_MSG_INVALID;
 			ret = 0;
 		}
@@ -816,13 +822,12 @@ static void scoutfs_quorum_worker(struct work_struct *work)

 		/* followers and candidates start new election on timeout */
 		if (qst.role != LEADER &&
-		    (initializing || recv_failed) &&
 		    ktime_after(ktime_get(), qst.timeout)) {
 			/* .. but only if their server has stopped */
 			if (!scoutfs_server_is_down(sb)) {
 				qst.timeout = election_timeout();
 				scoutfs_inc_counter(sb, quorum_candidate_server_stopping);
-				goto again;
+				continue;
 			}

 			qst.role = CANDIDATE;
@@ -959,9 +964,6 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 		}

 		record_hb_delay(sb, qinf, &hbr, record_hb, qst.role);
-
-again:
-		initializing = false;
 	}

 	update_show_status(qinf, &qst);
--- a/kmod/src/scoutfs_trace.h
+++ b/kmod/src/scoutfs_trace.h
@@ -286,52 +286,6 @@ TRACE_EVENT(scoutfs_data_alloc_block_enter,
 		  STE_ENTRY_ARGS(ext))
 );

-TRACE_EVENT(scoutfs_data_page_mkwrite,
-	TP_PROTO(struct super_block *sb, __u64 ino, __u64 pos, __u32 ret),
-
-	TP_ARGS(sb, ino, pos, ret),
-
-	TP_STRUCT__entry(
-		SCSB_TRACE_FIELDS
-		__field(__u64, ino)
-		__field(__u64, pos)
-		__field(__u32, ret)
-	),
-
-	TP_fast_assign(
-		SCSB_TRACE_ASSIGN(sb);
-		__entry->ino = ino;
-		__entry->pos = pos;
-		__entry->ret = ret;
-	),
-
-	TP_printk(SCSBF" ino %llu pos %llu ret %u ",
-		  SCSB_TRACE_ARGS, __entry->ino, __entry->pos, __entry->ret)
-);
-
-TRACE_EVENT(scoutfs_data_filemap_fault,
-	TP_PROTO(struct super_block *sb, __u64 ino, __u64 pos, __u32 ret),
-
-	TP_ARGS(sb, ino, pos, ret),
-
-	TP_STRUCT__entry(
-		SCSB_TRACE_FIELDS
-		__field(__u64, ino)
-		__field(__u64, pos)
-		__field(__u32, ret)
-	),
-
-	TP_fast_assign(
-		SCSB_TRACE_ASSIGN(sb);
-		__entry->ino = ino;
-		__entry->pos = pos;
-		__entry->ret = ret;
-	),
-
-	TP_printk(SCSBF" ino %llu pos %llu ret %u ",
-		  SCSB_TRACE_ARGS, __entry->ino, __entry->pos, __entry->ret)
-);
-
 DECLARE_EVENT_CLASS(scoutfs_data_file_extent_class,
 	TP_PROTO(struct super_block *sb, __u64 ino, struct scoutfs_extent *ext),

@@ -823,14 +777,13 @@ DEFINE_EVENT(scoutfs_lock_info_class, scoutfs_lock_destroy,
 );

 TRACE_EVENT(scoutfs_xattr_set,
-	TP_PROTO(struct super_block *sb, __u64 ino, size_t name_len,
-		 const void *value, size_t size, int flags),
+	TP_PROTO(struct super_block *sb, size_t name_len, const void *value,
+		 size_t size, int flags),

-	TP_ARGS(sb, ino, name_len, value, size, flags),
+	TP_ARGS(sb, name_len, value, size, flags),

 	TP_STRUCT__entry(
 		SCSB_TRACE_FIELDS
-		__field(__u64, ino)
 		__field(size_t, name_len)
 		__field(const void *, value)
 		__field(size_t, size)
@@ -839,16 +792,15 @@ TRACE_EVENT(scoutfs_xattr_set,

 	TP_fast_assign(
 		SCSB_TRACE_ASSIGN(sb);
-		__entry->ino = ino;
 		__entry->name_len = name_len;
 		__entry->value = value;
 		__entry->size = size;
 		__entry->flags = flags;
 	),

-	TP_printk(SCSBF" ino %llu name_len %zu value %p size %zu flags 0x%x",
-		  SCSB_TRACE_ARGS, __entry->ino,  __entry->name_len,
-		  __entry->value, __entry->size, __entry->flags)
+	TP_printk(SCSBF" name_len %zu value %p size %zu flags 0x%x",
+		  SCSB_TRACE_ARGS, __entry->name_len, __entry->value,
+		  __entry->size, __entry->flags)
 );

 TRACE_EVENT(scoutfs_advance_dirty_super,
@@ -1968,17 +1920,15 @@ DEFINE_EVENT(scoutfs_server_client_count_class, scoutfs_server_client_down,
 );

 DECLARE_EVENT_CLASS(scoutfs_server_commit_users_class,
-        TP_PROTO(struct super_block *sb, int holding, int applying,
-		 int nr_holders, u32 budget,
-		 u32 avail_before, u32 freed_before,
-		 int committing, int exceeded),
-        TP_ARGS(sb, holding, applying, nr_holders, budget, avail_before, freed_before, committing, exceeded),
+        TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders,
+		 u32 avail_before, u32 freed_before, int committing, int exceeded),
+        TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, committing,
+		exceeded),
        TP_STRUCT__entry(
 		SCSB_TRACE_FIELDS
 		__field(int, holding)
 		__field(int, applying)
 		__field(int, nr_holders)
-		__field(u32, budget)
 		__field(__u32, avail_before)
 		__field(__u32, freed_before)
 		__field(int, committing)
@@ -1989,45 +1939,35 @@ DECLARE_EVENT_CLASS(scoutfs_server_commit_users_class,
 		__entry->holding = !!holding;
 		__entry->applying = !!applying;
 		__entry->nr_holders = nr_holders;
-		__entry->budget = budget;
 		__entry->avail_before = avail_before;
 		__entry->freed_before = freed_before;
 		__entry->committing = !!committing;
 		__entry->exceeded = !!exceeded;
        ),
-	TP_printk(SCSBF" holding %u applying %u nr %u budget %u avail_before %u freed_before %u committing %u exceeded %u",
-		  SCSB_TRACE_ARGS, __entry->holding, __entry->applying,
-		  __entry->nr_holders, __entry->budget,
-		  __entry->avail_before, __entry->freed_before,
-		  __entry->committing, __entry->exceeded)
+	TP_printk(SCSBF" holding %u applying %u nr %u avail_before %u freed_before %u committing %u exceeded %u",
+		  SCSB_TRACE_ARGS, __entry->holding, __entry->applying, __entry->nr_holders,
+		  __entry->avail_before, __entry->freed_before, __entry->committing,
+		  __entry->exceeded)
 );
 DEFINE_EVENT(scoutfs_server_commit_users_class, scoutfs_server_commit_hold,
-        TP_PROTO(struct super_block *sb, int holding, int applying,
-		 int nr_holders, u32 budget,
-		 u32 avail_before, u32 freed_before,
-		 int committing, int exceeded),
-        TP_ARGS(sb, holding, applying, nr_holders, budget, avail_before, freed_before, committing, exceeded)
+        TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders,
+		 u32 avail_before, u32 freed_before, int committing, int exceeded),
+        TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, committing, exceeded)
 );
 DEFINE_EVENT(scoutfs_server_commit_users_class, scoutfs_server_commit_apply,
-        TP_PROTO(struct super_block *sb, int holding, int applying,
-		 int nr_holders, u32 budget,
-		 u32 avail_before, u32 freed_before,
-		 int committing, int exceeded),
-        TP_ARGS(sb, holding, applying, nr_holders, budget, avail_before, freed_before, committing, exceeded)
+        TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders,
+		 u32 avail_before, u32 freed_before, int committing, int exceeded),
+        TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, committing, exceeded)
 );
 DEFINE_EVENT(scoutfs_server_commit_users_class, scoutfs_server_commit_start,
-        TP_PROTO(struct super_block *sb, int holding, int applying,
-		 int nr_holders, u32 budget,
-		 u32 avail_before, u32 freed_before,
-		 int committing, int exceeded),
-        TP_ARGS(sb, holding, applying, nr_holders, budget, avail_before, freed_before, committing, exceeded)
+        TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders,
+		 u32 avail_before, u32 freed_before, int committing, int exceeded),
+        TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, committing, exceeded)
 );
 DEFINE_EVENT(scoutfs_server_commit_users_class, scoutfs_server_commit_end,
-        TP_PROTO(struct super_block *sb, int holding, int applying,
-		 int nr_holders, u32 budget,
-		 u32 avail_before, u32 freed_before,
-		 int committing, int exceeded),
-        TP_ARGS(sb, holding, applying, nr_holders, budget, avail_before, freed_before, committing, exceeded)
+        TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders,
+		 u32 avail_before, u32 freed_before, int committing, int exceeded),
+        TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, committing, exceeded)
 );

 #define slt_symbolic(mode)						\
@@ -2465,27 +2405,6 @@ TRACE_EVENT(scoutfs_block_dirty_ref,
 		  __entry->block_blkno, __entry->block_seq)
 );

-TRACE_EVENT(scoutfs_get_file_block,
-	TP_PROTO(struct super_block *sb, u64 blkno, int flags),
-
-	TP_ARGS(sb, blkno, flags),
-
-	TP_STRUCT__entry(
-		SCSB_TRACE_FIELDS
-		__field(__u64, blkno)
-		__field(int, flags)
-	),
-
-	TP_fast_assign(
-		SCSB_TRACE_ASSIGN(sb);
-		__entry->blkno = blkno;
-		__entry->flags = flags;
-	),
-
-	TP_printk(SCSBF" blkno %llu flags 0x%x",
-		  SCSB_TRACE_ARGS, __entry->blkno, __entry->flags)
-);
-
 TRACE_EVENT(scoutfs_block_stale,
 	TP_PROTO(struct super_block *sb, struct scoutfs_block_ref *ref,
 		 struct scoutfs_block_header *hdr, u32 magic, u32 crc),
@@ -2526,8 +2445,8 @@ TRACE_EVENT(scoutfs_block_stale,

 DECLARE_EVENT_CLASS(scoutfs_block_class,
 	TP_PROTO(struct super_block *sb, void *bp, u64 blkno, int refcount, int io_count,
-		 unsigned long bits),
-	TP_ARGS(sb, bp, blkno, refcount, io_count, bits),
+		 unsigned long bits, __u64 accessed),
+	TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed),
 	TP_STRUCT__entry(
 		SCSB_TRACE_FIELDS
 		__field(void *, bp)
@@ -2535,6 +2454,7 @@ DECLARE_EVENT_CLASS(scoutfs_block_class,
 		__field(int, refcount)
 		__field(int, io_count)
 		__field(long, bits)
+		__field(__u64, accessed)
 	),
 	TP_fast_assign(
 		SCSB_TRACE_ASSIGN(sb);
@@ -2543,65 +2463,71 @@ DECLARE_EVENT_CLASS(scoutfs_block_class,
 		__entry->refcount = refcount;
 		__entry->io_count = io_count;
 		__entry->bits = bits;
+		__entry->accessed = accessed;
 	),
-	TP_printk(SCSBF" bp %p blkno %llu refcount %x io_count %d bits 0x%lx",
+	TP_printk(SCSBF" bp %p blkno %llu refcount %d io_count %d bits 0x%lx accessed %llu",
 		  SCSB_TRACE_ARGS, __entry->bp, __entry->blkno, __entry->refcount,
-		  __entry->io_count, __entry->bits)
+		  __entry->io_count, __entry->bits, __entry->accessed)
 );
 DEFINE_EVENT(scoutfs_block_class, scoutfs_block_allocate,
 	TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
-		 int refcount, int io_count, unsigned long bits),
-	TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
+		 int refcount, int io_count, unsigned long bits,
+		 __u64 accessed),
+	TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
 );
 DEFINE_EVENT(scoutfs_block_class, scoutfs_block_free,
 	TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
-		 int refcount, int io_count, unsigned long bits),
-	TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
+		 int refcount, int io_count, unsigned long bits,
+		 __u64 accessed),
+	TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
 );
 DEFINE_EVENT(scoutfs_block_class, scoutfs_block_insert,
 	TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
-		 int refcount, int io_count, unsigned long bits),
-	TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
+		 int refcount, int io_count, unsigned long bits,
+		 __u64 accessed),
+	TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
 );
 DEFINE_EVENT(scoutfs_block_class, scoutfs_block_remove,
 	TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
-		 int refcount, int io_count, unsigned long bits),
-	TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
+		 int refcount, int io_count, unsigned long bits,
+		 __u64 accessed),
+	TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
 );
 DEFINE_EVENT(scoutfs_block_class, scoutfs_block_end_io,
 	TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
-		 int refcount, int io_count, unsigned long bits),
-	TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
+		 int refcount, int io_count, unsigned long bits,
+		 __u64 accessed),
+	TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
 );
 DEFINE_EVENT(scoutfs_block_class, scoutfs_block_submit,
 	TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
-		 int refcount, int io_count, unsigned long bits),
-	TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
+		 int refcount, int io_count, unsigned long bits,
+		 __u64 accessed),
+	TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
 );
 DEFINE_EVENT(scoutfs_block_class, scoutfs_block_invalidate,
 	TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
-		 int refcount, int io_count, unsigned long bits),
-	TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
+		 int refcount, int io_count, unsigned long bits,
+		 __u64 accessed),
+	TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
 );
 DEFINE_EVENT(scoutfs_block_class, scoutfs_block_mark_dirty,
 	TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
-		 int refcount, int io_count, unsigned long bits),
-	TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
+		 int refcount, int io_count, unsigned long bits,
+		 __u64 accessed),
+	TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
 );
 DEFINE_EVENT(scoutfs_block_class, scoutfs_block_forget,
 	TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
-		 int refcount, int io_count, unsigned long bits),
-	TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
+		 int refcount, int io_count, unsigned long bits,
+		 __u64 accessed),
+	TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
 );
 DEFINE_EVENT(scoutfs_block_class, scoutfs_block_shrink,
 	TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
-		 int refcount, int io_count, unsigned long bits),
-	TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
-);
-DEFINE_EVENT(scoutfs_block_class, scoutfs_block_isolate,
-	TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
-		 int refcount, int io_count, unsigned long bits),
-	TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
+		 int refcount, int io_count, unsigned long bits,
+		 __u64 accessed),
+	TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
 );

 DECLARE_EVENT_CLASS(scoutfs_ext_next_class,
@@ -3076,27 +3002,6 @@ DEFINE_EVENT(scoutfs_srch_compact_class, scoutfs_srch_compact_client_recv,
 	TP_ARGS(sb, sc)
 );

-TRACE_EVENT(scoutfs_ioc_search_xattrs,
-	TP_PROTO(struct super_block *sb, u64 ino, u64 last_ino),
-
-	TP_ARGS(sb, ino, last_ino),
-
-	TP_STRUCT__entry(
-		SCSB_TRACE_FIELDS
-		__field(u64, ino)
-		__field(u64, last_ino)
-	),
-
-	TP_fast_assign(
-		SCSB_TRACE_ASSIGN(sb);
-		__entry->ino = ino;
-		__entry->last_ino = last_ino;
-	),
-
-	TP_printk(SCSBF" ino %llu last_ino %llu", SCSB_TRACE_ARGS,
-		  __entry->ino, __entry->last_ino)
-);
-
 #endif /* _TRACE_SCOUTFS_H */

 /* This part must be outside protection */
--- a/kmod/src/server.c
+++ b/kmod/src/server.c
@@ -65,7 +65,6 @@ struct commit_users {
 	struct list_head holding;
 	struct list_head applying;
 	unsigned int nr_holders;
-	u32 budget;
 	u32 avail_before;
 	u32 freed_before;
 	bool committing;
@@ -85,9 +84,8 @@ static void init_commit_users(struct commit_users *cusers)
 do {												\
 	__typeof__(cusers) _cusers = (cusers);							\
 	trace_scoutfs_server_commit_##which(sb, !list_empty(&_cusers->holding),			\
-		!list_empty(&_cusers->applying), _cusers->nr_holders, _cusers->budget,		\
-		_cusers->avail_before, _cusers->freed_before, _cusers->committing,		\
-		_cusers->exceeded);								\
+		!list_empty(&_cusers->applying), _cusers->nr_holders, _cusers->avail_before,	\
+		_cusers->freed_before, _cusers->committing, _cusers->exceeded);			\
 } while (0)

 struct server_info {
@@ -305,6 +303,7 @@ static void check_holder_budget(struct super_block *sb, struct server_info *serv
 	u32 freed_used;
 	u32 avail_now;
 	u32 freed_now;
+	u32 budget;

 	assert_spin_locked(&cusers->lock);

@@ -319,14 +318,15 @@ static void check_holder_budget(struct super_block *sb, struct server_info *serv
 	else
 		freed_used = SCOUTFS_ALLOC_LIST_MAX_BLOCKS - freed_now;

-	if (avail_used <= cusers->budget && freed_used <= cusers->budget)
+	budget = cusers->nr_holders * COMMIT_HOLD_ALLOC_BUDGET;
+	if (avail_used <= budget && freed_used <= budget)
 		return;

 	exceeded_once = true;
 	cusers->exceeded = cusers->nr_holders;

-	scoutfs_err(sb, "holders exceeded alloc budget %u av: bef %u now %u, fr: bef %u now %u",
-		    cusers->budget, cusers->avail_before, avail_now,
+	scoutfs_err(sb, "%u holders exceeded alloc budget av: bef %u now %u, fr: bef %u now %u",
+		    cusers->nr_holders, cusers->avail_before, avail_now,
 		    cusers->freed_before, freed_now);

 	list_for_each_entry(hold, &cusers->holding, entry) {
@@ -349,7 +349,7 @@ static bool hold_commit(struct super_block *sb, struct server_info *server,
 {
 	bool has_room;
 	bool held;
-	u32 new_budget;
+	u32 budget;
 	u32 av;
 	u32 fr;

@@ -367,8 +367,8 @@ static bool hold_commit(struct super_block *sb, struct server_info *server,
 	}

 	/* +2 for our additional hold and then for the final commit work the server does */
-	new_budget = max(cusers->budget, (cusers->nr_holders + 2) * COMMIT_HOLD_ALLOC_BUDGET);
-	has_room = av >= new_budget && fr >= new_budget;
+	budget = (cusers->nr_holders + 2) * COMMIT_HOLD_ALLOC_BUDGET;
+	has_room = av >= budget && fr >= budget;
 	/* checking applying so holders drain once an apply caller starts waiting */
 	held = !cusers->committing && has_room && list_empty(&cusers->applying);

@@ -388,7 +388,6 @@ static bool hold_commit(struct super_block *sb, struct server_info *server,
 		list_add_tail(&hold->entry, &cusers->holding);

 		cusers->nr_holders++;
-		cusers->budget = new_budget;

 	} else if (!has_room && cusers->nr_holders == 0 && !cusers->committing) {
 		cusers->committing = true;
@@ -517,7 +516,6 @@ static void commit_end(struct super_block *sb, struct commit_users *cusers, int
 	list_for_each_entry_safe(hold, tmp, &cusers->applying, entry)
 		list_del_init(&hold->entry);
 	cusers->committing = false;
-	cusers->budget = 0;
 	spin_unlock(&cusers->lock);

 	wake_up(&cusers->waitq);
@@ -610,7 +608,7 @@ static void scoutfs_server_commit_func(struct work_struct *work)
 		goto out;

 	if (scoutfs_forcing_unmount(sb)) {
-		ret = -ENOLINK;
+		ret = -EIO;
 		goto out;
 	}

@@ -670,14 +668,16 @@ static void scoutfs_server_commit_func(struct work_struct *work)
 	 * the reserved blocks after having filled the log trees's avail
 	 * allocator during its transaction.  To avoid prematurely
 	 * setting the low flag and causing enospc we make sure that the
-	 * next transaction's meta_avail has 2x the reserved blocks so
+	 * next transaction's meta_avail has 3x the reserved blocks so
 	 * that it can consume a full reserved amount and still have
 	 * enough to avoid enospc.  We swap to freed if avail is under
-	 * the buffer and freed is larger.
+	 * the buffer and freed is more than 50% larger than avail.  This results
+	 * in much less swapping overall and allows the pools to refill naturally.
 	 */
 	if ((le64_to_cpu(server->meta_avail->total_len) <
-	     (scoutfs_server_reserved_meta_blocks(sb) * 2)) &&
-	    (le64_to_cpu(server->meta_freed->total_len) >
+	     (scoutfs_server_reserved_meta_blocks(sb) * 3)) &&
+	    (le64_to_cpu(server->meta_freed->total_len) >
+	     (le64_to_cpu(server->meta_avail->total_len) >> 1) +
 	     le64_to_cpu(server->meta_avail->total_len)))
 		swap(server->meta_avail, server->meta_freed);

@@ -1040,101 +1040,6 @@ static int next_log_merge_item(struct super_block *sb,
 	return next_log_merge_item_key(sb, root, zone, &key, val, val_len);
 }

-static int do_finalize_ours(struct super_block *sb,
-			    struct scoutfs_log_trees *lt,
-			    struct commit_hold *hold)
-{
-	struct server_info *server = SCOUTFS_SB(sb)->server_info;
-	struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
-	struct scoutfs_key key;
-	char *err_str = NULL;
-	u64 rid = le64_to_cpu(lt->rid);
-	bool more;
-	int ret;
-	int err;
-
-	mutex_lock(&server->srch_mutex);
-	ret = scoutfs_srch_rotate_log(sb, &server->alloc, &server->wri,
-				      &super->srch_root, &lt->srch_file, true);
-	mutex_unlock(&server->srch_mutex);
-	if (ret < 0) {
-		scoutfs_err(sb, "error rotating srch log for rid %016llx: %d",
-			    rid, ret);
-		return ret;
-        }
-
-	do {
-		more = false;
-
-		/*
-		 * All of these can return errors, perhaps indicating successful
-		 * partial progress, after having modified the allocator trees.
-		 * We always have to update the roots in the log item.
-		 */
-		mutex_lock(&server->alloc_mutex);
-		ret = (err_str = "splice meta_freed to other_freed",
-				scoutfs_alloc_splice_list(sb, &server->alloc,
-					&server->wri, server->other_freed,
-					&lt->meta_freed)) ?:
-			(err_str = "splice meta_avail",
-			 scoutfs_alloc_splice_list(sb, &server->alloc,
-					&server->wri, server->other_freed,
-					&lt->meta_avail)) ?:
-			(err_str = "empty data_avail",
-			 alloc_move_empty(sb, &super->data_alloc,
-					  &lt->data_avail,
-					  COMMIT_HOLD_ALLOC_BUDGET / 2)) ?:
-			(err_str = "empty data_freed",
-			 alloc_move_empty(sb, &super->data_alloc,
-					  &lt->data_freed,
-					  COMMIT_HOLD_ALLOC_BUDGET / 2));
-		mutex_unlock(&server->alloc_mutex);
-
-		/*
-		 * only finalize, allowing merging, once the allocators are
-		 * fully freed
-		 */
-		if (ret == 0) {
-			/* the transaction is no longer open */
-			le64_add_cpu(&lt->flags, SCOUTFS_LOG_TREES_FINALIZED);
-			lt->finalize_seq = cpu_to_le64(scoutfs_server_next_seq(sb));
-		}
-
-		scoutfs_key_init_log_trees(&key, rid, le64_to_cpu(lt->nr));
-
-		err = scoutfs_btree_update(sb, &server->alloc, &server->wri,
-					   &super->logs_root, &key, lt,
-					   sizeof(*lt));
-		BUG_ON(err != 0); /* alloc, log, srch items out of sync */
-
-		if (ret == -EINPROGRESS) {
-			more = true;
-			mutex_unlock(&server->logs_mutex);
-			ret = server_apply_commit(sb, hold, 0);
-			if (ret < 0)
-				WARN_ON_ONCE(ret < 0);
-			server_hold_commit(sb, hold);
-			mutex_lock(&server->logs_mutex);
-		} else if (ret == 0) {
-			memset(&lt->item_root, 0, sizeof(lt->item_root));
-			memset(&lt->bloom_ref, 0, sizeof(lt->bloom_ref));
-			lt->inode_count_delta = 0;
-			lt->max_item_seq = 0;
-			lt->finalize_seq = 0;
-			le64_add_cpu(&lt->nr, 1);
-			lt->flags = 0;
-		}
-	} while (more);
-
-	if (ret < 0) {
-		scoutfs_err(sb,
-			    "error %d finalizing log trees for rid %016llx: %s",
-			    ret, rid, err_str);
-	}
-
-	return ret;
-}
-
 /*
 * Finalizing the log btrees for merging needs to be done carefully so
 * that items don't appear to go backwards in time.
@@ -1186,6 +1091,7 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
 	struct scoutfs_log_merge_range rng;
 	struct scoutfs_mount_options opts;
 	struct scoutfs_log_trees each_lt;
+	struct scoutfs_log_trees fin;
 	unsigned int delay_ms;
 	unsigned long timeo;
 	bool saw_finalized;
@@ -1256,7 +1162,6 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
 		/* done if we're not finalizing and there's no finalized */
 		if (!finalize_ours && !saw_finalized) {
 			ret = 0;
-			scoutfs_inc_counter(sb, log_merge_no_finalized);
 			break;
 		}

@@ -1291,11 +1196,32 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l

 		/* Finalize ours if it's visible to others */
 		if (ours_visible) {
-			ret = do_finalize_ours(sb, lt, hold);
+			fin = *lt;
+			memset(&fin.meta_avail, 0, sizeof(fin.meta_avail));
+			memset(&fin.meta_freed, 0, sizeof(fin.meta_freed));
+			memset(&fin.data_avail, 0, sizeof(fin.data_avail));
+			memset(&fin.data_freed, 0, sizeof(fin.data_freed));
+			memset(&fin.srch_file, 0, sizeof(fin.srch_file));
+			le64_add_cpu(&fin.flags, SCOUTFS_LOG_TREES_FINALIZED);
+			fin.finalize_seq = cpu_to_le64(scoutfs_server_next_seq(sb));
+
+			scoutfs_key_init_log_trees(&key, le64_to_cpu(fin.rid),
+						   le64_to_cpu(fin.nr));
+			ret = scoutfs_btree_update(sb, &server->alloc, &server->wri,
+						   &super->logs_root, &key, &fin,
+						   sizeof(fin));
 			if (ret < 0) {
-				err_str = "finalizing ours";
+				err_str = "updating finalized log_trees";
 				break;
 			}
+
+			memset(&lt->item_root, 0, sizeof(lt->item_root));
+			memset(&lt->bloom_ref, 0, sizeof(lt->bloom_ref));
+			lt->inode_count_delta = 0;
+			lt->max_item_seq = 0;
+			lt->finalize_seq = 0;
+			le64_add_cpu(&lt->nr, 1);
+			lt->flags = 0;
 		}

 		/* wait a bit for mounts to arrive */
@@ -1375,10 +1301,12 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
 * is nested inside holding commits so we recheck the persistent item
 * each time we commit to make sure it's still what we think.   The
 * caller is still going to send the item to the client so we update the
- * caller's each time we make progress.  If we hit an error applying the
- * changes we make then we can't send the log_trees to the client.
+ * caller's each time we make progress.  This is a best-effort attempt
+ * to clean up and it's valid to leave extents in data_freed, so we
+ * don't return errors to the caller.  The client will continue the work
+ * later in get_log_trees or as the rid is reclaimed.
 */
-static int try_drain_data_freed(struct super_block *sb, struct scoutfs_log_trees *lt)
+static void try_drain_data_freed(struct super_block *sb, struct scoutfs_log_trees *lt)
 {
 	DECLARE_SERVER_INFO(sb, server);
 	struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
@@ -1387,7 +1315,6 @@ static int try_drain_data_freed(struct super_block *sb, struct scoutfs_log_trees
 	struct scoutfs_log_trees drain;
 	struct scoutfs_key key;
 	COMMIT_HOLD(hold);
-	bool apply = false;
 	int ret = 0;
 	int err;

@@ -1396,27 +1323,22 @@ static int try_drain_data_freed(struct super_block *sb, struct scoutfs_log_trees
 	while (lt->data_freed.total_len != 0) {
 		server_hold_commit(sb, &hold);
 		mutex_lock(&server->logs_mutex);
-		apply = true;

 		ret = find_log_trees_item(sb, &super->logs_root, false, rid, U64_MAX, &drain);
-		if (ret < 0) {
-			ret = 0;
+		if (ret < 0)
 			break;
-		}

 		/* careful to only keep draining the caller's specific open trans */
 		if (drain.nr != lt->nr || drain.get_trans_seq != lt->get_trans_seq ||
 		    drain.commit_trans_seq != lt->commit_trans_seq || drain.flags != lt->flags) {
-			ret = 0;
+			ret = -ENOENT;
 			break;
 		}

 		ret = scoutfs_btree_dirty(sb, &server->alloc, &server->wri,
 					  &super->logs_root, &key);
-		if (ret < 0) {
-			ret = 0;
+		if (ret < 0)
 			break;
-		}

 		/* moving can modify and return errors, always update caller and item */
 		mutex_lock(&server->alloc_mutex);
@@ -1432,19 +1354,19 @@ static int try_drain_data_freed(struct super_block *sb, struct scoutfs_log_trees
 		BUG_ON(err < 0); /* dirtying must guarantee success */

 		mutex_unlock(&server->logs_mutex);
+
 		ret = server_apply_commit(sb, &hold, ret);
-		apply = false;
-
-		if (ret < 0)
+		if (ret < 0) {
+			ret = 0; /* don't try to abort, ignoring ret */
 			break;
+		}
 	}

-	if (apply) {
+	/* try to cleanly abort and write any partial dirty btree blocks, but ignore result */
+	if (ret < 0) {
 		mutex_unlock(&server->logs_mutex);
-		server_apply_commit(sb, &hold, ret);
+		server_apply_commit(sb, &hold, 0);
 	}
-
-	return ret;
 }

 /*
@@ -1652,9 +1574,9 @@ out:
 		scoutfs_err(sb, "error %d getting log trees for rid %016llx: %s",
 			    ret, rid, err_str);

-	/* try to drain excessive data_freed with additional commits, if needed */
+	/* try to drain excessive data_freed with additional commits, if needed, ignoring err */
 	if (ret == 0)
-		ret = try_drain_data_freed(sb, &lt);
+		try_drain_data_freed(sb, &lt);

 	return scoutfs_net_response(sb, conn, cmd, id, ret, &lt, sizeof(lt));
 }
@@ -1754,8 +1676,8 @@ unlock:

 	ret = server_apply_commit(sb, &hold, ret);
 	if (ret < 0)
-		scoutfs_err(sb, "server error %d committing client logs for rid %016llx, nr %llu: %s",
-			    ret, rid, le64_to_cpu(lt.nr), err_str);
+		scoutfs_err(sb, "server error %d committing client logs for rid %016llx: %s",
+			    ret, rid, err_str);
 out:
 	WARN_ON_ONCE(ret < 0);
 	return scoutfs_net_response(sb, conn, cmd, id, ret, NULL, 0);
@@ -1890,9 +1812,6 @@ static int reclaim_open_log_tree(struct super_block *sb, u64 rid)
 out:
 	mutex_unlock(&server->logs_mutex);

-	if (ret == 0)
-		scoutfs_inc_counter(sb, reclaimed_open_logs);
-
 	if (ret < 0 && ret != -EINPROGRESS)
 		scoutfs_err(sb, "server error %d reclaiming log trees for rid %016llx: %s",
 			    ret, rid, err_str);
@@ -2610,7 +2529,7 @@ static void server_log_merge_free_work(struct work_struct *work)

 		ret = scoutfs_btree_free_blocks(sb, &server->alloc,
 						&server->wri, &fr.key,
-						&fr.root, COMMIT_HOLD_ALLOC_BUDGET / 8);
+						&fr.root, COMMIT_HOLD_ALLOC_BUDGET / 2);
 		if (ret < 0) {
 			err_str = "freeing log btree";
 			break;
@@ -2629,7 +2548,7 @@ static void server_log_merge_free_work(struct work_struct *work)
 		/* freed blocks are in allocator, we *have* to update fr */
 		BUG_ON(ret < 0);

-		if (server_hold_alloc_used_since(sb, &hold) >= (COMMIT_HOLD_ALLOC_BUDGET * 3) / 4) {
+		if (server_hold_alloc_used_since(sb, &hold) >= COMMIT_HOLD_ALLOC_BUDGET / 2) {
 			mutex_unlock(&server->logs_mutex);
 			ret = server_apply_commit(sb, &hold, ret);
 			commit = false;
@@ -4232,7 +4151,7 @@ static void fence_pending_recov_worker(struct work_struct *work)
 	struct server_info *server = container_of(work, struct server_info,
 						  fence_pending_recov_work);
 	struct super_block *sb = server->sb;
-	union scoutfs_inet_addr addr = {{0,}};
+	union scoutfs_inet_addr addr;
 	u64 rid = 0;
 	int ret = 0;

--- a/kmod/src/sparse-filtered.sh
+++ b/kmod/src/sparse-filtered.sh
@@ -1,45 +0,0 @@
-#!/bin/bash
-
-#
-# Unfortunately, kernels can ship which contain sparse errors that are
-# unrelated to us.
-#
-# The exit status of this filtering wrapper will indicate an error if
-# sparse wasn't found or if there were any unfiltered output lines.  It
-# can hide error exit status from sparse or grep if they don't produce
-# output that makes it past the filters.
-#
-
-# must have sparse.  Fail with error message, mask success path.
-which sparse > /dev/null || exit 1
-
-# initial unmatchable, additional added as RE+="|..."
-RE="$^"
-
-#
-# Darn.  sparse has multi-line error messages, and I'd rather not bother
-# with multi-line filters.  So we'll just drop this context.
-#
-# command-line: note: in included file (through include/linux/netlink.h, include/linux/ethtool.h, include/linux/netdevice.h, include/net/sock.h, /root/scoutfs/kmod/src/kernelcompat.h, builtin): 
-#         fprintf(stderr, "%s: note: in included file%s:\n",
-#
-RE+="|: note: in included file"
-
-# 3.10.0-1160.119.1.el7.x86_64.debug
-# include/linux/posix_acl.h:138:9: warning: incorrect type in assignment (different address spaces)
-# include/linux/posix_acl.h:138:9:    expected struct posix_acl *<noident>
-# include/linux/posix_acl.h:138:9:    got struct posix_acl [noderef] <asn:4>*<noident>
-RE+="|include/linux/posix_acl.h:"
-
-# 3.10.0-1160.119.1.el7.x86_64.debug
-#include/uapi/linux/perf_event.h:146:56: warning: cast truncates bits from constant value (8000000000000000 becomes 0)
-RE+="|include/uapi/linux/perf_event.h:"
-
-# 4.18.0-513.24.1.el8_9.x86_64+debug'
-#./include/linux/skbuff.h:824:1: warning: directive in macro's argument list
-RE+="|include/linux/skbuff.h:"
-
-sparse "$@" |& \
-	grep -E -v "($RE)" |& \
-	awk '{ print $0 } END { exit NR > 0 }'
-exit $?
--- a/kmod/src/srch.c
+++ b/kmod/src/srch.c
@@ -62,7 +62,7 @@
 * re-allocated and re-written.  Search can restart by checking the
 * btree for the current set of files.  Compaction reads log files which
 * are protected from other compactions by the persistent busy items
- * created by the server.  Compaction won't see its blocks reused out
+ * created by the server.  Compaction won't see its blocks reused out
 * from under it, but it can encounter stale cached blocks that need to
 * be invalidated.
 */
@@ -442,10 +442,6 @@ out:
 	if (ret == 0 && (flags & GFB_INSERT) && blk >= le64_to_cpu(sfl->blocks))
 		sfl->blocks = cpu_to_le64(blk + 1);

-	if (bl) {
-		trace_scoutfs_get_file_block(sb, bl->blkno, flags);
-	}
-
 	*bl_ret = bl;
 	return ret;
 }
@@ -753,14 +749,14 @@ static int search_log_file(struct super_block *sb,
 		for (i = 0; i < le32_to_cpu(srb->entry_nr); i++) {
 			if (pos > SCOUTFS_SRCH_BLOCK_SAFE_BYTES) {
 				/* can only be inconsistency :/ */
-				ret = -EIO;
+				ret = EIO;
 				break;
 			}

 			ret = decode_entry(srb->entries + pos, &sre, &prev);
 			if (ret <= 0) {
 				/* can only be inconsistency :/ */
-				ret = -EIO;
+				ret = EIO;
 				break;
 			}
 			pos += ret;
@@ -863,14 +859,14 @@ static int search_sorted_file(struct super_block *sb,

 		if (pos > SCOUTFS_SRCH_BLOCK_SAFE_BYTES) {
 			/* can only be inconsistency :/ */
-			ret = -EIO;
+			ret = EIO;
 			break;
 		}

 		ret = decode_entry(srb->entries + pos, &sre, &prev);
 		if (ret <= 0) {
 			/* can only be inconsistency :/ */
-			ret = -EIO;
+			ret = EIO;
 			break;
 		}
 		pos += ret;
@@ -976,8 +972,6 @@ int scoutfs_srch_search_xattrs(struct super_block *sb,

 	scoutfs_inc_counter(sb, srch_search_xattrs);

-	trace_scoutfs_ioc_search_xattrs(sb, ino, last_ino);
-
 	*done = false;
 	srch_init_rb_root(sroot);

@@ -1808,7 +1802,7 @@ static void swap_page_sre(void *A, void *B, int size)
 * typically, ~10x worst case).
 *
 * Because we read and sort all the input files we must perform the full
- * compaction in one operation.  The server must have given us
+ * compaction in one operation.  The server must have given us
 * sufficiently large avail/freed lists, otherwise we'll return ENOSPC.
 */
 static int compact_logs(struct super_block *sb,
@@ -1872,14 +1866,14 @@ static int compact_logs(struct super_block *sb,

 		if (pos > SCOUTFS_SRCH_BLOCK_SAFE_BYTES) {
 			/* can only be inconsistency :/ */
-			ret = -EIO;
+			ret = EIO;
 			break;
 		}

 		ret = decode_entry(srb->entries + pos, sre, &prev);
 		if (ret <= 0) {
 			/* can only be inconsistency :/ */
-			ret = -EIO;
+			ret = EIO;
 			goto out;
 		}
 		prev = *sre;
--- a/kmod/src/trans.c
+++ b/kmod/src/trans.c
@@ -159,58 +159,6 @@ static bool drained_holders(struct trans_info *tri)
 	return holders == 0;
 }

-static int commit_current_log_trees(struct super_block *sb, char **str)
-{
-	DECLARE_TRANS_INFO(sb, tri);
-
-	return (*str = "data submit", scoutfs_inode_walk_writeback(sb, true)) ?:
-	       (*str = "item dirty", scoutfs_item_write_dirty(sb))  ?:
-	       (*str = "data prepare", scoutfs_data_prepare_commit(sb))  ?:
-	       (*str = "alloc prepare", scoutfs_alloc_prepare_commit(sb, &tri->alloc, &tri->wri)) ?:
-	       (*str = "meta write", scoutfs_block_writer_write(sb, &tri->wri))  ?:
-	       (*str = "data wait", scoutfs_inode_walk_writeback(sb, false)) ?:
-	       (*str = "commit log trees", commit_btrees(sb)) ?:
-	       scoutfs_item_write_done(sb);
-}
-
-static int get_next_log_trees(struct super_block *sb, char **str)
-{
-	return (*str = "get log trees", scoutfs_trans_get_log_trees(sb));
-}
-
-static int retry_forever(struct super_block *sb, int (*func)(struct super_block *sb, char **str))
-{
-	bool retrying = false;
-	char *str;
-	int ret;
-
-	do {
-		str = NULL;
-
-		ret = func(sb, &str);
-		if (ret < 0) {
-			if (!retrying) {
-				scoutfs_warn(sb, "critical transaction commit failure: %s = %d, retrying",
-					    str, ret);
-				retrying = true;
-			}
-
-			if (scoutfs_forcing_unmount(sb)) {
-				ret = -ENOLINK;
-				break;
-			}
-
-			msleep(2 * MSEC_PER_SEC);
-
-		} else if (retrying) {
-			scoutfs_info(sb, "retried transaction commit succeeded");
-		}
-
-	} while (ret < 0);
-
-	return ret;
-}
-
 /*
 * This work func is responsible for writing out all the dirty blocks
 * that make up the current dirty transaction.  It prevents writers from
@@ -236,6 +184,8 @@ void scoutfs_trans_write_func(struct work_struct *work)
 	struct trans_info *tri = container_of(work, struct trans_info, write_work.work);
 	struct super_block *sb = tri->sb;
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
+	bool retrying = false;
+	char *s = NULL;
 	int ret = 0;

 	tri->task = current;
@@ -252,7 +202,7 @@ void scoutfs_trans_write_func(struct work_struct *work)
 	}

 	if (scoutfs_forcing_unmount(sb)) {
-		ret = -ENOLINK;
+		ret = -EIO;
 		goto out;
 	}

@@ -264,9 +214,37 @@ void scoutfs_trans_write_func(struct work_struct *work)

 	scoutfs_inc_counter(sb, trans_commit_written);

-	/* retry {commit,get}_log_trees until they succeeed, can only fail when forcing unmount */
-	ret = retry_forever(sb, commit_current_log_trees) ?:
-	      retry_forever(sb, get_next_log_trees);
+	do {
+		ret = (s = "data submit", scoutfs_inode_walk_writeback(sb, true)) ?:
+		      (s = "item dirty", scoutfs_item_write_dirty(sb))  ?:
+		      (s = "data prepare", scoutfs_data_prepare_commit(sb))  ?:
+		      (s = "alloc prepare", scoutfs_alloc_prepare_commit(sb, &tri->alloc,
+									 &tri->wri))  ?:
+		      (s = "meta write", scoutfs_block_writer_write(sb, &tri->wri))  ?:
+		      (s = "data wait", scoutfs_inode_walk_writeback(sb, false)) ?:
+		      (s = "commit log trees", commit_btrees(sb)) ?:
+		      scoutfs_item_write_done(sb) ?:
+		      (s = "get log trees", scoutfs_trans_get_log_trees(sb));
+		if (ret < 0) {
+			if (!retrying) {
+				scoutfs_warn(sb, "critical transaction commit failure: %s = %d, retrying",
+					    s, ret);
+				retrying = true;
+			}
+
+			if (scoutfs_forcing_unmount(sb)) {
+				ret = -EIO;
+				break;
+			}
+
+			msleep(2 * MSEC_PER_SEC);
+
+		} else if (retrying) {
+			scoutfs_info(sb, "retried transaction commit succeeded");
+		}
+
+	} while (ret < 0);
+
 out:
 	spin_lock(&tri->write_lock);
 	tri->write_count++;
--- a/kmod/src/xattr.c
+++ b/kmod/src/xattr.c
@@ -742,7 +742,7 @@ int scoutfs_xattr_set_locked(struct inode *inode, const char *name, size_t name_
 	int ret;
 	int err;

-	trace_scoutfs_xattr_set(sb, ino, name_len, value, size, flags);
+	trace_scoutfs_xattr_set(sb, name_len, value, size, flags);

 	if (WARN_ON_ONCE(tgs->totl && tgs->indx) ||
 	    WARN_ON_ONCE((tgs->totl | tgs->indx) && !tag_lock))
--- a/tests/.gitignore
+++ b/tests/.gitignore
@@ -10,5 +10,3 @@ src/stage_tmpfile
 src/create_xattr_loop
 src/o_tmpfile_umask
 src/o_tmpfile_linkat
-src/mmap_stress
-src/mmap_validate
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -13,9 +13,7 @@ BIN := src/createmany			\
 	src/create_xattr_loop		\
 	src/fragmented_data_extents	\
 	src/o_tmpfile_umask		\
-	src/o_tmpfile_linkat		\
-	src/mmap_stress			\
-	src/mmap_validate
+	src/o_tmpfile_linkat

 DEPS := $(wildcard src/*.d)

@@ -25,10 +23,8 @@ ifneq ($(DEPS),)
 -include $(DEPS)
 endif

-src/mmap_stress: LIBS+=-lpthread
-
 $(BIN): %: %.c Makefile
-	gcc $(CFLAGS) -MD -MP -MF $*.d $< -o $@ $(LIBS)
+	gcc $(CFLAGS) -MD -MP -MF $*.d $< -o $@

 .PHONY: clean
 clean:
--- a/tests/funcs/exec.sh
+++ b/tests/funcs/exec.sh
@@ -80,15 +80,3 @@ t_compare_output()
 {
 	"$@" >&7 2>&1
 }
-
-#
-# usually bash prints an annoying output message when jobs
-# are killed.  We can avoid that by redirecting stderr for
-# the bash process when it reaps the jobs that are killed.
-#
-t_silent_kill() {
-	exec {ERR}>&2 2>/dev/null
-	kill "$@"
-	wait "$@"
-	exec 2>&$ERR {ERR}>&-
-}
--- a/tests/funcs/filter.sh
+++ b/tests/funcs/filter.sh
@@ -140,9 +140,6 @@ t_filter_dmesg()
 	re="$re|scoutfs .* error.*server failed to bind to.*"
 	re="$re|scoutfs .* critical transaction commit failure.*"

-	# ENOLINK (-67) indicates an expected forced unmount error
-	re="$re|scoutfs .* error -67 .*"
-
 	# change-devices causes loop device resizing
 	re="$re|loop: module loaded"
 	re="$re|loop[0-9].* detected capacity change from.*"
@@ -163,9 +160,6 @@ t_filter_dmesg()
 	re="$re|Pipe handler or fully qualified core dump path required.*"
 	re="$re|Set kernel.core_pattern before fs.suid_dumpable.*"

-	# perf warning that it adjusted sample rate
-	re="$re|perf: interrupt took too long.*lowering kernel.perf_event_max_sample_rate.*"
-
 	egrep -v "($re)" | \
 		ignore_harmless_unwind_kasan_stack_oob
 }
--- a/tests/funcs/tap.sh
+++ b/tests/funcs/tap.sh
@@ -1,88 +0,0 @@
-
-#
-# Generate TAP format test results
-#
-
-t_tap_header()
-{
-	local runid=$1
-	local sequence=( $(echo $tests) )
-	local count=${#sequence[@]}
-
-	# avoid recreating the same TAP result over again - harness sets this
-	[[ -z "$runid" ]] && runid="*test*"
-
-	cat > $T_RESULTS/scoutfs.tap <<TAPEOF
-TAP version 14
-1..${count}
-#
-# TAP results for run ${runid}
-#
-# host/run info:
-#
-#   hostname: ${HOSTNAME}
-#   test start time: $(date --utc)
-#   uname -r: $(uname -r)
-#   scoutfs commit id: $(git describe --tags)
-#
-# sequence for this run:
-#
-TAPEOF
-
-	# Sequence
-	for t in ${tests}; do
-		 echo ${t/.sh/}
-	done | cat -n | expand | column -c 120 | expand | sed 's/^ /#/' >> $T_RESULTS/scoutfs.tap
-	echo "#" >> $T_RESULTS/scoutfs.tap
-}
-
-t_tap_progress()
-{
-(
-	local i=$(( testcount + 1 ))
-	local testname=$1
-	local result=$2
-
-	local diff=""
-	local dmsg=""
-
-	if [[ -s "$T_RESULTS/tmp/${testname}/dmesg.new" ]]; then
-		dmsg="1"
-	fi
-
-	if ! cmp -s golden/${testname} $T_RESULTS/output/${testname}; then
-		diff="1"
-	fi
-
-	if [[ "${result}" == "100" ]] && [[ -z "${dmsg}" ]] && [[ -z "${diff}" ]]; then
-		echo "ok ${i} - ${testname}"
-	elif [[ "${result}" == "103" ]]; then
-		echo "ok ${i} - ${testname}"
-		echo "# ${testname} ** skipped - permitted **"
-	else
-		echo "not ok ${i} - ${testname}"
-		case ${result} in
-		101)
-			echo "# ${testname} ** skipped **"
-			;;
-		102)
-			echo "# ${testname} ** failed **"
-			;;
-		esac
-
-		if [[ -n "${diff}" ]]; then
-			echo "#"
-			echo "# diff:"
-			echo "#"
-			diff -u golden/${testname} $T_RESULTS/output/${testname} | expand | sed 's/^/#   /'
-		fi
-
-		if [[ -n "${dmsg}" ]]; then
-			echo "#"
-			echo "# dmesg:"
-			echo "#"
-			cat "$T_RESULTS/tmp/${testname}/dmesg.new" | sed 's/^/#   /'
-		fi
-	fi
-) >> $T_RESULTS/scoutfs.tap
-}
--- a/tests/golden/large-fragmented-free
+++ b/tests/golden/large-fragmented-free
@@ -1,3 +1,4 @@
+== setting longer hung task timeout
 == creating fragmented extents
 == unlink file with moved extents to free extents per block
 == cleanup
--- a/tests/golden/mmap
+++ b/tests/golden/mmap
@@ -1,27 +0,0 @@
-== mmap_stress
-thread 0 complete
-thread 1 complete
-thread 2 complete
-thread 3 complete
-thread 4 complete
-== basic mmap/read/write consistency checks
-== mmap read from offline extent
-0: offset: 0 length: 2 flags: O.L
-extents: 1
-1
-00000200:  ea ea ea ea ea ea ea ea ea ea ea ea ea ea ea ea  ................
-0
-0: offset: 0 length: 2 flags: ..L
-extents: 1
-== mmap write to an offline extent
-0: offset: 0 length: 2 flags: O.L
-extents: 1
-1
-0
-0: offset: 0 length: 2 flags: ..L
-extents: 1
-00000000  ea ea ea ea ea ea ea ea  ea ea ea ea ea ea ea ea  |................|
-00000010  11 11 11 11 11 11 11 11  11 11 11 11 11 11 11 11  |................|
-00000020  ea ea ea ea ea ea ea ea  ea ea ea ea ea ea ea ea  |................|
-00000030
-== done
--- a/tests/golden/offline-extent-waiting
+++ b/tests/golden/offline-extent-waiting
@@ -49,7 +49,7 @@ offline wating should be empty:
 0
 == truncating does wait
 truncate should be waiting for first block:
-truncate should no longer be waiting:
+trunate should no longer be waiting:
 0
 == writing waits
 should be waiting for write
--- a/tests/golden/simple-readdir
+++ b/tests/golden/simple-readdir
@@ -1,97 +0,0 @@
-== create content
-== readdir all
-00000000: d_off: 0x00000001 d_reclen: 0x18 d_type: DT_DIR d_name: .
-00000001: d_off: 0x00000002 d_reclen: 0x18 d_type: DT_DIR d_name: ..
-00000002: d_off: 0x00000003 d_reclen: 0x18 d_type: DT_REG d_name: a
-00000003: d_off: 0x00000004 d_reclen: 0x20 d_type: DT_REG d_name: aaaaaaaa
-00000004: d_off: 0x00000005 d_reclen: 0x28 d_type: DT_REG d_name: aaaaaaaaaaaaaaa
-00000005: d_off: 0x00000006 d_reclen: 0x30 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaa
-00000006: d_off: 0x00000007 d_reclen: 0x38 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-00000007: d_off: 0x00000008 d_reclen: 0x38 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-00000008: d_off: 0x00000009 d_reclen: 0x40 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-00000009: d_off: 0x0000000a d_reclen: 0x48 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-0000000a: d_off: 0x0000000b d_reclen: 0x50 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-0000000b: d_off: 0x0000000c d_reclen: 0x58 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-0000000c: d_off: 0x0000000d d_reclen: 0x60 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-0000000d: d_off: 0x0000000e d_reclen: 0x68 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-0000000e: d_off: 0x0000000f d_reclen: 0x70 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-0000000f: d_off: 0x00000010 d_reclen: 0x70 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-00000010: d_off: 0x00000011 d_reclen: 0x78 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-00000011: d_off: 0x00000012 d_reclen: 0x80 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-00000012: d_off: 0x00000013 d_reclen: 0x88 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-00000013: d_off: 0x00000014 d_reclen: 0x90 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-00000014: d_off: 0x00000015 d_reclen: 0x98 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-00000015: d_off: 0x00000016 d_reclen: 0xa0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-00000016: d_off: 0x00000017 d_reclen: 0xa8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-00000017: d_off: 0x00000018 d_reclen: 0xa8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-00000018: d_off: 0x00000019 d_reclen: 0xb0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-00000019: d_off: 0x0000001a d_reclen: 0xb8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-0000001a: d_off: 0x0000001b d_reclen: 0xc0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-0000001b: d_off: 0x0000001c d_reclen: 0xc8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-0000001c: d_off: 0x0000001d d_reclen: 0xd0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-0000001d: d_off: 0x0000001e d_reclen: 0xd8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-0000001e: d_off: 0x0000001f d_reclen: 0xe0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-0000001f: d_off: 0x00000020 d_reclen: 0xe0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-00000020: d_off: 0x00000021 d_reclen: 0xe8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-00000021: d_off: 0x00000022 d_reclen: 0xf0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-00000022: d_off: 0x00000023 d_reclen: 0xf8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-00000023: d_off: 0x00000024 d_reclen: 0x100 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-00000024: d_off: 0x00000025 d_reclen: 0x108 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-00000025: d_off: 0x00000026 d_reclen: 0x110 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-== readdir offset
-00000014: d_off: 0x00000015 d_reclen: 0x98 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-00000015: d_off: 0x00000016 d_reclen: 0xa0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-00000016: d_off: 0x00000017 d_reclen: 0xa8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-00000017: d_off: 0x00000018 d_reclen: 0xa8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-00000018: d_off: 0x00000019 d_reclen: 0xb0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-00000019: d_off: 0x0000001a d_reclen: 0xb8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-0000001a: d_off: 0x0000001b d_reclen: 0xc0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-0000001b: d_off: 0x0000001c d_reclen: 0xc8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-0000001c: d_off: 0x0000001d d_reclen: 0xd0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-0000001d: d_off: 0x0000001e d_reclen: 0xd8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-0000001e: d_off: 0x0000001f d_reclen: 0xe0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-0000001f: d_off: 0x00000020 d_reclen: 0xe0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-00000020: d_off: 0x00000021 d_reclen: 0xe8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-00000021: d_off: 0x00000022 d_reclen: 0xf0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-00000022: d_off: 0x00000023 d_reclen: 0xf8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-00000023: d_off: 0x00000024 d_reclen: 0x100 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-00000024: d_off: 0x00000025 d_reclen: 0x108 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-00000025: d_off: 0x00000026 d_reclen: 0x110 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-== readdir len (bytes)
-00000000: d_off: 0x00000001 d_reclen: 0x18 d_type: DT_DIR d_name: .
-00000001: d_off: 0x00000002 d_reclen: 0x18 d_type: DT_DIR d_name: ..
-00000002: d_off: 0x00000003 d_reclen: 0x18 d_type: DT_REG d_name: a
-00000003: d_off: 0x00000004 d_reclen: 0x20 d_type: DT_REG d_name: aaaaaaaa
-00000004: d_off: 0x00000005 d_reclen: 0x28 d_type: DT_REG d_name: aaaaaaaaaaaaaaa
-00000005: d_off: 0x00000006 d_reclen: 0x30 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaa
-00000006: d_off: 0x00000007 d_reclen: 0x38 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-== introduce gap
-00000000: d_off: 0x00000001 d_reclen: 0x18 d_type: DT_DIR d_name: .
-00000001: d_off: 0x00000002 d_reclen: 0x18 d_type: DT_DIR d_name: ..
-00000002: d_off: 0x00000003 d_reclen: 0x18 d_type: DT_REG d_name: a
-00000003: d_off: 0x00000004 d_reclen: 0x20 d_type: DT_REG d_name: aaaaaaaa
-00000004: d_off: 0x00000005 d_reclen: 0x28 d_type: DT_REG d_name: aaaaaaaaaaaaaaa
-00000005: d_off: 0x00000006 d_reclen: 0x30 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaa
-00000006: d_off: 0x00000007 d_reclen: 0x38 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-00000007: d_off: 0x00000008 d_reclen: 0x38 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-00000008: d_off: 0x00000009 d_reclen: 0x40 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-00000009: d_off: 0x00000014 d_reclen: 0x48 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-00000014: d_off: 0x00000015 d_reclen: 0x98 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-00000015: d_off: 0x00000016 d_reclen: 0xa0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-00000016: d_off: 0x00000017 d_reclen: 0xa8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-00000017: d_off: 0x00000018 d_reclen: 0xa8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-00000018: d_off: 0x00000019 d_reclen: 0xb0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-00000019: d_off: 0x0000001a d_reclen: 0xb8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-0000001a: d_off: 0x0000001b d_reclen: 0xc0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-0000001b: d_off: 0x0000001c d_reclen: 0xc8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-0000001c: d_off: 0x0000001d d_reclen: 0xd0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-0000001d: d_off: 0x0000001e d_reclen: 0xd8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-0000001e: d_off: 0x0000001f d_reclen: 0xe0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-0000001f: d_off: 0x00000020 d_reclen: 0xe0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-00000020: d_off: 0x00000021 d_reclen: 0xe8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-00000021: d_off: 0x00000022 d_reclen: 0xf0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-00000022: d_off: 0x00000023 d_reclen: 0xf8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-00000023: d_off: 0x00000024 d_reclen: 0x100 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-00000024: d_off: 0x00000025 d_reclen: 0x108 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-00000025: d_off: 0x00000026 d_reclen: 0x110 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-== cleanup
--- a/tests/golden/xfstests
+++ b/tests/golden/xfstests
@@ -22,8 +22,6 @@ generic/024
 generic/025
 generic/026
 generic/028
-generic/029
-generic/030
 generic/031
 generic/032
 generic/033
@@ -55,7 +53,6 @@ generic/073
 generic/076
 generic/078
 generic/079
-generic/080
 generic/081
 generic/082
 generic/084
@@ -84,12 +81,10 @@ generic/116
 generic/117
 generic/118
 generic/119
-generic/120
 generic/121
 generic/122
 generic/123
 generic/124
-generic/126
 generic/128
 generic/129
 generic/130
@@ -100,7 +95,6 @@ generic/136
 generic/138
 generic/139
 generic/140
-generic/141
 generic/142
 generic/143
 generic/144
@@ -159,7 +153,6 @@ generic/210
 generic/211
 generic/212
 generic/214
-generic/215
 generic/216
 generic/217
 generic/218
@@ -180,9 +173,6 @@ generic/238
 generic/240
 generic/244
 generic/245
-generic/246
-generic/247
-generic/248
 generic/249
 generic/250
 generic/252
@@ -241,7 +231,6 @@ generic/317
 generic/319
 generic/322
 generic/324
-generic/325
 generic/326
 generic/327
 generic/328
@@ -255,7 +244,6 @@ generic/337
 generic/341
 generic/342
 generic/343
-generic/346
 generic/348
 generic/353
 generic/355
@@ -317,9 +305,7 @@ generic/424
 generic/425
 generic/426
 generic/427
-generic/428
 generic/436
-generic/437
 generic/439
 generic/440
 generic/443
@@ -329,7 +315,6 @@ generic/448
 generic/449
 generic/450
 generic/451
-generic/452
 generic/453
 generic/454
 generic/456
@@ -453,7 +438,6 @@ generic/610
 generic/611
 generic/612
 generic/613
-generic/614
 generic/618
 generic/621
 generic/623
@@ -467,7 +451,6 @@ generic/632
 generic/634
 generic/635
 generic/637
-generic/638
 generic/639
 generic/640
 generic/644
@@ -879,4 +862,4 @@ generic/688
 generic/689
 shared/002
 shared/032
-Passed all 512 tests
+Passed all 495 tests
--- a/tests/run-tests.sh
+++ b/tests/run-tests.sh
@@ -512,11 +512,6 @@ msg "running tests"
 > "$T_RESULTS/skip.log"
 > "$T_RESULTS/fail.log"

-# generate a test ID to make sure we can de-duplicate TAP results in aggregation
-. funcs/tap.sh
-t_tap_header $(uuidgen)
-
-testcount=0
 passed=0
 skipped=0
 failed=0
@@ -532,15 +527,12 @@ for t in $tests; do
 	cmd rm -rf "$T_TMPDIR"
 	cmd mkdir -p "$T_TMPDIR"

-	# create a test name dir in the fs, clean up old data as needed
+	# create a test name dir in the fs
 	T_DS=""
 	for i in $(seq 0 $((T_NR_MOUNTS - 1))); do
 		dir="${T_M[$i]}/test/$test_name"

-		test $i == 0 && (
-			test -d "$dir" && cmd rm -rf "$dir"
-			cmd mkdir -p "$dir"
-		)
+		test $i == 0 && cmd mkdir -p "$dir"

 		eval T_D$i=$dir
 		T_D[$i]=$dir
@@ -645,11 +637,6 @@ for t in $tests; do

 		test -n "$T_ABORT" && die "aborting after first failure"
 	fi
-
-	# record results for TAP format output
-	t_tap_progress $test_name $sts
-	((testcount++))
-
 done

 msg "all tests run: $passed passed, $skipped skipped, $skipped_permitted skipped (permitted), $failed failed"
--- a/tests/sequence
+++ b/tests/sequence
@@ -6,7 +6,6 @@ inode-items-updated.sh
 simple-inode-index.sh
 simple-staging.sh
 simple-release-extents.sh
-simple-readdir.sh
 get-referring-entries.sh
 fallocate.sh
 basic-truncate.sh
@@ -18,7 +17,6 @@ projects.sh
 large-fragmented-free.sh
 format-version-forward-back.sh
 enospc.sh
-mmap.sh
 srch-safe-merge-pos.sh
 srch-basic-functionality.sh
 simple-xattr-unit.sh
--- a/tests/src/mmap_stress.c
+++ b/tests/src/mmap_stress.c
@@ -1,181 +0,0 @@
-#define _GNU_SOURCE
-/*
- * mmap() stress test for scoutfs
- *
- * This test exercises the scoutfs kernel module's locking by
- * repeatedly reading/writing using mmap and pread/write calls
- * across 5 clients (mounts).
- *
- * Each thread operates on a single thread/client, and performs
- * operations in a random order on the file.
- *
- * The goal is to assure that locking between _page_mkwrite vfs
- * calls and the normal read/write paths do not cause deadlocks.
- *
- * There is no content validation performed. All that is done is
- * assure that the programs continues without errors.
- */
-
-#include <sys/types.h>
-#include <stdio.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include <stdlib.h>
-#include <string.h>
-#include <stdbool.h>
-#include <sys/mman.h>
-#include <pthread.h>
-#include <errno.h>
-
-static int size = 0;
-static int count = 0; /* XXX make this duration instead */
-
-struct thread_info {
-	int nr;
-	int fd;
-};
-
-static void *run_test_func(void *ptr)
-{
-	void *buf = NULL;
-	char *addr = NULL;
-	struct thread_info *tinfo = ptr;
-	int c = 0;
-	int fd;
-	ssize_t read, written, ret;
-	int preads = 0, pwrites = 0, mreads = 0, mwrites = 0;
-
-	fd = tinfo->fd;
-
-	if (posix_memalign(&buf, 4096, size) != 0) {
-		perror("calloc");
-		exit(-1);
-	}
-
-	addr = mmap(NULL, size, PROT_WRITE | PROT_READ, MAP_SHARED, fd, 0);
-	if (addr == MAP_FAILED) {
-		perror("mmap");
-		exit(-1);
-	}
-
-	usleep(100000); /* 0.1sec to allow all threads to start roughly at the same time */
-
-	for (;;) {
-		if (++c > count)
-			break;
-
-		switch (rand() % 4) {
-		case 0: /* pread */
-			preads++;
-			for (read = 0; read < size;) {
-				ret = pread(fd, buf, size - read, read);
-				if (ret < 0) {
-					perror("pwrite");
-					exit(-1);
-				}
-				read += ret;
-			}
-			break;
-		case 1: /* pwrite */
-			pwrites++;
-			memset(buf, (char)(c & 0xff), size);
-			for (written = 0; written < size;) {
-				ret = pwrite(fd, buf, size - written, written);
-				if (ret < 0) {
-					perror("pwrite");
-					exit(-1);
-				}
-				written += ret;
-			}
-			break;
-		case 2: /* mmap read */
-			mreads++;
-			memcpy(buf, addr, size); /* noerr */
-			break;
-		case 3: /* mmap write */
-			mwrites++;
-			memset(buf, (char)(c & 0xff), size);
-			memcpy(addr, buf, size); /* noerr */
-			break;
-		}
-	}
-
-	munmap(addr, size);
-
-	free(buf);
-
-	printf("thread %u complete: preads %u pwrites %u mreads %u mwrites %u\n", tinfo->nr,
-		mreads, mwrites, preads, pwrites);
-
-	return NULL;
-}
-
-int main(int argc, char **argv)
-{
-	pthread_t thread[5];
-	struct thread_info tinfo[5];
-	int fd[5];
-	int ret;
-	int i;
-
-	if (argc != 8) {
-		fprintf(stderr, "%s requires 7 arguments - size count file1 file2 file3 file4 file5\n", argv[0]);
-		exit(-1);
-	}
-
-	size = atoi(argv[1]);
-	if (size <= 0) {
-		fprintf(stderr, "invalid size, must be greater than 0\n");
-		exit(-1);
-	}
-
-	count = atoi(argv[2]);
-	if (count < 0) {
-		fprintf(stderr, "invalid count, must be greater than 0\n");
-		exit(-1);
-	}
-
-	/* create and truncate one fd */
-	fd[0] = open(argv[3], O_RDWR | O_CREAT | O_TRUNC, 00644);
-	if (fd[0] < 0) {
-		perror("open");
-		exit(-1);
-	}
-
-	/* make it the test size */
-	if (posix_fallocate(fd[0], 0, size) != 0) {
-		perror("fallocate");
-		exit(-1);
-	}
-
-	/* now open the rest of the fds */
-	for (i = 1; i < 5; i++) {
-		fd[i] = open(argv[3+i], O_RDWR);
-		if (fd[i] < 0) {
-			perror("open");
-			exit(-1);
-		}
-	}
-
-	/* start threads */
-	for (i = 0; i < 5; i++) {
-		tinfo[i].fd = fd[i];
-		tinfo[i].nr = i;
-		ret = pthread_create(&thread[i], NULL, run_test_func, (void*)&tinfo[i]);
-
-		if (ret) {
-			perror("pthread_create");
-			exit(-1);
-		}
-	}
-
-	/* wait for complete */
-	for (i = 0; i < 5; i++)
-		pthread_join(thread[i], NULL);
-
-	for (i = 0; i < 5; i++)
-		close(fd[i]);
-
-	exit(0);
-}
--- a/tests/src/mmap_validate.c
+++ b/tests/src/mmap_validate.c
@@ -1,159 +0,0 @@
-#define _GNU_SOURCE
-/*
- * mmap() content consistency checking for scoutfs
- *
- * This test program validates that content from memory mappings
- * are consistent across clients, whether written/read with mmap or
- * normal writes/reads.
- *
- * One side of (read/write) will always be memory mapped. It may
- * be that both sides do memory mapped (33% of the time).
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <string.h>
-#include <sys/mman.h>
-#include <fcntl.h>
-#include <errno.h>
-
-static int count = 0;
-static int size = 0;
-
-static void run_test_func(int fd1, int fd2)
-{
-	void *buf1 = NULL;
-	void *buf2 = NULL;
-	char *addr1 = NULL;
-	char *addr2 = NULL;
-	int c = 0;
-	ssize_t read, written, ret;
-
-	/* buffers for both sides to compare */
-	if (posix_memalign(&buf1, 4096, size) != 0) {
-		perror("calloc1");
-		exit(-1);
-	}
-
-	if (posix_memalign(&buf2, 4096, size) != 0) {
-		perror("calloc1");
-		exit(-1);
-	}
-
-	/* memory maps for both sides */
-	addr1 = mmap(NULL, size, PROT_WRITE | PROT_READ, MAP_SHARED, fd1, 0);
-	if (addr1 == MAP_FAILED) {
-		perror("mmap1");
-		exit(-1);
-	}
-
-	addr2 = mmap(NULL, size, PROT_WRITE | PROT_READ, MAP_SHARED, fd2, 0);
-	if (addr2 == MAP_FAILED) {
-		perror("mmap2");
-		exit(-1);
-	}
-
-	for (;;) {
-		if (++c > count) /* 10k iterations */
-			break;
-
-		/* put a pattern in buf1 */
-		memset(buf1, c & 0xff, size);
-
-		/* pwrite or mmap write from buf1 */
-		switch (c % 3) {
-		case 0:	/* pwrite */
-			for (written = 0; written < size;) {
-				ret = pwrite(fd1, buf1, size - written, written);
-				if (ret < 0) {
-					perror("pwrite");
-					exit(-1);
-				}
-				written += ret;
-			}
-			break;
-		default: /* mmap write */
-			memcpy(addr1, buf1, size);
-			break;
-		}
-
-		/* pread or mmap read to buf2 */
-		switch (c % 3) {
-		case 2: /* pread */
-			for (read = 0; read < size;) {
-				ret = pread(fd2, buf2, size - read, read);
-				if (ret < 0) {
-					perror("pwrite");
-					exit(-1);
-				}
-				read += ret;
-			}
-			break;
-		default: /* mmap read */
-			memcpy(buf2, addr2, size);
-			break;
-		}
-
-		/* compare bufs */
-		if (memcmp(buf1, buf2, size) != 0) {
-			fprintf(stderr, "memcmp() failed\n");
-			exit(-1);
-		}
-	}
-
-	munmap(addr1, size);
-	munmap(addr2, size);
-
-	free(buf1);
-	free(buf2);
-}
-
-int main(int argc, char **argv)
-{
-	int fd[1];
-
-	if (argc != 5) {
-		fprintf(stderr, "%s requires 4 arguments - size count file1 file2\n", argv[0]);
-		exit(-1);
-	}
-
-	size = atoi(argv[1]);
-	if (size <= 0) {
-		fprintf(stderr, "invalid size, must be greater than 0\n");
-		exit(-1);
-	}
-
-	count = atoi(argv[2]);
-	if (count < 3) {
-		fprintf(stderr, "invalid count, must be greater than 3\n");
-		exit(-1);
-	}
-
-	/* create and truncate one fd */
-	fd[0] = open(argv[3], O_RDWR | O_CREAT | O_TRUNC, 00644);
-	if (fd[0] < 0) {
-		perror("open");
-		exit(-1);
-	}
-
-	fd[1] = open(argv[4], O_RDWR , 00644);
-	if (fd[1] < 0) {
-		perror("open");
-		exit(-1);
-	}
-
-	/* make it the test size */
-	if (posix_fallocate(fd[0], 0, size) != 0) {
-		perror("fallocate");
-		exit(-1);
-	}
-
-	/* run the test function */
-	run_test_func(fd[0], fd[1]);
-
-	close(fd[0]);
-	close(fd[1]);
-
-	exit(0);
-}
--- a/tests/tests/basic-truncate.sh
+++ b/tests/tests/basic-truncate.sh
@@ -11,7 +11,7 @@ FILE="$T_D0/file"
 # final block as we truncated past it.
 #
 echo "== truncate writes zeroed partial end of file block"
-yes 2>/dev/null | dd of="$FILE" bs=8K count=1 status=none iflag=fullblock
+yes | dd of="$FILE" bs=8K count=1 status=none iflag=fullblock
 sync

 # not passing iflag=fullblock causes the file occasionally to just be
--- a/tests/tests/enospc.sh
+++ b/tests/tests/enospc.sh
@@ -88,11 +88,6 @@ rm -rf "$SCR/xattrs"

 echo "== make sure we can create again"
 file="$SCR/file-after"
-C=120
-while (( C-- )); do
-	touch $file 2> /dev/null && break
-	sleep 1
-done
 touch $file
 setfattr -n user.scoutfs-enospc -v 1 "$file"
 sync
--- a/tests/tests/format-version-forward-back.sh
+++ b/tests/tests/format-version-forward-back.sh
@@ -11,8 +11,8 @@
 # format version.
 #

-# not supported on el8 or higher
-if [ $(source /etc/os-release ; echo ${VERSION_ID:0:1}) -gt 7 ]; then
+# not supported on el9 or newer (compare the full major version, not its first digit)
+if [ $(source /etc/os-release ; echo ${VERSION_ID%%.*}) -gt 8 ]; then
 	t_skip_permitted "Unsupported OS version"
 fi

--- a/tests/tests/large-fragmented-free.sh
+++ b/tests/tests/large-fragmented-free.sh
@@ -10,6 +10,30 @@ EXTENTS_PER_BTREE_BLOCK=600
 EXTENTS_PER_LIST_BLOCK=8192
 FREED_EXTENTS=$((EXTENTS_PER_BTREE_BLOCK * EXTENTS_PER_LIST_BLOCK))

+#
+# This test specifically creates a pathologically sparse file that will
+# be as expensive as possible to free.  This is usually fine on
+# dedicated or reasonable hardware, but trying to run this in
+# virtualized debug kernels can take a very long time.  This test is
+# about making sure that the server doesn't fail, not that the platform
+# can handle the scale of work that our btree formats happen to require
+# while execution is bogged down with use-after-free memory reference
+# tracking.  So we give the test a lot more breathing room before
+# deciding that it's hung.
+#
+echo "== setting longer hung task timeout"
+if [ -w /proc/sys/kernel/hung_task_timeout_secs ]; then	# only when the knob exists and is writable
+	secs=$(cat /proc/sys/kernel/hung_task_timeout_secs)	# remember the current value so it can be restored
+	test "$secs" -gt 0 || \
+		t_fail "confusing value '$secs' from /proc/sys/kernel/hung_task_timeout_secs"
+	restore_hung_task_timeout()
+	{
+		echo "$secs" > /proc/sys/kernel/hung_task_timeout_secs
+	}
+	trap restore_hung_task_timeout EXIT	# write the saved value back when this shell exits
+	echo "$((secs * 5))" > /proc/sys/kernel/hung_task_timeout_secs	# five times the saved timeout
+fi
+
 echo "== creating fragmented extents"
 fragmented_data_extents $FREED_EXTENTS $EXTENTS_PER_BTREE_BLOCK "$T_D0/alloc" "$T_D0/move"

--- a/tests/tests/lock-recover-invalidate.sh
+++ b/tests/tests/lock-recover-invalidate.sh
@@ -38,6 +38,6 @@ while [ "$SECONDS" -lt "$END" ]; do
 done

 echo "== stopping background load"
-t_silent_kill $load_pids
+kill $load_pids

 t_pass
--- a/tests/tests/mmap.sh
+++ b/tests/tests/mmap.sh
@@ -1,54 +0,0 @@
-#
-# test mmap() and normal read/write consistency between different nodes
-#
-
-t_require_commands mmap_stress mmap_validate scoutfs xfs_io
-
-echo "== mmap_stress"
-mmap_stress 8192 2000 "$T_D0/mmap_stress" "$T_D1/mmap_stress" "$T_D2/mmap_stress" "$T_D3/mmap_stress" "$T_D4/mmap_stress" | sed 's/:.*//g' | sort
-
-echo "== basic mmap/read/write consistency checks"
-mmap_validate 256 1000 "$T_D0/mmap_val1" "$T_D1/mmap_val1"
-mmap_validate 8192 1000 "$T_D0/mmap_val2" "$T_D1/mmap_val2"
-mmap_validate 88400 1000 "$T_D0/mmap_val3" "$T_D1/mmap_val3"
-
-echo "== mmap read from offline extent"
-F="$T_D0/mmap-offline"
-touch "$F"
-xfs_io -c "pwrite -S 0xEA 0 8192" "$F" > /dev/null
-cp "$F" "${F}-stage"
-vers=$(scoutfs stat -s data_version "$F")
-scoutfs release "$F" -V "$vers" -o 0 -l 8192
-scoutfs get-fiemap -L "$F"
-xfs_io -c "mmap -rwx 0 8192" \
-	-c "mread -v 512 16" "$F" &
-sleep 1
-# should be 1 - data waiting
-jobs | wc -l
-scoutfs stage "${F}-stage" "$F" -V "$vers" -o 0 -l 8192
-# xfs_io thread <here> will output 16 bytes of read data
-sleep 1
-# should be 0 - no more waiting jobs, xfs_io should have exited
-jobs | wc -l
-scoutfs get-fiemap -L "$F"
-
-echo "== mmap write to an offline extent"
-# reuse the same file
-scoutfs release "$F" -V "$vers" -o 0 -l 8192
-scoutfs get-fiemap -L "$F"
-xfs_io -c "mmap -rwx 0 8192" \
-	-c "mwrite -S 0x11 528 16" "$F" &
-sleep 1
-# should be 1 job waiting
-jobs | wc -l
-scoutfs stage "${F}-stage" "$F" -V "$vers" -o 0 -l 8192
-# no output here from write
-sleep 1
-# should be 0 - no more waiting jobs, xfs_io should have exited
-jobs | wc -l
-scoutfs get-fiemap -L "$F"
-# read back contents to assure write changed the file
-dd status=none if="$F" bs=1 count=48 skip=512 | hexdump -C
-
-echo "== done"
-t_pass
--- a/tests/tests/offline-extent-waiting.sh
+++ b/tests/tests/offline-extent-waiting.sh
@@ -157,7 +157,7 @@ echo "truncate should be waiting for first block:"
 expect_wait "$DIR/file" "change_size" $ino 0
 scoutfs stage "$DIR/golden" "$DIR/file" -V "$vers" -o 0 -l $BYTES
 sleep .1
-echo "truncate should no longer be waiting:"
+echo "trunate should no longer be waiting:"
 scoutfs data-waiting -B 0 -I 0 -p "$DIR" | wc -l
 cat "$DIR/golden" > "$DIR/file"
 vers=$(scoutfs stat -s data_version "$DIR/file")
@@ -168,13 +168,10 @@ scoutfs release "$DIR/file" -V "$vers" -o 0 -l $BYTES
 # overwrite, not truncate+write
 dd if="$DIR/other" of="$DIR/file" \
 	bs=$BS count=$BLOCKS conv=notrunc status=none &
-pid="$!"
 sleep .1
 echo "should be waiting for write"
 expect_wait "$DIR/file" "write" $ino 0
 scoutfs stage "$DIR/golden" "$DIR/file" -V "$vers" -o 0 -l $BYTES
-# wait for the background dd to complete
-wait "$pid" 2> /dev/null
 cmp "$DIR/file" "$DIR/other"

 echo "== cleanup"
--- a/tests/tests/orphan-inodes.sh
+++ b/tests/tests/orphan-inodes.sh
@@ -5,6 +5,18 @@
 t_require_commands sleep touch sync stat handle_cat kill rm
 t_require_mounts 2

+#
+# usually bash prints an annoying output message when jobs
+# are killed.  We can avoid that by redirecting stderr for
+# the bash process when it reaps the jobs that are killed.
+#
+silent_kill() {
+	exec {ERR}>&2 2>/dev/null	# dup stderr to a new fd stored in $ERR, then send stderr to /dev/null
+	kill "$@"
+	wait "$@"			# reap the jobs while stderr is still redirected
+	exec 2>&$ERR {ERR}>&-		# restore stderr from the saved fd and close the saved fd
+}
+
 #
 # We don't have a great way to test that inode items still exist.   We
 # don't prevent opening handles with nlink 0 today, so we'll use that.
@@ -40,7 +52,7 @@ inode_exists $ino || echo "$ino didn't exist"

 echo "== orphan from failed evict deletion is picked up"
 # pending kill signal stops evict from getting locks and deleting
-t_silent_kill $pid
+silent_kill $pid
 t_set_sysfs_mount_option 0 orphan_scan_delay_ms 1000
 sleep 5
 inode_exists $ino && echo "$ino still exists"
@@ -58,7 +70,7 @@ for nr in $(t_fs_nrs); do
 	rm -f "$path"
 done
 sync
-t_silent_kill $pids
+silent_kill $pids
 for nr in $(t_fs_nrs); do
 	t_force_umount $nr
 done
@@ -67,49 +79,10 @@ t_mount_all
 while test -d $(echo /sys/fs/scoutfs/*/fence/* | cut -d " " -f 1); do
 	sleep .5
 done
-
-
-sv=$(t_server_nr)
-
-# wait for reclaim_open_log_tree() to complete for each mount
-while [ $(t_counter reclaimed_open_logs $sv) -lt $T_NR_MOUNTS ]; do
-	sleep 1
-done
-
-# wait for finalize_and_start_log_merge() to find no active merges in flight
-# and not find any finalized trees
-while [ $(t_counter log_merge_no_finalized $sv) -lt 1 ]; do
-	sleep 1
-done
-
 # wait for orphan scans to run
 t_set_all_sysfs_mount_options orphan_scan_delay_ms 1000
-# wait until we see two consecutive orphan scan attempts without
-# any inode deletion forward progress in each mount
-for nr in $(t_fs_nrs); do
-	C=0
-	LOSA=$(t_counter orphan_scan_attempts $nr)
-	LDOP=$(t_counter inode_deleted $nr)
-
-	while [ $C -lt 2 ]; do
-		sleep 1
-
-		OSA=$(t_counter orphan_scan_attempts $nr)
-		DOP=$(t_counter inode_deleted $nr)
-
-		if [ $OSA != $LOSA ]; then
-			if [ $DOP == $LDOP ]; then
-				(( C++ ))
-			else
-				C=0
-			fi
-		fi
-
-		LOSA=$OSA
-		LDOP=$DOP
-	done
-done
-
+# also have to wait for delayed log merge work from mount
+sleep 15
 for ino in $inos; do
 	inode_exists $ino && echo "$ino still exists"
 done
@@ -158,7 +131,7 @@ while [ $SECONDS -lt $END ]; do
 	done

 	# trigger eviction deletion of each file in each mount
-	t_silent_kill $pids
+	silent_kill $pids

 	wait || t_fail "handle_fsetxattr failed"

--- a/tests/tests/simple-readdir.sh
+++ b/tests/tests/simple-readdir.sh
@@ -1,37 +0,0 @@
-#
-# verify d_off output of xfs_io is consistent.
-#
-
-t_require_commands xfs_io
-
-filt()
-{
-	grep d_off | cut -d ' ' -f 1,4-
-}
-
-echo "== create content"
-for s in $(seq 1 7 250); do
-	f=$(printf '%*s' $s | tr ' ' 'a')
-	touch ${T_D0}/$f
-done
-
-echo "== readdir all"
-xfs_io -c "readdir -v" $T_D0 | filt
-
-echo "== readdir offset"
-xfs_io -c "readdir -v -o 20" $T_D0 | filt
-
-echo "== readdir len (bytes)"
-xfs_io -c "readdir -v -l 193" $T_D0 | filt
-
-echo "== introduce gap"
-for s in $(seq 57 7 120); do
-	f=$(printf '%*s' $s | tr ' ' 'a')
-	rm -f ${T_D0}/$f
-done
-xfs_io -c "readdir -v" $T_D0 | filt
-
-echo "== cleanup"
-rm -rf $T_D0
-
-t_pass
--- a/tests/tests/xfstests.sh
+++ b/tests/tests/xfstests.sh
@@ -65,14 +65,26 @@ EOF

 cat << EOF > local.exclude
 generic/003	# missing atime update in buffered read
+generic/029	# mmap missing
+generic/030	# mmap missing
 generic/075	# file content mismatch failures (fds, etc)
+generic/080	# mmap missing
 generic/103	# enospc causes trans commit failures
 generic/108	# mount fails on failing device?
 generic/112	# file content mismatch failures (fds, etc)
+generic/120	# (can't exec 'cause no mmap)
+generic/126	# (can't exec 'cause no mmap)
+generic/141	# mmap missing
 generic/213	# enospc causes trans commit failures
+generic/215	# mmap missing
+generic/246	# mmap missing
+generic/247	# mmap missing
+generic/248	# mmap missing
 generic/318	# can't support user namespaces until v5.11
 generic/321	# requires selinux enabled for '+' in ls?
+generic/325	# mmap missing
 generic/338	# BUG_ON update inode error handling
+generic/346	# mmap missing
 generic/347	# _dmthin_mount doesn't work?
 generic/356	# swap
 generic/357	# swap
@@ -80,13 +92,16 @@ generic/409	# bind mounts not scripted yet
 generic/410	# bind mounts not scripted yet
 generic/411	# bind mounts not scripted yet
 generic/423	# symlink inode size is strlen() + 1 on scoutfs
+generic/428	# mmap missing
 generic/430	# xfs_io copy_range missing in el7
 generic/431	# xfs_io copy_range missing in el7
 generic/432	# xfs_io copy_range missing in el7
 generic/433	# xfs_io copy_range missing in el7
 generic/434	# xfs_io copy_range missing in el7
+generic/437	# mmap missing
 generic/441	# dm-mapper
 generic/444	# el9's posix_acl_update_mode is buggy ?
+generic/452	# exec test - no mmap
 generic/467	# open_by_handle ESTALE
 generic/472	# swap
 generic/484	# dm-mapper
@@ -103,9 +118,11 @@ generic/565	# xfs_io copy_range missing in el7
 generic/568	# falloc not resulting in block count increase
 generic/569	# swap
 generic/570	# swap
+generic/614	# mmap missing
 generic/620	# dm-hugedisk
-generic/633	# id-mapped mounts missing in el7
+generic/633	# mmap, id-mapped mounts missing in el7
 generic/636	# swap
+generic/638	# mmap missing
 generic/641	# swap
 generic/643	# swap
 EOF
--- a/utils/man/scoutfs.5
+++ b/utils/man/scoutfs.5
@@ -130,24 +130,6 @@ the server for the filesystem if it is elected leader.
 The assigned number must match one of the slots defined with \-Q options
 when the filesystem was created with mkfs.  If the number assigned
 doesn't match a number created during mkfs then the mount will fail.
-.TP
-.B tcp_keepalive_timeout_ms=<number>
-This option sets the amount of time, in milliseconds, that a client
-connection will wait for active TCP packets, before deciding that
-the connection is dead. This setting is per-mount and only changes
-the behavior of that mount.
-.sp
-The default value of this setting is 10000msec (10s). Any precision
-beyond a whole second is likely unrealistic due to the nature of
-TCP keepalive mechanisms in the Linux kernel. Valid values are any
-value higher than 3000 (3s). Values that are higher than 30000msec
-(30s) will likely interfere with other embedded timeout values.
-.sp
-The TCP keepalive mechanism is complex and observing a lost connection
-quickly is important to maintain cluster stability. If the local
-network suffers from intermittent outages this option may provide
-some respite to overcome these outages without the cluster becoming
-desynchronized.
 .SH VOLUME OPTIONS
 Volume options are persistent options which are stored in the super
 block in the metadata device and which apply to all mounts of the volume.
--- a/utils/sparse.sh
+++ b/utils/sparse.sh
@@ -1,7 +1,7 @@
 #!/bin/bash

-# must have sparse.  Fail with error message, mask success path.
-which sparse > /dev/null || exit 1
+# can we find sparse?  If not, we're done.
+which sparse > /dev/null 2>&1 || exit 0

 # 
 # one of the problems with using sparse in userspace is that it picks up
@@ -22,11 +22,6 @@ RE="$RE|warning: memset with byte count of 4194304"
 # some sparse versions don't know about some builtins
 RE="$RE|error: undefined identifier '__builtin_fpclassify'"

-# on el8, sparse can't handle __has_include for some reason when _GNU_SOURCE
-# is defined, and we need that for O_DIRECT.
-RE="$RE|note: in included file .through /usr/include/sys/stat.h.:"
-RE="$RE|/usr/include/bits/statx.h:30:6: error: "
-
 #
 # don't filter out 'too many errors' here, it can signify that
 # sparse doesn't understand something and is throwing a *ton*
--- a/utils/src/util.c
+++ b/utils/src/util.c
@@ -7,6 +7,7 @@
 #include <errno.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <wordexp.h>

 #include "util.h"
 #include "format.h"
@@ -17,15 +18,46 @@

+/*
+ * Open a path after running it through wordexp() so that shell-style
+ * expansions in the argument are honored.  WRDE_NOCMD rejects command
+ * substitution and WRDE_UNDEF rejects undefined variables.  Only the
+ * first expanded word is opened.  Returns the fd from open() or a
+ * negative errno after printing a message to stderr.
+ */
 static int open_path(char *path, int flags)
 {
+	wordexp_t exp_result = { 0 };
 	int ret;

-	ret = open(path, flags);
+	ret = wordexp(path, &exp_result, WRDE_NOCMD | WRDE_SHOWERR | WRDE_UNDEF);
+	if (ret) {
+		fprintf(stderr, "wordexp() failure for \"%s\": %d\n", path, ret);
+		/*
+		 * wordexp() only leaves words behind to free when it fails
+		 * with WRDE_NOSPACE.  For any other error exp_result wasn't
+		 * filled in and must not be handed to wordfree().
+		 */
+		if (ret == WRDE_NOSPACE)
+			wordfree(&exp_result);
+		return -EINVAL;
+	}
+
+	/* an expansion with no words leaves we_wordv[0] NULL */
+	if (exp_result.we_wordc == 0) {
+		fprintf(stderr, "path \"%s\" expanded to nothing\n", path);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = open(exp_result.we_wordv[0], flags);
 	if (ret < 0) {
 		ret = -errno;
 		fprintf(stderr, "failed to open '%s': %s (%d)\n",
 			path, strerror(errno), errno);
 	}

+out:
+	wordfree(&exp_result);
+
 	return ret;
 }