Compare commits

..

107 Commits

Author SHA1 Message Date
Auke Kok
732637d372 merge conflict from zab/shrink cleanup 2025-10-07 12:22:53 -07:00
Auke Kok
963591cc9a Fix a sparse warning in net.c 2025-10-07 12:22:40 -07:00
Auke Kok
ad79ee94f9 Add tcp_keepalive_timeout_ms option.
The default TCP keepalive value is currently 10s, resulting in clients
being disconnected after 10 seconds of not replying to a TCP keepalive
packet. These keepalive values are reasonable most of the time, but
we've seen client disconnects where this timeout has been exceeded,
resulting in fencing. The cause for this is unknown at this time, but it
is suspected that network intermissions are happening.

This change adds a configurable value for this specific client socket
timeout. It enforces that its value is above UNRESPONSIVE_PROBES, whose
value remains unchanged.

The default value of 10000ms (10s) remains the trusted value. It is
entirely unclear and untested what values are reasonable and which
ones are not.  Since the value of this setting can and will interact
with other timeout values, care must be taken to not exceed certain
other timeout values.  I've tested this only briefly with values of
5000 and 25000. Values outside that range are likely problematic.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2025-10-07 12:16:23 -07:00
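A minimal sketch of the relationship the option enforces, with illustrative names and values (the option plumbing and the UNRESPONSIVE_PROBES value here are assumptions, not the actual patch):

```
#include <linux/errno.h>

#define UNRESPONSIVE_PROBES	5	/* illustrative; unchanged by the patch */

/* derive a per-probe keepalive interval from the configured total timeout */
static int keepalive_probe_interval_ms(unsigned long timeout_ms)
{
	/* the option must exceed UNRESPONSIVE_PROBES so each probe gets time */
	if (timeout_ms <= UNRESPONSIVE_PROBES)
		return -EINVAL;

	return timeout_ms / UNRESPONSIVE_PROBES;
}
```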
Zach Brown
65ea250de9 Remove msghdr iov_iter kernelcompat
This removes the KC_MSGHDR_STRUCT_IOV_ITER kernel compat.
kernel_{send,recv}msg() initializes either msg_iov or msg_iter.

This isn't a clean revert of "69068ae2 Initialize msg.msg_iter from
iovec." because previous patches fixed the order of arguments, and the
net send caller was removed.

Signed-off-by: Zach Brown <zab@versity.com>
2025-10-07 12:15:59 -07:00
Zach Brown
86ca09ed7d Send messages in batches
Previous work had the receiver try to receive multiple messages in bulk.
This does the same for the sender.

We walk the send queue and initialize a vector that we then send with
one call.  This is intentionally similar to the single message sending
pattern to avoid unintended changes.

Along with the changes to receive in bulk, this ended up increasing the
message processing rate by about 6x when both send and receive were
going full throttle.

Signed-off-by: Zach Brown <zab@versity.com>
2025-10-07 12:15:51 -07:00
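A sketch of the batched send pattern described above; the queue walk and struct names are assumptions, but kernel_sendmsg() taking a kvec array is the standard in-kernel API:

```
#include <linux/net.h>
#include <linux/socket.h>
#include <linux/list.h>

#define SEND_BATCH 16

struct queued_msg {
	struct list_head entry;
	void *buf;
	size_t len;
};

/* fill one kvec per queued message and send them all with one call */
static int send_queued_batch(struct socket *sock, struct list_head *queue)
{
	struct msghdr msg = { .msg_flags = MSG_NOSIGNAL };
	struct kvec vec[SEND_BATCH];
	struct queued_msg *qm;
	size_t nr = 0;
	size_t bytes = 0;

	list_for_each_entry(qm, queue, entry) {
		if (nr == SEND_BATCH)
			break;
		vec[nr].iov_base = qm->buf;
		vec[nr].iov_len = qm->len;
		bytes += qm->len;
		nr++;
	}

	/* kernel_sendmsg() initializes msg_iter from the kvec array */
	return kernel_sendmsg(sock, &msg, vec, nr, bytes);
}
```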
Zach Brown
5681920bfe Fix swapped sendmsg nr_segs/count
When the msg_iter compat was added the iter was initialized with nr_segs
and count swapped.  I'm not convinced this had any effect because the
kernel_{send,recv}msg() call would initialize msg_iter again with the
correct arguments.

Signed-off-by: Zach Brown <zab@versity.com>
2025-10-07 12:15:43 -07:00
Zach Brown
6c2ccf75ea Receive incoming messages in bulk
Our messaging layer is used for small control messages, not large data
payloads.  By calling recvmsg twice for every incoming message we're
hitting the socket lock reasonably hard.  With senders doing the same,
and a lot of messages flowing in each direction, the contention is
non-trivial.

This changes the receiver to copy as much of the incoming stream into a
page that is then framed and copied again into individual allocated
messages that can be processed concurrently.  We're avoiding contention
with the sender on the socket at the cost of additional copies of our
small messages.

Signed-off-by: Zach Brown <zab@versity.com>
2025-10-07 12:15:34 -07:00
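A sketch of the bulk receive, with assumed names: a single kernel_recvmsg() call pulls as much of the stream as fits in a page, and the caller then frames and copies individual messages out of it:

```
#include <linux/mm.h>
#include <linux/net.h>
#include <linux/socket.h>

/* returns bytes received into page_buf; the caller frames messages from it */
static int recv_stream_into_page(struct socket *sock, void *page_buf)
{
	struct kvec kv = {
		.iov_base = page_buf,
		.iov_len = PAGE_SIZE,
	};
	struct msghdr msg = { .msg_flags = MSG_DONTWAIT };

	return kernel_recvmsg(sock, &msg, &kv, 1, kv.iov_len, msg.msg_flags);
}
```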
Zach Brown
a818b9e461 Process client lock messages in ordered work
The lock client has a requirement that it can't handle some messages
being processed out of order.  Previously it had detected message
ordering itself, but had missed some cases.  Receive processing was then
changed to always call lock message processing from the recv work to
globally order all lock messages.

This inline processing was contributing to excessive latencies in making
our way through the incoming receive queue, delaying work that would
otherwise be parallel once we got it off the recv queue.

This was seen in practice as a giant flood of lock shrink messages
arrived at the client.  It processed each in turn, starving a statfs
response long enough to trigger the hung task warning.

This fix does two things.

First, it moves ordered recv processing out of the recv work.  It lets
the recv work drain the socket quickly and turn it into a list that the
ordered work is consuming.  Other messages will have a chance to be
received and queued to their processing work without having to wait for
the ordered work to be processed.

Second, it adds parallelism to the ordered processing.  The incoming
lock messages don't need global ordering; they need ordering within each
lock.  We add an arbitrary but reasonable number of ordered workers and
hash lock messages to each worker based on the lock's key.

Signed-off-by: Zach Brown <zab@versity.com>
2025-10-07 12:15:20 -07:00
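A sketch of the second part, with assumed names and worker count: lock messages are hashed by the lock's key to one of a fixed pool of ordered workers, preserving per-lock ordering without global serialization:

```
#include <linux/jhash.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>

#define NR_ORDERED_WORKERS 8	/* arbitrary but reasonable */

struct ordered_worker {
	struct work_struct work;
	struct list_head msgs;	/* consumed in arrival order */
	spinlock_t lock;
};

static struct ordered_worker ordered_workers[NR_ORDERED_WORKERS];

/* messages for the same lock key always land on the same worker */
static struct ordered_worker *worker_for_key(const void *key, u32 key_len)
{
	return &ordered_workers[jhash(key, key_len, 0) % NR_ORDERED_WORKERS];
}
```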
Zach Brown
b9f8eee59e Use list_lru for block cache shrinking
The block cache had a bizarre cache eviction policy that was trying to
avoid precise LRU updates at each block.  It had pretty bad behaviour,
including only allowing reclaim of maybe 20% of the blocks that were
visited by the shrinker.

We can use the existing list_lru facility in the kernel to do a better
job.  Blocks only exhibit contention as they're allocated and added to
per-node lists.  From then on we only set accessed bits and the private
list walkers move blocks around on the list as we see the accessed bits.
(It looks more like a fifo with lazy promotion than a "LRU" that is
actively moving list items around as they're accessed.)

Using the facility means changing how we remove blocks from the cache
and hide them from lookup.  We clean up the refcount inserted flag a bit
to be expressed more as a base refcount that can be acquired by
whoever's removing from the cache.  It seems a lot clearer.

Signed-off-by: Zach Brown <zab@versity.com>
2025-10-07 12:14:25 -07:00
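A sketch of the walk callback this enables, using names from this patch; the callback signature shown is one of the several variants the kernelcompat below has to handle:

```
/* lookup only sets BLOCK_BIT_ACCESSED; the walk lazily promotes */
static enum lru_status block_lru_isolate(struct list_head *item,
					 struct list_lru_one *list,
					 spinlock_t *lock, void *cb_arg)
{
	struct block_private *bp =
		container_of(item, struct block_private, lru_head);

	if (test_and_clear_bit(BLOCK_BIT_ACCESSED, &bp->bits))
		return LRU_ROTATE;	/* fifo with lazy promotion */

	/* claim the base refcount so lookup can't revive the block */
	if (!block_get_remove_live_only(bp))
		return LRU_SKIP;

	list_lru_isolate(list, item);
	return LRU_REMOVED;
}
```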
Zach Brown
d8fcbb9564 Add kernelcompat for list_lru
Add kernelcompat helpers for initial use of list_lru for shrinking.  The
most complicated part is the walk callback type changing.

Signed-off-by: Zach Brown <zab@versity.com>
2025-10-07 12:14:15 -07:00
Zach Brown
4d58252e1a Retry stale item reads instead of stopping reclaim
Readers can read a set of items that is stale with respect to items that
were dirtied and written under a local cluster lock after the read
started.

The active reader mechanism addressed this by refusing to shrink pages
that could contain items that were dirtied while any readers were in
flight.  Under the right circumstances this can result in refusing to
shrink quite a lot of pages indeed.

This changes the mechanism to allow pages to be reclaimed, and instead
forces stale readers to retry.  The gamble is that reads are much faster
than writes.  A small fraction should have to retry, and when they do
they can be satisfied by the block cache.

Signed-off-by: Zach Brown <zab@versity.com>
2025-10-07 12:12:29 -07:00
Chris Kirby
293df47589 Fix race condition in orphan-inodes test
Make sure that the orphan scanners can see deletions after forced unmounts
by waiting for reclaim_open_log_tree() to run on each mount, and by waiting for
finalize_and_start_log_merge() to run and not find any finalized trees.

Do this by adding two new counters: reclaimed_open_logs and
log_merge_no_finalized, and by fixing the orphan-inodes test to check those
before waiting for the orphan scanners to complete.

Signed-off-by: Chris Kirby <ckirby@versity.com>
2025-10-06 16:55:47 -05:00
Chris Kirby
2a58e4c147 Use ENOLINK as a special error code during forced unmount
Tests such as quorum-heartbeat-timeout were failing with EIO messages in dmesg output due to expected errors during forced unmount. Use ENOLINK instead, and filter all errors from dmesg with this errno (67).

Signed-off-by: Chris Kirby <ckirby@versity.com>
2025-10-06 15:57:42 -05:00
Auke Kok
1b7917e063 Don't run format-version-forward-back on el8, either
This test compiles an earlier commit from the tree that is starting to
fail due to various changes on the OS level, most recently due to sparse
issues with newer kernel headers. This problem will likely increase
in the future as we add more supported releases.

We opt to run this test only on el7 for now. While we could have
made this skip sparse checks that fail it on el8, it will suffice at
this point if this just works on one of the supported OS versions
during testing.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2025-10-06 12:27:25 -05:00
Zach Brown
4f9c3503c8 Add cond_resched to iput worker
The iput worker can accumulate quite a bit of pending work to do.  We've
seen hung task warnings while it's doing its work (admittedly in debug
kernels).  There's no harm in throwing in a cond_resched so other tasks
get a chance to do work.

Signed-off-by: Zach Brown <zab@versity.com>
2025-10-06 12:27:25 -05:00
Chris Kirby
541cb47af0 Add tracing for get_file_block() and scoutfs_ioc_search_xattrs().
Signed-off-by: Chris Kirby <ckirby@versity.com>
2025-10-06 12:27:25 -05:00
Chris Kirby
d537365d0a Fix several cases in srch.c where the return value of EIO should have been -EIO.
Signed-off-by: Chris Kirby <ckirby@versity.com>
2025-10-06 12:27:25 -05:00
Chris Kirby
7375627861 Add the inode number to scoutfs_xattr_set traces.
Signed-off-by: Chris Kirby <ckirby@versity.com>
2025-10-06 12:27:25 -05:00
Chris Kirby
48d849e2f4 Only start new quorum election after a receive failure
It's possible for the quorum worker to be preempted for a long period,
especially on debug kernels. Since we only check how much time
has passed, it's possible for a clean receive to inadvertently
trigger an election. This can cause the quorum-heartbeat-timeout
test to fail due to observed delays outside of the expected bounds.

Instead, make sure we had a receive failure before comparing timestamps.

Signed-off-by: Chris Kirby <ckirby@versity.com>
2025-10-06 12:27:25 -05:00
Chris Kirby
35bcad91a6 Close window where we can lose search items
In finalize_and_start_log_merge(), we overwrite the server
mount's log tree with its finalized form and then later write out
its next open log tree. This leaves a window where the mount's
srch_file is nulled out, causing us to lose any search items in
that log tree.

This shows up as intermittent failures in the srch-basic-functionality
test.

Eliminate this timing window by doing what unmount/reclaim does when
it finalizes: move the resources from the item being finalized
into server trees/items as part of finalizing. Then there is no window
where those resources exist only in memory until we create another
transaction.

Signed-off-by: Chris Kirby <ckirby@versity.com>
2025-10-06 12:27:25 -05:00
Auke Kok
0b7b9d4a5e Avoid trigger munching of block_remove_stale trigger.
It's entirely likely that the trigger here is munched by a read on a
dirty block from any unrelated or background read. Avoid that by putting
the trigger at the end of the condition list.

Now that the order is swapped, we have to avoid a null deref in
block_is_dirty(bp) here, as well.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2025-10-06 12:27:25 -05:00
Auke Kok
f86a7b4d3c Fully wait for orphan inode scan to complete.
The issue with the previous attempt to fix the orphan-inodes test was
that we would regularly exceed the 120s timeout value put in there.

Instead, in this commit, we change the code to add a new counter to
indicate orphan deletion progress. When orphan inodes are deleted, the
increment of this counter indicates progress happened. Conversely,
every time the counter doesn't increment, and the orphan scan attempts
counter increments, we know that there was no more work to be done.

For safety, we wait until 2 consecutive scan attempts have been made without
forward progress in the test case.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2025-10-06 12:27:25 -05:00
Auke Kok
96eb9662a1 Revert "Extend orphan-inodes timeout."
This reverts commit 138c7c6b49.

The timeout value here is still exceeded by CI test jobs, and thus
causing the test to fail.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2025-10-06 12:27:25 -05:00
Chris Kirby
47af90d078 Fix race in offline-extent-waiting test
Before comparing file contents, wait for the background dd to complete.
Also fix a typo.

Signed-off-by: Chris Kirby <ckirby@versity.com>
2025-10-06 12:27:25 -05:00
Chris Kirby
669e37c636 Remove hung task workaround from large-fragmented-free test
Adjusting hung_task_timeout_secs is still needed for this test to pass
with a debug kernel. But the logic belongs on the platform side.

Signed-off-by: Chris Kirby <ckirby@versity.com>
2025-10-06 12:27:25 -05:00
Chris Kirby
bb3e1f3665 Fix commit budget calculation with multiple holders
The try_drain_data_freed() path was generating errors about overrunning
its commit budget:

scoutfs f.2b8928.r.02689f error: 1 holders exceeded alloc budget av: bef 8185 now 8036, fr: bef 8185 now 7602

The budget overrun check was using the current number of commit holders
(in this case one) instead of the the maximum number of concurrent holders
(in this case two). So even well behaved paths like try_drain_data_freed()
can appear to exceed their commit budget if other holders dirty some blocks
and apply their commits before the try_drain_data_freed() thread does its
final budget reconciliation.

Signed-off-by: Chris Kirby <ckirby@versity.com>
2025-10-06 12:27:25 -05:00
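A sketch of the corrected check, with assumed names: the overrun test scales by the maximum number of concurrent holders rather than the current count, since other holders may already have applied their commits:

```
/* budget overrun test run at a holder's final reconciliation */
static bool exceeded_alloc_budget(int max_holders, long avail_before,
				  long avail_now, long per_holder_budget)
{
	return (avail_before - avail_now) > max_holders * per_holder_budget;
}
```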
Chris Kirby
0d262de4ac Fix dirtied block calculation in extent_mod_blocks()
Free extents are stored in two btrees: one sorted by block number, one
by size. So if you insert a new extent between two existing extents, you can
be modifying two items in the by-block-number tree. And depending on the size
of those items, that can result in three items over in the by-size tree.
So that's a 5x multiplier per level.

If we're shrinking the tree and adding more freed blocks, we're conceptually
dirtying two blocks at each level to merge. (current *2 in the code).
But if they fall under the low water mark then one of them is freed, so we
can have *3 per level in this case.

Signed-off-by: Chris Kirby <ckirby@versity.com>
2025-10-06 12:27:25 -05:00
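The resulting calculation, as it appears in the diff below: up to five dirtied items per extent modification, each of whose paths can dirty three blocks per level after accounting for a possible new level:

```
static u32 extent_mod_blocks(u32 height)
{
	return ((1 + height) * 3) * 5;
}
```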
Auke Kok
70bd936213 Ignore sparse error about stat.h on el8.
On el8, sparse is at 0.6.4 in epel-release, but it fails with:
```
[SP src/util.c]
src/util.c: note: in included file (through /usr/include/sys/stat.h):
/usr/include/bits/statx.h:30:6: error: not a function <noident>
/usr/include/bits/statx.h:30:6: error: bad constant expression type
```

This is due to us needing O_DIRECT from <fcntl.h>, so we set _GNU_SOURCE
before including it, but this causes (through __USE_GNU in sys/stat.h)
statx.h to be included, and that has __has_include, and sparse is too
dumb to understand it.

Just shut it up.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2025-10-06 12:27:25 -05:00
Chris Kirby
3f786596e0 Don't overrun the block budget in server_log_merge_free_work().
This fixes a potential fence post failure like the following:

error: 1 holders exceeded alloc budget av: bef 7407 now 7392, fr: bef 8185 now 7672

The code is only accounting for the freed btree blocks, not the dirtying of
other items. So it's possible to be at exactly (COMMIT_HOLD_ALLOC_BUDGET / 2),
dirty some log btree blocks, loop again, then consume another
(COMMIT_HOLD_ALLOC_BUDGET / 2) and blow past the total budget.

In this example, we went over by 13 blocks.

By only consuming up to 1/8 of the budget on each loop, and committing when we
have consumed 3/4 of the budget, we can avoid the fence post condition.

Signed-off-by: Chris Kirby <ckirby@versity.com>
2025-10-06 12:27:25 -05:00
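A sketch of that pacing, with an illustrative budget value: each loop iteration consumes at most 1/8 of the budget and we commit once 3/4 is consumed, so one more iteration can never blow past the total:

```
#define COMMIT_HOLD_ALLOC_BUDGET 8192	/* illustrative value */

static inline u32 loop_block_limit(void)
{
	return COMMIT_HOLD_ALLOC_BUDGET / 8;
}

static inline bool should_commit(u32 consumed)
{
	return consumed >= (COMMIT_HOLD_ALLOC_BUDGET / 4) * 3;
}
```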
Zach Brown
cad47ed1ed Merge pull request #247 from versity/zab/sparse_error
Zab/sparse error
2025-10-06 10:09:07 -07:00
Zach Brown
e088424d70 Add initial filters for warnings in distro source
Add a chunk of filters for sparse warnings that trigger on distro kernel
source.

Signed-off-by: Zach Brown <zab@versity.com>
2025-10-03 09:35:36 -07:00
Zach Brown
d0cf026298 Require sparse, and filter kernel sparse output
Fail the build if we don't check with sparse in both the kernel and
userspace utils.  Add a filtering wrapper to the kernel build so that we
have a place to filter out uninteresting errors from kernel sources that
we're building against.

Signed-off-by: Zach Brown <zab@versity.com>
2025-10-03 09:35:36 -07:00
Zach Brown
03fa1ce7c5 Avoid bad sparse warning in lock_invalidate()
This is another example of refactoring a loop to avoid sparse warnings
from doing something in the else branch of a failed trylock.  We want to
drop and reacquire the lock if the trylock fails so we do it every loop
iteration.  This shouldn't be experiencing much contention because most
of the cov users are usually done under locks and invalidation has
excluded lock holders.  So the additional lock and unlock noise should
be local.

Signed-off-by: Zach Brown <zab@versity.com>
2025-10-03 09:35:36 -07:00
Zach Brown
3d9f10de93 Work around sparse warning in _item_write_done
scoutfs_item_write_done() acquires the cinf dirty_lock and pg rwlock out
of order.  It uses a trylock to detect failure and back off of both
before retrying.

sparse seems to have some peculiar sensitivity to following the else
branch from a failed trylock while already in a context.  Doing that
consistently triggered the spurious mismatched context warning.

This refactors the loop to always drop and reacquire the dirty_lock
after attempting the trylock.  It's not great, but this shouldn't be very
contended because the transaction write has serialized write lock
holders that would be trying to dirty items.  The silly lock noise will
be mostly cached.

Signed-off-by: Zach Brown <zab@versity.com>
2025-10-03 09:34:23 -07:00
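A sketch of the refactored shape (not the scoutfs code itself): instead of handling trylock failure in an else branch, the outer lock is dropped and retaken on every iteration:

```
#include <linux/spinlock.h>

/* acquire outer then inner, backing off outer when the trylock fails */
static void lock_both(spinlock_t *outer, rwlock_t *inner)
{
	for (;;) {
		spin_lock(outer);
		if (write_trylock(inner))
			return;	/* both held */
		spin_unlock(outer);
	}
}
```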
Zach Brown
9741d40e10 Merge pull request #229 from versity/zab/v1.25
v1.25 Release
2025-06-04 11:21:25 -07:00
Zach Brown
48ac7bdf7c v1.25 Release
Finish the release notes for the 1.25 release.

Signed-off-by: Zach Brown <zab@versity.com>
2025-06-03 13:35:42 -07:00
Zach Brown
7865ee9f54 Merge pull request #223 from versity/auke/el9_5_wmaybe-uninit
Fix -Wmaybe-uninitialized since rhel9.5
2025-05-12 12:21:02 -07:00
Zach Brown
624eb128c6 Merge pull request #221 from versity/auke/enospc-test
Give enospc test more time to commit unlink.
2025-05-09 11:27:04 -07:00
Zach Brown
091eb3b683 Merge pull request #219 from versity/auke/fix-tests-failing-dirty-test-dirs
Fix test cases that don't run cleanly in a semi-dirty env.
2025-05-09 11:17:24 -07:00
Zach Brown
04e8cc6295 Merge pull request #220 from versity/auke/orphan-inodes
Extend orphan-inodes timeout.
2025-05-09 11:15:13 -07:00
Zach Brown
0f6fdb3eb5 Merge pull request #222 from versity/auke/t_kill_silent
Properly silently kill background tasks.
2025-05-09 11:11:24 -07:00
Auke Kok
2f48a606e8 Fix -Wmaybe-uninitialized since rhel9.5
Looks like the compiler isn't smart enough to understand the
pass-by-pointer value, and we can initialize it here easily.

make[1]: Entering directory '/usr/src/kernels/5.14.0-503.26.1.el9_5.x86_64'
  CC [M]  /home/auke/scoutfs/kmod/src/server.o
/home/auke/scoutfs/kmod/src/server.c: In function ‘fence_pending_recov_worker’:
/home/auke/scoutfs/kmod/src/server.c:4170:23: error: ‘addr.v4.addr’ may be used uninitialized in this function [-Werror=maybe-uninitialized]
 4170 |                 ret = scoutfs_fence_start(sb, rid, le32_to_be32(addr.v4.addr),
      |                       ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 4171 |                                           SCOUTFS_FENCE_CLIENT_RECOVERY);
      |                                           ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
cc1: all warnings being treated as errors

There's still the obvious issue that we'd intended to support IPv6;
we just disregard that here.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2025-05-08 15:20:50 -07:00
Auke Kok
377e49caf1 Properly silently kill background tasks.
Occasionally, we have some tests fail because these kills produce:

tests/lock-recover-invalidate.sh: line 42:  9928 Terminated

This happens even though we expected them to be silent. In these particular cases we
already don't care about this output.

We borrow the silent_kill() function from orphan-inodes and promote it
to t_silent_kill() in funcs/exec.sh, and then use it wherever
appropriate.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2025-05-08 12:03:04 -07:00
Auke Kok
d08eb66adc Give enospc test more time to commit unlink.
The current test sequence performs the unlink and immediately tests
whether enough resources are available to create new files again, and
this consistently fails.

One of my crummy VMs takes a good 12 seconds before the `touch` actually
succeeds. We care about the filesystem eventually returning from ENOSPC,
and certainly we don't want it to take forever, but there is a period
after our first ENOSPC error and cleanup during which we expect ENOSPC
to persist for a bit longer.

Make the timeout 120s. As soon as the `touch` completes, exit the wait
loop.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2025-05-08 11:40:13 -07:00
Zach Brown
6f19d0bd36 Merge pull request #216 from versity/zab/stop_ending_dirty_data_freed
Zab/stop ending dirty data freed
2025-05-08 11:18:23 -07:00
Auke Kok
1d0cde7cc3 Clean up old test data as needed.
If run without `-m` (explicit mkfs) in subsequent testing, old test
data files may break several tests. Most failures are -EEXIST, but
there are some more subtle ones.

This change erases any existing test dir as needed just before we
run the tests, and avoids the issue entirely.

I considered doing a `mv dir dir.$$ && rm -rf dir.$$ &` alternative
solution but that would likely interfere disproportionately with
tests that do disconnects and other things that can be impacted by an
unlink storm.

This has an obvious performance aspect - tests will be a little
slower to start on subsequent runs. In CI, this will effectively be
a no-op though.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2025-05-08 10:10:01 -07:00
Auke Kok
138c7c6b49 Extend orphan-inodes timeout.
This test regularly fails in CI when the 15 seconds elapse and the
system still hasn't concluded the mount log merges and orphan inode
scans needed to unlink the test files.

Instead of just extending the timeout value, we test-and-retry for 120s.
This hopefully is faster in most cases. My smallest VM needs about 6s-8s
on average.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2025-05-08 09:56:45 -07:00
Zach Brown
8aa1a98901 Merge pull request #210 from versity/auke/perf-irq-took-too-long
Filter out perf `interrupt took too long` dmesg.
2025-04-30 10:04:00 -07:00
Zach Brown
888b1394a6 Retry client commit and get log trees separately
The client transaction commit worker has a series of functions that it
calls to commit the current transaction and open the next one.  If any
of them fail, it retries all of them from the beginning each time until
they all succeed.

This pattern behaves badly since we added the strict get_trans_seq and
commit_trans_seq latching in the log_trees.  The server will only commit
the items for a get or commit request once, and will fail a commit
request if it isn't given the seq that matches the current item.

If the server gets an error it can have persisted items while sending an
error to the client.  If this error was for a get request, then the
client will retry all of its transaction write functions.  This includes
the commit request which is now using a stale seq and will fail
indefinitely.  This is visible in the server log as:

  error -5 committing client logs for rid e57e37132c919c4f: invalid log trees item get_trans_seq

The solution is to retry the commit and get phases independently.  This
way a failed get will be retried on its own without running through the
commit phase that had succeeded.  The client will eventually get the
next seq that it can then safely commit.

Signed-off-by: Zach Brown <zab@versity.com>
2025-04-29 11:46:38 -07:00
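A sketch of the new retry structure, with hypothetical helper names: each phase loops to success on its own, so a failed get never re-runs a commit that already succeeded (error backoff and shutdown checks omitted):

```
/* send_commit_log_trees()/send_get_log_trees() are hypothetical names */
static int commit_then_get(struct client_info *client)
{
	int ret;

	do {
		ret = send_commit_log_trees(client);	/* current seq */
	} while (ret < 0);

	do {
		ret = send_get_log_trees(client);	/* next seq */
	} while (ret < 0);

	return ret;
}
```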
Zach Brown
e457694f19 Don't send dirty data_freed blocks to client
At the end of get_log_trees we can try and drain the data_freed extent
tree, which can take multiple commits.  If a commit fails then the
blocks are still dirty in memory.  We can't send references to those
blocks to the client.  We have to return an error and not send the
log_trees, like the main get_log_trees does.  The client will retry and
eventually get a log_trees that references blocks that were successfully
committed.

Signed-off-by: Zach Brown <zab@versity.com>
2025-04-29 11:46:38 -07:00
Zach Brown
459de5b478 Merge pull request #211 from versity/auke/tapf-output
TAP formatted output.
2025-04-15 14:25:06 -07:00
Auke Kok
24031cde1d TAP formatted output.
Stored as `results/scoutfs.tap`, this file contains test results
generated in TAP version 14 format.

Embedded in the output is some metadata so that these files can be
aggregated and stored in a unique, deduplicable way, using a UUID
generated at the start of testing. The file itself also captures the git
ID, date, and kernel version, as well as the (possibly altered) test
sequence used.

Any test that has diff or dmesg output will be considered failed, and a
copy of the relevant data is included as comments.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2025-04-15 12:02:41 -07:00
Zach Brown
04cc41719c Merge pull request #209 from versity/auke/basic-truncate-yes-pipefail
Ignore pipefail alternative error when not a tty.
2025-04-14 13:15:03 -07:00
Auke Kok
1b47e9429e Filter out perf interrupt took too long dmesg.
Example:

```
[ 2469.638414] perf: interrupt took too long (2507 > 2500), lowering kernel.perf_event_max_sample_rate to 79000
```

Signed-off-by: Auke Kok <auke.kok@versity.com>
2025-04-14 12:06:58 -07:00
Auke Kok
7ea084082d Ignore pipefail alternative error when not a tty.
This happens only with the basic-truncate test; it's the only user
of the `yes` program.

The `yes` command normally fails gracefully under the usual runs that
are attached to some terminal. But when the test script runs entirely
under something else, it will throw a needless error message that
pollutes the test output:

  `yes: standard output: Broken pipe`

Adjust the redirect to omit all stderr for `yes` in this case.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2025-04-14 11:13:39 -07:00
Zach Brown
f565451f76 Merge pull request #208 from versity/zab/v1.24
v1.24 Release
2025-03-17 11:18:42 -07:00
Zach Brown
05f14640fb v1.24 Release
Finish the release notes for the 1.24 release.

Signed-off-by: Zach Brown <zab@versity.com>
2025-03-14 12:19:30 -07:00
Zach Brown
609fc56cd6 Merge pull request #203 from versity/auke/new_inode_ctime
Fix new_inode ctime assignment.
2025-02-25 15:23:16 -08:00
Zach Brown
a4b5a256eb Merge pull request #175 from versity/auke/mmap
Support for mmap() writable mappings.
2025-02-20 14:03:01 -08:00
Zach Brown
f701ce104c Merge pull request #204 from versity/zab/remove_wordexp
Remove wordexp expansion of utils path argument
2025-02-19 09:27:15 -08:00
Zach Brown
c6dab3c306 Remove wordexp expansion of utils path argument
scoutfs cli commands were using a helper that tried to perform word
expansion on the path argument.  This was done with the intent of
providing the convenience of shell expansion (env vars, ~) within the
cli command argument.

But it breaks paths that accidentally have their file names match the
syntax that wordexp supports.  "[ ]" tripped up files in the wild.

We don't need to provide shell expansion functionality in our argument
parsing.  The shell can do that.  The cli must pass the arguments
straight through, no parsing at all.

Signed-off-by: Zach Brown <zab@versity.com>
2025-02-18 11:55:37 -08:00
Auke Kok
e3e2cfceec Fix new_inode ctime assignment.
Very old copy/paste bug here: we want to update new_inode's ctime
instead; old_inode is already updated.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2025-02-18 13:15:49 -05:00
Zach Brown
5a10c79409 Merge pull request #201 from versity/auke/fixes_pre_parallel_restore
Misc. fixes and changes to support parallel_restore and check.
2025-02-02 06:53:25 -08:00
Auke Kok
e9d147260c Fix ctx->pos updating to properly handle dent gaps
We need to ensure we're emitting dents with the proper position,
and we already have the position as part of our dent. The only caveat is
to increment ctx->pos once beyond the list to make sure the caller
doesn't call us again.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2025-01-27 14:49:04 -05:00
Auke Kok
6c85879489 Assert unlock doesn't underflow lock user count.
While debugging a double unlock error we hit this condition and
debugging would have been a lot easier had we enforced this simple
constraint that we can't decrement the lock users count if it's
already 0.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2025-01-27 14:49:04 -05:00
Auke Kok
8b76a53cf3 Avoid cluster locking while put_user() in _allocated_inos.
Similar to fiemap, readdir and walk_inodes, this method could page
fault during put_user() while cluster locked, potentially causing a deadlock.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2025-01-27 14:49:04 -05:00
Auke Kok
e76a171c40 Avoid faulting while cluster locked in _walk_inodes.
Similar to readdir and fiemap vfs methods, we can't copy to user while
holding cluster locks. The previous comment about it being safe no
longer applies, and this could deadlock.

Rewrite the loop to iterate and store entries in a page, then flush
the page contents while not holding a clusterlock.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2025-01-27 14:49:04 -05:00
Auke Kok
8cb08507d6 Do not copy to user while holding locks in scoutfs_data_fiemap()
Now that we support mmap writes, at any point in time we could
pagefault and lock for writes. That means - just like readdir -
we can no longer lock and copy_to_user, since it also may page fault
and thus deadlock.

We statically allocate 32 extent entries on the stack and use
these to shuffle out fiemap entries a batch at a time, locking and
unlocking around collecting entries and calling fiemap_fill_next_extent().

Signed-off-by: Auke Kok <auke.kok@versity.com>
2025-01-27 14:49:04 -05:00
Auke Kok
cad12d5ce8 Avoid deadlock in _readdir() due to copy_to_user().
dir_emit() will copy_to_user, which can pagefault. If this happens while
cluster locked, we could deadlock.

We use a single page to stage dir_emit data, and iterate between
fetching dirents while locked, and emitting them while not locked.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2025-01-27 14:49:04 -05:00
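A sketch of the staging pattern shared by these readdir/fiemap/walk_inodes fixes; the helpers here are hypothetical names, not the scoutfs functions:

```
/* alternate: fill a page under the cluster lock, then emit it unlocked */
static int readdir_staged(struct dir_context *ctx, void *page_buf)
{
	int nr;

	for (;;) {
		cluster_lock();
		nr = fill_page_with_dirents(page_buf, ctx->pos);
		cluster_unlock();

		if (nr <= 0)
			return nr;

		/* dir_emit() can copy_to_user() and page fault here */
		if (!emit_dirents_from_page(ctx, page_buf, nr))
			return 0;
	}
}
```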
Auke Kok
e59a5f8ebd Readdir w/offset validation.
Verify using xfs_io that readdir offsets match expected output.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2025-01-27 14:49:04 -05:00
Auke Kok
1bcd1d4d00 Drop readdir pre-.iterate() compat (el7.5ish).
These 2 sections of compat for readdir are wholly obsolete and can be
hard dropped, which restores the method to look like current upstream
code.

This was added in ddd1a4e.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2025-01-23 14:28:40 -05:00
Auke Kok
b944f609aa remap_pages ops becomes obsolete. 2025-01-23 14:28:40 -05:00
Auke Kok
519b47a53c mmap() trace events.
We merely trace exit values and position, and ignore length.

Because vm_fault_t is __bitwise, sparse will loudly complain about
a plain cast to u32, so we must __force (on el8). ret will be 512 in
normal cases.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2025-01-23 14:28:40 -05:00
Auke Kok
92f704d35a Enable all xfstests mmap() tests.
Now that all of these should be passing, we enable all mmap() tests in
xfstests, and update the golden output with the new tests.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2025-01-23 14:28:40 -05:00
Auke Kok
311bf75902 Add mmap tests.
Two test programs are added. The run time is about 1min on my el7
instance.

The test script finishes up with a read/write mmap test on offline
extents to verify the data wait paths in those functions.

One program will perform vfs read/write and mmap read/write calls on
the same file across 5 threads (mounts) repeatedly.  The goal
is to ensure there are no locking issues between read/write paths.

The second test program performs consistency checking on a file that is
repeatedly written/read using memory maps and normal reads and writes,
and the content is verified after every operation.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2025-01-23 14:28:40 -05:00
Benjamin LaHaise
3788d67101 Add support for writable shared mmap()ings
Add support for writable MAP_SHARED mmap()ings.  Avoid issues with late
writepage()s building transactions by doing the block_write_begin() work in
scoutfs_data_page_mkwrite().  Ensure the page is marked dirty and prepared
for write, then let the VM complete the write when the page is flushed or
invalidated.

Signed-off-by: Benjamin LaHaise <bcrl@kvack.org>
Signed-off-by: Auke Kok <auke.kok@versity.com>
2025-01-23 14:28:40 -05:00
Benjamin LaHaise
b7a3d03711 Add support for read only mmap()
Adds the required memory mapped ops struct and page fault handler
for reads.

Signed-off-by: Benjamin LaHaise <bcrl@kvack.org>
Signed-off-by: Auke Kok <auke.kok@versity.com>
2025-01-23 14:28:40 -05:00
Zach Brown
295f751aed Add test_bit to utils bitmap
Add test_bit() to the trivial utils bitmap.c implementation.

Signed-off-by: Zach Brown <zab@versity.com>
2025-01-22 09:58:58 -08:00
Zach Brown
7f6032d9b4 Add lk rbtree wrapper
Import the kernel's rbtree implementation with a wrapper so we can use
it from userspace.

Signed-off-by: Zach Brown <zab@versity.com>
2025-01-22 09:58:49 -08:00
Zach Brown
7e3a6537ec Add userspace version of our dirent name hash
Signed-off-by: Zach Brown <zab@versity.com>
2025-01-22 09:58:41 -08:00
Zach Brown
49b7b70438 Add userspace version of our mode to type
Signed-off-by: Zach Brown <zab@versity.com>
2025-01-22 09:58:31 -08:00
Zach Brown
de0fdd1f9f Promote userspace btree block initialization
Signed-off-by: Zach Brown <zab@versity.com>
2025-01-22 09:58:23 -08:00
Zach Brown
a6d7de3c00 Add fls64() alias for userspace flsll()
Signed-off-by: Zach Brown <zab@versity.com>
2025-01-22 09:58:16 -08:00
Zach Brown
2c2c127c5e Add put_unaligned_leXX() for userspace
Signed-off-by: Zach Brown <zab@versity.com>
2025-01-22 09:58:10 -08:00
Zach Brown
9491c784e7 Add srch_encode_entry() for userspace utils
Signed-off-by: Zach Brown <zab@versity.com>
2025-01-22 09:57:56 -08:00
Zach Brown
c3b30930fa Add bloom filter index calc for userspace utils
Signed-off-by: Zach Brown <zab@versity.com>
2025-01-22 09:57:46 -08:00
Zach Brown
e7e46a80e6 Add userspace NSEC_PER_SEC
Signed-off-by: Zach Brown <zab@versity.com>
2025-01-22 09:57:39 -08:00
Zach Brown
1ddf752f42 Import a few more functions to our list.h
Import a few more functions from the kernel's list.h into our imported
copy.

Signed-off-by: Zach Brown <zab@versity.com>
2025-01-22 09:57:29 -08:00
Zach Brown
14b65c6360 Fix printing alloc list block extents
The list alloc blocks have an array of blknos that are offset by a start
field in the block header.  The print code wasn't using that and was
always referencing the beginning of the array, which could miss blocks.

Signed-off-by: Zach Brown <zab@versity.com>
2025-01-22 09:57:21 -08:00
Zach Brown
934f6c7648 Merge pull request #199 from versity/zab/v1.23
v1.23 Release
2024-12-11 17:02:52 -08:00
Zach Brown
a88972b50e v1.23 Release
Finish the release notes for the 1.23 release.

Signed-off-by: Zach Brown <zab@versity.com>
2024-12-11 13:07:44 -08:00
Zach Brown
3e71f49260 Merge pull request #195 from versity/auke/el9_5
RHEL9.5 kernel support
2024-12-03 14:27:57 -08:00
Zach Brown
8a082e3f99 Merge pull request #197 from versity/greg/block-el9-minor-upgrades
Block EL9 minor version upgrades
2024-12-03 14:09:17 -08:00
Greg Cymbalski
110d5ea0d5 Block EL9 minor version upgrades
Since kABI migrations across minor versions are a thing of the past going
forward, we now:
- Detect if we're on EL9
- If so, add a requirement on the various flavors of release package to
  that specific major.minor version

This appropriately does not allow upgrades across minor versions.
2024-12-02 16:04:24 -08:00
Auke Kok
669de459a7 bdev_open_by_path is now removed as well.
Additional blkdev/bdev changes now cause this call to be removed as
well, resulting in us having to use yet another API to do the same for
el9_5.

The changes are a little more subtle as now the bdev_mount() call passes
a custom bd_holder_ops that we must match or else throw a WARN_ON, so we
switch to using sbi as our holder arg instead.

Make sure to bdev_fput and not fput, since we don't want to have our
private data cleanup deferred, failing xfstests generic/604.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2024-11-27 18:52:39 -08:00
Auke Kok
621271f8cf backing_dev_info is entirely removed.
The assignments to it are no longer needed at all. All references can be
dropped since v6.4-rc4-163-g0d625446d0a4.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2024-11-08 13:32:21 -05:00
Auke Kok
d1092cdbe9 current_time() is no longer extern.
Since v6.5-rc1-7-g9b6304c1d537, current_time() is no longer
extern, so we need to update this grep regex to continue to match.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2024-11-08 13:21:03 -05:00
Zach Brown
aed7169fac Merge pull request #194 from versity/zab/v1.22
v1.22 Release
2024-11-03 15:06:40 -08:00
Zach Brown
7f313f2818 v1.22 Release
Finish the release notes for the 1.22 release.

Signed-off-by: Zach Brown <zab@versity.com>
2024-11-01 13:05:52 -07:00
Zach Brown
6b4e666952 Merge pull request #193 from versity/zab/hung_lock_fixes
Zab/hung lock fixes
2024-10-31 16:56:51 -07:00
Zach Brown
4a26059d00 Add lock-shrink-read-race test
Add a quick test that races readers and shrinking to stress lock object
refcount racing between concurrent lock request handling threads in the
lock server.

Signed-off-by: Zach Brown <zab@versity.com>
2024-10-31 15:35:11 -07:00
Zach Brown
19e78c32fc Allow null lock compatibility between nodes
Right now a client requesting a null mode for a lock will cause
invalidations of all existing granted modes of the lock across the
cluster.

This is unnecessarily broad.  The absolute requirement is that a null
request invalidates other existing granted modes on the client.  That's
how the client safely resolves shrinking's desire to free locks while
the locks are in use.  It relies on turning it into the race between use
and remote invalidation.

But that only requires invalidating existing grants from the requesting
client, not all clients.  It is always safe for null grants to coexist
with all grants on other clients.  Consider the existing mechanics
involving null modes.  First, null locks are instantiated on the client
before sending any requests at all.  At any given time newly allocated
null locks are coexisting with all existing locks across the cluster.
Second, the server frees the client entry tracking struct the moment it
sends a null grant to the client.  From that point on the client's null
lock can not have any impact on the rest of the lock holders because the
server has forgotten about it.

So we add this case to the server's test that two client lock modes are
compatible.  We take the opportunity to comment the heck out of this
function instead of making it a dense boolean composition.  The only
functional change is the addition of this case, the existing cases are
refactored but unchanged.

Signed-off-by: Zach Brown <zab@versity.com>
2024-10-31 15:34:59 -07:00
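A sketch of the added compatibility case, with assumed identifiers: a null mode only conflicts with grants held by the same client, never with grants on other clients:

```
/* LOCK_MODE_NULL and the rid arguments are illustrative names */
static bool granted_modes_compatible(int mode_a, u64 rid_a,
				     int mode_b, u64 rid_b)
{
	/* the new case: null coexists with anything on another client */
	if ((mode_a == LOCK_MODE_NULL || mode_b == LOCK_MODE_NULL) &&
	    rid_a != rid_b)
		return true;

	/* ... existing compatibility cases, refactored but unchanged ... */
	return false;
}
```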
Zach Brown
8c1a45c9f5 Use bools instead of weird addition as or in net
When freeing acked responses in the net layer we sweep the send and
resend queues looking for queued responses up to the sequence number
we've had acked.  The code that did this used a weird pattern of
returning ints and adding them which gave me pause.  Clean it up to use
bools and or (not short-circuiting ||) to more obviously communicate
what's going on.

Signed-off-by: Zach Brown <zab@versity.com>
2024-10-30 13:38:12 -07:00
Zach Brown
5a6eb569f3 Add some lock debugging trace fields
Over time some fields have been added to the lock struct which haven't
been added to the lock tracing output.  Add some of the more relevant
lock fields to tracing.

Signed-off-by: Zach Brown <zab@versity.com>
2024-10-30 13:16:04 -07:00
Zach Brown
69d9040e68 Close lock server use-after-free race
Lock object lifetimes in the lock server are protected by reference
counts.  References are acquired while holding a lock on an rbtree.

Unfortunately, the decision to free lock objects wasn't tested while
also holding that lock on the rbtree.  A caller putting their object
would test the refcount, then wait to get the rbtree lock to remove it
from the tree.

There's a possible race where the decision is made to remove the object
but another reference is added before the object is removed.  This was
seen in testing and manifested as an incoming request handling path adding
a request message to the object before it is freed, losing the message.
Clients would then hang on a lock that never saw a response because
their request was freed with the lock object.

The fix is to hold the rbtree lock when testing the refcount and
deciding to free.  It adds a bit more contention but not significantly
so, given the wild existing contention on a per-fs spinlocked rbtree.

Signed-off-by: Zach Brown <zab@versity.com>
2024-10-30 13:04:13 -07:00
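A sketch of the fix, with assumed types: the refcount test and the rbtree unlink happen atomically under the tree lock, so a concurrent request handler can't take a new reference after the decision to free:

```
#include <linux/atomic.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>

struct server_lock {
	struct rb_node node;
	atomic_t refcount;
};

/* returns true if the caller should free the object (outside the lock) */
static bool put_server_lock(struct server_lock *lck, spinlock_t *tree_lock,
			    struct rb_root *root)
{
	bool freed = false;

	spin_lock(tree_lock);
	if (atomic_dec_and_test(&lck->refcount)) {
		rb_erase(&lck->node, root);
		freed = true;
	}
	spin_unlock(tree_lock);

	return freed;
}
```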
Zach Brown
d94ec29ffa Merge pull request #192 from versity/greg/with-debug-kmod
Generate debug packages
2024-10-24 15:35:03 -07:00
Greg Cymbalski
70c36ae394 Generate debug packages
We had previously explicitly disabled this; let's start generating them.
2024-10-24 14:56:09 -07:00
98 changed files with 2859 additions and 8939 deletions

View File

@@ -1,6 +1,65 @@
Versity ScoutFS Release Notes
=============================
---
v1.25
\
*Jun 3, 2025*
Fix a bug that could cause indefinite retries of failed client commits.
Under specific error conditions the client and server's understanding of
the current client commit could get out of sync. The client would retry
commits indefinitely that could never succeed. This manifested as
infinite "critical transaction commit failure" messages in the kernel
log on the client and matching "error <nr> committing client logs" on
the server.
Fix a bug in a specific case of server error handling that could result
in sending references to unwritten blocks to the client. The client
would try to read blocks that hadn't been written and return spurious
errors. This was seen under low free space conditions on the server and
resulted in error messages with error code 116 (the errno value for
ESTALE, the client's indication that it couldn't read the blocks that it
expected).
---
v1.24
\
*Mar 14, 2025*
Add support for coherent read and write mmap() mappings of regular file
data between mounts.
Fix a bug that was causing scoutfs utilities to parse and change some
file names before passing them on to the kernel for processing. This
fixes spurious scoutfs command errors for files with the offending
patterns in their names.
Fix a bug where rename wasn't updating the ctime of the inode at the
destination name if it existed.
---
v1.23
\
*Dec 11, 2024*
Add support for kernels in the RHEL 9.5 minor release.
---
v1.22
\
*Nov 1, 2024*
Add support for building against the RHEL9 family of kernels.
Fix failure of the setattr\_more ioctl() to set the attributes of a
zero-length file when restoring.
Fix support for POSIX ACLs in the RHEL8 and later family of kernels.
Fix a race condition in the lock server that could drop lock requests
under heavy load and cause cluster lock attempts to hang.
---
v1.21
\

View File

@@ -5,13 +5,6 @@ ifeq ($(SK_KSRC),)
SK_KSRC := $(shell echo /lib/modules/`uname -r`/build)
endif
# fail if sparse fails if we find it
ifeq ($(shell sparse && echo found),found)
SP =
else
SP = @:
endif
SCOUTFS_GIT_DESCRIBE ?= \
$(shell git describe --all --abbrev=6 --long 2>/dev/null || \
echo no-git)
@@ -36,9 +29,7 @@ TARFILE = scoutfs-kmod-$(RPM_VERSION).tar
all: module
module:
$(MAKE) $(SCOUTFS_ARGS)
$(SP) $(MAKE) C=2 CF="-D__CHECK_ENDIAN__" $(SCOUTFS_ARGS)
$(MAKE) CHECK=$(CURDIR)/src/sparse-filtered.sh C=1 CF="-D__CHECK_ENDIAN__" $(SCOUTFS_ARGS)
modules_install:
$(MAKE) $(SCOUTFS_ARGS) modules_install

View File

@@ -4,9 +4,6 @@
%define kmod_git_describe @@GITDESCRIBE@@
%define pkg_date %(date +%%Y%%m%%d)
# Disable the building of the debug package(s).
%define debug_package %{nil}
# take kernel version or default to uname -r
%{!?kversion: %global kversion %(uname -r)}
%global kernel_version %{kversion}
@@ -56,6 +53,18 @@ Source: %{kmod_name}-kmod-%{kmod_version}.tar
%global flavors_to_build x86_64
%endif
# el9 sanity: make sure we lock to the minor release we built for and block upgrades
%{lua:
if string.match(rpm.expand("%{dist}"), "%.el9") then
rpm.define("el9 1")
end
}
%if 0%{?el9}
%define release_major_minor 9.%{lua: print(rpm.expand("%{dist}"):match("%.el9_(%d)"))}
Requires: system-release = %{release_major_minor}
%endif
%description
%{kmod_name} - kernel module

View File

@@ -6,26 +6,6 @@
ccflags-y += -include $(src)/kernelcompat.h
#
# v3.10-rc6-21-gbb6f619b3a49
#
# _readdir changes from fop->readdir() to fop->iterate() and from
# filldir(dirent) to dir_emit(ctx).
#
ifneq (,$(shell grep 'iterate.*dir_context' include/linux/fs.h))
ccflags-y += -DKC_ITERATE_DIR_CONTEXT
endif
#
# v3.10-rc6-23-g5f99f4e79abc
#
# Helpers including dir_emit_dots() are added in the process of
# switching dcache_readdir() from fop->readdir() to fop->iterate()
#
ifneq (,$(shell grep 'dir_emit_dots' include/linux/fs.h))
ccflags-y += -DKC_DIR_EMIT_DOTS
endif
#
# v3.18-rc2-19-gb5ae6b15bd73
#
@@ -178,21 +158,12 @@ ifneq (,$(shell grep 'sock_create_kern.*struct net' include/linux/net.h))
ccflags-y += -DKC_SOCK_CREATE_KERN_NET=1
endif
#
# v3.18-rc6-1619-gc0371da6047a
#
# iov_iter is now part of struct msghdr
#
ifneq (,$(shell grep 'struct iov_iter.*msg_iter' include/linux/socket.h))
ccflags-y += -DKC_MSGHDR_STRUCT_IOV_ITER=1
endif
#
# v4.17-rc6-7-g95582b008388
#
# Kernel has current_time(inode) to uniformly retrieve timespec in the right unit
#
ifneq (,$(shell grep 'extern struct timespec64 current_time' include/linux/fs.h))
ifneq (,$(shell grep 'struct timespec64 current_time' include/linux/fs.h))
ccflags-y += -DKC_CURRENT_TIME_INODE=1
endif
@@ -413,3 +384,81 @@ endif
ifneq (,$(shell grep 'blkdev_put.struct block_device .bdev, void .holder' include/linux/blkdev.h))
ccflags-y += -DKC_BLKDEV_PUT_HOLDER_ARG
endif
#
# v6.4-rc4-163-g0d625446d0a4
#
# Entirely removes current->backing_dev_info to ultimately remove buffer_head
# completely at some point.
ifneq (,$(shell grep 'struct backing_dev_info.*backing_dev_info;' include/linux/sched.h))
ccflags-y += -DKC_CURRENT_BACKING_DEV_INFO
endif
#
# v6.8-rc1-4-gf3a608827d1f
#
# adds bdev_file_open_by_path() and later in v6.8-rc1-30-ge97d06a46526 removes bdev_open_by_path()
# which requires us to use the file method from now on.
ifneq (,$(shell grep 'struct file.*bdev_file_open_by_path.const char.*path' include/linux/blkdev.h))
ccflags-y += -DKC_BDEV_FILE_OPEN_BY_PATH
endif
# v4.0-rc7-1796-gfe0f07d08ee3
#
# direct-io changes modify inode_dio_done to now be called inode_dio_end
ifneq (,$(shell grep 'void inode_dio_end.struct inode' include/linux/fs.h))
ccflags-y += -DKC_INODE_DIO_END
endif
#
# v5.0-6476-g3d3539018d2c
#
# page fault handlers return a bitmask vm_fault_t instead
# Note: el8's header has a slightly modified prefix here
ifneq (,$(shell grep 'typedef.*__bitwise unsigned.*int vm_fault_t' include/linux/mm_types.h))
ccflags-y += -DKC_MM_VM_FAULT_T
endif
# v3.19-499-gd83a08db5ba6
#
# .remap pages becomes obsolete
ifneq (,$(shell grep 'int ..remap_pages..struct vm_area_struct' include/linux/mm.h))
ccflags-y += -DKC_MM_REMAP_PAGES
endif
#
# v3.19-4742-g503c358cf192
#
# list_lru_shrink_count() and list_lru_shrink_walk() introduced
#
ifneq (,$(shell grep 'list_lru_shrink_count.*struct list_lru' include/linux/list_lru.h))
ccflags-y += -DKC_LIST_LRU_SHRINK_COUNT_WALK
endif
#
# v3.19-4757-g3f97b163207c
#
# lru_list_walk_cb lru arg added
#
ifneq (,$(shell grep 'struct list_head \*item, spinlock_t \*lock, void \*cb_arg' include/linux/list_lru.h))
ccflags-y += -DKC_LIST_LRU_WALK_CB_ITEM_LOCK
endif
#
# v6.7-rc4-153-g0a97c01cd20b
#
# list_lru_{add,del} -> list_lru_{add,del}_obj
#
ifneq (,$(shell grep '^bool list_lru_add_obj' include/linux/list_lru.h))
ccflags-y += -DKC_LIST_LRU_ADD_OBJ
endif
#
# v6.12-rc6-227-gda0c02516c50
#
# lru_list_walk_cb lock arg removed
#
ifneq (,$(shell grep 'struct list_lru_one \*list, spinlock_t \*lock, void \*cb_arg' include/linux/list_lru.h))
ccflags-y += -DKC_LIST_LRU_WALK_CB_LIST_LOCK
endif

View File

@@ -86,18 +86,47 @@ static u64 smallest_order_length(u64 len)
}
/*
* An extent modification dirties three distinct leaves of an allocator
* btree as it adds and removes the blkno and size sorted items for the
* old and new lengths of the extent. Dirtying the paths to these
* leaves can grow the tree and grow/shrink neighbours at each level.
* We over-estimate the number of blocks allocated and freed (the paths
* share a root, growth doesn't free) to err on the simpler and safer
* side. The overhead is minimal given the relatively large list blocks
* and relatively short allocator trees.
* Moving an extent between trees can dirty blocks in several ways. This
* function calculates the worst-case number of blocks across these scenarios.
* We treat the alloc and free counts independently, so the values below are
* max(allocated, freed), not the sum.
*
* We track extents with two separate btree items: by block number and by size.
*
* If we're removing an extent from the btree (allocating), we can dirty
* two blocks if the keys are in different leaves. If we wind up merging
* leaves because we fall below the low water mark, we can wind up freeing
* three leaves.
*
* That sequence is as follows, assuming the original keys are removed from
* blocks A and B:
*
* Allocate new dirty A' and B'
* Free old stable A and B
* B' has fallen below the low water mark, so copy B' into A'
* Free B'
*
* An extent insertion (freeing an extent) can dirty up to five distinct items
* in the btree as it adds and removes the blkno and size sorted items for the
* old and new lengths of the extent:
*
* In the by-blkno portion of the btree, we can dirty (allocate for COW) up
* to two blocks - either by merging adjacent extents, which can cause us to
* join leaf blocks; or by an insertion that causes a split.
*
* In the by-size portion, we never merge extents, so normally we just dirty
* a single item with a size insertion. But if we merged adjacent extents in
* the by-blkno portion of the tree, we might be working with three by-size
* items: removing the two old ones that were combined in the merge; and
* adding the new one for the larger, merged size.
*
* Finally, dirtying the paths to these leaves can grow the tree and grow/shrink
* neighbours at each level, so we multiply by the height of the tree after
* accounting for a possible new level.
*/
static u32 extent_mod_blocks(u32 height)
{
return ((1 + height) * 2) * 3;
return ((1 + height) * 3) * 5;
}
/*

View File

@@ -22,6 +22,7 @@
#include <linux/rhashtable.h>
#include <linux/random.h>
#include <linux/sched/mm.h>
#include <linux/list_lru.h>
#include "format.h"
#include "super.h"
@@ -38,26 +39,12 @@
* than the page size. Callers can have their own contexts for tracking
* dirty blocks that are written together. We pin dirty blocks in
* memory and only checksum them all as they're all written.
*
* Memory reclaim is driven by maintaining two very coarse groups of
* blocks. As we access blocks we mark them with an increasing counter
* to discourage them from being reclaimed. We then define a threshold
* at the current counter minus half the population. Recent blocks have
* a counter greater than the threshold, and all other blocks with
* counters less than it are considered older and are candidates for
* reclaim. This results in access updates rarely modifying an atomic
* counter as blocks need to be moved into the recent group, and shrink
* can randomly scan blocks looking for the half of the population that
* will be in the old group. It's reasonably effective, but is
* particularly efficient and avoids contention between concurrent
* accesses and shrinking.
*/
struct block_info {
struct super_block *sb;
atomic_t total_inserted;
atomic64_t access_counter;
struct rhashtable ht;
struct list_lru lru;
wait_queue_head_t waitq;
KC_DEFINE_SHRINKER(shrinker);
struct work_struct free_work;
@@ -76,28 +63,15 @@ enum block_status_bits {
BLOCK_BIT_PAGE_ALLOC, /* page (possibly high order) allocation */
BLOCK_BIT_VIRT, /* mapped virt allocation */
BLOCK_BIT_CRC_VALID, /* crc has been verified */
BLOCK_BIT_ACCESSED, /* seen by lookup since last lru add/walk */
};
/*
* We want to tie atomic changes in refcounts to whether or not the
* block is still visible in the hash table, so we store the hash
* table's reference up at a known high bit. We could naturally set the
* inserted bit through excessive refcount increments. We don't do
* anything about that but at least warn if we get close.
*
* We're avoiding the high byte for no real good reason, just out of a
* historical fear of implementations that don't provide the full
* precision.
*/
#define BLOCK_REF_INSERTED (1U << 23)
#define BLOCK_REF_FULL (BLOCK_REF_INSERTED >> 1)
struct block_private {
struct scoutfs_block bl;
struct super_block *sb;
atomic_t refcount;
u64 accessed;
struct rhash_head ht_head;
struct list_head lru_head;
struct list_head dirty_entry;
struct llist_node free_node;
unsigned long bits;
@@ -112,7 +86,7 @@ struct block_private {
do { \
__typeof__(bp) _bp = (bp); \
trace_scoutfs_block_##which(_bp->sb, _bp, _bp->bl.blkno, atomic_read(&_bp->refcount), \
atomic_read(&_bp->io_count), _bp->bits, _bp->accessed); \
atomic_read(&_bp->io_count), _bp->bits); \
} while (0)
#define BLOCK_PRIVATE(_bl) \
@@ -176,6 +150,7 @@ static struct block_private *block_alloc(struct super_block *sb, u64 blkno)
bp->bl.blkno = blkno;
bp->sb = sb;
atomic_set(&bp->refcount, 1);
INIT_LIST_HEAD(&bp->lru_head);
INIT_LIST_HEAD(&bp->dirty_entry);
set_bit(BLOCK_BIT_NEW, &bp->bits);
atomic_set(&bp->io_count, 0);
@@ -233,32 +208,85 @@ static void block_free_work(struct work_struct *work)
}
/*
* Get a reference to a block while holding an existing reference.
* Users of blocks hold a refcount. If putting a refcount drops to zero
* then the block is freed.
*
* Acquiring new references and claiming the exclusive right to tear
* down a block is built around this LIVE_REFCOUNT_BASE refcount value.
* As blocks are initially cached they have the live base added to their
* refcount. Lookups will only increment the refcount and return blocks
* for reference holders while the refcount is >= than the base.
*
* To remove a block from the cache and eventually free it, either by
* the lru walk in the shrinker, or by reference holders, the live base
* is removed and turned into a normal refcount increment that will be
* put by the caller. This can only be done once for a block, and once
* its done lookup will not return any more references.
*/
#define LIVE_REFCOUNT_BASE (INT_MAX ^ (INT_MAX >> 1))
/*
* Inc the refcount while holding an incremented refcount. We can't
* have so many individual reference holders that they pass the live
* base.
*/
static void block_get(struct block_private *bp)
{
WARN_ON_ONCE((atomic_read(&bp->refcount) & ~BLOCK_REF_INSERTED) <= 0);
atomic_inc(&bp->refcount);
int now = atomic_inc_return(&bp->refcount);
BUG_ON(now <= 1);
BUG_ON(now == LIVE_REFCOUNT_BASE);
}
/*
* Get a reference to a block as long as it's been inserted in the hash
* table and hasn't been removed.
*/
static struct block_private *block_get_if_inserted(struct block_private *bp)
{
int cnt;
do {
cnt = atomic_read(&bp->refcount);
WARN_ON_ONCE(cnt & BLOCK_REF_FULL);
if (!(cnt & BLOCK_REF_INSERTED))
return NULL;
} while (atomic_cmpxchg(&bp->refcount, cnt, cnt + 1) != cnt);
return bp;
}
/*
 * if (*v >= u) {
 * *v += a;
 * return true;
 * }
 */
static bool atomic_add_unless_less(atomic_t *v, int a, int u)
{
int c;
do {
c = atomic_read(v);
if (c < u)
return false;
} while (atomic_cmpxchg(v, c, c + a) != c);
return true;
}
static bool block_get_if_live(struct block_private *bp)
{
return atomic_add_unless_less(&bp->refcount, 1, LIVE_REFCOUNT_BASE);
}
/*
* If the refcount still has the live base, subtract it and increment
* the callers refcount that they'll put.
*/
static bool block_get_remove_live(struct block_private *bp)
{
return atomic_add_unless_less(&bp->refcount, (1 - LIVE_REFCOUNT_BASE), LIVE_REFCOUNT_BASE);
}
/*
* Only get the live base refcount if it is the only refcount remaining.
* This means that there are no active refcount holders and the block
* can't be dirty or under IO, which both hold references.
*/
static bool block_get_remove_live_only(struct block_private *bp)
{
int c;
do {
c = atomic_read(&bp->refcount);
if (c != LIVE_REFCOUNT_BASE)
return false;
} while (atomic_cmpxchg(&bp->refcount, c, c - LIVE_REFCOUNT_BASE + 1) != c);
return true;
}
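A minimal userspace sketch of the three helpers above, using C11 atomics in place of the kernel's atomic_t; LIVE_BASE and the function names are illustrative, not from the patch:

#include <stdatomic.h>
#include <stdbool.h>

#define LIVE_BASE 0x40000000

/* only take a new ref while the live base is still present */
static bool get_if_live(atomic_int *ref)
{
	int c = atomic_load(ref);
	do {
		if (c < LIVE_BASE)
			return false;
	/* weak CAS reloads c on failure, so the check above re-runs */
	} while (!atomic_compare_exchange_weak(ref, &c, c + 1));
	return true;
}

/* turn the live base into one normal ref held by the caller */
static bool get_remove_live(atomic_int *ref)
{
	int c = atomic_load(ref);
	do {
		if (c < LIVE_BASE)
			return false;
	} while (!atomic_compare_exchange_weak(ref, &c, c + 1 - LIVE_BASE));
	return true;
}

/* one-shot simplification: succeed only if the base is the sole ref */
static bool get_remove_live_only(atomic_int *ref)
{
	int c = LIVE_BASE;
	return atomic_compare_exchange_strong(ref, &c, 1);
}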
/*
@@ -290,104 +318,73 @@ static const struct rhashtable_params block_ht_params = {
};
/*
* Insert a new block into the hash table. Once it is inserted in the
* hash table readers can start getting references. The caller may have
* multiple refs but the block can't already be inserted.
* Insert the block into the cache so that it's visible for lookups.
* The caller can hold references (including for a dirty block).
*
* We make sure the base is added and the block is in the lru once it's
* in the hash. If hash table insertion fails it'll be briefly visible
* in the lru, but won't be isolated/evicted because we hold an
* incremented refcount in addition to the live base.
*/
static int block_insert(struct super_block *sb, struct block_private *bp)
{
DECLARE_BLOCK_INFO(sb, binf);
int ret;
WARN_ON_ONCE(atomic_read(&bp->refcount) & BLOCK_REF_INSERTED);
BUG_ON(atomic_read(&bp->refcount) >= LIVE_REFCOUNT_BASE);
atomic_add(LIVE_REFCOUNT_BASE, &bp->refcount);
smp_mb__after_atomic(); /* make sure live base is visible to list_lru walk */
list_lru_add_obj(&binf->lru, &bp->lru_head);
retry:
atomic_add(BLOCK_REF_INSERTED, &bp->refcount);
ret = rhashtable_lookup_insert_fast(&binf->ht, &bp->ht_head, block_ht_params);
if (ret < 0) {
atomic_sub(BLOCK_REF_INSERTED, &bp->refcount);
if (ret == -EBUSY) {
/* wait for pending rebalance to finish */
synchronize_rcu();
goto retry;
} else {
atomic_sub(LIVE_REFCOUNT_BASE, &bp->refcount);
BUG_ON(atomic_read(&bp->refcount) >= LIVE_REFCOUNT_BASE);
list_lru_del_obj(&binf->lru, &bp->lru_head);
}
} else {
atomic_inc(&binf->total_inserted);
TRACE_BLOCK(insert, bp);
}
return ret;
}
static u64 accessed_recently(struct block_info *binf)
{
return atomic64_read(&binf->access_counter) - (atomic_read(&binf->total_inserted) >> 1);
}
/*
* Make sure that a block that is being accessed is less likely to be
* reclaimed if it is seen by the shrinker. If the block hasn't been
* accessed recently we update its accessed value.
* Indicate to the lru walker that this block has been accessed since it
* was added or last walked.
*/
static void block_accessed(struct super_block *sb, struct block_private *bp)
{
DECLARE_BLOCK_INFO(sb, binf);
if (bp->accessed == 0 || bp->accessed < accessed_recently(binf)) {
if (!test_and_set_bit(BLOCK_BIT_ACCESSED, &bp->bits))
scoutfs_inc_counter(sb, block_cache_access_update);
bp->accessed = atomic64_inc_return(&binf->access_counter);
}
}
/*
* The caller wants to remove the block from the hash table and has an
* idea what the refcount should be. If the refcount does still
* indicate that the block is hashed, and we're able to clear that bit,
* then we can remove it from the hash table.
* Remove the block from the cache. When this returns the block won't
* be visible for additional references from lookup.
*
* The caller makes sure that it's safe to be referencing this block,
* either with their own held reference (most everything) or by being in
* an rcu grace period (shrink).
*/
static bool block_remove_cnt(struct super_block *sb, struct block_private *bp, int cnt)
{
DECLARE_BLOCK_INFO(sb, binf);
int ret;
if ((cnt & BLOCK_REF_INSERTED) &&
(atomic_cmpxchg(&bp->refcount, cnt, cnt & ~BLOCK_REF_INSERTED) == cnt)) {
TRACE_BLOCK(remove, bp);
ret = rhashtable_remove_fast(&binf->ht, &bp->ht_head, block_ht_params);
WARN_ON_ONCE(ret); /* must have been inserted */
atomic_dec(&binf->total_inserted);
return true;
}
return false;
}
/*
* Try to remove the block from the hash table as long as the refcount
* indicates that it is still in the hash table. This can be racing
* with normal refcount changes so it might have to retry.
* We always try to remove from the hash table.  It's safe to remove a
* block that isn't hashed, it just returns -ENOENT.
*
* This is racing with the lru walk in the shrinker also trying to
* remove idle blocks from the cache. They both try to remove the live
* refcount base and perform their removal and put if they get it.
*/
static void block_remove(struct super_block *sb, struct block_private *bp)
{
int cnt;
DECLARE_BLOCK_INFO(sb, binf);
do {
cnt = atomic_read(&bp->refcount);
} while ((cnt & BLOCK_REF_INSERTED) && !block_remove_cnt(sb, bp, cnt));
}
rhashtable_remove_fast(&binf->ht, &bp->ht_head, block_ht_params);
/*
* Take one shot at removing the block from the hash table if it's still
* in the hash table and the caller has the only other reference.
*/
static bool block_remove_solo(struct super_block *sb, struct block_private *bp)
{
return block_remove_cnt(sb, bp, BLOCK_REF_INSERTED | 1);
if (block_get_remove_live(bp)) {
list_lru_del_obj(&binf->lru, &bp->lru_head);
block_put(sb, bp);
}
}
static bool io_busy(struct block_private *bp)
@@ -396,37 +393,6 @@ static bool io_busy(struct block_private *bp)
return test_bit(BLOCK_BIT_IO_BUSY, &bp->bits);
}
/*
* Called during shutdown with no other users.
*/
static void block_remove_all(struct super_block *sb)
{
DECLARE_BLOCK_INFO(sb, binf);
struct rhashtable_iter iter;
struct block_private *bp;
rhashtable_walk_enter(&binf->ht, &iter);
rhashtable_walk_start(&iter);
for (;;) {
bp = rhashtable_walk_next(&iter);
if (bp == NULL)
break;
if (bp == ERR_PTR(-EAGAIN))
continue;
if (block_get_if_inserted(bp)) {
block_remove(sb, bp);
WARN_ON_ONCE(atomic_read(&bp->refcount) != 1);
block_put(sb, bp);
}
}
rhashtable_walk_stop(&iter);
rhashtable_walk_exit(&iter);
WARN_ON_ONCE(atomic_read(&binf->total_inserted) != 0);
}
/*
* XXX The io_count and sb fields in the block_private are only used
@@ -488,7 +454,7 @@ static int block_submit_bio(struct super_block *sb, struct block_private *bp,
int ret = 0;
if (scoutfs_forcing_unmount(sb))
return -EIO;
return -ENOLINK;
sector = bp->bl.blkno << (SCOUTFS_BLOCK_LG_SHIFT - 9);
@@ -543,6 +509,10 @@ static int block_submit_bio(struct super_block *sb, struct block_private *bp,
return ret;
}
/*
* Return a block with an elevated refcount if it was present in the
* hash table and its refcount didn't indicate that it was being freed.
*/
static struct block_private *block_lookup(struct super_block *sb, u64 blkno)
{
DECLARE_BLOCK_INFO(sb, binf);
@@ -550,8 +520,8 @@ static struct block_private *block_lookup(struct super_block *sb, u64 blkno)
rcu_read_lock();
bp = rhashtable_lookup(&binf->ht, &blkno, block_ht_params);
if (bp)
bp = block_get_if_inserted(bp);
if (bp && !block_get_if_live(bp))
bp = NULL;
rcu_read_unlock();
return bp;
@@ -712,8 +682,8 @@ retry:
ret = 0;
out:
if ((ret == -ESTALE || scoutfs_trigger(sb, BLOCK_REMOVE_STALE)) &&
!retried && !block_is_dirty(bp)) {
if (!retried && !IS_ERR_OR_NULL(bp) && !block_is_dirty(bp) &&
(ret == -ESTALE || scoutfs_trigger(sb, BLOCK_REMOVE_STALE))) {
retried = true;
scoutfs_inc_counter(sb, block_cache_remove_stale);
block_remove(sb, bp);
@@ -1078,100 +1048,85 @@ static unsigned long block_count_objects(struct shrinker *shrink, struct shrink_
struct super_block *sb = binf->sb;
scoutfs_inc_counter(sb, block_cache_count_objects);
return shrinker_min_long(atomic_read(&binf->total_inserted));
return list_lru_shrink_count(&binf->lru, sc);
}
struct isolate_args {
struct super_block *sb;
struct list_head dispose;
};
#define DECLARE_ISOLATE_ARGS(sb_, name_) \
struct isolate_args name_ = { \
.sb = sb_, \
.dispose = LIST_HEAD_INIT(name_.dispose), \
}
static enum lru_status isolate_lru_block(struct list_head *item, struct list_lru_one *list,
void *cb_arg)
{
struct block_private *bp = container_of(item, struct block_private, lru_head);
struct isolate_args *ia = cb_arg;
TRACE_BLOCK(isolate, bp);
/* rotate accessed blocks to the tail of the list (lazy promotion) */
if (test_and_clear_bit(BLOCK_BIT_ACCESSED, &bp->bits)) {
scoutfs_inc_counter(ia->sb, block_cache_isolate_rotate);
return LRU_ROTATE;
}
/* any refs, including dirty/io, stop us from acquiring lru refcount */
if (!block_get_remove_live_only(bp)) {
scoutfs_inc_counter(ia->sb, block_cache_isolate_skip);
return LRU_SKIP;
}
scoutfs_inc_counter(ia->sb, block_cache_isolate_removed);
list_lru_isolate_move(list, &bp->lru_head, &ia->dispose);
return LRU_REMOVED;
}
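The accessed bit makes this a second-chance (CLOCK-style) scan: accessors just set a bit without touching the list, and the walker rotates marked blocks once before considering them for eviction. A small sketch of that split, with hypothetical names:

#include <stdatomic.h>
#include <stdbool.h>

struct node {
	atomic_bool accessed;
	/* ... lru linkage, payload ... */
};

/* hot path: accessing a cached block is a single store, no list ops */
static void mark_accessed(struct node *n)
{
	atomic_store(&n->accessed, true);
}

/* walker: consume the bit; true means rotate instead of evict */
static bool should_rotate(struct node *n)
{
	return atomic_exchange(&n->accessed, false);
}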
static void shrink_dispose_blocks(struct super_block *sb, struct list_head *dispose)
{
struct block_private *bp;
struct block_private *bp__;
list_for_each_entry_safe(bp, bp__, dispose, lru_head) {
list_del_init(&bp->lru_head);
block_remove(sb, bp);
block_put(sb, bp);
}
}
/*
* Remove a number of cached blocks that haven't been used recently.
*
* We don't maintain a strictly ordered LRU to avoid the contention of
* accesses always moving blocks around in some precise global
* structure.
*
* Instead we use counters to divide the blocks into two roughly equal
* groups by how recently they were accessed. We randomly walk all
* inserted blocks looking for any blocks in the older half to remove
* and free. The random walk and there being two groups means that we
* typically only walk a small multiple of the number we're looking for
* before we find them all.
*
* Our rcu walk of blocks can see blocks in all stages of their life
* cycle, from dirty blocks to those with 0 references that are queued
* for freeing. We only want to free idle inserted blocks so we
* atomically remove blocks when the only references are ours and the
* hash table.
*/
static unsigned long block_scan_objects(struct shrinker *shrink, struct shrink_control *sc)
{
struct block_info *binf = KC_SHRINKER_CONTAINER_OF(shrink, struct block_info);
struct super_block *sb = binf->sb;
struct rhashtable_iter iter;
struct block_private *bp;
bool stop = false;
unsigned long freed = 0;
unsigned long nr = sc->nr_to_scan;
u64 recently;
DECLARE_ISOLATE_ARGS(sb, ia);
unsigned long freed;
scoutfs_inc_counter(sb, block_cache_scan_objects);
recently = accessed_recently(binf);
rhashtable_walk_enter(&binf->ht, &iter);
rhashtable_walk_start(&iter);
freed = kc_list_lru_shrink_walk(&binf->lru, sc, isolate_lru_block, &ia);
shrink_dispose_blocks(sb, &ia.dispose);
return freed;
}
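This follows the standard split of the shrinker contract: count_objects gives a cheap estimate, scan_objects isolates victims onto a private list under the lru lock and frees them after dropping it. A schematic of the shape, where struct cache, example_isolate and dispose_list stand in for the real scoutfs pieces:

static unsigned long example_count(struct shrinker *shrink, struct shrink_control *sc)
{
	struct cache *c = container_of(shrink, struct cache, shrinker);

	/* cheap and lock-free; precision doesn't matter here */
	return list_lru_shrink_count(&c->lru, sc);
}

static unsigned long example_scan(struct shrinker *shrink, struct shrink_control *sc)
{
	struct cache *c = container_of(shrink, struct cache, shrinker);
	LIST_HEAD(dispose);
	unsigned long freed;

	/* example_isolate moves up to sc->nr_to_scan items to dispose */
	freed = list_lru_shrink_walk(&c->lru, sc, example_isolate, &dispose);

	/* the expensive teardown happens without the lru lock held */
	dispose_list(&dispose);
	return freed;
}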
/*
* This isn't great but I don't see a better way. We want to
* walk the hash from a random point so that we're not
* constantly walking over the same region that we've already
* freed old blocks within. The interface doesn't let us do
* this explicitly, but this seems to work? The difference this
* makes is enormous, around a few orders of magnitude fewer
* _nexts per shrink.
*/
if (iter.walker.tbl)
iter.slot = prandom_u32_max(iter.walker.tbl->size);
/*
* Called during shutdown with no other users. The isolating walk must
* find blocks on the lru that only have references for presence on the
* lru and in the hash table.
*/
static void block_shrink_all(struct super_block *sb)
{
DECLARE_BLOCK_INFO(sb, binf);
DECLARE_ISOLATE_ARGS(sb, ia);
while (nr > 0) {
bp = rhashtable_walk_next(&iter);
if (bp == NULL)
break;
if (bp == ERR_PTR(-EAGAIN)) {
/*
* We can be called from reclaim in the allocation
* to resize the hash table itself. We have to
* return so that the caller can proceed and
* enable hash table iteration again.
*/
scoutfs_inc_counter(sb, block_cache_shrink_stop);
stop = true;
break;
}
scoutfs_inc_counter(sb, block_cache_shrink_next);
if (bp->accessed >= recently) {
scoutfs_inc_counter(sb, block_cache_shrink_recent);
continue;
}
if (block_get_if_inserted(bp)) {
if (block_remove_solo(sb, bp)) {
scoutfs_inc_counter(sb, block_cache_shrink_remove);
TRACE_BLOCK(shrink, bp);
freed++;
nr--;
}
block_put(sb, bp);
}
}
rhashtable_walk_stop(&iter);
rhashtable_walk_exit(&iter);
if (stop)
return SHRINK_STOP;
else
return freed;
do {
kc_list_lru_walk(&binf->lru, isolate_lru_block, &ia, 128);
shrink_dispose_blocks(sb, &ia.dispose);
} while (list_lru_count(&binf->lru) > 0);
}
struct sm_block_completion {
@@ -1210,7 +1165,7 @@ static int sm_block_io(struct super_block *sb, struct block_device *bdev, blk_op
BUILD_BUG_ON(PAGE_SIZE < SCOUTFS_BLOCK_SM_SIZE);
if (scoutfs_forcing_unmount(sb))
return -EIO;
return -ENOLINK;
if (WARN_ON_ONCE(len > SCOUTFS_BLOCK_SM_SIZE) ||
WARN_ON_ONCE(!op_is_write(opf) && !blk_crc))
@@ -1276,7 +1231,7 @@ int scoutfs_block_write_sm(struct super_block *sb,
int scoutfs_block_setup(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct block_info *binf;
struct block_info *binf = NULL;
int ret;
binf = kzalloc(sizeof(struct block_info), GFP_KERNEL);
@@ -1285,15 +1240,15 @@ int scoutfs_block_setup(struct super_block *sb)
goto out;
}
ret = rhashtable_init(&binf->ht, &block_ht_params);
if (ret < 0) {
kfree(binf);
ret = list_lru_init(&binf->lru);
if (ret < 0)
goto out;
ret = rhashtable_init(&binf->ht, &block_ht_params);
if (ret < 0)
goto out;
}
binf->sb = sb;
atomic_set(&binf->total_inserted, 0);
atomic64_set(&binf->access_counter, 0);
init_waitqueue_head(&binf->waitq);
KC_INIT_SHRINKER_FUNCS(&binf->shrinker, block_count_objects,
block_scan_objects);
@@ -1305,8 +1260,10 @@ int scoutfs_block_setup(struct super_block *sb)
ret = 0;
out:
if (ret)
scoutfs_block_destroy(sb);
if (ret < 0 && binf) {
list_lru_destroy(&binf->lru);
kfree(binf);
}
return ret;
}
@@ -1318,9 +1275,10 @@ void scoutfs_block_destroy(struct super_block *sb)
if (binf) {
KC_UNREGISTER_SHRINKER(&binf->shrinker);
block_remove_all(sb);
block_shrink_all(sb);
flush_work(&binf->free_work);
rhashtable_destroy(&binf->ht);
list_lru_destroy(&binf->lru);
kfree(binf);
sbi->block_info = NULL;

View File

@@ -26,17 +26,15 @@
EXPAND_COUNTER(block_cache_alloc_page_order) \
EXPAND_COUNTER(block_cache_alloc_virt) \
EXPAND_COUNTER(block_cache_end_io_error) \
EXPAND_COUNTER(block_cache_isolate_removed) \
EXPAND_COUNTER(block_cache_isolate_rotate) \
EXPAND_COUNTER(block_cache_isolate_skip) \
EXPAND_COUNTER(block_cache_forget) \
EXPAND_COUNTER(block_cache_free) \
EXPAND_COUNTER(block_cache_free_work) \
EXPAND_COUNTER(block_cache_remove_stale) \
EXPAND_COUNTER(block_cache_count_objects) \
EXPAND_COUNTER(block_cache_scan_objects) \
EXPAND_COUNTER(block_cache_shrink) \
EXPAND_COUNTER(block_cache_shrink_next) \
EXPAND_COUNTER(block_cache_shrink_recent) \
EXPAND_COUNTER(block_cache_shrink_remove) \
EXPAND_COUNTER(block_cache_shrink_stop) \
EXPAND_COUNTER(btree_compact_values) \
EXPAND_COUNTER(btree_compact_values_enomem) \
EXPAND_COUNTER(btree_delete) \
@@ -90,6 +88,7 @@
EXPAND_COUNTER(forest_read_items) \
EXPAND_COUNTER(forest_roots_next_hint) \
EXPAND_COUNTER(forest_set_bloom_bits) \
EXPAND_COUNTER(inode_deleted) \
EXPAND_COUNTER(item_cache_count_objects) \
EXPAND_COUNTER(item_cache_scan_objects) \
EXPAND_COUNTER(item_clear_dirty) \
@@ -117,10 +116,11 @@
EXPAND_COUNTER(item_pcpu_page_hit) \
EXPAND_COUNTER(item_pcpu_page_miss) \
EXPAND_COUNTER(item_pcpu_page_miss_keys) \
EXPAND_COUNTER(item_read_pages_barrier) \
EXPAND_COUNTER(item_read_pages_retry) \
EXPAND_COUNTER(item_read_pages_split) \
EXPAND_COUNTER(item_shrink_page) \
EXPAND_COUNTER(item_shrink_page_dirty) \
EXPAND_COUNTER(item_shrink_page_reader) \
EXPAND_COUNTER(item_shrink_page_trylock) \
EXPAND_COUNTER(item_update) \
EXPAND_COUNTER(item_write_dirty) \
@@ -145,6 +145,7 @@
EXPAND_COUNTER(lock_shrink_work) \
EXPAND_COUNTER(lock_unlock) \
EXPAND_COUNTER(lock_wait) \
EXPAND_COUNTER(log_merge_no_finalized) \
EXPAND_COUNTER(log_merge_wait_timeout) \
EXPAND_COUNTER(net_dropped_response) \
EXPAND_COUNTER(net_send_bytes) \
@@ -181,6 +182,7 @@
EXPAND_COUNTER(quorum_send_vote) \
EXPAND_COUNTER(quorum_server_shutdown) \
EXPAND_COUNTER(quorum_term_follower) \
EXPAND_COUNTER(reclaimed_open_logs) \
EXPAND_COUNTER(server_commit_hold) \
EXPAND_COUNTER(server_commit_queue) \
EXPAND_COUNTER(server_commit_worker) \

View File

@@ -560,7 +560,7 @@ static int scoutfs_get_block(struct inode *inode, sector_t iblock,
u64 offset;
int ret;
WARN_ON_ONCE(create && !inode_is_locked(inode));
WARN_ON_ONCE(create && !rwsem_is_locked(&si->extent_sem));
/* make sure caller holds a cluster lock */
lock = scoutfs_per_task_get(&si->pt_data_lock);
@@ -1551,13 +1551,17 @@ int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
struct super_block *sb = inode->i_sb;
const u64 ino = scoutfs_ino(inode);
struct scoutfs_lock *lock = NULL;
struct scoutfs_extent *info = NULL;
struct page *page = NULL;
struct scoutfs_extent ext;
struct scoutfs_extent cur;
struct data_ext_args args;
u32 last_flags;
u64 iblock;
u64 last;
int entries = 0;
int ret;
int complete = 0;
if (len == 0) {
ret = 0;
@@ -1568,16 +1572,11 @@ int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
if (ret)
goto out;
inode_lock(inode);
down_read(&si->extent_sem);
ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ, 0, inode, &lock);
if (ret)
goto unlock;
args.ino = ino;
args.inode = inode;
args.lock = lock;
page = alloc_page(GFP_KERNEL);
if (!page) {
ret = -ENOMEM;
goto out;
}
/* use a dummy extent to track */
memset(&cur, 0, sizeof(cur));
@@ -1586,48 +1585,93 @@ int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
iblock = start >> SCOUTFS_BLOCK_SM_SHIFT;
last = (start + len - 1) >> SCOUTFS_BLOCK_SM_SHIFT;
args.ino = ino;
args.inode = inode;
/* outer loop */
while (iblock <= last) {
ret = scoutfs_ext_next(sb, &data_ext_ops, &args,
iblock, 1, &ext);
if (ret < 0) {
if (ret == -ENOENT)
/* lock */
inode_lock(inode);
down_read(&si->extent_sem);
ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ, 0, inode, &lock);
if (ret) {
up_read(&si->extent_sem);
inode_unlock(inode);
break;
}
args.lock = lock;
/* collect entries */
info = page_address(page);
memset(info, 0, PAGE_SIZE);
while (entries < (PAGE_SIZE / sizeof(struct fiemap_extent)) - 1) {
ret = scoutfs_ext_next(sb, &data_ext_ops, &args,
iblock, 1, &ext);
if (ret < 0) {
if (ret == -ENOENT)
ret = 0;
complete = 1;
last_flags = FIEMAP_EXTENT_LAST;
break;
}
trace_scoutfs_data_fiemap_extent(sb, ino, &ext);
if (ext.start > last) {
/* not setting _LAST, it's for end of file */
ret = 0;
last_flags = FIEMAP_EXTENT_LAST;
break;
complete = 1;
break;
}
if (scoutfs_ext_can_merge(&cur, &ext)) {
/* merged extents could be greater than input len */
cur.len += ext.len;
} else {
/* fill it */
memcpy(info, &cur, sizeof(cur));
entries++;
info++;
cur = ext;
}
iblock = ext.start + ext.len;
}
trace_scoutfs_data_fiemap_extent(sb, ino, &ext);
/* unlock */
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ);
up_read(&si->extent_sem);
inode_unlock(inode);
if (ext.start > last) {
/* not setting _LAST, it's for end of file */
ret = 0;
if (ret)
break;
}
if (scoutfs_ext_can_merge(&cur, &ext)) {
/* merged extents could be greater than input len */
cur.len += ext.len;
} else {
ret = fill_extent(fieinfo, &cur, 0);
/* emit entries */
info = page_address(page);
for (; entries > 0; entries--) {
ret = fill_extent(fieinfo, info, 0);
if (ret != 0)
goto unlock;
cur = ext;
goto out;
info++;
}
iblock = ext.start + ext.len;
if (complete)
break;
}
/* still one left, it's in cur */
if (cur.len)
ret = fill_extent(fieinfo, &cur, last_flags);
unlock:
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ);
up_read(&si->extent_sem);
inode_unlock(inode);
out:
if (ret == 1)
ret = 0;
if (page)
__free_page(page);
trace_scoutfs_data_fiemap(sb, start, len, ret);
return ret;
@@ -1914,6 +1958,236 @@ int scoutfs_data_waiting(struct super_block *sb, u64 ino, u64 iblock,
return ret;
}
#ifdef KC_MM_VM_FAULT_T
static vm_fault_t scoutfs_data_page_mkwrite(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
#else
static int scoutfs_data_page_mkwrite(struct vm_area_struct *vma,
struct vm_fault *vmf)
{
#endif
struct page *page = vmf->page;
struct file *file = vma->vm_file;
struct inode *inode = file_inode(file);
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
struct super_block *sb = inode->i_sb;
struct scoutfs_lock *lock = NULL;
SCOUTFS_DECLARE_PER_TASK_ENTRY(pt_ent);
DECLARE_DATA_WAIT(dw);
struct write_begin_data wbd;
u64 ind_seq;
loff_t pos;
loff_t size;
unsigned int len = PAGE_SIZE;
vm_fault_t ret = VM_FAULT_SIGBUS;
int err;
pos = vmf->pgoff << PAGE_SHIFT;
sb_start_pagefault(sb);
err = scoutfs_lock_inode(sb, SCOUTFS_LOCK_WRITE,
SCOUTFS_LKF_REFRESH_INODE, inode, &lock);
if (err) {
ret = vmf_error(err);
goto out;
}
size = i_size_read(inode);
if (scoutfs_per_task_add_excl(&si->pt_data_lock, &pt_ent, lock)) {
/* data_version is per inode, whole file must be online */
err = scoutfs_data_wait_check(inode, 0, size,
SEF_OFFLINE,
SCOUTFS_IOC_DWO_WRITE,
&dw, lock);
if (err != 0) {
if (err < 0)
ret = vmf_error(err);
goto out_unlock;
}
}
/* scoutfs_write_begin */
memset(&wbd, 0, sizeof(wbd));
INIT_LIST_HEAD(&wbd.ind_locks);
wbd.lock = lock;
/*
* Start transaction before taking page locks - we want to make sure we're
* not locking a page, then waiting for trans, because writeback might race
* against it and cause a lock inversion hang - as demonstrated by both
* holetest and fsstress tests in xfstests.
*/
do {
err = scoutfs_inode_index_start(sb, &ind_seq) ?:
scoutfs_inode_index_prepare(sb, &wbd.ind_locks, inode,
true) ?:
scoutfs_inode_index_try_lock_hold(sb, &wbd.ind_locks,
ind_seq, false);
} while (err > 0);
if (err < 0) {
ret = vmf_error(err);
goto out_trans;
}
down_write(&si->extent_sem);
if (!trylock_page(page)) {
ret = VM_FAULT_NOPAGE;
goto out_sem;
}
ret = VM_FAULT_LOCKED;
if ((page->mapping != inode->i_mapping) ||
(!PageUptodate(page)) ||
(page_offset(page) > size)) {
unlock_page(page);
ret = VM_FAULT_NOPAGE;
goto out_sem;
}
if (page->index == (size - 1) >> PAGE_SHIFT)
len = ((size - 1) & ~PAGE_MASK) + 1;
err = __block_write_begin(page, pos, PAGE_SIZE, scoutfs_get_block);
if (err) {
ret = vmf_error(err);
unlock_page(page);
goto out_sem;
}
/* end scoutfs_write_begin */
/*
* We mark the page dirty already here so that when freeze is in
* progress, we are guaranteed that writeback during freezing will
* see the dirty page and writeprotect it again.
*/
set_page_dirty(page);
wait_for_stable_page(page);
/* scoutfs_write_end */
scoutfs_inode_set_data_seq(inode);
scoutfs_inode_inc_data_version(inode);
file_update_time(vma->vm_file);
scoutfs_update_inode_item(inode, wbd.lock, &wbd.ind_locks);
scoutfs_inode_queue_writeback(inode);
out_sem:
up_write(&si->extent_sem);
out_trans:
scoutfs_release_trans(sb);
scoutfs_inode_index_unlock(sb, &wbd.ind_locks);
/* end scoutfs_write_end */
out_unlock:
scoutfs_per_task_del(&si->pt_data_lock, &pt_ent);
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
out:
sb_end_pagefault(sb);
if (scoutfs_data_wait_found(&dw)) {
/*
* It'd be really nice to not hold the mmap_sem lock here
* before waiting for data, and then return VM_FAULT_RETRY
*/
err = scoutfs_data_wait(inode, &dw);
if (err == 0)
ret = VM_FAULT_NOPAGE;
else
ret = vmf_error(err);
}
trace_scoutfs_data_page_mkwrite(sb, scoutfs_ino(inode), pos, (__force u32)ret);
return ret;
}
#ifdef KC_MM_VM_FAULT_T
static vm_fault_t scoutfs_data_filemap_fault(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
#else
static int scoutfs_data_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
#endif
struct file *file = vma->vm_file;
struct inode *inode = file_inode(file);
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
struct super_block *sb = inode->i_sb;
struct scoutfs_lock *inode_lock = NULL;
SCOUTFS_DECLARE_PER_TASK_ENTRY(pt_ent);
DECLARE_DATA_WAIT(dw);
loff_t pos;
int err;
vm_fault_t ret = VM_FAULT_SIGBUS;
pos = vmf->pgoff;
pos <<= PAGE_SHIFT;
retry:
err = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ,
SCOUTFS_LKF_REFRESH_INODE, inode, &inode_lock);
if (err < 0)
return vmf_error(err);
if (scoutfs_per_task_add_excl(&si->pt_data_lock, &pt_ent, inode_lock)) {
/* protect checked extents from stage/release */
atomic_inc(&inode->i_dio_count);
err = scoutfs_data_wait_check(inode, pos, PAGE_SIZE,
SEF_OFFLINE, SCOUTFS_IOC_DWO_READ,
&dw, inode_lock);
if (err != 0) {
if (err < 0)
ret = vmf_error(err);
goto out;
}
}
#ifdef KC_MM_VM_FAULT_T
ret = filemap_fault(vmf);
#else
ret = filemap_fault(vma, vmf);
#endif
out:
if (scoutfs_per_task_del(&si->pt_data_lock, &pt_ent))
kc_inode_dio_end(inode);
scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_READ);
if (scoutfs_data_wait_found(&dw)) {
err = scoutfs_data_wait(inode, &dw);
if (err == 0)
goto retry;
ret = VM_FAULT_RETRY;
}
trace_scoutfs_data_filemap_fault(sb, scoutfs_ino(inode), pos, (__force u32)ret);
return ret;
}
static const struct vm_operations_struct scoutfs_data_file_vm_ops = {
.fault = scoutfs_data_filemap_fault,
.page_mkwrite = scoutfs_data_page_mkwrite,
#ifdef KC_MM_REMAP_PAGES
.remap_pages = generic_file_remap_pages,
#endif
};
static int scoutfs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
file_accessed(file);
vma->vm_ops = &scoutfs_data_file_vm_ops;
return 0;
}
const struct address_space_operations scoutfs_file_aops = {
#ifdef KC_MPAGE_READ_FOLIO
.dirty_folio = block_dirty_folio,
@@ -1945,6 +2219,7 @@ const struct file_operations scoutfs_file_fops = {
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
#endif
.mmap = scoutfs_file_mmap,
.unlocked_ioctl = scoutfs_ioctl,
.fsync = scoutfs_file_fsync,
.llseek = scoutfs_file_llseek,

View File

@@ -11,11 +11,13 @@
* General Public License for more details.
*/
#include <linux/kernel.h>
#include <linux/stddef.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/uio.h>
#include <linux/xattr.h>
#include <linux/namei.h>
#include <linux/mm.h>
#include "format.h"
#include "file.h"
@@ -434,6 +436,15 @@ out:
return d_splice_alias(inode, dentry);
}
/*
* Helper for iterating over dirent pointers with proper alignment
*/
static inline struct scoutfs_dirent *next_aligned_dirent(struct scoutfs_dirent *dent, u8 len)
{
return (void *)dent +
ALIGN(offsetof(struct scoutfs_dirent, name[len]), __alignof__(struct scoutfs_dirent));
}
/*
* readdir simply iterates over the dirent items for the dir inode and
* uses their offset as the readdir position.
@@ -441,76 +452,112 @@ out:
* It will need to be careful not to read past the region of the dirent
* hash offset keys that it has access to.
*/
static int KC_DECLARE_READDIR(scoutfs_readdir, struct file *file,
void *dirent, kc_readdir_ctx_t ctx)
static int scoutfs_readdir(struct file *file, struct dir_context *ctx)
{
struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
struct scoutfs_lock *dir_lock = NULL;
struct scoutfs_dirent *dent = NULL;
/* we'll store name_len in dent->__pad[0] */
#define hacky_name_len __pad[0]
struct scoutfs_key last_key;
struct scoutfs_key key;
struct page *page = NULL;
int name_len;
u64 pos;
int entries = 0;
int ret;
int complete = 0;
struct scoutfs_dirent *end;
if (!kc_dir_emit_dots(file, dirent, ctx))
if (!dir_emit_dots(file, ctx))
return 0;
dent = alloc_dirent(SCOUTFS_NAME_LEN);
if (!dent) {
page = alloc_page(GFP_KERNEL);
if (!page)
return -ENOMEM;
}
end = page_address(page) + PAGE_SIZE;
init_dirent_key(&last_key, SCOUTFS_READDIR_TYPE, scoutfs_ino(inode),
SCOUTFS_DIRENT_LAST_POS, 0);
ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ, 0, inode, &dir_lock);
if (ret)
goto out;
/*
* lock and fetch dirent items, until the page no longer fits
* a max-size dirent (288 bytes).  Then unlock and dir_emit the ones
* we stored in the page.
*/
for (;;) {
init_dirent_key(&key, SCOUTFS_READDIR_TYPE, scoutfs_ino(inode),
kc_readdir_pos(file, ctx), 0);
/* lock */
ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ, 0, inode, &dir_lock);
if (ret)
break;
ret = scoutfs_item_next(sb, &key, &last_key, dent,
dirent_bytes(SCOUTFS_NAME_LEN),
dir_lock);
if (ret < 0) {
if (ret == -ENOENT)
dent = page_address(page);
pos = ctx->pos;
while (next_aligned_dirent(dent, SCOUTFS_NAME_LEN) < end) {
init_dirent_key(&key, SCOUTFS_READDIR_TYPE, scoutfs_ino(inode),
pos, 0);
ret = scoutfs_item_next(sb, &key, &last_key, dent,
dirent_bytes(SCOUTFS_NAME_LEN),
dir_lock);
if (ret < 0) {
if (ret == -ENOENT) {
ret = 0;
complete = 1;
}
break;
}
name_len = ret - sizeof(struct scoutfs_dirent);
dent->hacky_name_len = name_len;
if (name_len < 1 || name_len > SCOUTFS_NAME_LEN) {
scoutfs_corruption(sb, SC_DIRENT_READDIR_NAME_LEN,
corrupt_dirent_readdir_name_len,
"dir_ino %llu pos %llu key "SK_FMT" len %d",
scoutfs_ino(inode),
pos,
SK_ARG(&key), name_len);
ret = -EIO;
break;
}
pos = le64_to_cpu(dent->pos) + 1;
dent = next_aligned_dirent(dent, name_len);
entries++;
}
/* unlock */
scoutfs_unlock(sb, dir_lock, SCOUTFS_LOCK_READ);
if (ret < 0)
break;
dent = page_address(page);
for (; entries > 0; entries--) {
ctx->pos = le64_to_cpu(dent->pos);
if (!dir_emit(ctx, dent->name, dent->hacky_name_len,
le64_to_cpu(dent->ino),
dentry_type(dent->type))) {
ret = 0;
goto out;
}
dent = next_aligned_dirent(dent, dent->hacky_name_len);
/* always advance ctx->pos past */
ctx->pos++;
}
if (complete)
break;
}
name_len = ret - sizeof(struct scoutfs_dirent);
if (name_len < 1 || name_len > SCOUTFS_NAME_LEN) {
scoutfs_corruption(sb, SC_DIRENT_READDIR_NAME_LEN,
corrupt_dirent_readdir_name_len,
"dir_ino %llu pos %llu key "SK_FMT" len %d",
scoutfs_ino(inode),
kc_readdir_pos(file, ctx),
SK_ARG(&key), name_len);
ret = -EIO;
goto out;
}
pos = le64_to_cpu(key.skd_major);
kc_readdir_pos(file, ctx) = pos;
if (!kc_dir_emit(ctx, dirent, dent->name, name_len, pos,
le64_to_cpu(dent->ino),
dentry_type(dent->type))) {
ret = 0;
break;
}
kc_readdir_pos(file, ctx) = pos + 1;
}
out:
scoutfs_unlock(sb, dir_lock, SCOUTFS_LOCK_READ);
kfree(dent);
if (page)
__free_page(page);
return ret;
}
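The page-sizing test relies on the largest possible dirent: the fixed header plus a SCOUTFS_NAME_LEN name, rounded up to the struct's alignment, which the comment above pegs at 288 bytes. A standalone check under the assumption of a 32-byte header and a 255-byte max name (the real layout lives in format.h):

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

#define ALIGN(x, a)	(((x) + (a) - 1) & ~((size_t)(a) - 1))
#define NAME_MAX_LEN	255	/* assumed SCOUTFS_NAME_LEN */

/* hypothetical layout with the same shape as scoutfs_dirent */
struct dirent_ex {
	uint64_t ino;
	uint64_t hash;
	uint64_t pos;
	uint8_t type;
	uint8_t __pad[7];
	uint8_t name[];
};

int main(void)
{
	size_t max = ALIGN(offsetof(struct dirent_ex, name) + NAME_MAX_LEN,
			   __alignof__(struct dirent_ex));

	/* 32 + 255 = 287, rounded up to 8 -> 288; 4096 / 288 = 14 per page */
	printf("max entry %zu bytes, %zu per page\n", max, (size_t)4096 / max);
	return 0;
}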
@@ -1765,7 +1812,7 @@ retry:
}
old_inode->i_ctime = now;
if (new_inode)
old_inode->i_ctime = now;
new_inode->i_ctime = now;
inode_inc_iversion(old_dir);
inode_inc_iversion(old_inode);
@@ -1973,7 +2020,7 @@ const struct inode_operations scoutfs_symlink_iops = {
};
const struct file_operations scoutfs_dir_fops = {
.KC_FOP_READDIR = scoutfs_readdir,
.iterate = scoutfs_readdir,
#ifdef KC_FMODE_KABI_ITERATE
.open = scoutfs_dir_open,
#endif

View File

@@ -470,7 +470,7 @@ struct scoutfs_srch_compact {
* @get_trans_seq, @commit_trans_seq: These pair of sequence numbers
* determine if a transaction is currently open for the mount that owns
* the log_trees struct. get_trans_seq is advanced by the server as the
* transaction is opened. The server sets comimt_trans_seq equal to
* transaction is opened. The server sets commit_trans_seq equal to
* get_ as the transaction is committed.
*/
struct scoutfs_log_trees {
@@ -1091,7 +1091,8 @@ enum scoutfs_net_cmd {
EXPAND_NET_ERRNO(ENOMEM) \
EXPAND_NET_ERRNO(EIO) \
EXPAND_NET_ERRNO(ENOSPC) \
EXPAND_NET_ERRNO(EINVAL)
EXPAND_NET_ERRNO(EINVAL) \
EXPAND_NET_ERRNO(ENOLINK)
#undef EXPAND_NET_ERRNO
#define EXPAND_NET_ERRNO(which) SCOUTFS_NET_ERR_##which,

View File

@@ -1854,6 +1854,9 @@ static int try_delete_inode_items(struct super_block *sb, u64 ino)
goto out;
ret = delete_inode_items(sb, ino, &sinode, lock, orph_lock);
if (ret == 0)
scoutfs_inc_counter(sb, inode_deleted);
out:
if (clear_trying)
clear_bit(bit_nr, ldata->trying);
@@ -1962,6 +1965,8 @@ static void iput_worker(struct work_struct *work)
while (count-- > 0)
iput(inode);
cond_resched();
/* can't touch inode after final iput */
spin_lock(&inf->iput_lock);

View File

@@ -58,25 +58,23 @@
* key space after we find no items in a given lock region. This is
* relatively cheap because reading is going to check the segments
* anyway.
*
* This is copying to userspace while holding a read lock. This is safe
* because faulting can send a request for a write lock while the read
* lock is being used. The cluster locks don't block tasks in a node,
* they match and the tasks fall back to local locking; in this case,
* the spin locks around the item cache.
*/
static long scoutfs_ioc_walk_inodes(struct file *file, unsigned long arg)
{
struct super_block *sb = file_inode(file)->i_sb;
struct scoutfs_ioctl_walk_inodes __user *uwalk = (void __user *)arg;
struct scoutfs_ioctl_walk_inodes walk;
struct scoutfs_ioctl_walk_inodes_entry ent;
struct scoutfs_ioctl_walk_inodes_entry *ent = NULL;
struct scoutfs_ioctl_walk_inodes_entry *end;
struct scoutfs_key next_key;
struct scoutfs_key last_key;
struct scoutfs_key key;
struct scoutfs_lock *lock;
struct page *page = NULL;
u64 last_seq;
u64 entries = 0;
int ret = 0;
int complete = 0;
u32 nr = 0;
u8 type;
@@ -107,6 +105,10 @@ static long scoutfs_ioc_walk_inodes(struct file *file, unsigned long arg)
}
}
page = alloc_page(GFP_KERNEL);
if (!page)
return -ENOMEM;
scoutfs_inode_init_index_key(&key, type, walk.first.major,
walk.first.minor, walk.first.ino);
scoutfs_inode_init_index_key(&last_key, type, walk.last.major,
@@ -115,77 +117,107 @@ static long scoutfs_ioc_walk_inodes(struct file *file, unsigned long arg)
/* cap nr to the max the ioctl can return to a compat task */
walk.nr_entries = min_t(u64, walk.nr_entries, INT_MAX);
ret = scoutfs_lock_inode_index(sb, SCOUTFS_LOCK_READ, type,
walk.first.major, walk.first.ino,
&lock);
if (ret < 0)
goto out;
end = page_address(page) + PAGE_SIZE;
for (nr = 0; nr < walk.nr_entries; ) {
/* outer loop */
for (nr = 0;;) {
ent = page_address(page);
/* make sure _pad and minor are zeroed */
memset(ent, 0, PAGE_SIZE);
ret = scoutfs_item_next(sb, &key, &last_key, NULL, 0, lock);
if (ret < 0 && ret != -ENOENT)
ret = scoutfs_lock_inode_index(sb, SCOUTFS_LOCK_READ, type,
le64_to_cpu(key.skii_major),
le64_to_cpu(key.skii_ino),
&lock);
if (ret)
break;
if (ret == -ENOENT) {
/* done if lock covers last iteration key */
if (scoutfs_key_compare(&last_key, &lock->end) <= 0) {
ret = 0;
/* inner loop 1 */
while (ent + 1 < end) {
ret = scoutfs_item_next(sb, &key, &last_key, NULL, 0, lock);
if (ret < 0 && ret != -ENOENT)
break;
if (ret == -ENOENT) {
/* done if lock covers last iteration key */
if (scoutfs_key_compare(&last_key, &lock->end) <= 0) {
ret = 0;
complete = 1;
break;
}
/* continue iterating after locked empty region */
key = lock->end;
scoutfs_key_inc(&key);
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ);
/* avoid double-unlocking here after break */
lock = NULL;
ret = scoutfs_forest_next_hint(sb, &key, &next_key);
if (ret < 0 && ret != -ENOENT)
break;
if (ret == -ENOENT ||
scoutfs_key_compare(&next_key, &last_key) > 0) {
ret = 0;
complete = 1;
break;
}
key = next_key;
ret = scoutfs_lock_inode_index(sb, SCOUTFS_LOCK_READ,
type,
le64_to_cpu(key.skii_major),
le64_to_cpu(key.skii_ino),
&lock);
if (ret)
break;
continue;
}
/* continue iterating after locked empty region */
key = lock->end;
ent->major = le64_to_cpu(key.skii_major);
ent->ino = le64_to_cpu(key.skii_ino);
scoutfs_key_inc(&key);
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ);
ent++;
entries++;
ret = scoutfs_forest_next_hint(sb, &key, &next_key);
if (ret < 0 && ret != -ENOENT)
goto out;
if (nr + entries >= walk.nr_entries) {
complete = 1;
break;
}
}
if (ret == -ENOENT ||
scoutfs_key_compare(&next_key, &last_key) > 0) {
ret = 0;
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ);
if (ret < 0)
break;
/* inner loop 2 */
ent = page_address(page);
for (; entries > 0; entries--) {
if (copy_to_user((void __user *)walk.entries_ptr, ent,
sizeof(struct scoutfs_ioctl_walk_inodes_entry))) {
ret = -EFAULT;
goto out;
}
key = next_key;
ret = scoutfs_lock_inode_index(sb, SCOUTFS_LOCK_READ,
key.sk_type,
le64_to_cpu(key.skii_major),
le64_to_cpu(key.skii_ino),
&lock);
if (ret < 0)
goto out;
continue;
walk.entries_ptr += sizeof(struct scoutfs_ioctl_walk_inodes_entry);
ent++;
nr++;
}
ent.major = le64_to_cpu(key.skii_major);
ent.minor = 0;
ent.ino = le64_to_cpu(key.skii_ino);
if (copy_to_user((void __user *)walk.entries_ptr, &ent,
sizeof(ent))) {
ret = -EFAULT;
if (complete)
break;
}
nr++;
walk.entries_ptr += sizeof(ent);
scoutfs_key_inc(&key);
}
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ);
out:
if (page)
__free_page(page);
if (nr > 0)
ret = nr;
return ret;
}
@@ -524,7 +556,9 @@ static long scoutfs_ioc_stage(struct file *file, unsigned long arg)
}
si->staging = true;
#ifdef KC_CURRENT_BACKING_DEV_INFO
current->backing_dev_info = inode_to_bdi(inode);
#endif
pos = args.offset;
written = 0;
@@ -537,7 +571,9 @@ static long scoutfs_ioc_stage(struct file *file, unsigned long arg)
} while (ret > 0 && written < args.length);
si->staging = false;
#ifdef KC_CURRENT_BACKING_DEV_INFO
current->backing_dev_info = NULL;
#endif
out:
scoutfs_per_task_del(&si->pt_data_lock, &pt_ent);
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
@@ -1159,11 +1195,15 @@ static long scoutfs_ioc_get_allocated_inos(struct file *file, unsigned long arg)
struct scoutfs_lock *lock = NULL;
struct scoutfs_key key;
struct scoutfs_key end;
struct page *page = NULL;
u64 __user *uinos;
u64 bytes;
u64 ino;
u64 *ino;
u64 *ino_end;
int entries = 0;
int nr;
int ret;
int complete = 0;
if (!(file->f_mode & FMODE_READ)) {
ret = -EBADF;
@@ -1185,47 +1225,83 @@ static long scoutfs_ioc_get_allocated_inos(struct file *file, unsigned long arg)
goto out;
}
page = alloc_page(GFP_KERNEL);
if (!page) {
ret = -ENOMEM;
goto out;
}
ino_end = page_address(page) + PAGE_SIZE;
scoutfs_inode_init_key(&key, gai.start_ino);
scoutfs_inode_init_key(&end, gai.start_ino | SCOUTFS_LOCK_INODE_GROUP_MASK);
uinos = (void __user *)gai.inos_ptr;
bytes = gai.inos_bytes;
nr = 0;
ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_READ, 0, gai.start_ino, &lock);
if (ret < 0)
goto out;
for (;;) {
while (bytes >= sizeof(*uinos)) {
ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_READ, 0, gai.start_ino, &lock);
if (ret < 0)
goto out;
ret = scoutfs_item_next(sb, &key, &end, NULL, 0, lock);
if (ret < 0) {
if (ret == -ENOENT)
ino = page_address(page);
while (ino < ino_end) {
ret = scoutfs_item_next(sb, &key, &end, NULL, 0, lock);
if (ret < 0) {
if (ret == -ENOENT) {
ret = 0;
complete = 1;
}
break;
}
if (key.sk_zone != SCOUTFS_FS_ZONE) {
ret = 0;
break;
complete = 1;
break;
}
/* all fs items are owned by allocated inodes, and _first is always ino */
*ino = le64_to_cpu(key._sk_first);
scoutfs_inode_init_key(&key, *ino + 1);
ino++;
entries++;
nr++;
bytes -= sizeof(*uinos);
if (bytes < sizeof(*uinos)) {
complete = 1;
break;
}
if (nr == INT_MAX) {
complete = 1;
break;
}
}
if (key.sk_zone != SCOUTFS_FS_ZONE) {
ret = 0;
break;
}
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ);
/* all fs items are owned by allocated inodes, and _first is always ino */
ino = le64_to_cpu(key._sk_first);
if (put_user(ino, uinos)) {
if (ret < 0)
break;
ino = page_address(page);
if (copy_to_user(uinos, ino, entries * sizeof(*uinos))) {
ret = -EFAULT;
break;
goto out;
}
uinos++;
bytes -= sizeof(*uinos);
if (++nr == INT_MAX)
uinos += entries;
entries = 0;
if (complete)
break;
scoutfs_inode_init_key(&key, ino + 1);
}
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ);
out:
if (page)
__free_page(page);
return ret ?: nr;
}

View File

@@ -97,9 +97,8 @@ struct item_cache_info {
struct list_head lru_list;
unsigned long lru_pages;
/* written by page readers, read by shrink */
spinlock_t active_lock;
struct list_head active_list;
/* stop readers from caching stale items behind reclaimed cleaned written items */
atomic64_t read_dirty_barrier;
};
#define DECLARE_ITEM_CACHE_INFO(sb, name) \
@@ -1285,78 +1284,6 @@ static int cache_empty_page(struct super_block *sb,
return 0;
}
/*
* Readers operate independently from dirty items and transactions.
* They read a set of persistent items and insert them into the cache
* when there aren't already pages whose key range contains the items.
* This naturally prefers cached dirty items over stale read items.
*
* We have to deal with the case where dirty items are written and
* invalidated while a read is in flight. The reader won't have seen
* the items that were dirty in their persistent roots as they started
* reading. By the time they insert their read pages the previously
* dirty items have been reclaimed and are not in the cache. The old
stale items will be inserted in their place, effectively corrupting
the cache by making the dirty items disappear.
*
* We fix this by tracking the max seq of items in pages. As readers
* start they record the current transaction seq. Invalidation skips
* pages with a max seq greater than the first reader seq because the
* items in the page have to stick around to prevent the readers stale
* items from being inserted.
*
* This naturally only affects a small set of pages with items that were
* written relatively recently. If we're in memory pressure then we
* probably have a lot of pages and they'll naturally have items that
* were visible to any raders. We don't bother with the complicated and
* expensive further refinement of tracking the ranges that are being
* read and comparing those with pages to invalidate.
*/
struct active_reader {
struct list_head head;
u64 seq;
};
#define INIT_ACTIVE_READER(rdr) \
struct active_reader rdr = { .head = LIST_HEAD_INIT(rdr.head) }
static void add_active_reader(struct super_block *sb, struct active_reader *active)
{
DECLARE_ITEM_CACHE_INFO(sb, cinf);
BUG_ON(!list_empty(&active->head));
active->seq = scoutfs_trans_sample_seq(sb);
spin_lock(&cinf->active_lock);
list_add_tail(&active->head, &cinf->active_list);
spin_unlock(&cinf->active_lock);
}
static u64 first_active_reader_seq(struct item_cache_info *cinf)
{
struct active_reader *active;
u64 first;
/* only the calling task adds or deletes this active */
spin_lock(&cinf->active_lock);
active = list_first_entry_or_null(&cinf->active_list, struct active_reader, head);
first = active ? active->seq : U64_MAX;
spin_unlock(&cinf->active_lock);
return first;
}
static void del_active_reader(struct item_cache_info *cinf, struct active_reader *active)
{
/* only the calling task adds or deletes this active */
if (!list_empty(&active->head)) {
spin_lock(&cinf->active_lock);
list_del_init(&active->head);
spin_unlock(&cinf->active_lock);
}
}
/*
* Add a newly read item to the pages that we're assembling for
* insertion into the cache. These pages are private, they only exist
@@ -1450,24 +1377,34 @@ static int read_page_item(struct super_block *sb, struct scoutfs_key *key, u64 s
* and duplicates, we insert any resulting pages which don't overlap
* with existing cached pages.
*
* We only insert uncached regions because this is called with cluster
* locks held, but without locking the cache. The regions we read can
* be stale with respect to the current cache, which can be read and
* dirtied by other cluster lock holders on our node, but the cluster
* locks protect the stable items we read. Invalidation is careful not
* to drop pages that have items that we couldn't see because they were
* dirty when we started reading.
*
* The forest item reader is reading stable trees that could be
* overwritten. It can return -ESTALE which we return to the caller who
* will retry the operation and work with a new set of more recent
* btrees.
*
* We only insert uncached regions because this is called with cluster
* locks held, but without locking the cache. The regions we read can
* be stale with respect to the current cache, which can be read and
* dirtied by other cluster lock holders on our node, but the cluster
* locks protect the stable items we read.
*
* Using the presence of locally written dirty pages to override stale
* read pages only works if, well, the more recent locally written pages
* are still present. Readers are totally decoupled from writers and
* can have a set of items that is very old indeed.  In the meantime
* more recent items would have been dirtied locally, committed,
* cleaned, and reclaimed. We have a coarse barrier which ensures that
* readers can't insert items read from old roots from before local data
* was written. If a write completes while a read is in progress the
* read will have to retry. The retried read can use cached blocks so
* we're relying on reads being much faster than writes to reduce the
* overhead to mostly cpu work of recollecting the items from cached
* blocks via a more recent root from the server.
*/
static int read_pages(struct super_block *sb, struct item_cache_info *cinf,
struct scoutfs_key *key, struct scoutfs_lock *lock)
{
struct rb_root root = RB_ROOT;
INIT_ACTIVE_READER(active);
struct cached_page *right = NULL;
struct cached_page *pg;
struct cached_page *rd;
@@ -1480,6 +1417,7 @@ static int read_pages(struct super_block *sb, struct item_cache_info *cinf,
struct rb_node *par;
struct rb_node *pg_tmp;
struct rb_node *item_tmp;
u64 rdbar;
int pgi;
int ret;
@@ -1493,8 +1431,7 @@ static int read_pages(struct super_block *sb, struct item_cache_info *cinf,
pg->end = lock->end;
rbtree_insert(&pg->node, NULL, &root.rb_node, &root);
/* set active reader seq before reading persistent roots */
add_active_reader(sb, &active);
rdbar = atomic64_read(&cinf->read_dirty_barrier);
start = lock->start;
end = lock->end;
@@ -1533,11 +1470,19 @@ static int read_pages(struct super_block *sb, struct item_cache_info *cinf,
retry:
write_lock(&cinf->rwlock);
ret = 0;
while ((rd = first_page(&root))) {
pg = page_rbtree_walk(sb, &cinf->pg_root, &rd->start, &rd->end,
NULL, NULL, &par, &pnode);
if (!pg) {
/* can't insert if write is cleaning (write_lock is read barrier) */
if (atomic64_read(&cinf->read_dirty_barrier) != rdbar) {
scoutfs_inc_counter(sb, item_read_pages_barrier);
ret = -ESTALE;
break;
}
/* insert read pages that don't intersect */
rbtree_erase(&rd->node, &root);
rbtree_insert(&rd->node, par, pnode, &cinf->pg_root);
@@ -1572,10 +1517,7 @@ retry:
write_unlock(&cinf->rwlock);
ret = 0;
out:
del_active_reader(cinf, &active);
/* free any pages we left dangling on error */
for_each_page_safe(&root, rd, pg_tmp) {
rbtree_erase(&rd->node, &root);
@@ -1635,6 +1577,7 @@ retry:
ret = read_pages(sb, cinf, key, lock);
if (ret < 0 && ret != -ESTALE)
goto out;
scoutfs_inc_counter(sb, item_read_pages_retry);
goto retry;
}
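The barrier behaves like a coarse seqcount: the writer bumps it before cleaned pages can be reclaimed, and a reader that sampled it before walking the persistent roots must see it unchanged, under the cache write lock, before inserting, or it retries with fresher roots. A userspace sketch with C11 atomics, names illustrative:

#include <stdatomic.h>
#include <stdbool.h>

static _Atomic unsigned long read_dirty_barrier;

/* writer side: runs before cleaned pages become reclaimable */
static void write_done_begin(void)
{
	atomic_fetch_add(&read_dirty_barrier, 1);
	/* ...clear dirty items, allow reclaim... */
}

/* reader side: returns false when the caller must retry */
static bool insert_read_pages(void)
{
	unsigned long snap = atomic_load(&read_dirty_barrier);

	/* ...read items from persistent roots, no cache lock held... */

	/* revalidate under the cache write lock before inserting */
	if (atomic_load(&read_dirty_barrier) != snap)
		return false;	/* stale: retry with newer roots */
	/* ...insert the read pages... */
	return true;
}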
@@ -2401,6 +2344,12 @@ out:
* The caller has successfully committed all the dirty btree blocks that
* contained the currently dirty items. Clear all the dirty items and
* pages.
*
* This strange lock/trylock loop comes from sparse issuing spurious
* mismatched context warnings if we do anything (like unlock and relax)
* in the else branch of the failed trylock. We're jumping through
* hoops to not use the else but still drop and reacquire the dirty_lock
* if the trylock fails.
*/
int scoutfs_item_write_done(struct super_block *sb)
{
@@ -2409,40 +2358,34 @@ int scoutfs_item_write_done(struct super_block *sb)
struct cached_item *tmp;
struct cached_page *pg;
retry:
/* don't let read_pages insert possibly stale items */
atomic64_inc(&cinf->read_dirty_barrier);
smp_mb__after_atomic();
spin_lock(&cinf->dirty_lock);
while ((pg = list_first_entry_or_null(&cinf->dirty_list,
struct cached_page,
dirty_head))) {
if (!write_trylock(&pg->rwlock)) {
while ((pg = list_first_entry_or_null(&cinf->dirty_list, struct cached_page, dirty_head))) {
if (write_trylock(&pg->rwlock)) {
spin_unlock(&cinf->dirty_lock);
cpu_relax();
goto retry;
}
list_for_each_entry_safe(item, tmp, &pg->dirty_list,
dirty_head) {
clear_item_dirty(sb, cinf, pg, item);
if (item->delta)
scoutfs_inc_counter(sb, item_delta_written);
/* free deletion items */
if (item->deletion || item->delta)
erase_item(pg, item);
else
item->persistent = 1;
}
write_unlock(&pg->rwlock);
spin_lock(&cinf->dirty_lock);
}
spin_unlock(&cinf->dirty_lock);
list_for_each_entry_safe(item, tmp, &pg->dirty_list,
dirty_head) {
clear_item_dirty(sb, cinf, pg, item);
if (item->delta)
scoutfs_inc_counter(sb, item_delta_written);
/* free deletion items */
if (item->deletion || item->delta)
erase_item(pg, item);
else
item->persistent = 1;
}
write_unlock(&pg->rwlock);
spin_lock(&cinf->dirty_lock);
}
} while (pg);
spin_unlock(&cinf->dirty_lock);
return 0;
@@ -2597,24 +2540,15 @@ static unsigned long item_cache_scan_objects(struct shrinker *shrink,
struct cached_page *tmp;
struct cached_page *pg;
unsigned long freed = 0;
u64 first_reader_seq;
int nr = sc->nr_to_scan;
scoutfs_inc_counter(sb, item_cache_scan_objects);
/* can't invalidate pages with items that weren't visible to first reader */
first_reader_seq = first_active_reader_seq(cinf);
write_lock(&cinf->rwlock);
spin_lock(&cinf->lru_lock);
list_for_each_entry_safe(pg, tmp, &cinf->lru_list, lru_head) {
if (first_reader_seq <= pg->max_seq) {
scoutfs_inc_counter(sb, item_shrink_page_reader);
continue;
}
if (!write_trylock(&pg->rwlock)) {
scoutfs_inc_counter(sb, item_shrink_page_trylock);
continue;
@@ -2681,8 +2615,7 @@ int scoutfs_item_setup(struct super_block *sb)
atomic_set(&cinf->dirty_pages, 0);
spin_lock_init(&cinf->lru_lock);
INIT_LIST_HEAD(&cinf->lru_list);
spin_lock_init(&cinf->active_lock);
INIT_LIST_HEAD(&cinf->active_list);
atomic64_set(&cinf->read_dirty_barrier, 0);
cinf->pcpu_pages = alloc_percpu(struct item_percpu_pages);
if (!cinf->pcpu_pages)
@@ -2715,8 +2648,6 @@ void scoutfs_item_destroy(struct super_block *sb)
int cpu;
if (cinf) {
BUG_ON(!list_empty(&cinf->active_list));
#ifdef KC_CPU_NOTIFIER
unregister_hotcpu_notifier(&cinf->notifier);
#endif

View File

@@ -81,3 +81,69 @@ kc_generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
return written ? written : status;
}
#endif
#include <linux/list_lru.h>
#ifdef KC_LIST_LRU_WALK_CB_ITEM_LOCK
static enum lru_status kc_isolate(struct list_head *item, spinlock_t *lock, void *cb_arg)
{
struct kc_isolate_args *args = cb_arg;
/* isolate doesn't use list, nr_items updated in caller */
return args->isolate(item, NULL, args->cb_arg);
}
unsigned long kc_list_lru_walk(struct list_lru *lru, kc_list_lru_walk_cb_t isolate, void *cb_arg,
unsigned long nr_to_walk)
{
struct kc_isolate_args args = {
.isolate = isolate,
.cb_arg = cb_arg,
};
return list_lru_walk(lru, kc_isolate, &args, nr_to_walk);
}
unsigned long kc_list_lru_shrink_walk(struct list_lru *lru, struct shrink_control *sc,
kc_list_lru_walk_cb_t isolate, void *cb_arg)
{
struct kc_isolate_args args = {
.isolate = isolate,
.cb_arg = cb_arg,
};
return list_lru_shrink_walk(lru, sc, kc_isolate, &args);
}
#endif
#ifdef KC_LIST_LRU_WALK_CB_LIST_LOCK
static enum lru_status kc_isolate(struct list_head *item, struct list_lru_one *list,
spinlock_t *lock, void *cb_arg)
{
struct kc_isolate_args *args = cb_arg;
return args->isolate(item, list, args->cb_arg);
}
unsigned long kc_list_lru_walk(struct list_lru *lru, kc_list_lru_walk_cb_t isolate, void *cb_arg,
unsigned long nr_to_walk)
{
struct kc_isolate_args args = {
.isolate = isolate,
.cb_arg = cb_arg,
};
return list_lru_walk(lru, kc_isolate, &args, nr_to_walk);
}
unsigned long kc_list_lru_shrink_walk(struct list_lru *lru, struct shrink_control *sc,
kc_list_lru_walk_cb_t isolate, void *cb_arg)
{
struct kc_isolate_args args = {
.isolate = isolate,
.cb_arg = cb_arg,
};
return list_lru_shrink_walk(lru, sc, kc_isolate, &args);
}
#endif
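With either kernel variant, callers write one isolate callback against kc_list_lru_walk_cb_t and never see whether this kernel passes a spinlock, a list_lru_one, or both. A sketch of the calling side, assuming a dispose list as the cb_arg:

/* written once against the kc_ signature, works on every supported kernel */
static enum lru_status my_isolate(struct list_head *item,
				  struct list_lru_one *list, void *cb_arg)
{
	struct list_head *dispose = cb_arg;

	list_lru_isolate_move(list, item, dispose);
	return LRU_REMOVED;
}

/* ... */
	freed = kc_list_lru_shrink_walk(&lru, sc, my_isolate, &dispose);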

View File

@@ -29,50 +29,6 @@ do { \
})
#endif
#ifndef KC_ITERATE_DIR_CONTEXT
typedef filldir_t kc_readdir_ctx_t;
#define KC_DECLARE_READDIR(name, file, dirent, ctx) name(file, dirent, ctx)
#define KC_FOP_READDIR readdir
#define kc_readdir_pos(filp, ctx) (filp)->f_pos
#define kc_dir_emit_dots(file, dirent, ctx) dir_emit_dots(file, dirent, ctx)
#define kc_dir_emit(ctx, dirent, name, name_len, pos, ino, dt) \
(ctx(dirent, name, name_len, pos, ino, dt) == 0)
#else
typedef struct dir_context * kc_readdir_ctx_t;
#define KC_DECLARE_READDIR(name, file, dirent, ctx) name(file, ctx)
#define KC_FOP_READDIR iterate
#define kc_readdir_pos(filp, ctx) (ctx)->pos
#define kc_dir_emit_dots(file, dirent, ctx) dir_emit_dots(file, ctx)
#define kc_dir_emit(ctx, dirent, name, name_len, pos, ino, dt) \
dir_emit(ctx, name, name_len, ino, dt)
#endif
#ifndef KC_DIR_EMIT_DOTS
/*
* Kernels from before ->iterate don't have dir_emit_dots, so we give them
* one that works with the ->readdir() filldir() method.
*/
static inline int dir_emit_dots(struct file *file, void *dirent,
filldir_t filldir)
{
if (file->f_pos == 0) {
if (filldir(dirent, ".", 1, 1,
file->f_path.dentry->d_inode->i_ino, DT_DIR))
return 0;
file->f_pos = 1;
}
if (file->f_pos == 1) {
if (filldir(dirent, "..", 2, 1,
parent_ino(file->f_path.dentry), DT_DIR))
return 0;
file->f_pos = 2;
}
return 1;
}
#endif
#ifdef KC_POSIX_ACL_VALID_USER_NS
#define kc_posix_acl_valid(user_ns, acl) posix_acl_valid(user_ns, acl)
#else
@@ -438,4 +394,67 @@ static inline int kc_tcp_sock_set_nodelay(struct socket *sock)
}
#endif
#ifdef KC_INODE_DIO_END
#define kc_inode_dio_end inode_dio_end
#else
#define kc_inode_dio_end inode_dio_done
#endif
#ifndef KC_MM_VM_FAULT_T
typedef unsigned int vm_fault_t;
static inline vm_fault_t vmf_error(int err)
{
if (err == -ENOMEM)
return VM_FAULT_OOM;
return VM_FAULT_SIGBUS;
}
#endif
#include <linux/list_lru.h>
#ifndef KC_LIST_LRU_SHRINK_COUNT_WALK
/* we don't bother with sc->{nid,memcg} (which doesn't exist in oldest kernels) */
static inline unsigned long list_lru_shrink_count(struct list_lru *lru,
struct shrink_control *sc)
{
return list_lru_count(lru);
}
static inline unsigned long
list_lru_shrink_walk(struct list_lru *lru, struct shrink_control *sc,
list_lru_walk_cb isolate, void *cb_arg)
{
return list_lru_walk(lru, isolate, cb_arg, sc->nr_to_scan);
}
#endif
#ifndef KC_LIST_LRU_ADD_OBJ
#define list_lru_add_obj list_lru_add
#define list_lru_del_obj list_lru_del
#endif
#if defined(KC_LIST_LRU_WALK_CB_LIST_LOCK) || defined(KC_LIST_LRU_WALK_CB_ITEM_LOCK)
struct list_lru_one;
typedef enum lru_status (*kc_list_lru_walk_cb_t)(struct list_head *item, struct list_lru_one *list,
void *cb_arg);
struct kc_isolate_args {
kc_list_lru_walk_cb_t isolate;
void *cb_arg;
};
unsigned long kc_list_lru_walk(struct list_lru *lru, kc_list_lru_walk_cb_t isolate, void *cb_arg,
unsigned long nr_to_walk);
unsigned long kc_list_lru_shrink_walk(struct list_lru *lru, struct shrink_control *sc,
kc_list_lru_walk_cb_t isolate, void *cb_arg);
#else
#define kc_list_lru_shrink_walk list_lru_shrink_walk
#endif
#if defined(KC_LIST_LRU_WALK_CB_ITEM_LOCK)
/* isolate moves the item by hand, nr_items updated in walk as LRU_REMOVED is returned */
static inline void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item,
struct list_head *head)
{
list_move(item, head);
}
#endif
#endif

View File

@@ -168,7 +168,6 @@ static int lock_invalidate(struct super_block *sb, struct scoutfs_lock *lock,
enum scoutfs_lock_mode prev, enum scoutfs_lock_mode mode)
{
struct scoutfs_lock_coverage *cov;
struct scoutfs_lock_coverage *tmp;
u64 ino, last;
int ret = 0;
@@ -192,19 +191,22 @@ static int lock_invalidate(struct super_block *sb, struct scoutfs_lock *lock,
/* have to invalidate if we're not in the only usable case */
if (!(prev == SCOUTFS_LOCK_WRITE && mode == SCOUTFS_LOCK_READ)) {
retry:
/* remove cov items to tell users that their cache is stale */
/*
* Remove cov items to tell users that their cache is
* stale. The unlock pattern comes from avoiding bad
* sparse warnings when taking else in a failed trylock.
*/
spin_lock(&lock->cov_list_lock);
list_for_each_entry_safe(cov, tmp, &lock->cov_list, head) {
if (!spin_trylock(&cov->cov_lock)) {
spin_unlock(&lock->cov_list_lock);
cpu_relax();
goto retry;
while ((cov = list_first_entry_or_null(&lock->cov_list,
struct scoutfs_lock_coverage, head))) {
if (spin_trylock(&cov->cov_lock)) {
list_del_init(&cov->head);
cov->lock = NULL;
spin_unlock(&cov->cov_lock);
scoutfs_inc_counter(sb, lock_invalidate_coverage);
}
list_del_init(&cov->head);
cov->lock = NULL;
spin_unlock(&cov->cov_lock);
scoutfs_inc_counter(sb, lock_invalidate_coverage);
spin_unlock(&lock->cov_list_lock);
spin_lock(&lock->cov_list_lock);
}
spin_unlock(&lock->cov_list_lock);
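The same shape appears in item write-done: take the outer lock, trylock the head entry, and do the work only on success; there's deliberately no else branch that unlocks and retries, since sparse flags that form with spurious context-imbalance warnings. The outer lock is instead dropped and retaken every iteration, which also gives the inner lock holder a chance to make progress. Schematically:

spin_lock(&outer);
while ((obj = list_first_entry_or_null(&objs, struct obj, head))) {
	if (spin_trylock(&obj->lock)) {
		/* success path only: detach and finish the entry */
		list_del_init(&obj->head);
		spin_unlock(&obj->lock);
	}
	/* unconditional drop/retake; a failed trylock just comes around again */
	spin_unlock(&outer);
	spin_lock(&outer);
}
spin_unlock(&outer);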
@@ -302,6 +304,7 @@ static void lock_inc_count(unsigned int *counts, enum scoutfs_lock_mode mode)
static void lock_dec_count(unsigned int *counts, enum scoutfs_lock_mode mode)
{
BUG_ON(mode < 0 || mode >= SCOUTFS_LOCK_NR_MODES);
BUG_ON(counts[mode] == 0);
counts[mode]--;
}

View File

@@ -202,21 +202,48 @@ static u8 invalidation_mode(u8 granted, u8 requested)
/*
* Return true if the client lock instances described by the entries can
* be granted at the same time. Typically this only means they're both
* modes that are compatible between nodes. In addition there's the
* special case where a read lock on a client is compatible with a write
* lock on the same client because the client's cache covered by the
* read lock is still valid if they get a write lock.
* be granted at the same time.  There are only three cases where this is
* true.
*
* First, the two locks are both of the same mode that allows full
* sharing -- read and write_only.  The only point of these modes is
* that everyone can share them.
*
* Second, a write lock gives the client permission to read as well.
* This means that a client can upgrade its read lock to a write lock
* without having to invalidate the existing read and drop caches.
*
* Third, null locks are always compatible between clients. It's as
* though the client with the null lock has no lock at all. But it's
* never compatible with all locks on the client requesting null.
* Sending invalidations for existing locks on a client when we get a
* null request is how we resolve races in shrinking locks -- we turn it
* into the unsolicited remote invalidation case.
*
* All other mode and client combinations cannot be shared, most
* typically a write lock invalidating all other non-write holders to
* drop caches and force a read after the write has completed.
*/
static bool client_entries_compatible(struct client_lock_entry *granted,
struct client_lock_entry *requested)
{
return (granted->mode == requested->mode &&
(granted->mode == SCOUTFS_LOCK_READ ||
granted->mode == SCOUTFS_LOCK_WRITE_ONLY)) ||
(granted->rid == requested->rid &&
granted->mode == SCOUTFS_LOCK_READ &&
requested->mode == SCOUTFS_LOCK_WRITE);
/* only read and write_only can be fully shared */
if ((granted->mode == requested->mode) &&
(granted->mode == SCOUTFS_LOCK_READ || granted->mode == SCOUTFS_LOCK_WRITE_ONLY))
return true;
/* _write includes reading, so a client can upgrade its read to write */
if (granted->rid == requested->rid &&
granted->mode == SCOUTFS_LOCK_READ &&
requested->mode == SCOUTFS_LOCK_WRITE)
return true;
/* null is always compatible across clients, never within a client */
if ((granted->rid != requested->rid) &&
(granted->mode == SCOUTFS_LOCK_NULL || requested->mode == SCOUTFS_LOCK_NULL))
return true;
return false;
}
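
The three compatible cases above reduce to a small truth table. A standalone sketch that mirrors (but does not reuse) the kmod's modes, runnable as a quick sanity check:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Illustrative stand-ins for scoutfs's lock modes, not the real enum. */
enum mode { M_NULL, M_READ, M_WRITE, M_WRITE_ONLY };

struct entry { uint64_t rid; enum mode mode; };

static bool compatible(struct entry g, struct entry r)
{
	if (g.mode == r.mode && (g.mode == M_READ || g.mode == M_WRITE_ONLY))
		return true;	/* first case: fully shared modes */
	if (g.rid == r.rid && g.mode == M_READ && r.mode == M_WRITE)
		return true;	/* second case: same-client read->write upgrade */
	if (g.rid != r.rid && (g.mode == M_NULL || r.mode == M_NULL))
		return true;	/* third case: null across clients only */
	return false;
}

int main(void)
{
	struct entry a = { .rid = 1, .mode = M_READ };
	struct entry b = { .rid = 2, .mode = M_READ };

	assert(compatible(a, b));	/* shared reads */
	b.rid = 1; b.mode = M_WRITE;
	assert(compatible(a, b));	/* upgrade on one client */
	a.mode = M_NULL;
	assert(!compatible(a, b));	/* null never within a client */
	b.rid = 2;
	assert(compatible(a, b));	/* null across clients */
	return 0;
}
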
/*
@@ -317,16 +344,18 @@ static void put_server_lock(struct lock_server_info *inf,
BUG_ON(!mutex_is_locked(&snode->mutex));
spin_lock(&inf->lock);
if (atomic_dec_and_test(&snode->refcount) &&
list_empty(&snode->granted) &&
list_empty(&snode->requested) &&
list_empty(&snode->invalidated)) {
spin_lock(&inf->lock);
rb_erase(&snode->node, &inf->locks_root);
spin_unlock(&inf->lock);
should_free = true;
}
spin_unlock(&inf->lock);
mutex_unlock(&snode->mutex);
if (should_free) {


@@ -20,6 +20,7 @@
#include <net/sock.h>
#include <net/tcp.h>
#include <linux/log2.h>
#include <linux/jhash.h>
#include "format.h"
#include "counters.h"
@@ -31,6 +32,7 @@
#include "endian_swap.h"
#include "tseq.h"
#include "fence.h"
#include "options.h"
/*
* scoutfs networking delivers requests and responses between nodes.
@@ -134,6 +136,7 @@ struct message_send {
struct message_recv {
struct scoutfs_tseq_entry tseq_entry;
struct work_struct proc_work;
struct list_head ordered_head;
struct scoutfs_net_connection *conn;
struct scoutfs_net_header nh;
};
@@ -332,7 +335,7 @@ static int submit_send(struct super_block *sb,
return -EINVAL;
if (scoutfs_forcing_unmount(sb))
return -EIO;
return -ENOLINK;
msend = kmalloc(offsetof(struct message_send,
nh.data[data_len]), GFP_NOFS);
@@ -498,16 +501,61 @@ static void scoutfs_net_proc_worker(struct work_struct *work)
trace_scoutfs_net_proc_work_exit(sb, 0, ret);
}
static void scoutfs_net_ordered_proc_worker(struct work_struct *work)
{
struct scoutfs_work_list *wlist = container_of(work, struct scoutfs_work_list, work);
struct message_recv *mrecv;
struct message_recv *mrecv__;
LIST_HEAD(list);
spin_lock(&wlist->lock);
list_splice_init(&wlist->list, &list);
spin_unlock(&wlist->lock);
list_for_each_entry_safe(mrecv, mrecv__, &list, ordered_head) {
list_del_init(&mrecv->ordered_head);
scoutfs_net_proc_worker(&mrecv->proc_work);
}
}
/*
* Some messages require in-order processing. But the scope of the
* ordering isn't global. In the case of lock messages, it's per lock.
* So for these messages we hash them to a number of ordered workers who
* walk a list and call the usual work function in order. This replaces
* two earlier approaches: proc work that detected out-of-order messages
* and re-queued them, and then calling proc only from the single recv
* work context.
*/
static void queue_ordered_proc(struct scoutfs_net_connection *conn, struct message_recv *mrecv)
{
struct scoutfs_work_list *wlist;
struct scoutfs_net_lock *nl;
u32 h;
if (WARN_ON_ONCE(mrecv->nh.cmd != SCOUTFS_NET_CMD_LOCK ||
le16_to_cpu(mrecv->nh.data_len) != sizeof(struct scoutfs_net_lock)))
return scoutfs_net_proc_worker(&mrecv->proc_work);
nl = (void *)mrecv->nh.data;
h = jhash(&nl->key, sizeof(struct scoutfs_key), 0x6fdd3cd5);
wlist = &conn->ordered_proc_wlists[h % conn->ordered_proc_nr];
spin_lock(&wlist->lock);
list_add_tail(&mrecv->ordered_head, &wlist->list);
spin_unlock(&wlist->lock);
queue_work(conn->workq, &wlist->work);
}
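
Ordering holds because equal lock keys always hash to the same worker list, so two messages for one lock can never race through different workers; each worker's list is FIFO, so per-key order matches arrival order. A minimal demonstration of the invariant, with an FNV-1a hash standing in for the kernel's jhash():

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* Any deterministic hash works here; fnv1a() is a stand-in for jhash(). */
static uint32_t fnv1a(const void *buf, size_t len)
{
	const unsigned char *p = buf;
	uint32_t h = 2166136261u;

	while (len--)
		h = (h ^ *p++) * 16777619u;
	return h;
}

int main(void)
{
	char key[16] = "lock-key";	/* stand-in for a scoutfs_key */
	unsigned int nr = 8;		/* number of ordered workers */
	unsigned int first = fnv1a(key, sizeof(key)) % nr;
	unsigned int second = fnv1a(key, sizeof(key)) % nr;

	/* identical keys always dispatch to the same FIFO bucket */
	assert(first == second);
	return 0;
}
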
/*
* Free live responses up to and including the seq by marking them dead
* and moving them to the send queue to be freed.
*/
static int move_acked_responses(struct scoutfs_net_connection *conn,
struct list_head *list, u64 seq)
static bool move_acked_responses(struct scoutfs_net_connection *conn,
struct list_head *list, u64 seq)
{
struct message_send *msend;
struct message_send *tmp;
int ret = 0;
bool moved = false;
assert_spin_locked(&conn->lock);
@@ -519,20 +567,20 @@ static int move_acked_responses(struct scoutfs_net_connection *conn,
msend->dead = 1;
list_move(&msend->head, &conn->send_queue);
ret = 1;
moved = true;
}
return ret;
return moved;
}
/* acks are processed inline in the recv worker */
static void free_acked_responses(struct scoutfs_net_connection *conn, u64 seq)
{
int moved;
bool moved;
spin_lock(&conn->lock);
moved = move_acked_responses(conn, &conn->send_queue, seq) +
moved = move_acked_responses(conn, &conn->send_queue, seq) |
move_acked_responses(conn, &conn->resend_queue, seq);
spin_unlock(&conn->lock);
@@ -541,33 +589,17 @@ static void free_acked_responses(struct scoutfs_net_connection *conn, u64 seq)
queue_work(conn->workq, &conn->send_work);
}
static int recvmsg_full(struct socket *sock, void *buf, unsigned len)
static int k_recvmsg(struct socket *sock, void *buf, unsigned len)
{
struct msghdr msg;
struct kvec kv;
int ret;
struct kvec kv = {
.iov_base = buf,
.iov_len = len,
};
struct msghdr msg = {
.msg_flags = MSG_NOSIGNAL,
};
while (len) {
memset(&msg, 0, sizeof(msg));
msg.msg_flags = MSG_NOSIGNAL;
kv.iov_base = buf;
kv.iov_len = len;
#ifndef KC_MSGHDR_STRUCT_IOV_ITER
msg.msg_iov = (struct iovec *)&kv;
msg.msg_iovlen = 1;
#else
iov_iter_init(&msg.msg_iter, READ, (struct iovec *)&kv, len, 1);
#endif
ret = kernel_recvmsg(sock, &msg, &kv, 1, len, msg.msg_flags);
if (ret <= 0)
return -ECONNABORTED;
len -= ret;
buf += ret;
}
return 0;
return kernel_recvmsg(sock, &msg, &kv, 1, len, msg.msg_flags);
}
static bool invalid_message(struct scoutfs_net_connection *conn,
@@ -604,6 +636,72 @@ static bool invalid_message(struct scoutfs_net_connection *conn,
return false;
}
static int recv_one_message(struct super_block *sb, struct net_info *ninf,
struct scoutfs_net_connection *conn, struct scoutfs_net_header *nh,
unsigned int data_len)
{
struct message_recv *mrecv;
int ret;
scoutfs_inc_counter(sb, net_recv_messages);
scoutfs_add_counter(sb, net_recv_bytes, nh_bytes(data_len));
trace_scoutfs_net_recv_message(sb, &conn->sockname, &conn->peername, nh);
/* the caller's invalid_message() check already validated data_len */
mrecv = kmalloc(offsetof(struct message_recv, nh.data[data_len]), GFP_NOFS);
if (!mrecv) {
ret = -ENOMEM;
goto out;
}
mrecv->conn = conn;
INIT_WORK(&mrecv->proc_work, scoutfs_net_proc_worker);
INIT_LIST_HEAD(&mrecv->ordered_head);
mrecv->nh = *nh;
if (data_len)
memcpy(mrecv->nh.data, (nh + 1), data_len);
if (nh->cmd == SCOUTFS_NET_CMD_GREETING) {
/* greetings are out of band, no seq mechanics */
set_conn_fl(conn, saw_greeting);
} else if (le64_to_cpu(nh->seq) <=
atomic64_read(&conn->recv_seq)) {
/* drop any resent duplicated messages */
scoutfs_inc_counter(sb, net_recv_dropped_duplicate);
kfree(mrecv);
ret = 0;
goto out;
} else {
/* record that we've received sender's seq */
atomic64_set(&conn->recv_seq, le64_to_cpu(nh->seq));
/* and free our responses that sender has received */
free_acked_responses(conn, le64_to_cpu(nh->recv_seq));
}
scoutfs_tseq_add(&ninf->msg_tseq_tree, &mrecv->tseq_entry);
/*
* Initial received greetings are processed inline
* before any other incoming messages.
*
* Incoming requests or responses to the lock client
* can't handle re-ordering, so they're queued to
* ordered receive processing work.
*/
if (nh->cmd == SCOUTFS_NET_CMD_GREETING)
scoutfs_net_proc_worker(&mrecv->proc_work);
else if (nh->cmd == SCOUTFS_NET_CMD_LOCK && !conn->listening_conn)
queue_ordered_proc(conn, mrecv);
else
queue_work(conn->workq, &mrecv->proc_work);
ret = 0;
out:
return ret;
}
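
The non-greeting branch above boils down to a simple acceptance rule: keep a message only if its seq advances past the highest seq already received, and treat its embedded recv_seq as a cumulative ack of our outstanding responses. A compact sketch of just that rule (field names are illustrative):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

struct conn_seq {
	uint64_t recv_seq;	/* highest sender seq we've accepted */
	uint64_t acked;		/* highest of our seqs the peer has seen */
};

static bool accept_msg(struct conn_seq *c, uint64_t seq, uint64_t recv_seq)
{
	if (seq <= c->recv_seq)
		return false;		/* resent duplicate, drop */
	c->recv_seq = seq;
	if (recv_seq > c->acked)
		c->acked = recv_seq;	/* frees responses <= acked */
	return true;
}

int main(void)
{
	struct conn_seq c = { 0, 0 };

	assert(accept_msg(&c, 1, 0));
	assert(!accept_msg(&c, 1, 0));	/* duplicate dropped */
	assert(accept_msg(&c, 2, 1));	/* also acks our response seq 1 */
	return 0;
}
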
/*
* Always block receiving from the socket. Errors trigger shutting down
* the connection.
@@ -614,86 +712,72 @@ static void scoutfs_net_recv_worker(struct work_struct *work)
struct super_block *sb = conn->sb;
struct net_info *ninf = SCOUTFS_SB(sb)->net_info;
struct socket *sock = conn->sock;
struct scoutfs_net_header nh;
struct message_recv *mrecv;
struct scoutfs_net_header *nh;
struct page *page = NULL;
unsigned int data_len;
int hdr_off;
int rx_off;
int size;
int ret;
trace_scoutfs_net_recv_work_enter(sb, 0, 0);
page = alloc_page(GFP_NOFS);
if (!page) {
ret = -ENOMEM;
goto out;
}
hdr_off = 0;
rx_off = 0;
for (;;) {
/* receive the header */
ret = recvmsg_full(sock, &nh, sizeof(nh));
if (ret)
break;
/* receiving an invalid message breaks the connection */
if (invalid_message(conn, &nh)) {
scoutfs_inc_counter(sb, net_recv_invalid_message);
ret = -EBADMSG;
break;
ret = k_recvmsg(sock, page_address(page) + rx_off, PAGE_SIZE - rx_off);
if (ret <= 0) {
ret = -ECONNABORTED;
goto out;
}
data_len = le16_to_cpu(nh.data_len);
rx_off += ret;
scoutfs_inc_counter(sb, net_recv_messages);
scoutfs_add_counter(sb, net_recv_bytes, nh_bytes(data_len));
trace_scoutfs_net_recv_message(sb, &conn->sockname,
&conn->peername, &nh);
for (;;) {
size = rx_off - hdr_off;
if (size < sizeof(struct scoutfs_net_header))
break;
/* invalid message checked data len */
mrecv = kmalloc(offsetof(struct message_recv,
nh.data[data_len]), GFP_NOFS);
if (!mrecv) {
ret = -ENOMEM;
break;
nh = page_address(page) + hdr_off;
/* receiving an invalid message breaks the connection */
if (invalid_message(conn, nh)) {
scoutfs_inc_counter(sb, net_recv_invalid_message);
ret = -EBADMSG;
break;
}
data_len = le16_to_cpu(nh->data_len);
if (sizeof(struct scoutfs_net_header) + data_len > size)
break;
ret = recv_one_message(sb, ninf, conn, nh, data_len);
if (ret < 0)
goto out;
hdr_off += sizeof(struct scoutfs_net_header) + data_len;
}
mrecv->conn = conn;
INIT_WORK(&mrecv->proc_work, scoutfs_net_proc_worker);
mrecv->nh = nh;
/* receive the data payload */
ret = recvmsg_full(sock, mrecv->nh.data, data_len);
if (ret) {
kfree(mrecv);
break;
if ((PAGE_SIZE - rx_off) <
(sizeof(struct scoutfs_net_header) + SCOUTFS_NET_MAX_DATA_LEN)) {
if (size)
memmove(page_address(page), page_address(page) + hdr_off, size);
hdr_off = 0;
rx_off = size;
}
if (nh.cmd == SCOUTFS_NET_CMD_GREETING) {
/* greetings are out of band, no seq mechanics */
set_conn_fl(conn, saw_greeting);
} else if (le64_to_cpu(nh.seq) <=
atomic64_read(&conn->recv_seq)) {
/* drop any resent duplicated messages */
scoutfs_inc_counter(sb, net_recv_dropped_duplicate);
kfree(mrecv);
continue;
} else {
/* record that we've received sender's seq */
atomic64_set(&conn->recv_seq, le64_to_cpu(nh.seq));
/* and free our responses that sender has received */
free_acked_responses(conn, le64_to_cpu(nh.recv_seq));
}
scoutfs_tseq_add(&ninf->msg_tseq_tree, &mrecv->tseq_entry);
/*
* Initial received greetings are processed
* synchronously before any other incoming messages.
*
* Incoming requests or responses to the lock client are
* called synchronously to avoid reordering.
*/
if (nh.cmd == SCOUTFS_NET_CMD_GREETING ||
(nh.cmd == SCOUTFS_NET_CMD_LOCK && !conn->listening_conn))
scoutfs_net_proc_worker(&mrecv->proc_work);
else
queue_work(conn->workq, &mrecv->proc_work);
}
out:
__free_page(page);
if (ret)
scoutfs_inc_counter(sb, net_recv_error);
@@ -703,33 +787,41 @@ static void scoutfs_net_recv_worker(struct work_struct *work)
trace_scoutfs_net_recv_work_exit(sb, 0, ret);
}
static int sendmsg_full(struct socket *sock, void *buf, unsigned len)
/*
* This consumes the caller's kvec, advancing segments as bytes are sent.
*/
static int k_sendmsg_full(struct socket *sock, struct kvec *kv, unsigned long nr_segs, size_t count)
{
struct msghdr msg;
struct kvec kv;
int ret;
int ret = 0;
while (len) {
memset(&msg, 0, sizeof(msg));
msg.msg_flags = MSG_NOSIGNAL;
kv.iov_base = buf;
kv.iov_len = len;
while (count > 0) {
struct msghdr msg = {
.msg_flags = MSG_NOSIGNAL,
};
#ifndef KC_MSGHDR_STRUCT_IOV_ITER
msg.msg_iov = (struct iovec *)&kv;
msg.msg_iovlen = 1;
#else
iov_iter_init(&msg.msg_iter, WRITE, (struct iovec *)&kv, len, 1);
#endif
ret = kernel_sendmsg(sock, &msg, &kv, 1, len);
if (ret <= 0)
return -ECONNABORTED;
ret = kernel_sendmsg(sock, &msg, kv, nr_segs, count);
if (ret <= 0) {
ret = -ECONNABORTED;
break;
}
len -= ret;
buf += ret;
count -= ret;
if (count) {
while (nr_segs > 0 && ret >= kv->iov_len) {
ret -= kv->iov_len;
kv++;
nr_segs--;
}
if (nr_segs > 0 && ret > 0) {
kv->iov_base += ret;
kv->iov_len -= ret;
}
BUG_ON(nr_segs == 0);
}
ret = 0;
}
return 0;
return ret;
}
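
The partial-send bookkeeping is the subtle part: after a short send, whole segments at the front of the kvec array are dropped and the first remaining segment is trimmed by the leftover byte count. The same advance in standalone form:

#include <assert.h>
#include <stddef.h>

struct kv { char *base; size_t len; };

/* Consume 'sent' bytes from the front of an iovec-style array. */
static void advance(struct kv **kvp, size_t *nrp, size_t sent)
{
	struct kv *kv = *kvp;
	size_t nr = *nrp;

	while (nr > 0 && sent >= kv->len) {
		sent -= kv->len;
		kv++;
		nr--;
	}
	if (nr > 0 && sent > 0) {
		kv->base += sent;
		kv->len -= sent;
	}
	*kvp = kv;
	*nrp = nr;
}

int main(void)
{
	char a[10], b[10];
	struct kv segs[2] = { { a, 10 }, { b, 10 } };
	struct kv *kv = segs;
	size_t nr = 2;

	advance(&kv, &nr, 13);	/* all of a, 3 bytes of b */
	assert(nr == 1 && kv->base == b + 3 && kv->len == 7);
	return 0;
}
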
static void free_msend(struct net_info *ninf, struct message_send *msend)
@@ -760,54 +852,73 @@ static void scoutfs_net_send_worker(struct work_struct *work)
struct super_block *sb = conn->sb;
struct net_info *ninf = SCOUTFS_SB(sb)->net_info;
struct message_send *msend;
int ret = 0;
struct message_send *_msend_;
struct kvec kv[16];
unsigned long nr_segs;
size_t count;
int len;
int ret;
trace_scoutfs_net_send_work_enter(sb, 0, 0);
spin_lock(&conn->lock);
while ((msend = list_first_entry_or_null(&conn->send_queue,
struct message_send, head))) {
if (msend->dead) {
free_msend(ninf, msend);
continue;
}
if ((msend->nh.cmd == SCOUTFS_NET_CMD_FAREWELL) &&
nh_is_response(&msend->nh)) {
set_conn_fl(conn, saw_farewell);
}
msend->nh.recv_seq =
cpu_to_le64(atomic64_read(&conn->recv_seq));
spin_unlock(&conn->lock);
len = nh_bytes(le16_to_cpu(msend->nh.data_len));
scoutfs_inc_counter(sb, net_send_messages);
scoutfs_add_counter(sb, net_send_bytes, len);
trace_scoutfs_net_send_message(sb, &conn->sockname,
&conn->peername, &msend->nh);
ret = sendmsg_full(conn->sock, &msend->nh, len);
for (;;) {
nr_segs = 0;
count = 0;
spin_lock(&conn->lock);
list_for_each_entry_safe(msend, _msend_, &conn->send_queue, head) {
if (msend->dead) {
free_msend(ninf, msend);
continue;
}
msend->nh.recv_seq = 0;
len = nh_bytes(le16_to_cpu(msend->nh.data_len));
if (ret)
break;
if ((msend->nh.cmd == SCOUTFS_NET_CMD_FAREWELL) &&
nh_is_response(&msend->nh)) {
set_conn_fl(conn, saw_farewell);
}
/* resend if it wasn't freed while we sent */
if (!msend->dead)
list_move_tail(&msend->head, &conn->resend_queue);
msend->nh.recv_seq = cpu_to_le64(atomic64_read(&conn->recv_seq));
scoutfs_inc_counter(sb, net_send_messages);
scoutfs_add_counter(sb, net_send_bytes, len);
trace_scoutfs_net_send_message(sb, &conn->sockname,
&conn->peername, &msend->nh);
count += len;
kv[nr_segs].iov_base = &msend->nh;
kv[nr_segs].iov_len = len;
if (++nr_segs == ARRAY_SIZE(kv))
break;
}
spin_unlock(&conn->lock);
if (nr_segs == 0) {
ret = 0;
goto out;
}
ret = k_sendmsg_full(conn->sock, kv, nr_segs, count);
if (ret < 0)
goto out;
spin_lock(&conn->lock);
list_for_each_entry_safe(msend, _msend_, &conn->send_queue, head) {
msend->nh.recv_seq = 0;
/* resend if it wasn't freed while we sent */
if (!msend->dead)
list_move_tail(&msend->head, &conn->resend_queue);
if (--nr_segs == 0)
break;
}
spin_unlock(&conn->lock);
}
spin_unlock(&conn->lock);
out:
if (ret) {
scoutfs_inc_counter(sb, net_send_error);
shutdown_conn(conn);
@@ -862,6 +973,7 @@ static void scoutfs_net_destroy_worker(struct work_struct *work)
destroy_workqueue(conn->workq);
scoutfs_tseq_del(&ninf->conn_tseq_tree, &conn->tseq_entry);
kfree(conn->info);
kfree(conn->ordered_proc_wlists);
trace_scoutfs_conn_destroy_free(conn);
kfree(conn);
@@ -887,7 +999,7 @@ static void destroy_conn(struct scoutfs_net_connection *conn)
* The TCP_KEEP* and TCP_USER_TIMEOUT option interaction is subtle.
* TCP_USER_TIMEOUT only applies if there is unacked written data in the
* send queue. It doesn't work if the connection is idle. Adding
* keepalice probes with user_timeout set changes how the keepalive
* keepalive probes with user_timeout set changes how the keepalive
* timeout is calculated. CNT no longer matters. Each time
* additional probes (not the first) are sent the user timeout is
* checked against the last time data was received. If none of the
@@ -899,14 +1011,16 @@ static void destroy_conn(struct scoutfs_net_connection *conn)
* elapses during the probe timer processing after the unsuccessful
* probes.
*/
#define UNRESPONSIVE_TIMEOUT_SECS 10
#define UNRESPONSIVE_PROBES 3
static int sock_opts_and_names(struct scoutfs_net_connection *conn,
static int sock_opts_and_names(struct super_block *sb,
struct scoutfs_net_connection *conn,
struct socket *sock)
{
struct scoutfs_mount_options opts;
int optval;
int ret;
scoutfs_options_read(sb, &opts);
/* we use a keepalive timeout instead of send timeout */
ret = kc_sock_set_sndtimeo(sock, 0);
if (ret)
@@ -919,8 +1033,7 @@ static int sock_opts_and_names(struct scoutfs_net_connection *conn,
if (ret)
goto out;
BUILD_BUG_ON(UNRESPONSIVE_PROBES >= UNRESPONSIVE_TIMEOUT_SECS);
optval = UNRESPONSIVE_TIMEOUT_SECS - (UNRESPONSIVE_PROBES);
optval = (opts.tcp_keepalive_timeout_ms / MSEC_PER_SEC) - UNRESPONSIVE_PROBES;
ret = kc_tcp_sock_set_keepidle(sock, optval);
if (ret)
goto out;
@@ -930,7 +1043,7 @@ static int sock_opts_and_names(struct scoutfs_net_connection *conn,
if (ret)
goto out;
optval = UNRESPONSIVE_TIMEOUT_SECS * MSEC_PER_SEC;
optval = opts.tcp_keepalive_timeout_ms;
ret = kc_tcp_sock_set_user_timeout(sock, optval);
if (ret)
goto out;
@@ -998,7 +1111,7 @@ static void scoutfs_net_listen_worker(struct work_struct *work)
continue;
}
ret = sock_opts_and_names(acc_conn, acc_sock);
ret = sock_opts_and_names(sb, acc_conn, acc_sock);
if (ret) {
sock_release(acc_sock);
destroy_conn(acc_conn);
@@ -1069,7 +1182,7 @@ static void scoutfs_net_connect_worker(struct work_struct *work)
if (ret)
goto out;
ret = sock_opts_and_names(conn, sock);
ret = sock_opts_and_names(sb, conn, sock);
if (ret)
goto out;
@@ -1330,25 +1443,30 @@ scoutfs_net_alloc_conn(struct super_block *sb,
{
struct net_info *ninf = SCOUTFS_SB(sb)->net_info;
struct scoutfs_net_connection *conn;
unsigned int nr;
unsigned int i;
nr = min_t(unsigned int, num_possible_cpus(),
PAGE_SIZE / sizeof(struct scoutfs_work_list));
conn = kzalloc(sizeof(struct scoutfs_net_connection), GFP_NOFS);
if (!conn)
return NULL;
if (info_size) {
conn->info = kzalloc(info_size, GFP_NOFS);
if (!conn->info) {
kfree(conn);
return NULL;
}
if (conn) {
if (info_size)
conn->info = kzalloc(info_size, GFP_NOFS);
conn->ordered_proc_wlists = kmalloc_array(nr, sizeof(struct scoutfs_work_list),
GFP_NOFS);
conn->workq = alloc_workqueue("scoutfs_net_%s",
WQ_UNBOUND | WQ_NON_REENTRANT, 0,
name_suffix);
}
conn->workq = alloc_workqueue("scoutfs_net_%s",
WQ_UNBOUND | WQ_NON_REENTRANT, 0,
name_suffix);
if (!conn->workq) {
kfree(conn->info);
kfree(conn);
if (!conn || (info_size && !conn->info) || !conn->workq || !conn->ordered_proc_wlists) {
if (conn) {
kfree(conn->info);
kfree(conn->ordered_proc_wlists);
if (conn->workq)
destroy_workqueue(conn->workq);
kfree(conn);
}
return NULL;
}
@@ -1378,6 +1496,13 @@ scoutfs_net_alloc_conn(struct super_block *sb,
INIT_DELAYED_WORK(&conn->reconn_free_dwork,
scoutfs_net_reconn_free_worker);
conn->ordered_proc_nr = nr;
for (i = 0; i < nr; i++) {
INIT_WORK(&conn->ordered_proc_wlists[i].work, scoutfs_net_ordered_proc_worker);
spin_lock_init(&conn->ordered_proc_wlists[i].lock);
INIT_LIST_HEAD(&conn->ordered_proc_wlists[i].list);
}
scoutfs_tseq_add(&ninf->conn_tseq_tree, &conn->tseq_entry);
trace_scoutfs_conn_alloc(conn);


@@ -1,10 +1,18 @@
#ifndef _SCOUTFS_NET_H_
#define _SCOUTFS_NET_H_
#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/in.h>
#include "endian_swap.h"
#include "tseq.h"
struct scoutfs_work_list {
struct work_struct work;
spinlock_t lock;
struct list_head list;
};
struct scoutfs_net_connection;
/* These are called in their own blocking context */
@@ -61,6 +69,8 @@ struct scoutfs_net_connection {
struct list_head resend_queue;
atomic64_t recv_seq;
unsigned int ordered_proc_nr;
struct scoutfs_work_list *ordered_proc_wlists;
struct workqueue_struct *workq;
struct work_struct listen_work;


@@ -39,6 +39,7 @@ enum {
Opt_orphan_scan_delay_ms,
Opt_quorum_heartbeat_timeout_ms,
Opt_quorum_slot_nr,
Opt_tcp_keepalive_timeout_ms,
Opt_err,
};
@@ -52,6 +53,7 @@ static const match_table_t tokens = {
{Opt_orphan_scan_delay_ms, "orphan_scan_delay_ms=%s"},
{Opt_quorum_heartbeat_timeout_ms, "quorum_heartbeat_timeout_ms=%s"},
{Opt_quorum_slot_nr, "quorum_slot_nr=%s"},
{Opt_tcp_keepalive_timeout_ms, "tcp_keepalive_timeout_ms=%s"},
{Opt_err, NULL}
};
@@ -126,6 +128,8 @@ static void free_options(struct scoutfs_mount_options *opts)
#define MIN_DATA_PREALLOC_BLOCKS 1ULL
#define MAX_DATA_PREALLOC_BLOCKS ((unsigned long long)SCOUTFS_BLOCK_SM_MAX)
#define DEFAULT_TCP_KEEPALIVE_TIMEOUT_MS (10 * MSEC_PER_SEC)
static void init_default_options(struct scoutfs_mount_options *opts)
{
memset(opts, 0, sizeof(*opts));
@@ -136,6 +140,7 @@ static void init_default_options(struct scoutfs_mount_options *opts)
opts->orphan_scan_delay_ms = -1;
opts->quorum_heartbeat_timeout_ms = SCOUTFS_QUORUM_DEF_HB_TIMEO_MS;
opts->quorum_slot_nr = -1;
opts->tcp_keepalive_timeout_ms = DEFAULT_TCP_KEEPALIVE_TIMEOUT_MS;
}
static int verify_log_merge_wait_timeout_ms(struct super_block *sb, int ret, int val)
@@ -168,6 +173,21 @@ static int verify_quorum_heartbeat_timeout_ms(struct super_block *sb, int ret, u
return 0;
}
static int verify_tcp_keepalive_timeout_ms(struct super_block *sb, int ret, int val)
{
if (ret < 0) {
scoutfs_err(sb, "failed to parse tcp_keepalive_timeout_ms value");
return -EINVAL;
}
if (val <= (UNRESPONSIVE_PROBES * MSEC_PER_SEC)) {
scoutfs_err(sb, "invalid tcp_keepalive_timeout_ms value %d, must be larger than %lu",
val, (UNRESPONSIVE_PROBES * MSEC_PER_SEC));
return -EINVAL;
}
return 0;
}
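
Concretely, with the 10000ms default and three probes, keepidle works out to 10 - 3 = 7 seconds of idle time before the first probe, probes then fire every second, and the user timeout matches the full option value. The arithmetic as a checkable sketch (constants copied from this patch):

#include <assert.h>

#define MSEC_PER_SEC		1000
#define UNRESPONSIVE_PROBES	3

int main(void)
{
	int timeout_ms = 10000;	/* tcp_keepalive_timeout_ms default */
	int keepidle = timeout_ms / MSEC_PER_SEC - UNRESPONSIVE_PROBES;

	assert(keepidle == 7);	/* idle seconds before the first probe */
	/* the verify helper rejects values that would make keepidle <= 0 */
	assert(timeout_ms > UNRESPONSIVE_PROBES * MSEC_PER_SEC);
	return 0;
}
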
/*
* Parse the option string into our options struct. This can allocate
* memory in the struct. The caller is responsible for always calling
@@ -218,6 +238,14 @@ static int parse_options(struct super_block *sb, char *options, struct scoutfs_m
opts->data_prealloc_contig_only = nr;
break;
case Opt_tcp_keepalive_timeout_ms:
ret = match_int(args, &nr);
ret = verify_tcp_keepalive_timeout_ms(sb, ret, nr);
if (ret < 0)
return ret;
opts->tcp_keepalive_timeout_ms = nr;
break;
case Opt_log_merge_wait_timeout_ms:
ret = match_int(args, &nr);
ret = verify_log_merge_wait_timeout_ms(sb, ret, nr);
@@ -371,6 +399,7 @@ int scoutfs_options_show(struct seq_file *seq, struct dentry *root)
seq_printf(seq, ",orphan_scan_delay_ms=%u", opts.orphan_scan_delay_ms);
if (opts.quorum_slot_nr >= 0)
seq_printf(seq, ",quorum_slot_nr=%d", opts.quorum_slot_nr);
seq_printf(seq, ",tcp_keepalive_timeout_ms=%d", opts.tcp_keepalive_timeout_ms);
return 0;
}


@@ -13,8 +13,11 @@ struct scoutfs_mount_options {
unsigned int orphan_scan_delay_ms;
int quorum_slot_nr;
u64 quorum_heartbeat_timeout_ms;
int tcp_keepalive_timeout_ms;
};
#define UNRESPONSIVE_PROBES 3
void scoutfs_options_read(struct super_block *sb, struct scoutfs_mount_options *opts);
int scoutfs_options_show(struct seq_file *seq, struct dentry *root);


@@ -243,10 +243,6 @@ static int send_msg_members(struct super_block *sb, int type, u64 term, int only
};
struct sockaddr_in sin;
struct msghdr mh = {
#ifndef KC_MSGHDR_STRUCT_IOV_ITER
.msg_iov = (struct iovec *)&kv,
.msg_iovlen = 1,
#endif
.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL,
.msg_name = &sin,
.msg_namelen = sizeof(sin),
@@ -268,9 +264,7 @@ static int send_msg_members(struct super_block *sb, int type, u64 term, int only
scoutfs_quorum_slot_sin(&qinf->qconf, i, &sin);
now = ktime_get();
#ifdef KC_MSGHDR_STRUCT_IOV_ITER
iov_iter_init(&mh.msg_iter, WRITE, (struct iovec *)&kv, sizeof(qmes), 1);
#endif
ret = kernel_sendmsg(qinf->sock, &mh, &kv, 1, kv.iov_len);
if (ret != kv.iov_len)
failed++;
@@ -312,10 +306,6 @@ static int recv_msg(struct super_block *sb, struct quorum_host_msg *msg,
.iov_len = sizeof(struct scoutfs_quorum_message),
};
struct msghdr mh = {
#ifndef KC_MSGHDR_STRUCT_IOV_ITER
.msg_iov = (struct iovec *)&kv,
.msg_iovlen = 1,
#endif
.msg_flags = MSG_NOSIGNAL,
};
@@ -333,9 +323,6 @@ static int recv_msg(struct super_block *sb, struct quorum_host_msg *msg,
ret = kc_tcp_sock_set_rcvtimeo(qinf->sock, rel_to);
}
#ifdef KC_MSGHDR_STRUCT_IOV_ITER
iov_iter_init(&mh.msg_iter, READ, (struct iovec *)&kv, sizeof(struct scoutfs_quorum_message), 1);
#endif
ret = kernel_recvmsg(qinf->sock, &mh, &kv, 1, kv.iov_len, mh.msg_flags);
if (ret < 0)
return ret;
@@ -726,6 +713,8 @@ static void scoutfs_quorum_worker(struct work_struct *work)
struct quorum_status qst = {0,};
struct hb_recording hbr;
bool record_hb;
bool recv_failed;
bool initializing = true;
int ret;
int err;
@@ -758,6 +747,8 @@ static void scoutfs_quorum_worker(struct work_struct *work)
update_show_status(qinf, &qst);
recv_failed = false;
ret = recv_msg(sb, &msg, qst.timeout);
if (ret < 0) {
if (ret != -ETIMEDOUT && ret != -EAGAIN) {
@@ -765,6 +756,9 @@ static void scoutfs_quorum_worker(struct work_struct *work)
scoutfs_inc_counter(sb, quorum_recv_error);
goto out;
}
recv_failed = true;
msg.type = SCOUTFS_QUORUM_MSG_INVALID;
ret = 0;
}
@@ -822,12 +816,13 @@ static void scoutfs_quorum_worker(struct work_struct *work)
/* followers and candidates start new election on timeout */
if (qst.role != LEADER &&
(initializing || recv_failed) &&
ktime_after(ktime_get(), qst.timeout)) {
/* .. but only if their server has stopped */
if (!scoutfs_server_is_down(sb)) {
qst.timeout = election_timeout();
scoutfs_inc_counter(sb, quorum_candidate_server_stopping);
continue;
goto again;
}
qst.role = CANDIDATE;
@@ -964,6 +959,9 @@ static void scoutfs_quorum_worker(struct work_struct *work)
}
record_hb_delay(sb, qinf, &hbr, record_hb, qst.role);
again:
initializing = false;
}
update_show_status(qinf, &qst);


@@ -286,6 +286,52 @@ TRACE_EVENT(scoutfs_data_alloc_block_enter,
STE_ENTRY_ARGS(ext))
);
TRACE_EVENT(scoutfs_data_page_mkwrite,
TP_PROTO(struct super_block *sb, __u64 ino, __u64 pos, __u32 ret),
TP_ARGS(sb, ino, pos, ret),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, ino)
__field(__u64, pos)
__field(__u32, ret)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->ino = ino;
__entry->pos = pos;
__entry->ret = ret;
),
TP_printk(SCSBF" ino %llu pos %llu ret %u ",
SCSB_TRACE_ARGS, __entry->ino, __entry->pos, __entry->ret)
);
TRACE_EVENT(scoutfs_data_filemap_fault,
TP_PROTO(struct super_block *sb, __u64 ino, __u64 pos, __u32 ret),
TP_ARGS(sb, ino, pos, ret),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, ino)
__field(__u64, pos)
__field(__u32, ret)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->ino = ino;
__entry->pos = pos;
__entry->ret = ret;
),
TP_printk(SCSBF" ino %llu pos %llu ret %u ",
SCSB_TRACE_ARGS, __entry->ino, __entry->pos, __entry->ret)
);
DECLARE_EVENT_CLASS(scoutfs_data_file_extent_class,
TP_PROTO(struct super_block *sb, __u64 ino, struct scoutfs_extent *ext),
@@ -777,13 +823,14 @@ DEFINE_EVENT(scoutfs_lock_info_class, scoutfs_lock_destroy,
);
TRACE_EVENT(scoutfs_xattr_set,
TP_PROTO(struct super_block *sb, size_t name_len, const void *value,
size_t size, int flags),
TP_PROTO(struct super_block *sb, __u64 ino, size_t name_len,
const void *value, size_t size, int flags),
TP_ARGS(sb, name_len, value, size, flags),
TP_ARGS(sb, ino, name_len, value, size, flags),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, ino)
__field(size_t, name_len)
__field(const void *, value)
__field(size_t, size)
@@ -792,15 +839,16 @@ TRACE_EVENT(scoutfs_xattr_set,
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->ino = ino;
__entry->name_len = name_len;
__entry->value = value;
__entry->size = size;
__entry->flags = flags;
),
TP_printk(SCSBF" name_len %zu value %p size %zu flags 0x%x",
SCSB_TRACE_ARGS, __entry->name_len, __entry->value,
__entry->size, __entry->flags)
TP_printk(SCSBF" ino %llu name_len %zu value %p size %zu flags 0x%x",
SCSB_TRACE_ARGS, __entry->ino, __entry->name_len,
__entry->value, __entry->size, __entry->flags)
);
TRACE_EVENT(scoutfs_advance_dirty_super,
@@ -1046,9 +1094,12 @@ DECLARE_EVENT_CLASS(scoutfs_lock_class,
sk_trace_define(start)
sk_trace_define(end)
__field(u64, refresh_gen)
__field(u64, write_seq)
__field(u64, dirty_trans_seq)
__field(unsigned char, request_pending)
__field(unsigned char, invalidate_pending)
__field(int, mode)
__field(int, invalidating_mode)
__field(unsigned int, waiters_cw)
__field(unsigned int, waiters_pr)
__field(unsigned int, waiters_ex)
@@ -1061,9 +1112,12 @@ DECLARE_EVENT_CLASS(scoutfs_lock_class,
sk_trace_assign(start, &lck->start);
sk_trace_assign(end, &lck->end);
__entry->refresh_gen = lck->refresh_gen;
__entry->write_seq = lck->write_seq;
__entry->dirty_trans_seq = lck->dirty_trans_seq;
__entry->request_pending = lck->request_pending;
__entry->invalidate_pending = lck->invalidate_pending;
__entry->mode = lck->mode;
__entry->invalidating_mode = lck->invalidating_mode;
__entry->waiters_pr = lck->waiters[SCOUTFS_LOCK_READ];
__entry->waiters_ex = lck->waiters[SCOUTFS_LOCK_WRITE];
__entry->waiters_cw = lck->waiters[SCOUTFS_LOCK_WRITE_ONLY];
@@ -1071,10 +1125,11 @@ DECLARE_EVENT_CLASS(scoutfs_lock_class,
__entry->users_ex = lck->users[SCOUTFS_LOCK_WRITE];
__entry->users_cw = lck->users[SCOUTFS_LOCK_WRITE_ONLY];
),
TP_printk(SCSBF" start "SK_FMT" end "SK_FMT" mode %u reqpnd %u invpnd %u rfrgen %llu waiters: pr %u ex %u cw %u users: pr %u ex %u cw %u",
TP_printk(SCSBF" start "SK_FMT" end "SK_FMT" mode %u invmd %u reqp %u invp %u refg %llu wris %llu dts %llu waiters: pr %u ex %u cw %u users: pr %u ex %u cw %u",
SCSB_TRACE_ARGS, sk_trace_args(start), sk_trace_args(end),
__entry->mode, __entry->request_pending,
__entry->invalidate_pending, __entry->refresh_gen,
__entry->mode, __entry->invalidating_mode, __entry->request_pending,
__entry->invalidate_pending, __entry->refresh_gen, __entry->write_seq,
__entry->dirty_trans_seq,
__entry->waiters_pr, __entry->waiters_ex, __entry->waiters_cw,
__entry->users_pr, __entry->users_ex, __entry->users_cw)
);
@@ -1913,15 +1968,17 @@ DEFINE_EVENT(scoutfs_server_client_count_class, scoutfs_server_client_down,
);
DECLARE_EVENT_CLASS(scoutfs_server_commit_users_class,
TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders,
u32 avail_before, u32 freed_before, int committing, int exceeded),
TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, committing,
exceeded),
TP_PROTO(struct super_block *sb, int holding, int applying,
int nr_holders, u32 budget,
u32 avail_before, u32 freed_before,
int committing, int exceeded),
TP_ARGS(sb, holding, applying, nr_holders, budget, avail_before, freed_before, committing, exceeded),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(int, holding)
__field(int, applying)
__field(int, nr_holders)
__field(u32, budget)
__field(__u32, avail_before)
__field(__u32, freed_before)
__field(int, committing)
@@ -1932,35 +1989,45 @@ DECLARE_EVENT_CLASS(scoutfs_server_commit_users_class,
__entry->holding = !!holding;
__entry->applying = !!applying;
__entry->nr_holders = nr_holders;
__entry->budget = budget;
__entry->avail_before = avail_before;
__entry->freed_before = freed_before;
__entry->committing = !!committing;
__entry->exceeded = !!exceeded;
),
TP_printk(SCSBF" holding %u applying %u nr %u avail_before %u freed_before %u committing %u exceeded %u",
SCSB_TRACE_ARGS, __entry->holding, __entry->applying, __entry->nr_holders,
__entry->avail_before, __entry->freed_before, __entry->committing,
__entry->exceeded)
TP_printk(SCSBF" holding %u applying %u nr %u budget %u avail_before %u freed_before %u committing %u exceeded %u",
SCSB_TRACE_ARGS, __entry->holding, __entry->applying,
__entry->nr_holders, __entry->budget,
__entry->avail_before, __entry->freed_before,
__entry->committing, __entry->exceeded)
);
DEFINE_EVENT(scoutfs_server_commit_users_class, scoutfs_server_commit_hold,
TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders,
u32 avail_before, u32 freed_before, int committing, int exceeded),
TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, committing, exceeded)
TP_PROTO(struct super_block *sb, int holding, int applying,
int nr_holders, u32 budget,
u32 avail_before, u32 freed_before,
int committing, int exceeded),
TP_ARGS(sb, holding, applying, nr_holders, budget, avail_before, freed_before, committing, exceeded)
);
DEFINE_EVENT(scoutfs_server_commit_users_class, scoutfs_server_commit_apply,
TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders,
u32 avail_before, u32 freed_before, int committing, int exceeded),
TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, committing, exceeded)
TP_PROTO(struct super_block *sb, int holding, int applying,
int nr_holders, u32 budget,
u32 avail_before, u32 freed_before,
int committing, int exceeded),
TP_ARGS(sb, holding, applying, nr_holders, budget, avail_before, freed_before, committing, exceeded)
);
DEFINE_EVENT(scoutfs_server_commit_users_class, scoutfs_server_commit_start,
TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders,
u32 avail_before, u32 freed_before, int committing, int exceeded),
TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, committing, exceeded)
TP_PROTO(struct super_block *sb, int holding, int applying,
int nr_holders, u32 budget,
u32 avail_before, u32 freed_before,
int committing, int exceeded),
TP_ARGS(sb, holding, applying, nr_holders, budget, avail_before, freed_before, committing, exceeded)
);
DEFINE_EVENT(scoutfs_server_commit_users_class, scoutfs_server_commit_end,
TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders,
u32 avail_before, u32 freed_before, int committing, int exceeded),
TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, committing, exceeded)
TP_PROTO(struct super_block *sb, int holding, int applying,
int nr_holders, u32 budget,
u32 avail_before, u32 freed_before,
int committing, int exceeded),
TP_ARGS(sb, holding, applying, nr_holders, budget, avail_before, freed_before, committing, exceeded)
);
#define slt_symbolic(mode) \
@@ -2398,6 +2465,27 @@ TRACE_EVENT(scoutfs_block_dirty_ref,
__entry->block_blkno, __entry->block_seq)
);
TRACE_EVENT(scoutfs_get_file_block,
TP_PROTO(struct super_block *sb, u64 blkno, int flags),
TP_ARGS(sb, blkno, flags),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, blkno)
__field(int, flags)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->blkno = blkno;
__entry->flags = flags;
),
TP_printk(SCSBF" blkno %llu flags 0x%x",
SCSB_TRACE_ARGS, __entry->blkno, __entry->flags)
);
TRACE_EVENT(scoutfs_block_stale,
TP_PROTO(struct super_block *sb, struct scoutfs_block_ref *ref,
struct scoutfs_block_header *hdr, u32 magic, u32 crc),
@@ -2438,8 +2526,8 @@ TRACE_EVENT(scoutfs_block_stale,
DECLARE_EVENT_CLASS(scoutfs_block_class,
TP_PROTO(struct super_block *sb, void *bp, u64 blkno, int refcount, int io_count,
unsigned long bits, __u64 accessed),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed),
unsigned long bits),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(void *, bp)
@@ -2447,7 +2535,6 @@ DECLARE_EVENT_CLASS(scoutfs_block_class,
__field(int, refcount)
__field(int, io_count)
__field(long, bits)
__field(__u64, accessed)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
@@ -2456,71 +2543,65 @@ DECLARE_EVENT_CLASS(scoutfs_block_class,
__entry->refcount = refcount;
__entry->io_count = io_count;
__entry->bits = bits;
__entry->accessed = accessed;
),
TP_printk(SCSBF" bp %p blkno %llu refcount %d io_count %d bits 0x%lx accessed %llu",
TP_printk(SCSBF" bp %p blkno %llu refcount %x io_count %d bits 0x%lx",
SCSB_TRACE_ARGS, __entry->bp, __entry->blkno, __entry->refcount,
__entry->io_count, __entry->bits, __entry->accessed)
__entry->io_count, __entry->bits)
);
DEFINE_EVENT(scoutfs_block_class, scoutfs_block_allocate,
TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
int refcount, int io_count, unsigned long bits,
__u64 accessed),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
int refcount, int io_count, unsigned long bits),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
);
DEFINE_EVENT(scoutfs_block_class, scoutfs_block_free,
TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
int refcount, int io_count, unsigned long bits,
__u64 accessed),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
int refcount, int io_count, unsigned long bits),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
);
DEFINE_EVENT(scoutfs_block_class, scoutfs_block_insert,
TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
int refcount, int io_count, unsigned long bits,
__u64 accessed),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
int refcount, int io_count, unsigned long bits),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
);
DEFINE_EVENT(scoutfs_block_class, scoutfs_block_remove,
TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
int refcount, int io_count, unsigned long bits,
__u64 accessed),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
int refcount, int io_count, unsigned long bits),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
);
DEFINE_EVENT(scoutfs_block_class, scoutfs_block_end_io,
TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
int refcount, int io_count, unsigned long bits,
__u64 accessed),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
int refcount, int io_count, unsigned long bits),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
);
DEFINE_EVENT(scoutfs_block_class, scoutfs_block_submit,
TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
int refcount, int io_count, unsigned long bits,
__u64 accessed),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
int refcount, int io_count, unsigned long bits),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
);
DEFINE_EVENT(scoutfs_block_class, scoutfs_block_invalidate,
TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
int refcount, int io_count, unsigned long bits,
__u64 accessed),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
int refcount, int io_count, unsigned long bits),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
);
DEFINE_EVENT(scoutfs_block_class, scoutfs_block_mark_dirty,
TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
int refcount, int io_count, unsigned long bits,
__u64 accessed),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
int refcount, int io_count, unsigned long bits),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
);
DEFINE_EVENT(scoutfs_block_class, scoutfs_block_forget,
TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
int refcount, int io_count, unsigned long bits,
__u64 accessed),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
int refcount, int io_count, unsigned long bits),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
);
DEFINE_EVENT(scoutfs_block_class, scoutfs_block_shrink,
TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
int refcount, int io_count, unsigned long bits,
__u64 accessed),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
int refcount, int io_count, unsigned long bits),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
);
DEFINE_EVENT(scoutfs_block_class, scoutfs_block_isolate,
TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
int refcount, int io_count, unsigned long bits),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
);
DECLARE_EVENT_CLASS(scoutfs_ext_next_class,
@@ -2995,6 +3076,27 @@ DEFINE_EVENT(scoutfs_srch_compact_class, scoutfs_srch_compact_client_recv,
TP_ARGS(sb, sc)
);
TRACE_EVENT(scoutfs_ioc_search_xattrs,
TP_PROTO(struct super_block *sb, u64 ino, u64 last_ino),
TP_ARGS(sb, ino, last_ino),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(u64, ino)
__field(u64, last_ino)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->ino = ino;
__entry->last_ino = last_ino;
),
TP_printk(SCSBF" ino %llu last_ino %llu", SCSB_TRACE_ARGS,
__entry->ino, __entry->last_ino)
);
#endif /* _TRACE_SCOUTFS_H */
/* This part must be outside protection */


@@ -65,6 +65,7 @@ struct commit_users {
struct list_head holding;
struct list_head applying;
unsigned int nr_holders;
u32 budget;
u32 avail_before;
u32 freed_before;
bool committing;
@@ -84,8 +85,9 @@ static void init_commit_users(struct commit_users *cusers)
do { \
__typeof__(cusers) _cusers = (cusers); \
trace_scoutfs_server_commit_##which(sb, !list_empty(&_cusers->holding), \
!list_empty(&_cusers->applying), _cusers->nr_holders, _cusers->avail_before, \
_cusers->freed_before, _cusers->committing, _cusers->exceeded); \
!list_empty(&_cusers->applying), _cusers->nr_holders, _cusers->budget, \
_cusers->avail_before, _cusers->freed_before, _cusers->committing, \
_cusers->exceeded); \
} while (0)
struct server_info {
@@ -303,7 +305,6 @@ static void check_holder_budget(struct super_block *sb, struct server_info *serv
u32 freed_used;
u32 avail_now;
u32 freed_now;
u32 budget;
assert_spin_locked(&cusers->lock);
@@ -318,15 +319,14 @@ static void check_holder_budget(struct super_block *sb, struct server_info *serv
else
freed_used = SCOUTFS_ALLOC_LIST_MAX_BLOCKS - freed_now;
budget = cusers->nr_holders * COMMIT_HOLD_ALLOC_BUDGET;
if (avail_used <= budget && freed_used <= budget)
if (avail_used <= cusers->budget && freed_used <= cusers->budget)
return;
exceeded_once = true;
cusers->exceeded = cusers->nr_holders;
scoutfs_err(sb, "%u holders exceeded alloc budget av: bef %u now %u, fr: bef %u now %u",
cusers->nr_holders, cusers->avail_before, avail_now,
scoutfs_err(sb, "holders exceeded alloc budget %u av: bef %u now %u, fr: bef %u now %u",
cusers->budget, cusers->avail_before, avail_now,
cusers->freed_before, freed_now);
list_for_each_entry(hold, &cusers->holding, entry) {
@@ -349,7 +349,7 @@ static bool hold_commit(struct super_block *sb, struct server_info *server,
{
bool has_room;
bool held;
u32 budget;
u32 new_budget;
u32 av;
u32 fr;
@@ -367,8 +367,8 @@ static bool hold_commit(struct super_block *sb, struct server_info *server,
}
/* +2 for our additional hold and then for the final commit work the server does */
budget = (cusers->nr_holders + 2) * COMMIT_HOLD_ALLOC_BUDGET;
has_room = av >= budget && fr >= budget;
new_budget = max(cusers->budget, (cusers->nr_holders + 2) * COMMIT_HOLD_ALLOC_BUDGET);
has_room = av >= new_budget && fr >= new_budget;
/* checking applying so holders drain once an apply caller starts waiting */
held = !cusers->committing && has_room && list_empty(&cusers->applying);
@@ -388,6 +388,7 @@ static bool hold_commit(struct super_block *sb, struct server_info *server,
list_add_tail(&hold->entry, &cusers->holding);
cusers->nr_holders++;
cusers->budget = new_budget;
} else if (!has_room && cusers->nr_holders == 0 && !cusers->committing) {
cusers->committing = true;
@@ -516,6 +517,7 @@ static void commit_end(struct super_block *sb, struct commit_users *cusers, int
list_for_each_entry_safe(hold, tmp, &cusers->applying, entry)
list_del_init(&hold->entry);
cusers->committing = false;
cusers->budget = 0;
spin_unlock(&cusers->lock);
wake_up(&cusers->waitq);
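
The effect of the max() in hold_commit() and the reset in commit_end() is a budget that only grows while holders pile on, then drops back to zero between commits. A sketch of that monotonic update (HOLD_BUDGET is an illustrative constant, not COMMIT_HOLD_ALLOC_BUDGET's real value):

#include <assert.h>
#include <stdint.h>

#define HOLD_BUDGET 32	/* stand-in for COMMIT_HOLD_ALLOC_BUDGET */

struct cusers { unsigned int nr_holders; uint32_t budget; };

static void hold(struct cusers *c)
{
	uint32_t nb = (c->nr_holders + 2) * HOLD_BUDGET;

	if (nb > c->budget)
		c->budget = nb;	/* max(): never shrinks mid-commit */
	c->nr_holders++;
}

int main(void)
{
	struct cusers c = { 0, 0 };

	hold(&c);		/* 1 holder: (0 + 2) * 32 = 64 */
	assert(c.budget == 64);
	hold(&c);		/* 2 holders: (1 + 2) * 32 = 96 */
	assert(c.budget == 96);
	c.budget = 0;		/* commit_end() resets between commits */
	return 0;
}
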
@@ -608,7 +610,7 @@ static void scoutfs_server_commit_func(struct work_struct *work)
goto out;
if (scoutfs_forcing_unmount(sb)) {
ret = -EIO;
ret = -ENOLINK;
goto out;
}
@@ -1038,6 +1040,101 @@ static int next_log_merge_item(struct super_block *sb,
return next_log_merge_item_key(sb, root, zone, &key, val, val_len);
}
static int do_finalize_ours(struct super_block *sb,
struct scoutfs_log_trees *lt,
struct commit_hold *hold)
{
struct server_info *server = SCOUTFS_SB(sb)->server_info;
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
struct scoutfs_key key;
char *err_str = NULL;
u64 rid = le64_to_cpu(lt->rid);
bool more;
int ret;
int err;
mutex_lock(&server->srch_mutex);
ret = scoutfs_srch_rotate_log(sb, &server->alloc, &server->wri,
&super->srch_root, &lt->srch_file, true);
mutex_unlock(&server->srch_mutex);
if (ret < 0) {
scoutfs_err(sb, "error rotating srch log for rid %016llx: %d",
rid, ret);
return ret;
}
do {
more = false;
/*
* All of these can return errors, perhaps indicating successful
* partial progress, after having modified the allocator trees.
* We always have to update the roots in the log item.
*/
mutex_lock(&server->alloc_mutex);
ret = (err_str = "splice meta_freed to other_freed",
scoutfs_alloc_splice_list(sb, &server->alloc,
&server->wri, server->other_freed,
&lt->meta_freed)) ?:
(err_str = "splice meta_avail",
scoutfs_alloc_splice_list(sb, &server->alloc,
&server->wri, server->other_freed,
&lt->meta_avail)) ?:
(err_str = "empty data_avail",
alloc_move_empty(sb, &super->data_alloc,
&lt->data_avail,
COMMIT_HOLD_ALLOC_BUDGET / 2)) ?:
(err_str = "empty data_freed",
alloc_move_empty(sb, &super->data_alloc,
&lt->data_freed,
COMMIT_HOLD_ALLOC_BUDGET / 2));
mutex_unlock(&server->alloc_mutex);
/*
* only finalize, allowing merging, once the allocators are
* fully freed
*/
if (ret == 0) {
/* the transaction is no longer open */
le64_add_cpu(&lt->flags, SCOUTFS_LOG_TREES_FINALIZED);
lt->finalize_seq = cpu_to_le64(scoutfs_server_next_seq(sb));
}
scoutfs_key_init_log_trees(&key, rid, le64_to_cpu(lt->nr));
err = scoutfs_btree_update(sb, &server->alloc, &server->wri,
&super->logs_root, &key, lt,
sizeof(*lt));
BUG_ON(err != 0); /* alloc, log, srch items out of sync */
if (ret == -EINPROGRESS) {
more = true;
mutex_unlock(&server->logs_mutex);
ret = server_apply_commit(sb, hold, 0);
WARN_ON_ONCE(ret < 0);
server_hold_commit(sb, hold);
mutex_lock(&server->logs_mutex);
} else if (ret == 0) {
memset(&lt->item_root, 0, sizeof(lt->item_root));
memset(&lt->bloom_ref, 0, sizeof(lt->bloom_ref));
lt->inode_count_delta = 0;
lt->max_item_seq = 0;
lt->finalize_seq = 0;
le64_add_cpu(&lt->nr, 1);
lt->flags = 0;
}
} while (more);
if (ret < 0) {
scoutfs_err(sb,
"error %d finalizing log trees for rid %016llx: %s",
ret, rid, err_str);
}
return ret;
}
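
The -EINPROGRESS handling above follows a budgeted-work pattern: do a bounded chunk, persist it with an apply/hold cycle, and loop until the move functions report completion. Stripped of the server machinery, the control flow looks like this (do_chunk() is a hypothetical stand-in for the alloc moves):

#include <errno.h>
#include <stdbool.h>

/* Do at most 'budget' units of work; report partial progress. */
static int do_chunk(int *remaining, int budget)
{
	*remaining -= (*remaining < budget) ? *remaining : budget;
	return *remaining ? -EINPROGRESS : 0;
}

static int drain(int remaining, int budget)
{
	bool more;
	int ret;

	do {
		more = false;
		/* server_hold_commit() would go here */
		ret = do_chunk(&remaining, budget);
		if (ret == -EINPROGRESS) {
			more = true;
			/* server_apply_commit() persists the partial work */
			ret = 0;
		}
	} while (more);
	return ret;
}

int main(void)
{
	return drain(100, 32);	/* chunks of 32 until done */
}
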
/*
* Finalizing the log btrees for merging needs to be done carefully so
* that items don't appear to go backwards in time.
@@ -1089,7 +1186,6 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
struct scoutfs_log_merge_range rng;
struct scoutfs_mount_options opts;
struct scoutfs_log_trees each_lt;
struct scoutfs_log_trees fin;
unsigned int delay_ms;
unsigned long timeo;
bool saw_finalized;
@@ -1160,6 +1256,7 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
/* done if we're not finalizing and there's no finalized */
if (!finalize_ours && !saw_finalized) {
ret = 0;
scoutfs_inc_counter(sb, log_merge_no_finalized);
break;
}
@@ -1194,32 +1291,11 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
/* Finalize ours if it's visible to others */
if (ours_visible) {
fin = *lt;
memset(&fin.meta_avail, 0, sizeof(fin.meta_avail));
memset(&fin.meta_freed, 0, sizeof(fin.meta_freed));
memset(&fin.data_avail, 0, sizeof(fin.data_avail));
memset(&fin.data_freed, 0, sizeof(fin.data_freed));
memset(&fin.srch_file, 0, sizeof(fin.srch_file));
le64_add_cpu(&fin.flags, SCOUTFS_LOG_TREES_FINALIZED);
fin.finalize_seq = cpu_to_le64(scoutfs_server_next_seq(sb));
scoutfs_key_init_log_trees(&key, le64_to_cpu(fin.rid),
le64_to_cpu(fin.nr));
ret = scoutfs_btree_update(sb, &server->alloc, &server->wri,
&super->logs_root, &key, &fin,
sizeof(fin));
ret = do_finalize_ours(sb, lt, hold);
if (ret < 0) {
err_str = "updating finalized log_trees";
err_str = "finalizing ours";
break;
}
memset(&lt->item_root, 0, sizeof(lt->item_root));
memset(&lt->bloom_ref, 0, sizeof(lt->bloom_ref));
lt->inode_count_delta = 0;
lt->max_item_seq = 0;
lt->finalize_seq = 0;
le64_add_cpu(&lt->nr, 1);
lt->flags = 0;
}
/* wait a bit for mounts to arrive */
@@ -1299,12 +1375,10 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
* is nested inside holding commits so we recheck the persistent item
* each time we commit to make sure it's still what we think. The
* caller is still going to send the item to the client so we update the
* caller's each time we make progress. This is a best-effort attempt
* to clean up and it's valid to leave extents in data_freed we don't
* return errors to the caller. The client will continue the work later
* in get_log_trees or as the rid is reclaimed.
* caller's each time we make progress. If we hit an error applying the
* changes we make then we can't send the log_trees to the client.
*/
static void try_drain_data_freed(struct super_block *sb, struct scoutfs_log_trees *lt)
static int try_drain_data_freed(struct super_block *sb, struct scoutfs_log_trees *lt)
{
DECLARE_SERVER_INFO(sb, server);
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
@@ -1313,6 +1387,7 @@ static void try_drain_data_freed(struct super_block *sb, struct scoutfs_log_tree
struct scoutfs_log_trees drain;
struct scoutfs_key key;
COMMIT_HOLD(hold);
bool apply = false;
int ret = 0;
int err;
@@ -1321,22 +1396,27 @@ static void try_drain_data_freed(struct super_block *sb, struct scoutfs_log_tree
while (lt->data_freed.total_len != 0) {
server_hold_commit(sb, &hold);
mutex_lock(&server->logs_mutex);
apply = true;
ret = find_log_trees_item(sb, &super->logs_root, false, rid, U64_MAX, &drain);
if (ret < 0)
if (ret < 0) {
ret = 0;
break;
}
/* careful to only keep draining the caller's specific open trans */
if (drain.nr != lt->nr || drain.get_trans_seq != lt->get_trans_seq ||
drain.commit_trans_seq != lt->commit_trans_seq || drain.flags != lt->flags) {
ret = -ENOENT;
ret = 0;
break;
}
ret = scoutfs_btree_dirty(sb, &server->alloc, &server->wri,
&super->logs_root, &key);
if (ret < 0)
if (ret < 0) {
ret = 0;
break;
}
/* moving can modify and return errors, always update caller and item */
mutex_lock(&server->alloc_mutex);
@@ -1352,19 +1432,19 @@ static void try_drain_data_freed(struct super_block *sb, struct scoutfs_log_tree
BUG_ON(err < 0); /* dirtying must guarantee success */
mutex_unlock(&server->logs_mutex);
ret = server_apply_commit(sb, &hold, ret);
if (ret < 0) {
ret = 0; /* don't try to abort, ignoring ret */
apply = false;
if (ret < 0)
break;
}
}
/* try to cleanly abort and write any partial dirty btree blocks, but ignore result */
if (ret < 0) {
if (apply) {
mutex_unlock(&server->logs_mutex);
server_apply_commit(sb, &hold, 0);
server_apply_commit(sb, &hold, ret);
}
return ret;
}
/*
@@ -1572,9 +1652,9 @@ out:
scoutfs_err(sb, "error %d getting log trees for rid %016llx: %s",
ret, rid, err_str);
/* try to drain excessive data_freed with additional commits, if needed, ignoring err */
/* try to drain excessive data_freed with additional commits, if needed */
if (ret == 0)
try_drain_data_freed(sb, &lt);
ret = try_drain_data_freed(sb, &lt);
return scoutfs_net_response(sb, conn, cmd, id, ret, &lt, sizeof(lt));
}
@@ -1674,8 +1754,8 @@ unlock:
ret = server_apply_commit(sb, &hold, ret);
if (ret < 0)
scoutfs_err(sb, "server error %d committing client logs for rid %016llx: %s",
ret, rid, err_str);
scoutfs_err(sb, "server error %d committing client logs for rid %016llx, nr %llu: %s",
ret, rid, le64_to_cpu(lt.nr), err_str);
out:
WARN_ON_ONCE(ret < 0);
return scoutfs_net_response(sb, conn, cmd, id, ret, NULL, 0);
@@ -1810,6 +1890,9 @@ static int reclaim_open_log_tree(struct super_block *sb, u64 rid)
out:
mutex_unlock(&server->logs_mutex);
if (ret == 0)
scoutfs_inc_counter(sb, reclaimed_open_logs);
if (ret < 0 && ret != -EINPROGRESS)
scoutfs_err(sb, "server error %d reclaiming log trees for rid %016llx: %s",
ret, rid, err_str);
@@ -2527,7 +2610,7 @@ static void server_log_merge_free_work(struct work_struct *work)
ret = scoutfs_btree_free_blocks(sb, &server->alloc,
&server->wri, &fr.key,
&fr.root, COMMIT_HOLD_ALLOC_BUDGET / 2);
&fr.root, COMMIT_HOLD_ALLOC_BUDGET / 8);
if (ret < 0) {
err_str = "freeing log btree";
break;
@@ -2546,7 +2629,7 @@ static void server_log_merge_free_work(struct work_struct *work)
/* freed blocks are in allocator, we *have* to update fr */
BUG_ON(ret < 0);
if (server_hold_alloc_used_since(sb, &hold) >= COMMIT_HOLD_ALLOC_BUDGET / 2) {
if (server_hold_alloc_used_since(sb, &hold) >= (COMMIT_HOLD_ALLOC_BUDGET * 3) / 4) {
mutex_unlock(&server->logs_mutex);
ret = server_apply_commit(sb, &hold, ret);
commit = false;
@@ -4149,7 +4232,7 @@ static void fence_pending_recov_worker(struct work_struct *work)
struct server_info *server = container_of(work, struct server_info,
fence_pending_recov_work);
struct super_block *sb = server->sb;
union scoutfs_inet_addr addr;
union scoutfs_inet_addr addr = {{0,}};
u64 rid = 0;
int ret = 0;

kmod/src/sparse-filtered.sh Executable file

@@ -0,0 +1,45 @@
#!/bin/bash
#
# Unfortunately, kernels can ship which contain sparse errors that are
# unrelated to us.
#
# The exit status of this filtering wrapper will indicate an error if
# sparse wasn't found or if there were any unfiltered output lines. It
# can hide error exit status from sparse or grep if they don't produce
# output that makes it past the filters.
#
# sparse must be installed; exit with failure if it's missing.
which sparse > /dev/null || exit 1
# initially unmatchable; additional filters are appended as RE+="|..."
RE="$^"
#
# Darn. sparse has multi-line error messages, and I'd rather not bother
# with multi-line filters. So we'll just drop this context.
#
# command-line: note: in included file (through include/linux/netlink.h, include/linux/ethtool.h, include/linux/netdevice.h, include/net/sock.h, /root/scoutfs/kmod/src/kernelcompat.h, builtin):
# fprintf(stderr, "%s: note: in included file%s:\n",
#
RE+="|: note: in included file"
# 3.10.0-1160.119.1.el7.x86_64.debug
# include/linux/posix_acl.h:138:9: warning: incorrect type in assignment (different address spaces)
# include/linux/posix_acl.h:138:9: expected struct posix_acl *<noident>
# include/linux/posix_acl.h:138:9: got struct posix_acl [noderef] <asn:4>*<noident>
RE+="|include/linux/posix_acl.h:"
# 3.10.0-1160.119.1.el7.x86_64.debug
#include/uapi/linux/perf_event.h:146:56: warning: cast truncates bits from constant value (8000000000000000 becomes 0)
RE+="|include/uapi/linux/perf_event.h:"
# 4.18.0-513.24.1.el8_9.x86_64+debug'
#./include/linux/skbuff.h:824:1: warning: directive in macro's argument list
RE+="|include/linux/skbuff.h:"
sparse "$@" |& \
grep -E -v "($RE)" |& \
awk '{ print $0 } END { exit NR > 0 }'
exit $?


@@ -62,7 +62,7 @@
* re-allocated and re-written. Search can restart by checking the
* btree for the current set of files. Compaction reads log files which
* are protected from other compactions by the persistent busy items
* created by the server. Compaction won't see it's blocks reused out
* created by the server. Compaction won't see its blocks reused out
* from under it, but it can encounter stale cached blocks that need to
* be invalidated.
*/
@@ -442,6 +442,10 @@ out:
if (ret == 0 && (flags & GFB_INSERT) && blk >= le64_to_cpu(sfl->blocks))
sfl->blocks = cpu_to_le64(blk + 1);
if (bl) {
trace_scoutfs_get_file_block(sb, bl->blkno, flags);
}
*bl_ret = bl;
return ret;
}
@@ -749,14 +753,14 @@ static int search_log_file(struct super_block *sb,
for (i = 0; i < le32_to_cpu(srb->entry_nr); i++) {
if (pos > SCOUTFS_SRCH_BLOCK_SAFE_BYTES) {
/* can only be inconsistency :/ */
ret = EIO;
ret = -EIO;
break;
}
ret = decode_entry(srb->entries + pos, &sre, &prev);
if (ret <= 0) {
/* can only be inconsistency :/ */
ret = EIO;
ret = -EIO;
break;
}
pos += ret;
@@ -859,14 +863,14 @@ static int search_sorted_file(struct super_block *sb,
if (pos > SCOUTFS_SRCH_BLOCK_SAFE_BYTES) {
/* can only be inconsistency :/ */
ret = EIO;
ret = -EIO;
break;
}
ret = decode_entry(srb->entries + pos, &sre, &prev);
if (ret <= 0) {
/* can only be inconsistency :/ */
ret = EIO;
ret = -EIO;
break;
}
pos += ret;
@@ -972,6 +976,8 @@ int scoutfs_srch_search_xattrs(struct super_block *sb,
scoutfs_inc_counter(sb, srch_search_xattrs);
trace_scoutfs_ioc_search_xattrs(sb, ino, last_ino);
*done = false;
srch_init_rb_root(sroot);
@@ -1802,7 +1808,7 @@ static void swap_page_sre(void *A, void *B, int size)
* typically, ~10x worst case).
*
* Because we read and sort all the input files we must perform the full
* compaction in one operation. The server must have given us a
* compaction in one operation. The server must have given us
* sufficiently large avail/freed lists, otherwise we'll return ENOSPC.
*/
static int compact_logs(struct super_block *sb,
@@ -1866,14 +1872,14 @@ static int compact_logs(struct super_block *sb,
if (pos > SCOUTFS_SRCH_BLOCK_SAFE_BYTES) {
/* can only be inconsistency :/ */
ret = EIO;
ret = -EIO;
break;
}
ret = decode_entry(srb->entries + pos, sre, &prev);
if (ret <= 0) {
/* can only be inconsistency :/ */
ret = EIO;
ret = -EIO;
goto out;
}
prev = *sre;


@@ -160,11 +160,17 @@ static void scoutfs_metadev_close(struct super_block *sb)
* from kill_sb->put_super.
*/
lockdep_off();
#ifdef KC_BDEV_FILE_OPEN_BY_PATH
bdev_fput(sbi->meta_bdev_file);
#else
#ifdef KC_BLKDEV_PUT_HOLDER_ARG
blkdev_put(sbi->meta_bdev, sb);
#else
blkdev_put(sbi->meta_bdev, SCOUTFS_META_BDEV_MODE);
#endif
#endif
lockdep_on();
sbi->meta_bdev = NULL;
}
@@ -481,7 +487,11 @@ out:
static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
{
struct scoutfs_mount_options opts;
#ifdef KC_BDEV_FILE_OPEN_BY_PATH
struct file *meta_bdev_file;
#else
struct block_device *meta_bdev;
#endif
struct scoutfs_sb_info *sbi;
struct inode *inode;
int ret;
@@ -527,6 +537,22 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
goto out;
}
#ifdef KC_BDEV_FILE_OPEN_BY_PATH
/*
* pass sbi as holder, since dev_mount already passes sb, which triggers a
* WARN_ON because dev_mount also passes non-NULL hops. By passing sbi
* here we just get a simple error in our test cases.
*/
meta_bdev_file = bdev_file_open_by_path(opts.metadev_path, SCOUTFS_META_BDEV_MODE, sbi, NULL);
if (IS_ERR(meta_bdev_file)) {
scoutfs_err(sb, "could not open metadev: error %ld",
PTR_ERR(meta_bdev_file));
ret = PTR_ERR(meta_bdev_file);
goto out;
}
sbi->meta_bdev_file = meta_bdev_file;
sbi->meta_bdev = file_bdev(meta_bdev_file);
#else
#ifdef KC_BLKDEV_PUT_HOLDER_ARG
meta_bdev = blkdev_get_by_path(opts.metadev_path, SCOUTFS_META_BDEV_MODE, sb, NULL);
#else
@@ -539,6 +565,8 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
goto out;
}
sbi->meta_bdev = meta_bdev;
#endif
ret = set_blocksize(sbi->meta_bdev, SCOUTFS_BLOCK_SM_SIZE);
if (ret != 0) {
scoutfs_err(sb, "failed to set metadev blocksize, returned %d",
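
Taken together, the super.c hunks pair the metadev open with its matching close across three generations of the kernel block-device API, selected by the same KC_ compat macros. A condensed sketch of the pairing (error handling and intervening code elided):

	#ifdef KC_BDEV_FILE_OPEN_BY_PATH
		/* newest: a struct file wraps the block device */
		f = bdev_file_open_by_path(path, mode, holder, NULL);
		bdev = file_bdev(f);
		/* ... */
		bdev_fput(f);
	#elif defined(KC_BLKDEV_PUT_HOLDER_ARG)
		/* blkdev_put() takes the holder that was passed at open */
		bdev = blkdev_get_by_path(path, mode, holder, NULL);
		/* ... */
		blkdev_put(bdev, holder);
	#else
		/* oldest: blkdev_put() takes the open mode */
		bdev = blkdev_get_by_path(path, mode, holder);
		/* ... */
		blkdev_put(bdev, mode);
	#endif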

View File

@@ -42,6 +42,9 @@ struct scoutfs_sb_info {
u64 fmt_vers;
struct block_device *meta_bdev;
#ifdef KC_BDEV_FILE_OPEN_BY_PATH
struct file *meta_bdev_file;
#endif
spinlock_t next_ino_lock;

View File

@@ -159,6 +159,58 @@ static bool drained_holders(struct trans_info *tri)
return holders == 0;
}
static int commit_current_log_trees(struct super_block *sb, char **str)
{
DECLARE_TRANS_INFO(sb, tri);
return (*str = "data submit", scoutfs_inode_walk_writeback(sb, true)) ?:
(*str = "item dirty", scoutfs_item_write_dirty(sb)) ?:
(*str = "data prepare", scoutfs_data_prepare_commit(sb)) ?:
(*str = "alloc prepare", scoutfs_alloc_prepare_commit(sb, &tri->alloc, &tri->wri)) ?:
(*str = "meta write", scoutfs_block_writer_write(sb, &tri->wri)) ?:
(*str = "data wait", scoutfs_inode_walk_writeback(sb, false)) ?:
(*str = "commit log trees", commit_btrees(sb)) ?:
scoutfs_item_write_done(sb);
}
static int get_next_log_trees(struct super_block *sb, char **str)
{
return (*str = "get log trees", scoutfs_trans_get_log_trees(sb));
}
static int retry_forever(struct super_block *sb, int (*func)(struct super_block *sb, char **str))
{
bool retrying = false;
char *str;
int ret;
do {
str = NULL;
ret = func(sb, &str);
if (ret < 0) {
if (!retrying) {
scoutfs_warn(sb, "critical transaction commit failure: %s = %d, retrying",
str, ret);
retrying = true;
}
if (scoutfs_forcing_unmount(sb)) {
ret = -ENOLINK;
break;
}
msleep(2 * MSEC_PER_SEC);
} else if (retrying) {
scoutfs_info(sb, "retried transaction commit succeeded");
}
} while (ret < 0);
return ret;
}
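
The helpers above lean on two C idioms: the comma operator stores a stage label immediately before each call, and GCC's `a ?: b` extension chains the calls so that the first nonzero (failing) return short-circuits the remaining stages. A minimal standalone sketch of the pattern (stage names and functions are illustrative):

	static int stage_one(void);
	static int stage_two(void);

	static int run_stages(char **str)
	{
		/* on failure, *str names the stage whose return stopped the chain */
		return (*str = "stage one", stage_one()) ?:
		       (*str = "stage two", stage_two());
	}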
/*
* This work func is responsible for writing out all the dirty blocks
* that make up the current dirty transaction. It prevents writers from
@@ -184,8 +236,6 @@ void scoutfs_trans_write_func(struct work_struct *work)
struct trans_info *tri = container_of(work, struct trans_info, write_work.work);
struct super_block *sb = tri->sb;
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
bool retrying = false;
char *s = NULL;
int ret = 0;
tri->task = current;
@@ -202,7 +252,7 @@ void scoutfs_trans_write_func(struct work_struct *work)
}
if (scoutfs_forcing_unmount(sb)) {
ret = -EIO;
ret = -ENOLINK;
goto out;
}
@@ -214,37 +264,9 @@ void scoutfs_trans_write_func(struct work_struct *work)
scoutfs_inc_counter(sb, trans_commit_written);
do {
ret = (s = "data submit", scoutfs_inode_walk_writeback(sb, true)) ?:
(s = "item dirty", scoutfs_item_write_dirty(sb)) ?:
(s = "data prepare", scoutfs_data_prepare_commit(sb)) ?:
(s = "alloc prepare", scoutfs_alloc_prepare_commit(sb, &tri->alloc,
&tri->wri)) ?:
(s = "meta write", scoutfs_block_writer_write(sb, &tri->wri)) ?:
(s = "data wait", scoutfs_inode_walk_writeback(sb, false)) ?:
(s = "commit log trees", commit_btrees(sb)) ?:
scoutfs_item_write_done(sb) ?:
(s = "get log trees", scoutfs_trans_get_log_trees(sb));
if (ret < 0) {
if (!retrying) {
scoutfs_warn(sb, "critical transaction commit failure: %s = %d, retrying",
s, ret);
retrying = true;
}
if (scoutfs_forcing_unmount(sb)) {
ret = -EIO;
break;
}
msleep(2 * MSEC_PER_SEC);
} else if (retrying) {
scoutfs_info(sb, "retried transaction commit succeeded");
}
} while (ret < 0);
/* retry {commit,get}_log_trees until they succeed; can only fail when forcing unmount */
ret = retry_forever(sb, commit_current_log_trees) ?:
retry_forever(sb, get_next_log_trees);
out:
spin_lock(&tri->write_lock);
tri->write_count++;

View File

@@ -742,7 +742,7 @@ int scoutfs_xattr_set_locked(struct inode *inode, const char *name, size_t name_
int ret;
int err;
trace_scoutfs_xattr_set(sb, name_len, value, size, flags);
trace_scoutfs_xattr_set(sb, ino, name_len, value, size, flags);
if (WARN_ON_ONCE(tgs->totl && tgs->indx) ||
WARN_ON_ONCE((tgs->totl | tgs->indx) && !tag_lock))

View File

@@ -1,244 +0,0 @@
package main
import (
"flag"
"fmt"
"log"
"os"
"path/filepath"
"sync"
"syscall"
"restore/pkg/restore"
)
type options struct {
metaPath string
sourceDir string
numWorkers int
}
// hardlinkTracker keeps track of inodes we've already processed
type hardlinkTracker struct {
sync.Mutex
seen map[uint64]bool
}
func newHardlinkTracker() *hardlinkTracker {
return &hardlinkTracker{
seen: make(map[uint64]bool),
}
}
func (h *hardlinkTracker) isNewInode(ino uint64, nlink bool) bool {
if !nlink {
return true
}
h.Lock()
defer h.Unlock()
if _, exists := h.seen[ino]; exists {
return false
}
h.seen[ino] = true
return true
}
// getFileInfo extracts file information from os.FileInfo
func getFileInfo(info os.FileInfo) restore.FileInfo {
stat := info.Sys().(*syscall.Stat_t)
// Use target inode number if specified, otherwise use actual inode number
ino := uint64(stat.Ino)
return restore.FileInfo{
Ino: ino,
Mode: uint32(stat.Mode),
Uid: uint32(stat.Uid),
Gid: uint32(stat.Gid),
Size: uint64(stat.Size),
Rdev: uint64(stat.Rdev),
AtimeSec: stat.Atim.Sec,
AtimeNsec: stat.Atim.Nsec,
MtimeSec: stat.Mtim.Sec,
MtimeNsec: stat.Mtim.Nsec,
CtimeSec: stat.Ctim.Sec,
CtimeNsec: stat.Ctim.Nsec,
IsDir: info.IsDir(),
IsRegular: stat.Mode&syscall.S_IFMT == syscall.S_IFREG,
}
}
// getXAttrs gets extended attributes for a file/directory
func getXAttrs(path string) ([]restore.XAttr, error) {
size, err := syscall.Listxattr(path, nil)
if err != nil || size == 0 {
return nil, err
}
buf := make([]byte, size)
size, err = syscall.Listxattr(path, buf)
if err != nil {
return nil, err
}
var xattrs []restore.XAttr
start := 0
for i := 0; i < size; i++ {
if buf[i] == 0 {
name := string(buf[start:i])
value, err := syscall.Getxattr(path, name, nil)
if err != nil {
continue
}
valueBuf := make([]byte, value)
_, err = syscall.Getxattr(path, name, valueBuf)
if err != nil {
continue
}
xattrs = append(xattrs, restore.XAttr{
Name: name,
Value: valueBuf,
})
start = i + 1
}
}
return xattrs, nil
}
func restorePath(writer *restore.WorkerWriter, hlTracker *hardlinkTracker, path string, parentIno uint64) error {
entries, err := os.ReadDir(path)
if err != nil {
return fmt.Errorf("failed to read directory: %v", err)
}
log.Printf("Restoring path: %s", path)
var subdirs int
var nameBytes int
for pos, entry := range entries {
if entry.Name() == "." || entry.Name() == ".." {
continue
}
info, err := entry.Info()
if err != nil {
return fmt.Errorf("failed to get entry info: %v", err)
}
stat, ok := info.Sys().(*syscall.Stat_t)
if !ok {
return fmt.Errorf("failed to get stat_t")
}
nameBytes += len(entry.Name())
fullPath := filepath.Join(path, entry.Name())
// Recurse into directories
if info.IsDir() {
subdirs++
if err := restorePath(writer, hlTracker, fullPath, uint64(stat.Ino)); err != nil {
return err
}
}
err = writer.CreateEntry(parentIno, uint64(pos), uint64(stat.Ino), uint32(info.Mode()), entry.Name())
if err != nil {
return fmt.Errorf("failed to create entry: %v", err)
}
// Handle inode
isHardlink := stat.Nlink > 1
if !info.IsDir() && hlTracker.isNewInode(uint64(stat.Ino), isHardlink) {
fileInfo := getFileInfo(info)
err = writer.CreateInode(fileInfo)
if err != nil {
return fmt.Errorf("failed to create inode: %v", err)
}
// Handle xattrs
xattrs, err := getXAttrs(fullPath)
if err == nil {
for pos, xattr := range xattrs {
err = writer.CreateXAttr(uint64(stat.Ino), uint64(pos), xattr)
if err != nil {
return fmt.Errorf("failed to create xattr: %v", err)
}
}
}
}
}
// Get directory info
dirInfo, err := os.Stat(path)
if err != nil {
return fmt.Errorf("failed to stat directory: %v", err)
}
// Create directory inode
dirFileInfo := getFileInfo(dirInfo)
dirFileInfo.NrSubdirs = uint64(subdirs)
dirFileInfo.NameBytes = uint64(nameBytes)
return writer.CreateInode(dirFileInfo)
}
func main() {
opts := options{}
flag.StringVar(&opts.metaPath, "m", "", "path to metadata device")
flag.StringVar(&opts.sourceDir, "s", "", "path to source directory")
flag.IntVar(&opts.numWorkers, "w", 4, "number of worker threads")
flag.Parse()
if opts.metaPath == "" || opts.sourceDir == "" {
flag.Usage()
os.Exit(1)
}
// Create master and worker writers
master, workers, err := restore.NewWriters(opts.metaPath, opts.numWorkers)
if err != nil {
fmt.Fprintf(os.Stderr, "Failed to create writers: %v\n", err)
os.Exit(1)
}
defer master.Destroy()
// Create hardlink tracker
hlTracker := newHardlinkTracker()
// Start workers
var wg sync.WaitGroup
for i, worker := range workers {
wg.Add(1)
go func(w *restore.WorkerWriter, workerNum int) {
defer wg.Done()
// Each worker processes a subset of the directory tree
if err := restorePath(w, hlTracker, opts.sourceDir, 1); err != nil {
fmt.Fprintf(os.Stderr, "Worker %d failed: %v\n", workerNum, err)
os.Exit(1)
}
// Create root inode for source directory
rootInfo, err := os.Stat(opts.sourceDir)
if err != nil {
fmt.Fprintf(os.Stderr, "Failed to stat source directory: %v\n", err)
os.Exit(1)
}
w.CreateInode(getFileInfo(rootInfo))
err = w.Destroy()
if err != nil {
fmt.Fprintf(os.Stderr, "Failed to destroy worker: %v\n", err)
os.Exit(1)
}
}(worker, i)
}
// Wait for all workers to complete
wg.Wait()
fmt.Println("Restore completed successfully")
}

View File

@@ -1,3 +0,0 @@
module restore
go 1.21.11

View File

@@ -1,472 +0,0 @@
package restore
/*
#cgo CFLAGS: -I${SRCDIR}/../../../utils/src -I${SRCDIR}/../../../kmod/src
#cgo LDFLAGS: -L${SRCDIR}/../../../utils/src -l:scoutfs_parallel_restore.a -lm
#include <stdlib.h>
#include <linux/types.h>
#include <stdbool.h>
#include <math.h>
#include "sparse.h"
#include "util.h"
#include "format.h"
#include "parallel_restore.h"
// If there are any type conflicts, you might need to add:
// #include "kernel_types.h"
*/
import "C"
import (
"errors"
"fmt"
"sync"
"syscall"
"unsafe"
)
const batchSize = 1000
const bufSize = 2 * 1024 * 1024
type WorkerWriter struct {
writer *C.struct_scoutfs_parallel_restore_writer
progressCh chan *ScoutfsParallelWriterProgress
fileCreated int64
devFd int
buf unsafe.Pointer
wg *sync.WaitGroup
}
type MasterWriter struct {
writer *C.struct_scoutfs_parallel_restore_writer
progressCh chan *ScoutfsParallelWriterProgress
workers []*WorkerWriter
wg sync.WaitGroup
slice *C.struct_scoutfs_parallel_restore_slice // Add slice field
progressWg sync.WaitGroup
devFd int
super *C.struct_scoutfs_super_block
}
type ScoutfsParallelWriterProgress struct {
Progress *C.struct_scoutfs_parallel_restore_progress
Slice *C.struct_scoutfs_parallel_restore_slice
}
func (m *MasterWriter) aggregateProgress() {
defer m.progressWg.Done()
for progress := range m.progressCh {
ret := C.scoutfs_parallel_restore_add_progress(m.writer, progress.Progress)
if ret != 0 {
// Handle error appropriately, e.g., log it
fmt.Printf("Failed to add progress, error code: %d\n", ret)
}
if progress.Slice != nil {
ret = C.scoutfs_parallel_restore_add_slice(m.writer, progress.Slice)
C.free(unsafe.Pointer(progress.Slice))
if ret != 0 {
// Handle error appropriately, e.g., log it
fmt.Printf("Failed to add slice, error code: %d\n", ret)
}
}
// Free the C-allocated progress structures
C.free(unsafe.Pointer(progress.Progress))
}
}
func (m *MasterWriter) Destroy() {
m.wg.Wait()
close(m.progressCh)
m.progressWg.Wait()
if m.slice != nil {
C.free(unsafe.Pointer(m.slice)) // Free slice on error
}
if m.super != nil {
C.free(unsafe.Pointer(m.super)) // Free superblock on error
}
if m.devFd != 0 {
syscall.Close(m.devFd)
}
// Destroy master writer
C.scoutfs_parallel_restore_destroy_writer(&m.writer)
}
func NewWriters(path string, numWriters int) (*MasterWriter, []*WorkerWriter, error) {
if numWriters <= 1 {
return nil, nil, errors.New("number of writers must be positive")
}
devFd, err := syscall.Open(path, syscall.O_DIRECT|syscall.O_RDWR|syscall.O_EXCL, 0)
if err != nil {
return nil, nil, fmt.Errorf("failed to open metadata device '%s': %v", path, err)
}
var masterWriter MasterWriter
masterWriter.progressCh = make(chan *ScoutfsParallelWriterProgress, numWriters*2)
masterWriter.workers = make([]*WorkerWriter, 0, numWriters-1)
masterWriter.devFd = devFd
var ret C.int
// Allocate aligned memory for superblock
var super unsafe.Pointer
ret = C.posix_memalign(&super, 4096, C.SCOUTFS_BLOCK_SM_SIZE)
if ret != 0 {
masterWriter.Destroy()
return nil, nil, fmt.Errorf("failed to allocate aligned memory for superblock: %d", ret)
}
masterWriter.super = (*C.struct_scoutfs_super_block)(super)
// Read the superblock from devFd
superOffset := C.SCOUTFS_SUPER_BLKNO << C.SCOUTFS_BLOCK_SM_SHIFT
count, err := syscall.Pread(devFd, (*[1 << 30]byte)(super)[:C.SCOUTFS_BLOCK_SM_SIZE], int64(superOffset))
if err != nil {
masterWriter.Destroy()
return nil, nil, fmt.Errorf("failed to read superblock: %v", err)
}
if count != int(C.SCOUTFS_BLOCK_SM_SIZE) {
masterWriter.Destroy()
return nil, nil, fmt.Errorf("failed to read superblock, bytes read: %d", count)
}
// Check if the superblock is valid.
if C.le64_to_cpu(masterWriter.super.flags)&C.SCOUTFS_FLAG_IS_META_BDEV == 0 {
masterWriter.Destroy()
return nil, nil, errors.New("superblock is not a metadata device")
}
// Create master writer
ret = C.scoutfs_parallel_restore_create_writer(&masterWriter.writer)
if ret != 0 {
masterWriter.Destroy()
return nil, nil, errors.New("failed to create master writer")
}
ret = C.scoutfs_parallel_restore_import_super(masterWriter.writer, masterWriter.super, C.int(devFd))
if ret != 0 {
masterWriter.Destroy()
return nil, nil, fmt.Errorf("failed to import superblock, error code: %d", ret)
}
// Initialize slices for each worker
masterWriter.slice = (*C.struct_scoutfs_parallel_restore_slice)(C.malloc(C.size_t(numWriters) *
C.size_t(unsafe.Sizeof(C.struct_scoutfs_parallel_restore_slice{}))))
if masterWriter.slice == nil {
masterWriter.Destroy()
return nil, nil, errors.New("failed to allocate slices")
}
ret = C.scoutfs_parallel_restore_init_slices(masterWriter.writer,
masterWriter.slice,
C.int(numWriters))
if ret != 0 {
masterWriter.Destroy()
return nil, nil, errors.New("failed to initialize slices")
}
ret = C.scoutfs_parallel_restore_add_slice(masterWriter.writer, masterWriter.slice)
if ret != 0 {
masterWriter.Destroy()
return nil, nil, errors.New("failed to add slice to master writer")
}
// Create worker writers
for i := 1; i < numWriters; i++ {
var bufPtr unsafe.Pointer
if ret := C.posix_memalign(&bufPtr, 4096, bufSize); ret != 0 {
masterWriter.Destroy()
return nil, nil, fmt.Errorf("failed to allocate aligned worker buffer: %d", ret)
}
worker := &WorkerWriter{
progressCh: masterWriter.progressCh,
buf: bufPtr,
wg: &masterWriter.wg,
}
ret = C.scoutfs_parallel_restore_create_writer(&worker.writer)
if ret != 0 {
masterWriter.Destroy()
return nil, nil, errors.New("failed to create worker writer")
}
masterWriter.wg.Add(1)
// Use each slice for the corresponding worker
slice := (*C.struct_scoutfs_parallel_restore_slice)(unsafe.Pointer(uintptr(unsafe.Pointer(masterWriter.slice)) +
uintptr(i)*unsafe.Sizeof(C.struct_scoutfs_parallel_restore_slice{})))
ret = C.scoutfs_parallel_restore_add_slice(worker.writer, slice)
if ret != 0 {
C.scoutfs_parallel_restore_destroy_writer(&worker.writer)
masterWriter.Destroy()
return nil, nil, errors.New("failed to add slice to worker writer")
}
masterWriter.workers = append(masterWriter.workers, worker)
}
masterWriter.progressWg.Add(1)
go masterWriter.aggregateProgress()
return &masterWriter, masterWriter.workers, nil
}
func (w *WorkerWriter) getProgress(withSlice bool) (*ScoutfsParallelWriterProgress, error) {
progress := (*C.struct_scoutfs_parallel_restore_progress)(
C.malloc(C.size_t(unsafe.Sizeof(C.struct_scoutfs_parallel_restore_progress{}))),
)
if progress == nil {
return nil, errors.New("failed to allocate memory for progress")
}
// Fetch the current progress from the C library
ret := C.scoutfs_parallel_restore_get_progress(w.writer, progress)
if ret != 0 {
C.free(unsafe.Pointer(progress))
return nil, fmt.Errorf("failed to get progress, error code: %d", ret)
}
var slice *C.struct_scoutfs_parallel_restore_slice
if withSlice {
slice = (*C.struct_scoutfs_parallel_restore_slice)(
C.malloc(C.size_t(unsafe.Sizeof(C.struct_scoutfs_parallel_restore_slice{}))),
)
if slice == nil {
C.free(unsafe.Pointer(progress))
return nil, errors.New("failed to allocate memory for slice")
}
// Optionally fetch the slice information
ret = C.scoutfs_parallel_restore_get_slice(w.writer, slice)
if ret != 0 {
C.free(unsafe.Pointer(progress))
C.free(unsafe.Pointer(slice))
return nil, fmt.Errorf("failed to get slice, error code: %d", ret)
}
}
return &ScoutfsParallelWriterProgress{
Progress: progress,
Slice: slice,
}, nil
}
// writeBuffer writes data from the buffer to the device file descriptor.
// It uses scoutfs_parallel_restore_write_buf to get data and pwrite to write it.
func (w *WorkerWriter) writeBuffer() (int64, error) {
var totalWritten int64
var count int64
var off int64
var ret C.int
// Allocate memory for off and count
offPtr := (*C.off_t)(unsafe.Pointer(&off))
countPtr := (*C.size_t)(unsafe.Pointer(&count))
for {
ret = C.scoutfs_parallel_restore_write_buf(w.writer, w.buf,
C.size_t(bufSize), offPtr, countPtr)
if ret != 0 {
return totalWritten, fmt.Errorf("failed to write buffer: error code %d", ret)
}
if count > 0 {
n, err := syscall.Pwrite(w.devFd, unsafe.Slice((*byte)(w.buf), count), off)
if err != nil {
return totalWritten, fmt.Errorf("pwrite failed: %v", err)
}
if n != int(count) {
return totalWritten, fmt.Errorf("pwrite wrote %d bytes; expected %d", n, count)
}
totalWritten += int64(n)
}
if count == 0 {
break
}
}
return totalWritten, nil
}
func (w *WorkerWriter) InsertEntry(entry *C.struct_scoutfs_parallel_restore_entry) error {
// Add the entry using the C library
ret := C.scoutfs_parallel_restore_add_entry(w.writer, entry)
if ret != 0 {
return fmt.Errorf("failed to add entry, error code: %d", ret)
}
// Increment the fileCreated counter
w.fileCreated++
if w.fileCreated >= batchSize {
_, err := w.writeBuffer()
if err != nil {
return fmt.Errorf("error writing buffers: %v", err)
}
// Allocate memory for progress and slice structures
progress, err := w.getProgress(false)
if err != nil {
return err
}
// Send the progress update to the shared progress channel
w.progressCh <- progress
// Reset the fileCreated counter
w.fileCreated = 0
}
return nil
}
func (w *WorkerWriter) InsertXattr(xattr *C.struct_scoutfs_parallel_restore_xattr) error {
ret := C.scoutfs_parallel_restore_add_xattr(w.writer, xattr)
if ret != 0 {
return fmt.Errorf("failed to add xattr, error code: %d", ret)
}
return nil
}
func (w *WorkerWriter) InsertInode(inode *C.struct_scoutfs_parallel_restore_inode) error {
ret := C.scoutfs_parallel_restore_add_inode(w.writer, inode)
if ret != 0 {
return fmt.Errorf("failed to add inode, error code: %d", ret)
}
return nil
}
// should only be called once
func (w *WorkerWriter) Destroy() error {
defer w.wg.Done()
// Send final progress if there are remaining entries
if w.fileCreated > 0 {
_, err := w.writeBuffer()
if err != nil {
return err
}
progress := &ScoutfsParallelWriterProgress{
Progress: (*C.struct_scoutfs_parallel_restore_progress)(C.malloc(C.size_t(unsafe.Sizeof(C.struct_scoutfs_parallel_restore_progress{})))),
Slice: (*C.struct_scoutfs_parallel_restore_slice)(C.malloc(C.size_t(unsafe.Sizeof(C.struct_scoutfs_parallel_restore_slice{})))),
}
w.progressCh <- progress
w.fileCreated = 0
}
if w.buf != nil {
C.free(w.buf)
w.buf = nil
}
C.scoutfs_parallel_restore_destroy_writer(&w.writer)
return nil
}
// Add these new types and functions to the existing restore.go file
type FileInfo struct {
Ino uint64
Mode uint32
Uid uint32
Gid uint32
Size uint64
Rdev uint64
AtimeSec int64
AtimeNsec int64
MtimeSec int64
MtimeNsec int64
CtimeSec int64
CtimeNsec int64
NrSubdirs uint64
NameBytes uint64
IsDir bool
IsRegular bool
}
type XAttr struct {
Name string
Value []byte
}
// CreateInode creates a C inode structure from FileInfo
func (w *WorkerWriter) CreateInode(info FileInfo) error {
inode := (*C.struct_scoutfs_parallel_restore_inode)(C.malloc(C.size_t(unsafe.Sizeof(C.struct_scoutfs_parallel_restore_inode{}))))
if inode == nil {
return fmt.Errorf("failed to allocate inode")
}
defer C.free(unsafe.Pointer(inode))
inode.ino = C.__u64(info.Ino)
inode.mode = C.__u32(info.Mode)
inode.uid = C.__u32(info.Uid)
inode.gid = C.__u32(info.Gid)
inode.size = C.__u64(info.Size)
inode.rdev = C.uint(info.Rdev)
inode.atime.tv_sec = C.__time_t(info.AtimeSec)
inode.atime.tv_nsec = C.long(info.AtimeNsec)
inode.mtime.tv_sec = C.__time_t(info.MtimeSec)
inode.mtime.tv_nsec = C.long(info.MtimeNsec)
inode.ctime.tv_sec = C.__time_t(info.CtimeSec)
inode.ctime.tv_nsec = C.long(info.CtimeNsec)
inode.crtime = inode.ctime
if info.IsRegular && info.Size > 0 {
inode.offline = C.bool(true)
}
if info.IsDir {
inode.nr_subdirs = C.__u64(info.NrSubdirs)
inode.total_entry_name_bytes = C.__u64(info.NameBytes)
}
return w.InsertInode(inode)
}
// CreateEntry creates a directory entry
func (w *WorkerWriter) CreateEntry(dirIno uint64, pos uint64, ino uint64, mode uint32, name string) error {
entryC := (*C.struct_scoutfs_parallel_restore_entry)(C.malloc(C.size_t(unsafe.Sizeof(C.struct_scoutfs_parallel_restore_entry{})) + C.size_t(len(name))))
if entryC == nil {
return fmt.Errorf("failed to allocate entry")
}
defer C.free(unsafe.Pointer(entryC))
entryC.dir_ino = C.__u64(dirIno)
entryC.pos = C.__u64(pos)
entryC.ino = C.__u64(ino)
entryC.mode = C.__u32(mode)
entryC.name_len = C.uint(len(name))
entryC.name = (*C.char)(C.malloc(C.size_t(len(name))))
if entryC.name == nil {
return fmt.Errorf("failed to allocate entry name")
}
defer C.free(unsafe.Pointer(entryC.name))
copy((*[1 << 30]byte)(unsafe.Pointer(entryC.name))[:len(name)], []byte(name))
return w.InsertEntry(entryC)
}
// CreateXAttr creates an extended attribute
func (w *WorkerWriter) CreateXAttr(ino uint64, pos uint64, xattr XAttr) error {
xattrC := (*C.struct_scoutfs_parallel_restore_xattr)(C.malloc(C.size_t(unsafe.Sizeof(C.struct_scoutfs_parallel_restore_xattr{})) + C.size_t(len(xattr.Name)) + C.size_t(len(xattr.Value))))
if xattrC == nil {
return fmt.Errorf("failed to allocate xattr")
}
defer C.free(unsafe.Pointer(xattrC))
xattrC.ino = C.__u64(ino)
xattrC.pos = C.__u64(pos)
xattrC.name_len = C.uint(len(xattr.Name))
xattrC.value_len = C.__u32(len(xattr.Value))
xattrC.name = (*C.char)(C.malloc(C.size_t(len(xattr.Name))))
if xattrC.name == nil {
return fmt.Errorf("failed to allocate xattr name")
}
defer C.free(unsafe.Pointer(xattrC.name))
copy((*[1 << 30]byte)(unsafe.Pointer(xattrC.name))[:len(xattr.Name)], []byte(xattr.Name))
xattrC.value = unsafe.Pointer(&xattr.Value[0])
return w.InsertXattr(xattrC)
}

View File

@@ -1,10 +0,0 @@
package restore
import "testing"
func TestNewWriters(t *testing.T) {
_, _, err := NewWriters("/tmp", 2)
if err != nil {
t.Fatalf("failed to create master writer: %v", err)
}
}

2
tests/.gitignore vendored
View File

@@ -10,3 +10,5 @@ src/stage_tmpfile
src/create_xattr_loop
src/o_tmpfile_umask
src/o_tmpfile_linkat
src/mmap_stress
src/mmap_validate

View File

@@ -14,8 +14,8 @@ BIN := src/createmany \
src/fragmented_data_extents \
src/o_tmpfile_umask \
src/o_tmpfile_linkat \
src/parallel_restore \
src/restore_copy
src/mmap_stress \
src/mmap_validate
DEPS := $(wildcard src/*.d)
@@ -25,12 +25,10 @@ ifneq ($(DEPS),)
-include $(DEPS)
endif
src/parallel_restore_cflags := ../utils/src/scoutfs_parallel_restore.a -lm
src/restore_copy_cflags := ../utils/src/scoutfs_parallel_restore.a -lm
src/mmap_stress: LIBS+=-lpthread
$(BIN): %: %.c Makefile
gcc $(CFLAGS) -MD -MP -MF $*.d $< -o $@ $($(@)_cflags)
gcc $(CFLAGS) -MD -MP -MF $*.d $< -o $@ $(LIBS)
.PHONY: clean
clean:

View File

@@ -80,3 +80,15 @@ t_compare_output()
{
"$@" >&7 2>&1
}
#
# usually bash prints an annoying output message when jobs
# are killed. We can avoid that by redirecting stderr for
# the bash process when it reaps the jobs that are killed.
#
t_silent_kill() {
exec {ERR}>&2 2>/dev/null
kill "$@"
wait "$@"
exec 2>&$ERR {ERR}>&-
}
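
A typical call site backgrounds a worker and later reaps it without bash's termination notice appearing in the test output (hypothetical usage, not part of this diff):

	spin_reader &
	pid=$!
	# ... test work ...
	t_silent_kill $pid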

View File

@@ -140,6 +140,9 @@ t_filter_dmesg()
re="$re|scoutfs .* error.*server failed to bind to.*"
re="$re|scoutfs .* critical transaction commit failure.*"
# ENOLINK (-67) indicates an expected forced unmount error
re="$re|scoutfs .* error -67 .*"
# change-devices causes loop device resizing
re="$re|loop: module loaded"
re="$re|loop[0-9].* detected capacity change from.*"
@@ -160,6 +163,9 @@ t_filter_dmesg()
re="$re|Pipe handler or fully qualified core dump path required.*"
re="$re|Set kernel.core_pattern before fs.suid_dumpable.*"
# perf warning that it adjusted sample rate
re="$re|perf: interrupt took too long.*lowering kernel.perf_event_max_sample_rate.*"
egrep -v "($re)" | \
ignore_harmless_unwind_kasan_stack_oob
}

88
tests/funcs/tap.sh Normal file
View File

@@ -0,0 +1,88 @@
#
# Generate TAP format test results
#
t_tap_header()
{
local runid=$1
local sequence=( $(echo $tests) )
local count=${#sequence[@]}
# avoid recreating the same TAP result - the harness sets this
[[ -z "$runid" ]] && runid="*test*"
cat > $T_RESULTS/scoutfs.tap <<TAPEOF
TAP version 14
1..${count}
#
# TAP results for run ${runid}
#
# host/run info:
#
# hostname: ${HOSTNAME}
# test start time: $(date --utc)
# uname -r: $(uname -r)
# scoutfs commit id: $(git describe --tags)
#
# sequence for this run:
#
TAPEOF
# Sequence
for t in ${tests}; do
echo ${t/.sh/}
done | cat -n | expand | column -c 120 | expand | sed 's/^ /#/' >> $T_RESULTS/scoutfs.tap
echo "#" >> $T_RESULTS/scoutfs.tap
}
t_tap_progress()
{
(
local i=$(( testcount + 1 ))
local testname=$1
local result=$2
local diff=""
local dmsg=""
if [[ -s "$T_RESULTS/tmp/${testname}/dmesg.new" ]]; then
dmsg="1"
fi
if ! cmp -s golden/${testname} $T_RESULTS/output/${testname}; then
diff="1"
fi
if [[ "${result}" == "100" ]] && [[ -z "${dmsg}" ]] && [[ -z "${diff}" ]]; then
echo "ok ${i} - ${testname}"
elif [[ "${result}" == "103" ]]; then
echo "ok ${i} - ${testname}"
echo "# ${testname} ** skipped - permitted **"
else
echo "not ok ${i} - ${testname}"
case ${result} in
101)
echo "# ${testname} ** skipped **"
;;
102)
echo "# ${testname} ** failed **"
;;
esac
if [[ -n "${diff}" ]]; then
echo "#"
echo "# diff:"
echo "#"
diff -u golden/${testname} $T_RESULTS/output/${testname} | expand | sed 's/^/# /'
fi
if [[ -n "${dmsg}" ]]; then
echo "#"
echo "# dmesg:"
echo "#"
cat "$T_RESULTS/tmp/${testname}/dmesg.new" | sed 's/^/# /'
fi
fi
) >> $T_RESULTS/scoutfs.tap
}
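
For a run with one passing and one failing test, the generated $T_RESULTS/scoutfs.tap would look roughly like this (header comments trimmed, names illustrative):

	TAP version 14
	1..2
	#
	# TAP results for run 1f0c...
	#
	ok 1 - simple-readdir
	not ok 2 - mmap
	# mmap ** failed **
	#
	# diff:
	#
	# --- golden/mmap ...
	# +++ results/output/mmap ...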

View File

@@ -1,4 +1,3 @@
== setting longer hung task timeout
== creating fragmented extents
== unlink file with moved extents to free extents per block
== cleanup

View File

@@ -0,0 +1,2 @@
=== setup
=== spin reading and shrinking

27
tests/golden/mmap Normal file
View File

@@ -0,0 +1,27 @@
== mmap_stress
thread 0 complete
thread 1 complete
thread 2 complete
thread 3 complete
thread 4 complete
== basic mmap/read/write consistency checks
== mmap read from offline extent
0: offset: 0 length: 2 flags: O.L
extents: 1
1
00000200: ea ea ea ea ea ea ea ea ea ea ea ea ea ea ea ea ................
0
0: offset: 0 length: 2 flags: ..L
extents: 1
== mmap write to an offline extent
0: offset: 0 length: 2 flags: O.L
extents: 1
1
0
0: offset: 0 length: 2 flags: ..L
extents: 1
00000000 ea ea ea ea ea ea ea ea ea ea ea ea ea ea ea ea |................|
00000010 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 |................|
00000020 ea ea ea ea ea ea ea ea ea ea ea ea ea ea ea ea |................|
00000030
== done

View File

@@ -49,7 +49,7 @@ offline wating should be empty:
0
== truncating does wait
truncate should be waiting for first block:
trunate should no longer be waiting:
truncate should no longer be waiting:
0
== writing waits
should be waiting for write

View File

@@ -1,28 +0,0 @@
== simple mkfs/restore/mount
committed_seq 1120
total_meta_blocks 163840
total_data_blocks 15728640
1440 1440 57120
80 80 400
0: offset: 0 length: 1 flags: O.L
extents: 1
0: offset: 0 length: 1 flags: O.L
extents: 1
0: offset: 0 length: 1 flags: O.L
extents: 1
0: offset: 0 length: 1 flags: O.L
extents: 1
Type Size Total Used Free Use%
MetaData 64KB 163840 34722 129118 21
Data 4KB 15728640 64 15728576 0
7 13,L,- 15,L,- 17,L,- I 33 -
== just under ENOSPC
Type Size Total Used Free Use%
MetaData 64KB 163840 155666 8174 95
Data 4KB 15728640 64 15728576 0
== just over ENOSPC
== ENOSPC
== attempt to restore data device
== attempt format_v1 restore
== test if previously mounted
== cleanup

View File

@@ -0,0 +1,97 @@
== create content
== readdir all
00000000: d_off: 0x00000001 d_reclen: 0x18 d_type: DT_DIR d_name: .
00000001: d_off: 0x00000002 d_reclen: 0x18 d_type: DT_DIR d_name: ..
00000002: d_off: 0x00000003 d_reclen: 0x18 d_type: DT_REG d_name: a
00000003: d_off: 0x00000004 d_reclen: 0x20 d_type: DT_REG d_name: aaaaaaaa
00000004: d_off: 0x00000005 d_reclen: 0x28 d_type: DT_REG d_name: aaaaaaaaaaaaaaa
00000005: d_off: 0x00000006 d_reclen: 0x30 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaa
00000006: d_off: 0x00000007 d_reclen: 0x38 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaa
00000007: d_off: 0x00000008 d_reclen: 0x38 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
00000008: d_off: 0x00000009 d_reclen: 0x40 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
00000009: d_off: 0x0000000a d_reclen: 0x48 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
0000000a: d_off: 0x0000000b d_reclen: 0x50 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
0000000b: d_off: 0x0000000c d_reclen: 0x58 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
0000000c: d_off: 0x0000000d d_reclen: 0x60 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
0000000d: d_off: 0x0000000e d_reclen: 0x68 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
0000000e: d_off: 0x0000000f d_reclen: 0x70 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
0000000f: d_off: 0x00000010 d_reclen: 0x70 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
00000010: d_off: 0x00000011 d_reclen: 0x78 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
00000011: d_off: 0x00000012 d_reclen: 0x80 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
00000012: d_off: 0x00000013 d_reclen: 0x88 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
00000013: d_off: 0x00000014 d_reclen: 0x90 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
00000014: d_off: 0x00000015 d_reclen: 0x98 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
00000015: d_off: 0x00000016 d_reclen: 0xa0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
00000016: d_off: 0x00000017 d_reclen: 0xa8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
00000017: d_off: 0x00000018 d_reclen: 0xa8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
00000018: d_off: 0x00000019 d_reclen: 0xb0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
00000019: d_off: 0x0000001a d_reclen: 0xb8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
0000001a: d_off: 0x0000001b d_reclen: 0xc0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
0000001b: d_off: 0x0000001c d_reclen: 0xc8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
0000001c: d_off: 0x0000001d d_reclen: 0xd0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
0000001d: d_off: 0x0000001e d_reclen: 0xd8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
0000001e: d_off: 0x0000001f d_reclen: 0xe0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
0000001f: d_off: 0x00000020 d_reclen: 0xe0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
00000020: d_off: 0x00000021 d_reclen: 0xe8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
00000021: d_off: 0x00000022 d_reclen: 0xf0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
00000022: d_off: 0x00000023 d_reclen: 0xf8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
00000023: d_off: 0x00000024 d_reclen: 0x100 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
00000024: d_off: 0x00000025 d_reclen: 0x108 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
00000025: d_off: 0x00000026 d_reclen: 0x110 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
== readdir offset
00000014: d_off: 0x00000015 d_reclen: 0x98 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
00000015: d_off: 0x00000016 d_reclen: 0xa0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
00000016: d_off: 0x00000017 d_reclen: 0xa8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
00000017: d_off: 0x00000018 d_reclen: 0xa8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
00000018: d_off: 0x00000019 d_reclen: 0xb0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
00000019: d_off: 0x0000001a d_reclen: 0xb8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
0000001a: d_off: 0x0000001b d_reclen: 0xc0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
0000001b: d_off: 0x0000001c d_reclen: 0xc8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
0000001c: d_off: 0x0000001d d_reclen: 0xd0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
0000001d: d_off: 0x0000001e d_reclen: 0xd8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
0000001e: d_off: 0x0000001f d_reclen: 0xe0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
0000001f: d_off: 0x00000020 d_reclen: 0xe0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
00000020: d_off: 0x00000021 d_reclen: 0xe8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
00000021: d_off: 0x00000022 d_reclen: 0xf0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
00000022: d_off: 0x00000023 d_reclen: 0xf8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
00000023: d_off: 0x00000024 d_reclen: 0x100 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
00000024: d_off: 0x00000025 d_reclen: 0x108 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
00000025: d_off: 0x00000026 d_reclen: 0x110 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
== readdir len (bytes)
00000000: d_off: 0x00000001 d_reclen: 0x18 d_type: DT_DIR d_name: .
00000001: d_off: 0x00000002 d_reclen: 0x18 d_type: DT_DIR d_name: ..
00000002: d_off: 0x00000003 d_reclen: 0x18 d_type: DT_REG d_name: a
00000003: d_off: 0x00000004 d_reclen: 0x20 d_type: DT_REG d_name: aaaaaaaa
00000004: d_off: 0x00000005 d_reclen: 0x28 d_type: DT_REG d_name: aaaaaaaaaaaaaaa
00000005: d_off: 0x00000006 d_reclen: 0x30 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaa
00000006: d_off: 0x00000007 d_reclen: 0x38 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaa
== introduce gap
00000000: d_off: 0x00000001 d_reclen: 0x18 d_type: DT_DIR d_name: .
00000001: d_off: 0x00000002 d_reclen: 0x18 d_type: DT_DIR d_name: ..
00000002: d_off: 0x00000003 d_reclen: 0x18 d_type: DT_REG d_name: a
00000003: d_off: 0x00000004 d_reclen: 0x20 d_type: DT_REG d_name: aaaaaaaa
00000004: d_off: 0x00000005 d_reclen: 0x28 d_type: DT_REG d_name: aaaaaaaaaaaaaaa
00000005: d_off: 0x00000006 d_reclen: 0x30 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaa
00000006: d_off: 0x00000007 d_reclen: 0x38 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaa
00000007: d_off: 0x00000008 d_reclen: 0x38 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
00000008: d_off: 0x00000009 d_reclen: 0x40 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
00000009: d_off: 0x00000014 d_reclen: 0x48 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
00000014: d_off: 0x00000015 d_reclen: 0x98 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
00000015: d_off: 0x00000016 d_reclen: 0xa0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
00000016: d_off: 0x00000017 d_reclen: 0xa8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
00000017: d_off: 0x00000018 d_reclen: 0xa8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
00000018: d_off: 0x00000019 d_reclen: 0xb0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
00000019: d_off: 0x0000001a d_reclen: 0xb8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
0000001a: d_off: 0x0000001b d_reclen: 0xc0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
0000001b: d_off: 0x0000001c d_reclen: 0xc8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
0000001c: d_off: 0x0000001d d_reclen: 0xd0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
0000001d: d_off: 0x0000001e d_reclen: 0xd8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
0000001e: d_off: 0x0000001f d_reclen: 0xe0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
0000001f: d_off: 0x00000020 d_reclen: 0xe0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
00000020: d_off: 0x00000021 d_reclen: 0xe8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
00000021: d_off: 0x00000022 d_reclen: 0xf0 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
00000022: d_off: 0x00000023 d_reclen: 0xf8 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
00000023: d_off: 0x00000024 d_reclen: 0x100 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
00000024: d_off: 0x00000025 d_reclen: 0x108 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
00000025: d_off: 0x00000026 d_reclen: 0x110 d_type: DT_REG d_name: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
== cleanup

View File

@@ -22,6 +22,8 @@ generic/024
generic/025
generic/026
generic/028
generic/029
generic/030
generic/031
generic/032
generic/033
@@ -53,6 +55,7 @@ generic/073
generic/076
generic/078
generic/079
generic/080
generic/081
generic/082
generic/084
@@ -81,10 +84,12 @@ generic/116
generic/117
generic/118
generic/119
generic/120
generic/121
generic/122
generic/123
generic/124
generic/126
generic/128
generic/129
generic/130
@@ -95,6 +100,7 @@ generic/136
generic/138
generic/139
generic/140
generic/141
generic/142
generic/143
generic/144
@@ -153,6 +159,7 @@ generic/210
generic/211
generic/212
generic/214
generic/215
generic/216
generic/217
generic/218
@@ -173,6 +180,9 @@ generic/238
generic/240
generic/244
generic/245
generic/246
generic/247
generic/248
generic/249
generic/250
generic/252
@@ -231,6 +241,7 @@ generic/317
generic/319
generic/322
generic/324
generic/325
generic/326
generic/327
generic/328
@@ -244,6 +255,7 @@ generic/337
generic/341
generic/342
generic/343
generic/346
generic/348
generic/353
generic/355
@@ -305,7 +317,9 @@ generic/424
generic/425
generic/426
generic/427
generic/428
generic/436
generic/437
generic/439
generic/440
generic/443
@@ -315,6 +329,7 @@ generic/448
generic/449
generic/450
generic/451
generic/452
generic/453
generic/454
generic/456
@@ -438,6 +453,7 @@ generic/610
generic/611
generic/612
generic/613
generic/614
generic/618
generic/621
generic/623
@@ -451,6 +467,7 @@ generic/632
generic/634
generic/635
generic/637
generic/638
generic/639
generic/640
generic/644
@@ -862,4 +879,4 @@ generic/688
generic/689
shared/002
shared/032
Passed all 495 tests
Passed all 512 tests

View File

@@ -512,6 +512,11 @@ msg "running tests"
> "$T_RESULTS/skip.log"
> "$T_RESULTS/fail.log"
# generate a test ID to make sure we can de-duplicate TAP results in aggregation
. funcs/tap.sh
t_tap_header $(uuidgen)
testcount=0
passed=0
skipped=0
failed=0
@@ -527,12 +532,15 @@ for t in $tests; do
cmd rm -rf "$T_TMPDIR"
cmd mkdir -p "$T_TMPDIR"
# create a test name dir in the fs
# create a test name dir in the fs, clean up old data as needed
T_DS=""
for i in $(seq 0 $((T_NR_MOUNTS - 1))); do
dir="${T_M[$i]}/test/$test_name"
test $i == 0 && cmd mkdir -p "$dir"
test $i == 0 && (
test -d "$dir" && cmd rm -rf "$dir"
cmd mkdir -p "$dir"
)
eval T_D$i=$dir
T_D[$i]=$dir
@@ -637,6 +645,11 @@ for t in $tests; do
test -n "$T_ABORT" && die "aborting after first failure"
fi
# record results for TAP format output
t_tap_progress $test_name $sts
((testcount++))
done
msg "all tests run: $passed passed, $skipped skipped, $skipped_permitted skipped (permitted), $failed failed"

View File

@@ -6,6 +6,7 @@ inode-items-updated.sh
simple-inode-index.sh
simple-staging.sh
simple-release-extents.sh
simple-readdir.sh
get-referring-entries.sh
fallocate.sh
basic-truncate.sh
@@ -17,6 +18,7 @@ projects.sh
large-fragmented-free.sh
format-version-forward-back.sh
enospc.sh
mmap.sh
srch-safe-merge-pos.sh
srch-basic-functionality.sh
simple-xattr-unit.sh
@@ -25,6 +27,7 @@ totl-xattr-tag.sh
quota.sh
lock-refleak.sh
lock-shrink-consistency.sh
lock-shrink-read-race.sh
lock-pr-cw-conflict.sh
lock-revoke-getcwd.sh
lock-recover-invalidate.sh
@@ -54,5 +57,4 @@ archive-light-cycle.sh
block-stale-reads.sh
inode-deletion.sh
renameat2-noreplace.sh
parallel_restore.sh
xfstests.sh

181
tests/src/mmap_stress.c Normal file
View File

@@ -0,0 +1,181 @@
#define _GNU_SOURCE
/*
* mmap() stress test for scoutfs
*
* This test exercises the scoutfs kernel module's locking by
* repeatedly reading/writing using mmap and pread/write calls
* across 5 clients (mounts).
*
* Each thread operates against a single client (mount) and performs
* its operations on the shared file in a random order.
*
* The goal is to ensure that the locking between the _page_mkwrite
* vfs calls and the normal read/write paths does not deadlock.
*
* There is no content validation performed; the test only checks
* that the program runs to completion without errors.
*/
#include <sys/types.h>
#include <stdio.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <sys/mman.h>
#include <pthread.h>
#include <errno.h>
static int size = 0;
static int count = 0; /* XXX make this duration instead */
struct thread_info {
int nr;
int fd;
};
static void *run_test_func(void *ptr)
{
void *buf = NULL;
char *addr = NULL;
struct thread_info *tinfo = ptr;
int c = 0;
int fd;
ssize_t read, written, ret;
int preads = 0, pwrites = 0, mreads = 0, mwrites = 0;
fd = tinfo->fd;
if (posix_memalign(&buf, 4096, size) != 0) {
perror("calloc");
exit(-1);
}
addr = mmap(NULL, size, PROT_WRITE | PROT_READ, MAP_SHARED, fd, 0);
if (addr == MAP_FAILED) {
perror("mmap");
exit(-1);
}
usleep(100000); /* 0.1sec to allow all threads to start roughly at the same time */
for (;;) {
if (++c > count)
break;
switch (rand() % 4) {
case 0: /* pread */
preads++;
for (read = 0; read < size;) {
ret = pread(fd, buf, size - read, read);
if (ret < 0) {
perror("pwrite");
exit(-1);
}
read += ret;
}
break;
case 1: /* pwrite */
pwrites++;
memset(buf, (char)(c & 0xff), size);
for (written = 0; written < size;) {
ret = pwrite(fd, buf, size - written, written);
if (ret < 0) {
perror("pwrite");
exit(-1);
}
written += ret;
}
break;
case 2: /* mmap read */
mreads++;
memcpy(buf, addr, size); /* noerr */
break;
case 3: /* mmap write */
mwrites++;
memset(buf, (char)(c & 0xff), size);
memcpy(addr, buf, size); /* noerr */
break;
}
}
munmap(addr, size);
free(buf);
printf("thread %u complete: preads %u pwrites %u mreads %u mwrites %u\n", tinfo->nr,
preads, pwrites, mreads, mwrites);
return NULL;
}
int main(int argc, char **argv)
{
pthread_t thread[5];
struct thread_info tinfo[5];
int fd[5];
int ret;
int i;
if (argc != 8) {
fprintf(stderr, "%s requires 7 arguments - size count file1 file2 file3 file4 file5\n", argv[0]);
exit(-1);
}
size = atoi(argv[1]);
if (size <= 0) {
fprintf(stderr, "invalid size, must be greater than 0\n");
exit(-1);
}
count = atoi(argv[2]);
if (count <= 0) {
fprintf(stderr, "invalid count, must be greater than 0\n");
exit(-1);
}
/* create and truncate one fd */
fd[0] = open(argv[3], O_RDWR | O_CREAT | O_TRUNC, 00644);
if (fd[0] < 0) {
perror("open");
exit(-1);
}
/* make it the test size */
if (posix_fallocate(fd[0], 0, size) != 0) {
perror("fallocate");
exit(-1);
}
/* now open the rest of the fds */
for (i = 1; i < 5; i++) {
fd[i] = open(argv[3+i], O_RDWR);
if (fd[i] < 0) {
perror("open");
exit(-1);
}
}
/* start threads */
for (i = 0; i < 5; i++) {
tinfo[i].fd = fd[i];
tinfo[i].nr = i;
ret = pthread_create(&thread[i], NULL, run_test_func, (void*)&tinfo[i]);
if (ret) {
perror("pthread_create");
exit(-1);
}
}
/* wait for complete */
for (i = 0; i < 5; i++)
pthread_join(thread[i], NULL);
for (i = 0; i < 5; i++)
close(fd[i]);
exit(0);
}
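
Given the argument parsing in main() above, the harness would invoke the binary with a size, an iteration count, and five paths to the same file as seen through five different mounts (paths are illustrative):

	src/mmap_stress 65536 10000 /mnt/test.0/f /mnt/test.1/f /mnt/test.2/f /mnt/test.3/f /mnt/test.4/f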

159
tests/src/mmap_validate.c Normal file
View File

@@ -0,0 +1,159 @@
#define _GNU_SOURCE
/*
* mmap() content consistency checking for scoutfs
*
* This test program validates that content written through memory
* mappings is consistent across clients, whether the data moves
* through mmap or through normal write/read calls.
*
* One side of each write/read pair is always memory mapped; in
* roughly a third of the iterations both sides use mmap.
*/
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <errno.h>
static int count = 0;
static int size = 0;
static void run_test_func(int fd1, int fd2)
{
void *buf1 = NULL;
void *buf2 = NULL;
char *addr1 = NULL;
char *addr2 = NULL;
int c = 0;
ssize_t read, written, ret;
/* buffers for both sides to compare */
if (posix_memalign(&buf1, 4096, size) != 0) {
perror("calloc1");
exit(-1);
}
if (posix_memalign(&buf2, 4096, size) != 0) {
perror("calloc1");
exit(-1);
}
/* memory maps for both sides */
addr1 = mmap(NULL, size, PROT_WRITE | PROT_READ, MAP_SHARED, fd1, 0);
if (addr1 == MAP_FAILED) {
perror("mmap1");
exit(-1);
}
addr2 = mmap(NULL, size, PROT_WRITE | PROT_READ, MAP_SHARED, fd2, 0);
if (addr2 == MAP_FAILED) {
perror("mmap2");
exit(-1);
}
for (;;) {
if (++c > count) /* run for count iterations, from argv[2] */
break;
/* put a pattern in buf1 */
memset(buf1, c & 0xff, size);
/* pwrite or mmap write from buf1 */
switch (c % 3) {
case 0: /* pwrite */
for (written = 0; written < size;) {
ret = pwrite(fd1, buf1, size - written, written);
if (ret < 0) {
perror("pwrite");
exit(-1);
}
written += ret;
}
break;
default: /* mmap write */
memcpy(addr1, buf1, size);
break;
}
/* pread or mmap read to buf2 */
switch (c % 3) {
case 2: /* pread */
for (read = 0; read < size;) {
ret = pread(fd2, buf2, size - read, read);
if (ret < 0) {
perror("pwrite");
exit(-1);
}
read += ret;
}
break;
default: /* mmap read */
memcpy(buf2, addr2, size);
break;
}
/* compare bufs */
if (memcmp(buf1, buf2, size) != 0) {
fprintf(stderr, "memcmp() failed\n");
exit(-1);
}
}
munmap(addr1, size);
munmap(addr2, size);
free(buf1);
free(buf2);
}
int main(int argc, char **argv)
{
int fd[2];
if (argc != 5) {
fprintf(stderr, "%s requires 4 arguments - size count file1 file2\n", argv[0]);
exit(-1);
}
size = atoi(argv[1]);
if (size <= 0) {
fprintf(stderr, "invalid size, must be greater than 0\n");
exit(-1);
}
count = atoi(argv[2]);
if (count < 3) {
fprintf(stderr, "invalid count, must be greater than 3\n");
exit(-1);
}
/* create and truncate one fd */
fd[0] = open(argv[3], O_RDWR | O_CREAT | O_TRUNC, 00644);
if (fd[0] < 0) {
perror("open");
exit(-1);
}
fd[1] = open(argv[4], O_RDWR);
if (fd[1] < 0) {
perror("open");
exit(-1);
}
/* make it the test size */
if (posix_fallocate(fd[0], 0, size) != 0) {
perror("fallocate");
exit(-1);
}
/* run the test function */
run_test_func(fd[0], fd[1]);
close(fd[0]);
close(fd[1]);
exit(0);
}
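
As with mmap_stress, the harness supplies a size, an iteration count (at least 3, per the check above), and two paths to the same file through two different mounts (paths are illustrative):

	src/mmap_validate 4096 1000 /mnt/test.0/f /mnt/test.1/f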

View File

@@ -1,838 +0,0 @@
#define _GNU_SOURCE /* O_DIRECT */
#include <stdlib.h>
#include <stdio.h>
#include <stdbool.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/xattr.h>
#include <ctype.h>
#include <string.h>
#include <errno.h>
#include <limits.h>
#include <time.h>
#include <sys/prctl.h>
#include <signal.h>
#include <sys/socket.h>
#include "../../utils/src/sparse.h"
#include "../../utils/src/util.h"
#include "../../utils/src/list.h"
#include "../../utils/src/parse.h"
#include "../../kmod/src/format.h"
#include "../../utils/src/parallel_restore.h"
/*
* XXX:
* - add a nice description of what's going on
* - mention allocator contention
* - test child process dying handling
* - root dir entry name length is wrong
*/
#define ERRF " errno %d (%s)"
#define ERRA errno, strerror(errno)
#define error_exit(cond, fmt, args...) \
do { \
if (cond) { \
printf("error: "fmt"\n", ##args); \
exit(1); \
} \
} while (0)
#define dprintf(fmt, args...) \
do { \
if (0) \
printf(fmt, ##args); \
} while (0)
#define REG_MODE (S_IFREG | 0644)
#define DIR_MODE (S_IFDIR | 0755)
struct opts {
unsigned long long buf_size;
unsigned long long write_batch;
unsigned long long low_dirs;
unsigned long long high_dirs;
unsigned long long low_files;
unsigned long long high_files;
char *meta_path;
unsigned long long total_files;
bool read_only;
unsigned long long seed;
unsigned long long nr_writers;
};
static void usage(void)
{
printf("usage:\n"
" -b NR | threads write blocks in batches files (100000)\n"
" -d LOW:HIGH | range of subdirs per directory (5:10)\n"
" -f LOW:HIGH | range of files per directory (10:20)\n"
" -m PATH | path to metadata device\n"
" -n NR | total number of files to create (100)\n"
" -r | read-only, all work except writing, measure cpu cost\n"
" -s NR | randomization seed (random)\n"
" -w NR | number of writing processes to fork (online cpus)\n"
);
}
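/*
 * Example invocation (the device path here is hypothetical): create a
 * million files with eight writer processes:
 *
 *   parallel_restore -m /dev/meta -n 1000000 -w 8
 */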
static size_t write_bufs(struct opts *opts, struct scoutfs_parallel_restore_writer *wri,
void *buf, size_t buf_size, int dev_fd)
{
size_t total = 0;
size_t count;
off_t off;
int ret;
do {
ret = scoutfs_parallel_restore_write_buf(wri, buf, buf_size, &off, &count);
error_exit(ret, "write buf %d", ret);
if (count > 0) {
if (!opts->read_only)
ret = pwrite(dev_fd, buf, count, off);
else
ret = count;
error_exit(ret != count, "pwrite count %zu ret %d", count, ret);
total += ret;
}
} while (count > 0);
return total;
}
struct gen_inode {
struct scoutfs_parallel_restore_inode inode;
struct scoutfs_parallel_restore_xattr **xattrs;
u64 nr_xattrs;
struct scoutfs_parallel_restore_entry **entries;
u64 nr_files;
u64 nr_entries;
};
static void free_gino(struct gen_inode *gino)
{
u64 i;
if (gino) {
if (gino->entries) {
for (i = 0; i < gino->nr_entries; i++)
free(gino->entries[i]);
free(gino->entries);
}
if (gino->xattrs) {
for (i = 0; i < gino->nr_xattrs; i++)
free(gino->xattrs[i]);
free(gino->xattrs);
}
free(gino);
}
}
static struct scoutfs_parallel_restore_xattr *
generate_xattr(struct opts *opts, u64 ino, u64 pos, char *name, int name_len, void *value,
int value_len)
{
struct scoutfs_parallel_restore_xattr *xattr;
xattr = malloc(sizeof(struct scoutfs_parallel_restore_xattr) + name_len + value_len);
error_exit(!xattr, "error allocating generated xattr");
*xattr = (struct scoutfs_parallel_restore_xattr) {
.ino = ino,
.pos = pos,
.name_len = name_len,
.value_len = value_len,
};
xattr->name = (void *)(xattr + 1);
xattr->value = (void *)(xattr->name + name_len);
memcpy(xattr->name, name, name_len);
if (value_len)
memcpy(xattr->value, value, value_len);
return xattr;
}
static struct gen_inode *generate_inode(struct opts *opts, u64 ino, mode_t mode)
{
struct gen_inode *gino;
struct timespec now;
clock_gettime(CLOCK_REALTIME, &now);
gino = calloc(1, sizeof(struct gen_inode));
error_exit(!gino, "failure allocating generated inode");
gino->inode = (struct scoutfs_parallel_restore_inode) {
.ino = ino,
.meta_seq = ino,
.data_seq = 0,
.mode = mode,
.atime = now,
.ctime = now,
.mtime = now,
.crtime = now,
};
/*
* hacky creation of a bunch of xattrs for now.
*/
if ((mode & S_IFMT) == S_IFREG) {
#define NV(n, v) { n, sizeof(n) - 1, v, sizeof(v) - 1, }
struct name_val {
char *name;
int len;
char *value;
int value_len;
} nv[] = {
NV("scoutfs.hide.totl.acct.8314611887310466424.2.0", "1"),
NV("scoutfs.hide.srch.sam_vol_E01001L6_4", ""),
NV("scoutfs.hide.sam_reqcopies", ""),
NV("scoutfs.hide.sam_copy_2", ""),
NV("scoutfs.hide.totl.acct.F01030L6.8314611887310466424.7.30", "1"),
NV("scoutfs.hide.sam_copy_1", ""),
NV("scoutfs.hide.srch.sam_vol_F01030L6_4", ""),
NV("scoutfs.hide.srch.sam_release_cand", ""),
NV("scoutfs.hide.sam_restime", ""),
NV("scoutfs.hide.sam_uuid", ""),
NV("scoutfs.hide.totl.acct.8314611887310466424.3.0", "1"),
NV("scoutfs.hide.srch.sam_vol_F01030L6", ""),
NV("scoutfs.hide.srch.sam_uuid_865939b7-24d6-472f-b85c-7ce7afeb813a", ""),
NV("scoutfs.hide.srch.sam_vol_E01001L6", ""),
NV("scoutfs.hide.totl.acct.E01001L6.8314611887310466424.7.1", "1"),
NV("scoutfs.hide.totl.acct.8314611887310466424.4.0", "1"),
NV("scoutfs.hide.totl.acct.8314611887310466424.11.0", "1"),
NV("scoutfs.hide.totl.acct.8314611887310466424.1.0", "1"),
};
unsigned int nr = array_size(nv);
int i;
gino->xattrs = calloc(nr, sizeof(struct scoutfs_parallel_restore_xattr *));
for (i = 0; i < nr; i++)
gino->xattrs[i] = generate_xattr(opts, ino, i, nv[i].name, nv[i].len,
nv[i].value, nv[i].value_len);
gino->nr_xattrs = nr;
gino->inode.nr_xattrs = nr;
gino->inode.size = 4096;
gino->inode.offline = true;
}
return gino;
}
static struct scoutfs_parallel_restore_entry *
generate_entry(struct opts *opts, char *prefix, u64 nr, u64 dir_ino, u64 pos, u64 ino, mode_t mode)
{
struct scoutfs_parallel_restore_entry *entry;
char buf[PATH_MAX];
int bytes;
bytes = snprintf(buf, sizeof(buf), "%s-%llu", prefix, nr);
entry = malloc(sizeof(struct scoutfs_parallel_restore_entry) + bytes);
error_exit(!entry, "error allocating generated entry");
*entry = (struct scoutfs_parallel_restore_entry) {
.dir_ino = dir_ino,
.pos = pos,
.ino = ino,
.mode = mode,
.name = (void *)(entry + 1),
.name_len = bytes,
};
memcpy(entry->name, buf, bytes);
return entry;
}
/*
 * Since the _parallel_restore_quota_rule mimics the squota_rule found
 * in the kernel, we can also mimic its rule_to_irule function.
 */
#define TEST_RULE_STR "7 13,L,- 15,L,- 17,L,- I 33 -"
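/*
 * Field layout, matching the sscanf() below: a priority, three
 * val,source,flags name triples, an op character, a limit, and
 * rule flags.
 */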
static struct scoutfs_parallel_restore_quota_rule *
generate_quota(struct opts *opts)
{
struct scoutfs_parallel_restore_quota_rule *prule;
int err;
prule = calloc(1, sizeof(struct scoutfs_parallel_restore_quota_rule));
error_exit(!prule, "Quota rule alloc failed");
err = sscanf(TEST_RULE_STR, " %hhu %llu,%c,%c %llu,%c,%c %llu,%c,%c %c %llu %c",
&prule->prio,
&prule->names[0].val, &prule->names[0].source, &prule->names[0].flags,
&prule->names[1].val, &prule->names[1].source, &prule->names[1].flags,
&prule->names[2].val, &prule->names[2].source, &prule->names[2].flags,
&prule->op, &prule->limit, &prule->rule_flags);
error_exit(err != 13, "invalid quota rule, missing fields. nr fields: %d rule str: %s\n", err, TEST_RULE_STR);
return prule;
}
static u64 random64(void)
{
return ((u64)lrand48() << 32) | lrand48();
}
static u64 random_range(u64 low, u64 high)
{
return low + (random64() % (high - low + 1));
}
static struct gen_inode *generate_dir(struct opts *opts, u64 dir_ino, u64 ino_start, u64 ino_len,
bool no_dirs)
{
struct scoutfs_parallel_restore_entry *entry;
struct gen_inode *gino;
u64 nr_entries;
u64 nr_files;
u64 nr_dirs;
u64 ino;
char *prefix;
mode_t mode;
u64 i;
nr_dirs = no_dirs ? 0 : random_range(opts->low_dirs, opts->high_dirs);
nr_files = random_range(opts->low_files, opts->high_files);
if (1 + nr_dirs + nr_files > ino_len) {
nr_dirs = no_dirs ? 0 : (ino_len - 1) / 2;
nr_files = (ino_len - 1) - nr_dirs;
}
nr_entries = nr_dirs + nr_files;
gino = generate_inode(opts, dir_ino, DIR_MODE);
error_exit(!gino, "error allocating generated inode");
gino->inode.nr_subdirs = nr_dirs;
gino->nr_files = nr_files;
if (nr_entries) {
gino->entries = calloc(nr_entries, sizeof(struct scoutfs_parallel_restore_entry *));
error_exit(!gino->entries, "error allocating generated inode entries");
gino->nr_entries = nr_entries;
}
mode = DIR_MODE;
prefix = "dir";
for (i = 0; i < nr_entries; i++) {
if (i == nr_dirs) {
mode = REG_MODE;
prefix = "file";
}
ino = ino_start + i;
entry = generate_entry(opts, prefix, ino, gino->inode.ino,
SCOUTFS_DIRENT_FIRST_POS + i, ino, mode);
gino->entries[i] = entry;
gino->inode.total_entry_name_bytes += entry->name_len;
}
return gino;
}
/*
* Restore a generated inode. If it's a directory then we also restore
* all its entries. The caller is going to descend into subdir entries and generate
* those dir inodes. We have to generate and restore all non-dir inodes referenced
* by this inode's entries.
*/
static void restore_inode(struct opts *opts, struct scoutfs_parallel_restore_writer *wri,
struct gen_inode *gino)
{
struct gen_inode *nondir;
int ret;
u64 i;
ret = scoutfs_parallel_restore_add_inode(wri, &gino->inode);
error_exit(ret, "thread add root inode %d", ret);
for (i = 0; i < gino->nr_entries; i++) {
ret = scoutfs_parallel_restore_add_entry(wri, gino->entries[i]);
error_exit(ret, "thread add entry %d", ret);
/* caller only needs subdir entries, generate and free others */
if ((gino->entries[i]->mode & S_IFMT) != S_IFDIR) {
nondir = generate_inode(opts, gino->entries[i]->ino,
gino->entries[i]->mode);
restore_inode(opts, wri, nondir);
free_gino(nondir);
free(gino->entries[i]);
if (i != gino->nr_entries - 1)
gino->entries[i] = gino->entries[gino->nr_entries - 1];
gino->nr_entries--;
gino->nr_files--;
i--;
}
}
for (i = 0; i < gino->nr_xattrs; i++) {
ret = scoutfs_parallel_restore_add_xattr(wri, gino->xattrs[i]);
error_exit(ret, "thread add xattr %d", ret);
}
}
struct writer_args {
struct list_head head;
int dev_fd;
int pair_fd;
struct scoutfs_parallel_restore_slice slice;
u64 writer_nr;
u64 dir_height;
u64 ino_start;
u64 ino_len;
};
struct write_result {
struct scoutfs_parallel_restore_progress prog;
struct scoutfs_parallel_restore_slice slice;
__le64 files_created;
__le64 bytes_written;
};
static void write_bufs_and_send(struct opts *opts, struct scoutfs_parallel_restore_writer *wri,
void *buf, size_t buf_size, int dev_fd,
struct write_result *res, bool get_slice, int pair_fd)
{
size_t total;
int ret;
total = write_bufs(opts, wri, buf, buf_size, dev_fd);
le64_add_cpu(&res->bytes_written, total);
ret = scoutfs_parallel_restore_get_progress(wri, &res->prog);
error_exit(ret, "get prog %d", ret);
if (get_slice) {
ret = scoutfs_parallel_restore_get_slice(wri, &res->slice);
error_exit(ret, "thread get slice %d", ret);
}
ret = write(pair_fd, res, sizeof(struct write_result));
error_exit(ret != sizeof(struct write_result), "result send error");
memset(res, 0, sizeof(struct write_result));
}
/*
 * Calculate the number of bytes in toplevel "top-%llu" entry names
 * for the given number of writers: four bytes of "top-" prefix per
 * name plus the decimal digits of each writer number.
 */
static u64 topdir_entry_bytes(u64 nr_writers)
{
u64 bytes = (3 + 1) * nr_writers;
u64 limit;
u64 done;
u64 wid;
u64 nr;
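/*
 * Sum digit counts per decade: writer numbers 0-9 contribute one
 * digit each, 10-99 two, and so on. e.g. 12 writers gives
 * 4*12 + 10*1 + 2*2 = 62 bytes.
 */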
for (done = 0, wid = 1, limit = 10; done < nr_writers; done += nr, wid++, limit *= 10) {
nr = min(limit - done, nr_writers - done);
bytes += nr * wid;
}
return bytes;
}
struct dir_pos {
struct gen_inode *gino;
u64 pos;
};
static void writer_proc(struct opts *opts, struct writer_args *args)
{
struct scoutfs_parallel_restore_writer *wri = NULL;
struct scoutfs_parallel_restore_entry *entry;
struct dir_pos *dirs = NULL;
struct write_result res;
struct gen_inode *gino;
void *buf = NULL;
u64 level;
u64 ino;
int ret;
memset(&res, 0, sizeof(res));
dirs = calloc(args->dir_height, sizeof(struct dir_pos));
error_exit(!dirs, "error allocating parent dirs");
errno = posix_memalign((void **)&buf, 4096, opts->buf_size);
error_exit(errno, "error allocating block buf "ERRF, ERRA);
ret = scoutfs_parallel_restore_create_writer(&wri);
error_exit(ret, "create writer %d", ret);
ret = scoutfs_parallel_restore_add_slice(wri, &args->slice);
error_exit(ret, "add slice %d", ret);
/* writer 0 creates the root dir */
if (args->writer_nr == 0) {
gino = generate_inode(opts, SCOUTFS_ROOT_INO, DIR_MODE);
gino->inode.nr_subdirs = opts->nr_writers;
gino->inode.total_entry_name_bytes = topdir_entry_bytes(opts->nr_writers);
ret = scoutfs_parallel_restore_add_inode(wri, &gino->inode);
error_exit(ret, "thread add root inode %d", ret);
free_gino(gino);
}
/* create root entry for our top level dir */
ino = args->ino_start++;
args->ino_len--;
entry = generate_entry(opts, "top", args->writer_nr,
SCOUTFS_ROOT_INO, SCOUTFS_DIRENT_FIRST_POS + args->writer_nr,
ino, DIR_MODE);
ret = scoutfs_parallel_restore_add_entry(wri, entry);
error_exit(ret, "thread top entry %d", ret);
free(entry);
level = args->dir_height - 1;
while (args->ino_len > 0 && level < args->dir_height) {
gino = dirs[level].gino;
/* generate and restore if we follow entries */
if (!gino) {
gino = generate_dir(opts, ino, args->ino_start, args->ino_len, level == 0);
args->ino_start += gino->nr_entries;
args->ino_len -= gino->nr_entries;
le64_add_cpu(&res.files_created, gino->nr_files);
restore_inode(opts, wri, gino);
dirs[level].gino = gino;
}
if (dirs[level].pos == gino->nr_entries) {
/* ascend if we're done with this dir */
dirs[level].gino = NULL;
dirs[level].pos = 0;
free_gino(gino);
level++;
} else {
/* otherwise descend into subdir entry */
ino = gino->entries[dirs[level].pos]->ino;
dirs[level].pos++;
level--;
}
/* do a partial write at batch intervals when there's still more to do */
if (le64_to_cpu(res.files_created) >= opts->write_batch && args->ino_len > 0)
write_bufs_and_send(opts, wri, buf, opts->buf_size, args->dev_fd,
&res, false, args->pair_fd);
}
write_bufs_and_send(opts, wri, buf, opts->buf_size, args->dev_fd,
&res, true, args->pair_fd);
scoutfs_parallel_restore_destroy_writer(&wri);
free(dirs);
free(buf);
}
/*
* If any of our children exited with an error code, we hard exit.
* The child processes should themselves report out any errors
* encountered. Any remaining children will receive SIGHUP and
* terminate.
*/
static void sigchld_handler(int signo, siginfo_t *info, void *context)
{
if (info->si_status)
exit(EXIT_FAILURE);
}
static void fork_writer(struct opts *opts, struct writer_args *args)
{
pid_t parent = getpid();
pid_t pid;
int ret;
pid = fork();
error_exit(pid == -1, "fork error");
if (pid != 0)
return;
ret = prctl(PR_SET_PDEATHSIG, SIGHUP);
error_exit(ret < 0, "failed to set parent death sig");
printf("pid %u getpid() %u parent %u getppid() %u\n",
pid, getpid(), parent, getppid());
error_exit(getppid() != parent, "child parent already changed");
writer_proc(opts, args);
exit(0);
}
static int do_restore(struct opts *opts)
{
struct scoutfs_parallel_restore_writer *wri = NULL;
struct scoutfs_parallel_restore_slice *slices = NULL;
struct scoutfs_parallel_restore_quota_rule *rule = NULL;
struct scoutfs_super_block *super = NULL;
struct write_result res;
struct writer_args *args;
struct timespec begin;
struct timespec end;
LIST_HEAD(writers);
u64 next_ino;
u64 ino_per;
u64 avg_dirs;
u64 avg_files;
u64 dir_height;
u64 tot_files;
u64 tot_bytes;
int pair[2] = {-1, -1};
float secs;
void *buf = NULL;
int dev_fd = -1;
int ret;
int i;
ret = socketpair(PF_LOCAL, SOCK_STREAM, 0, pair);
error_exit(ret, "socketpair error "ERRF, ERRA);
dev_fd = open(opts->meta_path, O_DIRECT | (opts->read_only ? O_RDONLY : (O_RDWR|O_EXCL)));
error_exit(dev_fd < 0, "error opening '%s': "ERRF, opts->meta_path, ERRA);
errno = posix_memalign((void **)&super, 4096, SCOUTFS_BLOCK_SM_SIZE) ?:
posix_memalign((void **)&buf, 4096, opts->buf_size);
error_exit(errno, "error allocating block bufs "ERRF, ERRA);
ret = pread(dev_fd, super, SCOUTFS_BLOCK_SM_SIZE,
SCOUTFS_SUPER_BLKNO << SCOUTFS_BLOCK_SM_SHIFT);
error_exit(ret != SCOUTFS_BLOCK_SM_SIZE, "error reading super, ret %d", ret);
ret = scoutfs_parallel_restore_create_writer(&wri);
error_exit(ret, "create writer %d", ret);
ret = scoutfs_parallel_restore_import_super(wri, super, dev_fd);
error_exit(ret, "import super %d", ret);
rule = generate_quota(opts);
ret = scoutfs_parallel_restore_add_quota_rule(wri, rule);
free(rule);
error_exit(ret, "add quotas %d", ret);
slices = calloc(1 + opts->nr_writers, sizeof(struct scoutfs_parallel_restore_slice));
error_exit(!slices, "alloc slices");
scoutfs_parallel_restore_init_slices(wri, slices, 1 + opts->nr_writers);
ret = scoutfs_parallel_restore_add_slice(wri, &slices[0]);
error_exit(ret, "add slices[0] %d", ret);
next_ino = (SCOUTFS_ROOT_INO | SCOUTFS_LOCK_INODE_GROUP_MASK) + 1;
ino_per = opts->total_files / opts->nr_writers;
avg_dirs = (opts->low_dirs + opts->high_dirs) / 2;
avg_files = (opts->low_files + opts->high_files) / 2;
dir_height = 1;
tot_files = avg_files * opts->nr_writers;
while (tot_files < opts->total_files) {
dir_height++;
tot_files *= avg_dirs;
}
dprintf("height %llu tot %llu total %llu\n", dir_height, tot_files, opts->total_files);
clock_gettime(CLOCK_MONOTONIC_RAW, &begin);
/* start each writing process */
for (i = 0; i < opts->nr_writers; i++) {
args = calloc(1, sizeof(struct writer_args));
error_exit(!args, "alloc writer args");
args->dev_fd = dev_fd;
args->pair_fd = pair[1];
args->slice = slices[1 + i];
args->writer_nr = i;
args->dir_height = dir_height;
args->ino_start = next_ino;
args->ino_len = ino_per;
list_add_tail(&args->head, &writers);
next_ino += ino_per;
fork_writer(opts, args);
}
/* read results and watch for writers to finish */
tot_files = 0;
tot_bytes = 0;
i = 0;
while (i < opts->nr_writers) {
ret = read(pair[0], &res, sizeof(struct write_result));
error_exit(ret != sizeof(struct write_result), "result read error %d", ret);
ret = scoutfs_parallel_restore_add_progress(wri, &res.prog);
error_exit(ret, "add thr prog %d", ret);
if (res.slice.meta_len != 0) {
ret = scoutfs_parallel_restore_add_slice(wri, &res.slice);
error_exit(ret, "add thr slice %d", ret);
i++;
}
tot_files += le64_to_cpu(res.files_created);
tot_bytes += le64_to_cpu(res.bytes_written);
}
tot_bytes += write_bufs(opts, wri, buf, opts->buf_size, dev_fd);
ret = scoutfs_parallel_restore_export_super(wri, super);
error_exit(ret, "update super %d", ret);
if (!opts->read_only) {
ret = pwrite(dev_fd, super, SCOUTFS_BLOCK_SM_SIZE,
SCOUTFS_SUPER_BLKNO << SCOUTFS_BLOCK_SM_SHIFT);
error_exit(ret != SCOUTFS_BLOCK_SM_SIZE, "error writing super, ret %d", ret);
}
clock_gettime(CLOCK_MONOTONIC_RAW, &end);
scoutfs_parallel_restore_destroy_writer(&wri);
secs = ((float)end.tv_sec + ((float)end.tv_nsec/NSEC_PER_SEC)) -
((float)begin.tv_sec + ((float)begin.tv_nsec/NSEC_PER_SEC));
printf("created %llu files in %llu bytes and %f secs => %f bytes/file, %f files/sec\n",
tot_files, tot_bytes, secs,
(float)tot_bytes / tot_files, (float)tot_files / secs);
if (dev_fd >= 0)
close(dev_fd);
if (pair[0] >= 0)
close(pair[0]);
if (pair[1] >= 0)
close(pair[1]);
free(super);
free(slices);
free(buf);
return 0;
}
static int parse_low_high(char *str, u64 *low_ret, u64 *high_ret)
{
char *sep;
int ret = 0;
sep = strchr(str, ':');
if (sep) {
*sep = '\0';
ret = parse_u64(sep + 1, high_ret);
}
if (ret == 0)
ret = parse_u64(str, low_ret);
if (sep)
*sep = ':';
return ret;
}
int main(int argc, char **argv)
{
struct opts opts = {
.buf_size = (32 * 1024 * 1024),
.write_batch = 1000000,
.low_dirs = 5,
.high_dirs = 10,
.low_files = 10,
.high_files = 20,
.total_files = 100,
};
struct sigaction act = { 0 };
int ret;
int c;
opts.seed = random64();
opts.nr_writers = sysconf(_SC_NPROCESSORS_ONLN);
while ((c = getopt(argc, argv, "b:d:f:m:n:rs:w:")) != -1) {
switch(c) {
case 'b':
ret = parse_u64(optarg, &opts.write_batch);
error_exit(ret, "error parsing -b '%s'\n", optarg);
error_exit(opts.write_batch == 0, "-b can't be 0");
break;
case 'd':
ret = parse_low_high(optarg, &opts.low_dirs, &opts.high_dirs);
error_exit(ret, "error parsing -d '%s'\n", optarg);
break;
case 'f':
ret = parse_low_high(optarg, &opts.low_files, &opts.high_files);
error_exit(ret, "error parsing -f '%s'\n", optarg);
break;
case 'm':
opts.meta_path = strdup(optarg);
break;
case 'n':
ret = parse_u64(optarg, &opts.total_files);
error_exit(ret, "error parsing -n '%s'\n", optarg);
break;
case 'r':
opts.read_only = true;
break;
case 's':
ret = parse_u64(optarg, &opts.seed);
error_exit(ret, "error parsing -s '%s'\n", optarg);
break;
case 'w':
ret = parse_u64(optarg, &opts.nr_writers);
error_exit(ret, "error parsing -w '%s'\n", optarg);
break;
case '?':
printf("Unknown option '%c'\n", optopt);
usage();
exit(1);
}
}
error_exit(opts.low_dirs > opts.high_dirs, "LOW > HIGH in -d %llu:%llu",
opts.low_dirs, opts.high_dirs);
error_exit(opts.low_files > opts.high_files, "LOW > HIGH in -f %llu:%llu",
opts.low_files, opts.high_files);
error_exit(!opts.meta_path, "must specify metadata device path with -m");
printf("recreate with: -d %llu:%llu -f %llu:%llu -n %llu -s %llu -w %llu\n",
opts.low_dirs, opts.high_dirs, opts.low_files, opts.high_files,
opts.total_files, opts.seed, opts.nr_writers);
act.sa_flags = SA_SIGINFO | SA_RESTART;
act.sa_sigaction = &sigchld_handler;
ret = sigaction(SIGCHLD, &act, NULL);
error_exit(ret == -1, "error setting up signal handler");
ret = do_restore(&opts);
free(opts.meta_path);
return ret == 0 ? 0 : 1;
}

View File

@@ -1,817 +0,0 @@
#define _GNU_SOURCE /* O_DIRECT */
#include <stdlib.h>
#include <stdio.h>
#include <stdbool.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/xattr.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <ctype.h>
#include <string.h>
#include <errno.h>
#include <limits.h>
#include <time.h>
#include <sys/prctl.h>
#include <sys/socket.h>
#include <sys/signal.h>
#include <sys/statfs.h>
#include <dirent.h>
#include "../../utils/src/sparse.h"
#include "../../utils/src/util.h"
#include "../../utils/src/list.h"
#include "../../utils/src/parse.h"
#include "../../kmod/src/format.h"
#include "../../kmod/src/ioctl.h"
#include "../../utils/src/parallel_restore.h"
/*
* XXX:
*/
#define ERRF " errno %d (%s)"
#define ERRA errno, strerror(errno)
#define error_exit(cond, fmt, args...) \
do { \
if (cond) { \
printf("error: "fmt"\n", ##args); \
exit(1); \
} \
} while (0)
#define REG_MODE (S_IFREG | 0644)
#define DIR_MODE (S_IFDIR | 0755)
#define LNK_MODE (S_IFLNK | 0777)
/*
* At about 1k files we seem to be writing about 1MB of data, so
* set buffer sizes adequately above that.
*/
#define BATCH_FILES 1024
#define BUF_SIZ (2 * 1024 * 1024)
/*
 * We can't make duplicate inodes for hardlinked files, so we need
 * to track them as we generate them. Not too costly, since each
 * entry is just an integer, and the search cost shouldn't matter
 * until we get into the millions of entries, hopefully.
 */
static struct list_head hardlinks;
struct hardlink_head {
struct list_head head;
u64 ino;
};
struct opts {
char *meta_path;
char *source_dir;
};
static bool warn_scoutfs = false;
static void usage(void)
{
printf("usage:\n"
" -m PATH | path to metadata device\n"
" -s PATH | path to source directory\n"
);
}
static size_t write_bufs(struct scoutfs_parallel_restore_writer *wri,
void *buf, int dev_fd)
{
size_t total = 0;
size_t count;
off_t off;
int ret;
do {
ret = scoutfs_parallel_restore_write_buf(wri, buf, BUF_SIZ, &off, &count);
error_exit(ret, "write buf %d", ret);
if (count > 0) {
ret = pwrite(dev_fd, buf, count, off);
error_exit(ret != count, "pwrite count %zu ret %d", count, ret);
total += ret;
}
} while (count > 0);
return total;
}
struct write_result {
struct scoutfs_parallel_restore_progress prog;
struct scoutfs_parallel_restore_slice slice;
__le64 files_created;
__le64 dirs_created;
__le64 bytes_written;
bool complete;
};
static void write_bufs_and_send(struct scoutfs_parallel_restore_writer *wri,
void *buf, int dev_fd,
struct write_result *res, bool get_slice, int pair_fd)
{
size_t total;
int ret;
total = write_bufs(wri, buf, dev_fd);
le64_add_cpu(&res->bytes_written, total);
ret = scoutfs_parallel_restore_get_progress(wri, &res->prog);
error_exit(ret, "get prog %d", ret);
if (get_slice) {
ret = scoutfs_parallel_restore_get_slice(wri, &res->slice);
error_exit(ret, "thread get slice %d", ret);
}
ret = write(pair_fd, res, sizeof(struct write_result));
error_exit(ret != sizeof(struct write_result), "result send error");
memset(res, 0, sizeof(struct write_result));
}
/*
* Adding xattrs is supported for files and directories only.
*
* If the filesystem on which the path resides isn't scoutfs, we omit the
* scoutfs specific ioctl to fetch hidden xattrs.
*
* Untested if the hidden xattr ioctl works on directories or symlinks.
*/
static void add_xattrs(struct scoutfs_parallel_restore_writer *wri, char *path, u64 ino, bool is_scoutfs)
{
struct scoutfs_ioctl_listxattr_hidden lxh;
struct scoutfs_parallel_restore_xattr *xattr;
char *buf = NULL;
char *name = NULL;
int fd = -1;
int bytes;
int len;
int value_len;
int ret;
int pos = 0;
if (!is_scoutfs)
goto normal_xattrs;
fd = open(path, O_RDONLY);
error_exit(fd < 0, "open"ERRF, ERRA);
memset(&lxh, 0, sizeof(lxh));
lxh.id_pos = 0;
lxh.hash_pos = 0;
lxh.buf_bytes = 256 * 1024;
buf = malloc(lxh.buf_bytes);
error_exit(!buf, "alloc xattr_hidden buf");
lxh.buf_ptr = (unsigned long)buf;
/* hidden */
for (;;) {
ret = ioctl(fd, SCOUTFS_IOC_LISTXATTR_HIDDEN, &lxh);
if (ret == 0) /* done */
break;
error_exit(ret < 0, "listxattr_hidden"ERRF, ERRA);
bytes = ret;
error_exit(bytes > lxh.buf_bytes, "listxattr_hidden overflow");
error_exit(buf[bytes - 1] != '\0', "listxattr_hidden didn't term");
name = buf;
do {
len = strlen(name);
error_exit(len == 0, "listxattr_hidden empty name");
error_exit(len > SCOUTFS_XATTR_MAX_NAME_LEN, "listxattr_hidden long name");
/* get value len */
value_len = fgetxattr(fd, name, NULL, 0);
error_exit(value_len < 0, "malloc value hidden"ERRF, ERRA);
/* allocate everything at once */
xattr = malloc(sizeof(struct scoutfs_parallel_restore_xattr) + len + value_len);
error_exit(!xattr, "error allocating generated xattr");
*xattr = (struct scoutfs_parallel_restore_xattr) {
.ino = ino,
.pos = pos++,
.name_len = len,
.value_len = value_len,
};
xattr->name = (void *)(xattr + 1);
xattr->value = (void *)(xattr->name + len);
/* get value into xattr directly */
ret = fgetxattr(fd, name, (void *)(xattr->name + len), value_len);
error_exit(ret != value_len, "fgetxattr value"ERRF, ERRA);
memcpy(xattr->name, name, len);
ret = scoutfs_parallel_restore_add_xattr(wri, xattr);
error_exit(ret, "add hidden xattr %d", ret);
free(xattr);
name += len + 1;
bytes -= len + 1;
} while (bytes > 0);
}
free(buf);
close(fd);
normal_xattrs:
value_len = listxattr(path, NULL, 0);
error_exit(value_len < 0, "hidden listxattr "ERRF, ERRA);
if (value_len == 0)
return;
buf = calloc(1, value_len);
error_exit(!buf, "malloc value"ERRF, ERRA);
ret = listxattr(path, buf, value_len);
error_exit(ret < 0, "hidden listxattr %d", ret);
name = buf;
bytes = ret;
do {
len = strlen(name);
error_exit(len == 0, "listxattr_hidden empty name");
error_exit(len > SCOUTFS_XATTR_MAX_NAME_LEN, "listxattr_hidden long name");
value_len = getxattr(path, name, NULL, 0);
error_exit(value_len < 0, "value "ERRF, ERRA);
xattr = malloc(sizeof(struct scoutfs_parallel_restore_xattr) + len + value_len);
error_exit(!xattr, "error allocating generated xattr");
*xattr = (struct scoutfs_parallel_restore_xattr) {
.ino = ino,
.pos = pos++,
.name_len = len,
.value_len = value_len,
};
xattr->name = (void *)(xattr + 1);
xattr->value = (void *)(xattr->name + len);
ret = getxattr(path, name, (void *)(xattr->name + len), value_len);
error_exit(ret != value_len, "fgetxattr value"ERRF, ERRA);
memcpy(xattr->name, name, len);
ret = scoutfs_parallel_restore_add_xattr(wri, xattr);
error_exit(ret, "add xattr %d", ret);
free(xattr);
name += len + 1;
bytes -= len + 1;
} while (bytes > 0);
free(buf);
}
/*
 * We can't store the same inode multiple times, so we need to
 * account for hardlinks. Maintain a linked list that stores the
 * first hardlinked inode we encounter; every subsequent hardlink
 * to that inode skips inserting an inode and just adds another
 * entry.
 */
static bool is_new_inode_item(bool nlink, u64 ino)
{
struct hardlink_head *hh_tmp;
struct hardlink_head *hh;
if (!nlink)
return true;
/* linear search, pretty awful, should be a binary tree */
list_for_each_entry_safe(hh, hh_tmp, &hardlinks, head) {
if (hh->ino == ino)
return false;
}
/* insert item */
hh = malloc(sizeof(struct hardlink_head));
error_exit(!hh, "malloc");
hh->ino = ino;
list_add_tail(&hh->head, &hardlinks);
/*
* XXX
*
* As long as we don't traverse across filesystems, we can be
* confident that once we've created N entries for an N-linked
* inode it can be removed from the list. This would significantly
* improve the manageability of the list.
*
* All we'd need to do is add a counter and compare it to the nr_links
* field of the inode.
*/
return true;
}
/*
* create the inode data for a given path as best as possible
* duplicating the exact data from the source path
*/
static struct scoutfs_parallel_restore_inode *read_inode_data(char *path, u64 ino, bool *nlink, bool is_scoutfs)
{
struct scoutfs_parallel_restore_inode *inode = NULL;
struct scoutfs_ioctl_stat_more stm;
struct stat st;
int ret;
int fd;
inode = calloc(1, sizeof(struct scoutfs_parallel_restore_inode));
error_exit(!inode, "failure allocating inode");
ret = lstat(path, &st);
error_exit(ret, "failure stat inode");
/* use exact inode numbers from path, except for root ino */
if (ino != SCOUTFS_ROOT_INO)
inode->ino = st.st_ino;
else
inode->ino = SCOUTFS_ROOT_INO;
inode->mode = st.st_mode;
inode->uid = st.st_uid;
inode->gid = st.st_gid;
inode->atime = st.st_atim;
inode->ctime = st.st_ctim;
inode->mtime = st.st_mtim;
inode->size = st.st_size;
inode->rdev = st.st_rdev;
/* scoutfs specific */
inode->meta_seq = 0;
inode->data_seq = 0;
inode->crtime = st.st_ctim;
if (S_ISREG(inode->mode)) {
if (inode->size > 0)
inode->offline = true;
if (is_scoutfs) {
fd = open(path, O_RDONLY);
error_exit(!fd, "open failure"ERRF, ERRA);
ret = ioctl(fd, SCOUTFS_IOC_STAT_MORE, &stm);
error_exit(ret, "failure SCOUTFS_IOC_STAT_MORE inode");
inode->meta_seq = stm.meta_seq;
inode->data_seq = stm.data_seq;
inode->crtime = (struct timespec){.tv_sec = stm.crtime_sec, .tv_nsec = stm.crtime_nsec};
close(fd);
}
}
/* pass whether item is hardlinked or not */
*nlink = (st.st_nlink > 1);
return inode;
}
struct writer_args {
struct list_head head;
int dev_fd;
int pair_fd;
struct scoutfs_parallel_restore_slice slice;
};
static void restore_path(struct scoutfs_parallel_restore_writer *wri, struct writer_args *args, struct write_result *res, void *buf, char *path, u64 ino)
{
struct scoutfs_parallel_restore_inode *inode;
struct scoutfs_parallel_restore_entry *entry;
DIR *dirp = NULL;
char *subdir = NULL;
char link[PATH_MAX + 1];
struct dirent *ent;
struct statfs stf;
int ret = 0;
int subdir_count = 0, file_count = 0;
size_t ent_len = 0;
size_t pos = 0;
bool nlink = false;
char ind = '?';
u64 mode;
bool is_scoutfs = false;
/* get fs info once per path */
ret = statfs(path, &stf);
error_exit(ret != 0, "statfs"ERRF, ERRA);
is_scoutfs = (stf.f_type == 0x554f4353);
if (!is_scoutfs && !warn_scoutfs) {
warn_scoutfs = true;
fprintf(stderr, "Non-scoutfs source path detected: scoutfs specific features disabled\n");
}
/* traverse the entire tree */
dirp = opendir(path);
error_exit(!dirp, "opendir"ERRF, ERRA);
errno = 0;
while ((ent = readdir(dirp))) {
if (ent->d_type == DT_DIR) {
if ((strcmp(ent->d_name, ".") == 0) ||
(strcmp(ent->d_name, "..") == 0)) {
/* position still matters */
pos++;
continue;
}
/* recurse into subdir */
ret = asprintf(&subdir, "%s/%s", path, ent->d_name);
error_exit(ret == -1, "asprintf subdir"ERRF, ERRA);
restore_path(wri, args, res, buf, subdir, ent->d_ino);
subdir_count++;
ent_len += strlen(ent->d_name);
entry = malloc(sizeof(struct scoutfs_parallel_restore_entry) + strlen(ent->d_name));
error_exit(!entry, "error allocating generated entry");
*entry = (struct scoutfs_parallel_restore_entry) {
.dir_ino = ino,
.pos = pos++,
.ino = ent->d_ino,
.mode = DIR_MODE,
.name = (void *)(entry + 1),
.name_len = strlen(ent->d_name),
};
memcpy(entry->name, ent->d_name, strlen(ent->d_name));
ret = scoutfs_parallel_restore_add_entry(wri, entry);
error_exit(ret, "add entry %d", ret);
free(entry);
add_xattrs(wri, subdir, ent->d_ino, is_scoutfs);
free(subdir);
le64_add_cpu(&res->dirs_created, 1);
} else if (ent->d_type == DT_REG) {
file_count++;
ent_len += strlen(ent->d_name);
entry = malloc(sizeof(struct scoutfs_parallel_restore_entry) + strlen(ent->d_name));
error_exit(!entry, "error allocating generated entry");
*entry = (struct scoutfs_parallel_restore_entry) {
.dir_ino = ino,
.pos = pos++,
.ino = ent->d_ino,
.mode = REG_MODE,
.name = (void *)(entry + 1),
.name_len = strlen(ent->d_name),
};
memcpy(entry->name, ent->d_name, strlen(ent->d_name));
ret = scoutfs_parallel_restore_add_entry(wri, entry);
error_exit(ret, "add entry %d", ret);
free(entry);
ret = asprintf(&subdir, "%s/%s", path, ent->d_name);
error_exit(ret == -1, "asprintf subdir"ERRF, ERRA);
/* file inode */
inode = read_inode_data(subdir, ent->d_ino, &nlink, is_scoutfs);
fprintf(stdout, "f %s/%s\n", path, ent->d_name);
if (is_new_inode_item(nlink, ent->d_ino)) {
ret = scoutfs_parallel_restore_add_inode(wri, inode);
error_exit(ret, "add reg file inode %d", ret);
/* xattrs */
add_xattrs(wri, subdir, ent->d_ino, is_scoutfs);
}
free(inode);
free(subdir);
le64_add_cpu(&res->files_created, 1);
} else if (ent->d_type == DT_LNK) {
/* readlink */
ret = asprintf(&subdir, "%s/%s", path, ent->d_name);
error_exit(ret == -1, "asprintf subdir"ERRF, ERRA);
ent_len += strlen(ent->d_name);
ret = readlink(subdir, link, PATH_MAX);
error_exit(ret < 0, "readlink %d", ret);
/* must 0-terminate if we want to print it */
link[ret] = 0;
entry = malloc(sizeof(struct scoutfs_parallel_restore_entry) + strlen(ent->d_name));
error_exit(!entry, "error allocating generated entry");
*entry = (struct scoutfs_parallel_restore_entry) {
.dir_ino = ino,
.pos = pos++,
.ino = ent->d_ino,
.mode = LNK_MODE,
.name = (void *)(entry + 1),
.name_len = strlen(ent->d_name),
};
memcpy(entry->name, ent->d_name, strlen(ent->d_name));
ret = scoutfs_parallel_restore_add_entry(wri, entry);
error_exit(ret, "add symlink entry %d", ret);
/* link inode */
inode = read_inode_data(subdir, ent->d_ino, &nlink, is_scoutfs);
fprintf(stdout, "l %s/%s -> %s\n", path, ent->d_name, link);
inode->mode = LNK_MODE;
inode->target = link;
inode->target_len = strlen(link) + 1; /* scoutfs null terminates symlinks */
ret = scoutfs_parallel_restore_add_inode(wri, inode);
error_exit(ret, "add syml inode %d", ret);
free(inode);
free(subdir);
le64_add_cpu(&res->files_created, 1);
} else {
/* odd stuff */
switch(ent->d_type) {
case DT_CHR:
ind = 'c';
mode = S_IFCHR;
break;
case DT_BLK:
ind = 'b';
mode = S_IFBLK;
break;
case DT_FIFO:
ind = 'p';
mode = S_IFIFO;
break;
case DT_SOCK:
ind = 's';
mode = S_IFSOCK;
break;
default:
error_exit(true, "Unknown readdir entry type");
break;
}
file_count++;
ent_len += strlen(ent->d_name);
entry = malloc(sizeof(struct scoutfs_parallel_restore_entry) + strlen(ent->d_name));
error_exit(!entry, "error allocating generated entry");
*entry = (struct scoutfs_parallel_restore_entry) {
.dir_ino = ino,
.pos = pos++,
.ino = ent->d_ino,
.mode = mode,
.name = (void *)(entry + 1),
.name_len = strlen(ent->d_name),
};
memcpy(entry->name, ent->d_name, strlen(ent->d_name));
ret = scoutfs_parallel_restore_add_entry(wri, entry);
error_exit(ret, "add entry %d", ret);
free(entry);
ret = asprintf(&subdir, "%s/%s", path, ent->d_name);
error_exit(ret == -1, "asprintf subdir"ERRF, ERRA);
/* file inode */
inode = read_inode_data(subdir, ent->d_ino, &nlink, is_scoutfs);
fprintf(stdout, "%c %lld %s/%s\n", ind, inode->ino, path, ent->d_name);
if (is_new_inode_item(nlink, ent->d_ino)) {
ret = scoutfs_parallel_restore_add_inode(wri, inode);
error_exit(ret, "add reg file inode %d", ret);
}
free(inode);
free(subdir);
le64_add_cpu(&res->files_created, 1);
}
/* batch out changes, will be about 1M */
if (le64_to_cpu(res->files_created) > BATCH_FILES) {
write_bufs_and_send(wri, buf, args->dev_fd, res, false, args->pair_fd);
}
}
/* readdir() returns NULL at end and on error; check errno */
error_exit(errno, "readdir"ERRF, ERRA);
closedir(dirp);
/* create the dir itself */
inode = read_inode_data(path, ino, &nlink, is_scoutfs);
inode->nr_subdirs = subdir_count;
inode->total_entry_name_bytes = ent_len;
fprintf(stdout, "d %s\n", path);
ret = scoutfs_parallel_restore_add_inode(wri, inode);
error_exit(ret, "add dir inode %d", ret);
free(inode);
/* No need to send, we'll send final after last directory is complete */
}
static int do_restore(struct opts *opts)
{
struct scoutfs_parallel_restore_writer *pwri, *wri = NULL;
struct scoutfs_parallel_restore_slice *slices = NULL;
struct scoutfs_super_block *super = NULL;
struct writer_args *args;
struct write_result res;
int pair[2] = {-1, -1};
LIST_HEAD(writers);
void *buf = NULL;
void *bufp = NULL;
int dev_fd = -1;
pid_t pid;
int ret;
u64 tot_bytes;
u64 tot_dirs;
u64 tot_files;
ret = socketpair(PF_LOCAL, SOCK_STREAM, 0, pair);
error_exit(ret, "socketpair error "ERRF, ERRA);
dev_fd = open(opts->meta_path, O_DIRECT | (O_RDWR|O_EXCL));
error_exit(dev_fd < 0, "error opening '%s': "ERRF, opts->meta_path, ERRA);
errno = posix_memalign((void **)&super, 4096, SCOUTFS_BLOCK_SM_SIZE) ?:
posix_memalign((void **)&buf, 4096, BUF_SIZ);
error_exit(errno, "error allocating block bufs "ERRF, ERRA);
ret = pread(dev_fd, super, SCOUTFS_BLOCK_SM_SIZE,
SCOUTFS_SUPER_BLKNO << SCOUTFS_BLOCK_SM_SHIFT);
error_exit(ret != SCOUTFS_BLOCK_SM_SIZE, "error reading super, ret %d", ret);
error_exit((super->flags & SCOUTFS_FLAG_IS_META_BDEV) == 0, "super block is not meta dev");
ret = scoutfs_parallel_restore_create_writer(&wri);
error_exit(ret, "create writer %d", ret);
ret = scoutfs_parallel_restore_import_super(wri, super, dev_fd);
error_exit(ret, "import super %d", ret);
slices = calloc(2, sizeof(struct scoutfs_parallel_restore_slice));
error_exit(!slices, "alloc slices");
scoutfs_parallel_restore_init_slices(wri, slices, 2);
ret = scoutfs_parallel_restore_add_slice(wri, &slices[0]);
error_exit(ret, "add slices[0] %d", ret);
args = calloc(1, sizeof(struct writer_args));
error_exit(!args, "alloc writer args");
args->dev_fd = dev_fd;
args->slice = slices[1];
args->pair_fd = pair[1];
list_add_tail(&args->head, &writers);
/* fork writer process */
pid = fork();
error_exit(pid == -1, "fork error");
if (pid == 0) {
ret = prctl(PR_SET_PDEATHSIG, SIGHUP);
error_exit(ret < 0, "failed to set parent death sig");
errno = posix_memalign((void **)&bufp, 4096, BUF_SIZ);
error_exit(errno, "error allocating block bufp "ERRF, ERRA);
ret = scoutfs_parallel_restore_create_writer(&pwri);
error_exit(ret, "create pwriter %d", ret);
ret = scoutfs_parallel_restore_add_slice(pwri, &args->slice);
error_exit(ret, "add pslice %d", ret);
memset(&res, 0, sizeof(res));
restore_path(pwri, args, &res, bufp, opts->source_dir, SCOUTFS_ROOT_INO);
res.complete = true;
write_bufs_and_send(pwri, buf, args->dev_fd, &res, true, args->pair_fd);
scoutfs_parallel_restore_destroy_writer(&pwri);
free(bufp);
exit(0);
}
/* read results and wait for writer to finish */
tot_bytes = 0;
tot_dirs = 1;
tot_files = 0;
for (;;) {
ret = read(pair[0], &res, sizeof(struct write_result));
error_exit(ret != sizeof(struct write_result), "result read error %d", ret);
ret = scoutfs_parallel_restore_add_progress(wri, &res.prog);
error_exit(ret, "add thr prog %d", ret);
if (res.slice.meta_len != 0) {
ret = scoutfs_parallel_restore_add_slice(wri, &res.slice);
error_exit(ret, "add thr slice %d", ret);
if (res.complete)
break;
}
tot_bytes += le64_to_cpu(res.bytes_written);
tot_files += le64_to_cpu(res.files_created);
tot_dirs += le64_to_cpu(res.dirs_created);
}
tot_bytes += write_bufs(wri, buf, args->dev_fd);
fprintf(stdout, "Wrote %lld directories, %lld files, %lld bytes total\n",
tot_dirs, tot_files, tot_bytes);
/* write super to finalize */
ret = scoutfs_parallel_restore_export_super(wri, super);
error_exit(ret, "update super %d", ret);
ret = pwrite(dev_fd, super, SCOUTFS_BLOCK_SM_SIZE,
SCOUTFS_SUPER_BLKNO << SCOUTFS_BLOCK_SM_SHIFT);
error_exit(ret != SCOUTFS_BLOCK_SM_SIZE, "error writing super, ret %d", ret);
scoutfs_parallel_restore_destroy_writer(&wri);
if (dev_fd >= 0)
close(dev_fd);
if (pair[0] >= 0)
close(pair[0]);
if (pair[1] >= 0)
close(pair[1]);
free(super);
free(args);
free(slices);
free(buf);
return 0;
}
int main(int argc, char **argv)
{
struct opts opts = (struct opts){ 0 };
struct hardlink_head *hh_tmp;
struct hardlink_head *hh;
int ret;
int c;
INIT_LIST_HEAD(&hardlinks);
while ((c = getopt(argc, argv, "b:m:s:")) != -1) {
switch(c) {
case 'm':
opts.meta_path = strdup(optarg);
break;
case 's':
opts.source_dir = strdup(optarg);
break;
case '?':
printf("Unknown option '%c'\n", optopt);
usage();
exit(1);
}
}
error_exit(!opts.meta_path, "must specify metadata device path with -m");
error_exit(!opts.source_dir, "must specify source directory path with -s");
ret = do_restore(&opts);
free(opts.meta_path);
free(opts.source_dir);
list_for_each_entry_safe(hh, hh_tmp, &hardlinks, head) {
list_del_init(&hh->head);
free(hh);
}
return ret == 0 ? 0 : 1;
}

View File

@@ -11,7 +11,7 @@ FILE="$T_D0/file"
# final block as we truncated past it.
#
echo "== truncate writes zeroed partial end of file block"
yes | dd of="$FILE" bs=8K count=1 status=none iflag=fullblock
yes 2>/dev/null | dd of="$FILE" bs=8K count=1 status=none iflag=fullblock
sync
# not passing iflag=fullblock causes the file occasionally to just be

View File

@@ -88,6 +88,11 @@ rm -rf "$SCR/xattrs"
echo "== make sure we can create again"
file="$SCR/file-after"
C=120
while (( C-- )); do
touch $file 2> /dev/null && break
sleep 1
done
touch $file
setfattr -n user.scoutfs-enospc -v 1 "$file"
sync

View File

@@ -11,8 +11,8 @@
# format version.
#
# not supported on el9!
if [ $(source /etc/os-release ; echo ${VERSION_ID:0:1}) -gt 8 ]; then
# not supported on el8 or higher
if [ $(source /etc/os-release ; echo ${VERSION_ID:0:1}) -gt 7 ]; then
t_skip_permitted "Unsupported OS version"
fi

View File

@@ -10,30 +10,6 @@ EXTENTS_PER_BTREE_BLOCK=600
EXTENTS_PER_LIST_BLOCK=8192
FREED_EXTENTS=$((EXTENTS_PER_BTREE_BLOCK * EXTENTS_PER_LIST_BLOCK))
#
# This test specifically creates a pathologically sparse file that will
# be as expensive as possible to free. This is usually fine on
# dedicated or reasonable hardware, but trying to run this in
# virtualized debug kernels can take a very long time. This test is
# about making sure that the server doesn't fail, not that the platform
# can handle the scale of work that our btree formats happen to require
# while execution is bogged down with use-after-free memory reference
# tracking. So we give the test a lot more breathing room before
# deciding that its hung.
#
echo "== setting longer hung task timeout"
if [ -w /proc/sys/kernel/hung_task_timeout_secs ]; then
secs=$(cat /proc/sys/kernel/hung_task_timeout_secs)
test "$secs" -gt 0 || \
t_fail "confusing value '$secs' from /proc/sys/kernel/hung_task_timeout_secs"
restore_hung_task_timeout()
{
echo "$secs" > /proc/sys/kernel/hung_task_timeout_secs
}
trap restore_hung_task_timeout EXIT
echo "$((secs * 5))" > /proc/sys/kernel/hung_task_timeout_secs
fi
echo "== creating fragmented extents"
fragmented_data_extents $FREED_EXTENTS $EXTENTS_PER_BTREE_BLOCK "$T_D0/alloc" "$T_D0/move"

View File

@@ -38,6 +38,6 @@ while [ "$SECONDS" -lt "$END" ]; do
done
echo "== stopping background load"
kill $load_pids
t_silent_kill $load_pids
t_pass

View File

@@ -0,0 +1,40 @@
#
# We had a lock server refcounting bug that could let one thread get a
# reference on a lock struct that was being freed by another thread. We
# were able to reproduce this by having all clients try and produce a
# lot of read and null requests.
#
# This will manifest as a hung lock and timed-out test runs, probably
# with hung task messages on the console. Depending on how the race
# turns out, it can trigger KASAN warnings in
# process_waiting_requests().
#
READERS_PER=3
SECS=30
echo "=== setup"
touch "$T_D0/file"
echo "=== spin reading and shrinking"
END=$((SECONDS + SECS))
for m in $(t_fs_nrs); do
eval file="\$T_D${m}/file"
# lots of tasks reading as fast as they can
for t in $(seq 1 $READERS_PER); do
(while [ $SECONDS -lt $END ]; do
stat $file > /dev/null
done) &
done
# one task shrinking (triggering null requests) and reading
(while [ $SECONDS -lt $END ]; do
stat $file > /dev/null
t_trigger_arm_silent statfs_lock_purge $m
stat -f "$file" > /dev/null
done) &
done
wait
t_pass

54
tests/tests/mmap.sh Normal file
View File

@@ -0,0 +1,54 @@
#
# test mmap() and normal read/write consistency between different nodes
#
t_require_commands mmap_stress mmap_validate scoutfs xfs_io
echo "== mmap_stress"
mmap_stress 8192 2000 "$T_D0/mmap_stress" "$T_D1/mmap_stress" "$T_D2/mmap_stress" "$T_D3/mmap_stress" "$T_D4/mmap_stress" | sed 's/:.*//g' | sort
echo "== basic mmap/read/write consistency checks"
mmap_validate 256 1000 "$T_D0/mmap_val1" "$T_D1/mmap_val1"
mmap_validate 8192 1000 "$T_D0/mmap_val2" "$T_D1/mmap_val2"
mmap_validate 88400 1000 "$T_D0/mmap_val3" "$T_D1/mmap_val3"
echo "== mmap read from offline extent"
F="$T_D0/mmap-offline"
touch "$F"
xfs_io -c "pwrite -S 0xEA 0 8192" "$F" > /dev/null
cp "$F" "${F}-stage"
vers=$(scoutfs stat -s data_version "$F")
scoutfs release "$F" -V "$vers" -o 0 -l 8192
scoutfs get-fiemap -L "$F"
xfs_io -c "mmap -rwx 0 8192" \
-c "mread -v 512 16" "$F" &
sleep 1
# should be 1 - data waiting
jobs | wc -l
scoutfs stage "${F}-stage" "$F" -V "$vers" -o 0 -l 8192
# xfs_io thread <here> will output 16 bytes of read data
sleep 1
# should be 0 - no more waiting jobs, xfs_io should have exited
jobs | wc -l
scoutfs get-fiemap -L "$F"
echo "== mmap write to an offline extent"
# reuse the same file
scoutfs release "$F" -V "$vers" -o 0 -l 8192
scoutfs get-fiemap -L "$F"
xfs_io -c "mmap -rwx 0 8192" \
-c "mwrite -S 0x11 528 16" "$F" &
sleep 1
# should be 1 job waiting
jobs | wc -l
scoutfs stage "${F}-stage" "$F" -V "$vers" -o 0 -l 8192
# no output here from write
sleep 1
# should be 0 - no more waiting jobs, xfs_io should have exited
jobs | wc -l
scoutfs get-fiemap -L "$F"
# read back contents to assure write changed the file
dd status=none if="$F" bs=1 count=48 skip=512 | hexdump -C
echo "== done"
t_pass

View File

@@ -157,7 +157,7 @@ echo "truncate should be waiting for first block:"
expect_wait "$DIR/file" "change_size" $ino 0
scoutfs stage "$DIR/golden" "$DIR/file" -V "$vers" -o 0 -l $BYTES
sleep .1
echo "trunate should no longer be waiting:"
echo "truncate should no longer be waiting:"
scoutfs data-waiting -B 0 -I 0 -p "$DIR" | wc -l
cat "$DIR/golden" > "$DIR/file"
vers=$(scoutfs stat -s data_version "$DIR/file")
@@ -168,10 +168,13 @@ scoutfs release "$DIR/file" -V "$vers" -o 0 -l $BYTES
# overwrite, not truncate+write
dd if="$DIR/other" of="$DIR/file" \
bs=$BS count=$BLOCKS conv=notrunc status=none &
pid="$!"
sleep .1
echo "should be waiting for write"
expect_wait "$DIR/file" "write" $ino 0
scoutfs stage "$DIR/golden" "$DIR/file" -V "$vers" -o 0 -l $BYTES
# wait for the background dd to complete
wait "$pid" 2> /dev/null
cmp "$DIR/file" "$DIR/other"
echo "== cleanup"

View File

@@ -5,18 +5,6 @@
t_require_commands sleep touch sync stat handle_cat kill rm
t_require_mounts 2
#
# usually bash prints an annoying output message when jobs
# are killed. We can avoid that by redirecting stderr for
# the bash process when it reaps the jobs that are killed.
#
silent_kill() {
exec {ERR}>&2 2>/dev/null
kill "$@"
wait "$@"
exec 2>&$ERR {ERR}>&-
}
#
# We don't have a great way to test that inode items still exist. We
# don't prevent opening handles with nlink 0 today, so we'll use that.
@@ -52,7 +40,7 @@ inode_exists $ino || echo "$ino didn't exist"
echo "== orphan from failed evict deletion is picked up"
# pending kill signal stops evict from getting locks and deleting
silent_kill $pid
t_silent_kill $pid
t_set_sysfs_mount_option 0 orphan_scan_delay_ms 1000
sleep 5
inode_exists $ino && echo "$ino still exists"
@@ -70,7 +58,7 @@ for nr in $(t_fs_nrs); do
rm -f "$path"
done
sync
silent_kill $pids
t_silent_kill $pids
for nr in $(t_fs_nrs); do
t_force_umount $nr
done
@@ -79,10 +67,49 @@ t_mount_all
while test -d $(echo /sys/fs/scoutfs/*/fence/* | cut -d " " -f 1); do
sleep .5
done
sv=$(t_server_nr)
# wait for reclaim_open_log_tree() to complete for each mount
while [ $(t_counter reclaimed_open_logs $sv) -lt $T_NR_MOUNTS ]; do
sleep 1
done
# wait for finalize_and_start_log_merge() to find no active merges in flight
# and not find any finalized trees
while [ $(t_counter log_merge_no_finalized $sv) -lt 1 ]; do
sleep 1
done
# wait for orphan scans to run
t_set_all_sysfs_mount_options orphan_scan_delay_ms 1000
# also have to wait for delayed log merge work from mount
sleep 15
# wait until we see two consecutive orphan scan attempts without
# any inode deletion forward progress in each mount
for nr in $(t_fs_nrs); do
C=0
LOSA=$(t_counter orphan_scan_attempts $nr)
LDOP=$(t_counter inode_deleted $nr)
while [ $C -lt 2 ]; do
sleep 1
OSA=$(t_counter orphan_scan_attempts $nr)
DOP=$(t_counter inode_deleted $nr)
if [ $OSA != $LOSA ]; then
if [ $DOP == $LDOP ]; then
(( C++ ))
else
C=0
fi
fi
LOSA=$OSA
LDOP=$DOP
done
done
for ino in $inos; do
inode_exists $ino && echo "$ino still exists"
done
@@ -131,7 +158,7 @@ while [ $SECONDS -lt $END ]; do
done
# trigger eviction deletion of each file in each mount
silent_kill $pids
t_silent_kill $pids
wait || t_fail "handle_fsetxattr failed"

View File

@@ -1,78 +0,0 @@
#
# validate parallel restore library
#
t_require_commands scoutfs parallel_restore find xargs
SCR="$T_TMPDIR/mnt.scratch"
mkdir -p "$SCR"
scratch_mkfs() {
scoutfs mkfs "$@" \
-A -f -Q 0,127.0.0.1,53000 $T_EX_META_DEV $T_EX_DATA_DEV
}
scratch_check() {
# give ample time for writes to commit
sleep 1
sync
scoutfs check -d ${T_TMPDIR}/check.debug $T_EX_META_DEV $T_EX_DATA_DEV
}
scratch_mount() {
mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 $T_EX_DATA_DEV $SCR
}
echo "== simple mkfs/restore/mount"
# meta device just big enough for reserves and the metadata we'll fill
scratch_mkfs -V 2 -m 10G -d 60G > $T_TMP.mkfs.out 2>&1 || t_fail "mkfs failed"
parallel_restore -m "$T_EX_META_DEV" > /dev/null || t_fail "parallel_restore"
scratch_check || t_fail "check failed"
scratch_mount
scoutfs statfs -p "$SCR" | grep -v -e 'fsid' -e 'rid'
find "$SCR" -exec scoutfs list-hidden-xattrs {} \; | wc
scoutfs search-xattrs -p "$SCR" scoutfs.hide.srch.sam_vol_F01030L6 -p "$SCR" | wc
find "$SCR" -type f -name "file-*" | head -n 4 | xargs -n 1 scoutfs get-fiemap -L
scoutfs df -p "$SCR"
scoutfs quota-list -p "$SCR"
umount "$SCR"
scratch_check || t_fail "check after mount failed"
echo "== just under ENOSPC"
scratch_mkfs -V 2 -m 10G -d 60G > $T_TMP.mkfs.out 2>&1 || t_fail "mkfs failed"
parallel_restore -m "$T_EX_META_DEV" -n 3000000 > /dev/null || t_fail "parallel_restore"
scratch_check || t_fail "check failed"
scratch_mount
scoutfs df -p "$SCR"
umount "$SCR"
scratch_check || t_fail "check after mount failed"
echo "== just over ENOSPC"
scratch_mkfs -V 2 -m 10G -d 60G > $T_TMP.mkfs.out 2>&1 || t_fail "mkfs failed"
parallel_restore -m "$T_EX_META_DEV" -n 3500000 | grep died 2>&1 && t_fail "parallel_restore"
scratch_check || t_fail "check failed"
echo "== ENOSPC"
scratch_mkfs -V 2 -m 10G -d 60G > $T_TMP.mkfs.out 2>&1 || t_fail "mkfs failed"
parallel_restore -m "$T_EX_META_DEV" -d 600:1000 -f 600:1000 -n 4000000 | grep died 2>&1 && t_fail "parallel_restore"
echo "== attempt to restore data device"
scratch_mkfs -V 2 -m 10G -d 60G > $T_TMP.mkfs.out 2>&1 || t_fail "mkfs failed"
parallel_restore -m "$T_EX_DATA_DEV" | grep died 2>&1 && t_fail "parallel_restore"
echo "== attempt format_v1 restore"
scratch_mkfs -V 1 -m 10G -d 60G > $T_TMP.mkfs.out 2>&1 || t_fail "mkfs failed"
parallel_restore -m "$T_EX_META_DEV" | grep died 2>&1 && t_fail "parallel_restore"
echo "== test if previously mounted"
scratch_mkfs -V 2 -m 10G -d 60G > $T_TMP.mkfs.out 2>&1 || t_fail "mkfs failed"
mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 \
"$T_EX_DATA_DEV" "$SCR"
umount "$SCR"
parallel_restore -m "$T_EX_META_DEV" | grep died 2>&1 && t_fail "parallel_restore"
echo "== cleanup"
rmdir "$SCR"
t_pass

View File

@@ -0,0 +1,37 @@
#
# verify d_off output of xfs_io is consistent.
#
t_require_commands xfs_io
filt()
{
grep d_off | cut -d ' ' -f 1,4-
}
echo "== create content"
for s in $(seq 1 7 250); do
f=$(printf '%*s' $s | tr ' ' 'a')
touch ${T_D0}/$f
done
echo "== readdir all"
xfs_io -c "readdir -v" $T_D0 | filt
echo "== readdir offset"
xfs_io -c "readdir -v -o 20" $T_D0 | filt
echo "== readdir len (bytes)"
xfs_io -c "readdir -v -l 193" $T_D0 | filt
echo "== introduce gap"
for s in $(seq 57 7 120); do
f=$(printf '%*s' $s | tr ' ' 'a')
rm -f ${T_D0}/$f
done
xfs_io -c "readdir -v" $T_D0 | filt
echo "== cleanup"
rm -rf $T_D0
t_pass

View File

@@ -65,26 +65,14 @@ EOF
cat << EOF > local.exclude
generic/003 # missing atime update in buffered read
generic/029 # mmap missing
generic/030 # mmap missing
generic/075 # file content mismatch failures (fds, etc)
generic/080 # mmap missing
generic/103 # enospc causes trans commit failures
generic/108 # mount fails on failing device?
generic/112 # file content mismatch failures (fds, etc)
generic/120 # (can't exec 'cause no mmap)
generic/126 # (can't exec 'cause no mmap)
generic/141 # mmap missing
generic/213 # enospc causes trans commit failures
generic/215 # mmap missing
generic/246 # mmap missing
generic/247 # mmap missing
generic/248 # mmap missing
generic/318 # can't support user namespaces until v5.11
generic/321 # requires selinux enabled for '+' in ls?
generic/325 # mmap missing
generic/338 # BUG_ON update inode error handling
generic/346 # mmap missing
generic/347 # _dmthin_mount doesn't work?
generic/356 # swap
generic/357 # swap
@@ -92,16 +80,13 @@ generic/409 # bind mounts not scripted yet
generic/410 # bind mounts not scripted yet
generic/411 # bind mounts not scripted yet
generic/423 # symlink inode size is strlen() + 1 on scoutfs
generic/428 # mmap missing
generic/430 # xfs_io copy_range missing in el7
generic/431 # xfs_io copy_range missing in el7
generic/432 # xfs_io copy_range missing in el7
generic/433 # xfs_io copy_range missing in el7
generic/434 # xfs_io copy_range missing in el7
generic/437 # mmap missing
generic/441 # dm-mapper
generic/444 # el9's posix_acl_update_mode is buggy ?
generic/452 # exec test - no mmap
generic/467 # open_by_handle ESTALE
generic/472 # swap
generic/484 # dm-mapper
@@ -118,11 +103,9 @@ generic/565 # xfs_io copy_range missing in el7
generic/568 # falloc not resulting in block count increase
generic/569 # swap
generic/570 # swap
generic/614 # mmap missing
generic/620 # dm-hugedisk
generic/633 # mmap, id-mapped mounts missing in el7
generic/633 # id-mapped mounts missing in el7
generic/636 # swap
generic/638 # mmap missing
generic/641 # swap
generic/643 # swap
EOF

View File

@@ -7,7 +7,7 @@ FMTIOC_H := format.h ioctl.h
FMTIOC_KMOD := $(addprefix ../kmod/src/,$(FMTIOC_H))
CFLAGS := -Wall -O2 -Werror -D_FILE_OFFSET_BITS=64 -g -msse4.2 \
-I src/ -fno-strict-aliasing \
-fno-strict-aliasing \
-DSCOUTFS_FORMAT_HASH=0x$(SCOUTFS_FORMAT_HASH)LLU
ifneq ($(wildcard $(firstword $(FMTIOC_KMOD))),)
@@ -15,13 +15,10 @@ CFLAGS += -I../kmod/src
endif
BIN := src/scoutfs
OBJ_DIRS := src src/check
OBJ := $(foreach dir,$(OBJ_DIRS),$(patsubst %.c,%.o,$(wildcard $(dir)/*.c)))
DEPS := $(foreach dir,$(OBJ_DIRS),$(wildcard $(dir)/*.d))
OBJ := $(patsubst %.c,%.o,$(wildcard src/*.c))
DEPS := $(wildcard */*.d)
AR := src/scoutfs_parallel_restore.a
all: $(BIN) $(AR)
all: $(BIN)
ifneq ($(DEPS),)
-include $(DEPS)
@@ -39,10 +36,6 @@ $(BIN): $(OBJ)
$(QU) [BIN $@]
$(VE)gcc -o $@ $^ -luuid -lm -lcrypto -lblkid
$(AR): $(OBJ)
$(QU) [AR $@]
$(VE)ar rcs $@ $^
%.o %.d: %.c Makefile sparse.sh
$(QU) [CC $<]
$(VE)gcc $(CFLAGS) -MD -MP -MF $*.d -c $< -o $*.o

View File

@@ -130,6 +130,24 @@ the server for the filesystem if it is elected leader.
The assigned number must match one of the slots defined with \-Q options
when the filesystem was created with mkfs. If the number assigned
doesn't match a number created during mkfs then the mount will fail.
.TP
.B tcp_keepalive_timeout_ms=<number>
This option sets the amount of time, in milliseconds, that a client
connection will wait for active TCP packets before deciding that
the connection is dead. This setting is per-mount and only changes
the behavior of that mount.
.sp
The default value of this setting is 10000ms (10s). Any precision
beyond a whole second is likely unrealistic given the nature of the
TCP keepalive mechanisms in the Linux kernel. Valid values are any
value higher than 3000ms (3s). Values higher than 30000ms (30s)
will likely interfere with other internal timeout values.
.sp
The TCP keepalive mechanism is complex, and observing a lost
connection quickly is important for maintaining cluster stability.
If the local network suffers from intermittent outages, this option
may provide enough slack to ride out those outages without the
cluster becoming desynchronized.
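.sp
For example, a mount that gives clients roughly 15 seconds of
keepalive silence before declaring the connection dead (the device
and mount point paths here are placeholders):
.sp
.nf
mount -t scoutfs -o metadev_path=/dev/meta,quorum_slot_nr=0,tcp_keepalive_timeout_ms=15000 /dev/data /mnt/scoutfs
.fi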
.SH VOLUME OPTIONS
Volume options are persistent options which are stored in the super
block in the metadata device and which apply to all mounts of the volume.

View File

@@ -76,41 +76,6 @@ run when the file system will not be mounted.
.RE
.PD
.TP
.BI "check META-DEVICE DATA-DEVICE [-d|--debug FILE]"
.sp
Performs an offline file system check. The program iterates through all the
data structures on disk directly - the filesystem must not be mounted while
this operation is running.
.RS 1.0i
.PD 0
.sp
.TP
.B "-d, --debug FILE"
An output file where the program can output debug information about the
state of the filesystem as it performs the check. If
.B FILE
is "-", the debug output is written to the Standard Error output.
.TP
.RE
.sp
.B RETURN VALUE
The check function can return the following exit codes:
.RS
.TP
\fB 0 \fR - no filesystem issues detected
.TP
\fB 1 \fR - file system issues were detected
.TP
\fB 8 \fR - operational error
.TP
\fB 16 \fR - usage error
.TP
\fB 32 \fR - cancelled by user (SIGINT)
.TP
.RE
.PD
.TP
.BI "counters [-t|--table] SYSFS-DIR"
.sp

View File

@@ -54,8 +54,6 @@ cp man/*.8.gz $RPM_BUILD_ROOT%{_mandir}/man8/.
install -m 755 -D src/scoutfs $RPM_BUILD_ROOT%{_sbindir}/scoutfs
install -m 644 -D src/ioctl.h $RPM_BUILD_ROOT%{_includedir}/scoutfs/ioctl.h
install -m 644 -D src/format.h $RPM_BUILD_ROOT%{_includedir}/scoutfs/format.h
install -m 644 -D src/parallel_restore.h $RPM_BUILD_ROOT%{_includedir}/scoutfs/parallel_restore.h
install -m 644 -D src/scoutfs_parallel_restore.a $RPM_BUILD_ROOT%{_libdir}/scoutfs/libscoutfs_parallel_restore.a
install -m 755 -D fenced/scoutfs-fenced $RPM_BUILD_ROOT%{_libexecdir}/scoutfs-fenced/scoutfs-fenced
install -m 644 -D fenced/scoutfs-fenced.service $RPM_BUILD_ROOT%{_unitdir}/scoutfs-fenced.service
install -m 644 -D fenced/scoutfs-fenced.conf.example $RPM_BUILD_ROOT%{_sysconfdir}/scoutfs/scoutfs-fenced.conf.example
@@ -72,7 +70,6 @@ install -m 644 -D fenced/scoutfs-fenced.conf.example $RPM_BUILD_ROOT%{_sysconfdi
%files -n scoutfs-devel
%defattr(644,root,root,755)
%{_includedir}/scoutfs
%{_libdir}/scoutfs
%clean
rm -rf %{buildroot}

View File

@@ -1,7 +1,7 @@
#!/bin/bash
# can we find sparse? If not, we're done.
which sparse > /dev/null 2>&1 || exit 0
# sparse is required. Fail if it's missing; discard the path it prints on success.
which sparse > /dev/null || exit 1
#
# one of the problems with using sparse in userspace is that it picks up
@@ -22,6 +22,11 @@ RE="$RE|warning: memset with byte count of 4194304"
# some sparse versions don't know about some builtins
RE="$RE|error: undefined identifier '__builtin_fpclassify'"
# on el8, sparse can't handle __has_include for some reason when _GNU_SOURCE
# is defined, and we need that for O_DIRECT.
RE="$RE|note: in included file .through /usr/include/sys/stat.h.:"
RE="$RE|/usr/include/bits/statx.h:30:6: error: "
#
# don't filter out 'too many errors' here, it can signify that
# sparse doesn't understand something and is throwing a *ton*

View File

@@ -1,166 +0,0 @@
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <sys/mman.h>
#include <errno.h>
#include "sparse.h"
#include "util.h"
#include "format.h"
#include "bitmap.h"
#include "key.h"
#include "alloc.h"
#include "block.h"
#include "btree.h"
#include "extent.h"
#include "iter.h"
#include "sns.h"
/*
* We check the list blocks serially.
*
* XXX:
* - compare ref seqs
* - detect cycles?
*/
int alloc_list_meta_iter(struct scoutfs_alloc_list_head *lhead, extent_cb_t cb, void *cb_arg)
{
struct scoutfs_alloc_list_block *lblk;
struct scoutfs_block_ref ref;
struct block *blk = NULL;
u64 blkno;
int ret;
ref = lhead->ref;
while (ref.blkno) {
blkno = le64_to_cpu(ref.blkno);
ret = cb(blkno, 1, cb_arg);
if (ret < 0) {
ret = xlate_iter_errno(ret);
goto out;
}
ret = block_get(&blk, blkno, 0);
if (ret < 0)
goto out;
lblk = block_buf(blk);
/* XXX verify block */
ret = block_hdr_valid(blk, blkno, 0, SCOUTFS_BLOCK_MAGIC_ALLOC_LIST);
if (ret < 0)
goto out;
/* XXX sort? maybe */
ref = lblk->next;
block_put(&blk);
}
ret = 0;
out:
return ret;
}
int alloc_root_meta_iter(struct scoutfs_alloc_root *root, extent_cb_t cb, void *cb_arg)
{
return btree_meta_iter(&root->root, cb, cb_arg);
}
int alloc_list_extent_iter(struct scoutfs_alloc_list_head *lhead, extent_cb_t cb, void *cb_arg)
{
struct scoutfs_alloc_list_block *lblk;
struct scoutfs_block_ref ref;
struct block *blk = NULL;
u64 blkno;
int ret;
int i;
ref = lhead->ref;
while (ref.blkno) {
blkno = le64_to_cpu(ref.blkno);
ret = block_get(&blk, blkno, 0);
if (ret < 0)
goto out;
sns_push("alloc_list_block", blkno, 0);
lblk = block_buf(blk);
/* XXX verify block */
ret = block_hdr_valid(blk, blkno, 0, SCOUTFS_BLOCK_MAGIC_ALLOC_LIST);
if (ret < 0)
goto out;
/* XXX sort? maybe */
ret = 0;
for (i = 0; i < le32_to_cpu(lblk->nr); i++) {
blkno = le64_to_cpu(lblk->blknos[le32_to_cpu(lblk->start) + i]);
ret = cb(blkno, 1, cb_arg);
if (ret < 0)
break;
}
ref = lblk->next;
block_put(&blk);
sns_pop();
if (ret < 0) {
ret = xlate_iter_errno(ret);
goto out;
}
}
ret = 0;
out:
return ret;
}
static bool valid_free_extent_key(struct scoutfs_key *key)
{
return (key->sk_zone == SCOUTFS_FREE_EXTENT_BLKNO_ZONE ||
key->sk_zone == SCOUTFS_FREE_EXTENT_ORDER_ZONE) &&
(!key->_sk_fourth && !key->sk_type &&
(key->sk_zone == SCOUTFS_FREE_EXTENT_ORDER_ZONE || !key->_sk_third));
}
static int free_item_cb(struct scoutfs_key *key, void *val, u16 val_len, void *cb_arg)
{
struct extent_cb_arg_t *ecba = cb_arg;
u64 start;
u64 len;
/* XXX not sure these eios are what we want */
if (val_len != 0)
return -EIO;
if (!valid_free_extent_key(key))
return -EIO;
if (key->sk_zone == SCOUTFS_FREE_EXTENT_ORDER_ZONE)
return -ECHECK_ITER_DONE;
start = le64_to_cpu(key->skfb_end) - le64_to_cpu(key->skfb_len) + 1;
len = le64_to_cpu(key->skfb_len);
return ecba->cb(start, len, ecba->cb_arg);
}
/*
* Call the callback with each of the primary BLKNO free extents stored
* in item in the given alloc root. It doesn't visit the secondary
* ORDER extents.
*/
int alloc_root_extent_iter(struct scoutfs_alloc_root *root, extent_cb_t cb, void *cb_arg)
{
struct extent_cb_arg_t ecba = { .cb = cb, .cb_arg = cb_arg };
return btree_item_iter(&root->root, free_item_cb, &ecba);
}

View File

@@ -1,12 +0,0 @@
#ifndef _SCOUTFS_UTILS_CHECK_ALLOC_H
#define _SCOUTFS_UTILS_CHECK_ALLOC_H
#include "extent.h"
int alloc_list_meta_iter(struct scoutfs_alloc_list_head *lhead, extent_cb_t cb, void *cb_arg);
int alloc_root_meta_iter(struct scoutfs_alloc_root *root, extent_cb_t cb, void *cb_arg);
int alloc_list_extent_iter(struct scoutfs_alloc_list_head *lhead, extent_cb_t cb, void *cb_arg);
int alloc_root_extent_iter(struct scoutfs_alloc_root *root, extent_cb_t cb, void *cb_arg);
#endif

View File

@@ -1,613 +0,0 @@
#define _ISOC11_SOURCE /* aligned_alloc */
#define _DEFAULT_SOURCE /* syscall() */
#include <stdlib.h>
#include <unistd.h>
#include <stdbool.h>
#include <stdio.h>
#include <errno.h>
#include <sys/syscall.h>
#include <linux/aio_abi.h>
#include "sparse.h"
#include "util.h"
#include "format.h"
#include "list.h"
#include "cmp.h"
#include "hash.h"
#include "block.h"
#include "debug.h"
#include "super.h"
#include "eno.h"
#include "crc.h"
#include "sns.h"
static struct block_data {
struct list_head *hash_lists;
size_t hash_nr;
struct list_head active_head;
struct list_head inactive_head;
struct list_head dirty_list;
size_t nr_active;
size_t nr_inactive;
size_t nr_dirty;
int meta_fd;
size_t max_cached;
size_t nr_events;
aio_context_t ctx;
struct iocb *iocbs;
struct iocb **iocbps;
struct io_event *events;
} global_bdat;
struct block {
struct list_head hash_head;
struct list_head lru_head;
struct list_head dirty_head;
struct list_head submit_head;
unsigned long refcount;
unsigned long uptodate:1,
active:1;
u64 blkno;
void *buf;
size_t size;
};
#define BLK_FMT \
"blkno %llu rc %ld d %u a %u"
#define BLK_ARG(blk) \
(blk)->blkno, (blk)->refcount, !list_empty(&(blk)->dirty_head), (blk)->active
#define debug_blk(blk, fmt, args...) \
debug(fmt " " BLK_FMT, ##args, BLK_ARG(blk))
/*
* This just allocates and initializes the block.  The caller is
* responsible for putting it on the appropriate initial lists and
* managing refcounts.
*/
static struct block *alloc_block(struct block_data *bdat, u64 blkno, size_t size)
{
struct block *blk;
blk = calloc(1, sizeof(struct block));
if (blk) {
blk->buf = aligned_alloc(4096, size); /* XXX static alignment :/ */
if (!blk->buf) {
free(blk);
blk = NULL;
} else {
INIT_LIST_HEAD(&blk->hash_head);
INIT_LIST_HEAD(&blk->lru_head);
INIT_LIST_HEAD(&blk->dirty_head);
INIT_LIST_HEAD(&blk->submit_head);
blk->blkno = blkno;
blk->size = size;
}
}
return blk;
}
static void free_block(struct block_data *bdat, struct block *blk)
{
debug_blk(blk, "free");
if (!list_empty(&blk->lru_head)) {
if (blk->active)
bdat->nr_active--;
else
bdat->nr_inactive--;
list_del(&blk->lru_head);
}
if (!list_empty(&blk->dirty_head)) {
bdat->nr_dirty--;
list_del(&blk->dirty_head);
}
if (!list_empty(&blk->hash_head))
list_del(&blk->hash_head);
if (!list_empty(&blk->submit_head))
list_del(&blk->submit_head);
free(blk->buf);
free(blk);
}
static bool blk_is_dirty(struct block *blk)
{
return !list_empty(&blk->dirty_head);
}
/*
* Rebalance the cache.
*
* First we shrink the cache to limit it to max_cached blocks.
* Logically, we walk from oldest to newest in the inactive list and
* then in the active list. Since these lists are physically one
* list_head list we achieve this with a reverse walk starting from the
* active head.
*
* Then we rebalance the size of the two lists.  The constraint is that
* we don't let the active list grow larger than the inactive list. We
* move blocks from the oldest tail of the active list to the newest
* head of the inactive list.
*
* <- [active head] <-> [ .. active list .. ] <-> [inactive head] <-> [ .. inactive list .. ] ->
*/
static void rebalance_cache(struct block_data *bdat)
{
struct block *blk;
struct block *blk_;
list_for_each_entry_safe_reverse(blk, blk_, &bdat->active_head, lru_head) {
if ((bdat->nr_active + bdat->nr_inactive) < bdat->max_cached)
break;
if (&blk->lru_head == &bdat->inactive_head || blk->refcount > 0 ||
blk_is_dirty(blk))
continue;
free_block(bdat, blk);
}
list_for_each_entry_safe_reverse(blk, blk_, &bdat->inactive_head, lru_head) {
if (bdat->nr_active <= bdat->nr_inactive || &blk->lru_head == &bdat->active_head)
break;
list_move(&blk->lru_head, &bdat->inactive_head);
blk->active = 0;
bdat->nr_active--;
bdat->nr_inactive++;
}
}
static void make_active(struct block_data *bdat, struct block *blk)
{
if (!blk->active) {
if (!list_empty(&blk->lru_head)) {
list_move(&blk->lru_head, &bdat->active_head);
bdat->nr_inactive--;
} else {
list_add(&blk->lru_head, &bdat->active_head);
}
blk->active = 1;
bdat->nr_active++;
}
}
static int compar_iocbp(const void *A, const void *B)
{
struct iocb *a = *(struct iocb **)A;
struct iocb *b = *(struct iocb **)B;
return scoutfs_cmp(a->aio_offset, b->aio_offset);
}
static int submit_and_wait(struct block_data *bdat, struct list_head *list)
{
struct io_event *event;
struct iocb *iocb;
struct block *blk;
int ret;
int err;
int nr;
int i;
err = 0;
nr = 0;
list_for_each_entry(blk, list, submit_head) {
iocb = &bdat->iocbs[nr];
bdat->iocbps[nr] = iocb;
memset(iocb, 0, sizeof(struct iocb));
iocb->aio_data = (intptr_t)blk;
iocb->aio_lio_opcode = blk_is_dirty(blk) ? IOCB_CMD_PWRITE : IOCB_CMD_PREAD;
iocb->aio_fildes = bdat->meta_fd;
iocb->aio_buf = (intptr_t)blk->buf;
iocb->aio_nbytes = blk->size;
iocb->aio_offset = blk->blkno * blk->size;
nr++;
debug_blk(blk, "submit");
if ((nr < bdat->nr_events) && blk->submit_head.next != list)
continue;
qsort(bdat->iocbps, nr, sizeof(bdat->iocbps[0]), compar_iocbp);
ret = syscall(__NR_io_submit, bdat->ctx, nr, bdat->iocbps);
if (ret != nr) {
if (ret >= 0)
errno = EIO;
ret = -errno;
fprintf(stderr, "fatal system error submitting async IO: "ENO_FMT"\n",
ENO_ARG(-ret));
goto out;
}
ret = syscall(__NR_io_getevents, bdat->ctx, nr, nr, bdat->events, NULL);
if (ret != nr) {
if (ret >= 0)
errno = EIO;
ret = -errno;
fprintf(stderr, "fatal system error getting IO events: "ENO_FMT"\n",
ENO_ARG(-ret));
goto out;
}
ret = 0;
for (i = 0; i < nr; i++) {
event = &bdat->events[i];
iocb = (struct iocb *)(intptr_t)event->obj;
blk = (struct block *)(intptr_t)event->data;
debug_blk(blk, "complete res %lld", (long long)event->res);
if (event->res >= 0 && event->res != blk->size)
event->res = -EIO;
/* io errors are fatal */
if (event->res < 0) {
ret = event->res;
goto out;
}
if (iocb->aio_lio_opcode == IOCB_CMD_PREAD) {
blk->uptodate = 1;
} else {
list_del_init(&blk->dirty_head);
bdat->nr_dirty--;
}
}
nr = 0;
}
ret = 0;
out:
return ret ?: err;
}
static void inc_refcount(struct block *blk)
{
blk->refcount++;
}
void block_put(struct block **blkp)
{
struct block_data *bdat = &global_bdat;
struct block *blk = *blkp;
if (blk) {
blk->refcount--;
*blkp = NULL;
rebalance_cache(bdat);
}
}
static struct list_head *hash_bucket(struct block_data *bdat, u64 blkno)
{
u32 hash = scoutfs_hash32(&blkno, sizeof(blkno));
return &bdat->hash_lists[hash % bdat->hash_nr];
}
int block_hdr_valid(struct block *blk, u64 blkno, int bf, u32 magic)
{
struct scoutfs_block_header *hdr;
size_t size = (bf & BF_SM) ? SCOUTFS_BLOCK_SM_SIZE : SCOUTFS_BLOCK_LG_SIZE;
int ret;
u32 crc;
ret = block_get(&blk, blkno, bf);
if (ret < 0) {
fprintf(stderr, "error reading block %llu\n", blkno);
goto out;
}
hdr = block_buf(blk);
crc = crc_block(hdr, size);
/*
* a bad CRC is easy to repair, so we pass a different error code
* back. Unless the other data is also wrong - then it's EINVAL
* to signal that this isn't a valid block hdr at all.
*/
if (le32_to_cpu(hdr->crc) != crc)
ret = -EIO; /* keep checking other fields */
if (le32_to_cpu(hdr->magic) != magic)
ret = -EINVAL;
/*
* Our first caller fills in global_super. Until this completes,
* we can't do this check.
*/
if ((blkno != SCOUTFS_SUPER_BLKNO) &&
(hdr->fsid != global_super->hdr.fsid))
ret = -EINVAL;
debug("%s blk_hdr_valid blkno %llu size %lu crc 0x%08x magic 0x%08x ret %d",
sns_str(), blkno, size, le32_to_cpu(hdr->crc), le32_to_cpu(hdr->magic),
ret);
block_put(&blk);
out:
return ret;
}
static struct block *get_or_alloc(struct block_data *bdat, u64 blkno, int bf)
{
struct list_head *bucket = hash_bucket(bdat, blkno);
struct block *search;
struct block *blk;
size_t size;
size = (bf & BF_SM) ? SCOUTFS_BLOCK_SM_SIZE : SCOUTFS_BLOCK_LG_SIZE;
blk = NULL;
list_for_each_entry(search, bucket, hash_head) {
if (search->blkno == blkno && search->size == size) {
blk = search;
break;
}
}
if (!blk) {
blk = alloc_block(bdat, blkno, size);
if (blk) {
list_add(&blk->hash_head, bucket);
list_add(&blk->lru_head, &bdat->inactive_head);
bdat->nr_inactive++;
}
}
if (blk)
inc_refcount(blk);
return blk;
}
/*
* Get a block.
*
* The caller holds a refcount to the block while it's in use that
* prevents it from being removed from the cache. It must be dropped
* with block_put();
*/
int block_get(struct block **blk_ret, u64 blkno, int bf)
{
struct block_data *bdat = &global_bdat;
struct block *blk;
LIST_HEAD(list);
int ret;
blk = get_or_alloc(bdat, blkno, bf);
if (!blk) {
ret = -ENOMEM;
goto out;
}
if ((bf & BF_ZERO)) {
memset(blk->buf, 0, blk->size);
blk->uptodate = 1;
}
if (bf & BF_OVERWRITE)
blk->uptodate = 1;
if (!blk->uptodate) {
list_add(&blk->submit_head, &list);
ret = submit_and_wait(bdat, &list);
list_del_init(&blk->submit_head);
if (ret < 0)
goto out;
}
if ((bf & BF_DIRTY) && !blk_is_dirty(blk)) {
list_add_tail(&blk->dirty_head, &bdat->dirty_list);
bdat->nr_dirty++;
}
make_active(bdat, blk);
rebalance_cache(bdat);
ret = 0;
out:
if (ret < 0)
block_put(&blk);
*blk_ret = blk;
return ret;
}
void *block_buf(struct block *blk)
{
return blk->buf;
}
size_t block_size(struct block *blk)
{
return blk->size;
}
/*
* Drop the block from the cache, regardless of whether it is dirty or not.
* This is used to avoid writing blocks which were dirtied but then
* later freed.
*
* The block is immediately freed and can't be referenced after this
* returns.
*/
void block_drop(struct block **blkp)
{
struct block_data *bdat = &global_bdat;
free_block(bdat, *blkp);
*blkp = NULL;
rebalance_cache(bdat);
}
/*
* This doesn't quite work for mixing large and small blocks, but that's
* fine, we never do that.
*/
static int compar_u64(const void *A, const void *B)
{
u64 a = *((u64 *)A);
u64 b = *((u64 *)B);
return scoutfs_cmp(a, b);
}
/*
* This read-ahead is synchronous and errors are ignored. If any of the
* blknos aren't present in the cache then we issue concurrent reads for
* them and wait. Any existing cached blocks will be left as is.
*
* We might be trying to read a lot more than the number of events so we
* sort the caller's blknos before iterating over them rather than
* relying on submission sorting the blocks in each submitted set.
*/
void block_readahead(u64 *blknos, size_t nr)
{
struct block_data *bdat = &global_bdat;
struct block *blk;
struct block *blk_;
LIST_HEAD(list);
size_t i;
if (nr == 0)
return;
qsort(blknos, nr, sizeof(blknos[0]), compar_u64);
for (i = 0; i < nr; i++) {
blk = get_or_alloc(bdat, blknos[i], 0);
if (blk) {
if (!blk->uptodate)
list_add_tail(&blk->submit_head, &list);
else
block_put(&blk);
}
}
(void)submit_and_wait(bdat, &list);
list_for_each_entry_safe(blk, blk_, &list, submit_head) {
list_del_init(&blk->submit_head);
block_put(&blk);
}
rebalance_cache(bdat);
}
/*
* The caller's block changes form a consistent transaction.  If the number of dirty
* blocks is large enough we issue a write.
*/
int block_try_commit(bool force)
{
struct block_data *bdat = &global_bdat;
struct block *blk;
struct block *blk_;
LIST_HEAD(list);
int ret;
if (!force && bdat->nr_dirty < bdat->nr_events)
return 0;
list_for_each_entry(blk, &bdat->dirty_list, dirty_head) {
list_add_tail(&blk->submit_head, &list);
inc_refcount(blk);
}
ret = submit_and_wait(bdat, &list);
list_for_each_entry_safe(blk, blk_, &list, submit_head) {
list_del_init(&blk->submit_head);
block_put(&blk);
}
if (ret < 0) {
fprintf(stderr, "error writing dirty transaction blocks\n");
goto out;
}
ret = block_get(&blk, SCOUTFS_SUPER_BLKNO, BF_SM | BF_OVERWRITE | BF_DIRTY);
if (ret == 0) {
list_add(&blk->submit_head, &list);
ret = submit_and_wait(bdat, &list);
list_del_init(&blk->submit_head);
block_put(&blk);
}
if (ret < 0)
fprintf(stderr, "error writing super block to commit transaction\n");
out:
rebalance_cache(bdat);
return ret;
}
int block_setup(int meta_fd, size_t max_cached_bytes, size_t max_dirty_bytes)
{
struct block_data *bdat = &global_bdat;
size_t i;
int ret;
bdat->max_cached = DIV_ROUND_UP(max_cached_bytes, SCOUTFS_BLOCK_LG_SIZE);
bdat->hash_nr = bdat->max_cached / 4;
bdat->nr_events = DIV_ROUND_UP(max_dirty_bytes, SCOUTFS_BLOCK_LG_SIZE);
bdat->iocbs = calloc(bdat->nr_events, sizeof(bdat->iocbs[0]));
bdat->iocbps = calloc(bdat->nr_events, sizeof(bdat->iocbps[0]));
bdat->events = calloc(bdat->nr_events, sizeof(bdat->events[0]));
bdat->hash_lists = calloc(bdat->hash_nr, sizeof(bdat->hash_lists[0]));
if (!bdat->iocbs || !bdat->iocbps || !bdat->events || !bdat->hash_lists) {
ret = -ENOMEM;
goto out;
}
INIT_LIST_HEAD(&bdat->active_head);
INIT_LIST_HEAD(&bdat->inactive_head);
INIT_LIST_HEAD(&bdat->dirty_list);
bdat->meta_fd = meta_fd;
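/* the active and inactive LRU lists are chained into one physical list; see rebalance_cache() */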
list_add(&bdat->inactive_head, &bdat->active_head);
for (i = 0; i < bdat->hash_nr; i++)
INIT_LIST_HEAD(&bdat->hash_lists[i]);
ret = syscall(__NR_io_setup, bdat->nr_events, &bdat->ctx);
if (ret < 0)
ret = -errno;
out:
if (ret < 0) {
free(bdat->iocbs);
free(bdat->iocbps);
free(bdat->events);
free(bdat->hash_lists);
}
return ret;
}
void block_shutdown(void)
{
struct block_data *bdat = &global_bdat;
syscall(SYS_io_destroy, bdat->ctx);
free(bdat->iocbs);
free(bdat->iocbps);
free(bdat->events);
free(bdat->hash_lists);
}

View File

@@ -1,34 +0,0 @@
#ifndef _SCOUTFS_UTILS_CHECK_BLOCK_H_
#define _SCOUTFS_UTILS_CHECK_BLOCK_H_
#include <unistd.h>
#include <stdbool.h>
struct block;
#include "sparse.h"
/* block flags passed to block_get() */
enum {
BF_ZERO = (1 << 0), /* zero contents buf as block is returned */
BF_DIRTY = (1 << 1), /* block will be written with transaction */
BF_SM = (1 << 2), /* small 4k block instead of large 64k block */
BF_OVERWRITE = (1 << 3), /* caller will overwrite contents, don't read */
};
int block_get(struct block **blk_ret, u64 blkno, int bf);
void block_put(struct block **blkp);
void *block_buf(struct block *blk);
size_t block_size(struct block *blk);
void block_drop(struct block **blkp);
void block_readahead(u64 *blknos, size_t nr);
int block_try_commit(bool force);
int block_setup(int meta_fd, size_t max_cached_bytes, size_t max_dirty_bytes);
void block_shutdown(void);
int block_hdr_valid(struct block *blk, u64 blkno, int bf, u32 magic);
#endif
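For reference, a minimal usage sketch of this API as it existed before removal (the block number and flag combination are hypothetical; error handling is elided):

	struct block *blk = NULL;
	u64 blkno = 1;	/* hypothetical block number */
	int ret;

	ret = block_get(&blk, blkno, BF_SM | BF_OVERWRITE | BF_DIRTY);
	if (ret == 0) {
		memset(block_buf(blk), 0, block_size(blk));	/* caller promised to overwrite */
		block_put(&blk);
		ret = block_try_commit(true);	/* force dirty blocks out, then the super */
	}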

View File

@@ -1,217 +0,0 @@
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <errno.h>
#include "sparse.h"
#include "util.h"
#include "format.h"
#include "key.h"
#include "avl.h"
#include "block.h"
#include "btree.h"
#include "extent.h"
#include "iter.h"
#include "sns.h"
#include "meta.h"
#include "problem.h"
static inline void *item_val(struct scoutfs_btree_block *bt, struct scoutfs_btree_item *item)
{
return (void *)bt + le16_to_cpu(item->val_off);
}
static void readahead_refs(struct scoutfs_btree_block *bt)
{
struct scoutfs_btree_item *item;
struct scoutfs_avl_node *node;
struct scoutfs_block_ref *ref;
u64 *blknos;
u64 blkno;
u16 valid = 0;
u16 nr = le16_to_cpu(bt->nr_items);
int i;
blknos = calloc(nr, sizeof(blknos[0]));
if (!blknos)
return;
node = avl_first(&bt->item_root);
for (i = 0; i < nr; i++) {
item = container_of(node, struct scoutfs_btree_item, node);
ref = item_val(bt, item);
blkno = le64_to_cpu(ref->blkno);
if (valid_meta_blkno(blkno))
blknos[valid++] = blkno;
node = avl_next(&bt->item_root, &item->node);
}
if (valid > 0)
block_readahead(blknos, valid);
free(blknos);
}
/*
* Call the callback on the referenced block. Then if the block
* contains references, read it and recurse into all its references.
*/
static int btree_ref_meta_iter(struct scoutfs_block_ref *ref, unsigned level, extent_cb_t cb,
void *cb_arg)
{
struct scoutfs_btree_item *item;
struct scoutfs_btree_block *bt;
struct scoutfs_avl_node *node;
struct block *blk = NULL;
u64 blkno;
int ret;
int i;
blkno = le64_to_cpu(ref->blkno);
if (!blkno)
return 0;
ret = cb(blkno, 1, cb_arg);
if (ret < 0) {
ret = xlate_iter_errno(ret);
return ret;
}
if (level == 0)
return 0;
ret = block_get(&blk, blkno, 0);
if (ret < 0)
return ret;
ret = block_hdr_valid(blk, blkno, 0, SCOUTFS_BLOCK_MAGIC_BTREE);
if (ret < 0)
return ret;
sns_push("btree_parent", blkno, 0);
bt = block_buf(blk);
/* XXX integrate verification with block cache */
if (bt->level != level) {
problem(PB_BTREE_BLOCK_BAD_LEVEL, "expected %u level %u", level, bt->level);
ret = -EINVAL;
goto out;
}
/* read-ahead last level of parents */
if (level == 2)
readahead_refs(bt);
node = avl_first(&bt->item_root);
for (i = 0; i < le16_to_cpu(bt->nr_items); i++) {
item = container_of(node, struct scoutfs_btree_item, node);
ref = item_val(bt, item);
ret = btree_ref_meta_iter(ref, level - 1, cb, cb_arg);
if (ret < 0)
goto out;
node = avl_next(&bt->item_root, &item->node);
}
ret = 0;
out:
block_put(&blk);
sns_pop();
return ret;
}
int btree_meta_iter(struct scoutfs_btree_root *root, extent_cb_t cb, void *cb_arg)
{
/* XXX check root */
if (root->height == 0)
return 0;
return btree_ref_meta_iter(&root->ref, root->height - 1, cb, cb_arg);
}
static int btree_ref_item_iter(struct scoutfs_block_ref *ref, unsigned level,
btree_item_cb_t cb, void *cb_arg)
{
struct scoutfs_btree_item *item;
struct scoutfs_btree_block *bt;
struct scoutfs_avl_node *node;
struct block *blk = NULL;
u64 blkno;
int ret;
int i;
blkno = le64_to_cpu(ref->blkno);
if (!blkno)
return 0;
ret = block_get(&blk, blkno, 0);
if (ret < 0)
return ret;
if (level)
sns_push("btree_parent", blkno, 0);
else
sns_push("btree_leaf", blkno, 0);
ret = block_hdr_valid(blk, blkno, 0, SCOUTFS_BLOCK_MAGIC_BTREE);
if (ret < 0)
goto out;
bt = block_buf(blk);
/* XXX integrate verification with block cache */
if (bt->level != level) {
problem(PB_BTREE_BLOCK_BAD_LEVEL, "expected %u level %u", level, bt->level);
ret = -EINVAL;
goto out;
}
/* read-ahead leaves that contain items */
if (level == 1)
readahead_refs(bt);
node = avl_first(&bt->item_root);
for (i = 0; i < le16_to_cpu(bt->nr_items); i++) {
item = container_of(node, struct scoutfs_btree_item, node);
if (level) {
ref = item_val(bt, item);
ret = btree_ref_item_iter(ref, level - 1, cb, cb_arg);
} else {
ret = cb(&item->key, item_val(bt, item),
le16_to_cpu(item->val_len), cb_arg);
debug("free item key "SK_FMT" ret %d", SK_ARG(&item->key), ret);
}
if (ret < 0) {
ret = xlate_iter_errno(ret);
goto out;
}
node = avl_next(&bt->item_root, &item->node);
}
ret = 0;
out:
block_put(&blk);
sns_pop();
return ret;
}
int btree_item_iter(struct scoutfs_btree_root *root, btree_item_cb_t cb, void *cb_arg)
{
/* XXX check root */
if (root->height == 0)
return 0;
return btree_ref_item_iter(&root->ref, root->height - 1, cb, cb_arg);
}

View File

@@ -1,14 +0,0 @@
#ifndef _SCOUTFS_UTILS_CHECK_BTREE_H_
#define _SCOUTFS_UTILS_CHECK_BTREE_H_
#include "util.h"
#include "format.h"
#include "extent.h"
typedef int (*btree_item_cb_t)(struct scoutfs_key *key, void *val, u16 val_len, void *cb_arg);
int btree_meta_iter(struct scoutfs_btree_root *root, extent_cb_t cb, void *cb_arg);
int btree_item_iter(struct scoutfs_btree_root *root, btree_item_cb_t cb, void *cb_arg);
#endif

View File

@@ -1,184 +0,0 @@
#define _GNU_SOURCE /* O_DIRECT */
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <errno.h>
#include <string.h>
#include <assert.h>
#include <stdbool.h>
#include <argp.h>
#include "sparse.h"
#include "parse.h"
#include "util.h"
#include "format.h"
#include "ioctl.h"
#include "cmd.h"
#include "dev.h"
#include "alloc.h"
#include "block.h"
#include "debug.h"
#include "meta.h"
#include "super.h"
#include "problem.h"
struct check_args {
char *meta_device;
char *data_device;
char *debug_path;
};
static int do_check(struct check_args *args)
{
int debug_fd = -1;
int meta_fd = -1;
int data_fd = -1;
int ret;
if (args->debug_path) {
if (strcmp(args->debug_path, "-") == 0)
debug_fd = dup(STDERR_FILENO);
else
debug_fd = open(args->debug_path, O_WRONLY | O_CREAT | O_TRUNC, 0644);
if (debug_fd < 0) {
ret = -errno;
fprintf(stderr, "error opening debug output file '%s': %s (%d)\n",
args->debug_path, strerror(errno), errno);
goto out;
}
debug_enable(debug_fd);
}
meta_fd = open(args->meta_device, O_DIRECT | O_RDWR | O_EXCL);
if (meta_fd < 0) {
ret = -errno;
fprintf(stderr, "failed to open meta device '%s': %s (%d)\n",
args->meta_device, strerror(errno), errno);
goto out;
}
data_fd = open(args->data_device, O_DIRECT | O_RDWR | O_EXCL);
if (data_fd < 0) {
ret = -errno;
fprintf(stderr, "failed to open data device '%s': %s (%d)\n",
args->data_device, strerror(errno), errno);
goto out;
}
ret = block_setup(meta_fd, 128 * 1024 * 1024, 32 * 1024 * 1024);
if (ret < 0)
goto out;
/*
* At some point we may convert this to a multi-pass system where we
* try to repair items and, as long as repairs are made, rerun the
* checks. We may need to count how many problems we fix in each of
* these loops, so that we don't stall on unrepairable problems and
* keep making actual repair progress. IOW - when a full check loop
* fixes no problems, we stop trying.
*/
ret = check_supers(data_fd) ?:
check_super_in_use(meta_fd) ?:
check_meta_alloc() ?:
check_super_crc();
if (ret < 0)
goto out;
debug("problem count %lu", problems_count());
if (problems_count() > 0)
printf("Problems detected.\n");
out:
/* and tear it all down */
block_shutdown();
super_shutdown();
debug_disable();
if (meta_fd >= 0)
close(meta_fd);
if (data_fd >= 0)
close(data_fd);
if (debug_fd >= 0)
close(debug_fd);
return ret;
}
static int parse_opt(int key, char *arg, struct argp_state *state)
{
struct check_args *args = state->input;
switch (key) {
case 'd':
args->debug_path = strdup_or_error(state, arg);
break;
case 'e':
case ARGP_KEY_ARG:
if (!args->meta_device)
args->meta_device = strdup_or_error(state, arg);
else if (!args->data_device)
args->data_device = strdup_or_error(state, arg);
else
argp_error(state, "more than two device arguments given");
break;
case ARGP_KEY_FINI:
if (!args->meta_device)
argp_error(state, "no metadata device argument given");
if (!args->data_device)
argp_error(state, "no data device argument given");
break;
default:
break;
}
return 0;
}
static struct argp_option options[] = {
{ "debug", 'd', "FILE_PATH", 0, "Path to debug output file, will be created or truncated"},
{ NULL }
};
static struct argp argp = {
options,
parse_opt,
"META-DEVICE DATA-DEVICE",
"Check filesystem consistency"
};
/* Exit codes used by fsck-type programs */
#define FSCK_EX_NONDESTRUCT 1 /* File system errors corrected */
#define FSCK_EX_UNCORRECTED 4 /* File system errors left uncorrected */
#define FSCK_EX_ERROR 8 /* Operational error */
#define FSCK_EX_USAGE 16 /* Usage or syntax error */
static int check_cmd(int argc, char **argv)
{
struct check_args check_args = {NULL};
int ret;
ret = argp_parse(&argp, argc, argv, 0, NULL, &check_args);
if (ret)
exit(FSCK_EX_USAGE);
ret = do_check(&check_args);
if (ret < 0)
ret = FSCK_EX_ERROR;
if (problems_count() > 0)
ret |= FSCK_EX_UNCORRECTED;
exit(ret);
}
static void __attribute__((constructor)) check_ctor(void)
{
cmd_register_argp("check", &argp, GROUP_CORE, check_cmd);
}

View File

@@ -1,16 +0,0 @@
#include <stdlib.h>
#include "debug.h"
int debug_fd = -1;
void debug_enable(int fd)
{
debug_fd = fd;
}
void debug_disable(void)
{
if (debug_fd >= 0)
debug_fd = -1;
}

View File

@@ -1,17 +0,0 @@
#ifndef _SCOUTFS_UTILS_CHECK_DEBUG_H_
#define _SCOUTFS_UTILS_CHECK_DEBUG_H_
#include <stdio.h>
#define debug(fmt, args...) \
do { \
if (debug_fd >= 0) \
dprintf(debug_fd, fmt"\n", ##args); \
} while (0)
extern int debug_fd;
void debug_enable(int fd);
void debug_disable(void);
#endif

View File

@@ -1,9 +0,0 @@
#ifndef _SCOUTFS_UTILS_CHECK_ENO_H_
#define _SCOUTFS_UTILS_CHECK_ENO_H_
#include <errno.h>
#define ENO_FMT "%d (%s)"
#define ENO_ARG(eno) eno, strerror(eno)
#endif

View File

@@ -1,313 +0,0 @@
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <errno.h>
#include "util.h"
#include "lk_rbtree_wrapper.h"
#include "debug.h"
#include "extent.h"
/*
* In-memory extent management in rbtree nodes.
*/
bool extents_overlap(u64 a_start, u64 a_len, u64 b_start, u64 b_len)
{
u64 a_end = a_start + a_len;
u64 b_end = b_start + b_len;
return !((a_end <= b_start) || (b_end <= a_start));
}
static int ext_contains(struct extent_node *ext, u64 start, u64 len)
{
return ext->start <= start && ext->start + ext->len >= start + len;
}
/*
* True if the given extent is bisected by the given range; there's
* leftover containing extents on both the left and right sides of the
* range in the extent.
*/
static int ext_bisected(struct extent_node *ext, u64 start, u64 len)
{
return ext->start < start && ext->start + ext->len > start + len;
}
static struct extent_node *ext_from_rbnode(struct rb_node *rbnode)
{
return rbnode ? container_of(rbnode, struct extent_node, rbnode) : NULL;
}
static struct extent_node *next_ext(struct extent_node *ext)
{
return ext ? ext_from_rbnode(rb_next(&ext->rbnode)) : NULL;
}
static struct extent_node *prev_ext(struct extent_node *ext)
{
return ext ? ext_from_rbnode(rb_prev(&ext->rbnode)) : NULL;
}
struct walk_results {
unsigned bisect_to_leaf:1;
struct extent_node *found;
struct extent_node *next;
struct rb_node *parent;
struct rb_node **node;
};
static void walk_extents(struct extent_root *root, u64 start, u64 len, struct walk_results *wlk)
{
struct rb_node **node = &root->rbroot.rb_node;
struct extent_node *ext;
u64 end = start + len;
int cmp;
wlk->found = NULL;
wlk->next = NULL;
wlk->parent = NULL;
while (*node) {
wlk->parent = *node;
ext = ext_from_rbnode(*node);
cmp = end <= ext->start ? -1 :
start >= ext->start + ext->len ? 1 : 0;
if (cmp < 0) {
node = &ext->rbnode.rb_left;
wlk->next = ext;
} else if (cmp > 0) {
node = &ext->rbnode.rb_right;
} else {
wlk->found = ext;
if (!(wlk->bisect_to_leaf && ext_bisected(ext, start, len)))
break;
/* walk right so we can insert greater right from bisection */
node = &ext->rbnode.rb_right;
}
}
wlk->node = node;
}
/*
* Return an extent that overlaps with the given range.
*/
int extent_lookup(struct extent_root *root, u64 start, u64 len, struct extent_node *found)
{
struct walk_results wlk = { 0, };
int ret;
walk_extents(root, start, len, &wlk);
if (wlk.found) {
memset(found, 0, sizeof(struct extent_node));
found->start = wlk.found->start;
found->len = wlk.found->len;
ret = 0;
} else {
ret = -ENOENT;
}
return ret;
}
/*
* Callers can iterate through direct node references and are entirely
* responsible for consistency when doing so.
*/
struct extent_node *extent_first(struct extent_root *root)
{
struct walk_results wlk = { 0, };
walk_extents(root, 0, 1, &wlk);
return wlk.found ?: wlk.next;
}
struct extent_node *extent_next(struct extent_node *ext)
{
return next_ext(ext);
}
struct extent_node *extent_prev(struct extent_node *ext)
{
return prev_ext(ext);
}
/*
* Insert a new extent into the tree. We can extend existing nodes,
* merge with neighbours, or remove existing extents entirely if we
* insert a range that fully spans existing nodes.
*/
static int walk_insert(struct extent_root *root, u64 start, u64 len, int found_err)
{
struct walk_results wlk = { 0, };
struct extent_node *ext;
struct extent_node *nei;
int ret;
walk_extents(root, start, len, &wlk);
ext = wlk.found;
if (ext && found_err) {
ret = found_err;
goto out;
}
if (!ext) {
ext = malloc(sizeof(struct extent_node));
if (!ext) {
ret = -ENOMEM;
goto out;
}
ext->start = start;
ext->len = len;
rb_link_node(&ext->rbnode, wlk.parent, wlk.node);
rb_insert_color(&ext->rbnode, &root->rbroot);
}
/* start by expanding an existing extent if our range is larger */
if (start < ext->start) {
ext->len += ext->start - start;
ext->start = start;
}
if (ext->start + ext->len < start + len)
ext->len += (start + len) - (ext->start + ext->len);
/* drop any fully spanned neighbors, possibly merging with a final adjacent one */
while ((nei = prev_ext(ext))) {
if (nei->start + nei->len < ext->start)
break;
if (nei->start < ext->start) {
ext->len += ext->start - nei->start;
ext->start = nei->start;
}
rb_erase(&nei->rbnode, &root->rbroot);
free(nei);
}
while ((nei = next_ext(ext))) {
if (ext->start + ext->len < nei->start)
break;
if (ext->start + ext->len < nei->start + nei->len)
ext->len += (nei->start + nei->len) - (ext->start + ext->len);
rb_erase(&nei->rbnode, &root->rbroot);
free(nei);
}
ret = 0;
out:
if (ret < 0)
debug("start %llu len %llu ret %d", start, len, ret);
return ret;
}
/*
* Insert a new extent. The specified extent must not overlap with any
* existing extents or -EEXIST is returned.
*/
int extent_insert_new(struct extent_root *root, u64 start, u64 len)
{
return walk_insert(root, start, len, -EEXIST);
}
/*
* Insert an extent, extending any existing extents that may overlap.
*/
int extent_insert_extend(struct extent_root *root, u64 start, u64 len)
{
return walk_insert(root, start, len, false);
}
/*
* Remove the specified extent from an existing node. The given extent must be fully
* contained in a single node or -ENOENT is returned.
*/
int extent_remove(struct extent_root *root, u64 start, u64 len)
{
struct extent_node *ext;
struct extent_node *ins;
struct walk_results wlk = {
.bisect_to_leaf = 1,
};
int ret;
walk_extents(root, start, len, &wlk);
if (!(ext = wlk.found) || !ext_contains(ext, start, len)) {
ret = -ENOENT;
goto out;
}
if (ext_bisected(ext, start, len)) {
debug("found bisected start %llu len %llu", ext->start, ext->len);
ins = malloc(sizeof(struct extent_node));
if (!ins) {
ret = -ENOMEM;
goto out;
}
ins->start = start + len;
ins->len = (ext->start + ext->len) - ins->start;
rb_link_node(&ins->rbnode, wlk.parent, wlk.node);
rb_insert_color(&ins->rbnode, &root->rbroot);
}
if (start > ext->start) {
ext->len = start - ext->start;
} else if (len < ext->len) {
ext->start += len;
ext->len -= len;
} else {
rb_erase(&ext->rbnode, &root->rbroot);
}
ret = 0;
out:
debug("start %llu len %llu ret %d", start, len, ret);
return ret;
}
void extent_root_init(struct extent_root *root)
{
root->rbroot = RB_ROOT;
root->total = 0;
}
void extent_root_free(struct extent_root *root)
{
struct extent_node *ext;
struct rb_node *node;
struct rb_node *tmp;
for (node = rb_first(&root->rbroot); node && ((tmp = rb_next(node)), 1); node = tmp) {
ext = rb_entry(node, struct extent_node, rbnode);
rb_erase(&ext->rbnode, &root->rbroot);
free(ext);
}
}
void extent_root_print(struct extent_root *root)
{
struct extent_node *ext;
struct rb_node *node;
struct rb_node *tmp;
for (node = rb_first(&root->rbroot); node && ((tmp = rb_next(node)), 1); node = tmp) {
ext = rb_entry(node, struct extent_node, rbnode);
debug(" start %llu len %llu", ext->start, ext->len);
}
}

View File

@@ -1,38 +0,0 @@
#ifndef _SCOUTFS_UTILS_CHECK_EXTENT_H_
#define _SCOUTFS_UTILS_CHECK_EXTENT_H_
#include "lk_rbtree_wrapper.h"
struct extent_root {
struct rb_root rbroot;
u64 total;
};
struct extent_node {
struct rb_node rbnode;
u64 start;
u64 len;
};
typedef int (*extent_cb_t)(u64 start, u64 len, void *arg);
struct extent_cb_arg_t {
extent_cb_t cb;
void *cb_arg;
};
bool extents_overlap(u64 a_start, u64 a_len, u64 b_start, u64 b_len);
int extent_lookup(struct extent_root *root, u64 start, u64 len, struct extent_node *found);
struct extent_node *extent_first(struct extent_root *root);
struct extent_node *extent_next(struct extent_node *ext);
struct extent_node *extent_prev(struct extent_node *ext);
int extent_insert_new(struct extent_root *root, u64 start, u64 len);
int extent_insert_extend(struct extent_root *root, u64 start, u64 len);
int extent_remove(struct extent_root *root, u64 start, u64 len);
void extent_root_init(struct extent_root *root);
void extent_root_free(struct extent_root *root);
void extent_root_print(struct extent_root *root);
#endif
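For reference, a minimal sketch of how these calls compose (the extent values are hypothetical; return codes are ignored for brevity):

	struct extent_root root;

	extent_root_init(&root);
	extent_insert_new(&root, 100, 10);	/* tree holds [100,110) */
	extent_insert_extend(&root, 105, 10);	/* merged into [100,115) */
	extent_remove(&root, 102, 3);		/* bisected into [100,102) and [105,115) */
	extent_root_free(&root);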

View File

@@ -1,540 +0,0 @@
#define _GNU_SOURCE /* O_DIRECT */
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <errno.h>
#include <string.h>
#include <stdbool.h>
#include <argp.h>
#include "sparse.h"
#include "bitmap.h"
#include "parse.h"
#include "util.h"
#include "format.h"
#include "crc.h"
#include "cmd.h"
#include "dev.h"
#include "alloc.h"
#include "block.h"
#include "btree.h"
#include "log_trees.h"
#include "super.h"
/* the largest positive value representable in off_t */
#define OFF_MAX (off_t)((u64)((off_t)~0ULL) >> 1)
#define SCOUTFS_META_IMAGE_HEADER_MAGIC 0x8aee00d098fa60c5ULL
#define SCOUTFS_META_IMAGE_BLOCK_HEADER_MAGIC 0x70bd5e9269effd86ULL
struct scoutfs_meta_image_header {
__le64 magic;
__le64 total_bytes;
__le32 version;
} __packed;
struct scoutfs_meta_image_block_header {
__le64 magic;
__le64 offset;
__le32 size;
__le32 crc;
} __packed;
struct image_args {
char *meta_device;
bool is_read;
bool show_header;
u64 ra_window;
};
struct block_bitmaps {
unsigned long *bits;
u64 size;
u64 count;
};
#define errf(fmt, args...) \
dprintf(STDERR_FILENO, fmt, ##args)
static int set_meta_bit(u64 start, u64 len, void *arg)
{
struct block_bitmaps *bm = arg;
int ret;
if (len != 1) {
ret = -EINVAL;
} else {
if (!test_bit(bm->bits, start)) {
set_bit(bm->bits, start);
bm->count++;
}
ret = 0;
}
return ret;
}
static int get_ref_bits(struct block_bitmaps *bm)
{
struct scoutfs_super_block *super = global_super;
int ret;
u64 i;
/*
* There are almost no small blocks we need to read, so we read
* them as the large blocks that contain them to simplify the
* block reading process.
*/
set_meta_bit(SCOUTFS_SUPER_BLKNO >> SCOUTFS_BLOCK_SM_LG_SHIFT, 1, bm);
for (i = 0; i < SCOUTFS_QUORUM_BLOCKS; i++)
set_meta_bit((SCOUTFS_QUORUM_BLKNO + i) >> SCOUTFS_BLOCK_SM_LG_SHIFT, 1, bm);
ret = alloc_root_meta_iter(&super->meta_alloc[0], set_meta_bit, bm) ?:
alloc_root_meta_iter(&super->meta_alloc[1], set_meta_bit, bm) ?:
alloc_root_meta_iter(&super->data_alloc, set_meta_bit, bm) ?:
alloc_list_meta_iter(&super->server_meta_avail[0], set_meta_bit, bm) ?:
alloc_list_meta_iter(&super->server_meta_avail[1], set_meta_bit, bm) ?:
alloc_list_meta_iter(&super->server_meta_freed[0], set_meta_bit, bm) ?:
alloc_list_meta_iter(&super->server_meta_freed[1], set_meta_bit, bm) ?:
btree_meta_iter(&super->fs_root, set_meta_bit, bm) ?:
btree_meta_iter(&super->logs_root, set_meta_bit, bm) ?:
btree_meta_iter(&super->log_merge, set_meta_bit, bm) ?:
btree_meta_iter(&super->mounted_clients, set_meta_bit, bm) ?:
btree_meta_iter(&super->srch_root, set_meta_bit, bm) ?:
log_trees_meta_iter(set_meta_bit, bm);
return ret;
}
/*
* Note that this temporarily modifies the header that it's given.
*/
static __le32 calc_crc(struct scoutfs_meta_image_block_header *bh, void *buf, size_t size)
{
__le32 saved = bh->crc;
u32 crc = ~0;
bh->crc = 0;
crc = crc32c(crc, bh, sizeof(*bh));
crc = crc32c(crc, buf, size);
bh->crc = saved;
return cpu_to_le32(crc);
}
static void printf_header(struct scoutfs_meta_image_header *hdr)
{
errf("magic: 0x%016llx\n"
"total_bytes: %llu\n"
"version: %u\n",
le64_to_cpu(hdr->magic),
le64_to_cpu(hdr->total_bytes),
le32_to_cpu(hdr->version));
}
typedef ssize_t (*rw_func_t)(int fd, void *buf, size_t count, off_t offset);
static inline ssize_t rw_read(int fd, void *buf, size_t count, off_t offset)
{
return read(fd, buf, count);
}
static inline ssize_t rw_pread(int fd, void *buf, size_t count, off_t offset)
{
return pread(fd, buf, count, offset);
}
static inline ssize_t rw_write(int fd, void *buf, size_t count, off_t offset)
{
return write(fd, buf, count);
}
static inline ssize_t rw_pwrite(int fd, void *buf, size_t count, off_t offset)
{
return pwrite(fd, buf, count, offset);
}
static int rw_full_count(rw_func_t func, u64 *tot, int fd, void *buf, size_t count, off_t offset)
{
ssize_t sret;
while (count > 0) {
sret = func(fd, buf, count, offset);
if (sret <= 0 || sret > count) {
if (sret < 0)
return -errno;
else
return -EIO;
}
if (tot)
*tot += sret;
buf += sret;
count -= sret;
}
return 0;
}
static int read_image(struct image_args *args, int fd, struct block_bitmaps *bm)
{
struct scoutfs_meta_image_block_header bh;
struct scoutfs_meta_image_header hdr;
u64 opening;
void *buf;
off_t off;
u64 bit;
u64 ra;
int ret;
buf = malloc(SCOUTFS_BLOCK_LG_SIZE);
if (!buf) {
ret = -ENOMEM;
goto out;
}
hdr.magic = cpu_to_le64(SCOUTFS_META_IMAGE_HEADER_MAGIC);
hdr.total_bytes = cpu_to_le64(sizeof(hdr) +
(bm->count * (SCOUTFS_BLOCK_LG_SIZE + sizeof(bh))));
hdr.version = cpu_to_le32(1);
if (args->show_header) {
printf_header(&hdr);
ret = 0;
goto out;
}
ret = rw_full_count(rw_write, NULL, STDOUT_FILENO, &hdr, sizeof(hdr), 0);
if (ret < 0)
goto out;
opening = args->ra_window;
ra = 0;
for (bit = 0; (bit = find_next_set_bit(bm->bits, bit, bm->size)) < bm->size; bit++) {
/* readahead to open the full window, then a block at a time */
do {
ra = find_next_set_bit(bm->bits, ra, bm->size);
if (ra < bm->size) {
off = ra << SCOUTFS_BLOCK_LG_SHIFT;
posix_fadvise(fd, off, SCOUTFS_BLOCK_LG_SIZE, POSIX_FADV_WILLNEED);
ra++;
if (opening)
opening -= min(opening, SCOUTFS_BLOCK_LG_SIZE);
}
} while (opening > 0);
off = bit << SCOUTFS_BLOCK_LG_SHIFT;
ret = rw_full_count(rw_pread, NULL, fd, buf, SCOUTFS_BLOCK_LG_SIZE, off);
if (ret < 0)
goto out;
/*
* Might as well try to drop the pages we've used to
* reduce memory pressure on our read-ahead pages that
* are waiting.
*/
posix_fadvise(fd, off, SCOUTFS_BLOCK_LG_SIZE, POSIX_FADV_DONTNEED);
bh.magic = cpu_to_le64(SCOUTFS_META_IMAGE_BLOCK_HEADER_MAGIC);
bh.offset = cpu_to_le64(off);
bh.size = cpu_to_le32(SCOUTFS_BLOCK_LG_SIZE);
bh.crc = calc_crc(&bh, buf, SCOUTFS_BLOCK_LG_SIZE);
ret = rw_full_count(rw_write, NULL, STDOUT_FILENO, &bh, sizeof(bh), 0) ?:
rw_full_count(rw_write, NULL, STDOUT_FILENO, buf, SCOUTFS_BLOCK_LG_SIZE, 0);
if (ret < 0)
goto out;
}
out:
free(buf);
return ret;
}
static int invalid_header(struct scoutfs_meta_image_header *hdr)
{
if (le64_to_cpu(hdr->magic) != SCOUTFS_META_IMAGE_HEADER_MAGIC) {
errf("bad image header magic 0x%016llx (!= expected %016llx)\n",
le64_to_cpu(hdr->magic), SCOUTFS_META_IMAGE_HEADER_MAGIC);
} else if (le32_to_cpu(hdr->version) != 1) {
errf("unknown image header version %u\n", le32_to_cpu(hdr->version));
} else {
return 0;
}
return -EIO;
}
/*
* Doesn't catch offset+size overflowing, presumes pwrite() will return
* an error.
*/
static int invalid_block_header(struct scoutfs_meta_image_block_header *bh)
{
if (le64_to_cpu(bh->magic) != SCOUTFS_META_IMAGE_BLOCK_HEADER_MAGIC) {
errf("bad block header magic 0x%016llx (!= expected %016llx)\n",
le64_to_cpu(bh->magic), SCOUTFS_META_IMAGE_BLOCK_HEADER_MAGIC);
} else if (le32_to_cpu(bh->size) == 0) {
errf("invalid block header size %u\n", le32_to_cpu(bh->size));
} else if (le32_to_cpu(bh->size) > SIZE_MAX) {
errf("block header size %u too large for size_t (> %zu)\n",
le32_to_cpu(bh->size), (size_t)SIZE_MAX);
} else if (le64_to_cpu(bh->offset) > OFF_MAX) {
errf("block header offset %llu too large for off_t (> %llu)\n",
le64_to_cpu(bh->offset), (u64)OFF_MAX);
} else {
return 0;
}
return -EIO;
}
static int write_image(struct image_args *args, int fd, struct block_bitmaps *bm)
{
struct scoutfs_meta_image_block_header bh;
struct scoutfs_meta_image_header hdr;
size_t writeback_batch = (2 * 1024 * 1024);
size_t buf_size;
size_t dirty;
size_t size;
off_t first;
off_t last;
off_t off;
__le32 calc;
void *buf;
u64 tot;
int ret;
tot = 0;
ret = rw_full_count(rw_read, &tot, STDIN_FILENO, &hdr, sizeof(hdr), 0);
if (ret < 0)
goto out;
if (args->show_header) {
printf_header(&hdr);
ret = 0;
goto out;
}
ret = invalid_header(&hdr);
if (ret < 0)
goto out;
dirty = 0;
first = OFF_MAX;
last = 0;
buf = NULL;
buf_size = 0;
while (tot < le64_to_cpu(hdr.total_bytes)) {
ret = rw_full_count(rw_read, &tot, STDIN_FILENO, &bh, sizeof(bh), 0);
if (ret < 0)
goto out;
ret = invalid_block_header(&bh);
if (ret < 0)
goto out;
size = le32_to_cpu(bh.size);
if (buf_size < size) {
void *nbuf = realloc(buf, size);
if (!nbuf) {
free(buf);
ret = -ENOMEM;
goto out;
}
buf = nbuf;
buf_size = size;
}
ret = rw_full_count(rw_read, &tot, STDIN_FILENO, buf, size, 0);
if (ret < 0)
goto out;
calc = calc_crc(&bh, buf, size);
if (calc != bh.crc) {
errf("crc err");
ret = -EIO;
goto out;
}
off = le64_to_cpu(bh.offset);
ret = rw_full_count(rw_pwrite, NULL, fd, buf, size, off);
if (ret < 0)
goto out;
dirty += size;
first = min(first, off);
last = max(last, off);
if (dirty >= writeback_batch) {
posix_fadvise(fd, first, (last - first) + size, POSIX_FADV_DONTNEED);
dirty = 0;
first = OFF_MAX;
last = 0;
}
}
ret = fsync(fd);
if (ret < 0) {
ret = -errno;
goto out;
}
out:
return ret;
}
static int do_image(struct image_args *args)
{
struct block_bitmaps bm = { .bits = NULL };
int meta_fd = -1;
u64 dev_size;
int flags;
int ret;
flags = args->is_read ? O_RDONLY : O_RDWR;
meta_fd = open(args->meta_device, flags);
if (meta_fd < 0) {
ret = -errno;
errf("failed to open meta device '%s': %s (%d)\n",
args->meta_device, strerror(errno), errno);
goto out;
}
if (args->is_read) {
ret = flush_device(meta_fd);
if (ret < 0)
goto out;
ret = get_device_size(args->meta_device, meta_fd, &dev_size);
if (ret < 0)
goto out;
bm.size = DIV_ROUND_UP(dev_size, SCOUTFS_BLOCK_LG_SIZE);
bm.bits = calloc(1, round_up(bm.size, BITS_PER_LONG) / 8);
if (!bm.bits) {
ret = -ENOMEM;
goto out;
}
ret = block_setup(meta_fd, 128 * 1024 * 1024, 32 * 1024 * 1024) ?:
check_supers(-1) ?:
get_ref_bits(&bm) ?:
read_image(args, meta_fd, &bm);
block_shutdown();
} else {
ret = write_image(args, meta_fd, &bm);
}
out:
free(bm.bits);
if (meta_fd >= 0)
close(meta_fd);
return ret;
}
static int parse_opt(int key, char *arg, struct argp_state *state)
{
struct image_args *args = state->input;
int ret;
switch (key) {
case 'h':
args->show_header = true;
break;
case 'r':
ret = parse_u64(arg, &args->ra_window);
if (ret)
argp_error(state, "readahead winddoe parse error");
break;
case ARGP_KEY_ARG:
if (!args->meta_device)
args->meta_device = strdup_or_error(state, arg);
else
argp_error(state, "more than two device arguments given");
break;
case ARGP_KEY_FINI:
if (!args->meta_device)
argp_error(state, "no metadata device argument given");
break;
default:
break;
}
return 0;
}
static struct argp_option options[] = {
{ "show-header", 'h', NULL, 0, "Print image header and exit without processing stream" },
{ "readahead", 'r', "NR", 0, "Maintain read-ahead window of NR blocks" },
{ NULL }
};
static struct argp read_image_argp = {
options,
parse_opt,
"META-DEVICE",
"Read metadata image stream from metadata device file"
};
#define DEFAULT_RA_WINDOW (512 * 1024)
static int read_image_cmd(int argc, char **argv)
{
struct image_args image_args = {
.is_read = true,
.ra_window = DEFAULT_RA_WINDOW,
};
int ret;
ret = argp_parse(&read_image_argp, argc, argv, 0, NULL, &image_args);
if (ret)
return ret;
return do_image(&image_args);
}
static struct argp write_image_argp = {
options,
parse_opt,
"META-DEVICE",
"Write metadata image stream to metadata device file"
};
static int write_image_cmd(int argc, char **argv)
{
struct image_args image_args = {
.is_read = false,
.ra_window = DEFAULT_RA_WINDOW,
};
int ret;
ret = argp_parse(&write_image_argp, argc, argv, 0, NULL, &image_args);
if (ret)
return ret;
return do_image(&image_args);
}
static void __attribute__((constructor)) image_ctor(void)
{
cmd_register_argp("read-metadata-image", &read_image_argp, GROUP_CORE, read_image_cmd);
cmd_register_argp("write-metadata-image", &write_image_argp, GROUP_CORE, write_image_cmd);
}

View File

@@ -1,15 +0,0 @@
#ifndef _SCOUTFS_UTILS_CHECK_ITER_H_
#define _SCOUTFS_UTILS_CHECK_ITER_H_
/*
* Callbacks can return an -errno that we'll never otherwise use to
* indicate that iteration can stop early; it is translated to 0 for
* success.
*/
#define ECHECK_ITER_DONE EL2HLT
static inline int xlate_iter_errno(int ret)
{
return ret == -ECHECK_ITER_DONE ? 0 : ret;
}
#endif
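For illustration, a hypothetical callback (not part of the removed code) matching the extent_cb_t signature from extent.h that records the first extent and then stops iteration cleanly:

	static int first_extent_cb(u64 start, u64 len, void *arg)
	{
		*(u64 *)arg = start;
		return -ECHECK_ITER_DONE;	/* xlate_iter_errno() turns this into 0 */
	}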

View File

@@ -1,98 +0,0 @@
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include "sparse.h"
#include "util.h"
#include "format.h"
#include "key.h"
#include "alloc.h"
#include "btree.h"
#include "debug.h"
#include "extent.h"
#include "iter.h"
#include "sns.h"
#include "log_trees.h"
#include "super.h"
struct iter_args {
extent_cb_t cb;
void *cb_arg;
};
static int lt_meta_iter(struct scoutfs_key *key, void *val, u16 val_len, void *cb_arg)
{
struct iter_args *ia = cb_arg;
struct scoutfs_log_trees *lt;
int ret;
if (val_len != sizeof(struct scoutfs_log_trees))
; /* XXX */
lt = val;
sns_push("log_trees", le64_to_cpu(lt->rid), le64_to_cpu(lt->nr));
debug("lt rid 0x%16llx nr %llu", le64_to_cpu(lt->rid), le64_to_cpu(lt->nr));
sns_push("meta_avail", 0, 0);
ret = alloc_list_meta_iter(&lt->meta_avail, ia->cb, ia->cb_arg);
sns_pop();
if (ret < 0)
goto out;
sns_push("meta_freed", 0, 0);
ret = alloc_list_meta_iter(&lt->meta_freed, ia->cb, ia->cb_arg);
sns_pop();
if (ret < 0)
goto out;
sns_push("item_root", 0, 0);
ret = btree_meta_iter(&lt->item_root, ia->cb, ia->cb_arg);
sns_pop();
if (ret < 0)
goto out;
if (lt->bloom_ref.blkno) {
sns_push("bloom_ref", 0, 0);
ret = ia->cb(le64_to_cpu(lt->bloom_ref.blkno), 1, ia->cb_arg);
sns_pop();
if (ret < 0) {
ret = xlate_iter_errno(ret);
goto out;
}
}
sns_push("data_avail", 0, 0);
ret = alloc_root_meta_iter(&lt->data_avail, ia->cb, ia->cb_arg);
sns_pop();
if (ret < 0)
goto out;
sns_push("data_freed", 0, 0);
ret = alloc_root_meta_iter(&lt->data_freed, ia->cb, ia->cb_arg);
sns_pop();
if (ret < 0)
goto out;
ret = 0;
out:
sns_pop();
return ret;
}
/*
* Call the caller's callback with the extents of all the metadata block references contained
* in log btrees. We walk the logs_root btree items and walk all the metadata structures
* they reference.
*/
int log_trees_meta_iter(extent_cb_t cb, void *cb_arg)
{
struct scoutfs_super_block *super = global_super;
struct iter_args ia = { .cb = cb, .cb_arg = cb_arg };
return btree_item_iter(&super->logs_root, lt_meta_iter, &ia);
}

View File

@@ -1,8 +0,0 @@
#ifndef _SCOUTFS_UTILS_CHECK_LOG_TREES_H_
#define _SCOUTFS_UTILS_CHECK_LOG_TREES_H_
#include "extent.h"
int log_trees_meta_iter(extent_cb_t cb, void *cb_arg);
#endif

View File

@@ -1,367 +0,0 @@
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <sys/mman.h>
#include <errno.h>
#include "sparse.h"
#include "util.h"
#include "format.h"
#include "bitmap.h"
#include "key.h"
#include "alloc.h"
#include "btree.h"
#include "debug.h"
#include "extent.h"
#include "sns.h"
#include "log_trees.h"
#include "meta.h"
#include "problem.h"
#include "super.h"
static struct meta_data {
struct extent_root meta_refed;
struct extent_root meta_free;
struct {
u64 ref_blocks;
u64 free_extents;
u64 free_blocks;
} stats;
} global_mdat;
bool valid_meta_blkno(u64 blkno)
{
u64 tot = le64_to_cpu(global_super->total_meta_blocks);
return blkno >= SCOUTFS_META_DEV_START_BLKNO && blkno < tot;
}
static bool valid_meta_extent(u64 start, u64 len)
{
u64 tot = le64_to_cpu(global_super->total_meta_blocks);
bool valid;
valid = len > 0 &&
start >= SCOUTFS_META_DEV_START_BLKNO &&
start < tot &&
len <= tot &&
((start + len) <= tot) &&
((start + len) > start);
debug("start %llu len %llu valid %u", start, len, !!valid);
if (!valid)
problem(PB_META_EXTENT_INVALID, "start %llu len %llu", start, len);
return valid;
}
/*
* Track references to individual metadata blocks. This uses the extent
* callback type but is only ever called for single block references.
* Any reference to a block that has already been referenced is
* considered invalid and is ignored. Later repair will resolve
* duplicate references.
*/
static int insert_meta_ref(u64 start, u64 len, void *arg)
{
struct meta_data *mdat = &global_mdat;
struct extent_root *root = arg;
int ret = 0;
/* this is tracking single metadata block references */
if (len != 1) {
ret = -EINVAL;
goto out;
}
if (valid_meta_blkno(start)) {
ret = extent_insert_new(root, start, len);
if (ret == 0)
mdat->stats.ref_blocks++;
else if (ret == -EEXIST)
problem(PB_META_REF_OVERLAPS_EXISTING, "blkno %llu", start);
}
out:
return ret;
}
static int insert_meta_free(u64 start, u64 len, void *arg)
{
struct meta_data *mdat = &global_mdat;
struct extent_root *root = arg;
int ret = 0;
if (valid_meta_extent(start, len)) {
ret = extent_insert_new(root, start, len);
if (ret == 0) {
mdat->stats.free_extents++;
mdat->stats.free_blocks++;
} else if (ret == -EEXIST) {
problem(PB_META_FREE_OVERLAPS_EXISTING,
"start %llu llen %llu", start, len);
}
}
return ret;
}
/*
* Walk all metadata references in the system. This walk doesn't need
* to read metadata that doesn't contain any metadata references so it
* can skip the bulk of metadata blocks. This gives us the set of
* referenced metadata blocks which we can then use to repair metadata
* allocator structures.
*/
static int get_meta_refs(void)
{
struct meta_data *mdat = &global_mdat;
struct scoutfs_super_block *super = global_super;
int ret;
extent_root_init(&mdat->meta_refed);
/* XXX record reserved blocks around super as referenced */
sns_push("meta_alloc", 0, 0);
ret = alloc_root_meta_iter(&super->meta_alloc[0], insert_meta_ref, &mdat->meta_refed);
sns_pop();
if (ret < 0)
goto out;
sns_push("meta_alloc", 1, 0);
ret = alloc_root_meta_iter(&super->meta_alloc[1], insert_meta_ref, &mdat->meta_refed);
sns_pop();
if (ret < 0)
goto out;
sns_push("data_alloc", 1, 0);
ret = alloc_root_meta_iter(&super->data_alloc, insert_meta_ref, &mdat->meta_refed);
sns_pop();
if (ret < 0)
goto out;
sns_push("server_meta_avail", 0, 0);
ret = alloc_list_meta_iter(&super->server_meta_avail[0],
insert_meta_ref, &mdat->meta_refed);
sns_pop();
if (ret < 0)
goto out;
sns_push("server_meta_avail", 1, 0);
ret = alloc_list_meta_iter(&super->server_meta_avail[1],
insert_meta_ref, &mdat->meta_refed);
sns_pop();
if (ret < 0)
goto out;
sns_push("server_meta_freed", 0, 0);
ret = alloc_list_meta_iter(&super->server_meta_freed[0],
insert_meta_ref, &mdat->meta_refed);
sns_pop();
if (ret < 0)
goto out;
sns_push("server_meta_freed", 1, 0);
ret = alloc_list_meta_iter(&super->server_meta_freed[1],
insert_meta_ref, &mdat->meta_refed);
sns_pop();
if (ret < 0)
goto out;
sns_push("fs_root", 0, 0);
ret = btree_meta_iter(&super->fs_root, insert_meta_ref, &mdat->meta_refed);
sns_pop();
if (ret < 0)
goto out;
sns_push("logs_root", 0, 0);
ret = btree_meta_iter(&super->logs_root, insert_meta_ref, &mdat->meta_refed);
sns_pop();
if (ret < 0)
goto out;
sns_push("log_merge", 0, 0);
ret = btree_meta_iter(&super->log_merge, insert_meta_ref, &mdat->meta_refed);
sns_pop();
if (ret < 0)
goto out;
sns_push("mounted_clients", 0, 0);
ret = btree_meta_iter(&super->mounted_clients, insert_meta_ref, &mdat->meta_refed);
sns_pop();
if (ret < 0)
goto out;
sns_push("srch_root", 0, 0);
ret = btree_meta_iter(&super->srch_root, insert_meta_ref, &mdat->meta_refed);
sns_pop();
if (ret < 0)
goto out;
ret = log_trees_meta_iter(insert_meta_ref, &mdat->meta_refed);
if (ret < 0)
goto out;
debug("found %llu referenced metadata blocks", mdat->stats.ref_blocks);
ret = 0;
out:
return ret;
}
static int get_meta_free(void)
{
struct meta_data *mdat = &global_mdat;
struct scoutfs_super_block *super = global_super;
int ret;
extent_root_init(&mdat->meta_free);
sns_push("meta_alloc", 0, 0);
ret = alloc_root_extent_iter(&super->meta_alloc[0], insert_meta_free, &mdat->meta_free);
sns_pop();
if (ret < 0)
goto out;
sns_push("meta_alloc", 1, 0);
ret = alloc_root_extent_iter(&super->meta_alloc[1], insert_meta_free, &mdat->meta_free);
sns_pop();
if (ret < 0)
goto out;
sns_push("server_meta_avail", 0, 0);
ret = alloc_list_extent_iter(&super->server_meta_avail[0],
insert_meta_free, &mdat->meta_free);
sns_pop();
if (ret < 0)
goto out;
sns_push("server_meta_avail", 1, 0);
ret = alloc_list_extent_iter(&super->server_meta_avail[1],
insert_meta_free, &mdat->meta_free);
sns_pop();
if (ret < 0)
goto out;
sns_push("server_meta_freed", 0, 0);
ret = alloc_list_extent_iter(&super->server_meta_freed[0],
insert_meta_free, &mdat->meta_free);
sns_pop();
if (ret < 0)
goto out;
sns_push("server_meta_freed", 1, 0);
ret = alloc_list_extent_iter(&super->server_meta_freed[1],
insert_meta_free, &mdat->meta_free);
sns_pop();
if (ret < 0)
goto out;
debug("found %llu free metadata blocks in %llu extents",
mdat->stats.free_blocks, mdat->stats.free_extents);
ret = 0;
out:
return ret;
}
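Both walks above repeat the same push/iterate/pop/bail pattern. As a sketch of one way to collapse each call site to a single statement, assuming GCC statement expressions are acceptable here (META_ITER is an illustrative name, not part of the existing code):
#define META_ITER(name, idx, call)					\
	({								\
		int _ret;						\
		sns_push(name, idx, 0);					\
		_ret = (call);						\
		sns_pop();						\
		_ret;							\
	})

	/* hypothetical call site in get_meta_free() */
	ret = META_ITER("meta_alloc", 0,
			alloc_root_extent_iter(&super->meta_alloc[0],
					       insert_meta_free, &mdat->meta_free));
	if (ret < 0)
		goto out;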
/*
 * All the space between referenced blocks must be recorded in the free
 * extents.  The free extent walk didn't check whether those extents
 * overlapped with references, so we do that here.  Remember that
 * metadata block references were merged into extents as they were
 * recorded, so the refed extents aren't necessarily single blocks.
 */
static int compare_refs_and_free(void)
{
struct meta_data *mdat = &global_mdat;
struct extent_node *ref;
struct extent_node *free;
struct extent_node *next;
struct extent_node *prev;
u64 expect;
u64 start;
u64 end;
expect = 0;
ref = extent_first(&mdat->meta_refed);
free = extent_first(&mdat->meta_free);
while (ref || free) {
debug("exp %llu ref %llu.%llu free %llu.%llu",
expect, ref ? ref->start : 0, ref ? ref->len : 0,
free ? free->start : 0, free ? free->len : 0);
/* a referenced region is marked free; trim the overlap out of the free extents and retry from the same point */
if (ref && free && extents_overlap(ref->start, ref->len, free->start, free->len)) {
debug("ref extent %llu.%llu overlaps free %llu %llu",
ref->start, ref->len, free->start, free->len);
start = max(ref->start, free->start);
end = min(ref->start + ref->len, free->start + free->len);
prev = extent_prev(free);
extent_remove(&mdat->meta_free, start, end - start);
if (prev)
free = extent_next(prev);
else
free = extent_first(&mdat->meta_free);
continue;
}
/* see which extent starts earlier */
if (!free || (ref && ref->start <= free->start))
next = ref;
else
next = free;
/* untracked region before next extent */
if (expect < next->start) {
debug("missing free extent %llu.%llu", expect, next->start - expect);
expect = next->start;
continue;
}
/* didn't overlap, advance past next extent */
expect = next->start + next->len;
if (next == ref)
ref = extent_next(ref);
else
free = extent_next(free);
}
return 0;
}
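To make the walk concrete, a worked trace with hypothetical block numbers:
/*
 * Worked example (hypothetical numbers): refed = {10.4}, free = {12.6, 20.2}.
 *
 * 1) ref 10.4 overlaps free 12.6: the overlapping 12.2 is removed from
 *    the free extents, leaving free = {14.4, 20.2}, and the loop retries
 *    from the same point.
 * 2) ref 10.4 no longer overlaps anything; expect (0) < 10, so 0.10 is
 *    logged as a missing free extent and expect becomes 10.
 * 3) advancing past ref 10.4 and then free 14.4 brings expect to 18;
 *    expect (18) < 20 logs 18.2 as missing, and advancing past free
 *    20.2 ends the walk with expect = 22.
 */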
/*
* Check the metadata allocators by comparing the set of referenced
* blocks with the set of free blocks that are stored in free btree
* items and alloc list blocks.
*/
int check_meta_alloc(void)
{
int ret;
ret = get_meta_refs();
if (ret < 0)
goto out;
ret = get_meta_free();
if (ret < 0)
goto out;
ret = compare_refs_and_free();
if (ret < 0)
goto out;
ret = 0;
out:
return ret;
}


@@ -1,9 +0,0 @@
#ifndef _SCOUTFS_UTILS_CHECK_META_H_
#define _SCOUTFS_UTILS_CHECK_META_H_
bool valid_meta_blkno(u64 blkno);
int check_meta_alloc(void);
#endif


@@ -1,23 +0,0 @@
#include <string.h>
#include <stdbool.h>
#include "util.h"
#include "padding.h"
bool padding_is_zeros(const void *data, size_t sz)
{
static char zeros[32] = {0,};
const size_t batch = array_size(zeros);
while (sz >= batch) {
if (memcmp(data, zeros, batch))
return false;
data += batch;
sz -= batch;
}
if (sz > 0 && memcmp(data, zeros, sz))
return false;
return true;
}
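A minimal usage sketch, assuming a hypothetical on-disk struct with a trailing pad field (not a real scoutfs format struct):
struct example_disk {
	__le64 blkno;
	__u8 __pad[8];
};

static bool example_padding_ok(const struct example_disk *ex)
{
	/* reserved bytes must still be zeroed on disk */
	return padding_is_zeros(ex->__pad, sizeof(ex->__pad));
}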


@@ -1,6 +0,0 @@
#ifndef _SCOUTFS_UTILS_CHECK_PADDING_H_
#define _SCOUTFS_UTILS_CHECK_PADDING_H_
bool padding_is_zeros(const void *data, size_t sz);
#endif


@@ -1,44 +0,0 @@
#include <stdio.h>
#include <stdint.h>
#include "problem.h"
#define PROB_STR(pb) [pb] = #pb
char *prob_strs[] = {
PROB_STR(PB_META_EXTENT_INVALID),
PROB_STR(PB_META_REF_OVERLAPS_EXISTING),
PROB_STR(PB_META_FREE_OVERLAPS_EXISTING),
PROB_STR(PB_BTREE_BLOCK_BAD_LEVEL),
PROB_STR(PB_SB_HDR_CRC_INVALID),
PROB_STR(PB_SB_HDR_MAGIC_INVALID),
PROB_STR(PB_FS_IN_USE),
PROB_STR(PB_MOUNTED_CLIENTS_REF_BLKNO),
PROB_STR(PB_SB_BAD_FLAG),
PROB_STR(PB_SB_BAD_FMT_VERS),
PROB_STR(PB_QCONF_WRONG_VERSION),
PROB_STR(PB_QSLOT_BAD_FAM),
PROB_STR(PB_QSLOT_BAD_PORT),
PROB_STR(PB_QSLOT_NO_ADDR),
PROB_STR(PB_QSLOT_BAD_ADDR),
PROB_STR(PB_DATA_DEV_SB_INVALID),
};
static struct problem_data {
uint64_t counts[PB__NR];
uint64_t count;
} global_pdat;
void problem_record(prob_t pb)
{
struct problem_data *pdat = &global_pdat;
pdat->counts[pb]++;
pdat->count++;
}
uint64_t problems_count(void)
{
struct problem_data *pdat = &global_pdat;
return pdat->count;
}


@@ -1,38 +0,0 @@
#ifndef _SCOUTFS_UTILS_CHECK_PROBLEM_H_
#define _SCOUTFS_UTILS_CHECK_PROBLEM_H_
#include "debug.h"
#include "sns.h"
typedef enum {
PB_META_EXTENT_INVALID,
PB_META_REF_OVERLAPS_EXISTING,
PB_META_FREE_OVERLAPS_EXISTING,
PB_BTREE_BLOCK_BAD_LEVEL,
PB_SB_HDR_CRC_INVALID,
PB_SB_HDR_MAGIC_INVALID,
PB_FS_IN_USE,
PB_MOUNTED_CLIENTS_REF_BLKNO,
PB_SB_BAD_FLAG,
PB_SB_BAD_FMT_VERS,
PB_QCONF_WRONG_VERSION,
PB_QSLOT_BAD_FAM,
PB_QSLOT_BAD_PORT,
PB_QSLOT_NO_ADDR,
PB_QSLOT_BAD_ADDR,
PB_DATA_DEV_SB_INVALID,
PB__NR,
} prob_t;
extern char *prob_strs[];
#define problem(pb, fmt, ...) \
do { \
debug("problem found: "#pb": %s: "fmt, sns_str(), __VA_ARGS__); \
problem_record(pb); \
} while (0)
void problem_record(prob_t pb);
uint64_t problems_count(void);
#endif
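A sketch of a hypothetical call site: the macro prefixes the formatted message with the current sns location string and bumps both the per-problem and total counters:
static void example_check_extent(u64 start, u64 len)
{
	sns_push("example_extent", start, len);
	if (len == 0)
		problem(PB_META_EXTENT_INVALID, "start %llu len %llu", start, len);
	sns_pop();
}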


@@ -1,118 +0,0 @@
#include <stdlib.h>
#include <string.h>
#include "sns.h"
/*
 * This "str num stack" is used to describe our location in metadata at
 * any given time.
 *
 * As we descend into structures we push a string describing them,
 * perhaps with associated numbers.  Pushing and popping is very cheap
 * and only rarely do we format the stack into a string, as an arbitrary
 * example:
 *
 *   super.fs_root.btree_parent:1231.btree_leaf:3231
 */
#define SNS_MAX_DEPTH 1000
#define SNS_STR_SIZE (SNS_MAX_DEPTH * (SNS_MAX_STR_LEN + 1 + 16 + 1))
static struct sns_data {
unsigned int depth;
struct sns_entry {
char *str;
size_t len;
u64 a;
u64 b;
} ents[SNS_MAX_DEPTH];
char str[SNS_STR_SIZE];
} global_lsdat;
void _sns_push(char *str, size_t len, u64 a, u64 b)
{
struct sns_data *lsdat = &global_lsdat;
if (lsdat->depth < SNS_MAX_DEPTH) {
lsdat->ents[lsdat->depth++] = (struct sns_entry) {
.str = str,
.len = len,
.a = a,
.b = b,
};
}
}
void sns_pop(void)
{
struct sns_data *lsdat = &global_lsdat;
if (lsdat->depth > 0)
lsdat->depth--;
}
static char *append_str(char *pos, char *str, size_t len)
{
memcpy(pos, str, len);
return pos + len;
}
/*
 * This is not called for x = 0 so we don't need to emit an initial 0.
 * We could handle that case by using do {} while instead of while {}.
 */
static char *append_u64x(char *pos, u64 x)
{
static char hex[] = "0123456789abcdef";
char tmp[16];
int i = 0;
while (x) {
tmp[i++] = hex[x & 0xf];
x >>= 4;
}
/* nibbles were gathered least significant first; emit most significant first */
while (i > 0)
*pos++ = tmp[--i];
return pos;
}
static char *append_char(char *pos, char c)
{
*(pos++) = c;
return pos;
}
/*
* Return a pointer to a null terminated string that describes the
* current location stack. The string buffer is global.
*/
char *sns_str(void)
{
struct sns_data *lsdat = &global_lsdat;
struct sns_entry *ent;
char *pos;
int i;
pos = lsdat->str;
for (i = 0; i < lsdat->depth; i++) {
ent = &lsdat->ents[i];
if (i)
pos = append_char(pos, '.');
pos = append_str(pos, ent->str, ent->len);
if (ent->a) {
pos = append_char(pos, ':');
pos = append_u64x(pos, ent->a);
}
if (ent->b) {
pos = append_char(pos, ':');
pos = append_u64x(pos, ent->b);
}
}
*pos = '\0';
return lsdat->str;
}
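A usage sketch (hypothetical walker): a frame is pushed on the way into a structure and popped on the way out, and the stack is only formatted when an error path needs it:
static void example_walk(u64 blkno, u64 seq)
{
	sns_push("btree_parent", blkno, seq);
	/*
	 * ... descend; an error path here calling sns_str() might get
	 * something like "super.fs_root.btree_parent:1231:1" ...
	 */
	sns_pop();
}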


@@ -1,20 +0,0 @@
#ifndef _SCOUTFS_UTILS_CHECK_SNS_H_
#define _SCOUTFS_UTILS_CHECK_SNS_H_
#include <assert.h>
#include "sparse.h"
#define SNS_MAX_STR_LEN 20
#define sns_push(str, a, b) \
do { \
build_assert(sizeof(str) - 1 <= SNS_MAX_STR_LEN); \
_sns_push((str), sizeof(str) - 1, a, b); \
} while (0)
void _sns_push(char *str, size_t len, u64 a, u64 b);
void sns_pop(void);
char *sns_str(void);
#endif


@@ -1,252 +0,0 @@
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include "sparse.h"
#include "util.h"
#include "format.h"
#include "crc.h"
#include "block.h"
#include "super.h"
#include "problem.h"
/*
* After we check the super blocks we provide a global buffer to track
* the current super block. It is referenced to get static information
* about the system and is also modified and written as part of
* transactions.
*/
struct scoutfs_super_block *global_super;
/*
 * Check the super block crc.  We can't use global_super here because it
 * only holds the struct scoutfs_super_block, not the whole block that
 * the crc covers, so we reload a copy of the full block.
 */
int check_super_crc(void)
{
struct scoutfs_super_block *super = NULL;
struct scoutfs_block_header *hdr;
struct block *blk = NULL;
u32 crc;
int ret;
ret = block_get(&blk, SCOUTFS_SUPER_BLKNO, BF_SM | BF_DIRTY);
if (ret < 0) {
fprintf(stderr, "error reading super block\n");
return ret;
}
super = block_buf(blk);
crc = crc_block((struct scoutfs_block_header *)super, block_size(blk));
hdr = &global_super->hdr;
debug("superblock crc 0x%04x calculated 0x%04x " "%s", le32_to_cpu(hdr->crc), crc, le32_to_cpu(hdr->crc) == crc ? "(match)" : "(mismatch)");
if (crc != le32_to_cpu(hdr->crc))
problem(PB_SB_HDR_CRC_INVALID, "crc 0x%04x calculated 0x%04x", le32_to_cpu(hdr->crc), crc);
block_put(&blk);
return 0;
}
/*
* Crude check for the unlikely cases where the fs appears to still be mounted.
*/
int check_super_in_use(int meta_fd)
{
int ret = meta_super_in_use(meta_fd, global_super);
debug("meta_super_in_use ret %d", ret);
if (ret < 0)
problem(PB_FS_IN_USE, "File system appears in use. ret %d", ret);
debug("global_super->mounted_clients.ref.blkno 0x%08llx", global_super->mounted_clients.ref.blkno);
if (global_super->mounted_clients.ref.blkno != 0)
problem(PB_MOUNTED_CLIENTS_REF_BLKNO, "Mounted clients ref blkno 0x%08llx",
global_super->mounted_clients.ref.blkno);
return ret;
}
/*
 * Quick-glance data device superblock checks.
 *
 * Returns -EIO for crc failures and -EINVAL for everything else.
 *
 * The caller must have run check_supers() first so that global_super is
 * set up for us to cross-reference against.
 */
static int check_data_super(int data_fd)
{
struct scoutfs_super_block *super = NULL;
char *buf;
int ret = 0;
u32 crc;
ssize_t size = SCOUTFS_BLOCK_SM_SIZE;
off_t off = SCOUTFS_SUPER_BLKNO << SCOUTFS_BLOCK_SM_SHIFT;
buf = aligned_alloc(4096, size); /* XXX static alignment :/ */
if (!buf)
return -ENOMEM;
memset(buf, 0, size);
if (lseek(data_fd, off, SEEK_SET) != off) {
ret = -errno;
goto out;
}
if (read(data_fd, buf, size) < 0) {
ret = -errno;
goto out;
}
super = (struct scoutfs_super_block *)buf;
crc = crc_block((struct scoutfs_block_header *)buf, size);
debug("data fsid 0x%016llx", le64_to_cpu(super->hdr.fsid));
debug("data super magic 0x%04x", super->hdr.magic);
debug("data crc calc 0x%08x exp 0x%08x %s", crc, le32_to_cpu(super->hdr.crc),
crc == le32_to_cpu(super->hdr.crc) ? "(match)" : "(mismatch)");
debug("data flags %llu fmt_vers %llu", le64_to_cpu(super->flags), le64_to_cpu(super->fmt_vers));
if (crc != le32_to_cpu(super->hdr.crc))
/* tis but a scratch */
ret = -EIO;
if (le64_to_cpu(super->hdr.fsid) != le64_to_cpu(global_super->hdr.fsid))
/* mismatched data bdev? not good */
ret = -EINVAL;
if (le32_to_cpu(super->hdr.magic) != SCOUTFS_BLOCK_MAGIC_SUPER)
/* fsid matched but not a superblock? yikes */
ret = -EINVAL;
if (le64_to_cpu(super->flags) != 0) /* !SCOUTFS_FLAG_IS_META_BDEV */
ret = -EINVAL;
if ((le64_to_cpu(super->fmt_vers) < SCOUTFS_FORMAT_VERSION_MIN) ||
(le64_to_cpu(super->fmt_vers) > SCOUTFS_FORMAT_VERSION_MAX))
ret = -EINVAL;
if (ret != 0)
problem(PB_DATA_DEV_SB_INVALID, "data device is invalid or corrupt (%d)", ret);
out:
free(buf);
return ret;
}
/*
 * After checking the supers we save a copy in a global buffer that's
 * used by other modules to track the current super.  It can be modified
 * and written during commits.
 */
int check_supers(int data_fd)
{
struct scoutfs_super_block *super = NULL;
struct block *blk = NULL;
struct scoutfs_quorum_slot *slot = NULL;
struct in_addr in;
uint16_t family;
uint16_t port;
int ret;
sns_push("supers", 0, 0);
global_super = malloc(sizeof(struct scoutfs_super_block));
if (!global_super) {
fprintf(stderr, "error allocating super block buffer\n");
ret = -ENOMEM;
goto out;
}
ret = block_get(&blk, SCOUTFS_SUPER_BLKNO, BF_SM);
if (ret < 0) {
fprintf(stderr, "error reading super block\n");
goto out;
}
ret = block_hdr_valid(blk, SCOUTFS_SUPER_BLKNO, BF_SM, SCOUTFS_BLOCK_MAGIC_SUPER);
super = block_buf(blk);
if (ret < 0) {
if (ret == -EINVAL) {
/* an invalid magic means this isn't one of ours, that's really bad */
fprintf(stderr, "superblock invalid magic\n");
goto out;
} else if (ret == -EIO) {
/* just report/count a CRC error and keep checking */
problem(PB_SB_HDR_CRC_INVALID, "superblock header crc invalid, crc 0x%08x",
le32_to_cpu(super->hdr.crc));
}
}
memcpy(global_super, super, sizeof(struct scoutfs_super_block));
debug("Superblock flag: %llu", global_super->flags);
if (le64_to_cpu(global_super->flags) != SCOUTFS_FLAG_IS_META_BDEV)
problem(PB_SB_BAD_FLAG, "Bad flag: %llu expecting: 1 or 0", global_super->flags);
debug("Superblock fmt_vers: %llu", le64_to_cpu(global_super->fmt_vers));
if ((le64_to_cpu(global_super->fmt_vers) < SCOUTFS_FORMAT_VERSION_MIN) ||
(le64_to_cpu(global_super->fmt_vers) > SCOUTFS_FORMAT_VERSION_MAX))
problem(PB_SB_BAD_FMT_VERS, "Bad fmt_vers: %llu outside supported range (%d-%d)",
le64_to_cpu(global_super->fmt_vers), SCOUTFS_FORMAT_VERSION_MIN,
SCOUTFS_FORMAT_VERSION_MAX);
debug("Quorum Config Version: %llu", global_super->qconf.version);
if (le64_to_cpu(global_super->qconf.version) != 1)
problem(PB_QCONF_WRONG_VERSION, "Wrong Version: %llu (expected 1)", global_super->qconf.version);
for (int i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
slot = &global_super->qconf.slots[i];
family = le16_to_cpu(slot->addr.v4.family);
port = le16_to_cpu(slot->addr.v4.port);
in.s_addr = le32_to_cpu(slot->addr.v4.addr);
if (family == SCOUTFS_AF_NONE) {
debug("Quorum slot %u is empty", i);
continue;
}
debug("Quorum slot %u family: %u, port: %u, address: %s", i, family, port, inet_ntoa(in));
if (family != SCOUTFS_AF_IPV4)
problem(PB_QSLOT_BAD_FAM, "Quorum Slot %u has unsupported address family %u", i, family);
if (port == 0)
problem(PB_QSLOT_BAD_PORT, "Quorum Slot %u has a zero port", i);
if (!in.s_addr) {
problem(PB_QSLOT_NO_ADDR, "Quorum Slot %u has not been assigned an ipv4 address", i);
} else if (!(in.s_addr & 0xff000000)) {
problem(PB_QSLOT_BAD_ADDR, "Quorum Slot %u ipv4 address has a zero first octet", i);
} else if ((in.s_addr & 0xff) == 0xff) {
problem(PB_QSLOT_BAD_ADDR, "Quorum Slot %u ipv4 address ends in the broadcast octet 255", i);
}
}
debug("super magic 0x%04x", global_super->hdr.magic);
if (le32_to_cpu(global_super->hdr.magic) != SCOUTFS_BLOCK_MAGIC_SUPER)
problem(PB_SB_HDR_MAGIC_INVALID, "superblock magic invalid: 0x%04x is not 0x%04x",
global_super->hdr.magic, SCOUTFS_BLOCK_MAGIC_SUPER);
/* `scoutfs image` command doesn't open data_fd */
if (data_fd < 0)
ret = 0;
else
ret = check_data_super(data_fd);
out:
block_put(&blk);
sns_pop();
return ret;
}
void super_shutdown(void)
{
free(global_super);
}


@@ -1,12 +0,0 @@
#ifndef _SCOUTFS_UTILS_CHECK_SUPER_H_
#define _SCOUTFS_UTILS_CHECK_SUPER_H_
extern struct scoutfs_super_block *global_super;
int check_super_crc(void);
int check_supers(int data_fd);
int super_commit(void);
int check_super_in_use(int meta_fd);
void super_shutdown(void);
#endif
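A sketch of the ordering these comments imply (hypothetical caller): check_supers() must run first so that global_super is populated before the other checks cross-reference it:
static int example_check_all_supers(int meta_fd, int data_fd)
{
	int ret;

	ret = check_supers(data_fd);	/* populates global_super */
	if (ret < 0)
		return ret;

	ret = check_super_crc();
	if (ret < 0)
		return ret;

	return check_super_in_use(meta_fd);
}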

File diff suppressed because it is too large


@@ -1,125 +0,0 @@
#ifndef _SCOUTFS_PARALLEL_RESTORE_H_
#define _SCOUTFS_PARALLEL_RESTORE_H_
#include <errno.h>
struct scoutfs_parallel_restore_progress {
struct scoutfs_btree_root fs_items;
struct scoutfs_btree_root root_items;
struct scoutfs_srch_file sfl;
struct scoutfs_block_ref bloom_ref;
__le64 inode_count;
__le64 max_ino;
};
struct scoutfs_parallel_restore_slice {
__le64 fsid;
__le64 meta_start;
__le64 meta_len;
};
struct scoutfs_parallel_restore_entry {
u64 dir_ino;
u64 pos;
u64 ino;
mode_t mode;
char *name;
unsigned int name_len;
};
struct scoutfs_parallel_restore_xattr {
u64 ino;
u64 pos;
char *name;
unsigned int name_len;
void *value;
unsigned int value_len;
};
struct scoutfs_parallel_restore_inode {
/* all inodes */
u64 ino;
u64 meta_seq;
u64 data_seq;
u64 nr_xattrs;
u32 uid;
u32 gid;
u32 mode;
u32 rdev;
u32 flags;
u8 pad[4];
struct timespec atime;
struct timespec ctime;
struct timespec mtime;
struct timespec crtime;
u64 proj;
/* regular files */
u64 data_version;
u64 size;
bool offline;
/* only used for directories */
u64 nr_subdirs;
u64 total_entry_name_bytes;
/* only used for symlnks */
char *target;
unsigned int target_len; /* not including null terminator */
};
struct scoutfs_parallel_restore_quota_rule {
u64 limit;
u8 prio;
u8 op;
u8 rule_flags;
struct quota_rule_name {
u64 val;
u8 source;
u8 flags;
} names [3];
char *value;
unsigned int value_len;
};
typedef __typeof__(EINVAL) spr_err_t;
struct scoutfs_parallel_restore_writer;
spr_err_t scoutfs_parallel_restore_create_writer(struct scoutfs_parallel_restore_writer **wrip);
void scoutfs_parallel_restore_destroy_writer(struct scoutfs_parallel_restore_writer **wrip);
spr_err_t scoutfs_parallel_restore_init_slices(struct scoutfs_parallel_restore_writer *wri,
struct scoutfs_parallel_restore_slice *slices,
int nr);
spr_err_t scoutfs_parallel_restore_add_slice(struct scoutfs_parallel_restore_writer *wri,
struct scoutfs_parallel_restore_slice *slice);
spr_err_t scoutfs_parallel_restore_get_slice(struct scoutfs_parallel_restore_writer *wri,
struct scoutfs_parallel_restore_slice *slice);
spr_err_t scoutfs_parallel_restore_add_inode(struct scoutfs_parallel_restore_writer *wri,
struct scoutfs_parallel_restore_inode *inode);
spr_err_t scoutfs_parallel_restore_add_entry(struct scoutfs_parallel_restore_writer *wri,
struct scoutfs_parallel_restore_entry *entry);
spr_err_t scoutfs_parallel_restore_add_xattr(struct scoutfs_parallel_restore_writer *wri,
struct scoutfs_parallel_restore_xattr *xattr);
spr_err_t scoutfs_parallel_restore_get_progress(struct scoutfs_parallel_restore_writer *wri,
struct scoutfs_parallel_restore_progress *prog);
spr_err_t scoutfs_parallel_restore_add_progress(struct scoutfs_parallel_restore_writer *wri,
struct scoutfs_parallel_restore_progress *prog);
spr_err_t scoutfs_parallel_restore_add_quota_rule(struct scoutfs_parallel_restore_writer *wri,
struct scoutfs_parallel_restore_quota_rule *rule);
spr_err_t scoutfs_parallel_restore_write_buf(struct scoutfs_parallel_restore_writer *wri,
void *buf, size_t len, off_t *off_ret,
size_t *count_ret);
spr_err_t scoutfs_parallel_restore_import_super(struct scoutfs_parallel_restore_writer *wri,
struct scoutfs_super_block *super, int fd);
spr_err_t scoutfs_parallel_restore_export_super(struct scoutfs_parallel_restore_writer *wri,
struct scoutfs_super_block *super);
#endif
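A condensed sketch of how these calls might fit together for a single writer. The slice setup order and the single write_buf call are assumptions, not a documented contract; a real caller would likely loop on write_buf until count_ret is 0 and write each returned buffer at off_ret:
static spr_err_t example_restore(struct scoutfs_super_block *super, int meta_fd,
				 struct scoutfs_parallel_restore_inode *inode,
				 void *buf, size_t len)
{
	struct scoutfs_parallel_restore_writer *wri;
	struct scoutfs_parallel_restore_slice slice;
	size_t count;
	off_t off;
	spr_err_t err;

	err = scoutfs_parallel_restore_create_writer(&wri);
	if (err)
		return err;

	/* gcc's a ?: b chains to the first nonzero error */
	err = scoutfs_parallel_restore_import_super(wri, super, meta_fd) ?:
	      scoutfs_parallel_restore_init_slices(wri, &slice, 1) ?:
	      scoutfs_parallel_restore_add_inode(wri, inode) ?:
	      scoutfs_parallel_restore_write_buf(wri, buf, len, &off, &count);

	scoutfs_parallel_restore_destroy_writer(&wri);
	return err;
}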


@@ -7,7 +7,6 @@
 #include <errno.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <wordexp.h>
 #include "util.h"
 #include "format.h"
@@ -18,26 +17,15 @@
 static int open_path(char *path, int flags)
 {
-	wordexp_t exp_result;
 	int ret;
-	ret = wordexp(path, &exp_result, WRDE_NOCMD | WRDE_SHOWERR | WRDE_UNDEF);
-	if (ret) {
-		fprintf(stderr, "wordexp() failure for \"%s\": %d\n", path, ret);
-		ret = -EINVAL;
-		goto out;
-	}
-	ret = open(exp_result.we_wordv[0], flags);
+	ret = open(path, flags);
 	if (ret < 0) {
 		ret = -errno;
 		fprintf(stderr, "failed to open '%s': %s (%d)\n",
 			path, strerror(errno), errno);
 	}
-out:
-	wordfree(&exp_result);
 	return ret;
 }