Merge pull request #254 from versity/zab/shrink_cleanup

Zab/shrink cleanup
Zach Brown
2025-10-30 08:56:33 -07:00
committed by GitHub
16 changed files with 509 additions and 393 deletions

View File

@@ -425,3 +425,48 @@ endif
ifneq (,$(shell grep 'int ..remap_pages..struct vm_area_struct' include/linux/mm.h))
ccflags-y += -DKC_MM_REMAP_PAGES
endif
#
# v3.19-4742-g503c358cf192
#
# list_lru_shrink_count() and list_lru_shrink_walk() introduced
#
ifneq (,$(shell grep 'list_lru_shrink_count.*struct list_lru' include/linux/list_lru.h))
ccflags-y += -DKC_LIST_LRU_SHRINK_COUNT_WALK
endif
#
# v3.19-4757-g3f97b163207c
#
# lru_list_walk_cb lru arg added
#
ifneq (,$(shell grep 'struct list_head \*item, spinlock_t \*lock, void \*cb_arg' include/linux/list_lru.h))
ccflags-y += -DKC_LIST_LRU_WALK_CB_ITEM_LOCK
endif
#
# v6.7-rc4-153-g0a97c01cd20b
#
# list_lru_{add,del} -> list_lru_{add,del}_obj
#
ifneq (,$(shell grep '^bool list_lru_add_obj' include/linux/list_lru.h))
ccflags-y += -DKC_LIST_LRU_ADD_OBJ
endif
#
# v6.12-rc6-227-gda0c02516c50
#
# lru_list_walk_cb lock arg removed
#
ifneq (,$(shell grep 'struct list_lru_one \*list, spinlock_t \*lock, void \*cb_arg' include/linux/list_lru.h))
ccflags-y += -DKC_LIST_LRU_WALK_CB_LIST_LOCK
endif
#
# v5.1-rc4-273-ge9b98e162aa5
#
# introduce stack trace helpers
#
ifneq (,$(shell grep '^unsigned int stack_trace_save' include/linux/stacktrace.h))
ccflags-y += -DKC_STACK_TRACE_SAVE
endif
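Each block above only adds a -DKC_* define; the shims that consume them live in kernelcompat.h/.c later in this diff. A hedged sketch of the pattern for the list_lru_{add,del}_obj rename follows (the wrapper function name is purely illustrative):

/*
 * Sketch only: the grep sets -DKC_LIST_LRU_ADD_OBJ when the renamed API
 * exists, and a fallback define papers over the difference so callers
 * always use the new names.  The real shim is in kernelcompat.h below.
 */
#include <linux/list_lru.h>

#ifndef KC_LIST_LRU_ADD_OBJ
#define list_lru_add_obj	list_lru_add
#define list_lru_del_obj	list_lru_del
#endif

/* illustrative caller: the same call compiles on old and new kernels */
static void cache_example_insert(struct list_lru *lru, struct list_head *entry)
{
	list_lru_add_obj(lru, entry);
}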

View File

@@ -22,6 +22,8 @@
#include <linux/rhashtable.h>
#include <linux/random.h>
#include <linux/sched/mm.h>
#include <linux/list_lru.h>
#include <linux/stacktrace.h>
#include "format.h"
#include "super.h"
@@ -38,26 +40,12 @@
* than the page size. Callers can have their own contexts for tracking
* dirty blocks that are written together. We pin dirty blocks in
* memory and only checksum them all as they're all written.
*
* Memory reclaim is driven by maintaining two very coarse groups of
* blocks. As we access blocks we mark them with an increasing counter
* to discourage them from being reclaimed. We then define a threshold
* at the current counter minus half the population. Recent blocks have
* a counter greater than the threshold, and all other blocks with
* counters less than it are considered older and are candidates for
* reclaim. This results in access updates rarely modifying an atomic
* counter as blocks need to be moved into the recent group, and shrink
* can randomly scan blocks looking for the half of the population that
* will be in the old group. It's reasonably effective, but is
* particularly efficient and avoids contention between concurrent
* accesses and shrinking.
*/
struct block_info {
struct super_block *sb;
atomic_t total_inserted;
atomic64_t access_counter;
struct rhashtable ht;
struct list_lru lru;
wait_queue_head_t waitq;
KC_DEFINE_SHRINKER(shrinker);
struct work_struct free_work;
@@ -76,28 +64,15 @@ enum block_status_bits {
BLOCK_BIT_PAGE_ALLOC, /* page (possibly high order) allocation */
BLOCK_BIT_VIRT, /* mapped virt allocation */
BLOCK_BIT_CRC_VALID, /* crc has been verified */
BLOCK_BIT_ACCESSED, /* seen by lookup since last lru add/walk */
};
/*
* We want to tie atomic changes in refcounts to whether or not the
* block is still visible in the hash table, so we store the hash
* table's reference up at a known high bit. We could naturally set the
* inserted bit through excessive refcount increments. We don't do
* anything about that but at least warn if we get close.
*
* We're avoiding the high byte for no real good reason, just out of a
* historical fear of implementations that don't provide the full
* precision.
*/
#define BLOCK_REF_INSERTED (1U << 23)
#define BLOCK_REF_FULL (BLOCK_REF_INSERTED >> 1)
struct block_private {
struct scoutfs_block bl;
struct super_block *sb;
atomic_t refcount;
u64 accessed;
struct rhash_head ht_head;
struct list_head lru_head;
struct list_head dirty_entry;
struct llist_node free_node;
unsigned long bits;
@@ -106,13 +81,15 @@ struct block_private {
struct page *page;
void *virt;
};
unsigned int stack_len;
unsigned long stack[10];
};
#define TRACE_BLOCK(which, bp) \
do { \
__typeof__(bp) _bp = (bp); \
trace_scoutfs_block_##which(_bp->sb, _bp, _bp->bl.blkno, atomic_read(&_bp->refcount), \
atomic_read(&_bp->io_count), _bp->bits, _bp->accessed); \
atomic_read(&_bp->io_count), _bp->bits); \
} while (0)
#define BLOCK_PRIVATE(_bl) \
@@ -126,7 +103,17 @@ static __le32 block_calc_crc(struct scoutfs_block_header *hdr, u32 size)
return cpu_to_le32(calc);
}
static struct block_private *block_alloc(struct super_block *sb, u64 blkno)
static noinline void save_block_stack(struct block_private *bp)
{
bp->stack_len = stack_trace_save(bp->stack, ARRAY_SIZE(bp->stack), 2);
}
static void print_block_stack(struct block_private *bp)
{
stack_trace_print(bp->stack, bp->stack_len, 1);
}
static noinline struct block_private *block_alloc(struct super_block *sb, u64 blkno)
{
struct block_private *bp;
unsigned int nofs_flags;
@@ -176,11 +163,13 @@ static struct block_private *block_alloc(struct super_block *sb, u64 blkno)
bp->bl.blkno = blkno;
bp->sb = sb;
atomic_set(&bp->refcount, 1);
INIT_LIST_HEAD(&bp->lru_head);
INIT_LIST_HEAD(&bp->dirty_entry);
set_bit(BLOCK_BIT_NEW, &bp->bits);
atomic_set(&bp->io_count, 0);
TRACE_BLOCK(allocate, bp);
save_block_stack(bp);
out:
if (!bp)
@@ -233,32 +222,85 @@ static void block_free_work(struct work_struct *work)
}
/*
* Get a reference to a block while holding an existing reference.
*/
static void block_get(struct block_private *bp)
{
WARN_ON_ONCE((atomic_read(&bp->refcount) & ~BLOCK_REF_INSERTED) <= 0);
atomic_inc(&bp->refcount);
}
/*
* Users of blocks hold a refcount. If putting a refcount drops to zero
* then the block is freed.
*
* Acquiring new references and claiming the exclusive right to tear
* down a block is built around this LIVE_REFCOUNT_BASE refcount value.
* As blocks are initially cached they have the live base added to their
* refcount. Lookups will only increment the refcount and return blocks
* for reference holders while the refcount is >= the base.
*
* To remove a block from the cache and eventually free it, either by
* the lru walk in the shrinker, or by reference holders, the live base
* is removed and turned into a normal refcount increment that will be
* put by the caller. This can only be done once for a block, and once
* it's done lookup will not return any more references.
*/
#define LIVE_REFCOUNT_BASE (INT_MAX ^ (INT_MAX >> 1))
/*
* Inc the refcount while holding an incremented refcount. We can't
* have so many individual reference holders that they pass the live
* base.
*/
static void block_get(struct block_private *bp)
{
int now = atomic_inc_return(&bp->refcount);
BUG_ON(now <= 1);
BUG_ON(now == LIVE_REFCOUNT_BASE);
}
/*
* Get a reference to a block as long as it's been inserted in the hash
* table and hasn't been removed.
*/
static struct block_private *block_get_if_inserted(struct block_private *bp)
{
int cnt;
do {
cnt = atomic_read(&bp->refcount);
WARN_ON_ONCE(cnt & BLOCK_REF_FULL);
if (!(cnt & BLOCK_REF_INSERTED))
return NULL;
} while (atomic_cmpxchg(&bp->refcount, cnt, cnt + 1) != cnt);
return bp;
}
/*
* if (*v >= u) {
* *v += a;
* return true;
* }
*/
static bool atomic_add_unless_less(atomic_t *v, int a, int u)
{
int c;
do {
c = atomic_read(v);
if (c < u)
return false;
} while (atomic_cmpxchg(v, c, c + a) != c);
return true;
}
static bool block_get_if_live(struct block_private *bp)
{
return atomic_add_unless_less(&bp->refcount, 1, LIVE_REFCOUNT_BASE);
}
/*
* If the refcount still has the live base, subtract it and increment
* the caller's refcount that they'll put.
*/
static bool block_get_remove_live(struct block_private *bp)
{
return atomic_add_unless_less(&bp->refcount, (1 - LIVE_REFCOUNT_BASE), LIVE_REFCOUNT_BASE);
}
/*
* Only get the live base refcount if it is the only refcount remaining.
* This means that there are no active refcount holders and the block
* can't be dirty or under IO, which both hold references.
*/
static bool block_get_remove_live_only(struct block_private *bp)
{
int c;
do {
c = atomic_read(&bp->refcount);
if (c != LIVE_REFCOUNT_BASE)
return false;
} while (atomic_cmpxchg(&bp->refcount, c, c - LIVE_REFCOUNT_BASE + 1) != c);
return true;
}
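The live-base arithmetic is easy to sanity check outside the kernel. Below is a hedged userspace sketch (C11 atomics rather than the kernel's atomic_t, and not scoutfs code) of the lifecycle the comments above describe: insert adds the base, lookups only succeed while the base is present, and exactly one path gets to trade the base for a normal reference that it then puts.

/* Userspace sketch of the live-base refcount arithmetic (not scoutfs code). */
#include <limits.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define LIVE_REFCOUNT_BASE (INT_MAX ^ (INT_MAX >> 1))	/* 0x40000000 */

/* add a to *v only while *v >= u, mirroring atomic_add_unless_less() */
static bool add_unless_less(atomic_int *v, int a, int u)
{
	int c = atomic_load(v);

	do {
		if (c < u)
			return false;
	} while (!atomic_compare_exchange_weak(v, &c, c + a));
	return true;
}

int main(void)
{
	atomic_int refcount = 1;			/* block_alloc()'s reference */

	atomic_fetch_add(&refcount, LIVE_REFCOUNT_BASE);	/* block_insert() */
	printf("lookup may get ref:  %d\n",
	       add_unless_less(&refcount, 1, LIVE_REFCOUNT_BASE));
	/* tear down: trade the live base for one normal ref, only once */
	printf("first remove wins:   %d\n",
	       add_unless_less(&refcount, 1 - LIVE_REFCOUNT_BASE, LIVE_REFCOUNT_BASE));
	printf("second remove fails: %d\n",
	       add_unless_less(&refcount, 1 - LIVE_REFCOUNT_BASE, LIVE_REFCOUNT_BASE));
	return 0;
}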
/*
@@ -290,104 +332,73 @@ static const struct rhashtable_params block_ht_params = {
};
/*
* Insert a new block into the hash table. Once it is inserted in the
* hash table readers can start getting references. The caller may have
* multiple refs but the block can't already be inserted.
*/
/*
* Insert the block into the cache so that it's visible for lookups.
* The caller can hold references (including for a dirty block).
*
* We make sure the base is added and the block is in the lru once it's
* in the hash. If hash table insertion fails it'll be briefly visible
* in the lru, but won't be isolated/evicted because we hold an
* incremented refcount in addition to the live base.
*/
static int block_insert(struct super_block *sb, struct block_private *bp)
{
DECLARE_BLOCK_INFO(sb, binf);
int ret;
WARN_ON_ONCE(atomic_read(&bp->refcount) & BLOCK_REF_INSERTED);
BUG_ON(atomic_read(&bp->refcount) >= LIVE_REFCOUNT_BASE);
atomic_add(LIVE_REFCOUNT_BASE, &bp->refcount);
smp_mb__after_atomic(); /* make sure live base is visible to list_lru walk */
list_lru_add_obj(&binf->lru, &bp->lru_head);
retry:
atomic_add(BLOCK_REF_INSERTED, &bp->refcount);
ret = rhashtable_lookup_insert_fast(&binf->ht, &bp->ht_head, block_ht_params);
if (ret < 0) {
atomic_sub(BLOCK_REF_INSERTED, &bp->refcount);
if (ret == -EBUSY) {
/* wait for pending rebalance to finish */
synchronize_rcu();
goto retry;
} else {
atomic_sub(LIVE_REFCOUNT_BASE, &bp->refcount);
BUG_ON(atomic_read(&bp->refcount) >= LIVE_REFCOUNT_BASE);
list_lru_del_obj(&binf->lru, &bp->lru_head);
}
} else {
atomic_inc(&binf->total_inserted);
TRACE_BLOCK(insert, bp);
}
return ret;
}
static u64 accessed_recently(struct block_info *binf)
{
return atomic64_read(&binf->access_counter) - (atomic_read(&binf->total_inserted) >> 1);
}
/*
* Make sure that a block that is being accessed is less likely to be
* reclaimed if it is seen by the shrinker. If the block hasn't been
* accessed recently we update its accessed value.
*/
static void block_accessed(struct super_block *sb, struct block_private *bp)
{
DECLARE_BLOCK_INFO(sb, binf);
if (bp->accessed == 0 || bp->accessed < accessed_recently(binf)) {
scoutfs_inc_counter(sb, block_cache_access_update);
bp->accessed = atomic64_inc_return(&binf->access_counter);
}
}
/*
* Indicate to the lru walker that this block has been accessed since it
* was added or last walked.
*/
static void block_accessed(struct super_block *sb, struct block_private *bp)
{
if (!test_and_set_bit(BLOCK_BIT_ACCESSED, &bp->bits))
scoutfs_inc_counter(sb, block_cache_access_update);
}
/*
* The caller wants to remove the block from the hash table and has an
* idea what the refcount should be. If the refcount does still
* indicate that the block is hashed, and we're able to clear that bit,
* then we can remove it from the hash table.
*
* The caller makes sure that it's safe to be referencing this block,
* either with their own held reference (most everything) or by being in
* an rcu grace period (shrink).
*/
static bool block_remove_cnt(struct super_block *sb, struct block_private *bp, int cnt)
{
DECLARE_BLOCK_INFO(sb, binf);
int ret;
if ((cnt & BLOCK_REF_INSERTED) &&
(atomic_cmpxchg(&bp->refcount, cnt, cnt & ~BLOCK_REF_INSERTED) == cnt)) {
TRACE_BLOCK(remove, bp);
ret = rhashtable_remove_fast(&binf->ht, &bp->ht_head, block_ht_params);
WARN_ON_ONCE(ret); /* must have been inserted */
atomic_dec(&binf->total_inserted);
return true;
}
return false;
}
/*
* Try to remove the block from the hash table as long as the refcount
* indicates that it is still in the hash table. This can be racing
* with normal refcount changes so it might have to retry.
*/
static void block_remove(struct super_block *sb, struct block_private *bp)
{
int cnt;
do {
cnt = atomic_read(&bp->refcount);
} while ((cnt & BLOCK_REF_INSERTED) && !block_remove_cnt(sb, bp, cnt));
}
/*
* Take one shot at removing the block from the hash table if it's still
* in the hash table and the caller has the only other reference.
*/
static bool block_remove_solo(struct super_block *sb, struct block_private *bp)
{
return block_remove_cnt(sb, bp, BLOCK_REF_INSERTED | 1);
}
/*
* Remove the block from the cache. When this returns the block won't
* be visible for additional references from lookup.
*
* We always try and remove from the hash table. It's safe to remove a
* block that isn't hashed, it just returns -ENOENT.
*
* This is racing with the lru walk in the shrinker also trying to
* remove idle blocks from the cache. They both try to remove the live
* refcount base and perform their removal and put if they get it.
*/
static void block_remove(struct super_block *sb, struct block_private *bp)
{
DECLARE_BLOCK_INFO(sb, binf);
rhashtable_remove_fast(&binf->ht, &bp->ht_head, block_ht_params);
if (block_get_remove_live(bp)) {
list_lru_del_obj(&binf->lru, &bp->lru_head);
block_put(sb, bp);
}
}
static bool io_busy(struct block_private *bp)
@@ -396,37 +407,6 @@ static bool io_busy(struct block_private *bp)
return test_bit(BLOCK_BIT_IO_BUSY, &bp->bits);
}
/*
* Called during shutdown with no other users.
*/
static void block_remove_all(struct super_block *sb)
{
DECLARE_BLOCK_INFO(sb, binf);
struct rhashtable_iter iter;
struct block_private *bp;
rhashtable_walk_enter(&binf->ht, &iter);
rhashtable_walk_start(&iter);
for (;;) {
bp = rhashtable_walk_next(&iter);
if (bp == NULL)
break;
if (bp == ERR_PTR(-EAGAIN))
continue;
if (block_get_if_inserted(bp)) {
block_remove(sb, bp);
WARN_ON_ONCE(atomic_read(&bp->refcount) != 1);
block_put(sb, bp);
}
}
rhashtable_walk_stop(&iter);
rhashtable_walk_exit(&iter);
WARN_ON_ONCE(atomic_read(&binf->total_inserted) != 0);
}
/*
* XXX The io_count and sb fields in the block_private are only used
@@ -543,6 +523,10 @@ static int block_submit_bio(struct super_block *sb, struct block_private *bp,
return ret;
}
/*
* Return a block with an elevated refcount if it was present in the
* hash table and its refcount didn't indicate that it was being freed.
*/
static struct block_private *block_lookup(struct super_block *sb, u64 blkno)
{
DECLARE_BLOCK_INFO(sb, binf);
@@ -550,8 +534,8 @@ static struct block_private *block_lookup(struct super_block *sb, u64 blkno)
rcu_read_lock();
bp = rhashtable_lookup(&binf->ht, &blkno, block_ht_params);
if (bp)
bp = block_get_if_inserted(bp);
if (bp && !block_get_if_live(bp))
bp = NULL;
rcu_read_unlock();
return bp;
@@ -1078,102 +1062,108 @@ static unsigned long block_count_objects(struct shrinker *shrink, struct shrink_
struct super_block *sb = binf->sb;
scoutfs_inc_counter(sb, block_cache_count_objects);
return shrinker_min_long(atomic_read(&binf->total_inserted));
return list_lru_shrink_count(&binf->lru, sc);
}
struct isolate_args {
struct super_block *sb;
struct list_head dispose;
};
#define DECLARE_ISOLATE_ARGS(sb_, name_) \
struct isolate_args name_ = { \
.sb = sb_, \
.dispose = LIST_HEAD_INIT(name_.dispose), \
}
static enum lru_status isolate_lru_block(struct list_head *item, struct list_lru_one *list,
void *cb_arg)
{
struct block_private *bp = container_of(item, struct block_private, lru_head);
struct isolate_args *ia = cb_arg;
TRACE_BLOCK(isolate, bp);
/* rotate accessed blocks to the tail of the list (lazy promotion) */
if (test_and_clear_bit(BLOCK_BIT_ACCESSED, &bp->bits)) {
scoutfs_inc_counter(ia->sb, block_cache_isolate_rotate);
return LRU_ROTATE;
}
/* any refs, including dirty/io, stop us from acquiring lru refcount */
if (!block_get_remove_live_only(bp)) {
scoutfs_inc_counter(ia->sb, block_cache_isolate_skip);
return LRU_SKIP;
}
scoutfs_inc_counter(ia->sb, block_cache_isolate_removed);
list_lru_isolate_move(list, &bp->lru_head, &ia->dispose);
return LRU_REMOVED;
}
static void shrink_dispose_blocks(struct super_block *sb, struct list_head *dispose)
{
struct block_private *bp;
struct block_private *bp__;
list_for_each_entry_safe(bp, bp__, dispose, lru_head) {
list_del_init(&bp->lru_head);
block_remove(sb, bp);
block_put(sb, bp);
}
}
/*
* Remove a number of cached blocks that haven't been used recently.
*
* We don't maintain a strictly ordered LRU to avoid the contention of
* accesses always moving blocks around in some precise global
* structure.
*
* Instead we use counters to divide the blocks into two roughly equal
* groups by how recently they were accessed. We randomly walk all
* inserted blocks looking for any blocks in the older half to remove
* and free. The random walk and there being two groups means that we
* typically only walk a small multiple of the number we're looking for
* before we find them all.
*
* Our rcu walk of blocks can see blocks in all stages of their life
* cycle, from dirty blocks to those with 0 references that are queued
* for freeing. We only want to free idle inserted blocks so we
* atomically remove blocks when the only references are ours and the
* hash table.
*/
static unsigned long block_scan_objects(struct shrinker *shrink, struct shrink_control *sc)
{
struct block_info *binf = KC_SHRINKER_CONTAINER_OF(shrink, struct block_info);
struct super_block *sb = binf->sb;
struct rhashtable_iter iter;
struct block_private *bp;
DECLARE_ISOLATE_ARGS(sb, ia);
unsigned long freed;
bool stop = false;
unsigned long freed = 0;
unsigned long nr = sc->nr_to_scan;
u64 recently;
scoutfs_inc_counter(sb, block_cache_scan_objects);
recently = accessed_recently(binf);
rhashtable_walk_enter(&binf->ht, &iter);
freed = kc_list_lru_shrink_walk(&binf->lru, sc, isolate_lru_block, &ia);
shrink_dispose_blocks(sb, &ia.dispose);
rhashtable_walk_start(&iter);
/*
* This isn't great but I don't see a better way. We want to
* walk the hash from a random point so that we're not
* constantly walking over the same region that we've already
* freed old blocks within. The interface doesn't let us do
* this explicitly, but this seems to work? The difference this
* makes is enormous, around a few orders of magnitude fewer
* _nexts per shrink.
*/
if (iter.walker.tbl)
iter.slot = prandom_u32_max(iter.walker.tbl->size);
while (nr > 0) {
bp = rhashtable_walk_next(&iter);
if (bp == NULL)
break;
if (bp == ERR_PTR(-EAGAIN)) {
/*
* We can be called from reclaim in the allocation
* to resize the hash table itself. We have to
* return so that the caller can proceed and
* enable hash table iteration again.
*/
scoutfs_inc_counter(sb, block_cache_shrink_stop);
stop = true;
break;
}
scoutfs_inc_counter(sb, block_cache_shrink_next);
if (bp->accessed >= recently) {
scoutfs_inc_counter(sb, block_cache_shrink_recent);
continue;
}
if (block_get_if_inserted(bp)) {
if (block_remove_solo(sb, bp)) {
scoutfs_inc_counter(sb, block_cache_shrink_remove);
TRACE_BLOCK(shrink, bp);
freed++;
nr--;
}
block_put(sb, bp);
}
}
rhashtable_walk_stop(&iter);
rhashtable_walk_exit(&iter);
if (stop)
return SHRINK_STOP;
else
return freed;
}
static enum lru_status dump_lru_block(struct list_head *item, struct list_lru_one *list,
void *cb_arg)
{
struct block_private *bp = container_of(item, struct block_private, lru_head);
printk("blkno %llu refcount 0x%x io_count %d bits 0x%lx\n",
bp->bl.blkno, atomic_read(&bp->refcount), atomic_read(&bp->io_count),
bp->bits);
print_block_stack(bp);
return LRU_SKIP;
}
/*
* Called during shutdown with no other users. The isolating walk must
* find blocks on the lru that only have references for presence on the
* lru and in the hash table.
*/
static void block_shrink_all(struct super_block *sb)
{
DECLARE_BLOCK_INFO(sb, binf);
DECLARE_ISOLATE_ARGS(sb, ia);
long count;
count = DIV_ROUND_UP(list_lru_count(&binf->lru), 128) * 2;
do {
kc_list_lru_walk(&binf->lru, isolate_lru_block, &ia, 128);
shrink_dispose_blocks(sb, &ia.dispose);
} while (list_lru_count(&binf->lru) > 0 && --count > 0);
count = list_lru_count(&binf->lru);
if (count > 0) {
scoutfs_err(sb, "failed to isolate/dispose %ld blocks", count);
kc_list_lru_walk(&binf->lru, dump_lru_block, sb, count);
}
}
struct sm_block_completion {
struct completion comp;
int err;
@@ -1276,7 +1266,7 @@ int scoutfs_block_write_sm(struct super_block *sb,
int scoutfs_block_setup(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct block_info *binf;
struct block_info *binf = NULL;
int ret;
binf = kzalloc(sizeof(struct block_info), GFP_KERNEL);
@@ -1285,15 +1275,15 @@ int scoutfs_block_setup(struct super_block *sb)
goto out;
}
ret = rhashtable_init(&binf->ht, &block_ht_params);
if (ret < 0) {
kfree(binf);
goto out;
}
ret = list_lru_init(&binf->lru);
if (ret < 0)
goto out;
ret = rhashtable_init(&binf->ht, &block_ht_params);
if (ret < 0)
goto out;
binf->sb = sb;
atomic_set(&binf->total_inserted, 0);
atomic64_set(&binf->access_counter, 0);
init_waitqueue_head(&binf->waitq);
KC_INIT_SHRINKER_FUNCS(&binf->shrinker, block_count_objects,
block_scan_objects);
@@ -1305,8 +1295,10 @@ int scoutfs_block_setup(struct super_block *sb)
ret = 0;
out:
if (ret)
scoutfs_block_destroy(sb);
if (ret < 0 && binf) {
list_lru_destroy(&binf->lru);
kfree(binf);
}
return ret;
}
@@ -1318,9 +1310,10 @@ void scoutfs_block_destroy(struct super_block *sb)
if (binf) {
KC_UNREGISTER_SHRINKER(&binf->shrinker);
block_remove_all(sb);
block_shrink_all(sb);
flush_work(&binf->free_work);
rhashtable_destroy(&binf->ht);
list_lru_destroy(&binf->lru);
kfree(binf);
sbi->block_info = NULL;

View File

@@ -26,17 +26,15 @@
EXPAND_COUNTER(block_cache_alloc_page_order) \
EXPAND_COUNTER(block_cache_alloc_virt) \
EXPAND_COUNTER(block_cache_end_io_error) \
EXPAND_COUNTER(block_cache_isolate_removed) \
EXPAND_COUNTER(block_cache_isolate_rotate) \
EXPAND_COUNTER(block_cache_isolate_skip) \
EXPAND_COUNTER(block_cache_forget) \
EXPAND_COUNTER(block_cache_free) \
EXPAND_COUNTER(block_cache_free_work) \
EXPAND_COUNTER(block_cache_remove_stale) \
EXPAND_COUNTER(block_cache_count_objects) \
EXPAND_COUNTER(block_cache_scan_objects) \
EXPAND_COUNTER(block_cache_shrink) \
EXPAND_COUNTER(block_cache_shrink_next) \
EXPAND_COUNTER(block_cache_shrink_recent) \
EXPAND_COUNTER(block_cache_shrink_remove) \
EXPAND_COUNTER(block_cache_shrink_stop) \
EXPAND_COUNTER(btree_compact_values) \
EXPAND_COUNTER(btree_compact_values_enomem) \
EXPAND_COUNTER(btree_delete) \
@@ -118,10 +116,11 @@
EXPAND_COUNTER(item_pcpu_page_hit) \
EXPAND_COUNTER(item_pcpu_page_miss) \
EXPAND_COUNTER(item_pcpu_page_miss_keys) \
EXPAND_COUNTER(item_read_pages_barrier) \
EXPAND_COUNTER(item_read_pages_retry) \
EXPAND_COUNTER(item_read_pages_split) \
EXPAND_COUNTER(item_shrink_page) \
EXPAND_COUNTER(item_shrink_page_dirty) \
EXPAND_COUNTER(item_shrink_page_reader) \
EXPAND_COUNTER(item_shrink_page_trylock) \
EXPAND_COUNTER(item_update) \
EXPAND_COUNTER(item_write_dirty) \

View File

@@ -86,6 +86,8 @@ struct item_cache_info {
/* often walked, but per-cpu refs are fast path */
rwlock_t rwlock;
struct rb_root pg_root;
/* stop readers from caching stale items behind reclaimed cleaned written items */
u64 read_dirty_barrier;
/* page-granular modification by writers, then exclusive to commit */
spinlock_t dirty_lock;
@@ -96,10 +98,6 @@ struct item_cache_info {
spinlock_t lru_lock;
struct list_head lru_list;
unsigned long lru_pages;
/* written by page readers, read by shrink */
spinlock_t active_lock;
struct list_head active_list;
};
#define DECLARE_ITEM_CACHE_INFO(sb, name) \
@@ -1285,78 +1283,6 @@ static int cache_empty_page(struct super_block *sb,
return 0;
}
/*
* Readers operate independently from dirty items and transactions.
* They read a set of persistent items and insert them into the cache
* when there aren't already pages whose key range contains the items.
* This naturally prefers cached dirty items over stale read items.
*
* We have to deal with the case where dirty items are written and
* invalidated while a read is in flight. The reader won't have seen
* the items that were dirty in their persistent roots as they started
* reading. By the time they insert their read pages the previously
* dirty items have been reclaimed and are not in the cache. The old
* stale items will be inserted in their place, effectively corrupting
* by having the dirty items disappear.
*
* We fix this by tracking the max seq of items in pages. As readers
* start they record the current transaction seq. Invalidation skips
* pages with a max seq greater than the first reader seq because the
* items in the page have to stick around to prevent the readers stale
* items from being inserted.
*
* This naturally only affects a small set of pages with items that were
* written relatively recently. If we're in memory pressure then we
* probably have a lot of pages and they'll naturally have items that
* were visible to any readers. We don't bother with the complicated and
* expensive further refinement of tracking the ranges that are being
* read and comparing those with pages to invalidate.
*/
struct active_reader {
struct list_head head;
u64 seq;
};
#define INIT_ACTIVE_READER(rdr) \
struct active_reader rdr = { .head = LIST_HEAD_INIT(rdr.head) }
static void add_active_reader(struct super_block *sb, struct active_reader *active)
{
DECLARE_ITEM_CACHE_INFO(sb, cinf);
BUG_ON(!list_empty(&active->head));
active->seq = scoutfs_trans_sample_seq(sb);
spin_lock(&cinf->active_lock);
list_add_tail(&active->head, &cinf->active_list);
spin_unlock(&cinf->active_lock);
}
static u64 first_active_reader_seq(struct item_cache_info *cinf)
{
struct active_reader *active;
u64 first;
/* only the calling task adds or deletes this active */
spin_lock(&cinf->active_lock);
active = list_first_entry_or_null(&cinf->active_list, struct active_reader, head);
first = active ? active->seq : U64_MAX;
spin_unlock(&cinf->active_lock);
return first;
}
static void del_active_reader(struct item_cache_info *cinf, struct active_reader *active)
{
/* only the calling task adds or deletes this active */
if (!list_empty(&active->head)) {
spin_lock(&cinf->active_lock);
list_del_init(&active->head);
spin_unlock(&cinf->active_lock);
}
}
/*
* Add a newly read item to the pages that we're assembling for
* insertion into the cache. These pages are private, they only exist
@@ -1450,24 +1376,34 @@ static int read_page_item(struct super_block *sb, struct scoutfs_key *key, u64 s
* and duplicates, we insert any resulting pages which don't overlap
* with existing cached pages.
*
* We only insert uncached regions because this is called with cluster
* locks held, but without locking the cache. The regions we read can
* be stale with respect to the current cache, which can be read and
* dirtied by other cluster lock holders on our node, but the cluster
* locks protect the stable items we read. Invalidation is careful not
* to drop pages that have items that we couldn't see because they were
* dirty when we started reading.
*
* The forest item reader is reading stable trees that could be
* overwritten. It can return -ESTALE which we return to the caller who
* will retry the operation and work with a new set of more recent
* btrees.
*
* We only insert uncached regions because this is called with cluster
* locks held, but without locking the cache. The regions we read can
* be stale with respect to the current cache, which can be read and
* dirtied by other cluster lock holders on our node, but the cluster
* locks protect the stable items we read.
*
* Using the presence of locally written dirty pages to override stale
* read pages only works if, well, the more recent locally written pages
* are still present. Readers are totally decoupled from writers and
* can have a set of items that is very old indeed. In the mean time
* more recent items would have been dirtied locally, committed,
* cleaned, and reclaimed. We have a coarse barrier which ensures that
* readers can't insert items read from old roots from before local data
* was written. If a write completes while a read is in progress the
* read will have to retry. The retried read can use cached blocks so
* we're relying on reads being much faster than writes to reduce the
* overhead to mostly cpu work of recollecting the items from cached
* blocks via a more recent root from the server.
*/
static int read_pages(struct super_block *sb, struct item_cache_info *cinf,
struct scoutfs_key *key, struct scoutfs_lock *lock)
{
struct rb_root root = RB_ROOT;
INIT_ACTIVE_READER(active);
struct cached_page *right = NULL;
struct cached_page *pg;
struct cached_page *rd;
@@ -1480,6 +1416,7 @@ static int read_pages(struct super_block *sb, struct item_cache_info *cinf,
struct rb_node *par;
struct rb_node *pg_tmp;
struct rb_node *item_tmp;
u64 rdbar;
int pgi;
int ret;
@@ -1493,8 +1430,9 @@ static int read_pages(struct super_block *sb, struct item_cache_info *cinf,
pg->end = lock->end;
rbtree_insert(&pg->node, NULL, &root.rb_node, &root);
/* set active reader seq before reading persistent roots */
add_active_reader(sb, &active);
read_lock(&cinf->rwlock);
rdbar = cinf->read_dirty_barrier;
read_unlock(&cinf->rwlock);
start = lock->start;
end = lock->end;
@@ -1533,6 +1471,13 @@ static int read_pages(struct super_block *sb, struct item_cache_info *cinf,
retry:
write_lock(&cinf->rwlock);
/* can't insert if write has cleaned since we read */
if (cinf->read_dirty_barrier != rdbar) {
scoutfs_inc_counter(sb, item_read_pages_barrier);
ret = -ESTALE;
goto unlock;
}
while ((rd = first_page(&root))) {
pg = page_rbtree_walk(sb, &cinf->pg_root, &rd->start, &rd->end,
@@ -1570,12 +1515,12 @@ retry:
}
}
ret = 0;
unlock:
write_unlock(&cinf->rwlock);
ret = 0;
out:
del_active_reader(cinf, &active);
/* free any pages we left dangling on error */
for_each_page_safe(&root, rd, pg_tmp) {
rbtree_erase(&rd->node, &root);
@@ -1635,6 +1580,7 @@ retry:
ret = read_pages(sb, cinf, key, lock);
if (ret < 0 && ret != -ESTALE)
goto out;
scoutfs_inc_counter(sb, item_read_pages_retry);
goto retry;
}
@@ -2415,6 +2361,11 @@ int scoutfs_item_write_done(struct super_block *sb)
struct cached_item *tmp;
struct cached_page *pg;
/* don't let read_pages miss written+cleaned items */
write_lock(&cinf->rwlock);
cinf->read_dirty_barrier++;
write_unlock(&cinf->rwlock);
spin_lock(&cinf->dirty_lock);
while ((pg = list_first_entry_or_null(&cinf->dirty_list, struct cached_page, dirty_head))) {
if (write_trylock(&pg->rwlock)) {
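The read_dirty_barrier handshake added in the hunks above can be modeled compactly. The sketch below is a hedged userspace approximation (a pthread rwlock instead of the item cache's rwlock_t, and hypothetical helper names; it is not scoutfs code): the writer bumps the barrier under the write lock once dirty items have been written and may be reclaimed, and a reader snapshots the barrier before reading stable roots, then rechecks it under the write lock and refuses to insert if it changed, which is the moral equivalent of returning -ESTALE so the caller retries.

/* Simplified userspace model of the read_dirty_barrier handshake (not scoutfs code). */
#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>

static pthread_rwlock_t cache_rwlock = PTHREAD_RWLOCK_INITIALIZER;
static uint64_t read_dirty_barrier;

/* writer side: called after dirty items are written and can be reclaimed */
static void write_done(void)
{
	pthread_rwlock_wrlock(&cache_rwlock);
	read_dirty_barrier++;
	pthread_rwlock_unlock(&cache_rwlock);
}

/* reader side: returns false ("stale") if a write completed mid-read */
static bool read_and_insert(void (*read_stable_items)(void),
			    void (*insert_pages)(void))
{
	uint64_t rdbar;
	bool ok;

	pthread_rwlock_rdlock(&cache_rwlock);
	rdbar = read_dirty_barrier;		/* snapshot before reading roots */
	pthread_rwlock_unlock(&cache_rwlock);

	read_stable_items();			/* slow work done without the lock */

	pthread_rwlock_wrlock(&cache_rwlock);
	ok = (read_dirty_barrier == rdbar);	/* recheck before inserting */
	if (ok)
		insert_pages();
	pthread_rwlock_unlock(&cache_rwlock);

	return ok;				/* caller retries when false */
}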
@@ -2593,24 +2544,15 @@ static unsigned long item_cache_scan_objects(struct shrinker *shrink,
struct cached_page *tmp;
struct cached_page *pg;
unsigned long freed = 0;
u64 first_reader_seq;
int nr = sc->nr_to_scan;
scoutfs_inc_counter(sb, item_cache_scan_objects);
/* can't invalidate pages with items that weren't visible to first reader */
first_reader_seq = first_active_reader_seq(cinf);
write_lock(&cinf->rwlock);
spin_lock(&cinf->lru_lock);
list_for_each_entry_safe(pg, tmp, &cinf->lru_list, lru_head) {
if (first_reader_seq <= pg->max_seq) {
scoutfs_inc_counter(sb, item_shrink_page_reader);
continue;
}
if (!write_trylock(&pg->rwlock)) {
scoutfs_inc_counter(sb, item_shrink_page_trylock);
continue;
@@ -2677,8 +2619,6 @@ int scoutfs_item_setup(struct super_block *sb)
atomic_set(&cinf->dirty_pages, 0);
spin_lock_init(&cinf->lru_lock);
INIT_LIST_HEAD(&cinf->lru_list);
spin_lock_init(&cinf->active_lock);
INIT_LIST_HEAD(&cinf->active_list);
cinf->pcpu_pages = alloc_percpu(struct item_percpu_pages);
if (!cinf->pcpu_pages)
@@ -2711,8 +2651,6 @@ void scoutfs_item_destroy(struct super_block *sb)
int cpu;
if (cinf) {
BUG_ON(!list_empty(&cinf->active_list));
#ifdef KC_CPU_NOTIFIER
unregister_hotcpu_notifier(&cinf->notifier);
#endif

View File

@@ -81,3 +81,69 @@ kc_generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
return written ? written : status;
}
#endif
#include <linux/list_lru.h>
#ifdef KC_LIST_LRU_WALK_CB_ITEM_LOCK
static enum lru_status kc_isolate(struct list_head *item, spinlock_t *lock, void *cb_arg)
{
struct kc_isolate_args *args = cb_arg;
/* isolate doesn't use list, nr_items updated in caller */
return args->isolate(item, NULL, args->cb_arg);
}
unsigned long kc_list_lru_walk(struct list_lru *lru, kc_list_lru_walk_cb_t isolate, void *cb_arg,
unsigned long nr_to_walk)
{
struct kc_isolate_args args = {
.isolate = isolate,
.cb_arg = cb_arg,
};
return list_lru_walk(lru, kc_isolate, &args, nr_to_walk);
}
unsigned long kc_list_lru_shrink_walk(struct list_lru *lru, struct shrink_control *sc,
kc_list_lru_walk_cb_t isolate, void *cb_arg)
{
struct kc_isolate_args args = {
.isolate = isolate,
.cb_arg = cb_arg,
};
return list_lru_shrink_walk(lru, sc, kc_isolate, &args);
}
#endif
#ifdef KC_LIST_LRU_WALK_CB_LIST_LOCK
static enum lru_status kc_isolate(struct list_head *item, struct list_lru_one *list,
spinlock_t *lock, void *cb_arg)
{
struct kc_isolate_args *args = cb_arg;
return args->isolate(item, list, args->cb_arg);
}
unsigned long kc_list_lru_walk(struct list_lru *lru, kc_list_lru_walk_cb_t isolate, void *cb_arg,
unsigned long nr_to_walk)
{
struct kc_isolate_args args = {
.isolate = isolate,
.cb_arg = cb_arg,
};
return list_lru_walk(lru, kc_isolate, &args, nr_to_walk);
}
unsigned long kc_list_lru_shrink_walk(struct list_lru *lru, struct shrink_control *sc,
kc_list_lru_walk_cb_t isolate, void *cb_arg)
{
struct kc_isolate_args args = {
.isolate = isolate,
.cb_arg = cb_arg,
};
return list_lru_shrink_walk(lru, sc, kc_isolate, &args);
}
#endif

View File

@@ -410,4 +410,77 @@ static inline vm_fault_t vmf_error(int err)
}
#endif
#include <linux/list_lru.h>
#ifndef KC_LIST_LRU_SHRINK_COUNT_WALK
/* we don't bother with sc->{nid,memcg} (which doesn't exist in oldest kernels) */
static inline unsigned long list_lru_shrink_count(struct list_lru *lru,
struct shrink_control *sc)
{
return list_lru_count(lru);
}
static inline unsigned long
list_lru_shrink_walk(struct list_lru *lru, struct shrink_control *sc,
list_lru_walk_cb isolate, void *cb_arg)
{
return list_lru_walk(lru, isolate, cb_arg, sc->nr_to_scan);
}
#endif
#ifndef KC_LIST_LRU_ADD_OBJ
#define list_lru_add_obj list_lru_add
#define list_lru_del_obj list_lru_del
#endif
#if defined(KC_LIST_LRU_WALK_CB_LIST_LOCK) || defined(KC_LIST_LRU_WALK_CB_ITEM_LOCK)
struct list_lru_one;
typedef enum lru_status (*kc_list_lru_walk_cb_t)(struct list_head *item, struct list_lru_one *list,
void *cb_arg);
struct kc_isolate_args {
kc_list_lru_walk_cb_t isolate;
void *cb_arg;
};
unsigned long kc_list_lru_walk(struct list_lru *lru, kc_list_lru_walk_cb_t isolate, void *cb_arg,
unsigned long nr_to_walk);
unsigned long kc_list_lru_shrink_walk(struct list_lru *lru, struct shrink_control *sc,
kc_list_lru_walk_cb_t isolate, void *cb_arg);
#else
#define kc_list_lru_shrink_walk list_lru_shrink_walk
#endif
#if defined(KC_LIST_LRU_WALK_CB_ITEM_LOCK)
/* isolate moved by hand, nr_items updated in walk as _REMOVE returned */
static inline void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item,
struct list_head *head)
{
list_move(item, head);
}
#endif
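With the shims above, callers write a single isolate callback against the kc_ signature and the wrappers adapt it to whichever list_lru walk API the kernel provides. A hedged sketch of a caller follows (the entry type and function names are hypothetical; block.c in this diff is the real user):

/*
 * Hypothetical caller of the unified kc_ walk interface; see
 * isolate_lru_block() in block.c for the real thing.
 */
#include <linux/list_lru.h>

struct example_entry {
	struct list_head lru_head;
};

static enum lru_status example_isolate(struct list_head *item, struct list_lru_one *list,
				       void *cb_arg)
{
	struct example_entry *ent = container_of(item, struct example_entry, lru_head);
	struct list_head *dispose = cb_arg;

	/* move idle entries to a private list, free them after the walk */
	list_lru_isolate_move(list, &ent->lru_head, dispose);
	return LRU_REMOVED;
}

static unsigned long example_scan(struct list_lru *lru, struct shrink_control *sc)
{
	LIST_HEAD(dispose);

	/* the same call works on every supported kernel; the shims adapt the signature */
	return kc_list_lru_shrink_walk(lru, sc, example_isolate, &dispose);
}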
#ifndef KC_STACK_TRACE_SAVE
#include <linux/stacktrace.h>
static inline unsigned int stack_trace_save(unsigned long *store, unsigned int size,
unsigned int skipnr)
{
struct stack_trace trace = {
.entries = store,
.max_entries = size,
.skip = skipnr,
};
save_stack_trace(&trace);
return trace.nr_entries;
}
static inline void stack_trace_print(unsigned long *entries, unsigned int nr_entries, int spaces)
{
struct stack_trace trace = {
.entries = entries,
.nr_entries = nr_entries,
};
print_stack_trace(&trace, spaces);
}
#endif
#endif

View File

@@ -2526,8 +2526,8 @@ TRACE_EVENT(scoutfs_block_stale,
DECLARE_EVENT_CLASS(scoutfs_block_class,
TP_PROTO(struct super_block *sb, void *bp, u64 blkno, int refcount, int io_count,
unsigned long bits, __u64 accessed),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed),
unsigned long bits),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(void *, bp)
@@ -2535,7 +2535,6 @@ DECLARE_EVENT_CLASS(scoutfs_block_class,
__field(int, refcount)
__field(int, io_count)
__field(long, bits)
__field(__u64, accessed)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
@@ -2544,71 +2543,65 @@ DECLARE_EVENT_CLASS(scoutfs_block_class,
__entry->refcount = refcount;
__entry->io_count = io_count;
__entry->bits = bits;
__entry->accessed = accessed;
),
TP_printk(SCSBF" bp %p blkno %llu refcount %d io_count %d bits 0x%lx accessed %llu",
SCSB_TRACE_ARGS, __entry->bp, __entry->blkno, __entry->refcount,
__entry->io_count, __entry->bits, __entry->accessed)
TP_printk(SCSBF" bp %p blkno %llu refcount %x io_count %d bits 0x%lx",
SCSB_TRACE_ARGS, __entry->bp, __entry->blkno, __entry->refcount,
__entry->io_count, __entry->bits)
);
DEFINE_EVENT(scoutfs_block_class, scoutfs_block_allocate,
TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
int refcount, int io_count, unsigned long bits,
__u64 accessed),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
int refcount, int io_count, unsigned long bits),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
);
DEFINE_EVENT(scoutfs_block_class, scoutfs_block_free,
TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
int refcount, int io_count, unsigned long bits,
__u64 accessed),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
int refcount, int io_count, unsigned long bits),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
);
DEFINE_EVENT(scoutfs_block_class, scoutfs_block_insert,
TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
int refcount, int io_count, unsigned long bits,
__u64 accessed),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
int refcount, int io_count, unsigned long bits),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
);
DEFINE_EVENT(scoutfs_block_class, scoutfs_block_remove,
TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
int refcount, int io_count, unsigned long bits,
__u64 accessed),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
int refcount, int io_count, unsigned long bits),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
);
DEFINE_EVENT(scoutfs_block_class, scoutfs_block_end_io,
TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
int refcount, int io_count, unsigned long bits,
__u64 accessed),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
int refcount, int io_count, unsigned long bits),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
);
DEFINE_EVENT(scoutfs_block_class, scoutfs_block_submit,
TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
int refcount, int io_count, unsigned long bits,
__u64 accessed),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
int refcount, int io_count, unsigned long bits),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
);
DEFINE_EVENT(scoutfs_block_class, scoutfs_block_invalidate,
TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
int refcount, int io_count, unsigned long bits,
__u64 accessed),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
int refcount, int io_count, unsigned long bits),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
);
DEFINE_EVENT(scoutfs_block_class, scoutfs_block_mark_dirty,
TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
int refcount, int io_count, unsigned long bits,
__u64 accessed),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
int refcount, int io_count, unsigned long bits),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
);
DEFINE_EVENT(scoutfs_block_class, scoutfs_block_forget,
TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
int refcount, int io_count, unsigned long bits,
__u64 accessed),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
int refcount, int io_count, unsigned long bits),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
);
DEFINE_EVENT(scoutfs_block_class, scoutfs_block_shrink,
TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
int refcount, int io_count, unsigned long bits,
__u64 accessed),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits, accessed)
int refcount, int io_count, unsigned long bits),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
);
DEFINE_EVENT(scoutfs_block_class, scoutfs_block_isolate,
TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
int refcount, int io_count, unsigned long bits),
TP_ARGS(sb, bp, blkno, refcount, io_count, bits)
);
DECLARE_EVENT_CLASS(scoutfs_ext_next_class, DECLARE_EVENT_CLASS(scoutfs_ext_next_class,

View File

@@ -2134,7 +2134,7 @@ static int server_srch_commit_compact(struct super_block *sb,
&super->srch_root, rid, sc,
&av, &fr);
mutex_unlock(&server->srch_mutex);
if (ret < 0) /* XXX very bad, leaks allocators */
if (ret < 0)
goto apply;
/* reclaim allocators if they were set by _srch_commit_ */
@@ -2144,10 +2144,10 @@ static int server_srch_commit_compact(struct super_block *sb,
scoutfs_alloc_splice_list(sb, &server->alloc, &server->wri,
server->other_freed, &fr);
mutex_unlock(&server->alloc_mutex);
WARN_ON(ret < 0); /* XXX leaks allocators */
apply:
ret = server_apply_commit(sb, &hold, ret);
out:
WARN_ON(ret < 0); /* XXX leaks allocators */
return scoutfs_net_response(sb, conn, cmd, id, ret, NULL, 0);
}

View File

@@ -1406,7 +1406,7 @@ int scoutfs_srch_commit_compact(struct super_block *sb,
ret = -EIO;
scoutfs_btree_put_iref(&iref);
}
if (ret < 0) /* XXX leaks allocators */
if (ret < 0)
goto out;
/* restore busy to pending if the operation failed */
@@ -1426,10 +1426,8 @@ int scoutfs_srch_commit_compact(struct super_block *sb,
/* update file references if we finished compaction (!deleting) */
if (!(res->flags & SCOUTFS_SRCH_COMPACT_FLAG_DELETE)) {
ret = commit_files(sb, alloc, wri, root, res);
if (ret < 0) {
/* XXX we can't commit, shutdown? */
goto out;
}
if (ret < 0)
goto out;
/* transition flags for deleting input files */
for (i = 0; i < res->nr; i++) {
@@ -1456,7 +1454,7 @@ update:
le64_to_cpu(pending->id), 0);
ret = scoutfs_btree_insert(sb, alloc, wri, root, &key,
pending, sizeof(*pending));
if (ret < 0)
if (WARN_ON_ONCE(ret < 0)) /* XXX inconsistency */
goto out;
}
@@ -1469,7 +1467,6 @@ update:
BUG_ON(err); /* both busy and pending present */
}
out:
WARN_ON_ONCE(ret < 0); /* XXX inconsistency */
kfree(busy);
return ret;
}

View File

@@ -90,6 +90,7 @@ done
# set some T_ defaults
T_TRACE_DUMP="0"
T_TRACE_PRINTK="0"
T_PORT_START="19700"
# array declarations to be able to use array ops
declare -a T_TRACE_GLOB
@@ -265,6 +266,17 @@ for e in T_META_DEVICE T_DATA_DEVICE T_EX_META_DEV T_EX_DATA_DEV T_KMOD T_RESULT
eval $e=\"$(readlink -f "${!e}")\"
done
# try and check ports, but not necessary
T_TEST_PORT="$T_PORT_START"
T_SCRATCH_PORT="$((T_PORT_START + 100))"
T_DEV_PORT="$((T_PORT_START + 200))"
read local_start local_end < /proc/sys/net/ipv4/ip_local_port_range
if [ -n "$local_start" -a -n "$local_end" -a "$local_start" -lt "$local_end" ]; then
if [ ! "$T_DEV_PORT" -lt "$local_start" -a ! "$T_TEST_PORT" -gt "$local_end" ]; then
die "listening port range $T_TEST_PORT - $T_DEV_PORT is within local dynamic port range $local_start - $local_end in /proc/sys/net/ipv4/ip_local_port_range"
fi
fi
# permute sequence?
T_SEQUENCE=sequence
if [ -n "$T_SHUF" ]; then
@@ -363,7 +375,7 @@ fi
quo="" quo=""
if [ -n "$T_MKFS" ]; then if [ -n "$T_MKFS" ]; then
for i in $(seq -0 $((T_QUORUM - 1))); do for i in $(seq -0 $((T_QUORUM - 1))); do
quo="$quo -Q $i,127.0.0.1,$((42000 + i))" quo="$quo -Q $i,127.0.0.1,$((T_TEST_PORT + i))"
done done
msg "making new filesystem with $T_QUORUM quorum members" msg "making new filesystem with $T_QUORUM quorum members"

View File

@@ -15,7 +15,7 @@ echo "== prepare devices, mount point, and logs"
SCR="$T_TMPDIR/mnt.scratch" SCR="$T_TMPDIR/mnt.scratch"
mkdir -p "$SCR" mkdir -p "$SCR"
> $T_TMP.mount.out > $T_TMP.mount.out
scoutfs mkfs -f -Q 0,127.0.0.1,53000 "$T_EX_META_DEV" "$T_EX_DATA_DEV" > $T_TMP.mkfs.out 2>&1 \ scoutfs mkfs -f -Q 0,127.0.0.1,$T_SCRATCH_PORT "$T_EX_META_DEV" "$T_EX_DATA_DEV" > $T_TMP.mkfs.out 2>&1 \
|| t_fail "mkfs failed" || t_fail "mkfs failed"
echo "== bad devices, bad options" echo "== bad devices, bad options"

View File

truncate -s $sz "$T_TMP.equal"
truncate -s $large_sz "$T_TMP.large"
echo "== make scratch fs"
t_quiet scoutfs mkfs -f -Q 0,127.0.0.1,53000 "$T_EX_META_DEV" "$T_EX_DATA_DEV"
t_quiet scoutfs mkfs -f -Q 0,127.0.0.1,$T_SCRATCH_PORT "$T_EX_META_DEV" "$T_EX_DATA_DEV"
SCR="$T_TMPDIR/mnt.scratch"
mkdir -p "$SCR"

View File

@@ -57,7 +57,7 @@ test "$before" == "$after" || \
# XXX this is all pretty manual, would be nice to have helpers # XXX this is all pretty manual, would be nice to have helpers
echo "== make small meta fs" echo "== make small meta fs"
# meta device just big enough for reserves and the metadata we'll fill # meta device just big enough for reserves and the metadata we'll fill
scoutfs mkfs -A -f -Q 0,127.0.0.1,53000 -m 10G "$T_EX_META_DEV" "$T_EX_DATA_DEV" > $T_TMP.mkfs.out 2>&1 || \ scoutfs mkfs -A -f -Q 0,127.0.0.1,$T_SCRATCH_PORT -m 10G "$T_EX_META_DEV" "$T_EX_DATA_DEV" > $T_TMP.mkfs.out 2>&1 || \
t_fail "mkfs failed" t_fail "mkfs failed"
SCR="$T_TMPDIR/mnt.scratch" SCR="$T_TMPDIR/mnt.scratch"
mkdir -p "$SCR" mkdir -p "$SCR"

View File

@@ -89,7 +89,7 @@ for vers in $(seq $MIN $((MAX - 1))); do
old_module="$builds/$vers/scoutfs.ko" old_module="$builds/$vers/scoutfs.ko"
echo "mkfs $vers" >> "$T_TMP.log" echo "mkfs $vers" >> "$T_TMP.log"
t_quiet $old_scoutfs mkfs -f -Q 0,127.0.0.1,53000 "$T_EX_META_DEV" "$T_EX_DATA_DEV" \ t_quiet $old_scoutfs mkfs -f -Q 0,127.0.0.1,$T_SCRATCH_PORT "$T_EX_META_DEV" "$T_EX_DATA_DEV" \
|| t_fail "mkfs $vers failed" || t_fail "mkfs $vers failed"
echo "mount $vers with $vers" >> "$T_TMP.log" echo "mount $vers with $vers" >> "$T_TMP.log"

View File

quarter_data=$(echo "$size_data / 4" | bc)
# XXX this is all pretty manual, would be nice to have helpers
echo "== make initial small fs"
scoutfs mkfs -A -f -Q 0,127.0.0.1,53000 -m $quarter_meta -d $quarter_data \
scoutfs mkfs -A -f -Q 0,127.0.0.1,$T_SCRATCH_PORT -m $quarter_meta -d $quarter_data \
"$T_EX_META_DEV" "$T_EX_DATA_DEV" > $T_TMP.mkfs.out 2>&1 || \
t_fail "mkfs failed"
SCR="$T_TMPDIR/mnt.scratch"

View File

t_quiet sync
cat << EOF > local.config
export FSTYP=scoutfs
export MKFS_OPTIONS="-f"
export MKFS_TEST_OPTIONS="-Q 0,127.0.0.1,42000"
export MKFS_SCRATCH_OPTIONS="-Q 0,127.0.0.1,43000"
export MKFS_DEV_OPTIONS="-Q 0,127.0.0.1,44000"
export MKFS_TEST_OPTIONS="-Q 0,127.0.0.1,$T_TEST_PORT"
export MKFS_SCRATCH_OPTIONS="-Q 0,127.0.0.1,$T_SCRATCH_PORT"
export MKFS_DEV_OPTIONS="-Q 0,127.0.0.1,$T_DEV_PORT"
export TEST_DEV=$T_DB0
export TEST_DIR=$T_M0
export SCRATCH_META_DEV=$T_EX_META_DEV