Compare commits

..

110 Commits

Author SHA1 Message Date
Zach Brown
0c3085c6e8 Remove unused orig keys from forest read items
These orig copies of the start and end keys serve no purpose.  I think
they were an editing mistake left over from a version where retries
could happen within the read_items call.

Signed-off-by: Zach Brown <zab@versity.com>
2026-04-27 10:15:19 -07:00
Zach Brown
97f2d1ce8d Make forest_read_items bloom key optional
Allow callers of scoutfs_forest_read_items() to not provide a bloom key
if they don't want the bloom filter blocks checked.

Signed-off-by: Zach Brown <zab@versity.com>
2026-04-27 10:15:19 -07:00
Zach Brown
bd14a369e3 Let btree read items callback modify key
Let the callback for scoutfs_btree_read_items() specify a new key to
iterate from in the block rather than always iterating over all the
items.  The callback returns a specific error to trigger this behavior
and none of the current callers can return this error value.
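
A minimal sketch of what a caller could now do (hypothetical callback
and helpers; the argument types are inferred from the read_items call
site in the btree.c hunk below):

    static int skip_cb(struct super_block *sb, struct scoutfs_key *key, u64 seq,
                       u8 flags, void *val, int val_len, void *arg)
    {
            if (!key_is_interesting(key)) {        /* hypothetical predicate */
                    set_next_interesting(key);     /* hypothetical: modify key in place */
                    return -ESRCH;                 /* read_items re-searches from *key */
            }
            return 0;                              /* continue with the next item */
    }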

Signed-off-by: Zach Brown <zab@versity.com>
2026-04-27 10:15:19 -07:00
Zach Brown
448dd74663 Add test example for watching changing inodes
Signed-off-by: Zach Brown <zab@versity.com>
2026-04-27 10:15:19 -07:00
Zach Brown
57785066c0 Add raw_read_inode_info ioctl
Add an ioctl for reading inode metadata (inode struct and xattrs) in
bulk and without cluster locking.

Signed-off-by: Zach Brown <zab@versity.com>
2026-04-27 10:15:19 -07:00
Zach Brown
b23022444a Add scoutfs_lock_get_fs_item_range()
Add another lock call for getting lock ranges, this time for fs items.

Signed-off-by: Zach Brown <zab@versity.com>
2026-04-27 10:15:19 -07:00
Zach Brown
64931c395d Add bsearch_index()
Add a wrapper around bsearch() that returns an index into the array that
the key would occupy rather than only being able to return pointers to
array members that match the key.
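
A minimal usage sketch (the cmp callback follows the bsearch()
convention of key first, array element second):

    static int cmp_u32(const void *key, const void *elt)
    {
            u32 k = *(const u32 *)key;
            u32 e = *(const u32 *)elt;

            return k < e ? -1 : k > e ? 1 : 0;
    }

    u32 arr[] = { 10, 20, 30 };
    u32 key = 25;

    /* returns 2: the index 25 would occupy to keep arr sorted */
    size_t idx = bsearch_index(&key, arr, ARRAY_SIZE(arr), sizeof(arr[0]), cmp_u32);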

Signed-off-by: Zach Brown <zab@versity.com>
2026-04-27 10:15:19 -07:00
Zach Brown
380442515e Add scoutfs_alloc declaration to block.h
block.h has an undeclared use of struct scoutfs_alloc that was relying
on previously included headers.  Add a declaration of the struct (that
we don't dereference) so the header can be used on its own.

Signed-off-by: Zach Brown <zab@versity.com>
2026-04-27 10:15:19 -07:00
Zach Brown
750e998e40 Add raw_read_meta_seq ioctl
Add an ioctl for reading the meta_seq index without cluster locking.

Signed-off-by: Zach Brown <zab@versity.com>
2026-04-27 10:15:19 -07:00
Zach Brown
747a8bc53d Remove scoutfs_ioctl_key
We've long since removed ioctls that exposed keys directly.

Signed-off-by: Zach Brown <zab@versity.com>
2026-04-27 09:58:19 -07:00
Zach Brown
77327ae713 Export xattr key init and hash generation
This is going to be used by a bulk metadata gathering operation.

Signed-off-by: Zach Brown <zab@versity.com>
2026-04-27 09:58:19 -07:00
Zach Brown
af31b9f1e8 Merge pull request #306 from versity/zab/v1.30
v1.30 Release
2026-04-22 10:43:17 -07:00
Zach Brown
ad65116d8f v1.30 Release
Finish the release notes for the 1.30 release.

Signed-off-by: Zach Brown <zab@versity.com>
2026-04-21 16:43:12 -07:00
Zach Brown
e20765a9c7 Merge pull request #300 from versity/auke/more_false_positive_failures
Auke/more false positive failures: xfs lockdep miss, newline
2026-04-17 09:17:50 -07:00
Zach Brown
066da5c2a2 Merge pull request #297 from versity/auke/quota_mod_trans_hold
Hold transaction in scoutfs_quota_mod_rule to prevent alloc corruption.
2026-04-17 09:16:41 -07:00
Auke Kok
7eacc7139c Hold transaction in scoutfs_quota_mod_rule to prevent alloc corruption.
scoutfs_quota_mod_rule calls scoutfs_item_create/delete, which use
the transaction allocator, but it never held the transaction. Without
the hold,
a concurrent transaction commit can call scoutfs_alloc_init to
reinitialize the allocator while dirty_alloc_blocks is in the middle
of setting up the freed list block. This overwrites alloc->freed with
the server's fresh (empty) state, causing a blkno mismatch BUG_ON
in list_block_add.

Reproduced by stressing concurrent quota add/del operations across
mounts. Crashdump analysis confirms dirty_list_block COW'd a freed
block (fr_old=9842, new blkno=9852) but by the time list_block_add
ran, freed.ref.blkno was 0 with first_nr=0 and total_nr=0: the freed
list head had been zeroed by a concurrent alloc_init.

Fix by adding scoutfs_hold_trans/scoutfs_release_trans around the
item modification in scoutfs_quota_mod_rule, preventing transaction
commit from racing with the allocator use.

Rename the 'unlock' label to 'release' since 'out' now directly
does the unlock. The unlock safely handles a NULL lock.
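
A minimal sketch of the shape of the fix, with hypothetical signatures
(the exact scoutfs_hold_trans/scoutfs_item_* arguments aren't shown on
this page):

    ret = scoutfs_hold_trans(sb);    /* hypothetical signature; pins the open transaction */
    if (ret == 0) {
            ret = scoutfs_item_create(...);   /* or _delete(); both use the trans allocator */
            scoutfs_release_trans(sb);        /* a commit can now reinit the allocator safely */
    }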

Signed-off-by: Auke Kok <auke.kok@versity.com>
2026-04-16 16:20:47 -07:00
Auke Kok
9e3b01b3b4 Filter newlines out of dmesg.new
Rather than filtering dmesg too broadly, filter out empty lines so
dmesg.new doesn't trigger a test failure. I don't want to overly
process dmesg, so do this as late as possible.

The xfs lockdep patterns can leave behind a leading/trailing empty
line, causing a failure despite the explicit removal of the lockdep
false positive.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2026-04-15 10:36:28 -07:00
Auke Kok
876c233f06 Ignore another xfs lockdep class
This already caught xfs_nondir_ilock_class, but recent CI runs
have been hitting xfs_dir_ilock_class, too.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2026-04-15 10:36:28 -07:00
Zach Brown
6aa5876c71 Merge pull request #301 from versity/auke/el7_uninit_read_seq
Squelch gcc uninitialized warning on el7
2026-04-15 09:58:23 -07:00
Auke Kok
7a9f9ec698 Squelch gcc uninitialized warning on el7
The gcc version in el7 can't determine that scoutfs_block_check_stale
won't return ret = 0 when the input ret value is < 0, and it errors
because we might call alloc_wpage with an uninitialized read_seq.
Initialize it to 0 to avoid the warning.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2026-04-14 15:09:20 -04:00
Zach Brown
fc0fc1427f Merge pull request #296 from versity/auke/indx_key_delete
Fix indx delete using wrong xid, leaving orphans. && Add basic-xattr-indx tests.
2026-04-13 14:34:37 -07:00
Zach Brown
ec68845201 Merge pull request #289 from versity/auke/merge_read_item_stale_seq
Update seq when merging deltas from partial log merge.
2026-04-13 14:10:37 -07:00
Auke Kok
5e2009f939 Avoid double counting deltas from non-input finalized log trees.
Readers currently accumulate all finalized log tree deltas into
a single bucket for deciding whether they are already in fs_root
or not, but finalized trees that aren't inputs to the current merge
will have higher seqs, and thus we may be double applying deltas
already merged into fs_root.

To distinguish them, scoutfs_totl_merge_contribute() needs to know the
merge status item seq.  We change wkic's get_roots() from using the
SCOUTFS_NET_CMD_GET_ROOTS RPC to reading the superblock directly.
This is needed because totl merge resolution has to use the same data
as the btree roots it is operating on; we can't grab it from a
SCOUTFS_NET_CMD_GET_ROOTS packet, which is likely different.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2026-04-10 13:50:21 -07:00
Auke Kok
8bdc20af21 Rename/reword FINALIZED to MERGE_INPUT.
These mislabeled members and enums were clearly not describing
the actual data being handled, and they obfuscated the intent of
avoiding mixing merge input items with non-merge input items.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2026-04-10 13:50:21 -07:00
Auke Kok
857a39579e Clear roots when retrying due to stale btree blocks.
Before deltas were added this code path was correct, but with
deltas we can't just retry this without clearing &root, since
it would potentially double count.

The condition where this could happen is when there are deltas in
several finalized log trees, and we've made progress towards reading
some of them, and then encounter a stale btree block. The retry
would not clear the collected trees, apply the same delta as was
already applied before the retry, and thus double count.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2026-04-10 13:50:21 -07:00
Auke Kok
38d36c9f5c Update seq when merging deltas from partial log merge.
Two different clients can write deltas for totl indexes at the same
time, recording their changes. When merged, a reader should apply both
in order, and only once. To do so, the seq determines whether the delta
has been applied already.

The code fails to update the seq while walking the trees for deltas to
apply. Subsequently, when processing later trees, it could re-process
deltas already applied. In the case of a large negative delta (e.g. the
removal of large numbers of files), the totl value could become
negative, resulting in quota lockout.

The fix is simple: advance the seq when reading partial delta merges
to avoid double counting.
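
The fix, visible in the merge_read_item() hunk of btree.c later on this
page, keeps the combined item's seq current as each delta is folded in:

    if (ret == SCOUTFS_DELTA_COMBINED) {
            scoutfs_inc_counter(sb, btree_merge_delta_combined);
            if (seq > found->seq)
                    found->seq = seq;
    }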

Signed-off-by: Auke Kok <auke.kok@versity.com>
2026-04-10 13:50:21 -07:00
Auke Kok
b724567b2a Add log_merge_force_partial trigger for testing partial merges.
Add a trigger that forces btree_merge() to return -ERANGE after
modifying a leaf's worth of items, causing many small partial merges
per merge cycle. This is used by tests to reliably reproduce races
that depend on partial merges splicing items into fs_root while
finalized logs still exist.

The trigger check lives inside btree_merge() where it can observe
actual item modification progress, rather than overriding the
caller's dirty byte limit argument which applies to the whole
writer context.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2026-04-10 12:25:30 -07:00
Auke Kok
add1da10dc Add test for stale seq in merge delta combining.
merge_read_item() fails to update found->seq when combining delta items
from multiple finalized log trees. Add a test case to replicate the
conditions of this issue.

Each of 5 mounts sets totl value 1 on 2500 shared keys, giving an
expected total of 5 per key.  Any total > 5 proves double-counting
from a stale seq.

The log_merge_force_partial trigger forces many partial merges per
cycle, creating the conditions where stale-seq items get spliced into
fs_root while finalized logs still exist.  Parallel readers on all
mounts race against this window to detect double-counted values.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2026-04-10 12:25:30 -07:00
Auke Kok
b9c49629a2 Add basic-xattr-indx tests.
We had no basic testing for `scoutfs read-xattr-index` whatsoever. This
adds your basic negative argument tests, lifecycle tests, the
deduplicated reads, and partial removal.

This exposes a bug in deletion where the indx entry isn't cleaned up
on inode delete.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2026-04-08 13:45:56 -07:00
Auke Kok
9737009437 Fix indx delete using wrong xid, leaving orphans.
During inode deletion, scoutfs_xattr_drop forgot to set the xid
of the xattr after calling parse_indx_key, which hardcodes xid=0;
setting it is the caller's responsibility. delete_force then deletes
the wrong key, and returns no error on nonexistent keys.

So now there is a pending deletion for a nonexistent indx and an
orphan indx entry in the tree. Subsequent calls to `scoutfs
read-xattr-index` will thus return entries for deleted inodes.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2026-04-08 11:48:47 -07:00
Zach Brown
3d54ae03e6 Merge pull request #295 from versity/auke/xfs_lockdep_ignore
Avoid xfs lockdep false positive dmesg errors.
2026-04-03 09:46:44 -07:00
Auke Kok
e27ec0add6 Avoid xfs lockdep false positive dmesg errors.
This xfs lockdep stack trace has at least 2 variants around
fs_reclaim, so try to capture it loosely here.

We can remove "lockdep disabled" from the $re grep -v, because it
can affect both this and the kasan one.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2026-04-01 14:25:48 -07:00
Zach Brown
5457741672 Merge pull request #292 from versity/zab/v1.29
v1.29 Release
2026-03-25 22:36:28 -07:00
Zach Brown
4bd7a38b05 v1.29 Release
Finish the release notes for the 1.29 release.

Signed-off-by: Zach Brown <zab@versity.com>
2026-03-25 16:33:31 -07:00
Zach Brown
087b2e85ab Merge pull request #291 from versity/auke/orphan-log-merge
Auke/orphan log merge
2026-03-25 16:26:24 -07:00
Auke Kok
8a730464ab Add orphan-log-trees test and reclaim_skip_finalize trigger
Add a reclaim_skip_finalize trigger that prevents reclaim from
setting FINALIZED on log_trees entries.  The test arms this trigger,
force-unmounts a client to create an orphan, and verifies the log
merge succeeds without timeout and the orphan reclaim message
appears in dmesg.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2026-03-25 10:39:40 -07:00
Auke Kok
daea8d5bc1 Reclaim orphaned log_trees entries from unmounted clients
An unfinalized log_trees entry whose rid is not in mounted_clients
is an orphan left behind by incomplete reclaim.  Previously this
permanently blocked log merges because the finalize loop treated it
as an active client that would never commit.

Call reclaim_open_log_tree for orphaned rids before starting a log
merge.  Once reclaimed, the existing merge and freeing paths include
them normally.

Also skip orphans in get_stable_trans_seq so their open transaction
doesn't artificially lower the stable sequence.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2026-03-25 06:47:22 -07:00
Zach Brown
1d60f684d2 Merge pull request #237 from versity/auke/hole_punch_ioctl_test
Punch Offline Ioctl, tests, scoutfs subcmd.
2026-03-19 13:51:44 -07:00
Zach Brown
a62708ac19 Merge pull request #286 from versity/auke/more-inode-deletion
Also use orphan scan wait code for remote unlink parts.
2026-03-16 14:33:20 -07:00
Auke Kok
16b1710541 Punch-offline tests.
Basic testing for the punch-offline ioctl code. The tests consist of a
bunch of negative testing to make sure things that are expressly not
allowed fail, followed by a bunch of known-expected-outcome tests that
punch holes in several patterns and verify them.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2026-03-13 15:45:52 -07:00
Auke Kok
440c3dc769 Add punch-offline scoutfs subcommand.
A minimal punch_offline ioctl wrapper. Argument style is adopted from
stage/release.

Following the option syntax of stage/release, this calls the punch
offline ioctl, punching any offline extent within the range designated
by offset and length.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2026-03-13 15:45:52 -07:00
Zach Brown
0fd172c5d9 Add punch_offline ioctl
Add an archive layer ioctl for converting offline extents into sparse
extents without relying on or modifying data_version.  This is helpful
when working with files with very large sparse regions.

Signed-off-by: Zach Brown <zab@versity.com>
Signed-off-by: Auke Kok <auke.kok@versity.com>
2026-03-13 15:44:18 -07:00
Zach Brown
48c1f221b3 Merge pull request #285 from versity/auke/s-i-i-grep-awk-fix
Use awk matching for ino.
2026-03-13 13:51:39 -07:00
Zach Brown
34713f3559 Merge pull request #290 from versity/auke/dirent_zero_pad
Auke/dirent zero pad
2026-03-06 10:41:04 -08:00
Auke Kok
137abc1fe2 Zero scoutfs_data_extent_val padding.
The initialization here fails to clear __pad[], which leaks
to disk. Use a struct initializer to zero it.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2026-03-05 16:20:06 -08:00
Auke Kok
64fcbdc15e Zero out dirent padding to avoid leaking to disk.
The allocation here currently leaks uninitialized memory through
__pad[7], which is written to disk. Use a struct initializer to enforce
zeroing the pad. The name member is written right after.
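
A minimal sketch of the pattern with a hypothetical struct (the real
fixes are in the data.c and dir.c hunks below): members not named in
the compound-literal assignment, including the declared __pad[] array,
are implicitly zero-initialized.

    struct example_val {
            __le64 blkno;
            __u8   flags;
            __u8   __pad[7];     /* reaches disk, so it must be zeroed */
    };

    /* every member not named here, __pad included, is set to zero */
    *val = (struct example_val) {
            .blkno = cpu_to_le64(map),
            .flags = flags,
    };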

Signed-off-by: Auke Kok <auke.kok@versity.com>
2026-03-05 16:20:06 -08:00
Zach Brown
d9c951ff48 Merge pull request #287 from versity/auke/misc_fixes
Unsorted misc. fixes for minor/cosmetic issues.
2026-03-02 10:12:26 -08:00
Auke Kok
eaae92d983 Don't send -EINVAL as u8, over the network.
The caller sends the return value of this inline as a u8. If we return
-EINVAL, it maps to 234, which is outside of our enum range. Assume
this was meant to return SCOUTFS_NET_ERR_EINVAL, which is a defined
constant.
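
The truncation is easy to demonstrate in userspace C (EINVAL is 22 on
Linux, so -EINVAL wraps to 234 as a u8):

    #include <stdio.h>
    #include <errno.h>

    int main(void)
    {
            unsigned char wire = (unsigned char)-EINVAL;

            /* prints 234: -22 truncated to u8, outside the SCOUTFS_NET_ERR_* range */
            printf("%u\n", wire);
            return 0;
    }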

Signed-off-by: Auke Kok <auke.kok@versity.com>
2026-02-26 14:02:42 -05:00
Auke Kok
43f3dd7259 Fix invalid address check logic.
These boolean checks are all mutually exclusive, so with the negation
the combined check will always succeed. Instead of && it
needs to use ||.
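
One plausible shape of the bug, with hypothetical predicate names (not
the actual driver code): since at most one of the checks can be true,
the negated conjunction is always true, while the negated disjunction
only passes valid addresses.

    /* bad_family() and bad_port() can never both be true */
    if (!(bad_family(addr) && bad_port(addr)))   /* buggy: always true */
            accept_addr(addr);

    if (!(bad_family(addr) || bad_port(addr)))   /* fixed: true only when neither fires */
            accept_addr(addr);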

Signed-off-by: Auke Kok <auke.kok@versity.com>
2026-02-26 14:02:42 -05:00
Auke Kok
7d96cf9b96 Remove copy/paste duplicate op flag check.
The exact two lines here are repeated. This suggests that an
additional check may have been intended, but from what I can see
there isn't anything left that needs checking here.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2026-02-26 14:02:41 -05:00
Auke Kok
03e22164db Return error on scoutfs_forest_setup().
This setup function always returned 0, even on error, causing
initialization to continue despite the error.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2026-02-26 14:02:41 -05:00
Zach Brown
e0948ec6de Merge pull request #281 from versity/auke/dotfull-file-seqres
Put `.full` file in $T_TMPDIR.
2026-02-26 09:15:22 -08:00
Auke Kok
d0c1c28438 Use awk matching for ino.
This test regularly fails here because the grep is greedy and can
match inodes ending in the same digits as the one we're looking for.
Make it use the same awk pattern used below.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2026-02-25 13:43:20 -05:00
Auke Kok
65808c2cb2 Also use orphan scan wait code for remote unlink parts.
The fix added in v1.26-17-gef0f6f8a does a good job of avoiding the
intermittent test failures for the part it was added to. The remote
unlink section could use it as well, as it suffers from the same
intermittent failures.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2026-02-24 14:12:03 -08:00
Zach Brown
73573d2c2b Merge pull request #283 from versity/auke/rever
Delete stray file from golden directory.
2026-02-20 10:12:21 -08:00
Auke Kok
f5db935afc Delete stray file from golden directory.
Signed-off-by: Auke Kok <auke.kok@versity.com>
2026-02-11 14:05:32 -05:00
Zach Brown
831faff7d2 Merge pull request #282 from versity/zab/v1.28
v1.28 Release
2026-02-06 09:28:52 -08:00
Zach Brown
8dad826f88 v1.28 Release
Finish the release notes for the 1.28 release.

Signed-off-by: Zach Brown <zab@versity.com>
2026-02-05 09:47:05 -08:00
Auke Kok
e2f3f2e060 Put .full file in $T_TMPDIR.
This file was put into $CWD by the test scripts for no good
reason. I suppose somewhere $seqres was supposed to be set before
these writes happened. Just write them to the test temp folder for
good measure for now.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2026-02-02 14:37:40 -08:00
Zach Brown
3a05c69643 Merge pull request #279 from versity/auke/basic-acl-consistency
Auke/basic acl consistency (test/reproduction)
2026-02-02 10:32:30 -08:00
Auke Kok
533f309aec Switch to .get_inode_acl() to avoid rcu corruption.
In el9.6, the kernel VFS no longer goes through xattr handlers to
retrieve ACLs, but instead calls the FS driver's .get_{inode_}acl
method.  In the initial compat version we hooked up to .get_acl given
the identical name that was used in the past.

However, this results in caching issues, as was encountered by customers
and exposed in the added test case `basic-acl-consistency`. The result
is that some group ACL entries may appear randomly missing. Dropping
caches may temporarily fix the issue.

The root cause of the issue is that the VFS now has 2 separate paths to
retrieve ACLs from the FS driver, and they have conflicting
implications for caching. `.get_acl` is purely meant for filesystems
like overlay/ecryptfs where no caching should ever go on, as they are
fully passthrough. Filesystems with dentries (i.e. all normal
filesystems) should not expose this interface, and should instead
expose the .get_inode_acl method. And indeed, in introducing the new
interface, the upstream kernel converted all but a few fs's to use
.get_inode_acl().

The functional change in the driver is to detach KC_GET_ACL_DENTRY and
introduce KC_GET_INODE_ACL to handle the new (and required) interface.
KC_SET_ACL_DENTRY is detached separately because it was a different
changeset in the kernel, and we should separate these for good measure
now.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2026-01-30 11:31:43 -08:00
Auke Kok
0ef22b3c44 Add basic ACL consistency test case.
This test case is used to detect and reproduce a customer issue we're
seeing where the new .get_acl() method API and underlying changes in
el9_6+ are causing ACL cache fetching to return inconsistent results,
which shows as missing ACLs on directories.

This particular sequence is consistent enough that it warrants making
it into a specific test.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2026-01-22 12:23:38 -08:00
Auke Kok
85ffba5329 Update existing tests to use scratch helpers.
Signed-off-by: Auke Kok <auke.kok@versity.com>
2026-01-20 12:35:43 -08:00
Auke Kok
553e6e909e Scratch mount test helpers.
Adds basic mkfs/mount/umount helpers that handle all the basics
for making, mounting and unmounting scratch devices. The mount/unmount
helpers create "$T_MSCR", which lives in "$T_TMPDIR".

Signed-off-by: Auke Kok <auke.kok@versity.com>
2026-01-20 12:35:09 -08:00
Zach Brown
9b569415f2 Merge pull request #276 from versity/zab/v1.27
v1.27 Release
2026-01-15 19:36:38 -08:00
Zach Brown
6a1e136085 v1.27 Release
Finish the release notes for the 1.27 release.

Signed-off-by: Zach Brown <zab@versity.com>
2026-01-15 14:21:53 -08:00
Zach Brown
7ca789c837 Merge pull request #278 from versity/zab/test_sync_before_crash
Have run-tests monitor sync before crashing
2026-01-15 14:03:26 -08:00
Zach Brown
4d55fe6251 Have run-tests monitor sync before crashing
There have been a few failures where output is generated just before we
crash but it didn't have a chance to be written.  Add a best-effort
background sync before crashing.  There's a good chance it'll hang if
the system is stuck, so we don't wait for it directly, just for some
time to pass.

Signed-off-by: Zach Brown <zab@versity.com>
2026-01-15 10:41:44 -08:00
Zach Brown
8f896d9783 Merge pull request #277 from versity/zab/avoid_lock_shrink_storm_hangs
Zab/avoid lock shrink storm hangs
2026-01-14 11:13:09 -08:00
Zach Brown
e54f8d3ec0 Don't shutdown server from sending to fencing client
Errors from lock server calls typically shut the server down.

During normal unmount a client's locks are reclaimed before the
connection is disconnected.  The lock server won't try to send to
unmounting clients.

Clients whose connections time out can cause ENOTCONN errors.  Their
connection is freed before they're fenced and their locks are reclaimed.
The server can try to send to the client for a lock that's disconnected
and get a send error.

These errors shouldn't shut down the server.  The client is either going
to be fenced and have the locks reclaimed, ensuring forward progress, or
the server is going to shutdown if it can't fence.

This was seen in testing as multiple clients were timed out.

Signed-off-by: Zach Brown <zab@versity.com>
2026-01-13 15:34:55 -08:00
Zach Brown
d89e16214d Simplify fence-and-reclaim fence execution check
The fence-and-reclaim test runs a bunch of scenarios and makes sure that
the fence agent was run on the appropriate mount's rids.

Unfortunately the checks were racy.  The check itself only looked at
the log once to see if the rid had been fenced.  Each check had steps
before that would wait until the rid should have been fenced and could
be checked.

Those steps were racy.  They'd do things like make sure a fence request
wasn't pending, but never waited for it to be created in the first
place.  They'd falsely indicate that the log should be checked, and when
the rid wasn't found in the log the test would fail.  In logs of
failures we'd see that the rids were fenced after this test failed and
moved on to the next.

This simplifies the checks.  It gets rid of all the intermediate steps
and just waits around for the rid to be fenced, with a timeout.  This
avoids the flaky tests.

Signed-off-by: Zach Brown <zab@versity.com>
2026-01-13 15:34:55 -08:00
Zach Brown
b468352254 Add t_wait_until_timeout
Add a test helper for waiting for a command to return success which will
fail the test after a timeout.

Signed-off-by: Zach Brown <zab@versity.com>
2026-01-13 15:34:55 -08:00
Zach Brown
0eb9dfebdc Allow forced unmount errors in lock invalidation
Lock invalidation has assertions for critical errors, but it doesn't
allow the synthetic errors that come from forced unmount severing the
client's connection to the world.

Signed-off-by: Zach Brown <zab@versity.com>
2026-01-13 15:34:55 -08:00
Zach Brown
f5750de244 Search messages in rbtree instead of lists
The net layer was initially built around send queue lists with the
presumption that there wouldn't be many messages in flight and that
responses would be sent roughly in order.

In the modern era, we can have 10s of thousands of lock request messages
in flight.  This led to O(n^2) processing in quite a few places as recv
processing searched for either requests to complete or responses to
free.

This adds messages to two rbtrees, indexing either requests by their id
or responses by their send sequence.  Recv processing can find messages
in O(log n).  This patch intends to be minimally disruptive.  It's only
replacing the search of the send and resend queues in the recv path with
rbtrees.  Other uses of the two queue lists are untouched.

On a single node, with ~40k lock shrink attempts in flight, we go from
processing ~800 total request/grant request/response pairs per second to
~60,000 per second.
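
A minimal sketch of the id-indexed lookup (hypothetical struct and
field names; the patch indexes the real message structs):

    struct msg {
            struct rb_node node;
            u64 id;
    };

    static struct msg *msg_find(struct rb_root *root, u64 id)
    {
            struct rb_node *n = root->rb_node;

            while (n) {
                    struct msg *m = rb_entry(n, struct msg, node);

                    if (id < m->id)
                            n = n->rb_left;
                    else if (id > m->id)
                            n = n->rb_right;
                    else
                            return m;   /* O(log n) instead of a list walk */
            }
            return NULL;
    }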

Signed-off-by: Zach Brown <zab@versity.com>
2026-01-13 15:32:55 -08:00
Zach Brown
f0c7996612 Limit client locks with option instead of shrinker
The use of the VM shrinker was a bad fit for locks.  Shrinking a lock
requires a round trip with the server to request a null mode.  The VM
treats the locks like a cache, as expected, which leads to huge numbers
of locks accumulating and then being shrunk in bulk.  This creates a
huge backlog of locks making their way through the network conversation
with the server that implements invalidating to a null mode and freeing.
It starves other network and lock processing, possibly for minutes.

This removes the VM shrinker and instead introduces an option that sets
a limit on the number of idle locks.  Once the number of locks exceeds
the limit we try to free the oldest lock at each lock call.  This
results in a lock freeing pace that is proportional to the allocation of
new locks by callers and so is throttled by the work done while callers
hold locks.  It avoids the bulk shrinking of 10s of thousands of locks
that we see in the field.

Signed-off-by: Zach Brown <zab@versity.com>
2026-01-08 10:58:50 -08:00
Zach Brown
5143927e07 Merge pull request #275 from versity/auke/qht_slow_umount_pr
Unmounts can be slow and break quorum-heartbeat-timeout
2026-01-08 09:35:23 -08:00
Auke Kok
f495f52ec9 Unmounts can be slow and break quorum-heartbeat-timeout
We observe that unmount in this test can consume up to 10sec of time
before followers proceed to record heartbeat timeout elections.

When this happens, elections and new leaders happen before unmount even
completes. This indicates that heartbeat packets from the unmount cease
immediately, but the unmount is taking longer doing other things.
The timeouts then trigger, possibly during the unmount.

The result is that with timeouts of 3 seconds, we're not actually
waiting for an election at all. It already happened 7 seconds ago. The
code here just "sees" it happen a few hundred ms after it started
looking for it.

There are a few ways to approach this fix. We could record the actual
timestamp of the election, and compare it with the actual timestamp of
the last heartbeat packet. This would be conclusive, and could
disregard any complication from umount taking too long. But it also
means adding timestamping in various places, or having to rely on
tcpdump with packet processing.

We can't just record $start before unmount. We would still violate the
part of the test that checks that elections didn't happen too late,
especially in the 3sec test case if unmount takes 10sec.

The simplest solution is to unmount in a bg thread, and circle around
later to `wait` for it to ensure we can re-mount without ill effect.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2026-01-08 09:05:40 -08:00
Zach Brown
3dafeaac5b Merge pull request #273 from versity/clk/inode_deletion
Clk/inode deletion
2026-01-07 12:20:12 -08:00
Chris Kirby
ef0f6f8ac2 Fix race in inode-deletion test
Due to an iput race, the "unlink wait for open on other mount"
subtest can fail. If the unlink happens inline, then the test
passes. But if the orphan scanner has to complete the unlink
work, it's possible that there won't be enough log merge work
for the scanner to do the cleanup before we look at the seq index.

Add SCOUTFS_TRIGGER_LOG_MERGE_FORCE_FINALIZE_OURS, to allow
forcing a log merge. Add new counters, log_merge_start and
log_merge_complete, so that tests can see that a merge has happened.

Then we have to wait for the orphan scanner to do its work.
Add a new counter, orphan_scan_empty, that increments each time
the scanner walks the entire inode space without finding any
orphans. Once the test sees that counter increment, it should be
safe to check the seq index and see that the unlinked inode is gone.

Signed-off-by: Chris Kirby <ckirby@versity.com>
2026-01-07 08:29:38 -06:00
Chris Kirby
c0cd29aa1b Fix run-test.sh buffer multiplier breakage
The /sys/kernel/debug/tracing/buffer_size_kb file always reads as
"7 (expanded: 1408)". So the -T option to run-test.sh won't work,
because it tries to multiply that string by the given factor.

It always defaults to 1408 on every platform we currently support.
Just use that value so we can specify -T in CI runs.

Signed-off-by: Chris Kirby <ckirby@versity.com>
2025-12-18 15:05:48 -06:00
Zach Brown
50bff13f21 Merge pull request #266 from versity/zab/increase_move_empty_budget
Increase server commit block budget for alloc move
2025-12-18 12:44:20 -08:00
Zach Brown
de70ca2372 Increase server commit block budget for alloc move
A few callers of alloc_move_empty in the server were providing a budget
that was too small.  Recent changes to extent_mod_blocks increased the
max budget that is necessary to move extents between btrees.  The
existing WAG of 100 was too small for trees of height 2 and 3.  This
caused looping in production.

We can increase the move budget to half the overall commit budget, which
leaves room for a height of around 7 each.  This is much greater than we
see in practice because the size of the per-mount btrees is effectiely
limited by both watermarks and thresholds to commit and drain.

Signed-off-by: Zach Brown <zab@versity.com>
2025-12-17 14:22:04 -06:00
Zach Brown
5af1412d5f Merge pull request #270 from versity/auke/bdev_autoloading
Avoid block device autoloading warning.
2025-12-17 11:06:32 -08:00
Zach Brown
0a2b2ad409 Merge pull request #269 from versity/auke/tap_status_msg
Include t_fail status in tap output.
2025-12-17 11:04:00 -08:00
Auke Kok
6c4590a8a0 Avoid block device autoloading warning.
It's possible to trigger the block device autoloading mechanism
with a mknod()/stat(). This mechanism has long been declared
obsolete, and thus triggers a dmesg warning since el9_7, which then
fails the test. You may need to `rmmod loop` to reproduce.

Avoid this by not triggering a loop autoload - we just make a
different blockdev. Choosing major `42` here should avoid any autoload
mechanism, as this number is explicitly for demo drivers and should
never trigger an autoload.
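
In C terms the trigger and the workaround look roughly like this (the
loop driver is block major 7; major 42 is listed for demo/sample use in
the kernel's devices.txt):

    #include <sys/stat.h>
    #include <sys/sysmacros.h>

    /* statting a node with the loop major can autoload the loop module */
    mknod("testnode", S_IFBLK | 0600, makedev(7, 0));

    /* a demo major is never bound to a real driver, so nothing autoloads */
    mknod("testnode", S_IFBLK | 0600, makedev(42, 0));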

We also just ignore the warning line in dmesg. Other tests might
still trigger this, as can background noise running during the test.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2025-12-08 13:04:58 -08:00
Zach Brown
1768f69c3c Merge pull request #224 from versity/auke/renameat2-test-sub-dir
Use T_D0/1 instead of T_M0 here.
2025-12-08 10:05:46 -08:00
Zach Brown
dcb0fd5805 Merge pull request #268 from versity/auke/dont_use_bash_special_stdfiles
Avoid using bash special device nodes.
2025-12-08 09:47:19 -08:00
Auke Kok
660f874488 Use T_D0/1 instead of T_M0 here.
Use of T_M0 and variants should be reserved for e.g. scoutfs
<subcommand> -p <mountpoint> type of usages. Tests should create
individual content files in the assigned subdirectory.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2025-12-04 14:34:02 -05:00
Auke Kok
e1a6689a9b Include t_fail status in tap output.
The tap output file was not yet complete, as it failed to include
the contents of `status.msg`. In a few cases, that meant it
lacked important context.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2025-12-04 14:09:39 -05:00
Auke Kok
2884a92408 Avoid using bash special device nodes.
Bash has special handling for these standard IO files, but
there are cases where customers have special restrictions set
on them, likely to avoid leaking error data out of system logs
as part of IDS software.

In any case, we can just reopen existing file descriptors here
in both these cases to avoid this entirely. This will always
work.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2025-12-04 13:24:48 -05:00
Zach Brown
e194714004 Merge pull request #264 from versity/auke/findmnt_retval
Findmnt returns 1 when no matching entries found
2025-12-03 14:29:31 -08:00
Auke Kok
8bb2f83cf9 Findmnt returns 1 when no matching entries found
Our local fence script attempts to interpret errors executing `findmnt`
as critical errors, but the program explicitly exits with EXIT_FAILURE
when the total number of matching mount entries is zero.

This can happen if the mount disappeared while we're attempting to
fence the mount but the scoutfs sysfs files are still in place as
we read them. It's a small window, but it's a fork/exec plus a full
parse of /etc/fstab, and a lot can happen in the 0.015s findmnt takes
on my system.

There are no exit codes from findmnt other than 0 and 1. At that
point, we can only assume that if stdout is empty, the mount
isn't there anymore.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2025-12-02 12:55:11 -08:00
Zach Brown
6a9a6789d5 Merge pull request #267 from versity/clk/merge_enoent
Handle ENOENT when getting log merge status item
2025-12-02 09:34:28 -08:00
Chris Kirby
ee630b164f Handle ENOENT when getting log merge status item
Tests that cause client retries can fail with this error
from server_commit_log_merge():

error -2 committing log merge: getting merge status item

This can happen if the server has already committed and resolved
the log merge that is being retried. We can safely ignore ENOENT here
just like we do a few lines later.
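
A minimal sketch of the handling, with a hypothetical helper name for
the status item lookup:

    ret = get_log_merge_status_item(sb, &stat);   /* hypothetical helper */
    if (ret == -ENOENT)
            ret = 0;   /* a prior server already committed and resolved this merge */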

Signed-off-by: Chris Kirby <ckirby@versity.com>
2025-12-01 08:58:24 -06:00
Zach Brown
1c7678b6f5 Merge pull request #263 from versity/zab/v1.26
v1.26 Release
2025-11-18 09:39:27 -08:00
Zach Brown
22b5e79bbd v1.26 Release
Finish the release notes for the 1.26 release.

Signed-off-by: Zach Brown <zab@versity.com>
2025-11-17 14:42:14 -08:00
Zach Brown
259e639271 Merge pull request #262 from versity/zab/ino_alloc_per_lock
Add ino_alloc_per_lock option
2025-11-14 13:57:49 -08:00
Zach Brown
4d66c38c71 Remove redundant WARN in commit_log_trees
The server's commit_log_trees has an error message that includes the
source of the error, but it's not used for all errors.  The WARN_ON is
redundant with the message and is removed because it isn't filtered out
when we see errors from forced unmount.

Signed-off-by: Zach Brown <zab@versity.com>
2025-11-14 10:04:30 -08:00
Zach Brown
7ef62894bd Add ino_alloc_per_lock option
Add an option that can limit the number of inode numbers that are
allocated per lock group.

Signed-off-by: Zach Brown <zab@versity.com>
2025-11-13 17:19:04 -08:00
Zach Brown
1f363a1ead Merge pull request #261 from versity/zab/log_merge_double_free
Zab/log merge double free
2025-11-13 17:18:30 -08:00
Zach Brown
8ddf9b8c8c Handle disappearing fencing requests and targets
The userspace fencing process wasn't careful about handling underlying
directories that disappeared while it was working.

On the server/fenced side, fencing requests can linger after they've
been resolved by writing 1 to fenced or error.  The script could come
back around to see the directory before the server finally removes it,
causing all later uses of the request dir to fail.  We saw this in the
logs as a bunch of cat errors for the various request files.

On the local fence script side, all the mounts can be in the process of
being unmounted, so both the /sys/fs dirs and the mount itself can be
removed while we're working.

For both, when we're working with the /sys/fs files we read them without
logging errors and then test that the dir still exists before using what
we read.  When fencing a mount, we stop if findmnt doesn't find the
mount and then raise a umount error if the /sys/fs dir exists after
umount fails.

And while we're at it, we have each script's logging append instead of
truncating (if, say, it's a log file instead of an interactive tty).

Signed-off-by: Zach Brown <zab@versity.com>
2025-11-13 12:43:31 -08:00
Zach Brown
fd80c17ab6 Filter out kernel message when guests are slow
Ignore more kernel messages when debug guests are being slow.

Signed-off-by: Zach Brown <zab@versity.com>
2025-11-13 12:43:31 -08:00
Zach Brown
991e2cbdf8 Ignore slow quorum hb transfers in tests
We're getting test failures from messages that our guests can be
unresponsive.  They sure can be.  We don't need to fail for this one
specific case.

Signed-off-by: Zach Brown <zab@versity.com>
2025-11-13 12:43:31 -08:00
Zach Brown
92ac132873 Silence merge splice error when forcing
Silence another error warning and assertion that assume that the
result of the errors is going to be persistent.  When we're forcing an
unmount we've severed storage and networking.

Signed-off-by: Zach Brown <zab@versity.com>
2025-11-13 12:43:31 -08:00
Auke Kok
ad078cd93c Avoid lock stalling mmap_stress
mmap_stress gets completely stalled in lock messaging, starving
most of the mmap_stress threads, which causes it to be delayed and even
time out in CI.

Instead of spawning threads over all 5 test nodes, we artificially
reduce it to spawning over only 2. This still does a good number
of operations on those nodes, and now the work is spread across the
two nodes evenly.

Additionally, I've added a minuscule (10ms) delay between operations
that should hopefully be sufficient for other locking attempts to
settle and allow the threads to better spread the work.

This now shows that all the threads exit within < 0.25s on my test
machine, which is a lot better than the 40s variation that I was seeing
locally. Hopefully this fares better in CI.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2025-11-13 12:43:31 -08:00
Auke Kok
90cb458cd5 Make mmap_stress not exceed a fixed amount of time.
There's a scenario where mmap_stress gets enough resources that
two of the threads will starve the others, which then all take
a very long time catching up committing changes.

Because this test program didn't finish until all the threads had
completed a fixed amount of work, these threads essentially all
ended up tripping over each other. In CI this could exceed 6h+,
while originally I intended this to run in about 100s or so.

Instead, cap the run time to ~30s by default. If threads exceed
this time, they will immediately exit, which causes any clog in
contention between the threads to drain relatively quickly.
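
A minimal sketch of the cap, assuming a per-thread operation loop
(illustrative; the real mmap_stress loop differs):

    time_t start = time(NULL);

    while (ops_done < ops_target) {
            if (time(NULL) - start >= 30)
                    break;        /* cap wall time; stalled threads drain quickly */
            do_one_op();          /* hypothetical single mmap/commit operation */
            ops_done++;
    }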

Signed-off-by: Auke Kok <auke.kok@versity.com>
2025-11-13 12:43:31 -08:00
Zach Brown
1ab798e7eb Silence inconsistent srch on forced unmount
Assembling a srch compaction operation creates an item and populates it
with allocator state.  If allocator filling fails, it doesn't cleanly
unwind the allocation and undo the compaction item change, and it
issues a warning.

This warning isn't needed if the error shows that we're in forced
unmount.  The inconsistent state won't be applied, it will be dropped on
the floor as the mount is torn down.

Signed-off-by: Zach Brown <zab@versity.com>
2025-11-13 12:43:31 -08:00
Zach Brown
e182914e51 Fix double free of metadata blocks in log merging
The log merging process is meant to provide parallelism across workers
in mounts.  The idea is that the server hands out a bunch of concurrent
non-intersecting work that's based on the structure of the stable input
fs_root btree.

The nature of the parallel work (cow of the blocks that intersect a key
range) means that the ranges of concurrently issued work can't overlap
or the work will all cow the same input blocks, freeing that input
stable block multiple times.  We're seeing this in testing.

Correctness was intended by having an advancing key that sweeps sorted
ranges.  Duplicate ranges would never be hit as the key advanced past
each one it visited.  This was broken by the mapping of the fs item
keys to log merge tree keys by clobbering the sk_zone key value.  It
effectively interleaves the ranges of each zone in the fs root (meta
indexes, orphans, fs items).  With just the right log merge conditions,
involving logged items in the right places and partially completed work
inserting remaining ranges behind the key, ranges can be stored at
mapped keys that end up out of order.  The server iterates over these
and ends up issuing overlapping work, which results in duplicated
frees of the input blocks.

The fix, without changing the format of the stored log tree items, is to
perform a full sweep of all the range items and determine the next item
by looking at the full precision stored keys.  This ensures that the
processed ranges always advance and never overlap.

Signed-off-by: Zach Brown <zab@versity.com>
2025-11-13 12:43:31 -08:00
Zach Brown
8484a58dd6 Have xfstest pass when using args
The xfstests test's golden output includes the full set of tests we
expect to run when no args are specified.  If we specify args then the
set of tests can change and the test will always fail when they do.
This fixes that by having the test check the set of tests itself, rather
than relying on golden output.  If args are specified then our xfstest
only fails if any of the executed xfstest tests failed.  Without args,
we perform the same scraping of the check output and compare it against
the expected results ourself.

It would have been a bit much to put that large file inline in the test
file, so we add a dir of per-test files in revision control.  We can
also put the list of exclusions there.

We can also clean up the output redirection helper functions to make
them more clear.  After xfstests has executed we want to redirect output
back to the compared output so that we can catch any unexpected output.

Signed-off-by: Zach Brown <zab@versity.com>
2025-11-13 12:43:31 -08:00
Zach Brown
a077104531 Add crash monitor to run-tests
Add a little background function that runs during the test which
triggers a crash if it finds catastrophic failure conditions.

This is the second bg task we want to kill and we can only have one
function run on the EXIT trap, so we create a generic process killing
trap function.

We feed it the fenced pid as well.  run-tests didn't log much of value
into the fenced log, and we're not logging the kills into it anymore, so
we just remove run-tests fenced logging.

Signed-off-by: Zach Brown <zab@versity.com>
2025-11-13 12:43:31 -08:00
76 changed files with 4800 additions and 674 deletions

View File

@@ -1,6 +1,118 @@
Versity ScoutFS Release Notes
=============================
---
v1.30
\
*Apr 21, 2026*
Fix a problem reading the accumulated totals of contributing .totl.
xattrs when log merging is in progress. The problem would have readers
of the totals calculate the sums incorrectly.
Fix a problem updating quota rules. There was a race where updates
could be corrupted if they happened while a transaction was being
written.
Fix a problem deleting files with .indx. xattrs. The internal indexing
metadata wouldn't be properly deleted, so the files would still appear
to be present and visible in the index, though the file no longer existed.
---
v1.29
\
*Mar 25, 2026*
Add a repair mechanism for mount logs that weren't properly resolved as
mounts left the cluster. The presence of these logs prevents log
merging from making forward progress and the backlog of logs over time
can cause operations to slow to a crawl. With the repair mechanism in
place the orphaned logs don't stop merging and operations proceed as
usual.
Add an ioctl for turning offline unmapped file regions into sparse
regions.
---
v1.28
\
*Feb 5, 2026*
Fix a bug that led to incorrect negative caching of ACL entries
starting in version 9.6 of distribution kernels in the enterprise linux
family. This would manifest as ACLs seemingly disappearing,
particularly default ACLs on directories. The persistent ACLs always
existed but because of internal API incompatibility some readers
couldn't see them and would cache that they didn't exist.
---
v1.27
\
*Jan 15, 2026*
Switch away from using the general VM cache reclaim machinery to reduce
idle cluster locks in the client. The VM treated locks like a cache and
let many accumulate, presuming that it would be efficient to free them
in batches. Lock freeing requires network communication so this could
result in enormous backlogs in network messages (on the order of
hundreds of thousands) and could result in significant delays of other
network messaging.
Fix inefficient network receive processing while many messages are in
the send queue. This consumed sufficient CPU to cause significant
stalls, perhaps resulting in hung task warning messages due to delayed
lock message delivery.
Fix a server livelock case that could happen while committing client
transactions that contain a large amount of freed file data extents.
This would present as client tasks hanging and a server task spinning
consuming cpu.
Fix a rare server request processing failure that doesn't deal with
retransmission of a request that a previous server partially processed.
This would present as hung client tasks and repeated "error -2
committing log merge: getting merge status item" kernel messages.
Fix an unnecessary server shutdown during specific circumstances in
client lock recovery. The shutdown was due to server state and was
ultimately harmless. The next server that started up would proceed
accordingly.
---
v1.26
\
*Nov 17, 2025*
Add the ino\_alloc\_per\_lock mount option. This changes the number of
inode numbers allocated under each cluster lock and can alleviate lock
contention for some patterns of larger file creation.
Add the tcp\_keepalive\_timeout\_ms mount option. This can enable the
system to survive longer periods of networking outages.
Fix a rare double free of internal btree metadata blocks when merging
log trees. The duplicated freed metadata block numbers would cause
persistent errors in the server, preventing the server from starting and
hanging the system.
Fix the data\_wait interface to not require the correct data\_version of
the inode when raising an error. This lets callers raise errors when
they're unable to recall the details of the inode to discover its
data\_version.
Change scoutfs to more aggressively reclaim cached memory when under
memory pressure. This makes scoutfs behave more like other kernel
components and it integrates better with the reclaim policy heuristics
in the VM core of the kernel.
Change scoutfs to more efficiently transmit and receive socket messages.
Under heavy load this processes messages quickly enough to
avoid hung task messages for tasks that were waiting for cluster lock
messages to be processed.
Fix faulty server block commit budget calculations that were generating
spurious "holders exceeded alloc budget" console messages.
---
v1.25
\

View File

@@ -13,6 +13,7 @@ scoutfs-y += \
avl.o \
alloc.o \
block.o \
bsearch_index.o \
btree.o \
client.o \
counters.o \
@@ -36,6 +37,7 @@ scoutfs-y += \
per_task.o \
quorum.o \
quota.o \
raw.o \
recov.o \
scoutfs_trace.o \
server.o \

View File

@@ -479,10 +479,20 @@ ifneq (,$(shell grep '^unsigned int stack_trace_save' include/linux/stacktrace.h
ccflags-y += -DKC_STACK_TRACE_SAVE
endif
# v6.1-rc1-4-g7420332a6ff4
#
# .get_acl() method now has dentry arg (and mnt_idmap). The old get_acl has been renamed
# to get_inode_acl() and is still available as well, but has an extra rcu param.
ifneq (,$(shell grep 'struct posix_acl ...get_acl..struct mnt_idmap ., struct dentry' include/linux/fs.h))
ccflags-y += -DKC_GET_ACL_DENTRY
# v6.1-rc1-2-g138060ba92b3
#
# set_acl now passed a struct dentry instead of inode.
#
ifneq (,$(shell grep 'int ..set_acl.*struct dentry' include/linux/fs.h))
ccflags-y += -DKC_SET_ACL_DENTRY
endif
#
# v6.1-rc1-3-gcac2f8b8d8b5
#
# get_acl renamed to get_inode_acl.
#
ifneq (,$(shell grep 'struct posix_acl.*get_inode_acl' include/linux/fs.h))
ccflags-y += -DKC_GET_INODE_ACL
endif

View File

@@ -107,20 +107,22 @@ struct posix_acl *scoutfs_get_acl_locked(struct inode *inode, int type, struct s
return acl;
}
#ifdef KC_GET_ACL_DENTRY
struct posix_acl *scoutfs_get_acl(KC_VFS_NS_DEF
struct dentry *dentry, int type)
{
struct inode *inode = dentry->d_inode;
#ifdef KC_GET_INODE_ACL
struct posix_acl *scoutfs_get_acl(struct inode *inode, int type, bool rcu)
#else
struct posix_acl *scoutfs_get_acl(struct inode *inode, int type)
{
#endif
{
struct super_block *sb = inode->i_sb;
struct scoutfs_lock *lock = NULL;
struct posix_acl *acl;
int ret;
#ifdef KC_GET_INODE_ACL
if (rcu)
return ERR_PTR(-ECHILD);
#endif
#ifndef KC___POSIX_ACL_CREATE
if (!IS_POSIXACL(inode))
return NULL;
@@ -208,7 +210,7 @@ out:
return ret;
}
#ifdef KC_GET_ACL_DENTRY
#ifdef KC_SET_ACL_DENTRY
int scoutfs_set_acl(KC_VFS_NS_DEF
struct dentry *dentry, struct posix_acl *acl, int type)
{
@@ -254,9 +256,8 @@ int scoutfs_acl_get_xattr(struct dentry *dentry, const char *name, void *value,
if (!IS_POSIXACL(dentry->d_inode))
return -EOPNOTSUPP;
#ifdef KC_GET_ACL_DENTRY
acl = scoutfs_get_acl(KC_VFS_INIT_NS
dentry, type);
#ifdef KC_GET_INODE_ACL
acl = scoutfs_get_acl(dentry->d_inode, type, false);
#else
acl = scoutfs_get_acl(dentry->d_inode, type);
#endif
@@ -305,7 +306,7 @@ int scoutfs_acl_set_xattr(struct dentry *dentry, const char *name, const void *v
}
}
#ifdef KC_GET_ACL_DENTRY
#ifdef KC_SET_ACL_DENTRY
ret = scoutfs_set_acl(KC_VFS_INIT_NS dentry, acl, type);
#else
ret = scoutfs_set_acl(dentry->d_inode, acl, type);

View File

@@ -1,12 +1,16 @@
#ifndef _SCOUTFS_ACL_H_
#define _SCOUTFS_ACL_H_
#ifdef KC_GET_ACL_DENTRY
struct posix_acl *scoutfs_get_acl(KC_VFS_NS_DEF struct dentry *dentry, int type);
int scoutfs_set_acl(KC_VFS_NS_DEF struct dentry *dentry, struct posix_acl *acl, int type);
#ifdef KC_SET_ACL_DENTRY
int scoutfs_set_acl(KC_VFS_NS_DEF
struct dentry *dentry, struct posix_acl *acl, int type);
#else
int scoutfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
#endif
#ifdef KC_GET_INODE_ACL
struct posix_acl *scoutfs_get_acl(struct inode *inode, int type, bool rcu);
#else
struct posix_acl *scoutfs_get_acl(struct inode *inode, int type);
int scoutfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
#endif
struct posix_acl *scoutfs_get_acl_locked(struct inode *inode, int type, struct scoutfs_lock *lock);
int scoutfs_set_acl_locked(struct inode *inode, struct posix_acl *acl, int type,

View File

@@ -1,6 +1,8 @@
#ifndef _SCOUTFS_BLOCK_H_
#define _SCOUTFS_BLOCK_H_
struct scoutfs_alloc;
struct scoutfs_block_writer {
spinlock_t lock;
struct list_head dirty_list;

59
kmod/src/bsearch_index.c Normal file
View File

@@ -0,0 +1,59 @@
/*
* Copyright (C) 2026 Versity Software, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <linux/kernel.h>
#include <linux/bsearch.h>
#include "bsearch_index.h"
struct bsearch_index_key {
int (*cmp)(const void *key, const void *elt);
/* the key has to be const, so we have to update the index through a pointer */
void **index_elt;
const void *key;
size_t size;
};
static int cmp_index(const void *key, const void *elt)
{
const struct bsearch_index_key *bik = key;
int cmp = bik->cmp(bik->key, elt);
if (cmp > 0)
*(bik->index_elt) = (void *)elt + bik->size;
else
*(bik->index_elt) = (void *)elt;
return cmp;
}
/*
* A bsearch() wrapper that returns the index of the element of the
* array that the key would be stored in to maintain sort order. It's
* the first element where the existing element is greater than the key.
* It returns the size of the array if the key is greater than the last
* element in the array.
*/
size_t bsearch_index(const void *key, const void *base, size_t num, size_t size,
int (*cmp)(const void *key, const void *elt))
{
void *index_elt = (void *)base;
struct bsearch_index_key bik = {
.cmp = cmp,
.index_elt = &index_elt,
.key = key,
.size = size,
};
bsearch(&bik, base, num, size, cmp_index);
return ((unsigned long)index_elt - (unsigned long)base) / size;
}

7
kmod/src/bsearch_index.h Normal file
View File

@@ -0,0 +1,7 @@
#ifndef _SCOUTFS_BSEARCH_INDEX_H_
#define _SCOUTFS_BSEARCH_INDEX_H_
size_t bsearch_index(const void *key, const void *base, size_t num, size_t size,
int (*cmp)(const void *key, const void *elt));
#endif

View File

@@ -1816,6 +1816,11 @@ int scoutfs_btree_dirty(struct super_block *sb,
* Call the users callback on all the items in the leaf that we find.
* We also set the caller's keys for the first and last possible keys
* that could exist in the leaf block.
*
* The callback can set a new key to continue reading from rather than
* iterating over all the items. It modifies the key and returns
* -ESRCH, which performs a new avl search. If the modified key falls
* outside of the range of keys in the block then we return.
*/
int scoutfs_btree_read_items(struct super_block *sb,
struct scoutfs_btree_root *root,
@@ -1829,6 +1834,7 @@ int scoutfs_btree_read_items(struct super_block *sb,
struct scoutfs_avl_node *next_node;
struct scoutfs_avl_node *node;
struct btree_walk_key_range kr;
struct scoutfs_key cb_key;
struct scoutfs_block *bl;
int ret;
@@ -1842,22 +1848,32 @@ int scoutfs_btree_read_items(struct super_block *sb,
if (scoutfs_key_compare(&kr.end, end) < 0)
*end = kr.end;
node = scoutfs_avl_search(&bt->item_root, cmp_key_item, start, NULL,
cb_key = *start;
search:
node = scoutfs_avl_search(&bt->item_root, cmp_key_item, &cb_key, NULL,
NULL, &next_node, NULL) ?: next_node;
while (node) {
item = node_item(node);
if (scoutfs_key_compare(&item->key, end) > 0)
break;
ret = cb(sb, item_key(item), le64_to_cpu(item->seq), item->flags,
cb_key = *item_key(item);
ret = cb(sb, &cb_key, le64_to_cpu(item->seq), item->flags,
item_val(bt, item), item_val_len(item), arg);
if (ret < 0)
break;
if (ret < 0) {
if (ret == -ESRCH) {
if (scoutfs_key_compare(&cb_key, start) >= 0)
goto search;
ret = 0;
}
goto out;
}
node = scoutfs_avl_next(&bt->item_root, node);
}
scoutfs_block_put(sb, bl);
ret = 0;
out:
return ret;
}
@@ -2183,6 +2199,8 @@ static int merge_read_item(struct super_block *sb, struct scoutfs_key *key, u64
if (ret > 0) {
if (ret == SCOUTFS_DELTA_COMBINED) {
scoutfs_inc_counter(sb, btree_merge_delta_combined);
if (seq > found->seq)
found->seq = seq;
} else if (ret == SCOUTFS_DELTA_COMBINED_NULL) {
scoutfs_inc_counter(sb, btree_merge_delta_null);
free_mitem(rng, found);
@@ -2486,6 +2504,14 @@ int scoutfs_btree_merge(struct super_block *sb,
mitem = next_mitem(mitem);
free_mitem(&rng, tmp);
}
if (mitem && walk_val_len == 0 &&
!(walk_flags & (BTW_INSERT | BTW_DELETE)) &&
scoutfs_trigger(sb, LOG_MERGE_FORCE_PARTIAL)) {
ret = -ERANGE;
*next_ret = mitem->key;
goto out;
}
}
ret = 0;

View File

@@ -125,7 +125,6 @@
EXPAND_COUNTER(item_update) \
EXPAND_COUNTER(item_write_dirty) \
EXPAND_COUNTER(lock_alloc) \
EXPAND_COUNTER(lock_count_objects) \
EXPAND_COUNTER(lock_free) \
EXPAND_COUNTER(lock_grant_request) \
EXPAND_COUNTER(lock_grant_response) \
@@ -139,13 +138,13 @@
EXPAND_COUNTER(lock_lock_error) \
EXPAND_COUNTER(lock_nonblock_eagain) \
EXPAND_COUNTER(lock_recover_request) \
EXPAND_COUNTER(lock_scan_objects) \
EXPAND_COUNTER(lock_shrink_attempted) \
EXPAND_COUNTER(lock_shrink_aborted) \
EXPAND_COUNTER(lock_shrink_work) \
EXPAND_COUNTER(lock_shrink_request_failed) \
EXPAND_COUNTER(lock_unlock) \
EXPAND_COUNTER(lock_wait) \
EXPAND_COUNTER(log_merge_complete) \
EXPAND_COUNTER(log_merge_no_finalized) \
EXPAND_COUNTER(log_merge_start) \
EXPAND_COUNTER(log_merge_wait_timeout) \
EXPAND_COUNTER(net_dropped_response) \
EXPAND_COUNTER(net_send_bytes) \
@@ -160,6 +159,7 @@
EXPAND_COUNTER(orphan_scan) \
EXPAND_COUNTER(orphan_scan_attempts) \
EXPAND_COUNTER(orphan_scan_cached) \
EXPAND_COUNTER(orphan_scan_empty) \
EXPAND_COUNTER(orphan_scan_error) \
EXPAND_COUNTER(orphan_scan_item) \
EXPAND_COUNTER(orphan_scan_omap_set) \

View File

@@ -79,8 +79,10 @@ static void item_from_extent(struct scoutfs_key *key,
.skdx_end = cpu_to_le64(start + len - 1),
.skdx_len = cpu_to_le64(len),
};
dv->blkno = cpu_to_le64(map);
dv->flags = flags;
*dv = (struct scoutfs_data_extent_val) {
.blkno = cpu_to_le64(map),
.flags = flags,
};
}
static void ext_from_item(struct scoutfs_extent *ext,
@@ -1515,6 +1517,101 @@ out:
return ret;
}
/*
* Punch holes in offline extents. This is a very specific tool that
* only does one job: it converts extents from offline to sparse. It
* returns an error if it encounters an extent with a block mapping,
* and skips extents that are already sparse. It ignores i_size
* completely; it does not test it, and does not update it.
*
* The caller has the inode locked in the vfs and has performed basic
* sanity checks. We manage transactions and the extent_sem, which is
* ordered inside the transaction.
*/
int scoutfs_data_punch_offline(struct inode *inode, u64 iblock, u64 last, u64 data_version,
struct scoutfs_lock *lock)
{
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
struct super_block *sb = inode->i_sb;
struct data_ext_args args = {
.ino = scoutfs_ino(inode),
.inode = inode,
.lock = lock,
};
struct scoutfs_extent ext;
LIST_HEAD(ind_locks);
int ret;
int i;
if (WARN_ON_ONCE(iblock > last)) {
ret = -EINVAL;
goto out;
}
/* it's idiomatic to call with start,last of 0,~0; clamp last to the last possible block */
last = min(last, SCOUTFS_BLOCK_SM_MAX);
ret = 0;
while (iblock <= last) {
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, true, false) ?:
scoutfs_dirty_inode_item(inode, lock);
if (ret < 0)
break;
down_write(&si->extent_sem);
for (i = 0; i < 32 && (iblock <= last); i++) {
ret = scoutfs_ext_next(sb, &data_ext_ops, &args, iblock, 1, &ext);
if (ret == -ENOENT) {
iblock = last + 1;
ret = 0;
break;
}
if (ret < 0)
break;
if (ext.start > last) {
iblock = last + 1;
break;
}
if (ext.map) {
ret = -EINVAL;
break;
}
if (ext.flags & SEF_OFFLINE) {
if (iblock > ext.start) {
ext.len -= iblock - ext.start;
ext.start = iblock;
}
ext.len = min(ext.len, last - ext.start + 1);
ext.flags &= ~SEF_OFFLINE;
ret = scoutfs_ext_set(sb, &data_ext_ops, &args,
ext.start, ext.len, ext.map, ext.flags);
if (ret < 0)
break;
}
iblock = ext.start + ext.len;
}
up_write(&si->extent_sem);
scoutfs_update_inode_item(inode, lock, &ind_locks);
scoutfs_release_trans(sb);
scoutfs_inode_index_unlock(sb, &ind_locks);
if (ret < 0)
break;
}
out:
return ret;
}
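
A worked example of the trimming arithmetic in the loop above, with hypothetical numbers:

/*
 * Hypothetical numbers for the trimming above: punch iblock=10,
 * last=19 against an offline extent with start=5, len=20 (blocks
 * 5..24).  iblock > ext.start, so the front is trimmed: len becomes
 * 20 - (10 - 5) = 15 and start becomes 10.  The tail is then clamped:
 * len = min(15, 19 - 10 + 1) = 10.  _ext_set() rewrites blocks 10..19
 * without SEF_OFFLINE, leaving 5..9 and 20..24 offline, and the next
 * iteration starts at iblock = 10 + 10 = 20, which is past last.
 */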
/*
* This copies to userspace :/
*/

View File

@@ -57,6 +57,8 @@ int scoutfs_data_init_offline_extent(struct inode *inode, u64 size,
int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
u64 byte_len, struct inode *to, u64 to_off, bool to_stage,
u64 data_version);
int scoutfs_data_punch_offline(struct inode *inode, u64 iblock, u64 last, u64 data_version,
struct scoutfs_lock *lock);
int scoutfs_data_wait_check(struct inode *inode, loff_t pos, loff_t len,
u8 sef, u8 op, struct scoutfs_data_wait *ow,

View File

@@ -587,10 +587,12 @@ static int add_entry_items(struct super_block *sb, u64 dir_ino, u64 hash,
}
/* initialize the dent */
dent->ino = cpu_to_le64(ino);
dent->hash = cpu_to_le64(hash);
dent->pos = cpu_to_le64(pos);
dent->type = mode_to_type(mode);
*dent = (struct scoutfs_dirent) {
.ino = cpu_to_le64(ino),
.hash = cpu_to_le64(hash),
.pos = cpu_to_le64(pos),
.type = mode_to_type(mode),
};
memcpy(dent->name, name, name_len);
init_dirent_key(&ent_key, SCOUTFS_DIRENT_TYPE, dir_ino, hash, pos);
@@ -2006,7 +2008,11 @@ const struct inode_operations scoutfs_symlink_iops = {
#ifdef KC_LINUX_HAVE_RHEL_IOPS_WRAPPER
.removexattr = generic_removexattr,
#endif
#ifdef KC_GET_INODE_ACL
.get_inode_acl = scoutfs_get_acl,
#else
.get_acl = scoutfs_get_acl,
#endif
#ifndef KC_LINUX_HAVE_RHEL_IOPS_WRAPPER
.tmpfile = scoutfs_tmpfile,
.rename = scoutfs_rename_common,
@@ -2052,8 +2058,12 @@ const struct inode_operations scoutfs_dir_iops = {
.removexattr = generic_removexattr,
#endif
.listxattr = scoutfs_listxattr,
#ifdef KC_GET_INODE_ACL
.get_inode_acl = scoutfs_get_acl,
#else
.get_acl = scoutfs_get_acl,
#ifdef KC_GET_ACL_DENTRY
#endif
#ifdef KC_SET_ACL_DENTRY
.set_acl = scoutfs_set_acl,
#endif
.symlink = scoutfs_symlink,

View File

@@ -114,6 +114,42 @@ static struct scoutfs_block *read_bloom_ref(struct super_block *sb, struct scout
return bl;
}
/*
* Returns >0 if there was a bloom block and all the bits were present.
*/
static int all_bloom_bits_present(struct super_block *sb, struct scoutfs_block_ref *ref,
struct forest_bloom_nrs *bloom)
{
struct scoutfs_bloom_block *bb;
struct scoutfs_block *bl;
int i;
if (ref->blkno == 0)
return 0;
bl = read_bloom_ref(sb, ref);
if (IS_ERR(bl))
return PTR_ERR(bl);
bb = bl->data;
for (i = 0; i < ARRAY_SIZE(bloom->nrs); i++) {
if (!test_bit_le(bloom->nrs[i], bb->bits))
break;
}
scoutfs_block_put(sb, bl);
/* one of the bloom bits wasn't set */
if (i != ARRAY_SIZE(bloom->nrs)) {
scoutfs_inc_counter(sb, forest_bloom_fail);
return 0;
}
scoutfs_inc_counter(sb, forest_bloom_pass);
return 1;
}
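
The tri-state return encodes the usual bloom filter guarantees; a sketch of the calling convention (mirrored by _read_items_roots below):

	ret = all_bloom_bits_present(sb, &lt.bloom_ref, &bloom);
	if (ret < 0)		/* bloom block read error */
		goto out;
	if (ret == 0)		/* a bit was clear: tree can be skipped */
		continue;
	/* ret > 0 can be a false positive from other keys' insertions,
	 * so the tree is read but may simply not contain the items;
	 * bloom filters never produce false negatives */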
/*
* This is an unlocked iteration across all the btrees to find a hint at
* the next key that the caller could read. It's used to find out what
@@ -227,9 +263,13 @@ static int forest_read_items(struct super_block *sb, struct scoutfs_key *key, u6
}
/*
* For each forest btree whose bloom block indicates that the lock might
* have items stored, call the caller's callback for every item in the
* leaf block in each tree which contains the key.
* Call the caller's callback for every item in the leaf blocks in each
* forest btree that contain the caller's key.
*
* If a bloom key is provided then each log tree's bloom block is
* checked and only trees with all the bloom key's bloom bits set will
* be read from. When the bloom key is null all trees will be read
* from.
*
* The btree iter calls clamp the caller's range to the tightest range
* that covers all the blocks. Any keys outside of this range can't be
@@ -239,33 +279,26 @@ static int forest_read_items(struct super_block *sb, struct scoutfs_key *key, u6
* to reset their state and retry with a newer version of the btrees.
*/
int scoutfs_forest_read_items_roots(struct super_block *sb, struct scoutfs_net_roots *roots,
struct scoutfs_key *key, struct scoutfs_key *bloom_key,
struct scoutfs_key *start, struct scoutfs_key *end,
scoutfs_forest_item_cb cb, void *arg)
u64 merge_input_seq, struct scoutfs_key *key,
struct scoutfs_key *bloom_key, struct scoutfs_key *start,
struct scoutfs_key *end, scoutfs_forest_item_cb cb, void *arg)
{
struct forest_read_items_data rid = {
.cb = cb,
.cb_arg = arg,
};
struct scoutfs_log_trees lt;
struct scoutfs_bloom_block *bb;
struct forest_bloom_nrs bloom;
SCOUTFS_BTREE_ITEM_REF(iref);
struct scoutfs_block *bl;
struct scoutfs_key ltk;
struct scoutfs_key orig_start = *start;
struct scoutfs_key orig_end = *end;
int ret;
int i;
scoutfs_inc_counter(sb, forest_read_items);
calc_bloom_nrs(&bloom, bloom_key);
if (bloom_key)
calc_bloom_nrs(&bloom, bloom_key);
trace_scoutfs_forest_using_roots(sb, &roots->fs_root, &roots->logs_root);
*start = orig_start;
*end = orig_end;
/* start with fs root items */
rid.fic |= FIC_FS_ROOT;
ret = scoutfs_btree_read_items(sb, &roots->fs_root, key, start, end,
@@ -292,40 +325,29 @@ int scoutfs_forest_read_items_roots(struct super_block *sb, struct scoutfs_net_r
goto out; /* including stale */
}
if (lt.bloom_ref.blkno == 0)
/* we're not expecting -ENOENT from _read_items */
if (lt.item_root.ref.blkno == 0)
continue;
bl = read_bloom_ref(sb, &lt.bloom_ref);
if (IS_ERR(bl)) {
ret = PTR_ERR(bl);
goto out;
}
bb = bl->data;
for (i = 0; i < ARRAY_SIZE(bloom.nrs); i++) {
if (!test_bit_le(bloom.nrs[i], bb->bits))
break;
if (bloom_key) {
ret = all_bloom_bits_present(sb, &lt.bloom_ref, &bloom);
if (ret < 0)
goto out;
if (ret == 0)
continue;
}
scoutfs_block_put(sb, bl);
/* one of the bloom bits wasn't set */
if (i != ARRAY_SIZE(bloom.nrs)) {
scoutfs_inc_counter(sb, forest_bloom_fail);
continue;
}
scoutfs_inc_counter(sb, forest_bloom_pass);
if ((le64_to_cpu(lt.flags) & SCOUTFS_LOG_TREES_FINALIZED))
rid.fic |= FIC_FINALIZED;
if ((le64_to_cpu(lt.flags) & SCOUTFS_LOG_TREES_FINALIZED) &&
(merge_input_seq == 0 ||
le64_to_cpu(lt.finalize_seq) < merge_input_seq))
rid.fic |= FIC_MERGE_INPUT;
ret = scoutfs_btree_read_items(sb, &lt.item_root, key, start,
end, forest_read_items, &rid);
if (ret < 0)
goto out;
rid.fic &= ~FIC_FINALIZED;
rid.fic &= ~FIC_MERGE_INPUT;
}
ret = 0;
@@ -345,7 +367,7 @@ int scoutfs_forest_read_items(struct super_block *sb,
ret = scoutfs_client_get_roots(sb, &roots);
if (ret == 0)
ret = scoutfs_forest_read_items_roots(sb, &roots, key, bloom_key, start, end,
ret = scoutfs_forest_read_items_roots(sb, &roots, 0, key, bloom_key, start, end,
cb, arg);
return ret;
}
@@ -793,7 +815,7 @@ out:
if (ret)
scoutfs_forest_destroy(sb);
return 0;
return ret;
}
void scoutfs_forest_start(struct super_block *sb)

View File

@@ -11,7 +11,7 @@ struct scoutfs_lock;
/* caller gives an item to the callback */
enum {
FIC_FS_ROOT = (1 << 0),
FIC_FINALIZED = (1 << 1),
FIC_MERGE_INPUT = (1 << 1),
};
typedef int (*scoutfs_forest_item_cb)(struct super_block *sb, struct scoutfs_key *key, u64 seq,
u8 flags, void *val, int val_len, int fic, void *arg);
@@ -25,9 +25,9 @@ int scoutfs_forest_read_items(struct super_block *sb,
struct scoutfs_key *end,
scoutfs_forest_item_cb cb, void *arg);
int scoutfs_forest_read_items_roots(struct super_block *sb, struct scoutfs_net_roots *roots,
struct scoutfs_key *key, struct scoutfs_key *bloom_key,
struct scoutfs_key *start, struct scoutfs_key *end,
scoutfs_forest_item_cb cb, void *arg);
u64 merge_input_seq, struct scoutfs_key *key,
struct scoutfs_key *bloom_key, struct scoutfs_key *start,
struct scoutfs_key *end, scoutfs_forest_item_cb cb, void *arg);
int scoutfs_forest_set_bloom_bits(struct super_block *sb,
struct scoutfs_lock *lock);
void scoutfs_forest_set_max_seq(struct super_block *sb, u64 max_seq);

View File

@@ -149,8 +149,12 @@ static const struct inode_operations scoutfs_file_iops = {
.removexattr = generic_removexattr,
#endif
.listxattr = scoutfs_listxattr,
#ifdef KC_GET_INODE_ACL
.get_inode_acl = scoutfs_get_acl,
#else
.get_acl = scoutfs_get_acl,
#ifdef KC_GET_ACL_DENTRY
#endif
#ifdef KC_SET_ACL_DENTRY
.set_acl = scoutfs_set_acl,
#endif
.fiemap = scoutfs_data_fiemap,
@@ -165,8 +169,12 @@ static const struct inode_operations scoutfs_special_iops = {
.removexattr = generic_removexattr,
#endif
.listxattr = scoutfs_listxattr,
#ifdef KC_GET_INODE_ACL
.get_inode_acl = scoutfs_get_acl,
#else
.get_acl = scoutfs_get_acl,
#ifdef KC_GET_ACL_DENTRY
#endif
#ifdef KC_SET_ACL_DENTRY
.set_acl = scoutfs_set_acl,
#endif
};
@@ -1482,12 +1490,6 @@ static int remove_index_items(struct super_block *sb, u64 ino,
* Return an allocated and unused inode number. Returns -ENOSPC if
* we're out of inodes.
*
* Each parent directory has its own pool of free inode numbers. Items
* are sorted by their inode numbers as they're stored in segments.
* This will tend to group together files that are created in a
* directory at the same time in segments. Concurrent creation across
* different directories will be stored in their own regions.
*
* Inode numbers are never reclaimed. If the inode is evicted or we're
* unmounted the pending inode numbers will be lost. Asking for a
* relatively small number from the server each time will tend to
@@ -1497,12 +1499,18 @@ static int remove_index_items(struct super_block *sb, u64 ino,
int scoutfs_alloc_ino(struct super_block *sb, bool is_dir, u64 *ino_ret)
{
DECLARE_INODE_SB_INFO(sb, inf);
struct scoutfs_mount_options opts;
struct inode_allocator *ia;
u64 ino;
u64 nr;
int ret;
ia = is_dir ? &inf->dir_ino_alloc : &inf->ino_alloc;
scoutfs_options_read(sb, &opts);
if (is_dir && opts.ino_alloc_per_lock == SCOUTFS_LOCK_INODE_GROUP_NR)
ia = &inf->dir_ino_alloc;
else
ia = &inf->ino_alloc;
spin_lock(&ia->lock);
@@ -1523,6 +1531,17 @@ int scoutfs_alloc_ino(struct super_block *sb, bool is_dir, u64 *ino_ret)
*ino_ret = ia->ino++;
ia->nr--;
if (opts.ino_alloc_per_lock != SCOUTFS_LOCK_INODE_GROUP_NR) {
nr = ia->ino & SCOUTFS_LOCK_INODE_GROUP_MASK;
if (nr >= opts.ino_alloc_per_lock) {
nr = SCOUTFS_LOCK_INODE_GROUP_NR - nr;
if (nr > ia->nr)
nr = ia->nr;
ia->ino += nr;
ia->nr -= nr;
}
}
spin_unlock(&ia->lock);
ret = 0;
out:
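
A worked example of the skip above, with hypothetical sizes:

/*
 * Hypothetical sizes: SCOUTFS_LOCK_INODE_GROUP_NR = 1024 (mask 1023)
 * and ino_alloc_per_lock = 64.  After allocating, ia->ino = 4160, so
 * nr = 4160 & 1023 = 64, which hits the per-lock limit.  We skip
 * nr = 1024 - 64 = 960 numbers (capped by the allocator's remaining
 * ia->nr) and ia->ino becomes 5120, the first number of the next lock
 * group.  Each inode group lock then covers at most 64 inodes
 * allocated by this mount.
 */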
@@ -1626,10 +1645,14 @@ int scoutfs_inode_orphan_delete(struct super_block *sb, u64 ino, struct scoutfs_
struct scoutfs_lock *primary)
{
struct scoutfs_key key;
int ret;
init_orphan_key(&key, ino);
return scoutfs_item_delete_force(sb, &key, lock, primary);
ret = scoutfs_item_delete_force(sb, &key, lock, primary);
trace_scoutfs_inode_orphan_delete(sb, ino, ret);
return ret;
}
/*
@@ -1711,6 +1734,8 @@ out:
scoutfs_release_trans(sb);
scoutfs_inode_index_unlock(sb, &ind_locks);
trace_scoutfs_delete_inode_end(sb, ino, mode, size, ret);
return ret;
}
@@ -1806,6 +1831,9 @@ out:
* they've checked that the inode could really be deleted. We serialize
* on a bit in the lock data so that we only have one deletion attempt
* per inode under this mount's cluster lock.
*
* Returns -EAGAIN if we either did some cleanup work or are unable to finish
* cleaning up this inode right now.
*/
static int try_delete_inode_items(struct super_block *sb, u64 ino)
{
@@ -1819,6 +1847,8 @@ static int try_delete_inode_items(struct super_block *sb, u64 ino)
int bit_nr;
int ret;
trace_scoutfs_try_delete(sb, ino);
ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_WRITE, 0, ino, &lock);
if (ret < 0)
goto out;
@@ -1831,27 +1861,32 @@ static int try_delete_inode_items(struct super_block *sb, u64 ino)
/* only one local attempt per inode at a time */
if (test_and_set_bit(bit_nr, ldata->trying)) {
ret = 0;
trace_scoutfs_try_delete_local_busy(sb, ino);
ret = -EAGAIN;
goto out;
}
clear_trying = true;
/* can't delete if it's cached in local or remote mounts */
if (scoutfs_omap_test(sb, ino) || test_bit_le(bit_nr, ldata->map.bits)) {
ret = 0;
trace_scoutfs_try_delete_cached(sb, ino);
ret = -EAGAIN;
goto out;
}
scoutfs_inode_init_key(&key, ino);
ret = lookup_inode_item(sb, &key, &sinode, lock);
if (ret < 0) {
if (ret == -ENOENT)
if (ret == -ENOENT) {
trace_scoutfs_try_delete_no_item(sb, ino);
ret = 0;
}
goto out;
}
if (le32_to_cpu(sinode.nlink) > 0) {
ret = 0;
trace_scoutfs_try_delete_has_links(sb, ino, le32_to_cpu(sinode.nlink));
ret = -EAGAIN;
goto out;
}
@@ -1860,8 +1895,10 @@ static int try_delete_inode_items(struct super_block *sb, u64 ino)
goto out;
ret = delete_inode_items(sb, ino, &sinode, lock, orph_lock);
if (ret == 0)
if (ret == 0) {
ret = -EAGAIN;
scoutfs_inc_counter(sb, inode_deleted);
}
out:
if (clear_trying)
@@ -2063,6 +2100,10 @@ void scoutfs_inode_schedule_orphan_dwork(struct super_block *sb)
* a locally cached inode. Then we ask the server for the open map
* containing the inode. Only if we don't see any cached users do we do
* the expensive work of acquiring locks to try and delete the items.
*
* We need to track whether there is any orphan cleanup work remaining so
* that tests such as inode-deletion can watch the orphan_scan_empty counter
* to determine when inode cleanup from open-unlink scenarios is complete.
*/
static void inode_orphan_scan_worker(struct work_struct *work)
{
@@ -2074,11 +2115,14 @@ static void inode_orphan_scan_worker(struct work_struct *work)
SCOUTFS_BTREE_ITEM_REF(iref);
struct scoutfs_key last;
struct scoutfs_key key;
bool work_todo = false;
u64 group_nr;
int bit_nr;
u64 ino;
int ret;
trace_scoutfs_orphan_scan_start(sb);
scoutfs_inc_counter(sb, orphan_scan);
init_orphan_key(&last, U64_MAX);
@@ -2098,8 +2142,10 @@ static void inode_orphan_scan_worker(struct work_struct *work)
init_orphan_key(&key, ino);
ret = scoutfs_btree_next(sb, &roots.fs_root, &key, &iref);
if (ret < 0) {
if (ret == -ENOENT)
if (ret == -ENOENT) {
trace_scoutfs_orphan_scan_work(sb, 0);
break;
}
goto out;
}
@@ -2114,6 +2160,7 @@ static void inode_orphan_scan_worker(struct work_struct *work)
/* locally cached inodes will try to delete as they evict */
if (scoutfs_omap_test(sb, ino)) {
work_todo = true;
scoutfs_inc_counter(sb, orphan_scan_cached);
continue;
}
@@ -2129,13 +2176,22 @@ static void inode_orphan_scan_worker(struct work_struct *work)
/* remote cached inodes will also try to delete */
if (test_bit_le(bit_nr, omap.bits)) {
work_todo = true;
scoutfs_inc_counter(sb, orphan_scan_omap_set);
continue;
}
/* seemingly orphaned and unused, get locks and check for sure */
scoutfs_inc_counter(sb, orphan_scan_attempts);
trace_scoutfs_orphan_scan_work(sb, ino);
ret = try_delete_inode_items(sb, ino);
if (ret == -EAGAIN) {
work_todo = true;
ret = 0;
}
trace_scoutfs_orphan_scan_end(sb, ino, ret);
}
ret = 0;
@@ -2144,6 +2200,11 @@ out:
if (ret < 0)
scoutfs_inc_counter(sb, orphan_scan_error);
if (!work_todo)
scoutfs_inc_counter(sb, orphan_scan_empty);
trace_scoutfs_orphan_scan_stop(sb, work_todo);
scoutfs_inode_schedule_orphan_dwork(sb);
}

View File

@@ -49,6 +49,7 @@
#include "quota.h"
#include "scoutfs_trace.h"
#include "util.h"
#include "raw.h"
/*
* We make inode index items coherent by locking fixed size regions of
@@ -415,8 +416,6 @@ static long scoutfs_ioc_data_wait_err(struct file *file, unsigned long arg)
return 0;
if ((args.op & SCOUTFS_IOC_DWO_UNKNOWN) || !IS_ERR_VALUE(args.err))
return -EINVAL;
if ((args.op & SCOUTFS_IOC_DWO_UNKNOWN) || !IS_ERR_VALUE(args.err))
return -EINVAL;
trace_scoutfs_ioc_data_wait_err(sb, &args);
@@ -1669,6 +1668,141 @@ out:
return ret;
}
static long scoutfs_ioc_punch_offline(struct file *file, unsigned long arg)
{
struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
struct scoutfs_ioctl_punch_offline __user *upo = (void __user *)arg;
struct scoutfs_ioctl_punch_offline po;
struct scoutfs_lock *lock = NULL;
u64 iblock;
u64 last;
u64 tmp;
int ret;
if (copy_from_user(&po, upo, sizeof(po)))
return -EFAULT;
if (po.len == 0)
return 0;
if (check_add_overflow(po.offset, po.len - 1, &tmp) ||
(po.offset & SCOUTFS_BLOCK_SM_MASK) ||
(po.len & SCOUTFS_BLOCK_SM_MASK))
return -EOVERFLOW;
if (po.flags)
return -EINVAL;
ret = mnt_want_write_file(file);
if (ret < 0)
return ret;
inode_lock(inode);
ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_WRITE,
SCOUTFS_LKF_REFRESH_INODE, inode, &lock);
if (ret)
goto out;
if (!S_ISREG(inode->i_mode)) {
ret = -EINVAL;
goto out;
}
if (!(file->f_mode & FMODE_WRITE)) {
ret = -EINVAL;
goto out;
}
ret = inode_permission(KC_VFS_INIT_NS inode, MAY_WRITE);
if (ret < 0)
goto out;
if (scoutfs_inode_data_version(inode) != po.data_version) {
ret = -ESTALE;
goto out;
}
if ((ret = scoutfs_inode_check_retention(inode)))
goto out;
iblock = po.offset >> SCOUTFS_BLOCK_SM_SHIFT;
last = (po.offset + po.len - 1) >> SCOUTFS_BLOCK_SM_SHIFT;
ret = scoutfs_data_punch_offline(inode, iblock, last, po.data_version, lock);
out:
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
inode_unlock(inode);
mnt_drop_write_file(file);
return ret;
}
static long scoutfs_ioc_raw_read_meta_seq(struct file *file, unsigned long arg)
{
struct super_block *sb = file_inode(file)->i_sb;
struct scoutfs_ioctl_raw_read_meta_seq __user *urms = (void __user *)arg;
struct scoutfs_ioctl_raw_read_meta_seq rms;
int ret;
if (!capable(CAP_SYS_ADMIN)) {
ret = -EPERM;
goto out;
}
if (copy_from_user(&rms, urms, sizeof(rms))) {
ret = -EFAULT;
goto out;
}
if (rms.results_size == 0) {
ret = 0;
goto out;
}
if (rms.results_size < sizeof(struct scoutfs_ioctl_meta_seq) ||
rms.results_size > INT_MAX) {
ret = -EINVAL;
goto out;
}
ret = scoutfs_raw_read_meta_seq(sb, &rms, &rms.last);
if (ret >= 0 && copy_to_user(&urms->last, &rms.last, sizeof(rms.last)))
ret = -EFAULT;
out:
return ret;
}
static long scoutfs_ioc_raw_read_inode_info(struct file *file, unsigned long arg)
{
struct super_block *sb = file_inode(file)->i_sb;
struct scoutfs_ioctl_raw_read_inode_info __user *urii = (void __user *)arg;
struct scoutfs_ioctl_raw_read_inode_info rii;
int ret;
if (!capable(CAP_SYS_ADMIN)) {
ret = -EPERM;
goto out;
}
if (copy_from_user(&rii, urii, sizeof(rii))) {
ret = -EFAULT;
goto out;
}
if (rii.inos_count == 0 || rii.results_size > INT_MAX ||
!IS_ALIGNED(rii.inos_ptr, __alignof__(__u64))) {
ret = -EINVAL;
goto out;
}
ret = scoutfs_raw_read_inode_info(sb, &rii);
out:
return ret;
}
long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
switch (cmd) {
@@ -1718,6 +1852,12 @@ long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
return scoutfs_ioc_mod_quota_rule(file, arg, false);
case SCOUTFS_IOC_READ_XATTR_INDEX:
return scoutfs_ioc_read_xattr_index(file, arg);
case SCOUTFS_IOC_PUNCH_OFFLINE:
return scoutfs_ioc_punch_offline(file, arg);
case SCOUTFS_IOC_RAW_READ_META_SEQ:
return scoutfs_ioc_raw_read_meta_seq(file, arg);
case SCOUTFS_IOC_RAW_READ_INODE_INFO:
return scoutfs_ioc_raw_read_inode_info(file, arg);
}
return -ENOTTY;

View File

@@ -15,20 +15,6 @@
#define SCOUTFS_IOCTL_MAGIC 0xE8 /* arbitrarily chosen hole in ioctl-number.rst */
/*
* Packed scoutfs keys rarely cross the ioctl boundary so we have a
* translation struct.
*/
struct scoutfs_ioctl_key {
__le64 _sk_first;
__le64 _sk_second;
__le64 _sk_third;
__u8 _sk_fourth;
__u8 sk_type;
__u8 sk_zone;
__u8 _pad[5];
};
struct scoutfs_ioctl_walk_inodes_entry {
__u64 major;
__u64 ino;
@@ -848,4 +834,197 @@ struct scoutfs_ioctl_read_xattr_index {
#define SCOUTFS_IOC_READ_XATTR_INDEX \
_IOR(SCOUTFS_IOCTL_MAGIC, 23, struct scoutfs_ioctl_read_xattr_index)
/*
* This is a limited and specific version of hole punching. It's an
* archive layer operation that only converts unmapped offline extents
* into sparse extents. It is intended to be used when restoring sparse
* files after the initial creation set the entire file size offline.
*
* The offset and len fields are in units of bytes and must be aligned
* to the small (4KiB) block size. All offline extents covered by the
* region will be converted into sparse extents, including extents
* that straddle the region's boundaries. Any existing sparse extents
* in the region are ignored.
*
* The data_version must match the inode's or ESTALE is returned. The
* data_version is not modified by this operation.
*
* EINVAL is returned if any mapped extents are found in the region. If
* an error is returned then partial progress may have been made.
*/
struct scoutfs_ioctl_punch_offline {
__u64 offset;
__u64 len;
__u64 data_version;
__u64 flags;
};
#define SCOUTFS_IOC_PUNCH_OFFLINE \
_IOW(SCOUTFS_IOCTL_MAGIC, 24, struct scoutfs_ioctl_punch_offline)
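
A hedged userspace sketch of the intended call pattern (not part of the header; the length and how data_version was obtained are assumptions):

#include <err.h>
#include <stdint.h>
#include <sys/ioctl.h>
/* assumes the scoutfs ioctl header is included */

/* hypothetical caller: convert the first 1 MiB of offline extents to sparse */
void punch_first_meg(int fd, uint64_t data_version)
{
	struct scoutfs_ioctl_punch_offline po = {
		.offset = 0,			/* 4KiB aligned */
		.len = 1024 * 1024,		/* 4KiB aligned */
		.data_version = data_version,	/* obtained out of band */
		.flags = 0,			/* must be zero */
	};

	if (ioctl(fd, SCOUTFS_IOC_PUNCH_OFFLINE, &po) < 0)
		err(1, "SCOUTFS_IOC_PUNCH_OFFLINE");
}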
/*
* Read meta_seq items without cluster locking.
*
* @start is the first meta_seq item value that could be returned.
* {0,0} is the minimum.
*
* @end is the last meta_seq item value that could be returned.
* {U64_MAX, U64_MAX} is the maximum.
*
* @last is only set on success from the call. It's the last meta_seq
* item that could have been returned. This lets the caller detect that
* the full input range wasn't explored. Another call can be made with
* start set to just after this.
*
* @results_ptr is a pointer to an array of (struct
* scoutfs_ioctl_meta_seq) elements that were found in the input range.
*
* @results_size is the number of bytes available in the results_ptr
* buffer. There must be room for at least one result.
*
* Return existing meta_seq items starting from @start until @end.
* Partial results can be returned, which is indicated by @last being
* set to an item before @end.
*
* The results are sorted first by increasing meta_seq and then by
* increasing ino. All of the results are from one version of file
* system metadata. This means that an inode can not be found multiple
* times within the results of one call.
*
* This call ignores currently dirty transactions and reads persistent
* items directly. A transaction can be written after this call and
* cause meta_seq items to appear before or within the results from this
* call.
*
* The number of meta_seq items stored in the results buffer is returned
* and @last is updated. 0 items can be returned if none are found
* within the input range.
*
* Unique errors:
*
* -EINVAL: The results buffer didn't have room for at least one
* result or its size was greater than INT_MAX.
*
* -ESTALE: The results could not be read from one stable version of
* file system metadata. Decrease the number of inodes requested.
*/
struct scoutfs_ioctl_meta_seq {
__u64 meta_seq;
__u64 ino;
};
struct scoutfs_ioctl_raw_read_meta_seq {
struct scoutfs_ioctl_meta_seq start;
struct scoutfs_ioctl_meta_seq end;
struct scoutfs_ioctl_meta_seq last;
__u64 results_ptr;
__u32 results_size;
__u32 _pad;
};
#define SCOUTFS_IOC_RAW_READ_META_SEQ \
_IOR(SCOUTFS_IOCTL_MAGIC, 25, struct scoutfs_ioctl_raw_read_meta_seq)
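
A hedged userspace sketch that walks the whole meta_seq index in batches, using @last to continue (not part of the header):

#include <err.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
/* assumes the scoutfs ioctl header is included */

void walk_meta_seq(int fd)
{
	struct scoutfs_ioctl_meta_seq results[128];
	struct scoutfs_ioctl_raw_read_meta_seq rms = {
		.start = { .meta_seq = 0, .ino = 0 },
		.end = { .meta_seq = ~0ULL, .ino = ~0ULL },
		.results_ptr = (uintptr_t)results,
		.results_size = sizeof(results),
	};
	int nr;
	int i;

	for (;;) {
		nr = ioctl(fd, SCOUTFS_IOC_RAW_READ_META_SEQ, &rms);
		if (nr < 0)
			err(1, "SCOUTFS_IOC_RAW_READ_META_SEQ");
		for (i = 0; i < nr; i++)
			printf("seq %llu ino %llu\n",
			       (unsigned long long)results[i].meta_seq,
			       (unsigned long long)results[i].ino);

		/* @last is how far the call explored; stop at the end
		 * of the index, else continue from just after @last */
		if (rms.last.meta_seq == ~0ULL && rms.last.ino == ~0ULL)
			break;
		rms.start = rms.last;
		if (++rms.start.ino == 0)
			rms.start.meta_seq++;
	}
}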
/*
* Read inode metadata without cluster locking.
*
* @inos_ptr is a pointer to an aligned array of 64bit inode numbers.
*
* @inos_count is the number of elements in the array. The inode
* numbers must not be zero, must strictly increase, and must not
* contain any duplicates.
*
* @names_ptr is a pointer to a byte array of xattr names to return with
* each inode. The names are identical to those used in
* {get,set}xattr(2). The names must be null terminated and no two
* names may be equal.
*
* @names_count is the number of names that will be found in the
* names_ptr buffer.
*
* @results_ptr is a pointer to a buffer that will be filled by the read
* inode info results. The result structs and payloads are not aligned.
* Callers will almost certainly need to copy them into aligned
* addresses before referencing their contents.
*
* @results_size is the number of bytes available in the results_ptr
* buffer.
*
* For each inode an _INODE result will always be returned. Then a
* _XATTR result will be returned for each xattr on the inode that
* matches one of the given input names.
*
* A call will not return partial results. -ERANGE is returned if the
* results for the requested inodes do not fit in the results buffer.
*
* The info for one call is from one consistent version of the file
* system metadata. The call may have to retry internally if it sees
* metadata change while it runs. -ESTALE will be returned if it was
* not able to read all the inodes' info from one metadata version.
* The number of inodes being read can be decreased to avoid this.
*
* Inodes with an nlink of 0 are not returned.
*
* The size in bytes of filled results is returned. A non-zero return
* will always include at least one full
* (struct scoutfs_ioctl_raw_read_result) header.
*
* Unique errors:
*
* -EINVAL: The inode count can't be zero. The inos ptr must be aligned
* to __u64 alignment. The results buffer size can't be larger than
* INT_MAX. Inode numbers can't be zero, must be sorted, and can't
* have duplicates. The xattr names must be unique, null terminated,
* and less than 256 bytes long.
*
* -ERANGE: The results for the requested inodes do not fit in the
* results buffer. Increase the buffer size (perhaps allowing for all
* xattrs with large values) or decrease the number of inodes per call.
*
* -ESTALE: The results could not be read from one stable version of
* file system metadata. Decrease the number of inodes requested.
*
* -EUCLEAN: Internal xattr metadata is inconsistent.
*/
struct scoutfs_ioctl_raw_read_inode_info {
__u64 inos_ptr;
__u32 inos_count;
__u32 names_count;
__u64 names_ptr;
__u64 results_ptr;
__u32 results_size;
__u8 _pad[4];
};
/*
* @type is one of the enums that determines the type of the following
* result payload.
*
* @size is the number of bytes of result payload immediately following
* the result struct. It does not include the size of the result struct
* header.
*/
struct scoutfs_ioctl_raw_read_result {
__u32 size;
__u8 _pad[7];
__u8 type;
};
/*
* The _INODE result contains an initial 64bit inode number followed by a
* struct scoutfs_inode as defined in format.h. The size includes the
* 8byte initial inode number. With that subtracted the size of the
* inode struct defines its version (and so the fields it supports).
*/
#define SCOUTFS_IOC_RAW_READ_RESULT_INODE 1
/*
* The result payload contains the null terminated name and the value.
* The value size can be found by subtracting the null terminated name
* length from the result size.
*/
#define SCOUTFS_IOC_RAW_READ_RESULT_XATTR 2
#define SCOUTFS_IOC_RAW_READ_INODE_INFO \
_IOR(SCOUTFS_IOCTL_MAGIC, 26, struct scoutfs_ioctl_raw_read_inode_info)
#endif
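
A hedged sketch of consuming the packed result stream; the memcpy()s matter because the results are explicitly unaligned, and the printf()s stand in for real handling:

#include <stdint.h>
#include <stdio.h>
#include <string.h>
/* assumes the scoutfs ioctl header is included */

/* hedged sketch: ret is the byte count returned by the ioctl */
void walk_results(uint8_t *buf, int ret)
{
	struct scoutfs_ioctl_raw_read_result hdr;
	uint8_t *p = buf;
	uint8_t *end = buf + ret;

	while (p + sizeof(hdr) <= end) {
		memcpy(&hdr, p, sizeof(hdr));	/* headers are unaligned */
		p += sizeof(hdr);

		if (hdr.type == SCOUTFS_IOC_RAW_READ_RESULT_INODE) {
			uint64_t ino;

			memcpy(&ino, p, sizeof(ino));
			/* struct scoutfs_inode follows: hdr.size - 8 bytes */
			printf("ino %llu\n", (unsigned long long)ino);
		} else if (hdr.type == SCOUTFS_IOC_RAW_READ_RESULT_XATTR) {
			char *name = (char *)p;	/* null terminated */
			size_t val_len = hdr.size - (strlen(name) + 1);

			/* val_len value bytes follow the name's null */
			printf("xattr %s val_len %zu\n", name, val_len);
		}

		p += hdr.size;	/* payload only; header already consumed */
	}
}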

View File

@@ -53,8 +53,10 @@
* all access to the lock (by revoking it down to a null mode) then the
* lock is freed.
*
* Memory pressure on the client can cause the client to request a null
* mode from the server so that once its granted the lock can be freed.
* Each client has a configurable number of locks that are allowed to
* remain idle after being granted, for use by future tasks. Past the
* limit, locks are freed by requesting a null mode from the server,
* governed by an LRU.
*
* So far we've only needed a minimal trylock. We return -EAGAIN if a
* lock attempt can't immediately match an existing granted lock. This
@@ -79,14 +81,11 @@ struct lock_info {
bool unmounting;
struct rb_root lock_tree;
struct rb_root lock_range_tree;
KC_DEFINE_SHRINKER(shrinker);
u64 nr_locks;
struct list_head lru_list;
unsigned long long lru_nr;
struct workqueue_struct *workq;
struct work_struct inv_work;
struct list_head inv_list;
struct work_struct shrink_work;
struct list_head shrink_list;
atomic64_t next_refresh_gen;
struct dentry *tseq_dentry;
@@ -249,7 +248,6 @@ static void lock_free(struct lock_info *linfo, struct scoutfs_lock *lock)
BUG_ON(!RB_EMPTY_NODE(&lock->range_node));
BUG_ON(!list_empty(&lock->lru_head));
BUG_ON(!list_empty(&lock->inv_head));
BUG_ON(!list_empty(&lock->shrink_head));
BUG_ON(!list_empty(&lock->cov_list));
kfree(lock->inode_deletion_data);
@@ -277,7 +275,6 @@ static struct scoutfs_lock *lock_alloc(struct super_block *sb,
INIT_LIST_HEAD(&lock->lru_head);
INIT_LIST_HEAD(&lock->inv_head);
INIT_LIST_HEAD(&lock->inv_list);
INIT_LIST_HEAD(&lock->shrink_head);
spin_lock_init(&lock->cov_list_lock);
INIT_LIST_HEAD(&lock->cov_list);
@@ -410,6 +407,7 @@ static bool lock_insert(struct super_block *sb, struct scoutfs_lock *ins)
rb_link_node(&ins->node, parent, node);
rb_insert_color(&ins->node, &linfo->lock_tree);
linfo->nr_locks++;
scoutfs_tseq_add(&linfo->tseq_tree, &ins->tseq_entry);
return true;
@@ -424,6 +422,7 @@ static void lock_remove(struct lock_info *linfo, struct scoutfs_lock *lock)
rb_erase(&lock->range_node, &linfo->lock_range_tree);
RB_CLEAR_NODE(&lock->range_node);
linfo->nr_locks--;
scoutfs_tseq_del(&linfo->tseq_tree, &lock->tseq_entry);
}
@@ -463,10 +462,8 @@ static void __lock_del_lru(struct lock_info *linfo, struct scoutfs_lock *lock)
{
assert_spin_locked(&linfo->lock);
if (!list_empty(&lock->lru_head)) {
if (!list_empty(&lock->lru_head))
list_del_init(&lock->lru_head);
linfo->lru_nr--;
}
}
/*
@@ -525,14 +522,16 @@ static struct scoutfs_lock *create_lock(struct super_block *sb,
* indicate that the lock wasn't idle. If it really is idle then we
* either free it if it's null or put it back on the lru.
*/
static void put_lock(struct lock_info *linfo,struct scoutfs_lock *lock)
static void __put_lock(struct lock_info *linfo, struct scoutfs_lock *lock, bool tail)
{
assert_spin_locked(&linfo->lock);
if (lock_idle(lock)) {
if (lock->mode != SCOUTFS_LOCK_NULL) {
list_add_tail(&lock->lru_head, &linfo->lru_list);
linfo->lru_nr++;
if (tail)
list_add_tail(&lock->lru_head, &linfo->lru_list);
else
list_add(&lock->lru_head, &linfo->lru_list);
} else {
lock_remove(linfo, lock);
lock_free(linfo, lock);
@@ -540,6 +539,11 @@ static void put_lock(struct lock_info *linfo,struct scoutfs_lock *lock)
}
}
static inline void put_lock(struct lock_info *linfo, struct scoutfs_lock *lock)
{
__put_lock(linfo, lock, true);
}
/*
* The caller has made a change (set a lock mode) which can let one of the
* invalidating locks make forward progress.
@@ -713,14 +717,14 @@ static void lock_invalidate_worker(struct work_struct *work)
/* only lock protocol, inv can't call subsystems after shutdown */
if (!linfo->shutdown) {
ret = lock_invalidate(sb, lock, nl->old_mode, nl->new_mode);
BUG_ON(ret);
BUG_ON(ret < 0 && ret != -ENOLINK);
}
/* respond with the key and modes from the request, server might have died */
ret = scoutfs_client_lock_response(sb, ireq->net_id, nl);
if (ret == -ENOTCONN)
ret = 0;
BUG_ON(ret);
BUG_ON(ret < 0 && ret != -ENOLINK);
scoutfs_inc_counter(sb, lock_invalidate_response);
}
@@ -875,6 +879,69 @@ int scoutfs_lock_recover_request(struct super_block *sb, u64 net_id,
return ret;
}
/*
* This is called on every _lock call to try and keep the number of
* locks under the idle count. We're intentionally trying to throttle
* shrinking bursts by tying its frequency to lock use. It will only
* send requests to free unused locks, though, so it's always possible
* to exceed the high water mark under heavy load.
*
* We send a null request and the lock will be freed by the response
* once all users drain. If this races with invalidation then the
* server will only send the grant response once the invalidation is
* finished.
*/
static bool try_shrink_lock(struct super_block *sb, struct lock_info *linfo, bool force)
{
struct scoutfs_mount_options opts;
struct scoutfs_lock *lock = NULL;
struct scoutfs_net_lock nl;
int ret = 0;
scoutfs_options_read(sb, &opts);
/* avoiding lock contention with unsynchronized test, don't mind temp false results */
if (!force && (list_empty(&linfo->lru_list) ||
READ_ONCE(linfo->nr_locks) <= opts.lock_idle_count))
return false;
spin_lock(&linfo->lock);
lock = list_first_entry_or_null(&linfo->lru_list, struct scoutfs_lock, lru_head);
if (lock && (force || (linfo->nr_locks > opts.lock_idle_count))) {
__lock_del_lru(linfo, lock);
lock->request_pending = 1;
nl.key = lock->start;
nl.old_mode = lock->mode;
nl.new_mode = SCOUTFS_LOCK_NULL;
} else {
lock = NULL;
}
spin_unlock(&linfo->lock);
if (lock) {
ret = scoutfs_client_lock_request(sb, &nl);
if (ret < 0) {
scoutfs_inc_counter(sb, lock_shrink_request_failed);
spin_lock(&linfo->lock);
lock->request_pending = 0;
wake_up(&lock->waitq);
__put_lock(linfo, lock, false);
spin_unlock(&linfo->lock);
} else {
scoutfs_inc_counter(sb, lock_shrink_attempted);
trace_scoutfs_lock_shrink(sb, lock);
}
}
return lock && ret == 0;
}
static bool lock_wait_cond(struct super_block *sb, struct scoutfs_lock *lock,
enum scoutfs_lock_mode mode)
{
@@ -937,6 +1004,8 @@ static int lock_key_range(struct super_block *sb, enum scoutfs_lock_mode mode, i
if (WARN_ON_ONCE(scoutfs_trans_held()))
return -EDEADLK;
try_shrink_lock(sb, linfo, false);
spin_lock(&linfo->lock);
/* drops and re-acquires lock if it allocates */
@@ -1024,19 +1093,24 @@ out_unlock:
return ret;
}
void scoutfs_lock_get_fs_item_range(u64 ino, struct scoutfs_key *start, struct scoutfs_key *end)
{
scoutfs_key_set_zeros(start);
start->sk_zone = SCOUTFS_FS_ZONE;
start->ski_ino = cpu_to_le64(ino & ~(u64)SCOUTFS_LOCK_INODE_GROUP_MASK);
scoutfs_key_set_ones(end);
end->sk_zone = SCOUTFS_FS_ZONE;
end->ski_ino = cpu_to_le64(ino | SCOUTFS_LOCK_INODE_GROUP_MASK);
}
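
A worked example of the masking above, with a hypothetical group size:

/*
 * Hypothetical group size of 1024 (mask 1023): for ino 5000,
 * start->ski_ino = 5000 & ~1023 = 4096 and
 * end->ski_ino = 5000 | 1023 = 5119, so a single lock covers the fs
 * items of inodes 4096 through 5119.
 */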
int scoutfs_lock_ino(struct super_block *sb, enum scoutfs_lock_mode mode, int flags, u64 ino,
struct scoutfs_lock **ret_lock)
{
struct scoutfs_key start;
struct scoutfs_key end;
scoutfs_key_set_zeros(&start);
start.sk_zone = SCOUTFS_FS_ZONE;
start.ski_ino = cpu_to_le64(ino & ~(u64)SCOUTFS_LOCK_INODE_GROUP_MASK);
scoutfs_key_set_ones(&end);
end.sk_zone = SCOUTFS_FS_ZONE;
end.ski_ino = cpu_to_le64(ino | SCOUTFS_LOCK_INODE_GROUP_MASK);
scoutfs_lock_get_fs_item_range(ino, &start, &end);
return lock_key_range(sb, mode, flags, &start, &end, ret_lock);
}
@@ -1380,134 +1454,12 @@ bool scoutfs_lock_protected(struct scoutfs_lock *lock, struct scoutfs_key *key,
&lock->start, &lock->end) == 0;
}
/*
* The shrink callback got the lock, marked it request_pending, and put
* it on the shrink list. We send a null request and the lock will be
* freed by the response once all users drain. If this races with
* invalidation then the server will only send the grant response once
* the invalidation is finished.
*/
static void lock_shrink_worker(struct work_struct *work)
{
struct lock_info *linfo = container_of(work, struct lock_info,
shrink_work);
struct super_block *sb = linfo->sb;
struct scoutfs_net_lock nl;
struct scoutfs_lock *lock;
struct scoutfs_lock *tmp;
LIST_HEAD(list);
int ret;
scoutfs_inc_counter(sb, lock_shrink_work);
spin_lock(&linfo->lock);
list_splice_init(&linfo->shrink_list, &list);
spin_unlock(&linfo->lock);
list_for_each_entry_safe(lock, tmp, &list, shrink_head) {
list_del_init(&lock->shrink_head);
/* unlocked lock access, but should be stable since we queued */
nl.key = lock->start;
nl.old_mode = lock->mode;
nl.new_mode = SCOUTFS_LOCK_NULL;
ret = scoutfs_client_lock_request(sb, &nl);
if (ret) {
/* oh well, not freeing */
scoutfs_inc_counter(sb, lock_shrink_aborted);
spin_lock(&linfo->lock);
lock->request_pending = 0;
wake_up(&lock->waitq);
put_lock(linfo, lock);
spin_unlock(&linfo->lock);
}
}
}
static unsigned long lock_count_objects(struct shrinker *shrink,
struct shrink_control *sc)
{
struct lock_info *linfo = KC_SHRINKER_CONTAINER_OF(shrink, struct lock_info);
struct super_block *sb = linfo->sb;
scoutfs_inc_counter(sb, lock_count_objects);
return shrinker_min_long(linfo->lru_nr);
}
/*
* Start the shrinking process for locks on the lru. If a lock is on
* the lru then it can't have any active users. We don't want to block
* or allocate here so all we do is get the lock, mark it request
* pending, and kick off the work. The work sends a null request and
* eventually the lock is freed by its response.
*
* Only a racing lock attempt that isn't matched can prevent the lock
* from being freed. It'll block waiting to send its request for its
* mode which will prevent the lock from being freed when the null
* response arrives.
*/
static unsigned long lock_scan_objects(struct shrinker *shrink,
struct shrink_control *sc)
{
struct lock_info *linfo = KC_SHRINKER_CONTAINER_OF(shrink, struct lock_info);
struct super_block *sb = linfo->sb;
struct scoutfs_lock *lock;
struct scoutfs_lock *tmp;
unsigned long freed = 0;
unsigned long nr = sc->nr_to_scan;
bool added = false;
scoutfs_inc_counter(sb, lock_scan_objects);
spin_lock(&linfo->lock);
restart:
list_for_each_entry_safe(lock, tmp, &linfo->lru_list, lru_head) {
BUG_ON(!lock_idle(lock));
BUG_ON(lock->mode == SCOUTFS_LOCK_NULL);
BUG_ON(!list_empty(&lock->shrink_head));
if (nr-- == 0)
break;
__lock_del_lru(linfo, lock);
lock->request_pending = 1;
list_add_tail(&lock->shrink_head, &linfo->shrink_list);
added = true;
freed++;
scoutfs_inc_counter(sb, lock_shrink_attempted);
trace_scoutfs_lock_shrink(sb, lock);
/* could have bazillions of idle locks */
if (cond_resched_lock(&linfo->lock))
goto restart;
}
spin_unlock(&linfo->lock);
if (added)
queue_work(linfo->workq, &linfo->shrink_work);
trace_scoutfs_lock_shrink_exit(sb, sc->nr_to_scan, freed);
return freed;
}
void scoutfs_free_unused_locks(struct super_block *sb)
{
struct lock_info *linfo = SCOUTFS_SB(sb)->lock_info;
struct shrink_control sc = {
.gfp_mask = GFP_NOFS,
.nr_to_scan = INT_MAX,
};
DECLARE_LOCK_INFO(sb, linfo);
lock_scan_objects(KC_SHRINKER_FN(&linfo->shrinker), &sc);
while (try_shrink_lock(sb, linfo, true))
cond_resched();
}
static void lock_tseq_show(struct seq_file *m, struct scoutfs_tseq_entry *ent)
@@ -1590,10 +1542,10 @@ u64 scoutfs_lock_ino_refresh_gen(struct super_block *sb, u64 ino)
* transitions and sending requests. We set the shutdown flag to catch
* anyone who breaks this rule.
*
* We unregister the shrinker so that we won't try and send null
* requests in response to memory pressure. The locks will all be
* unceremoniously dropped once we get a farewell response from the
* server which indicates that they destroyed our locking state.
* With no more lock callers, we'll no longer try to shrink the pool of
* granted locks. We'll free all of them as _destroy() is called after
* the farewell response indicates that the server tore down all our
* lock state.
*
* We will still respond to invalidation requests that have to be
* processed to let unmount in other mounts acquire locks and make
@@ -1613,10 +1565,6 @@ void scoutfs_lock_shutdown(struct super_block *sb)
trace_scoutfs_lock_shutdown(sb, linfo);
/* stop the shrinker from queueing work */
KC_UNREGISTER_SHRINKER(&linfo->shrinker);
flush_work(&linfo->shrink_work);
/* cause current and future lock calls to return errors */
spin_lock(&linfo->lock);
linfo->shutdown = true;
@@ -1707,8 +1655,6 @@ void scoutfs_lock_destroy(struct super_block *sb)
list_del_init(&lock->inv_head);
lock->invalidate_pending = 0;
}
if (!list_empty(&lock->shrink_head))
list_del_init(&lock->shrink_head);
lock_remove(linfo, lock);
lock_free(linfo, lock);
}
@@ -1733,14 +1679,9 @@ int scoutfs_lock_setup(struct super_block *sb)
spin_lock_init(&linfo->lock);
linfo->lock_tree = RB_ROOT;
linfo->lock_range_tree = RB_ROOT;
KC_INIT_SHRINKER_FUNCS(&linfo->shrinker, lock_count_objects,
lock_scan_objects);
KC_REGISTER_SHRINKER(&linfo->shrinker, "scoutfs-lock:" SCSBF, SCSB_ARGS(sb));
INIT_LIST_HEAD(&linfo->lru_list);
INIT_WORK(&linfo->inv_work, lock_invalidate_worker);
INIT_LIST_HEAD(&linfo->inv_list);
INIT_WORK(&linfo->shrink_work, lock_shrink_worker);
INIT_LIST_HEAD(&linfo->shrink_list);
atomic64_set(&linfo->next_refresh_gen, 0);
scoutfs_tseq_tree_init(&linfo->tseq_tree, lock_tseq_show);

View File

@@ -65,6 +65,7 @@ int scoutfs_lock_invalidate_request(struct super_block *sb, u64 net_id,
int scoutfs_lock_recover_request(struct super_block *sb, u64 net_id,
struct scoutfs_key *key);
void scoutfs_lock_get_fs_item_range(u64 ino, struct scoutfs_key *start, struct scoutfs_key *end);
int scoutfs_lock_inode(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
struct inode *inode, struct scoutfs_lock **ret_lock);
int scoutfs_lock_ino(struct super_block *sb, enum scoutfs_lock_mode mode, int flags, u64 ino,

View File

@@ -506,6 +506,19 @@ out:
* because we don't know which locks they'll hold. Once recover
* finishes the server calls us to kick all the locks that were waiting
* during recovery.
*
* The calling server shuts down if we return errors indicating that we
* weren't able to ensure forward progress in the lock state machine.
*
* Failure to send to a disconnected client is not a fatal error.
* During normal disconnection the client's state is removed before
* their connection is destroyed. We can't use state to try and send to
* a non-existing connection. But a client that fails to reconnect is
* disconnected before being fenced. If we have multiple disconnected
* clients we can try to send to one while cleaning up another. If
* they've uncleanly disconnected their locks are going to be removed
* and the lock can make forward progress again. Or we'll shutdown for
* failure to fence.
*/
static int process_waiting_requests(struct super_block *sb,
struct server_lock_node *snode)
@@ -597,6 +610,10 @@ static int process_waiting_requests(struct super_block *sb,
out:
put_server_lock(inf, snode);
/* disconnected clients will be fenced, trying to send to them isn't fatal */
if (ret == -ENOTCONN)
ret = 0;
return ret;
}

View File

@@ -35,6 +35,12 @@ do { \
} \
} while (0) \
#define scoutfs_bug_on_err(sb, err, fmt, args...) \
do { \
__typeof__(err) _err = (err); \
scoutfs_bug_on(sb, _err < 0 && _err != -ENOLINK, fmt, ##args); \
} while (0)
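
A usage sketch (the call site is hypothetical): the helper asserts like the open-coded BUG_ON()s it replaces, tolerating only -ENOLINK:

	ret = scoutfs_client_lock_response(sb, net_id, nl);
	scoutfs_bug_on_err(sb, ret, "lock response, net_id %llu", net_id);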
/*
* Each message is only generated once per volume. Remounting resets
* the messages.

View File

@@ -21,6 +21,7 @@
#include <net/tcp.h>
#include <linux/log2.h>
#include <linux/jhash.h>
#include <linux/rbtree.h>
#include "format.h"
#include "counters.h"
@@ -125,6 +126,7 @@ struct message_send {
unsigned long dead:1;
struct list_head head;
scoutfs_net_response_t resp_func;
struct rb_node node;
void *resp_data;
struct scoutfs_net_header nh;
};
@@ -161,49 +163,118 @@ static bool nh_is_request(struct scoutfs_net_header *nh)
return !nh_is_response(nh);
}
static int cmp_sorted_msend(u64 pos, struct message_send *msend)
{
if (nh_is_request(&msend->nh))
return pos < le64_to_cpu(msend->nh.id) ? -1 :
pos > le64_to_cpu(msend->nh.id) ? 1 : 0;
else
return pos < le64_to_cpu(msend->nh.seq) ? -1 :
pos > le64_to_cpu(msend->nh.seq) ? 1 : 0;
}
static struct message_send *search_sorted_msends(struct rb_root *root, u64 pos, struct rb_node *ins)
{
struct rb_node **node = &root->rb_node;
struct rb_node *parent = NULL;
struct message_send *msend = NULL;
struct message_send *next = NULL;
int cmp = -1;
while (*node) {
parent = *node;
msend = container_of(*node, struct message_send, node);
cmp = cmp_sorted_msend(pos, msend);
if (cmp < 0) {
next = msend;
node = &(*node)->rb_left;
} else if (cmp > 0) {
node = &(*node)->rb_right;
} else {
next = msend;
break;
}
}
BUG_ON(cmp == 0 && ins);
if (ins) {
rb_link_node(ins, parent, node);
rb_insert_color(ins, root);
}
return next;
}
static struct message_send *next_sorted_msend(struct message_send *msend)
{
struct rb_node *node = rb_next(&msend->node);
return node ? rb_entry(node, struct message_send, node) : NULL;
}
#define for_each_sorted_msend(MSEND_, TMP_, ROOT_, POS_) \
for (MSEND_ = search_sorted_msends(ROOT_, POS_, NULL); \
MSEND_ != NULL && ({ TMP_ = next_sorted_msend(MSEND_); true; }); \
MSEND_ = TMP_)
static void insert_sorted_msend(struct scoutfs_net_connection *conn, struct message_send *msend)
{
BUG_ON(!RB_EMPTY_NODE(&msend->node));
if (nh_is_request(&msend->nh))
search_sorted_msends(&conn->req_root, le64_to_cpu(msend->nh.id), &msend->node);
else
search_sorted_msends(&conn->resp_root, le64_to_cpu(msend->nh.seq), &msend->node);
}
static void erase_sorted_msend(struct scoutfs_net_connection *conn, struct message_send *msend)
{
if (!RB_EMPTY_NODE(&msend->node)) {
if (nh_is_request(&msend->nh))
rb_erase(&msend->node, &conn->req_root);
else
rb_erase(&msend->node, &conn->resp_root);
RB_CLEAR_NODE(&msend->node);
}
}
static void move_sorted_msends(struct scoutfs_net_connection *dst_conn, struct rb_root *dst_root,
struct scoutfs_net_connection *src_conn, struct rb_root *src_root)
{
struct message_send *msend;
struct message_send *tmp;
for_each_sorted_msend(msend, tmp, src_root, 0) {
erase_sorted_msend(src_conn, msend);
insert_sorted_msend(dst_conn, msend);
}
}
/*
* We return dead requests so that the caller can stop searching other
* lists for the dead request that we found.
* Pending requests are uniquely identified by the id they were assigned
* as they were first put on the send queue.
*/
static struct message_send *search_list(struct scoutfs_net_connection *conn,
struct list_head *list,
u8 cmd, u64 id)
static struct message_send *find_request(struct scoutfs_net_connection *conn, u8 cmd, u64 id)
{
struct message_send *msend;
assert_spin_locked(&conn->lock);
list_for_each_entry(msend, list, head) {
if (nh_is_request(&msend->nh) && msend->nh.cmd == cmd &&
le64_to_cpu(msend->nh.id) == id)
return msend;
}
return NULL;
}
/*
* Find an active send request on the lists. It's almost certainly
* waiting on the resend queue but it could be actively being sent.
*/
static struct message_send *find_request(struct scoutfs_net_connection *conn,
u8 cmd, u64 id)
{
struct message_send *msend;
msend = search_list(conn, &conn->resend_queue, cmd, id) ?:
search_list(conn, &conn->send_queue, cmd, id);
if (msend && msend->dead)
msend = search_sorted_msends(&conn->req_root, id, NULL);
if (msend && !(msend->nh.cmd == cmd && le64_to_cpu(msend->nh.id) == id))
msend = NULL;
return msend;
}
/*
* Complete a send message by moving it to the send queue and marking it
* to be freed. It won't be visible to callers trying to find sends.
* Free a send message by moving it to the send queue and marking it
* dead. It is removed from the sorted rb roots so it won't be visible
* as a request for response processing.
*/
static void complete_send(struct scoutfs_net_connection *conn,
struct message_send *msend)
static void queue_dead_free(struct scoutfs_net_connection *conn, struct message_send *msend)
{
assert_spin_locked(&conn->lock);
@@ -213,6 +284,7 @@ static void complete_send(struct scoutfs_net_connection *conn,
msend->dead = 1;
list_move(&msend->head, &conn->send_queue);
erase_sorted_msend(conn, msend);
queue_work(conn->workq, &conn->send_work);
}
@@ -264,7 +336,7 @@ static inline u8 net_err_from_host(struct super_block *sb, int error)
error);
}
return -EINVAL;
return SCOUTFS_NET_ERR_EINVAL;
}
return net_errs[ind];
@@ -370,6 +442,7 @@ static int submit_send(struct super_block *sb,
msend->resp_func = resp_func;
msend->resp_data = resp_data;
msend->dead = 0;
RB_CLEAR_NODE(&msend->node);
msend->nh.seq = cpu_to_le64(seq);
msend->nh.recv_seq = 0; /* set when sent, not when queued */
@@ -390,6 +463,7 @@ static int submit_send(struct super_block *sb,
} else {
list_add_tail(&msend->head, &conn->resend_queue);
}
insert_sorted_msend(conn, msend);
if (id_ret)
*id_ret = le64_to_cpu(msend->nh.id);
@@ -459,7 +533,7 @@ static int process_response(struct scoutfs_net_connection *conn,
if (msend) {
resp_func = msend->resp_func;
resp_data = msend->resp_data;
complete_send(conn, msend);
queue_dead_free(conn, msend);
} else {
scoutfs_inc_counter(sb, net_dropped_response);
}
@@ -550,43 +624,21 @@ static void queue_ordered_proc(struct scoutfs_net_connection *conn, struct messa
* Free live responses up to and including the seq by marking them dead
* and moving them to the send queue to be freed.
*/
static bool move_acked_responses(struct scoutfs_net_connection *conn,
struct list_head *list, u64 seq)
static void free_acked_responses(struct scoutfs_net_connection *conn, u64 seq)
{
struct message_send *msend;
struct message_send *tmp;
bool moved = false;
assert_spin_locked(&conn->lock);
list_for_each_entry_safe(msend, tmp, list, head) {
if (le64_to_cpu(msend->nh.seq) > seq)
break;
if (!nh_is_response(&msend->nh) || msend->dead)
continue;
msend->dead = 1;
list_move(&msend->head, &conn->send_queue);
moved = true;
}
return moved;
}
/* acks are processed inline in the recv worker */
static void free_acked_responses(struct scoutfs_net_connection *conn, u64 seq)
{
bool moved;
spin_lock(&conn->lock);
moved = move_acked_responses(conn, &conn->send_queue, seq) |
move_acked_responses(conn, &conn->resend_queue, seq);
for_each_sorted_msend(msend, tmp, &conn->resp_root, 0) {
if (le64_to_cpu(msend->nh.seq) > seq)
break;
queue_dead_free(conn, msend);
}
spin_unlock(&conn->lock);
if (moved)
queue_work(conn->workq, &conn->send_work);
}
static int k_recvmsg(struct socket *sock, void *buf, unsigned len)
@@ -824,9 +876,11 @@ static int k_sendmsg_full(struct socket *sock, struct kvec *kv, unsigned long nr
return ret;
}
static void free_msend(struct net_info *ninf, struct message_send *msend)
static void free_msend(struct net_info *ninf, struct scoutfs_net_connection *conn,
struct message_send *msend)
{
list_del_init(&msend->head);
erase_sorted_msend(conn, msend);
scoutfs_tseq_del(&ninf->msg_tseq_tree, &msend->tseq_entry);
kfree(msend);
}
@@ -866,9 +920,10 @@ static void scoutfs_net_send_worker(struct work_struct *work)
count = 0;
spin_lock(&conn->lock);
list_for_each_entry_safe(msend, _msend_, &conn->send_queue, head) {
if (msend->dead) {
free_msend(ninf, msend);
free_msend(ninf, conn, msend);
continue;
}
@@ -957,7 +1012,7 @@ static void scoutfs_net_destroy_worker(struct work_struct *work)
list_splice_init(&conn->resend_queue, &conn->send_queue);
list_for_each_entry_safe(msend, tmp, &conn->send_queue, head)
free_msend(ninf, msend);
free_msend(ninf, conn, msend);
/* accepted sockets are removed from their listener's list */
if (conn->listening_conn) {
@@ -1303,7 +1358,7 @@ static void scoutfs_net_shutdown_worker(struct work_struct *work)
struct message_send, head))) {
resp_func = msend->resp_func;
resp_data = msend->resp_data;
free_msend(ninf, msend);
free_msend(ninf, conn, msend);
spin_unlock(&conn->lock);
call_resp_func(sb, conn, resp_func, resp_data, NULL, 0, -ECONNABORTED);
@@ -1319,7 +1374,7 @@ static void scoutfs_net_shutdown_worker(struct work_struct *work)
list_splice_tail_init(&conn->send_queue, &conn->resend_queue);
list_for_each_entry_safe(msend, tmp, &conn->resend_queue, head) {
if (msend->nh.cmd == SCOUTFS_NET_CMD_GREETING)
free_msend(ninf, msend);
free_msend(ninf, conn, msend);
}
clear_conn_fl(conn, saw_greeting);
@@ -1493,6 +1548,8 @@ scoutfs_net_alloc_conn(struct super_block *sb,
atomic64_set(&conn->recv_seq, 0);
INIT_LIST_HEAD(&conn->send_queue);
INIT_LIST_HEAD(&conn->resend_queue);
conn->req_root = RB_ROOT;
conn->resp_root = RB_ROOT;
INIT_WORK(&conn->listen_work, scoutfs_net_listen_worker);
INIT_WORK(&conn->connect_work, scoutfs_net_connect_worker);
INIT_WORK(&conn->send_work, scoutfs_net_send_worker);
@@ -1705,7 +1762,7 @@ void scoutfs_net_client_greeting(struct super_block *sb,
atomic64_set(&conn->recv_seq, 0);
list_for_each_entry_safe(msend, tmp, &conn->resend_queue, head){
if (nh_is_response(&msend->nh))
free_msend(ninf, msend);
free_msend(ninf, conn, msend);
}
}
@@ -1808,6 +1865,8 @@ restart:
BUG_ON(!list_empty(&reconn->send_queue));
/* queued greeting response is racing, can be in send or resend queue */
list_splice_tail_init(&reconn->resend_queue, &conn->resend_queue);
move_sorted_msends(conn, &conn->req_root, reconn, &reconn->req_root);
move_sorted_msends(conn, &conn->resp_root, reconn, &reconn->resp_root);
/* new conn info is unused, swap, old won't call down */
swap(conn->info, reconn->info);

View File

@@ -67,6 +67,8 @@ struct scoutfs_net_connection {
u64 next_send_id;
struct list_head send_queue;
struct list_head resend_queue;
struct rb_root req_root;
struct rb_root resp_root;
atomic64_t recv_seq;
unsigned int ordered_proc_nr;

View File

@@ -33,6 +33,8 @@ enum {
Opt_acl,
Opt_data_prealloc_blocks,
Opt_data_prealloc_contig_only,
Opt_ino_alloc_per_lock,
Opt_lock_idle_count,
Opt_log_merge_wait_timeout_ms,
Opt_metadev_path,
Opt_noacl,
@@ -47,6 +49,8 @@ static const match_table_t tokens = {
{Opt_acl, "acl"},
{Opt_data_prealloc_blocks, "data_prealloc_blocks=%s"},
{Opt_data_prealloc_contig_only, "data_prealloc_contig_only=%s"},
{Opt_ino_alloc_per_lock, "ino_alloc_per_lock=%s"},
{Opt_lock_idle_count, "lock_idle_count=%s"},
{Opt_log_merge_wait_timeout_ms, "log_merge_wait_timeout_ms=%s"},
{Opt_metadev_path, "metadev_path=%s"},
{Opt_noacl, "noacl"},
@@ -117,6 +121,10 @@ static void free_options(struct scoutfs_mount_options *opts)
kfree(opts->metadev_path);
}
#define MIN_LOCK_IDLE_COUNT 32
#define DEFAULT_LOCK_IDLE_COUNT (10 * 1000)
#define MAX_LOCK_IDLE_COUNT (100 * 1000)
#define MIN_LOG_MERGE_WAIT_TIMEOUT_MS 100UL
#define DEFAULT_LOG_MERGE_WAIT_TIMEOUT_MS 500
#define MAX_LOG_MERGE_WAIT_TIMEOUT_MS (60 * MSEC_PER_SEC)
@@ -136,6 +144,8 @@ static void init_default_options(struct scoutfs_mount_options *opts)
opts->data_prealloc_blocks = SCOUTFS_DATA_PREALLOC_DEFAULT_BLOCKS;
opts->data_prealloc_contig_only = 1;
opts->ino_alloc_per_lock = SCOUTFS_LOCK_INODE_GROUP_NR;
opts->lock_idle_count = DEFAULT_LOCK_IDLE_COUNT;
opts->log_merge_wait_timeout_ms = DEFAULT_LOG_MERGE_WAIT_TIMEOUT_MS;
opts->orphan_scan_delay_ms = -1;
opts->quorum_heartbeat_timeout_ms = SCOUTFS_QUORUM_DEF_HB_TIMEO_MS;
@@ -143,6 +153,21 @@ static void init_default_options(struct scoutfs_mount_options *opts)
opts->tcp_keepalive_timeout_ms = DEFAULT_TCP_KEEPALIVE_TIMEOUT_MS;
}
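
For illustration only (device paths and the sysfs location are hypothetical), the two new options can be given at mount time or tuned later through the options sysfs files:

# hypothetical paths; values must fall within the documented ranges
mount -t scoutfs -o metadev_path=/dev/meta,ino_alloc_per_lock=64,lock_idle_count=5000 /dev/data /mnt
echo 20000 > /sys/fs/scoutfs/<fsid>/options/lock_idle_count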
static int verify_lock_idle_count(struct super_block *sb, int ret, int val)
{
if (ret < 0) {
scoutfs_err(sb, "failed to parse lock_idle_count value");
return -EINVAL;
}
if (val < MIN_LOCK_IDLE_COUNT || val > MAX_LOCK_IDLE_COUNT) {
scoutfs_err(sb, "invalid lock_idle_count value %d, must be between %u and %u",
val, MIN_LOCK_IDLE_COUNT, MAX_LOCK_IDLE_COUNT);
return -EINVAL;
}
return 0;
}
static int verify_log_merge_wait_timeout_ms(struct super_block *sb, int ret, int val)
{
if (ret < 0) {
@@ -238,6 +263,18 @@ static int parse_options(struct super_block *sb, char *options, struct scoutfs_m
opts->data_prealloc_contig_only = nr;
break;
case Opt_ino_alloc_per_lock:
ret = match_int(args, &nr);
if (ret < 0 || nr < 1 || nr > SCOUTFS_LOCK_INODE_GROUP_NR) {
scoutfs_err(sb, "invalid ino_alloc_per_lock option, must be between 1 and %u",
SCOUTFS_LOCK_INODE_GROUP_NR);
if (ret == 0)
ret = -EINVAL;
return ret;
}
opts->ino_alloc_per_lock = nr;
break;
case Opt_tcp_keepalive_timeout_ms:
ret = match_int(args, &nr);
ret = verify_tcp_keepalive_timeout_ms(sb, ret, nr);
@@ -246,6 +283,14 @@ static int parse_options(struct super_block *sb, char *options, struct scoutfs_m
opts->tcp_keepalive_timeout_ms = nr;
break;
case Opt_lock_idle_count:
ret = match_int(args, &nr);
ret = verify_lock_idle_count(sb, ret, nr);
if (ret < 0)
return ret;
opts->lock_idle_count = nr;
break;
case Opt_log_merge_wait_timeout_ms:
ret = match_int(args, &nr);
ret = verify_log_merge_wait_timeout_ms(sb, ret, nr);
@@ -393,6 +438,7 @@ int scoutfs_options_show(struct seq_file *seq, struct dentry *root)
seq_puts(seq, ",acl");
seq_printf(seq, ",data_prealloc_blocks=%llu", opts.data_prealloc_blocks);
seq_printf(seq, ",data_prealloc_contig_only=%u", opts.data_prealloc_contig_only);
seq_printf(seq, ",ino_alloc_per_lock=%u", opts.ino_alloc_per_lock);
seq_printf(seq, ",metadev_path=%s", opts.metadev_path);
if (!is_acl)
seq_puts(seq, ",noacl");
@@ -481,6 +527,82 @@ static ssize_t data_prealloc_contig_only_store(struct kobject *kobj, struct kobj
}
SCOUTFS_ATTR_RW(data_prealloc_contig_only);
static ssize_t ino_alloc_per_lock_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
struct scoutfs_mount_options opts;
scoutfs_options_read(sb, &opts);
return snprintf(buf, PAGE_SIZE, "%u", opts.ino_alloc_per_lock);
}
static ssize_t ino_alloc_per_lock_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t count)
{
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
DECLARE_OPTIONS_INFO(sb, optinf);
char nullterm[20]; /* more than enough for octal -U32_MAX */
long val;
int len;
int ret;
len = min(count, sizeof(nullterm) - 1);
memcpy(nullterm, buf, len);
nullterm[len] = '\0';
ret = kstrtol(nullterm, 0, &val);
if (ret < 0 || val < 1 || val > SCOUTFS_LOCK_INODE_GROUP_NR) {
scoutfs_err(sb, "invalid ino_alloc_per_lock option, must be between 1 and %u",
SCOUTFS_LOCK_INODE_GROUP_NR);
return -EINVAL;
}
write_seqlock(&optinf->seqlock);
optinf->opts.ino_alloc_per_lock = val;
write_sequnlock(&optinf->seqlock);
return count;
}
SCOUTFS_ATTR_RW(ino_alloc_per_lock);
static ssize_t lock_idle_count_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
struct scoutfs_mount_options opts;
scoutfs_options_read(sb, &opts);
return snprintf(buf, PAGE_SIZE, "%u", opts.lock_idle_count);
}
static ssize_t lock_idle_count_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t count)
{
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
DECLARE_OPTIONS_INFO(sb, optinf);
char nullterm[30]; /* more than enough for octal -U64_MAX */
int val;
int len;
int ret;
len = min(count, sizeof(nullterm) - 1);
memcpy(nullterm, buf, len);
nullterm[len] = '\0';
ret = kstrtoint(nullterm, 0, &val);
ret = verify_lock_idle_count(sb, ret, val);
if (ret == 0) {
write_seqlock(&optinf->seqlock);
optinf->opts.lock_idle_count = val;
write_sequnlock(&optinf->seqlock);
ret = count;
}
return ret;
}
SCOUTFS_ATTR_RW(lock_idle_count);
static ssize_t log_merge_wait_timeout_ms_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
@@ -621,6 +743,8 @@ SCOUTFS_ATTR_RO(quorum_slot_nr);
static struct attribute *options_attrs[] = {
SCOUTFS_ATTR_PTR(data_prealloc_blocks),
SCOUTFS_ATTR_PTR(data_prealloc_contig_only),
SCOUTFS_ATTR_PTR(ino_alloc_per_lock),
SCOUTFS_ATTR_PTR(lock_idle_count),
SCOUTFS_ATTR_PTR(log_merge_wait_timeout_ms),
SCOUTFS_ATTR_PTR(metadev_path),
SCOUTFS_ATTR_PTR(orphan_scan_delay_ms),

View File

@@ -8,6 +8,8 @@
struct scoutfs_mount_options {
u64 data_prealloc_blocks;
bool data_prealloc_contig_only;
unsigned int ino_alloc_per_lock;
int lock_idle_count;
unsigned int log_merge_wait_timeout_ms;
char *metadev_path;
unsigned int orphan_scan_delay_ms;

View File

@@ -1195,8 +1195,8 @@ static struct attribute *quorum_attrs[] = {
static inline bool valid_ipv4_unicast(__be32 addr)
{
return !(ipv4_is_multicast(addr) && ipv4_is_lbcast(addr) &&
ipv4_is_zeronet(addr) && ipv4_is_local_multicast(addr));
return !(ipv4_is_multicast(addr) || ipv4_is_lbcast(addr) ||
ipv4_is_zeronet(addr) || ipv4_is_local_multicast(addr));
}
static inline bool valid_ipv4_port(__be16 port)

View File

@@ -34,6 +34,7 @@
#include "totl.h"
#include "util.h"
#include "quota.h"
#include "trans.h"
#include "counters.h"
#include "scoutfs_trace.h"
@@ -1086,6 +1087,10 @@ int scoutfs_quota_mod_rule(struct super_block *sb, bool is_add,
if (ret < 0)
goto out;
ret = scoutfs_hold_trans(sb, true);
if (ret < 0)
goto out;
down_write(&qtinf->rwsem);
if (is_add) {
@@ -1095,28 +1100,30 @@ int scoutfs_quota_mod_rule(struct super_block *sb, bool is_add,
else if (ret == 0)
ret = -EEXIST;
if (ret < 0)
goto unlock;
goto release;
rule_to_rule_val(&rv, &rule);
ret = scoutfs_item_create(sb, &key, &rv, sizeof(rv), lock);
if (ret < 0)
goto unlock;
goto release;
} else {
ret = find_rule(sb, &rule, &key, lock) ?:
scoutfs_item_delete(sb, &key, lock);
if (ret < 0)
goto unlock;
goto release;
}
scoutfs_quota_invalidate(sb);
ret = 0;
unlock:
release:
up_write(&qtinf->rwsem);
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
scoutfs_release_trans(sb);
out:
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
if (is_add)
trace_scoutfs_quota_add_rule(sb, &rule, ret);
else

744
kmod/src/raw.c Normal file
View File

@@ -0,0 +1,744 @@
/*
* Copyright (C) 2026 Versity Software, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/list_sort.h>
#include <linux/sort.h>
#include "format.h"
#include "key.h"
#include "block.h"
#include "inode.h"
#include "forest.h"
#include "client.h"
#include "ioctl.h"
#include "lock.h"
#include "xattr.h"
#include "attr_x.h"
#include "bsearch_index.h"
#include "raw.h"
struct fs_item {
struct list_head head;
struct scoutfs_key key;
u64 seq;
int val_len;
bool deletion;
/* val is aligned so we can deref structs in vals */
u8 val[0] __aligned(ARCH_KMALLOC_MINALIGN);
};
static int save_fs_item(struct list_head *list, struct scoutfs_key *key, u64 seq, u8 flags,
void *val, int val_len)
{
struct fs_item *fsi;
/* max btree val len is hundreds of bytes */
fsi = kmalloc(offsetof(struct fs_item, val[val_len]), GFP_NOFS);
if (!fsi)
return -ENOMEM;
fsi->key = *key;
fsi->seq = seq;
fsi->val_len = val_len;
fsi->deletion = !!(flags & SCOUTFS_ITEM_FLAG_DELETION);
if (val_len > 0)
memcpy(fsi->val, val, val_len);
list_add_tail(&fsi->head, list);
return 0;
}
static void free_fs_item(struct fs_item *fsi)
{
if (!list_empty(&fsi->head))
list_del_init(&fsi->head);
kfree(fsi);
}
static void free_fs_items(struct list_head *list)
{
struct fs_item *fsi;
struct fs_item *tmp;
list_for_each_entry_safe(fsi, tmp, list, head)
free_fs_item(fsi);
}
static struct fs_item *next_fs_item(struct list_head *list, struct fs_item *fsi)
{
list_for_each_entry_continue(fsi, list, head)
return fsi;
return NULL;
}
static int cmp_fs_items(void *priv, KC_LIST_CMP_CONST struct list_head *A,
KC_LIST_CMP_CONST struct list_head *B)
{
KC_LIST_CMP_CONST struct fs_item *a =
container_of(A, KC_LIST_CMP_CONST struct fs_item, head);
KC_LIST_CMP_CONST struct fs_item *b =
container_of(B, KC_LIST_CMP_CONST struct fs_item, head);
return scoutfs_key_compare(&a->key, &b->key) ?: -scoutfs_cmp(a->seq, b->seq);
}
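/*
 * Sort the saved items by key, newest seq first, and resolve them:
 * items past the caller's end are dropped, only the newest version of
 * each key survives, and deletion items are removed once they've
 * cancelled all the older versions of their key.
 */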
static void sort_and_remove(struct list_head *list, struct scoutfs_key *end)
{
struct fs_item *prev;
struct fs_item *fsi;
struct fs_item *tmp;
list_sort(NULL, list, cmp_fs_items);
/* start by removing any items read before end was decreased by later blocks */
list_for_each_entry_safe_reverse(fsi, tmp, list, head) {
if (scoutfs_key_compare(&fsi->key, end) > 0)
free_fs_item(fsi);
else
break;
}
prev = NULL;
list_for_each_entry_safe(fsi, tmp, list, head) {
/* remove this item if it's an older version of previous item */
if (prev && scoutfs_key_compare(&prev->key, &fsi->key) == 0) {
free_fs_item(fsi);
continue;
}
/* remove previous deletion item once it has removed all older versions */
if (prev && prev->deletion)
free_fs_item(prev);
/* next item might match this, record to compare */
prev = fsi;
}
/* remove the last item if it's a deletion */
list_for_each_entry_reverse(fsi, list, head) {
if (fsi->deletion)
free_fs_item(fsi);
break;
}
}
static int save_all_items(struct super_block *sb, struct scoutfs_key *key, u64 seq, u8 flags,
void *val, int val_len, int fic, void *arg)
{
struct list_head *list = arg;
return save_fs_item(list, key, seq, flags, val, val_len);
}
/* -------------- */
static void ms_from_key(struct scoutfs_ioctl_meta_seq *ms, struct scoutfs_key *key)
{
ms->meta_seq = le64_to_cpu(key->skii_major);
ms->ino = le64_to_cpu(key->skii_ino);
}
/*
* Increment the key's ino, carrying into meta_seq when ino wraps, so
* that we don't land between items.
*/
static void inc_meta_seq(struct scoutfs_key *key)
{
le64_add_cpu(&key->skii_ino, 1);
if (key->skii_ino == 0)
le64_add_cpu(&key->skii_major, 1);
}
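/*
 * Walk the meta_seq index between the caller's start and end positions
 * without cluster locking, copying entries to userspace. Items are
 * read from a snapshot of btree roots and retried a few times if the
 * blocks go stale before anything was copied. If the caller's results
 * buffer fills, we return with end set just before the first entry
 * that didn't fit so the caller can resume from there.
 */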
int scoutfs_raw_read_meta_seq(struct super_block *sb,
struct scoutfs_ioctl_raw_read_meta_seq *rms,
struct scoutfs_ioctl_meta_seq *last_ret)
{
struct scoutfs_ioctl_meta_seq __user *ums;
struct scoutfs_ioctl_meta_seq ms;
struct scoutfs_net_roots roots;
DECLARE_SAVED_REFS(saved);
struct scoutfs_key start;
struct scoutfs_key last;
struct scoutfs_key key;
struct scoutfs_key end;
struct fs_item *fsi;
struct fs_item *tmp;
LIST_HEAD(list);
int retries;
int copied;
int count;
int ret;
ums = (void __user *)rms->results_ptr;
count = rms->results_size / sizeof(struct scoutfs_ioctl_meta_seq);
retries = 10;
copied = 0;
scoutfs_inode_init_index_key(&last, SCOUTFS_INODE_INDEX_META_SEQ_TYPE,
rms->end.meta_seq, 0, rms->end.ino);
retry:
ret = scoutfs_client_get_roots(sb, &roots);
if (ret)
goto out;
scoutfs_inode_init_index_key(&key, SCOUTFS_INODE_INDEX_META_SEQ_TYPE,
rms->start.meta_seq, 0, rms->start.ino);
for (;;) {
start = key;
end = last;
ret = scoutfs_forest_read_items_roots(sb, &roots, 0, &key, NULL, &start, &end,
save_all_items, &list);
if (ret < 0)
goto out;
sort_and_remove(&list, &end);
list_for_each_entry_safe(fsi, tmp, &list, head) {
if (copied == count) {
/* results are full, set end to just before the item we can't return */
end = fsi->key;
le64_add_cpu(&end.skii_ino, -1ULL);
ret = 0;
goto out;
}
ms_from_key(&ms, &fsi->key);
if (copy_to_user(&ums[copied], &ms, sizeof(ms))) {
ret = -EFAULT;
goto out;
}
free_fs_item(fsi);
copied++;
}
if (scoutfs_key_compare(&end, &last) >= 0) {
end = last;
break;
}
key = end;
inc_meta_seq(&key);
}
ret = 0;
out:
free_fs_items(&list);
ret = scoutfs_block_check_stale(sb, ret, &saved, &roots.fs_root.ref, &roots.logs_root.ref);
if (ret == -ESTALE && copied == 0 && retries-- > 0)
goto retry;
ms_from_key(last_ret, &end);
return ret ?: copied;
}
/* -------------- */
struct inode_info_context {
size_t nr_inos;
u64 *inos;
size_t nr_names;
struct xattr_name {
u64 hash;
char *name;
u8 name_len; /* no null */
} *names;
struct list_head fs_items;
};
static int cmp_u64(const void *A, const void *B)
{
const u64 *a = A;
const u64 *b = B;
return scoutfs_cmp(*a, *b);
}
static int cmp_name_hash(const void *A, const void *B)
{
const struct xattr_name *a = A;
const struct xattr_name *b = B;
return scoutfs_cmp(a->hash, b->hash);
}
static int cmp_name_string(const void *A, const void *B)
{
const struct xattr_name *a = A;
const struct xattr_name *b = B;
return scoutfs_cmp(a->name_len, b->name_len) ?: memcmp(a->name, b->name, a->name_len);
}
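/*
 * Copy the caller's ino and xattr name arrays in from userspace and
 * validate them. Inos must be non-zero and strictly increasing, and
 * names must be unique with valid lengths. The names are left sorted
 * by hash to match the order we'll encounter xattr item keys.
 */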
static int setup_context(struct inode_info_context *ctx,
struct scoutfs_ioctl_raw_read_inode_info *rii)
{
__u64 __user *uinos = (void __user *)rii->inos_ptr;
char __user *uname;
long len_null;
long len;
int ret;
u32 i;
ctx->nr_inos = rii->inos_count;
ctx->nr_names = rii->names_count;
INIT_LIST_HEAD(&ctx->fs_items);
ctx->inos = kvmalloc_array(ctx->nr_inos, sizeof(ctx->inos[0]), GFP_KERNEL);
ctx->names = kvcalloc(ctx->nr_names, sizeof(ctx->names[0]), GFP_KERNEL);
if (!ctx->inos || !ctx->names) {
ret = -ENOMEM;
goto out;
}
if (copy_from_user(ctx->inos, uinos, ctx->nr_inos * sizeof(ctx->inos[0]))) {
ret = -EFAULT;
goto out;
}
/* inos must not be 0 and must increase and contain no duplicates */
if (ctx->inos[0] == 0) {
ret = -EINVAL;
goto out;
}
for (i = 1; i < ctx->nr_inos; i++) {
if (ctx->inos[i] <= ctx->inos[i - 1]) {
ret = -EINVAL;
goto out;
}
}
uname = (void __user *)rii->names_ptr;
for (i = 0; i < ctx->nr_names; i++) {
len_null = SCOUTFS_XATTR_MAX_NAME_LEN + 1;
ret = strnlen_user(uname, len_null);
if (ret <= 1 || ret > len_null) {
if (ret >= 0)
ret = -EINVAL;
goto out;
}
len_null = ret;
len = len_null - 1;
ctx->names[i].name_len = len;
ctx->names[i].name = kmalloc(len_null, GFP_KERNEL);
if (!ctx->names[i].name) {
ret = -ENOMEM;
goto out;
}
ret = strncpy_from_user(ctx->names[i].name, uname, len_null);
if (ret != len) {
if (ret >= 0)
ret = -EINVAL;
goto out;
}
ctx->names[i].hash = scoutfs_xattr_name_hash(ctx->names[i].name, len);
uname += len_null;
}
/* make sure all the names differ */
sort(ctx->names, ctx->nr_names, sizeof(ctx->names[0]), cmp_name_string, NULL);
for (i = 1; i < ctx->nr_names; i++) {
if (cmp_name_string(&ctx->names[i - 1], &ctx->names[i]) == 0) {
ret = -EINVAL;
goto out;
}
}
/* then leave them sorted by hash */
sort(ctx->names, ctx->nr_names, sizeof(ctx->names[0]), cmp_name_hash, NULL);
ret = 0;
out:
return ret;
}
static void free_context(struct inode_info_context *ctx)
{
int i;
kvfree(ctx->inos);
if (ctx->names) {
for (i = 0; i < ctx->nr_names; i++) {
if (!ctx->names[i].name)
break;
kfree(ctx->names[i].name);
}
kvfree(ctx->names);
}
}
/*
* Iterate over fs items and save any that we're interested in. We want
* inode struct items and any xattr items whose hashes collide with the
* xattr names we're searching for.
*
* Our forest calls can be advancing through the key space as we see
* slices that intersect with blocks in trees. And each forest caller
* can be resetting the key position to the start of each forest block
* it reads in an intersection.
*
* From this callback's perspective, the key can be jumping all over the
* place. We don't have any iterative position state. For each key we
* decide if we want to save it and then set the key to the next key we
* want after the current key. We'll combine all the saved keys later.
*/
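/*
 * As a hypothetical example, searching inos {5, 9} for one name with
 * hash H: a key in ino 7 jumps to ino 9's inode key and returns
 * -ESRCH; ino 5's inode item is saved and the key is pointed at
 * (ino 5, xattr hash H); xattr items matching H are saved while
 * iteration continues; and a key past ino 5's xattrs jumps to ino 9,
 * or to the all-ones key when no searched inos remain.
 */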
static int save_info_items(struct super_block *sb, struct scoutfs_key *key, u64 seq,
u8 flags, void *val, int val_len, int fic, void *arg)
{
u64 ino = le64_to_cpu(key->_sk_first);
struct inode_info_context *ctx = arg;
struct xattr_name name;
size_t name_ind;
size_t ino_ind;
bool hash_match;
bool ino_match;
int ret;
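/* bsearch_index() returns the slot the ino would occupy, check for an exact match */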
ino_ind = bsearch_index(&ino, ctx->inos, ctx->nr_inos, sizeof(ctx->inos[0]), cmp_u64);
ino_match = ino_ind < ctx->nr_inos && ctx->inos[ino_ind] == ino;
/* jump to the next ino, which could be this key's ino if we're before the inode struct */
if (!ino_match || key->sk_type < SCOUTFS_INODE_TYPE)
goto next_inode;
/* find our search position in xattrs */
if (key->sk_type < SCOUTFS_XATTR_TYPE) {
name_ind = 0;
hash_match = false;
} else if (key->sk_type == SCOUTFS_XATTR_TYPE) {
name = (struct xattr_name) { .hash = le64_to_cpu(key->skx_name_hash) };
name_ind = bsearch_index(&name, ctx->names, ctx->nr_names, sizeof(ctx->names[0]),
cmp_name_hash);
hash_match = name_ind < ctx->nr_names && ctx->names[name_ind].hash == name.hash;
} else {
name_ind = ctx->nr_names;
hash_match = false;
}
/* save inode items for our search and all xattr items that match search hashes */
if (key->sk_type == SCOUTFS_INODE_TYPE || hash_match) {
ret = save_fs_item(&ctx->fs_items, key, seq, flags, val, val_len);
if (ret < 0)
goto out;
}
/* let the caller continue iterating through matching xattr items */
if (hash_match) {
ret = 0;
goto out;
}
/* jump to the next xattr */
if (name_ind < ctx->nr_names) {
scoutfs_xattr_init_key(key, ino, ctx->names[name_ind].hash, 0);
ret = -ESRCH;
goto out;
}
/* no more xattrs, must be done with this ino */
ino_ind++;
next_inode:
/* now jump to next inode struct key, or we're done */
if (ino_ind < ctx->nr_inos)
scoutfs_inode_init_key(key, ctx->inos[ino_ind]);
else
scoutfs_key_set_ones(key);
ret = -ESRCH;
out:
return ret;
}
static int copy_to_user_off(void __user *dst, size_t *dst_off, size_t dst_size,
void *src, size_t copy_size)
{
if (copy_size == 0)
return 0;
if (*dst_off + copy_size > dst_size)
return -ERANGE;
if (copy_to_user(dst + *dst_off, src, copy_size))
return -EFAULT;
*dst_off += copy_size;
return 0;
}
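/*
 * Each result copied to the user buffer is a raw_read_result header
 * whose size covers the payload that follows: the ino and inode struct
 * for inode items, or the null-terminated xattr name followed by the
 * value bytes that are appended as each xattr part item is copied.
 */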
static int copy_result_to_user(void __user *ures, size_t *off, size_t size, u8 type,
void *a_data, size_t a_len, void *b_data, size_t b_len,
size_t extra_size)
{
struct scoutfs_ioctl_raw_read_result res;
const size_t szof_res = sizeof(struct scoutfs_ioctl_raw_read_result);
memzero_explicit(&res, szof_res);
res = (struct scoutfs_ioctl_raw_read_result) {
.size = a_len + b_len + extra_size,
.type = type,
};
return copy_to_user_off(ures, off, size, &res, szof_res) ?:
(a_len ? copy_to_user_off(ures, off, size, a_data, a_len) : 0) ?:
(b_len ? copy_to_user_off(ures, off, size, b_data, b_len) : 0);
}
static int copy_item_results_to_user(struct super_block *sb, struct inode_info_context *ctx,
void __user *ures, size_t *off, size_t size,
struct fs_item *fsi)
{
struct scoutfs_inode *cinode;
struct scoutfs_xattr *xat;
static char null = '\0';
size_t len;
u64 ino;
int ret = 0;
if (fsi->key.sk_type == SCOUTFS_INODE_TYPE) {
cinode = (void *)fsi->val;
ino = le64_to_cpu(fsi->key.ski_ino);
ret = copy_result_to_user(ures, off, size, SCOUTFS_IOC_RAW_READ_RESULT_INODE,
&ino, sizeof(ino), cinode, sizeof(struct scoutfs_inode),
0);
} else if (fsi->key.sk_type == SCOUTFS_XATTR_TYPE) {
if (fsi->key.skx_part == 0) {
xat = (void *)fsi->val;
ret = copy_result_to_user(ures, off, size,
SCOUTFS_IOC_RAW_READ_RESULT_XATTR, xat->name,
xat->name_len, &null, sizeof(null),
le16_to_cpu(xat->val_len));
if (ret == 0 && xat->val_len != 0) {
/* then append the start of the value */
len = fsi->val_len -
offsetof(struct scoutfs_xattr, name[xat->name_len]);
ret = copy_to_user_off(ures, off, size, xat->name + xat->name_len,
len);
}
} else {
/* continue appending partial values */
ret = copy_to_user_off(ures, off, size, fsi->val, fsi->val_len);
}
}
return ret;
}
static bool ignore_zero_nlink(struct inode_info_context *ctx, struct fs_item *fsi)
{
struct scoutfs_inode *cinode = (void *)fsi->val;
return cinode->nlink == 0;
}
static bool ignore_xattr_name(struct inode_info_context *ctx, struct fs_item *fsi)
{
struct scoutfs_xattr *xat = (void *)fsi->val;
struct xattr_name name = {
.hash = le64_to_cpu(fsi->key.skx_name_hash),
.name = xat->name,
.name_len = xat->name_len,
};
size_t i;
for (i = bsearch_index(&name, ctx->names, ctx->nr_names, sizeof(ctx->names[0]),
cmp_name_hash);
i < ctx->nr_names && name.hash == ctx->names[i].hash; i++) {
if (cmp_name_string(&name, &ctx->names[i]) == 0)
return false;
}
return true;
}
static int copy_results_to_user(struct super_block *sb, struct inode_info_context *ctx,
struct scoutfs_ioctl_raw_read_inode_info *rii)
{
void __user *ures = (void __user *)rii->results_ptr;
struct scoutfs_xattr *xat;
struct fs_item *next;
struct fs_item *fsi;
struct fs_item *tmp;
size_t xattr_end;
size_t off;
__le64 in_ino;
__le64 in_id;
int ret;
in_ino = 0;
xattr_end = 0;
in_id = 0;
off = 0;
list_for_each_entry_safe(fsi, tmp, &ctx->fs_items, head) {
/*
* ignore:
* - inodes with an nlink of 0
* - all items for an ino after the inode struct that we're ignoring
* - first xattr parts with a name we don't need
* - additional xattr parts when we ignored the first
*/
if ((fsi->key.sk_type == SCOUTFS_INODE_TYPE && ignore_zero_nlink(ctx, fsi)) ||
(fsi->key.sk_type > SCOUTFS_INODE_TYPE && fsi->key._sk_first != in_ino) ||
(fsi->key.sk_type == SCOUTFS_XATTR_TYPE &&
((fsi->key.skx_part == 0 && ignore_xattr_name(ctx, fsi)) ||
(fsi->key.skx_part > 0 && fsi->key.skx_id != in_id)))) {
free_fs_item(fsi);
in_ino = 0;
in_id = 0;
continue;
}
/* advance ino/xattr stream context state machine */
if (fsi->key.sk_type == SCOUTFS_INODE_TYPE) {
in_ino = fsi->key.ski_ino;
in_id = 0;
} else if (fsi->key.sk_type == SCOUTFS_XATTR_TYPE && fsi->key.skx_part == 0) {
in_id = fsi->key.skx_id;
/* save the required offset after the complete xattr */
xat = (void *)fsi->val;
xattr_end = off + sizeof(struct scoutfs_ioctl_raw_read_result) +
xat->name_len + 1 + le16_to_cpu(xat->val_len);
}
/* copy results, usually with header, but additional xattr parts copied raw */
ret = copy_item_results_to_user(sb, ctx, ures, &off, rii->results_size, fsi);
if (ret < 0)
goto out;
/* make sure we saw all xattr parts and copied the correct size */
if (xattr_end > 0 &&
!((next = next_fs_item(&ctx->fs_items, fsi)) &&
next->key.sk_type == SCOUTFS_XATTR_TYPE && next->key.skx_ino == in_ino &&
next->key.skx_id == in_id)) {
if (off != xattr_end) {
ret = -EUCLEAN;
goto out;
}
xattr_end = 0;
}
}
ret = 0;
out:
return ret ?: off;
}
/*
* If the key is for an inode we're not interested in, or if it's past
* the xattr items, then advance to the next inode. This is used
* between forest read items calls to avoid leaf blocks. The callback
* takes care of iterating through the items for an inode across
* multiple leaves.
*/
static void advance_key_ino(struct scoutfs_key *key, struct inode_info_context *ctx)
{
u64 ino = le64_to_cpu(key->_sk_first);
size_t ino_ind;
ino_ind = bsearch_index(&ino, ctx->inos, ctx->nr_inos, sizeof(ctx->inos[0]), cmp_u64);
if (ino_ind < ctx->nr_inos && ctx->inos[ino_ind] == ino) {
if (key->sk_type <= SCOUTFS_XATTR_TYPE)
return;
else
ino_ind++;
}
if (ino_ind < ctx->nr_inos)
scoutfs_inode_init_key(key, ctx->inos[ino_ind]);
else
scoutfs_key_set_ones(key);
}
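/*
 * Read inode structs, and the caller's named xattrs, for a set of
 * inodes in bulk without cluster locking. We read items from a
 * snapshot of btree roots in batches bounded by lock ranges, sort and
 * resolve each batch, and then copy the surviving items to userspace.
 */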
int scoutfs_raw_read_inode_info(struct super_block *sb,
struct scoutfs_ioctl_raw_read_inode_info *rii)
{
struct inode_info_context ctx = {0, };
struct scoutfs_net_roots roots;
DECLARE_SAVED_REFS(saved);
struct scoutfs_key lock_start;
struct scoutfs_key lock_end;
struct scoutfs_key start;
struct scoutfs_key last;
struct scoutfs_key key;
struct scoutfs_key end;
LIST_HEAD(list);
int retries = 10;
int ret;
ret = setup_context(&ctx, rii);
if (ret < 0)
goto out;
if (ctx.nr_names > 0)
scoutfs_xattr_init_key(&last, ctx.inos[ctx.nr_inos - 1],
ctx.names[ctx.nr_names - 1].hash, U64_MAX);
else
scoutfs_inode_init_key(&last, ctx.inos[ctx.nr_inos - 1]);
retry:
ret = scoutfs_client_get_roots(sb, &roots);
if (ret)
goto out;
scoutfs_inode_init_key(&key, ctx.inos[0]);
while (scoutfs_key_compare(&key, &last) <= 0) {
scoutfs_lock_get_fs_item_range(le64_to_cpu(key._sk_first), &lock_start, &lock_end);
start = key;
end = last;
if (scoutfs_key_compare(&lock_end, &end) < 0)
end = lock_end;
ret = scoutfs_forest_read_items_roots(sb, &roots, 0, &key, &lock_start,
&start, &end, save_info_items, &ctx);
if (ret < 0)
goto out;
/* save each sorted batch, might have partial results for an inode */
sort_and_remove(&ctx.fs_items, &end);
list_splice_tail_init(&ctx.fs_items, &list);
key = end;
if (!scoutfs_key_is_ones(&key)) {
scoutfs_key_inc(&key);
advance_key_ino(&key, &ctx);
}
}
list_splice_tail_init(&list, &ctx.fs_items);
ret = copy_results_to_user(sb, &ctx, rii);
out:
free_fs_items(&list);
free_fs_items(&ctx.fs_items);
ret = scoutfs_block_check_stale(sb, ret, &saved, &roots.fs_root.ref, &roots.logs_root.ref);
if (ret == -ESTALE && retries-- > 0)
goto retry;
free_context(&ctx);
return ret;
}

10
kmod/src/raw.h Normal file
View File

@@ -0,0 +1,10 @@
#ifndef _SCOUTFS_RAW_H_
#define _SCOUTFS_RAW_H_
int scoutfs_raw_read_meta_seq(struct super_block *sb,
struct scoutfs_ioctl_raw_read_meta_seq *rms,
struct scoutfs_ioctl_meta_seq *last_ret);
int scoutfs_raw_read_inode_info(struct super_block *sb,
struct scoutfs_ioctl_raw_read_inode_info *rii);
#endif

View File

@@ -789,6 +789,80 @@ TRACE_EVENT(scoutfs_inode_walk_writeback,
__entry->ino, __entry->write, __entry->ret)
);
TRACE_EVENT(scoutfs_orphan_scan_start,
TP_PROTO(struct super_block *sb),
TP_ARGS(sb),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
),
TP_printk(SCSBF, SCSB_TRACE_ARGS)
);
TRACE_EVENT(scoutfs_orphan_scan_stop,
TP_PROTO(struct super_block *sb, bool work_todo),
TP_ARGS(sb, work_todo),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(bool, work_todo)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->work_todo = work_todo;
),
TP_printk(SCSBF" work_todo %d", SCSB_TRACE_ARGS, __entry->work_todo)
);
TRACE_EVENT(scoutfs_orphan_scan_work,
TP_PROTO(struct super_block *sb, __u64 ino),
TP_ARGS(sb, ino),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, ino)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->ino = ino;
),
TP_printk(SCSBF" ino %llu", SCSB_TRACE_ARGS,
__entry->ino)
);
TRACE_EVENT(scoutfs_orphan_scan_end,
TP_PROTO(struct super_block *sb, __u64 ino, int ret),
TP_ARGS(sb, ino, ret),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, ino)
__field(int, ret)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->ino = ino;
__entry->ret = ret;
),
TP_printk(SCSBF" ino %llu ret %d", SCSB_TRACE_ARGS,
__entry->ino, __entry->ret)
);
DECLARE_EVENT_CLASS(scoutfs_lock_info_class,
TP_PROTO(struct super_block *sb, struct lock_info *linfo),
@@ -1036,6 +1110,82 @@ TRACE_EVENT(scoutfs_orphan_inode,
MINOR(__entry->dev), __entry->ino)
);
DECLARE_EVENT_CLASS(scoutfs_try_delete_class,
TP_PROTO(struct super_block *sb, u64 ino),
TP_ARGS(sb, ino),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, ino)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->ino = ino;
),
TP_printk(SCSBF" ino %llu", SCSB_TRACE_ARGS, __entry->ino)
);
DEFINE_EVENT(scoutfs_try_delete_class, scoutfs_try_delete,
TP_PROTO(struct super_block *sb, u64 ino),
TP_ARGS(sb, ino)
);
DEFINE_EVENT(scoutfs_try_delete_class, scoutfs_try_delete_local_busy,
TP_PROTO(struct super_block *sb, u64 ino),
TP_ARGS(sb, ino)
);
DEFINE_EVENT(scoutfs_try_delete_class, scoutfs_try_delete_cached,
TP_PROTO(struct super_block *sb, u64 ino),
TP_ARGS(sb, ino)
);
DEFINE_EVENT(scoutfs_try_delete_class, scoutfs_try_delete_no_item,
TP_PROTO(struct super_block *sb, u64 ino),
TP_ARGS(sb, ino)
);
TRACE_EVENT(scoutfs_try_delete_has_links,
TP_PROTO(struct super_block *sb, u64 ino, unsigned int nlink),
TP_ARGS(sb, ino, nlink),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, ino)
__field(unsigned int, nlink)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->ino = ino;
__entry->nlink = nlink;
),
TP_printk(SCSBF" ino %llu nlink %u", SCSB_TRACE_ARGS, __entry->ino,
__entry->nlink)
);
TRACE_EVENT(scoutfs_inode_orphan_delete,
TP_PROTO(struct super_block *sb, u64 ino, int ret),
TP_ARGS(sb, ino, ret),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, ino)
__field(int, ret)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->ino = ino;
__entry->ret = ret;
),
TP_printk(SCSBF" ino %llu ret %d", SCSB_TRACE_ARGS, __entry->ino,
__entry->ret)
);
TRACE_EVENT(scoutfs_delete_inode,
TP_PROTO(struct super_block *sb, u64 ino, umode_t mode, u64 size),
@@ -1060,6 +1210,32 @@ TRACE_EVENT(scoutfs_delete_inode,
__entry->mode, __entry->size)
);
TRACE_EVENT(scoutfs_delete_inode_end,
TP_PROTO(struct super_block *sb, u64 ino, umode_t mode, u64 size, int ret),
TP_ARGS(sb, ino, mode, size, ret),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(__u64, ino)
__field(umode_t, mode)
__field(__u64, size)
__field(int, ret)
),
TP_fast_assign(
__entry->dev = sb->s_dev;
__entry->ino = ino;
__entry->mode = mode;
__entry->size = size;
__entry->ret = ret;
),
TP_printk("dev %d,%d ino %llu, mode 0x%x size %llu, ret %d",
MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino,
__entry->mode, __entry->size, __entry->ret)
);
DECLARE_EVENT_CLASS(scoutfs_key_class,
TP_PROTO(struct super_block *sb, struct scoutfs_key *key),
TP_ARGS(sb, key),
@@ -1443,28 +1619,6 @@ DEFINE_EVENT(scoutfs_work_class, scoutfs_data_return_server_extents_exit,
TP_ARGS(sb, data, ret)
);
DECLARE_EVENT_CLASS(scoutfs_shrink_exit_class,
TP_PROTO(struct super_block *sb, unsigned long nr_to_scan, int ret),
TP_ARGS(sb, nr_to_scan, ret),
TP_STRUCT__entry(
__field(void *, sb)
__field(unsigned long, nr_to_scan)
__field(int, ret)
),
TP_fast_assign(
__entry->sb = sb;
__entry->nr_to_scan = nr_to_scan;
__entry->ret = ret;
),
TP_printk("sb %p nr_to_scan %lu ret %d",
__entry->sb, __entry->nr_to_scan, __entry->ret)
);
DEFINE_EVENT(scoutfs_shrink_exit_class, scoutfs_lock_shrink_exit,
TP_PROTO(struct super_block *sb, unsigned long nr_to_scan, int ret),
TP_ARGS(sb, nr_to_scan, ret)
);
TRACE_EVENT(scoutfs_rename,
TP_PROTO(struct super_block *sb, struct inode *old_dir,
struct dentry *old_dentry, struct inode *new_dir,
@@ -3097,6 +3251,24 @@ TRACE_EVENT(scoutfs_ioc_search_xattrs,
__entry->ino, __entry->last_ino)
);
TRACE_EVENT(scoutfs_trigger_fired,
TP_PROTO(struct super_block *sb, const char *name),
TP_ARGS(sb, name),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(const char *, name)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->name = name;
),
TP_printk(SCSBF" %s", SCSB_TRACE_ARGS, __entry->name)
);
#endif /* _TRACE_SCOUTFS_H */
/* This part must be outside protection */

View File

@@ -41,6 +41,7 @@
#include "recov.h"
#include "omap.h"
#include "fence.h"
#include "triggers.h"
/*
* Every active mount can act as the server that listens on a net
@@ -255,6 +256,14 @@ static void server_down(struct server_info *server)
cmpxchg(&server->status, was, SERVER_DOWN);
}
static void init_mounted_client_key(struct scoutfs_key *key, u64 rid)
{
*key = (struct scoutfs_key) {
.sk_zone = SCOUTFS_MOUNTED_CLIENT_ZONE,
.skmc_rid = cpu_to_le64(rid),
};
}
/*
* The per-holder allocation block use budget balances batching
* efficiency and concurrency. The larger this gets, the fewer
@@ -962,6 +971,28 @@ static int find_log_trees_item(struct super_block *sb,
return ret;
}
/*
* Return true if the given rid has a mounted_clients entry.
*/
static bool rid_is_mounted(struct super_block *sb, u64 rid)
{
DECLARE_SERVER_INFO(sb, server);
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
SCOUTFS_BTREE_ITEM_REF(iref);
struct scoutfs_key key;
int ret;
init_mounted_client_key(&key, rid);
mutex_lock(&server->mounted_clients_mutex);
ret = scoutfs_btree_lookup(sb, &super->mounted_clients, &key, &iref);
if (ret == 0)
scoutfs_btree_put_iref(&iref);
mutex_unlock(&server->mounted_clients_mutex);
return ret == 0;
}
/*
* Find the log_trees item with the greatest nr for each rid. Fills the
* caller's log_trees and sets the key before the returned log_trees for
@@ -994,10 +1025,11 @@ static int for_each_rid_last_lt(struct super_block *sb, struct scoutfs_btree_roo
}
/*
* Log merge range items are stored at the starting fs key of the range.
* The only fs key field that doesn't hold information is the zone, so
* we use the zone to differentiate all types that we store in the log
* merge tree.
* Log merge range items are stored at the starting fs key of the range
* with the zone overwritten to indicate the log merge item type. This
* day0 mistake loses sorting information for items in the different
* zones in the fs root, so the range items aren't strictly sorted by
* the starting key of their range.
*/
static void init_log_merge_key(struct scoutfs_key *key, u8 zone, u64 first,
u64 second)
@@ -1029,6 +1061,51 @@ static int next_log_merge_item_key(struct super_block *sb, struct scoutfs_btree_
return ret;
}
/*
* The range items aren't sorted by their range.start because
* _RANGE_ZONE clobbers the range's zone. We sweep all the items and
* find the range with the least starting key that's at or after the
* caller's starting key. We have to be careful to iterate over the
* log_merge tree keys because the ranges can overlap as they're mapped
* to the log_merge keys by clobbering their zone.
*/
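/*
 * As a hypothetical example: with ranges whose starts are A, B, and C
 * stored in some unrelated key order, a caller advancing its starting
 * key is handed A, then B, then C, even though that isn't the order
 * the items appear in the log_merge tree.
 */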
static int next_log_merge_range(struct super_block *sb, struct scoutfs_btree_root *root,
struct scoutfs_key *start, struct scoutfs_log_merge_range *rng)
{
struct scoutfs_log_merge_range *next;
SCOUTFS_BTREE_ITEM_REF(iref);
struct scoutfs_key key;
int ret;
key = *start;
key.sk_zone = SCOUTFS_LOG_MERGE_RANGE_ZONE;
scoutfs_key_set_ones(&rng->start);
do {
ret = scoutfs_btree_next(sb, root, &key, &iref);
if (ret == 0) {
if (iref.key->sk_zone != SCOUTFS_LOG_MERGE_RANGE_ZONE) {
ret = -ENOENT;
} else if (iref.val_len != sizeof(struct scoutfs_log_merge_range)) {
ret = -EIO;
} else {
next = iref.val;
if (scoutfs_key_compare(&next->start, &rng->start) < 0 &&
scoutfs_key_compare(&next->start, start) >= 0)
*rng = *next;
key = *iref.key;
scoutfs_key_inc(&key);
}
scoutfs_btree_put_iref(&iref);
}
} while (ret == 0);
if (ret == -ENOENT && !scoutfs_key_is_ones(&rng->start))
ret = 0;
return ret;
}
static int next_log_merge_item(struct super_block *sb,
struct scoutfs_btree_root *root,
u8 zone, u64 first, u64 second,
@@ -1174,6 +1251,60 @@ static int do_finalize_ours(struct super_block *sb,
* happens to arrive at just the right time. That's fine, merging will
* ignore and tear down the empty input.
*/
static int reclaim_open_log_tree(struct super_block *sb, u64 rid);
/*
* Reclaim log trees for rids that have no mounted_clients entry.
* They block merges by appearing active. reclaim_open_log_tree
* may need multiple commits to drain allocators (-EINPROGRESS).
*
* The caller holds logs_mutex and a commit, both are dropped and
* re-acquired around each reclaim call. Returns >0 if any orphans
* were reclaimed so the caller can re-check state that may have
* changed while the lock was dropped.
*/
static int reclaim_orphan_log_trees(struct super_block *sb, u64 rid,
struct commit_hold *hold)
{
struct server_info *server = SCOUTFS_SB(sb)->server_info;
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
struct scoutfs_log_trees lt;
struct scoutfs_key key;
bool found = false;
u64 orphan_rid;
int ret;
int err;
scoutfs_key_init_log_trees(&key, U64_MAX, U64_MAX);
while ((ret = for_each_rid_last_lt(sb, &super->logs_root, &key, &lt)) > 0) {
if ((le64_to_cpu(lt.flags) & SCOUTFS_LOG_TREES_FINALIZED) ||
le64_to_cpu(lt.rid) == rid ||
rid_is_mounted(sb, le64_to_cpu(lt.rid)))
continue;
orphan_rid = le64_to_cpu(lt.rid);
scoutfs_err(sb, "reclaiming orphan log trees for rid %016llx nr %llu",
orphan_rid, le64_to_cpu(lt.nr));
found = true;
do {
mutex_unlock(&server->logs_mutex);
err = reclaim_open_log_tree(sb, orphan_rid);
ret = server_apply_commit(sb, hold,
err == -EINPROGRESS ? 0 : err);
server_hold_commit(sb, hold);
mutex_lock(&server->logs_mutex);
} while (err == -EINPROGRESS && ret == 0);
if (ret < 0)
break;
}
return ret < 0 ? ret : found;
}
#define FINALIZE_POLL_MIN_DELAY_MS 5U
#define FINALIZE_POLL_MAX_DELAY_MS 100U
#define FINALIZE_POLL_DELAY_GROWTH_PCT 150U
@@ -1214,6 +1345,16 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
break;
}
ret = reclaim_orphan_log_trees(sb, rid, hold);
if (ret < 0) {
err_str = "reclaiming orphan log trees";
break;
}
if (ret > 0) {
/* lock was dropped, re-check merge status */
continue;
}
/* look for finalized and other active log btrees */
saw_finalized = false;
others_active = false;
@@ -1245,9 +1386,13 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
* meta was low so that deleted items are merged
* promptly and freed blocks can bring the client out of
* enospc.
*
* The trigger can be used to force a log merge in cases where
* a test only generates small amounts of change.
*/
finalize_ours = (lt->item_root.height > 2) ||
(le32_to_cpu(lt->meta_avail.flags) & SCOUTFS_ALLOC_FLAG_LOW);
(le32_to_cpu(lt->meta_avail.flags) & SCOUTFS_ALLOC_FLAG_LOW) ||
scoutfs_trigger(sb, LOG_MERGE_FORCE_FINALIZE_OURS);
trace_scoutfs_server_finalize_decision(sb, rid, saw_finalized, others_active,
ours_visible, finalize_ours, delay_ms,
@@ -1356,6 +1501,8 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
BUG_ON(err); /* inconsistent */
}
scoutfs_inc_counter(sb, log_merge_start);
/* we're done, caller can make forward progress */
break;
}
@@ -1572,7 +1719,8 @@ static int server_get_log_trees(struct super_block *sb,
goto update;
}
ret = alloc_move_empty(sb, &super->data_alloc, &lt.data_freed, 100);
ret = alloc_move_empty(sb, &super->data_alloc, &lt.data_freed,
COMMIT_HOLD_ALLOC_BUDGET / 2);
if (ret == -EINPROGRESS)
ret = 0;
if (ret < 0) {
@@ -1682,6 +1830,7 @@ static int server_commit_log_trees(struct super_block *sb,
int ret;
if (arg_len != sizeof(struct scoutfs_log_trees)) {
err_str = "invalid message log_trees size";
ret = -EINVAL;
goto out;
}
@@ -1745,7 +1894,7 @@ static int server_commit_log_trees(struct super_block *sb,
ret = scoutfs_btree_update(sb, &server->alloc, &server->wri,
&super->logs_root, &key, &lt, sizeof(lt));
BUG_ON(ret < 0); /* dirtying should have guaranteed success */
BUG_ON(ret < 0); /* dirtying should have guaranteed success, srch item inconsistent */
if (ret < 0)
err_str = "updating log trees item";
@@ -1753,11 +1902,10 @@ unlock:
mutex_unlock(&server->logs_mutex);
ret = server_apply_commit(sb, &hold, ret);
out:
if (ret < 0)
scoutfs_err(sb, "server error %d committing client logs for rid %016llx, nr %llu: %s",
ret, rid, le64_to_cpu(lt.nr), err_str);
out:
WARN_ON_ONCE(ret < 0);
return scoutfs_net_response(sb, conn, cmd, id, ret, NULL, 0);
}
@@ -1867,13 +2015,15 @@ static int reclaim_open_log_tree(struct super_block *sb, u64 rid)
scoutfs_alloc_splice_list(sb, &server->alloc, &server->wri, server->other_freed,
&lt.meta_avail)) ?:
(err_str = "empty data_avail",
alloc_move_empty(sb, &super->data_alloc, &lt.data_avail, 100)) ?:
alloc_move_empty(sb, &super->data_alloc, &lt.data_avail,
COMMIT_HOLD_ALLOC_BUDGET / 2)) ?:
(err_str = "empty data_freed",
alloc_move_empty(sb, &super->data_alloc, &lt.data_freed, 100));
alloc_move_empty(sb, &super->data_alloc, &lt.data_freed,
COMMIT_HOLD_ALLOC_BUDGET / 2));
mutex_unlock(&server->alloc_mutex);
/* only finalize, allowing merging, once the allocators are fully freed */
if (ret == 0) {
if (ret == 0 && !scoutfs_trigger(sb, RECLAIM_SKIP_FINALIZE)) {
/* the transaction is no longer open */
lt.commit_trans_seq = lt.get_trans_seq;
@@ -1925,7 +2075,8 @@ static int get_stable_trans_seq(struct super_block *sb, u64 *last_seq_ret)
scoutfs_key_init_log_trees(&key, U64_MAX, U64_MAX);
while ((ret = for_each_rid_last_lt(sb, &super->logs_root, &key, &lt)) > 0) {
if ((le64_to_cpu(lt.get_trans_seq) > le64_to_cpu(lt.commit_trans_seq)) &&
le64_to_cpu(lt.get_trans_seq) <= last_seq) {
le64_to_cpu(lt.get_trans_seq) <= last_seq &&
rid_is_mounted(sb, le64_to_cpu(lt.rid))) {
last_seq = le64_to_cpu(lt.get_trans_seq) - 1;
}
}
@@ -2094,7 +2245,7 @@ static int server_srch_get_compact(struct super_block *sb,
apply:
ret = server_apply_commit(sb, &hold, ret);
WARN_ON_ONCE(ret < 0 && ret != -ENOENT); /* XXX leaked busy item */
WARN_ON_ONCE(ret < 0 && ret != -ENOENT && ret != -ENOLINK); /* XXX leaked busy item */
out:
ret = scoutfs_net_response(sb, conn, cmd, id, ret,
sc, sizeof(struct scoutfs_srch_compact));
@@ -2460,6 +2611,8 @@ static int splice_log_merge_completions(struct super_block *sb,
queue_work(server->wq, &server->log_merge_free_work);
else
err_str = "deleting merge status item";
scoutfs_inc_counter(sb, log_merge_complete);
out:
if (upd_stat) {
init_log_merge_key(&key, SCOUTFS_LOG_MERGE_STATUS_ZONE, 0, 0);
@@ -2472,10 +2625,9 @@ out:
}
}
if (ret < 0)
scoutfs_err(sb, "server error %d splicing log merge completion: %s", ret, err_str);
BUG_ON(ret); /* inconsistent */
/* inconsistent */
scoutfs_bug_on_err(sb, ret,
"server error %d splicing log merge completion: %s", ret, err_str);
return ret ?: einprogress;
}
@@ -2720,10 +2872,7 @@ restart:
/* find the next range, always checking for splicing */
for (;;) {
key = stat.next_range_key;
key.sk_zone = SCOUTFS_LOG_MERGE_RANGE_ZONE;
ret = next_log_merge_item_key(sb, &super->log_merge, SCOUTFS_LOG_MERGE_RANGE_ZONE,
&key, &rng, sizeof(rng));
ret = next_log_merge_range(sb, &super->log_merge, &stat.next_range_key, &rng);
if (ret < 0 && ret != -ENOENT) {
err_str = "finding merge range item";
goto out;
@@ -2994,7 +3143,13 @@ static int server_commit_log_merge(struct super_block *sb,
SCOUTFS_LOG_MERGE_STATUS_ZONE, 0, 0,
&stat, sizeof(stat));
if (ret < 0) {
err_str = "getting merge status item";
/*
* During a retransmission, it's possible that the server
* already committed and resolved this log merge. ENOENT
* is expected in that case.
*/
if (ret != -ENOENT)
err_str = "getting merge status item";
goto out;
}
@@ -3473,14 +3628,6 @@ out:
return scoutfs_net_response(sb, conn, cmd, id, ret, &nst, sizeof(nst));
}
static void init_mounted_client_key(struct scoutfs_key *key, u64 rid)
{
*key = (struct scoutfs_key) {
.sk_zone = SCOUTFS_MOUNTED_CLIENT_ZONE,
.skmc_rid = cpu_to_le64(rid),
};
}
static bool invalid_mounted_client_item(struct scoutfs_btree_item_ref *iref)
{
return (iref->val_len != sizeof(struct scoutfs_mounted_client_btree_val));

View File

@@ -30,6 +30,11 @@ void scoutfs_totl_merge_init(struct scoutfs_totl_merging *merg)
memset(merg, 0, sizeof(struct scoutfs_totl_merging));
}
/*
* Bin the incoming merge inputs so that we can resolve delta items
* properly. Finalized logs that are merge inputs are kept separately
* from those that are not.
*/
void scoutfs_totl_merge_contribute(struct scoutfs_totl_merging *merg,
u64 seq, u8 flags, void *val, int val_len, int fic)
{
@@ -39,10 +44,10 @@ void scoutfs_totl_merge_contribute(struct scoutfs_totl_merging *merg,
merg->fs_seq = seq;
merg->fs_total = le64_to_cpu(tval->total);
merg->fs_count = le64_to_cpu(tval->count);
} else if (fic & FIC_FINALIZED) {
merg->fin_seq = seq;
merg->fin_total += le64_to_cpu(tval->total);
merg->fin_count += le64_to_cpu(tval->count);
} else if (fic & FIC_MERGE_INPUT) {
merg->inp_seq = seq;
merg->inp_total += le64_to_cpu(tval->total);
merg->inp_count += le64_to_cpu(tval->count);
} else {
merg->log_seq = seq;
merg->log_total += le64_to_cpu(tval->total);
@@ -53,15 +58,18 @@ void scoutfs_totl_merge_contribute(struct scoutfs_totl_merging *merg,
/*
* .totl. item merging has to be careful because the log btree merging
* code can write partial results to the fs_root. This means that a
* reader can see both cases where new finalized logs should be applied
* to the old fs items and where old finalized logs have already been
* applied to the partially merged fs items. Currently active logged
* items are always applied on top of all cases.
* reader can see both cases where merge input deltas should be applied
* to the old fs items and where they have already been applied to the
* partially merged fs items.
*
* Only finalized log trees that are inputs to the current merge cycle
* are tracked in the inp_ bucket. Finalized trees that aren't merge
* inputs and active log trees are always applied unconditionally since
* they cannot be in fs_root.
*
* These cases are differentiated with a combination of sequence numbers
* in items, the count of contributing xattrs, and a flag
* differentiating finalized and active logged items. This lets us
* recognize all cases, including when finalized logs were merged and
* in items and the count of contributing xattrs. This lets us
* recognize all cases, including when merge inputs were merged and
* deleted the fs item.
*/
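/*
 * As a hypothetical example: an fs item at seq 10 with merge input
 * deltas at seq 12 means the inputs haven't been merged yet, so they
 * are added in; a partially merged fs item already at seq 12 means
 * they have, so they're skipped; and no fs item at all (fs_seq == 0)
 * with a non-zero input count means the inputs are creating the item.
 */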
void scoutfs_totl_merge_resolve(struct scoutfs_totl_merging *merg, __u64 *total, __u64 *count)
@@ -75,14 +83,14 @@ void scoutfs_totl_merge_resolve(struct scoutfs_totl_merging *merg, __u64 *total,
*count = merg->fs_count;
}
/* apply finalized logs if they're newer or creating */
if (((merg->fs_seq != 0) && (merg->fin_seq > merg->fs_seq)) ||
((merg->fs_seq == 0) && (merg->fin_count > 0))) {
*total += merg->fin_total;
*count += merg->fin_count;
/* apply merge input deltas if they're newer or creating */
if (((merg->fs_seq != 0) && (merg->inp_seq > merg->fs_seq)) ||
((merg->fs_seq == 0) && (merg->inp_count > 0))) {
*total += merg->inp_total;
*count += merg->inp_count;
}
/* always apply active logs which must be newer than fs and finalized */
/* always apply non-input finalized and active logs */
if (merg->log_seq > 0) {
*total += merg->log_total;
*count += merg->log_count;

View File

@@ -7,9 +7,9 @@ struct scoutfs_totl_merging {
u64 fs_seq;
u64 fs_total;
u64 fs_count;
u64 fin_seq;
u64 fin_total;
s64 fin_count;
u64 inp_seq;
u64 inp_total;
s64 inp_count;
u64 log_seq;
u64 log_total;
s64 log_count;

View File

@@ -18,6 +18,7 @@
#include "super.h"
#include "triggers.h"
#include "scoutfs_trace.h"
/*
* We have debugfs files we can write to which arm triggers which
@@ -39,10 +40,13 @@ struct scoutfs_triggers {
static char *names[] = {
[SCOUTFS_TRIGGER_BLOCK_REMOVE_STALE] = "block_remove_stale",
[SCOUTFS_TRIGGER_LOG_MERGE_FORCE_FINALIZE_OURS] = "log_merge_force_finalize_ours",
[SCOUTFS_TRIGGER_SRCH_COMPACT_LOGS_PAD_SAFE] = "srch_compact_logs_pad_safe",
[SCOUTFS_TRIGGER_SRCH_FORCE_LOG_ROTATE] = "srch_force_log_rotate",
[SCOUTFS_TRIGGER_SRCH_MERGE_STOP_SAFE] = "srch_merge_stop_safe",
[SCOUTFS_TRIGGER_STATFS_LOCK_PURGE] = "statfs_lock_purge",
[SCOUTFS_TRIGGER_RECLAIM_SKIP_FINALIZE] = "reclaim_skip_finalize",
[SCOUTFS_TRIGGER_LOG_MERGE_FORCE_PARTIAL] = "log_merge_force_partial",
};
bool scoutfs_trigger_test_and_clear(struct super_block *sb, unsigned int t)
@@ -51,6 +55,7 @@ bool scoutfs_trigger_test_and_clear(struct super_block *sb, unsigned int t)
atomic_t *atom;
int old;
int mem;
bool fired;
BUG_ON(t >= SCOUTFS_TRIGGER_NR);
atom = &triggers->atomics[t];
@@ -64,7 +69,12 @@ bool scoutfs_trigger_test_and_clear(struct super_block *sb, unsigned int t)
mem = atomic_cmpxchg(atom, old, 0);
} while (mem && mem != old);
return !!mem;
fired = !!mem;
if (fired)
trace_scoutfs_trigger_fired(sb, names[t]);
return fired;
}
int scoutfs_setup_triggers(struct super_block *sb)

View File

@@ -3,10 +3,13 @@
enum scoutfs_trigger {
SCOUTFS_TRIGGER_BLOCK_REMOVE_STALE,
SCOUTFS_TRIGGER_LOG_MERGE_FORCE_FINALIZE_OURS,
SCOUTFS_TRIGGER_SRCH_COMPACT_LOGS_PAD_SAFE,
SCOUTFS_TRIGGER_SRCH_FORCE_LOG_ROTATE,
SCOUTFS_TRIGGER_SRCH_MERGE_STOP_SAFE,
SCOUTFS_TRIGGER_STATFS_LOCK_PURGE,
SCOUTFS_TRIGGER_RECLAIM_SKIP_FINALIZE,
SCOUTFS_TRIGGER_LOG_MERGE_FORCE_PARTIAL,
SCOUTFS_TRIGGER_NR,
};

View File

@@ -95,6 +95,7 @@ struct wkic_info {
/* block reading slow path */
struct mutex roots_mutex;
struct scoutfs_net_roots roots;
u64 merge_input_seq;
u64 roots_read_seq;
ktime_t roots_expire;
@@ -805,29 +806,79 @@ static void free_page_list(struct super_block *sb, struct list_head *list)
* read_seq number so that we can compare the age of the items in cached
* pages. Only one request to refresh the roots is in progress at a
* time. This is the slow path that's only used when the cache isn't
* populated and the roots aren't cached. The root request is fast
* enough, especially compared to the resulting item reading IO, that we
* don't mind hiding it behind a trivial mutex.
* populated and the roots aren't cached.
*
* We read roots directly from the on-disk superblock rather than
* requesting them from the server so that we can also read the
* log_merge btree from the same superblock. The merge status item
* seq tells us which finalized log trees are inputs to the current
* merge, which is needed to correctly resolve totl delta items.
*/
static int get_roots(struct super_block *sb, struct wkic_info *winf,
struct scoutfs_net_roots *roots_ret, u64 *read_seq, bool force_new)
static int refresh_roots(struct super_block *sb, struct wkic_info *winf)
{
struct scoutfs_super_block *super;
struct scoutfs_log_merge_status *stat;
SCOUTFS_BTREE_ITEM_REF(iref);
struct scoutfs_key key;
int ret;
super = kmalloc(sizeof(*super), GFP_NOFS);
if (!super)
return -ENOMEM;
ret = scoutfs_read_super(sb, super);
if (ret < 0)
goto out;
winf->roots = (struct scoutfs_net_roots){
.fs_root = super->fs_root,
.logs_root = super->logs_root,
.srch_root = super->srch_root,
};
winf->merge_input_seq = 0;
if (super->log_merge.ref.blkno) {
scoutfs_key_set_zeros(&key);
key.sk_zone = SCOUTFS_LOG_MERGE_STATUS_ZONE;
ret = scoutfs_btree_lookup(sb, &super->log_merge, &key, &iref);
if (ret == 0) {
if (iref.val_len == sizeof(*stat)) {
stat = iref.val;
winf->merge_input_seq = le64_to_cpu(stat->seq);
} else {
ret = -EUCLEAN;
}
scoutfs_btree_put_iref(&iref);
} else if (ret == -ENOENT) {
ret = 0;
}
if (ret < 0)
goto out;
}
winf->roots_read_seq++;
winf->roots_expire = ktime_add_ms(ktime_get_raw(), WKIC_CACHE_LIFETIME_MS);
out:
kfree(super);
return ret;
}
static int get_roots(struct super_block *sb, struct wkic_info *winf,
struct scoutfs_net_roots *roots_ret, u64 *merge_input_seq,
u64 *read_seq, bool force_new)
{
struct scoutfs_net_roots roots;
int ret;
mutex_lock(&winf->roots_mutex);
if (force_new || ktime_before(winf->roots_expire, ktime_get_raw())) {
ret = scoutfs_client_get_roots(sb, &roots);
ret = refresh_roots(sb, winf);
if (ret < 0)
goto out;
winf->roots = roots;
winf->roots_read_seq++;
winf->roots_expire = ktime_add_ms(ktime_get_raw(), WKIC_CACHE_LIFETIME_MS);
}
*roots_ret = winf->roots;
*merge_input_seq = winf->merge_input_seq;
*read_seq = winf->roots_read_seq;
ret = 0;
out:
@@ -870,24 +921,30 @@ static int insert_read_pages(struct super_block *sb, struct wkic_info *winf,
struct scoutfs_key end;
struct wkic_page *wpage;
LIST_HEAD(pages);
u64 read_seq;
u64 merge_input_seq;
u64 read_seq = 0;
int ret;
ret = 0;
retry_stale:
ret = get_roots(sb, winf, &roots, &read_seq, ret == -ESTALE);
ret = get_roots(sb, winf, &roots, &merge_input_seq, &read_seq, ret == -ESTALE);
if (ret < 0)
goto out;
goto check_stale;
start = *range_start;
end = *range_end;
ret = scoutfs_forest_read_items_roots(sb, &roots, key, range_start, &start, &end,
read_items_cb, &root);
ret = scoutfs_forest_read_items_roots(sb, &roots, merge_input_seq, key, range_start,
&start, &end, read_items_cb, &root);
trace_scoutfs_wkic_read_items(sb, key, &start, &end);
check_stale:
ret = scoutfs_block_check_stale(sb, ret, &saved, &roots.fs_root.ref, &roots.logs_root.ref);
if (ret < 0) {
if (ret == -ESTALE)
if (ret == -ESTALE) {
/* not safe to retry due to delta items, must restart clean */
free_item_tree(&root);
root = RB_ROOT;
goto retry_stale;
}
goto out;
}

View File

@@ -47,7 +47,7 @@
* - add acl support and call generic xattr->handlers for SYSTEM
*/
static u32 xattr_name_hash(const char *name, unsigned int name_len)
u32 scoutfs_xattr_name_hash(const char *name, unsigned int name_len)
{
return crc32c(U32_MAX, name, name_len);
}
@@ -65,8 +65,7 @@ static unsigned int xattr_nr_parts(struct scoutfs_xattr *xat)
le16_to_cpu(xat->val_len));
}
static void init_xattr_key(struct scoutfs_key *key, u64 ino, u32 name_hash,
u64 id)
void scoutfs_xattr_init_key(struct scoutfs_key *key, u64 ino, u32 name_hash, u64 id)
{
*key = (struct scoutfs_key) {
.sk_zone = SCOUTFS_FS_ZONE,
@@ -187,10 +186,10 @@ static int get_next_xattr(struct inode *inode, struct scoutfs_key *key,
return -EINVAL;
if (name_len)
name_hash = xattr_name_hash(name, name_len);
name_hash = scoutfs_xattr_name_hash(name, name_len);
init_xattr_key(key, scoutfs_ino(inode), name_hash, id);
init_xattr_key(&last, scoutfs_ino(inode), U32_MAX, U64_MAX);
scoutfs_xattr_init_key(key, scoutfs_ino(inode), name_hash, id);
scoutfs_xattr_init_key(&last, scoutfs_ino(inode), U32_MAX, U64_MAX);
for (;;) {
ret = scoutfs_item_next(sb, key, &last, xat, xat_bytes, lock);
@@ -335,8 +334,8 @@ static int create_xattr_items(struct inode *inode, u64 id, struct scoutfs_xattr
int len;
int i;
init_xattr_key(&key, scoutfs_ino(inode),
xattr_name_hash(xat->name, xat->name_len), id);
scoutfs_xattr_init_key(&key, scoutfs_ino(inode),
scoutfs_xattr_name_hash(xat->name, xat->name_len), id);
for (i = 0; i < new_parts; i++) {
key.skx_part = i;
@@ -365,7 +364,7 @@ static int delete_xattr_items(struct inode *inode, u32 name_hash, u64 id,
int ret = 0;
int i;
init_xattr_key(&key, scoutfs_ino(inode), name_hash, id);
scoutfs_xattr_init_key(&key, scoutfs_ino(inode), name_hash, id);
/* dirty additional existing old items */
for (i = 1; i < nr_parts; i++) {
@@ -407,8 +406,8 @@ static int change_xattr_items(struct inode *inode, u64 id,
int i;
int ret;
init_xattr_key(&key, scoutfs_ino(inode),
xattr_name_hash(xat->name, xat->name_len), id);
scoutfs_xattr_init_key(&key, scoutfs_ino(inode),
scoutfs_xattr_name_hash(xat->name, xat->name_len), id);
/* dirty existing old items */
for (i = 0; i < old_parts; i++) {
@@ -1224,8 +1223,8 @@ int scoutfs_xattr_drop(struct super_block *sb, u64 ino,
goto out;
}
init_xattr_key(&key, ino, 0, 0);
init_xattr_key(&last, ino, U32_MAX, U64_MAX);
scoutfs_xattr_init_key(&key, ino, 0, 0);
scoutfs_xattr_init_key(&last, ino, U32_MAX, U64_MAX);
for (;;) {
ret = scoutfs_item_next(sb, &key, &last, (void *)xat, bytes,
@@ -1265,6 +1264,7 @@ int scoutfs_xattr_drop(struct super_block *sb, u64 ino,
ret = parse_indx_key(&tag_key, xat->name, xat->name_len, ino);
if (ret < 0)
goto out;
scoutfs_xattr_set_indx_key_xid(&tag_key, le64_to_cpu(key.skx_id));
}
if ((tgs.totl || tgs.indx) && locked_zone != tag_key.sk_zone) {

View File

@@ -10,6 +10,9 @@ struct scoutfs_xattr_prefix_tags {
extern const struct xattr_handler *scoutfs_xattr_handlers[];
u32 scoutfs_xattr_name_hash(const char *name, unsigned int name_len);
void scoutfs_xattr_init_key(struct scoutfs_key *key, u64 ino, u32 name_hash, u64 id);
int scoutfs_xattr_get_locked(struct inode *inode, const char *name, void *buffer, size_t size,
struct scoutfs_lock *lck);
int scoutfs_xattr_set_locked(struct inode *inode, const char *name, size_t name_len,

1
tests/.gitignore vendored
View File

@@ -12,3 +12,4 @@ src/o_tmpfile_umask
src/o_tmpfile_linkat
src/mmap_stress
src/mmap_validate
src/watch_raw_inode_change

View File

@@ -15,7 +15,8 @@ BIN := src/createmany \
src/o_tmpfile_umask \
src/o_tmpfile_linkat \
src/mmap_stress \
src/mmap_validate
src/mmap_validate \
src/watch_raw_inode_change
DEPS := $(wildcard src/*.d)

View File

@@ -8,36 +8,33 @@
echo "$0 running rid '$SCOUTFS_FENCED_REQ_RID' ip '$SCOUTFS_FENCED_REQ_IP' args '$@'"
log() {
echo "$@" > /dev/stderr
echo_fail() {
echo "$@" >&2
exit 1
}
echo_fail() {
echo "$@" > /dev/stderr
exit 1
# silence error messages
quiet_cat()
{
cat "$@" 2>/dev/null
}
rid="$SCOUTFS_FENCED_REQ_RID"
shopt -s nullglob
for fs in /sys/fs/scoutfs/*; do
[ ! -d "$fs" ] && continue
fs_rid="$(quiet_cat $fs/rid)"
nr="$(quiet_cat $fs/data_device_maj_min)"
[ ! -d "$fs" -o "$fs_rid" != "$rid" ] && continue
fs_rid="$(cat $fs/rid)" || \
echo_fail "failed to get rid in $fs"
if [ "$fs_rid" != "$rid" ]; then
continue
mnt=$(findmnt -l -n -t scoutfs -o TARGET -S $nr)
[ -z "$mnt" ] && continue
if ! umount -qf "$mnt"; then
if [ -d "$fs" ]; then
echo_fail "umount -qf $mnt failed"
fi
fi
nr="$(cat $fs/data_device_maj_min)" || \
echo_fail "failed to get data device major:minor in $fs"
mnts=$(findmnt -l -n -t scoutfs -o TARGET -S $nr) || \
echo_fail "findmnt -t scoutfs -S $nr failed"
for mnt in $mnts; do
umount -f "$mnt" || \
echo_fail "umout -f $mnt failed"
done
done
exit 0

View File

@@ -20,9 +20,6 @@ t_filter_fs()
# [ 2687.691366] BUG: KASAN: stack-out-of-bounds in get_reg+0x1bc/0x230
# ...
# [ 2687.706220] ==================================================================
# [ 2687.707284] Disabling lock debugging due to kernel taint
#
# That final lock debugging message may not be included.
#
ignore_harmless_unwind_kasan_stack_oob()
{
@@ -46,10 +43,6 @@ awk '
saved=""
}
( in_soob == 2 && $0 ~ /==================================================================/ ) {
in_soob = 3
soob_nr = NR
}
( in_soob == 3 && NR > soob_nr && $0 !~ /Disabling lock debugging/ ) {
in_soob = 0
}
( !in_soob ) { print $0 }
@@ -61,6 +54,58 @@ awk '
'
}
#
# in el97+, XFS can generate a spurious lockdep circular dependency
# warning about reclaim. Fixed upstream in e.g. v5.7-rc4-129-g6dcde60efd94
#
ignore_harmless_xfs_lockdep_warning()
{
awk '
BEGIN {
in_block = 0
block_nr = 0
buf = ""
}
( !in_block && $0 ~ /======================================================/ ) {
in_block = 1
block_nr = NR
buf = $0 "\n"
next
}
( in_block == 1 && NR == (block_nr + 1) ) {
if (match($0, /WARNING: possible circular locking dependency detected/) != 0) {
in_block = 2
buf = buf $0 "\n"
} else {
in_block = 0
printf "%s", buf
print $0
buf = ""
}
next
}
( in_block == 2 ) {
buf = buf $0 "\n"
if ($0 ~ /<\/TASK>/) {
if (buf ~ /xfs_(nondir_|dir_)?ilock_class/ && buf ~ /fs_reclaim/) {
# known xfs lockdep false positive, discard
} else {
printf "%s", buf
}
in_block = 0
buf = ""
}
next
}
{ print $0 }
END {
if (buf) {
printf "%s", buf
}
}
'
}
#
# Filter out expected messages. Putting messages here implies that
# tests aren't relying on messages to discover failures... they're
@@ -121,6 +166,10 @@ t_filter_dmesg()
# in debugging kernels we can slow things down a bit
re="$re|hrtimer: interrupt took .*"
re="$re|clocksource: Long readout interval"
# orphan log trees reclaim is handled, not an error
re="$re|scoutfs .* reclaiming orphan log trees"
# fencing tests force unmounts and trigger timeouts
re="$re|scoutfs .* forcing unmount"
@@ -166,6 +215,16 @@ t_filter_dmesg()
# perf warning that it adjusted sample rate
re="$re|perf: interrupt took too long.*lowering kernel.perf_event_max_sample_rate.*"
# some ci test guests are unresponsive
re="$re|longest quorum heartbeat .* delay"
# creating block devices may trigger this
re="$re|block device autoloading is deprecated and will be removed."
# lockdep or kasan warnings can cause this
re="$re|Disabling lock debugging due to kernel taint"
egrep -v "($re)" | \
ignore_harmless_unwind_kasan_stack_oob
ignore_harmless_unwind_kasan_stack_oob | \
ignore_harmless_xfs_lockdep_warning
}

View File

@@ -283,6 +283,30 @@ t_reinsert_remount_all()
t_quiet t_mount_all || t_fail "mounting all failed"
}
#
# scratch helpers
#
t_scratch_mkfs()
{
scoutfs mkfs -f -Q 0,127.0.0.1,$T_SCRATCH_PORT "$T_EX_META_DEV" "$T_EX_DATA_DEV" "$@" > $T_TMP.mkfs.out 2>&1 || \
t_fail "scratch mkfs failed"
}
t_scratch_mount()
{
mkdir -p "$T_MSCR"
mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 "$@" "$T_EX_DATA_DEV" "$T_MSCR" || \
t_fail "scratch mount failed"
}
t_scratch_umount()
{
umount "$T_MSCR" || \
t_fail "scratch umount failed"
rmdir "$T_MSCR"
}
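#
# A typical lifecycle sketch (assumed usage, matching the tests that
# call these helpers):
#
#   t_scratch_mkfs -m 10G    # extra arguments are passed through to mkfs
#   t_scratch_mount
#   ... exercise "$T_MSCR" ...
#   t_scratch_umount
#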
t_trigger_path() {
local nr="$1"
@@ -498,3 +522,121 @@ t_restore_all_sysfs_mount_options() {
t_set_sysfs_mount_option $i $name "${_saved_opts[$ind]}"
done
}
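#
# Arm the server's log_merge_force_finalize_ours trigger and wait until
# a log merge is seen to both start and complete, re-arming the trigger
# until a merge is observed.
#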
t_force_log_merge() {
local sv=$(t_server_nr)
local merges_started
local last_merges_started
local merges_completed
local last_merges_completed
while true; do
last_merges_started=$(t_counter log_merge_start $sv)
last_merges_completed=$(t_counter log_merge_complete $sv)
t_trigger_arm_silent log_merge_force_finalize_ours $sv
t_sync_seq_index
while test "$(t_trigger_get log_merge_force_finalize_ours $sv)" == "1"; do
sleep .5
done
merges_started=$(t_counter log_merge_start $sv)
if (( merges_started > last_merges_started )); then
merges_completed=$(t_counter log_merge_complete $sv)
while (( merges_completed == last_merges_completed )); do
sleep .5
merges_completed=$(t_counter log_merge_complete $sv)
done
break
fi
done
}
declare -A _last_scan
t_get_orphan_scan_runs() {
local i
for i in $(t_fs_nrs); do
_last_scan[$i]=$(t_counter orphan_scan $i)
done
}
t_wait_for_orphan_scan_runs() {
local i
local scan
t_get_orphan_scan_runs
for i in $(t_fs_nrs); do
while true; do
scan=$(t_counter orphan_scan $i)
if (( scan != _last_scan[$i] )); then
break
fi
sleep .5
done
done
}
declare -A _last_empty
t_get_orphan_scan_empty() {
local i
for i in $(t_fs_nrs); do
_last_empty[$i]=$(t_counter orphan_scan_empty $i)
done
}
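#
# Wait until a full orphan scan pass on every mount advances its
# orphan_scan_empty counter, meaning no mount found orphans left to
# process.
#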
t_wait_for_no_orphans() {
local i
local working
local empty
t_get_orphan_scan_empty
while true; do
working=0
t_wait_for_orphan_scan_runs
for i in $(t_fs_nrs); do
empty=$(t_counter orphan_scan_empty $i)
if (( empty == _last_empty[$i] )); then
(( working++ ))
else
(( _last_empty[$i] = empty ))
fi
done
if (( working == 0 )); then
break
fi
sleep 1
done
}
#
# Repeatedly run the arguments as a command, sleeping in between, until
# it returns success. The first argument is a relative timeout in
# seconds. The remaining arguments are the command and its arguments.
#
# If the timeout expires without the command returning 0 then the test
# fails.
#
t_wait_until_timeout() {
local relative="$1"
local expire="$((SECONDS + relative))"
shift
while (( SECONDS < expire )); do
"$@" && return
sleep 1
done
t_fail "command failed for $relative sec: $@"
}
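# For example (a hypothetical call, not taken from the suite): wait up
# to 30 seconds for a background writer to produce output:
#
#   t_wait_until_timeout 30 test -s "$T_TMP.bg.out"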

View File

@@ -43,9 +43,14 @@ t_tap_progress()
local testname=$1
local result=$2
local stmsg=""
local diff=""
local dmsg=""
if [[ -s $T_RESULTS/tmp/${testname}/status.msg ]]; then
stmsg="1"
fi
if [[ -s "$T_RESULTS/tmp/${testname}/dmesg.new" ]]; then
dmsg="1"
fi
@@ -61,6 +66,7 @@ t_tap_progress()
echo "# ${testname} ** skipped - permitted **"
else
echo "not ok ${i} - ${testname}"
case ${result} in
101)
echo "# ${testname} ** skipped **"
@@ -70,6 +76,13 @@ t_tap_progress()
;;
esac
if [[ -n "${stmsg}" ]]; then
echo "#"
echo "# status:"
echo "#"
cat $T_RESULTS/tmp/${testname}/status.msg | sed 's/^/# - /'
fi
if [[ -n "${diff}" ]]; then
echo "#"
echo "# diff:"

View File

@@ -0,0 +1,6 @@
== make scratch fs
== create uid/gids
== set acls and permissions
== compare output
== drop caches and compare again
== cleanup scratch fs

View File

@@ -0,0 +1,54 @@
== testing invalid read-xattr-index arguments
bad index position entry argument 'bad', it must be in the form "a.b.ino" where each value can be prefixed by '0' for octal or '0x' for hex
scoutfs: read-xattr-index failed: Invalid argument (22)
bad index position entry argument '1.2', it must be in the form "a.b.ino" where each value can be prefixed by '0' for octal or '0x' for hex
scoutfs: read-xattr-index failed: Invalid argument (22)
initial major index position '256' must be between 0 and 255, inclusive.
scoutfs: read-xattr-index failed: Invalid argument (22)
first index position 1.2.3 must be less than last index position 0.0.0
scoutfs: read-xattr-index failed: Invalid argument (22)
first index position 1.2.0 must be less than last index position 1.1.2
scoutfs: read-xattr-index failed: Invalid argument (22)
first index position 2.2.2 must be less than last index position 2.2.1
scoutfs: read-xattr-index failed: Invalid argument (22)
== testing invalid names
setfattr: /mnt/test/test/basic-xattr-indx/invalid: Invalid argument
setfattr: /mnt/test/test/basic-xattr-indx/invalid: Invalid argument
setfattr: /mnt/test/test/basic-xattr-indx/invalid: Invalid argument
setfattr: /mnt/test/test/basic-xattr-indx/invalid: Invalid argument
setfattr: /mnt/test/test/basic-xattr-indx/invalid: Invalid argument
setfattr: /mnt/test/test/basic-xattr-indx/invalid: Invalid argument
setfattr: /mnt/test/test/basic-xattr-indx/invalid: Invalid argument
setfattr: /mnt/test/test/basic-xattr-indx/invalid: Invalid argument
setfattr: /mnt/test/test/basic-xattr-indx/invalid: Invalid argument
setfattr: /mnt/test/test/basic-xattr-indx/invalid: Invalid argument
setfattr: /mnt/test/test/basic-xattr-indx/invalid: Invalid argument
setfattr: /mnt/test/test/basic-xattr-indx/invalid: Numerical result out of range
== testing boundary values
0.0 found
255.max found
== indx xattr must have no value
setfattr: /mnt/test/test/basic-xattr-indx/noval: Invalid argument
setfattr: /mnt/test/test/basic-xattr-indx/noval: Invalid argument
== set indx xattr and verify index entry
found
== setting same indx xattr again is a no-op
found
== removing non-existent indx xattr succeeds
setfattr: /mnt/test/test/basic-xattr-indx/file: No such attribute
still found
== explicit xattr removal cleans up index entry
== file deletion cleans up index entry
found before delete
== multiple indx xattrs on one file cleaned up by deletion
entries before delete: 2
entries after delete: 0
== partial removal leaves other entries
300 found
== multiple files at same index position
files at same position: 2
surviving file found
== cross-mount visibility
found on mount 1
== duplicate position deduplication
entries for same position: 1

View File

@@ -17,7 +17,7 @@ ino not found in dseq index
mount 0 contents after mount 1 rm: contents
ino found in dseq index
ino found in dseq index
stat: cannot stat '/mnt/test/test/inode-deletion/file': No such file or directory
stat: cannot stat '/mnt/test/test/inode-deletion/badfile': No such file or directory
ino not found in dseq index
ino not found in dseq index
== lots of deletions use one open map

View File

@@ -0,0 +1,3 @@
== create orphan log_trees entry via trigger
== verify orphan is reclaimed and merge completes
== verify orphan reclaim was logged

tests/golden/punch-offline Normal file
View File

@@ -0,0 +1,460 @@
== missing options should fail ==
punch-offline: must provide offset
Try `punch-offline --help' or `punch-offline --usage' for more information.
punch-offline: must provide length
Try `punch-offline --help' or `punch-offline --usage' for more information.
punch-offline: must provide data_version
Try `punch-offline --help' or `punch-offline --usage' for more information.
== can't hole punch dir or special ==
failed to open '/mnt/test.0/test/punch-offline/dir': Is a directory (21)
scoutfs: punch-offline failed: Is a directory (21)
== punching an empty file does nothing ==
== punch outside of i_size does nothing ==
== can't hole punch online extent ==
0: offset: 0 length: 4096 flags: ..L
extents: 1
punch_offline ioctl failed: Invalid argument (22)
scoutfs: punch-offline failed: Invalid argument (22)
0: offset: 0 length: 4096 flags: ..L
extents: 1
== can't hole punch unwritten extent ==
0: offset: 0 length: 12288 flags: .UL
extents: 1
punch_offline ioctl failed: Invalid argument (22)
scoutfs: punch-offline failed: Invalid argument (22)
0: offset: 0 length: 12288 flags: .UL
extents: 1
== hole punch offline extent ==
0: offset: 0 length: 12288 flags: O.L
extents: 1
0: offset: 0 length: 4096 flags: O..
1: offset: 8192 length: 4096 flags: O.L
extents: 2
== can't hole punch non-aligned bsz offset or len ==
0: offset: 0 length: 12288 flags: O.L
extents: 1
punch_offline ioctl failed: Value too large for defined data type (75)
scoutfs: punch-offline failed: Value too large for defined data type (75)
punch_offline ioctl failed: Value too large for defined data type (75)
scoutfs: punch-offline failed: Value too large for defined data type (75)
punch_offline ioctl failed: Value too large for defined data type (75)
scoutfs: punch-offline failed: Value too large for defined data type (75)
punch_offline ioctl failed: Value too large for defined data type (75)
scoutfs: punch-offline failed: Value too large for defined data type (75)
punch_offline ioctl failed: Value too large for defined data type (75)
scoutfs: punch-offline failed: Value too large for defined data type (75)
punch_offline ioctl failed: Value too large for defined data type (75)
scoutfs: punch-offline failed: Value too large for defined data type (75)
0: offset: 0 length: 12288 flags: O.L
extents: 1
== can't hole punch mismatched data_version ==
0: offset: 0 length: 12288 flags: O.L
extents: 1
punch_offline ioctl failed: Stale file handle (116)
scoutfs: punch-offline failed: Stale file handle (116)
punch_offline ioctl failed: Stale file handle (116)
scoutfs: punch-offline failed: Stale file handle (116)
punch_offline ioctl failed: Stale file handle (116)
scoutfs: punch-offline failed: Stale file handle (116)
0: offset: 0 length: 12288 flags: O.L
extents: 1
== Punch hole crossing multiple extents ==
0: offset: 0 length: 7 flags: O.L
extents: 1
0: offset: 0 length: 1 flags: O..
1: offset: 2 length: 1 flags: O..
2: offset: 4 length: 1 flags: O..
3: offset: 6 length: 1 flags: O.L
extents: 4
0: offset: 0 length: 1 flags: O..
1: offset: 6 length: 1 flags: O.L
extents: 2
== punch hole starting at a hole ==
0: offset: 0 length: 7 flags: O.L
extents: 1
0: offset: 0 length: 1 flags: O..
1: offset: 2 length: 1 flags: O..
2: offset: 4 length: 1 flags: O..
3: offset: 6 length: 1 flags: O.L
extents: 4
0: offset: 0 length: 1 flags: O..
1: offset: 6 length: 1 flags: O.L
extents: 2
== large punch ==
0: offset: 0 length: 1572864 flags: O.L
extents: 1
0: offset: 0 length: 134123 flags: O..
1: offset: 202466 length: 264807 flags: O..
2: offset: 535616 length: 199007 flags: O..
3: offset: 802966 length: 769898 flags: O.L
extents: 4
== overlapping punches with lots of extents ==
0: offset: 0 length: 4194304 flags: O.L
extents: 1
extents: 512
extents: 505
extents: 378
extents: 252
0: offset: 0 length: 4096 flags: O..
1: offset: 8192 length: 4096 flags: O..
2: offset: 32768 length: 4096 flags: O..
3: offset: 40960 length: 4096 flags: O..
4: offset: 65536 length: 4096 flags: O..
5: offset: 73728 length: 4096 flags: O..
6: offset: 98304 length: 4096 flags: O..
7: offset: 106496 length: 4096 flags: O..
8: offset: 196608 length: 4096 flags: O..
9: offset: 204800 length: 4096 flags: O..
10: offset: 229376 length: 4096 flags: O..
11: offset: 237568 length: 4096 flags: O..
12: offset: 262144 length: 4096 flags: O..
13: offset: 270336 length: 4096 flags: O..
14: offset: 294912 length: 4096 flags: O..
15: offset: 303104 length: 4096 flags: O..
16: offset: 327680 length: 4096 flags: O..
17: offset: 335872 length: 4096 flags: O..
18: offset: 360448 length: 4096 flags: O..
19: offset: 368640 length: 4096 flags: O..
20: offset: 393216 length: 4096 flags: O..
21: offset: 401408 length: 4096 flags: O..
22: offset: 425984 length: 4096 flags: O..
23: offset: 434176 length: 4096 flags: O..
24: offset: 458752 length: 4096 flags: O..
25: offset: 466944 length: 4096 flags: O..
26: offset: 491520 length: 4096 flags: O..
27: offset: 499712 length: 4096 flags: O..
28: offset: 720896 length: 4096 flags: O..
29: offset: 729088 length: 4096 flags: O..
30: offset: 753664 length: 4096 flags: O..
31: offset: 761856 length: 4096 flags: O..
32: offset: 786432 length: 4096 flags: O..
33: offset: 794624 length: 4096 flags: O..
34: offset: 819200 length: 4096 flags: O..
35: offset: 827392 length: 4096 flags: O..
36: offset: 851968 length: 4096 flags: O..
37: offset: 860160 length: 4096 flags: O..
38: offset: 884736 length: 4096 flags: O..
39: offset: 892928 length: 4096 flags: O..
40: offset: 917504 length: 4096 flags: O..
41: offset: 925696 length: 4096 flags: O..
42: offset: 950272 length: 4096 flags: O..
43: offset: 958464 length: 4096 flags: O..
44: offset: 983040 length: 4096 flags: O..
45: offset: 991232 length: 4096 flags: O..
46: offset: 1015808 length: 4096 flags: O..
47: offset: 1024000 length: 4096 flags: O..
48: offset: 1048576 length: 4096 flags: O..
49: offset: 1056768 length: 4096 flags: O..
50: offset: 1081344 length: 4096 flags: O..
51: offset: 1089536 length: 4096 flags: O..
52: offset: 1114112 length: 4096 flags: O..
53: offset: 1122304 length: 4096 flags: O..
54: offset: 1146880 length: 4096 flags: O..
55: offset: 1155072 length: 4096 flags: O..
56: offset: 1179648 length: 4096 flags: O..
57: offset: 1187840 length: 4096 flags: O..
58: offset: 1212416 length: 4096 flags: O..
59: offset: 1220608 length: 4096 flags: O..
60: offset: 1245184 length: 4096 flags: O..
61: offset: 1253376 length: 4096 flags: O..
62: offset: 1277952 length: 4096 flags: O..
63: offset: 1286144 length: 4096 flags: O..
64: offset: 1310720 length: 4096 flags: O..
65: offset: 1318912 length: 4096 flags: O..
66: offset: 1343488 length: 4096 flags: O..
67: offset: 1351680 length: 4096 flags: O..
68: offset: 1376256 length: 4096 flags: O..
69: offset: 1384448 length: 4096 flags: O..
70: offset: 1409024 length: 4096 flags: O..
71: offset: 1417216 length: 4096 flags: O..
72: offset: 1441792 length: 4096 flags: O..
73: offset: 1449984 length: 4096 flags: O..
74: offset: 1474560 length: 4096 flags: O..
75: offset: 1482752 length: 4096 flags: O..
76: offset: 1507328 length: 4096 flags: O..
77: offset: 1515520 length: 4096 flags: O..
78: offset: 1540096 length: 4096 flags: O..
79: offset: 1548288 length: 4096 flags: O..
80: offset: 1572864 length: 4096 flags: O..
81: offset: 1581056 length: 4096 flags: O..
82: offset: 1605632 length: 4096 flags: O..
83: offset: 1613824 length: 4096 flags: O..
84: offset: 1638400 length: 4096 flags: O..
85: offset: 1646592 length: 4096 flags: O..
86: offset: 1671168 length: 4096 flags: O..
87: offset: 1679360 length: 4096 flags: O..
88: offset: 1703936 length: 4096 flags: O..
89: offset: 1712128 length: 4096 flags: O..
90: offset: 1736704 length: 4096 flags: O..
91: offset: 1744896 length: 4096 flags: O..
92: offset: 1769472 length: 4096 flags: O..
93: offset: 1777664 length: 4096 flags: O..
94: offset: 1802240 length: 4096 flags: O..
95: offset: 1810432 length: 4096 flags: O..
96: offset: 1835008 length: 4096 flags: O..
97: offset: 1843200 length: 4096 flags: O..
98: offset: 1867776 length: 4096 flags: O..
99: offset: 1875968 length: 4096 flags: O..
100: offset: 1900544 length: 4096 flags: O..
101: offset: 1908736 length: 4096 flags: O..
102: offset: 1933312 length: 4096 flags: O..
103: offset: 1941504 length: 4096 flags: O..
104: offset: 1966080 length: 4096 flags: O..
105: offset: 1974272 length: 4096 flags: O..
106: offset: 1998848 length: 4096 flags: O..
107: offset: 2007040 length: 4096 flags: O..
108: offset: 2031616 length: 4096 flags: O..
109: offset: 2039808 length: 4096 flags: O..
110: offset: 2064384 length: 4096 flags: O..
111: offset: 2072576 length: 4096 flags: O..
112: offset: 2097152 length: 4096 flags: O..
113: offset: 2105344 length: 4096 flags: O..
114: offset: 2129920 length: 4096 flags: O..
115: offset: 2138112 length: 4096 flags: O..
116: offset: 2162688 length: 4096 flags: O..
117: offset: 2170880 length: 4096 flags: O..
118: offset: 2195456 length: 4096 flags: O..
119: offset: 2203648 length: 4096 flags: O..
120: offset: 2228224 length: 4096 flags: O..
121: offset: 2236416 length: 4096 flags: O..
122: offset: 2260992 length: 4096 flags: O..
123: offset: 2269184 length: 4096 flags: O..
124: offset: 2293760 length: 4096 flags: O..
125: offset: 2301952 length: 4096 flags: O..
126: offset: 2326528 length: 4096 flags: O..
127: offset: 2334720 length: 4096 flags: O..
128: offset: 2359296 length: 4096 flags: O..
129: offset: 2367488 length: 4096 flags: O..
130: offset: 2392064 length: 4096 flags: O..
131: offset: 2400256 length: 4096 flags: O..
132: offset: 2424832 length: 4096 flags: O..
133: offset: 2433024 length: 4096 flags: O..
134: offset: 2457600 length: 4096 flags: O..
135: offset: 2465792 length: 4096 flags: O..
136: offset: 2490368 length: 4096 flags: O..
137: offset: 2498560 length: 4096 flags: O..
138: offset: 2523136 length: 4096 flags: O..
139: offset: 2531328 length: 4096 flags: O..
140: offset: 2555904 length: 4096 flags: O..
141: offset: 2564096 length: 4096 flags: O..
142: offset: 2588672 length: 4096 flags: O..
143: offset: 2596864 length: 4096 flags: O..
144: offset: 2621440 length: 4096 flags: O..
145: offset: 2629632 length: 4096 flags: O..
146: offset: 2654208 length: 4096 flags: O..
147: offset: 2662400 length: 4096 flags: O..
148: offset: 2686976 length: 4096 flags: O..
149: offset: 2695168 length: 4096 flags: O..
150: offset: 2719744 length: 4096 flags: O..
151: offset: 2727936 length: 4096 flags: O..
152: offset: 2752512 length: 4096 flags: O..
153: offset: 2760704 length: 4096 flags: O..
154: offset: 2785280 length: 4096 flags: O..
155: offset: 2793472 length: 4096 flags: O..
156: offset: 2818048 length: 4096 flags: O..
157: offset: 2826240 length: 4096 flags: O..
158: offset: 2850816 length: 4096 flags: O..
159: offset: 2859008 length: 4096 flags: O..
160: offset: 2883584 length: 4096 flags: O..
161: offset: 2891776 length: 4096 flags: O..
162: offset: 2916352 length: 4096 flags: O..
163: offset: 2924544 length: 4096 flags: O..
164: offset: 2949120 length: 4096 flags: O..
165: offset: 2957312 length: 4096 flags: O..
166: offset: 2981888 length: 4096 flags: O..
167: offset: 2990080 length: 4096 flags: O..
168: offset: 3014656 length: 4096 flags: O..
169: offset: 3022848 length: 4096 flags: O..
170: offset: 3047424 length: 4096 flags: O..
171: offset: 3055616 length: 4096 flags: O..
172: offset: 3080192 length: 4096 flags: O..
173: offset: 3088384 length: 4096 flags: O..
174: offset: 3112960 length: 4096 flags: O..
175: offset: 3121152 length: 4096 flags: O..
176: offset: 3145728 length: 4096 flags: O..
177: offset: 3153920 length: 4096 flags: O..
178: offset: 3178496 length: 4096 flags: O..
179: offset: 3186688 length: 4096 flags: O..
180: offset: 3211264 length: 4096 flags: O..
181: offset: 3219456 length: 4096 flags: O..
182: offset: 3244032 length: 4096 flags: O..
183: offset: 3252224 length: 4096 flags: O..
184: offset: 3276800 length: 4096 flags: O..
185: offset: 3284992 length: 4096 flags: O..
186: offset: 3309568 length: 4096 flags: O..
187: offset: 3317760 length: 4096 flags: O..
188: offset: 3342336 length: 4096 flags: O..
189: offset: 3350528 length: 4096 flags: O..
190: offset: 3375104 length: 4096 flags: O..
191: offset: 3383296 length: 4096 flags: O..
192: offset: 3407872 length: 4096 flags: O..
193: offset: 3416064 length: 4096 flags: O..
194: offset: 3440640 length: 4096 flags: O..
195: offset: 3448832 length: 4096 flags: O..
196: offset: 3473408 length: 4096 flags: O..
197: offset: 3481600 length: 4096 flags: O..
198: offset: 3506176 length: 4096 flags: O..
199: offset: 3514368 length: 4096 flags: O..
200: offset: 3538944 length: 4096 flags: O..
201: offset: 3547136 length: 4096 flags: O..
202: offset: 3571712 length: 4096 flags: O..
203: offset: 3579904 length: 4096 flags: O..
204: offset: 3604480 length: 4096 flags: O..
205: offset: 3612672 length: 4096 flags: O..
206: offset: 3637248 length: 4096 flags: O..
207: offset: 3645440 length: 4096 flags: O..
208: offset: 3670016 length: 4096 flags: O..
209: offset: 3678208 length: 4096 flags: O..
210: offset: 3702784 length: 4096 flags: O..
211: offset: 3710976 length: 4096 flags: O..
212: offset: 3735552 length: 4096 flags: O..
213: offset: 3743744 length: 4096 flags: O..
214: offset: 3768320 length: 4096 flags: O..
215: offset: 3776512 length: 4096 flags: O..
216: offset: 3801088 length: 4096 flags: O..
217: offset: 3809280 length: 4096 flags: O..
218: offset: 3833856 length: 4096 flags: O..
219: offset: 3842048 length: 4096 flags: O..
220: offset: 3866624 length: 4096 flags: O..
221: offset: 3874816 length: 4096 flags: O..
222: offset: 3899392 length: 4096 flags: O..
223: offset: 3907584 length: 4096 flags: O..
224: offset: 3932160 length: 4096 flags: O..
225: offset: 3940352 length: 4096 flags: O..
226: offset: 3964928 length: 4096 flags: O..
227: offset: 3973120 length: 4096 flags: O..
228: offset: 3997696 length: 4096 flags: O..
229: offset: 4005888 length: 4096 flags: O..
230: offset: 4030464 length: 4096 flags: O..
231: offset: 4038656 length: 4096 flags: O..
232: offset: 4063232 length: 4096 flags: O..
233: offset: 4071424 length: 4096 flags: O..
234: offset: 4096000 length: 4096 flags: O..
235: offset: 4104192 length: 4096 flags: O..
236: offset: 4128768 length: 4096 flags: O..
237: offset: 4136960 length: 4096 flags: O..
238: offset: 4161536 length: 4096 flags: O..
239: offset: 4169728 length: 4096 flags: O.L
extents: 240
0: offset: 0 length: 1 flags: O..
1: offset: 8 length: 1 flags: O..
2: offset: 16 length: 1 flags: O..
3: offset: 24 length: 1 flags: O..
4: offset: 48 length: 1 flags: O..
5: offset: 56 length: 1 flags: O..
6: offset: 64 length: 1 flags: O..
7: offset: 72 length: 1 flags: O..
8: offset: 80 length: 1 flags: O..
9: offset: 88 length: 1 flags: O..
10: offset: 96 length: 1 flags: O..
11: offset: 104 length: 1 flags: O..
12: offset: 112 length: 1 flags: O..
13: offset: 120 length: 1 flags: O..
14: offset: 176 length: 1 flags: O..
15: offset: 184 length: 1 flags: O..
16: offset: 192 length: 1 flags: O..
17: offset: 200 length: 1 flags: O..
18: offset: 208 length: 1 flags: O..
19: offset: 216 length: 1 flags: O..
20: offset: 224 length: 1 flags: O..
21: offset: 232 length: 1 flags: O..
22: offset: 240 length: 1 flags: O..
23: offset: 248 length: 1 flags: O..
24: offset: 256 length: 1 flags: O..
25: offset: 264 length: 1 flags: O..
26: offset: 272 length: 1 flags: O..
27: offset: 280 length: 1 flags: O..
28: offset: 288 length: 1 flags: O..
29: offset: 296 length: 1 flags: O..
30: offset: 304 length: 1 flags: O..
31: offset: 312 length: 1 flags: O..
32: offset: 320 length: 1 flags: O..
33: offset: 328 length: 1 flags: O..
34: offset: 336 length: 1 flags: O..
35: offset: 344 length: 1 flags: O..
36: offset: 352 length: 1 flags: O..
37: offset: 360 length: 1 flags: O..
38: offset: 368 length: 1 flags: O..
39: offset: 376 length: 1 flags: O..
40: offset: 384 length: 1 flags: O..
41: offset: 392 length: 1 flags: O..
42: offset: 400 length: 1 flags: O..
43: offset: 408 length: 1 flags: O..
44: offset: 416 length: 1 flags: O..
45: offset: 424 length: 1 flags: O..
46: offset: 432 length: 1 flags: O..
47: offset: 440 length: 1 flags: O..
48: offset: 448 length: 1 flags: O..
49: offset: 456 length: 1 flags: O..
50: offset: 464 length: 1 flags: O..
51: offset: 472 length: 1 flags: O..
52: offset: 480 length: 1 flags: O..
53: offset: 488 length: 1 flags: O..
54: offset: 496 length: 1 flags: O..
55: offset: 504 length: 1 flags: O..
56: offset: 512 length: 1 flags: O..
57: offset: 520 length: 1 flags: O..
58: offset: 528 length: 1 flags: O..
59: offset: 536 length: 1 flags: O..
60: offset: 544 length: 1 flags: O..
61: offset: 552 length: 1 flags: O..
62: offset: 560 length: 1 flags: O..
63: offset: 568 length: 1 flags: O..
64: offset: 576 length: 1 flags: O..
65: offset: 584 length: 1 flags: O..
66: offset: 592 length: 1 flags: O..
67: offset: 600 length: 1 flags: O..
68: offset: 608 length: 1 flags: O..
69: offset: 616 length: 1 flags: O..
70: offset: 624 length: 1 flags: O..
71: offset: 632 length: 1 flags: O..
72: offset: 640 length: 1 flags: O..
73: offset: 648 length: 1 flags: O..
74: offset: 656 length: 1 flags: O..
75: offset: 664 length: 1 flags: O..
76: offset: 672 length: 1 flags: O..
77: offset: 680 length: 1 flags: O..
78: offset: 688 length: 1 flags: O..
79: offset: 696 length: 1 flags: O..
80: offset: 704 length: 1 flags: O..
81: offset: 712 length: 1 flags: O..
82: offset: 720 length: 1 flags: O..
83: offset: 728 length: 1 flags: O..
84: offset: 736 length: 1 flags: O..
85: offset: 744 length: 1 flags: O..
86: offset: 752 length: 1 flags: O..
87: offset: 760 length: 1 flags: O..
88: offset: 768 length: 1 flags: O..
89: offset: 776 length: 1 flags: O..
90: offset: 784 length: 1 flags: O..
91: offset: 792 length: 1 flags: O..
92: offset: 800 length: 1 flags: O..
93: offset: 808 length: 1 flags: O..
94: offset: 816 length: 1 flags: O..
95: offset: 824 length: 1 flags: O..
96: offset: 832 length: 1 flags: O..
97: offset: 840 length: 1 flags: O..
98: offset: 848 length: 1 flags: O..
99: offset: 856 length: 1 flags: O..
100: offset: 864 length: 1 flags: O..
101: offset: 872 length: 1 flags: O..
102: offset: 880 length: 1 flags: O..
103: offset: 888 length: 1 flags: O..
104: offset: 896 length: 1 flags: O..
105: offset: 904 length: 1 flags: O..
106: offset: 912 length: 1 flags: O..
107: offset: 920 length: 1 flags: O..
108: offset: 928 length: 1 flags: O..
109: offset: 936 length: 1 flags: O..
110: offset: 944 length: 1 flags: O..
111: offset: 952 length: 1 flags: O..
112: offset: 960 length: 1 flags: O..
113: offset: 968 length: 1 flags: O..
114: offset: 976 length: 1 flags: O..
115: offset: 984 length: 1 flags: O..
116: offset: 992 length: 1 flags: O..
117: offset: 1000 length: 1 flags: O..
118: offset: 1008 length: 1 flags: O..
119: offset: 1016 length: 1 flags: O.L
extents: 120
extents: 0

View File

@@ -0,0 +1,3 @@
== setup
expected 4681
== cleanup

View File

@@ -39,20 +39,6 @@ cmd() {
die "cmd failed (check the run.log)"
}
# we can record pids to kill as we exit, we kill in reverse added order
declare -a atexit_kill_pids
atexit_kill()
{
local pid
for pid in $(echo ${atexit_kill_pids[*]} | rev); do
if test -e "/proc/$pid/status" ; then
kill "$pid"
fi
done
}
trap atexit_kill EXIT
show_help()
{
cat << EOF
@@ -414,7 +400,8 @@ if [ -n "$T_INSMOD" ]; then
fi
if [ -n "$T_TRACE_MULT" ]; then
orig_trace_size=$(cat /sys/kernel/debug/tracing/buffer_size_kb)
# orig_trace_size=$(cat /sys/kernel/debug/tracing/buffer_size_kb)
orig_trace_size=1408
mult_trace_size=$((orig_trace_size * T_TRACE_MULT))
msg "increasing trace buffer size from $orig_trace_size KiB to $mult_trace_size KiB"
echo $mult_trace_size > /sys/kernel/debug/tracing/buffer_size_kb
@@ -452,6 +439,30 @@ cmd grep . /sys/kernel/debug/tracing/options/trace_printk \
/sys/kernel/debug/tracing/buffer_size_kb \
/proc/sys/kernel/ftrace_dump_on_oops
# we can record pids to kill as we exit, we kill in reverse added order
atexit_kill_pids=""
add_atexit_kill_pid()
{
atexit_kill_pids="$1 $atexit_kill_pids"
}
atexit_kill()
{
local pid
# suppress bg function exited messages
exec {ERR}>&2 2>/dev/null
for pid in $atexit_kill_pids; do
if test -e "/proc/$pid/status" ; then
kill "$pid"
wait "$pid"
fi
done
exec 2>&$ERR {ERR}>&-
}
trap atexit_kill EXIT
#
# Build a fenced config that runs scripts out of the repository rather
# than the default system directory
@@ -467,7 +478,7 @@ T_FENCED_LOG="$T_RESULTS/fenced.log"
$T_UTILS/fenced/scoutfs-fenced > "$T_FENCED_LOG" 2>&1 &
fenced_pid=$!
atexit_kill_pids+=($fenced_pid)
add_atexit_kill_pid $fenced_pid
#
# some critical failures will cause fs operations to hang. We can watch
@@ -494,15 +505,17 @@ crash_monitor()
fi
if [ "$bad" != 0 ]; then
echo "run-tests monitor triggering crash"
echo "run-tests monitor syncing and triggering crash"
# hail mary, the sync could well hang
(echo s > /proc/sysrq-trigger) &
sleep 5
echo c > /proc/sysrq-trigger
# bg function doesn't reload bash, $$ is parent run-tests.sh
kill -9 $$
exit 1
fi
done
}
crash_monitor &
atexit_kill_pids+=($!)
add_atexit_kill_pid $!
# setup dm tables
echo "0 $(blockdev --getsz $T_META_DEVICE) linear $T_META_DEVICE 0" > \
@@ -615,6 +628,9 @@ for t in $tests; do
cmd rm -rf "$T_TMPDIR"
cmd mkdir -p "$T_TMPDIR"
# assign scratch mount point in temporary dir
T_MSCR="$T_TMPDIR/scratch"
# create a test name dir in the fs, clean up old data as needed
T_DS=""
for i in $(seq 0 $((T_NR_MOUNTS - 1))); do
@@ -678,8 +694,8 @@ for t in $tests; do
if [ "$sts" == "$T_PASS_STATUS" ]; then
dmesg | t_filter_dmesg > "$T_TMPDIR/dmesg.after"
diff --old-line-format="" --unchanged-line-format="" \
"$T_TMPDIR/dmesg.before" "$T_TMPDIR/dmesg.after" > \
"$T_TMPDIR/dmesg.new"
"$T_TMPDIR/dmesg.before" "$T_TMPDIR/dmesg.after" | \
grep -v '^$' > "$T_TMPDIR/dmesg.new"
if [ -s "$T_TMPDIR/dmesg.new" ]; then
message="unexpected messages in dmesg"

View File

@@ -2,6 +2,7 @@ export-get-name-parent.sh
basic-block-counts.sh
basic-bad-mounts.sh
basic-posix-acl.sh
basic-acl-consistency.sh
inode-items-updated.sh
simple-inode-index.sh
simple-staging.sh
@@ -10,6 +11,7 @@ simple-readdir.sh
get-referring-entries.sh
fallocate.sh
basic-truncate.sh
punch-offline.sh
data-prealloc.sh
setattr_more.sh
offline-extent-waiting.sh
@@ -24,7 +26,9 @@ srch-basic-functionality.sh
simple-xattr-unit.sh
retention-basic.sh
totl-xattr-tag.sh
basic-xattr-indx.sh
quota.sh
totl-merge-read.sh
lock-refleak.sh
lock-shrink-consistency.sh
lock-shrink-read-race.sh
@@ -48,6 +52,7 @@ setup-error-teardown.sh
resize-devices.sh
change-devices.sh
fence-and-reclaim.sh
orphan-log-trees.sh
quorum-heartbeat-timeout.sh
orphan-inodes.sh
mount-unmount-race.sh

View File

@@ -19,6 +19,7 @@
#include <sys/types.h>
#include <stdio.h>
#include <sys/stat.h>
#include <inttypes.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdlib.h>
@@ -29,7 +30,7 @@
#include <errno.h>
static int size = 0;
static int count = 0; /* XXX make this duration instead */
static int duration = 0;
struct thread_info {
int nr;
@@ -41,6 +42,8 @@ static void *run_test_func(void *ptr)
void *buf = NULL;
char *addr = NULL;
struct thread_info *tinfo = ptr;
uint64_t seconds = 0;
struct timespec ts;
int c = 0;
int fd;
ssize_t read, written, ret;
@@ -61,9 +64,15 @@ static void *run_test_func(void *ptr)
usleep(100000); /* 0.1sec to allow all threads to start roughly at the same time */
clock_gettime(CLOCK_REALTIME, &ts); /* record start time */
seconds = ts.tv_sec + duration;
for (;;) {
if (++c > count)
break;
if (++c % 16 == 0) {
clock_gettime(CLOCK_REALTIME, &ts);
if (ts.tv_sec >= seconds)
break;
}
switch (rand() % 4) {
case 0: /* pread */
@@ -99,6 +108,8 @@ static void *run_test_func(void *ptr)
memcpy(addr, buf, size); /* noerr */
break;
}
usleep(10000);
}
munmap(addr, size);
@@ -120,7 +131,7 @@ int main(int argc, char **argv)
int i;
if (argc != 8) {
fprintf(stderr, "%s requires 7 arguments - size count file1 file2 file3 file4 file5\n", argv[0]);
fprintf(stderr, "%s requires 7 arguments - size duration file1 file2 file3 file4 file5\n", argv[0]);
exit(-1);
}
@@ -130,9 +141,9 @@ int main(int argc, char **argv)
exit(-1);
}
count = atoi(argv[2]);
if (count < 0) {
fprintf(stderr, "invalid count, must be greater than 0\n");
duration = atoi(argv[2]);
if (duration < 0) {
fprintf(stderr, "invalid duration, must be greater than or equal to 0\n");
exit(-1);
}

View File

@@ -0,0 +1,664 @@
/*
* Copyright (C) 2026 Versity Software, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <errno.h>
#include <time.h>
#include <linux/types.h>
#include <assert.h>
#include <stdbool.h>
#include "../../utils/src/util.h"
#include "ioctl.h"
#include "format.h"
/*
* This is a quick example of using the raw reading ioctls to get info
* on inodes as they change. We maintain an array of meta_seq items for
* inodes that we've seen. If we read the current meta_seq items and
* see differences then we get inode info and update our array with what
* we find.
*
* This only maintains one array and sorts it back and forth as we walk
* the meta_seq items and then search by inode number. This will
* eventually use far too much cpu as the number of inodes increases.
*/
#define MSF "%llu.%llu"
#define MSA(ms) (ms)->meta_seq, (ms)->ino
#define NERRF "nerr %d (\"%s\")"
#define NERRA(nerr) nerr, strerror(-nerr)
#define prerror(fmt, args...) \
fprintf(stderr, "error: "fmt"\n", ##args)
#define prdebug(fmt, args...) \
do { \
if (opts.debug) \
printf(fmt"\n", ##args); \
} while (0)
static struct opts {
bool debug;
char *path;
char *names;
size_t names_size;
size_t names_count;
} opts;
struct stats {
__u64 start;
__u64 last;
struct per_call {
__u64 begin;
__u64 calls;
__u64 time;
__u64 inos;
} rms, rii;
__u64 inodes;
__u64 add;
__u64 remove;
__u64 update;
unsigned lines;
} stats;
struct meta_seq_array {
size_t nr;
size_t alloc;
struct scoutfs_ioctl_meta_seq *ms;
};
#define INO_BATCH 1000
/* *2 for gratuitous allowance for struct expansion */
#define RESULTS_SIZE (INO_BATCH * 2 * (sizeof(struct scoutfs_ioctl_raw_read_result) + \
sizeof(__u64) + \
180 /* ~= sizeof(struct scoutfs_inode) */ + \
sizeof(struct scoutfs_ioctl_inode_attr_x)))
#define NSEC_PER_SEC 1000000000
static __u64 get_ns(void)
{
struct timespec tp;
int ret;
ret = clock_gettime(CLOCK_MONOTONIC, &tp);
if (ret != 0) {
ret = -errno;
prerror("clock_gettime() error: "NERRF, NERRA(ret));
exit(2);
}
return ((__u64)tp.tv_sec * NSEC_PER_SEC) + (__u64)tp.tv_nsec;
}
static void begin_call(struct per_call *pc)
{
pc->begin = get_ns();
}
static void end_call(struct per_call *pc)
{
pc->calls++;
pc->time += get_ns() - pc->begin;
}
static int expand_array(struct meta_seq_array *arr, size_t additional)
{
#define ALLOC_BATCH (1024 * 1024 / (sizeof(struct scoutfs_ioctl_meta_seq)))
struct scoutfs_ioctl_meta_seq *ms;
size_t expand;
if (arr->nr + additional <= arr->alloc)
return 0;
expand = arr->alloc + ALLOC_BATCH;
ms = reallocarray(arr->ms, expand, sizeof(arr->ms[0]));
if (!ms) {
prerror("allocating ms array with %zu elements failed", expand);
return -ENOMEM;
}
arr->alloc = expand;
arr->ms = ms;
return 0;
}
static void inc_ms(struct scoutfs_ioctl_meta_seq *ms)
{
if (++ms->ino == 0)
ms->meta_seq++;
}
static void set_ms(struct scoutfs_ioctl_meta_seq *ms, __u64 meta_seq, __u64 ino)
{
ms->meta_seq = meta_seq;
ms->ino = ino;
}
static int compar_ms_ino(const void *A, const void *B)
{
const struct scoutfs_ioctl_meta_seq *a = A;
const struct scoutfs_ioctl_meta_seq *b = B;
return a->ino < b->ino ? -1 : a->ino > b->ino ? 1 : 0;
}
static int compar_ms_meta_seq(const void *A, const void *B)
{
const struct scoutfs_ioctl_meta_seq *a = A;
const struct scoutfs_ioctl_meta_seq *b = B;
return a->meta_seq < b->meta_seq ? -1 : a->meta_seq > b->meta_seq ? 1 :
compar_ms_ino(A, B);
}
static int compar_u64(const void *A, const void *B)
{
const __u64 *a = A;
const __u64 *b = B;
return *a < *b ? -1 : *a > *b ? 1 : 0;
}
struct bsearch_ind_key {
int (*compar)(const void *a, const void *b);
void *key;
size_t size;
void **index;
};
static int bsearch_ind_compar(const void *a, const void *b)
{
const struct bsearch_ind_key *bik = (const void *)((unsigned long)a ^ 1);
int cmp;
/* this key hack only works if compar is always called where a is key and b is &base[..] */
assert((unsigned long)a & 1);
assert(!((unsigned long)b & 1));
cmp = bik->compar(bik->key, b);
if (cmp > 0)
*(bik->index) = (void *)b + bik->size;
else
*(bik->index) = (void *)b;
return cmp;
}
static size_t bsearch_ind(const void *key, const void *base, size_t nmemb, size_t size,
int (*compar)(const void *a, const void *b))
{
void *index = (void *)base;
struct bsearch_ind_key bik = {
.compar = compar,
.key = (void *)key,
.size = size,
.index = &index,
};
bsearch((void *)(((unsigned long)&bik) | 1), base, nmemb, size, bsearch_ind_compar);
return (index - base) / size;
}
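/*
* A usage sketch with hypothetical values: unlike bsearch(), which can
* only return pointers to matching members, bsearch_ind() returns the
* index the key occupies or would be inserted at:
*
*	__u64 vals[] = { 1, 3, 5 };
*	__u64 key = 4;
*	size_t ind = bsearch_ind(&key, vals, 3, sizeof(vals[0]), compar_u64);
*	// ind == 2, key would sort between vals[1] and vals[2]
*/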
/*
* Generate a sorted list of inode numbers for the meta_seq items that
* differ between the results from raw_read_meta_seq and the items we
* have saved in our array.
*/
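/*
* For example, with hypothetical meta_seq.ino pairs (and assuming start
* and last cover the whole range):
*
*	arr:  1.5  2.9  4.2
*	read: 1.5  3.9  4.2
*
* The merge walk emits ino 9 twice, once for the vanished 2.9 entry and
* once for the new 3.9 entry, and the sort and dedup below reduce that
* to a single ino 9 to look up.
*/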
static int differing_inos(__u64 *inos, struct meta_seq_array *arr,
struct scoutfs_ioctl_meta_seq *start,
struct scoutfs_ioctl_meta_seq *last,
struct scoutfs_ioctl_meta_seq *ms, size_t nr)
{
size_t arr_last;
size_t a;
size_t m;
int nr_inos;
int cmp;
int i;
int n;
/* find where we're going to stop in arr */
arr_last = bsearch_ind(last, arr->ms, arr->nr, sizeof(arr->ms[0]), compar_ms_meta_seq);
if (arr_last < arr->nr && compar_ms_meta_seq(&arr->ms[arr_last], last) == 0)
arr_last++;
a = bsearch_ind(start, arr->ms, arr->nr, sizeof(arr->ms[0]), compar_ms_meta_seq);
for (m = 0, nr_inos = 0; (a < arr_last || m < nr) && nr_inos < INO_BATCH; ) {
prdebug("diffing: m %zu nr %zu | a %zu arr_last %zu | nr_inos %d",
m, nr, a, arr_last, nr_inos);
if (a < arr_last)
prdebug(" arr->ms[%zu] = "MSF, a, MSA(&arr->ms[a]));
if (m < nr)
prdebug(" ms[%zu] = "MSF, m, MSA(&ms[m]));
/* setup comparison to copy lesser or only */
if (a < arr_last && m < nr)
cmp = compar_ms_meta_seq(&arr->ms[a], &ms[m]);
else if (a < arr_last)
cmp = -1;
else
cmp = 1;
prdebug(" cmp %d", cmp);
if (cmp == 0) {
/* ignore both when they match */
a++;
m++;
} else if (cmp < 0) {
inos[nr_inos++] = arr->ms[a++].ino;
} else { /* cmp > 0 */
inos[nr_inos++] = ms[m++].ino;
}
}
/* if we didn't consume all the read meta_seq then we might need to clamp last */
if (m < nr && compar_ms_meta_seq(&ms[m], last) <= 0) {
*last = ms[m];
last->ino--; /* must be non-zero, can't wrap */
}
/* sort and remove duplicate inode numbers */
if (nr_inos > 0) {
qsort(inos, nr_inos, sizeof(inos[0]), compar_u64);
for (i = 1, n = 1; i < nr_inos; i++) {
if (inos[i] != inos[n - 1])
inos[n++] = inos[i];
}
nr_inos = n;
}
return nr_inos;
}
/*
* We're not really validating the result stream. We assume that the offset currently
* points at an inode. We fill the caller's ms with its info then iterate through
* all its results until the next ino.
*/
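/*
* The stream layout consumed below, inferred from this parsing code
* rather than a normative description of the ioctl:
*
*	[result hdr: type, size][__u64 ino][struct scoutfs_inode]
*	[result hdr: type, size][name '\0' value]   (xattrs, repeated)
*	[result hdr: type, size][__u64 ino][struct scoutfs_inode]
*	...
*/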
static ssize_t read_inode_results(void *buf, size_t off, size_t size,
struct scoutfs_ioctl_meta_seq *found)
{
struct scoutfs_ioctl_raw_read_result res;
size_t len;
__le64 ms;
found->ino = 0;
while (off < size) {
memcpy(&res, buf + off, sizeof(res));
prdebug("res %u %u", res.type, res.size);
if (res.type == SCOUTFS_IOC_RAW_READ_RESULT_INODE && found->ino != 0)
break;
off += sizeof(res);
switch(res.type) {
case SCOUTFS_IOC_RAW_READ_RESULT_INODE:
memcpy(&found->ino, buf + off, sizeof(__u64));
memcpy(&ms, buf + off + sizeof(__u64) +
offsetof(struct scoutfs_inode, meta_seq), sizeof(__le64));
found->meta_seq = le64_to_cpu(ms);
prdebug("res ino %llu ms %llu", found->ino, found->meta_seq);
break;
case SCOUTFS_IOC_RAW_READ_RESULT_XATTR:
len = strlen((char *)buf + off) + 1;
prdebug("res xattr '%s' len %d: '%.*s'",
(char *)buf + off,
(int)(res.size - len),
(int)(res.size - len),
(char *)buf + off + len);
break;
}
off += res.size;
}
return off;
}
/*
* inos[] contains the inode numbers that we're interested in. Get
* their info and update our array with what we find.
*/
static int read_inode_info(int fd, void *buf, struct meta_seq_array *arr, __u64 *inos, int nr_inos)
{
struct scoutfs_ioctl_raw_read_inode_info rii;
struct scoutfs_ioctl_meta_seq found;
struct scoutfs_ioctl_meta_seq ms;
ssize_t off;
size_t size;
size_t ind;
size_t added;
int i;
int ret;
rii = (struct scoutfs_ioctl_raw_read_inode_info) {
.inos_ptr = (unsigned long)inos,
.inos_count = nr_inos,
.names_ptr = (unsigned long)opts.names,
.names_count = opts.names_count,
.results_ptr = (unsigned long)buf,
.results_size = RESULTS_SIZE,
};
begin_call(&stats.rii);
ret = ioctl(fd, SCOUTFS_IOC_RAW_READ_INODE_INFO, &rii);
if (ret < 0) {
ret = -errno;
prerror("READ_INODE_INFO ioctl failed: "NERRF, NERRA(ret));
goto out;
}
end_call(&stats.rii);
prdebug("gii ret %d", ret);
off = 0;
size = ret;
set_ms(&found, 0, 0);
added = 0;
i = 0;
/* sort by ino so we can search by ino for updates */
qsort(arr->ms, arr->nr, sizeof(arr->ms[0]), compar_ms_ino);
while (i < nr_inos) {
/* find next ino */
if (!found.ino && off < size) {
off = read_inode_results(buf, off, size, &found);
if (off < 0) {
ret = off;
goto out;
}
stats.rii.inos++;
}
if (i < nr_inos && (!found.ino || inos[i] < found.ino)) {
/* delete any record of inodes we didn't find */
set_ms(&ms, UINT64_MAX, inos[i]);
i++;
} else if (found.ino) {
/* update/add arr to match the found ino */
ms = found;
if (i < nr_inos && inos[i] == found.ino)
i++;
set_ms(&found, 0, 0);
}
/* find existing record */
ind = bsearch_ind(&ms, arr->ms, arr->nr, sizeof(arr->ms[0]), compar_ms_ino);
if (ind < arr->nr && arr->ms[ind].ino == ms.ino) {
/* update existing ino, can be marking for deletion */
prdebug("updating arr [%zu] ino %llu ms %llu -> %llu",
ind, ms.ino, arr->ms[ind].meta_seq, ms.meta_seq);
arr->ms[ind].meta_seq = ms.meta_seq;
if (ms.meta_seq == UINT64_MAX)
stats.remove++;
else
stats.update++;
} else if (ms.meta_seq != UINT64_MAX) {
/* append new found, maintaining existing sorting */
arr->ms[arr->nr + added] = ms;
prdebug("adding arr [%zu] ino %llu ms %llu",
arr->nr + added, ms.ino, ms.meta_seq);
added++;
stats.add++;
}
}
/* sort by seq again for next meta seq read */
arr->nr += added;
qsort(arr->ms, arr->nr, sizeof(arr->ms[0]), compar_ms_meta_seq);
/* and trim off any deletions */
while (arr->nr > 0 && arr->ms[arr->nr - 1].meta_seq == UINT64_MAX)
arr->nr--;
ret = 0;
out:
return ret;
}
static double secs(u64 a_ns, u64 b_ns)
{
return (double)(a_ns - b_ns) / NSEC_PER_SEC;
}
static double nr_per_sec(u64 nr, __u64 nsec)
{
if (nsec == 0)
return 0;
return (double)nr / secs(nsec, 0);
}
static void print_stats(void)
{
u64 now = get_ns();
if (secs(now, stats.last) < 1.0)
return;
if ((stats.lines++ % 16) == 0) {
printf("%6s | %-29s | %-23s | %-23s\n",
"", "inodes", "meta_seq", "inode_info");
printf("%6s | %8s %6s %6s %6s | %7s %7s %7s | %7s %7s %7s\n",
"now",
"total", "add", "remove", "update",
"calls", "inos", "inos/s",
"calls", "inos", "inos/s");
}
printf("%6.3lf | %8llu %6llu %6llu %6llu | %7llu %7llu %7.0lf | %7llu %7llu %7.0lf\n",
secs(now, stats.start),
stats.inodes, stats.add, stats.remove, stats.update,
stats.rms.calls, stats.rms.inos, nr_per_sec(stats.rms.inos, stats.rms.time),
stats.rii.calls, stats.rii.inos, nr_per_sec(stats.rii.inos, stats.rii.time));
stats.last = now;
{
struct stats save = stats;
stats = (struct stats) {
.start = save.start,
.last = save.last,
.lines = save.lines,
};
}
}
static void add_xattr(char *name)
{
size_t len_null;
char *names;
int ret;
len_null = strlen(name) + 1;
names = realloc(opts.names, opts.names_size + len_null);
if (!names) {
ret = -errno;
prerror("allocation of xattr names buffer failed: "NERRF, NERRA(ret));
exit(3);
}
memcpy(names + opts.names_size, name, len_null);
opts.names = names;
opts.names_size += len_null;
opts.names_count++;
}
static bool parse_opts(int argc, char **argv)
{
bool usage = false;
int c;
opts = (struct opts) {
.debug = false,
};
while ((c = getopt(argc, argv, "dp:x:")) != -1) {
switch(c) {
case 'd':
opts.debug = true;
break;
case 'p':
opts.path = strdup(optarg);
break;
case 'x':
add_xattr(optarg);
break;
case '?':
printf("Unknown option '%c'\n", optopt);
usage = true;
}
}
if (!usage) {
usage = true;
if (!opts.path)
printf("need -p path option\n");
else
usage = false;
}
if (usage) {
printf("\nusage:\n"
" -d | enable verbose debugging output\n"
" -p PATH | path to file system to watch\n"
" -x NAME | try to read named xattr with inodes, can be many\n"
);
return false;
}
return true;
}
int main(int argc, char **argv)
{
struct scoutfs_ioctl_raw_read_meta_seq rms = {0,};
struct scoutfs_ioctl_meta_seq *ms;
struct meta_seq_array arr = {0,};
__u64 *inos = NULL;
void *buf = NULL;
int fd = -1;
int nr_inos;
int nr;
int i;
int ret;
if (!parse_opts(argc, argv))
exit(1);
inos = calloc(INO_BATCH, sizeof(inos[0]));
buf = malloc(RESULTS_SIZE);
if (!inos || !buf) {
ret = -ENOMEM;
goto out;
}
rms.results_ptr = (unsigned long)buf;
rms.results_size = min(RESULTS_SIZE, INO_BATCH * sizeof(struct scoutfs_ioctl_meta_seq));
fd = open(opts.path, O_RDONLY);
if (fd == -1) {
perror("error");
exit(1);
}
stats.start = get_ns();
for (;;) {
set_ms(&rms.start, 0, 0);
set_ms(&rms.end, UINT64_MAX, UINT64_MAX);
do {
begin_call(&stats.rms);
ret = ioctl(fd, SCOUTFS_IOC_RAW_READ_META_SEQ, &rms);
if (ret < 0) {
ret = -errno;
prerror("READ_META_SEQ ioctl failed, "
"start "MSF" end "MSF", "NERRF,
MSA(&rms.start), MSA(&rms.end), NERRA(ret));
goto out;
}
end_call(&stats.rms);
stats.rms.inos += ret;
prdebug("RMS last "MSF" ret %d:", MSA(&rms.last), ret);
nr = ret;
ms = buf;
if (opts.debug && nr > 0) {
for (i = 0; i < nr; i++)
prdebug(" [%u] "MSF"", i, MSA(&ms[i]));
}
nr_inos = differing_inos(inos, &arr, &rms.start, &rms.last, ms, nr);
if (nr_inos > 0) {
prdebug("diff inos %d:", nr_inos);
for (i = 0; i < nr_inos; i++)
prdebug(" [%u] %llu", i, inos[i]);
ret = expand_array(&arr, nr_inos) ?:
read_inode_info(fd, buf, &arr, inos, nr_inos);
if (ret < 0)
goto out;
}
stats.inodes = arr.nr;
print_stats();
rms.start = rms.last;
inc_ms(&rms.start);
} while (rms.last.meta_seq != UINT64_MAX || rms.last.ino != UINT64_MAX);
sleep(1);
}
ret = 0;
out:
if (fd >= 0)
close(fd);
free(inos);
free(buf);
free(arr.ms);
free(opts.names);
return ret;
}
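/*
* An invocation sketch (the binary name here is hypothetical; -p and -x
* are the options parsed above):
*
*	watch-inodes -p /mnt/test -x user.example_a -x user.example_b
*
* This prints a stats line at most once a second as inodes change.
*/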

View File

@@ -0,0 +1,117 @@
#
# Test basic clustered posix acl consistency.
#
t_require_commands getfacl setfacl
GETFACL="getfacl --absolute-names"
filter_scratch() {
sed "s@$T_MSCR@t_mscr@g"
}
acl_compare()
{
diff -u - <($GETFACL $T_MSCR/data/dir_a/dir_b | filter_scratch) <<EOF1
# file: t_mscr/data/dir_a/dir_b
# owner: t_usr_3
# group: t_grp_3
# flags: -s-
user::rwx
group::rwx
group:t_grp_2:r-x
mask::rwx
other::---
default:user::rwx
default:group::rwx
default:group:t_grp_2:r-x
default:group:t_grp_3:rwx
default:mask::rwx
default:other::---
EOF1
test $? -eq 0 || t_fail "dir_b differs"
diff -u - <($GETFACL -p $T_MSCR/data/dir_a/dir_b/dir_c/dir_d | filter_scratch) <<EOF3
# file: t_mscr/data/dir_a/dir_b/dir_c/dir_d
# owner: t_usr_1
# group: t_grp_1
# flags: -s-
user::rwx
group::rwx
group:t_grp_2:r-x
mask::rwx
other::---
default:user::rwx
default:group::rwx
default:group:t_grp_2:r-x
default:group:t_grp_3:rwx
default:mask::rwx
default:other::---
EOF3
test $? -eq 0 || t_fail "dir_d differs"
diff -u - <($GETFACL $T_MSCR/data/dir_a/dir_b/dir_c | filter_scratch) <<EOF2
# file: t_mscr/data/dir_a/dir_b/dir_c
# owner: t_usr_3
# group: t_grp_2
# flags: -s-
user::rwx
group::rwx
group:t_grp_2:r-x
mask::rwx
other::---
default:user::rwx
default:group::rwx
default:group:t_grp_2:r-x
default:group:t_grp_3:rwx
default:mask::rwx
default:other::---
EOF2
test $? -eq 0 || t_fail "dir_c differs"
}
echo "== make scratch fs"
t_scratch_mkfs
t_scratch_mount
rm -rf $T_MSCR/data
echo "== create uid/gids"
groupadd -g 7101 t_grp_1 > /dev/null 2>&1
useradd -g 7101 -u 7101 t_usr_1 > /dev/null 2>&1
groupadd -g 7102 t_grp_2 > /dev/null 2>&1
groupadd -g 7103 t_grp_3 > /dev/null 2>&1
useradd -g 7103 -u 7103 t_usr_3 > /dev/null 2>&1
echo "== set acls and permissions"
mkdir -p $T_MSCR/data/dir_a/dir_b
chown t_usr_3:t_grp_3 $T_MSCR/data/dir_a/dir_b
chmod 2770 $T_MSCR/data/dir_a/dir_b
setfacl -m g:t_grp_2:rx $T_MSCR/data/dir_a/dir_b
setfacl -m d:g:t_grp_2:rx $T_MSCR/data/dir_a/dir_b
setfacl -m d:g:t_grp_3:rwx $T_MSCR/data/dir_a/dir_b
mkdir -p $T_MSCR/data/dir_a/dir_b/dir_c
chown t_usr_3:t_grp_2 $T_MSCR/data/dir_a/dir_b/dir_c
setfacl -x g:t_grp_3 $T_MSCR/data/dir_a/dir_b/dir_c
mkdir -p $T_MSCR/data/dir_a/dir_b/dir_c/dir_d
chown t_usr_1:t_grp_1 $T_MSCR/data/dir_a/dir_b/dir_c/dir_d
setfacl -x g:t_grp_3 $T_MSCR/data/dir_a/dir_b/dir_c/dir_d
echo "== compare output"
acl_compare
echo "== drop caches and compare again"
sync
echo 3 > /proc/sys/vm/drop_caches
acl_compare
echo "== cleanup scratch fs"
t_scratch_umount
t_pass

View File

@@ -12,25 +12,22 @@ mount_fail()
}
echo "== prepare devices, mount point, and logs"
SCR="$T_TMPDIR/mnt.scratch"
mkdir -p "$SCR"
t_scratch_mkfs
> $T_TMP.mount.out
scoutfs mkfs -f -Q 0,127.0.0.1,$T_SCRATCH_PORT "$T_EX_META_DEV" "$T_EX_DATA_DEV" > $T_TMP.mkfs.out 2>&1 \
|| t_fail "mkfs failed"
echo "== bad devices, bad options"
mount_fail -o _bad /dev/null /dev/null "$SCR"
mount_fail -o _bad /dev/null /dev/null "$T_MSCR"
echo "== swapped devices"
mount_fail -o metadev_path=$T_EX_DATA_DEV,quorum_slot_nr=0 "$T_EX_META_DEV" "$SCR"
mount_fail -o metadev_path=$T_EX_DATA_DEV,quorum_slot_nr=0 "$T_EX_META_DEV" "$T_MSCR"
echo "== both meta devices"
mount_fail -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 "$T_EX_META_DEV" "$SCR"
mount_fail -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 "$T_EX_META_DEV" "$T_MSCR"
echo "== both data devices"
mount_fail -o metadev_path=$T_EX_DATA_DEV,quorum_slot_nr=0 "$T_EX_DATA_DEV" "$SCR"
mount_fail -o metadev_path=$T_EX_DATA_DEV,quorum_slot_nr=0 "$T_EX_DATA_DEV" "$T_MSCR"
echo "== good volume, bad option and good options"
mount_fail -o _bad,metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 "$T_EX_DATA_DEV" "$SCR"
mount_fail -o _bad,metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 "$T_EX_DATA_DEV" "$T_MSCR"
t_pass

View File

@@ -0,0 +1,143 @@
#
# Test basic .indx. xattr tag functionality and index entry lifecycle
#
t_require_commands touch rm setfattr scoutfs stat
t_require_mounts 2
# query index from a specific mount, default mount 0
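# sync and drop the weak item cache first so the query is answered from
# persistent index items rather than the local item cache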
read_xattr_index()
{
local nr="${1:-0}"
local mnt="$(eval echo \$T_M$nr)"
shift
sync
echo 1 > $(t_debugfs_path $nr)/drop_weak_item_cache
scoutfs read-xattr-index -p "$mnt" "$@"
}
MAJOR=5
MINOR=100
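# the tests below tag files with xattr names of the form
# scoutfs.hide.indx.<tag>.<major>.<minor> and query the index with
# major.minor.ino position triples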
echo "== testing invalid read-xattr-index arguments"
scoutfs read-xattr-index -p "$T_M0" bad 2>&1
scoutfs read-xattr-index -p "$T_M0" 1.2 2>&1
scoutfs read-xattr-index -p "$T_M0" 1.2.3 256.0.0 2>&1
scoutfs read-xattr-index -p "$T_M0" 1.2.3 0.0.0 2>&1
scoutfs read-xattr-index -p "$T_M0" 1.2.0 1.1.2 2>&1
scoutfs read-xattr-index -p "$T_M0" 2.2.2 2.2.1 2>&1
echo "== testing invalid names"
touch "$T_D0/invalid"
setfattr -n scoutfs.hide.indx.test.$MAJOR "$T_D0/invalid" 2>&1 | t_filter_fs
setfattr -n scoutfs.hide.indx.test "$T_D0/invalid" 2>&1 | t_filter_fs
setfattr -n scoutfs.hide.indx.test.. "$T_D0/invalid" 2>&1 | t_filter_fs
setfattr -n scoutfs.hide.indx.test..$MINOR "$T_D0/invalid" 2>&1 | t_filter_fs
setfattr -n scoutfs.hide.indx.test.$MAJOR. "$T_D0/invalid" 2>&1 | t_filter_fs
setfattr -n scoutfs.hide.indx.test.256.$MINOR "$T_D0/invalid" 2>&1 | t_filter_fs
setfattr -n scoutfs.hide.indx.test.abc.$MINOR "$T_D0/invalid" 2>&1 | t_filter_fs
setfattr -n scoutfs.hide.indx.test.$MAJOR.abc "$T_D0/invalid" 2>&1 | t_filter_fs
setfattr -n scoutfs.hide.indx.test.-1.$MINOR "$T_D0/invalid" 2>&1 | t_filter_fs
setfattr -n scoutfs.hide.indx.test.$MAJOR.-1 "$T_D0/invalid" 2>&1 | t_filter_fs
setfattr -n scoutfs.hide.indx.test.18446744073709551616.$MINOR "$T_D0/invalid" 2>&1 | t_filter_fs
setfattr -n scoutfs.hide.indx.$(printf 'x%.0s' $(seq 1 240)).$MAJOR.$MINOR "$T_D0/invalid" 2>&1 | t_filter_fs
rm -f "$T_D0/invalid"
echo "== testing boundary values"
touch "$T_D0/boundary"
INO=$(stat -c "%i" "$T_D0/boundary")
setfattr -n scoutfs.hide.indx.test.0.0 "$T_D0/boundary"
read_xattr_index 0 0.0.0 0.0.-1 | awk '($3 == "'$INO'") {print "0.0 found"}'
setfattr -x scoutfs.hide.indx.test.0.0 "$T_D0/boundary"
setfattr -n scoutfs.hide.indx.test.255.18446744073709551615 "$T_D0/boundary"
read_xattr_index 0 255.0.0 255.-1.-1 | awk '($3 == "'$INO'") {print "255.max found"}'
setfattr -x scoutfs.hide.indx.test.255.18446744073709551615 "$T_D0/boundary"
rm -f "$T_D0/boundary"
echo "== indx xattr must have no value"
touch "$T_D0/noval"
setfattr -n scoutfs.hide.indx.test.$MAJOR.$MINOR -v "" "$T_D0/noval" 2>&1 | t_filter_fs
setfattr -n scoutfs.hide.indx.test.$MAJOR.$MINOR -v 0 "$T_D0/noval" 2>&1 | t_filter_fs
setfattr -n scoutfs.hide.indx.test.$MAJOR.$MINOR -v 1 "$T_D0/noval" 2>&1 | t_filter_fs
rm -f "$T_D0/noval"
echo "== set indx xattr and verify index entry"
touch "$T_D0/file"
INO=$(stat -c "%i" "$T_D0/file")
setfattr -n scoutfs.hide.indx.test.$MAJOR.$MINOR "$T_D0/file"
read_xattr_index 0 $MAJOR.0.0 $MAJOR.-1.-1 | awk '($3 == "'$INO'") {print "found"}'
echo "== setting same indx xattr again is a no-op"
setfattr -n scoutfs.hide.indx.test.$MAJOR.$MINOR "$T_D0/file"
read_xattr_index 0 $MAJOR.0.0 $MAJOR.-1.-1 | awk '($3 == "'$INO'") {print "found"}'
echo "== removing non-existent indx xattr succeeds"
setfattr -x scoutfs.hide.indx.nonexistent.$MAJOR.999 "$T_D0/file" 2>&1 | t_filter_fs
read_xattr_index 0 $MAJOR.0.0 $MAJOR.-1.-1 | awk '($3 == "'$INO'") {print "still found"}'
echo "== explicit xattr removal cleans up index entry"
setfattr -x scoutfs.hide.indx.test.$MAJOR.$MINOR "$T_D0/file"
read_xattr_index 0 $MAJOR.0.0 $MAJOR.-1.-1 | awk '($3 == "'$INO'") {print "found orphan"}'
rm -f "$T_D0/file"
echo "== file deletion cleans up index entry"
touch "$T_D0/file2"
INO=$(stat -c "%i" "$T_D0/file2")
setfattr -n scoutfs.hide.indx.test.$MAJOR.$MINOR "$T_D0/file2"
read_xattr_index 0 $MAJOR.0.0 $MAJOR.-1.-1 | awk '($3 == "'$INO'") {print "found before delete"}'
rm -f "$T_D0/file2"
read_xattr_index 0 $MAJOR.0.0 $MAJOR.-1.-1 | awk '($3 == "'$INO'") {print "found orphan after delete"}'
echo "== multiple indx xattrs on one file cleaned up by deletion"
touch "$T_D0/file3"
INO=$(stat -c "%i" "$T_D0/file3")
setfattr -n scoutfs.hide.indx.a.$MAJOR.200 "$T_D0/file3"
setfattr -n scoutfs.hide.indx.b.$MAJOR.300 "$T_D0/file3"
BEFORE=$(read_xattr_index 0 $MAJOR.0.0 $MAJOR.-1.-1 | awk '($3 == "'$INO'")' | wc -l)
echo "entries before delete: $BEFORE"
rm -f "$T_D0/file3"
AFTER=$(read_xattr_index 0 $MAJOR.0.0 $MAJOR.-1.-1 | awk '($3 == "'$INO'")' | wc -l)
echo "entries after delete: $AFTER"
echo "== partial removal leaves other entries"
touch "$T_D0/partial"
INO=$(stat -c "%i" "$T_D0/partial")
setfattr -n scoutfs.hide.indx.a.$MAJOR.200 "$T_D0/partial"
setfattr -n scoutfs.hide.indx.b.$MAJOR.300 "$T_D0/partial"
setfattr -x scoutfs.hide.indx.a.$MAJOR.200 "$T_D0/partial"
read_xattr_index 0 $MAJOR.200.0 $MAJOR.200.-1 | awk '($3 == "'$INO'") {print "200 found"}'
read_xattr_index 0 $MAJOR.300.0 $MAJOR.300.-1 | awk '($3 == "'$INO'") {print "300 found"}'
rm -f "$T_D0/partial"
echo "== multiple files at same index position"
touch "$T_D0/multi_a" "$T_D0/multi_b"
INO_A=$(stat -c "%i" "$T_D0/multi_a")
INO_B=$(stat -c "%i" "$T_D0/multi_b")
setfattr -n scoutfs.hide.indx.test.$MAJOR.$MINOR "$T_D0/multi_a"
setfattr -n scoutfs.hide.indx.test.$MAJOR.$MINOR "$T_D0/multi_b"
COUNT=$(read_xattr_index 0 $MAJOR.$MINOR.0 $MAJOR.$MINOR.-1 | wc -l)
echo "files at same position: $COUNT"
rm -f "$T_D0/multi_a"
read_xattr_index 0 $MAJOR.$MINOR.0 $MAJOR.$MINOR.-1 | awk '($3 == "'$INO_A'") {print "deleted file still found"}'
read_xattr_index 0 $MAJOR.$MINOR.0 $MAJOR.$MINOR.-1 | awk '($3 == "'$INO_B'") {print "surviving file found"}'
rm -f "$T_D0/multi_b"
echo "== cross-mount visibility"
touch "$T_D0/file4"
INO=$(stat -c "%i" "$T_D0/file4")
setfattr -n scoutfs.hide.indx.test.$MAJOR.$MINOR "$T_D0/file4"
read_xattr_index 1 $MAJOR.0.0 $MAJOR.-1.-1 | awk '($3 == "'$INO'") {print "found on mount 1"}'
rm -f "$T_D0/file4"
read_xattr_index 1 $MAJOR.0.0 $MAJOR.-1.-1 | awk '($3 == "'$INO'") {print "found orphan on mount 1"}'
echo "== duplicate position deduplication"
touch "$T_D0/file5"
INO=$(stat -c "%i" "$T_D0/file5")
setfattr -n scoutfs.hide.indx.aa.$MAJOR.$MINOR "$T_D0/file5"
setfattr -n scoutfs.hide.indx.bb.$MAJOR.$MINOR "$T_D0/file5"
COUNT=$(read_xattr_index 0 $MAJOR.0.0 $MAJOR.-1.-1 | awk '($3 == "'$INO'")' | wc -l)
echo "entries for same position: $COUNT"
rm -f "$T_D0/file5"
t_pass

View File

@@ -11,9 +11,8 @@ truncate -s $sz "$T_TMP.equal"
truncate -s $large_sz "$T_TMP.large"
echo "== make scratch fs"
t_quiet scoutfs mkfs -f -Q 0,127.0.0.1,$T_SCRATCH_PORT "$T_EX_META_DEV" "$T_EX_DATA_DEV"
SCR="$T_TMPDIR/mnt.scratch"
mkdir -p "$SCR"
t_scratch_mkfs
mkdir -p "$T_MSCR"
echo "== small new data device fails"
t_rc scoutfs prepare-empty-data-device "$T_EX_META_DEV" "$T_TMP.small"
@@ -23,13 +22,13 @@ t_rc scoutfs prepare-empty-data-device --check "$T_EX_META_DEV" "$T_TMP.small"
t_rc scoutfs prepare-empty-data-device --check "$T_EX_META_DEV"
echo "== preparing while mounted fails"
mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 "$T_EX_DATA_DEV" "$SCR"
mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 "$T_EX_DATA_DEV" "$T_MSCR"
t_rc scoutfs prepare-empty-data-device "$T_EX_META_DEV" "$T_TMP.equal"
umount "$SCR"
umount "$T_MSCR"
echo "== preparing without recovery fails"
mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 "$T_EX_DATA_DEV" "$SCR"
umount -f "$SCR"
mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 "$T_EX_DATA_DEV" "$T_MSCR"
umount -f "$T_MSCR"
t_rc scoutfs prepare-empty-data-device "$T_EX_META_DEV" "$T_TMP.equal"
echo "== check sees metadata errors"
@@ -37,16 +36,16 @@ t_rc scoutfs prepare-empty-data-device --check "$T_EX_META_DEV"
t_rc scoutfs prepare-empty-data-device --check "$T_EX_META_DEV" "$T_TMP.equal"
echo "== preparing with file data fails"
mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 "$T_EX_DATA_DEV" "$SCR"
echo hi > "$SCR"/file
umount "$SCR"
mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 "$T_EX_DATA_DEV" "$T_MSCR"
echo hi > "$T_MSCR"/file
umount "$T_MSCR"
scoutfs print "$T_EX_META_DEV" > "$T_TMP.print"
t_rc scoutfs prepare-empty-data-device "$T_EX_META_DEV" "$T_TMP.equal"
echo "== preparing after emptied"
mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 "$T_EX_DATA_DEV" "$SCR"
rm -f "$SCR"/file
umount "$SCR"
mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 "$T_EX_DATA_DEV" "$T_MSCR"
rm -f "$T_MSCR"/file
umount "$T_MSCR"
t_rc scoutfs prepare-empty-data-device "$T_EX_META_DEV" "$T_TMP.equal"
echo "== checks pass"
@@ -55,22 +54,22 @@ t_rc scoutfs prepare-empty-data-device --check "$T_EX_META_DEV" "$T_TMP.equal"
echo "== using prepared"
scr_loop=$(losetup --find --show "$T_TMP.equal")
mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 "$scr_loop" "$SCR"
touch "$SCR"/equal_prepared
equal_tot=$(scoutfs statfs -s total_data_blocks -p "$SCR")
umount "$SCR"
mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 "$scr_loop" "$T_MSCR"
touch "$T_MSCR"/equal_prepared
equal_tot=$(scoutfs statfs -s total_data_blocks -p "$T_MSCR")
umount "$T_MSCR"
losetup -d "$scr_loop"
echo "== preparing larger and resizing"
t_rc scoutfs prepare-empty-data-device "$T_EX_META_DEV" "$T_TMP.large"
scr_loop=$(losetup --find --show "$T_TMP.large")
mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 "$scr_loop" "$SCR"
touch "$SCR"/large_prepared
ls "$SCR"
scoutfs resize-devices -p "$SCR" -d $large_sz
large_tot=$(scoutfs statfs -s total_data_blocks -p "$SCR")
mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 "$scr_loop" "$T_MSCR"
touch "$T_MSCR"/large_prepared
ls "$T_MSCR"
scoutfs resize-devices -p "$T_MSCR" -d $large_sz
large_tot=$(scoutfs statfs -s total_data_blocks -p "$T_MSCR")
test "$large_tot" -gt "$equal_tot" ; echo "resized larger test rc: $?"
umount "$SCR"
umount "$T_MSCR"
losetup -d "$scr_loop"
echo "== cleanup"

View File

@@ -54,21 +54,16 @@ after=$(free_blocks Data "$T_M0")
test "$before" == "$after" || \
t_fail "$after free data blocks after rm, expected $before"
# XXX this is all pretty manual, would be nice to have helpers
echo "== make small meta fs"
# meta device just big enough for reserves and the metadata we'll fill
scoutfs mkfs -A -f -Q 0,127.0.0.1,$T_SCRATCH_PORT -m 10G "$T_EX_META_DEV" "$T_EX_DATA_DEV" > $T_TMP.mkfs.out 2>&1 || \
t_fail "mkfs failed"
SCR="$T_TMPDIR/mnt.scratch"
mkdir -p "$SCR"
mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 \
"$T_EX_DATA_DEV" "$SCR"
t_scratch_mkfs -A -m 10G
t_scratch_mount
echo "== create large xattrs until we fill up metadata"
mkdir -p "$SCR/xattrs"
mkdir -p "$T_MSCR/xattrs"
for f in $(seq 1 100000); do
file="$SCR/xattrs/file-$f"
file="$T_MSCR/xattrs/file-$f"
touch "$file"
LC_ALL=C create_xattr_loop -c 1000 -n user.scoutfs-enospc -p "$file" -s 65535 > $T_TMP.cxl 2>&1
@@ -84,10 +79,10 @@ for f in $(seq 1 100000); do
done
echo "== remove files with xattrs after enospc"
rm -rf "$SCR/xattrs"
rm -rf "$T_MSCR/xattrs"
echo "== make sure we can create again"
file="$SCR/file-after"
file="$T_MSCR/file-after"
C=120
while (( C-- )); do
touch $file 2> /dev/null && break
@@ -99,7 +94,6 @@ sync
rm -f "$file"
echo "== cleanup small meta fs"
umount "$SCR"
rmdir "$SCR"
t_scratch_umount
t_pass

View File

@@ -5,6 +5,9 @@
t_require_commands sleep touch grep sync scoutfs
t_require_mounts 2
# regularly see runs take ~20-30s
VERIFY_TIMEOUT_SECS=90
#
# Make sure that all mounts can read the results of a write from each
# mount.
@@ -40,8 +43,10 @@ verify_fenced_run()
for rid in $rids; do
grep -q ".* running rid '$rid'.* args 'ignored run args'" "$T_FENCED_LOG" || \
t_fail "fenced didn't execute RUN script for rid $rid"
return 1
done
return 0
}
echo "== make sure all mounts can see each other"
@@ -54,14 +59,7 @@ rid=$(t_mount_rid $cl)
echo "cl $cl sv $sv rid $rid" >> "$T_TMP.log"
sync
t_force_umount $cl
# wait for client reconnection to timeout
while grep -q $rid $(t_debugfs_path $sv)/connections; do
sleep .5
done
while t_rid_is_fencing $rid; do
sleep .5
done
verify_fenced_run $rid
t_wait_until_timeout $VERIFY_TIMEOUT_SECS verify_fenced_run $rid
t_mount $cl
check_read_write
@@ -83,15 +81,7 @@ for cl in $(t_fs_nrs); do
t_force_umount $cl
done
# wait for all client reconnections to timeout
while egrep -q "($pattern)" $(t_debugfs_path $sv)/connections; do
sleep .5
done
# wait for all fence requests to complete
while test -d $(echo /sys/fs/scoutfs/*/fence/* | cut -d " " -f 1); do
sleep .5
done
verify_fenced_run $rids
t_wait_until_timeout $VERIFY_TIMEOUT_SECS verify_fenced_run $rids
# remount all the clients
for cl in $(t_fs_nrs); do
if [ $cl == $sv ]; then
@@ -107,12 +97,7 @@ rid=$(t_mount_rid $sv)
echo "sv $sv rid $rid" >> "$T_TMP.log"
sync
t_force_umount $sv
t_wait_for_leader
# wait until new server is done fencing unmounted leader rid
while t_rid_is_fencing $rid; do
sleep .5
done
verify_fenced_run $rid
t_wait_until_timeout $VERIFY_TIMEOUT_SECS verify_fenced_run $rid
t_mount $sv
check_read_write
@@ -127,11 +112,7 @@ for nr in $(t_fs_nrs); do
t_force_umount $nr
done
t_mount_all
# wait for all fence requests to complete
while test -d $(echo /sys/fs/scoutfs/*/fence/* | cut -d " " -f 1); do
sleep .5
done
verify_fenced_run $rids
t_wait_until_timeout $VERIFY_TIMEOUT_SECS verify_fenced_run $rids
check_read_write
t_pass

View File

@@ -72,7 +72,7 @@ touch $T_D0/dir/file
mkdir $T_D0/dir/dir
ln -s $T_D0/dir/file $T_D0/dir/symlink
mknod $T_D0/dir/char c 1 3 # null
mknod $T_D0/dir/block b 7 0 # loop0
mknod $T_D0/dir/block b 42 0 # sample block dev - nonexistent major, demo-only number
for name in $(ls -UA $T_D0/dir | sort); do
ino=$(stat -c '%i' $T_D0/dir/$name)
$GRE $ino | filter_types

View File

@@ -53,26 +53,40 @@ exec {FD1}>&- # close
exec {FD2}>&- # close
check_ino_index "$ino" "$dseq" "$T_M0"
# Hurry along the orphan scanners. If any are currently asleep, we will
# have to wait at least their current scan interval before they wake up,
# run, and notice their new interval.
t_save_all_sysfs_mount_options orphan_scan_delay_ms
t_set_all_sysfs_mount_options orphan_scan_delay_ms 500
t_wait_for_orphan_scan_runs
echo "== remote unopened unlink deletes"
echo "contents" > "$T_D0/file"
ino=$(stat -c "%i" "$T_D0/file")
dseq=$(scoutfs stat -s data_seq "$T_D0/file")
rm -f "$T_D1/file"
# cross-mount deletion falls back to the orphan scanner when the
# creating mount still has the inode cached, wait for it to complete
t_force_log_merge
# wait for orphan scanners to pick up the unlinked inode and become idle
t_wait_for_no_orphans
check_ino_index "$ino" "$dseq" "$T_M0"
check_ino_index "$ino" "$dseq" "$T_M1"
echo "== unlink wait for open on other mount"
echo "contents" > "$T_D0/file"
ino=$(stat -c "%i" "$T_D0/file")
dseq=$(scoutfs stat -s data_seq "$T_D0/file")
exec {FD}<"$T_D0/file"
rm -f "$T_D1/file"
echo "contents" > "$T_D0/badfile"
ino=$(stat -c "%i" "$T_D0/badfile")
dseq=$(scoutfs stat -s data_seq "$T_D0/badfile")
exec {FD}<"$T_D0/badfile"
rm -f "$T_D1/badfile"
echo "mount 0 contents after mount 1 rm: $(cat <&$FD)"
check_ino_index "$ino" "$dseq" "$T_M0"
check_ino_index "$ino" "$dseq" "$T_M1"
exec {FD}>&- # close
# we know that revalidating will unhash the remote dentry
stat "$T_D0/file" 2>&1 | sed 's/cannot statx/cannot stat/' | t_filter_fs
stat "$T_D0/badfile" 2>&1 | sed 's/cannot statx/cannot stat/' | t_filter_fs
t_force_log_merge
t_wait_for_no_orphans
check_ino_index "$ino" "$dseq" "$T_M0"
check_ino_index "$ino" "$dseq" "$T_M1"
@@ -83,16 +97,20 @@ rm -f "$T_D0/dir"/files-*
rmdir "$T_D0/dir"
echo "== open files survive remote scanning orphans"
echo "contents" > "$T_D0/file"
ino=$(stat -c "%i" "$T_D0/file")
dseq=$(scoutfs stat -s data_seq "$T_D0/file")
exec {FD}<"$T_D0/file"
rm -f "$T_D0/file"
echo "contents" > "$T_D0/lastfile"
ino=$(stat -c "%i" "$T_D0/lastfile")
dseq=$(scoutfs stat -s data_seq "$T_D0/lastfile")
exec {FD}<"$T_D0/lastfile"
rm -f "$T_D0/lastfile"
t_umount 1
t_mount 1
echo "mount 0 contents after mount 1 remounted: $(cat <&$FD)"
exec {FD}>&- # close
t_force_log_merge
t_wait_for_no_orphans
check_ino_index "$ino" "$dseq" "$T_M0"
check_ino_index "$ino" "$dseq" "$T_M1"
t_restore_all_sysfs_mount_options orphan_scan_delay_ms
t_pass

View File

@@ -5,7 +5,7 @@
t_require_commands mmap_stress mmap_validate scoutfs xfs_io
echo "== mmap_stress"
mmap_stress 8192 2000 "$T_D0/mmap_stress" "$T_D1/mmap_stress" "$T_D2/mmap_stress" "$T_D3/mmap_stress" "$T_D4/mmap_stress" | sed 's/:.*//g' | sort
mmap_stress 8192 30 "$T_D0/mmap_stress" "$T_D0/mmap_stress" "$T_D0/mmap_stress" "$T_D3/mmap_stress" "$T_D3/mmap_stress" | sed 's/:.*//g' | sort
echo "== basic mmap/read/write consistency checks"
mmap_validate 256 1000 "$T_D0/mmap_val1" "$T_D1/mmap_val1"

View File

@@ -0,0 +1,52 @@
#
# Test that orphaned log_trees entries from unmounted rids are
# finalized and merged.
#
# An orphan log_trees entry is one whose rid has no mounted_clients
# entry. This can happen from incomplete reclaim across server
# failovers. We simulate it with the reclaim_skip_finalize trigger
# which makes reclaim_open_log_tree skip the finalization step.
#
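# As a sketch only (print output format and device path assumed, not
# used by this test), such an orphan could be spotted by hand by
# comparing the rids of log_trees items against mounted_clients entries
# in a metadata dump:
#
#   scoutfs print "$T_EX_META_DEV" | grep -E 'log_trees|mounted_clients'
#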
t_require_commands touch scoutfs
t_require_mounts 2
TIMEOUT=90
echo "== create orphan log_trees entry via trigger"
sv=$(t_server_nr)
cl=$(t_first_client_nr)
rid=$(t_mount_rid $cl)
touch "$T_D0/file" "$T_D1/file"
sync
# arm the trigger so reclaim skips finalization
t_trigger_arm_silent reclaim_skip_finalize $sv
# force unmount the client, server will fence and reclaim it
# but the trigger makes reclaim leave log_trees unfinalized
t_force_umount $cl
# wait for fencing to run
verify_fenced() {
grep -q "running rid '$rid'" "$T_FENCED_LOG" 2>/dev/null
}
t_wait_until_timeout $TIMEOUT verify_fenced
# give the server time to complete reclaim after fence
sleep 5
# remount the client so t_force_log_merge can sync all mounts.
# the client gets a new rid; the old rid's log_trees is the orphan.
t_mount $cl
echo "== verify orphan is reclaimed and merge completes"
t_force_log_merge
echo "== verify orphan reclaim was logged"
if ! dmesg | grep -q "reclaiming orphan log trees for rid $rid"; then
t_fail "expected orphan reclaim message for rid $rid in dmesg"
fi
t_pass

View File

@@ -0,0 +1,152 @@
t_require_commands scoutfs dd fallocate
FILE="$T_D0/file"
DIR="$T_D0/dir"
echo "== missing options should fail =="
rm -rf $DIR && mkdir -p $DIR
scoutfs punch-offline $DIR -l 4096 -V 0
scoutfs punch-offline $DIR -o 0 -V 0
scoutfs punch-offline $DIR -o 0 -l 4096
echo "== can't hole punch dir or special =="
rm -rf $DIR && mkdir -p $DIR
scoutfs punch-offline $DIR -o 0 -l 4096 -V 0
echo "== punching an empty file does nothing =="
rm -f $FILE && touch $FILE
scoutfs punch-offline $FILE -o 0 -l 4096 -V 0
echo "== punch outside of i_size does nothing =="
dd if=/dev/zero of=$FILE bs=4096 count=1 status=none
scoutfs punch-offline $FILE -o 4096 -l 4096 -V 1
echo "== can't hole punch online extent =="
scoutfs get-fiemap -Lb $FILE
scoutfs punch-offline $FILE -o 0 -l 4096 -V 1
scoutfs get-fiemap -Lb $FILE
echo "== can't hole punch unwritten extent =="
rm -rf $FILE && touch $FILE
fallocate -l $((4096 * 3)) $FILE
vers=$(scoutfs stat -s data_version "$FILE")
scoutfs get-fiemap -Lb $FILE
scoutfs punch-offline $FILE -o 4096 -l 4096 -V $vers
scoutfs get-fiemap -Lb $FILE
echo "== hole punch offline extent =="
rm -rf $FILE && touch $FILE
fallocate -l $((4096 * 3)) $FILE
vers=$(scoutfs stat -s data_version "$FILE")
scoutfs release $FILE --data-version $vers
scoutfs get-fiemap -Lb $FILE
scoutfs punch-offline $FILE -o 4096 -l 4096 -V $vers
scoutfs get-fiemap -Lb $FILE
echo "== can't hole punch non-aligned bsz offset or len =="
rm -rf $FILE && touch $FILE
fallocate -l $((4096 * 3)) $FILE
vers=$(scoutfs stat -s data_version "$FILE")
scoutfs release $FILE --data-version $vers
scoutfs get-fiemap -Lb $FILE
scoutfs punch-offline $FILE -o 4095 -l 4096 -V $vers
scoutfs punch-offline $FILE -o 1 -l 4096 -V $vers
scoutfs punch-offline $FILE -o 4096 -l 409700 -V $vers
scoutfs punch-offline $FILE -o 4096 -l 4097 -V $vers
scoutfs punch-offline $FILE -o 4096 -l 4095 -V $vers
scoutfs punch-offline $FILE -o 4096 -l 1 -V $vers
scoutfs punch-offline $FILE -o 4096 -l 0 -V $vers
scoutfs get-fiemap -Lb $FILE
echo "== can't hole punch mismatched data_version =="
rm -rf $FILE && touch $FILE
fallocate -l $((4096 * 3)) $FILE
vers=$(scoutfs stat -s data_version "$FILE")
scoutfs release $FILE --data-version $vers
scoutfs get-fiemap -Lb $FILE
scoutfs punch-offline $FILE -o 4096 -l 4096 -V 0
scoutfs punch-offline $FILE -o 4096 -l 4096 -V 2
scoutfs punch-offline $FILE -o 4096 -l 4096 -V 9999
scoutfs get-fiemap -Lb $FILE
echo "== Punch hole crossing multiple extents =="
rm -rf $FILE && touch $FILE
fallocate -l $((7 * 4096)) $FILE
vers=$(scoutfs stat -s data_version "$FILE")
scoutfs release $FILE --data-version $vers
scoutfs get-fiemap -L $FILE
scoutfs punch-offline $FILE -o $((1 * 4096)) -l 4096 -V $vers
scoutfs punch-offline $FILE -o $((3 * 4096)) -l 4096 -V $vers
scoutfs punch-offline $FILE -o $((5 * 4096)) -l 4096 -V $vers
# layout now 0.1.2.3: four extents, dots are punched holes
scoutfs get-fiemap -L $FILE
scoutfs punch-offline $FILE -o $((2 * 4096)) -l $((3 * 4096)) -V $vers
# layout now 0.....1: two extents remain around one large hole
scoutfs get-fiemap -L $FILE
echo "== punch hole starting at a hole =="
rm -rf $FILE && touch $FILE
fallocate -l $((7 * 4096)) $FILE
vers=$(scoutfs stat -s data_version "$FILE")
scoutfs release $FILE --data-version $vers
scoutfs get-fiemap -L $FILE
scoutfs punch-offline $FILE -o $((1 * 4096)) -l 4096 -V $vers
scoutfs punch-offline $FILE -o $((3 * 4096)) -l 4096 -V $vers
scoutfs punch-offline $FILE -o $((5 * 4096)) -l 4096 -V $vers
# 0.1.2.3
scoutfs get-fiemap -L $FILE
scoutfs punch-offline $FILE -o $((1 * 4096)) -l $((5 * 4096)) -V $vers
# 0.....1
scoutfs get-fiemap -L $FILE
echo "== large punch =="
rm -rf $FILE && touch $FILE
fallocate -l $((6 * 1024 * 1024 * 1024)) $FILE
vers=$(scoutfs stat -s data_version "$FILE")
scoutfs release $FILE --data-version $vers
scoutfs get-fiemap -L $FILE
scoutfs punch-offline $FILE -o $((134123 * 4096)) -l $((68343 * 4096)) -V $vers
scoutfs punch-offline $FILE -o $((467273 * 4096)) -l $((68343 * 4096)) -V $vers
scoutfs punch-offline $FILE -o $((734623 * 4096)) -l $((68343 * 4096)) -V $vers
scoutfs get-fiemap -L $FILE
echo "== overlapping punches with lots of extents =="
rm -rf $FILE && touch $FILE
fallocate -l $((4096 * 1024)) $FILE
vers=$(scoutfs stat -s data_version "$FILE")
scoutfs release $FILE --data-version $vers
scoutfs get-fiemap -Lb $FILE
# punch odd ones away
for h in $(seq 1 2 1023); do
scoutfs punch-offline $FILE -o $((h * 4096)) -l 4096 -V $vers
done
scoutfs get-fiemap -Lb $FILE | tail -n 1
# punch a large hole from 32 to 45, removing 7 extents
scoutfs punch-offline $FILE -o $((32 * 4096)) -l $((13 * 4096)) -V $vers
scoutfs get-fiemap -Lb $FILE | tail -n 1
# punch every 8th @6
for h in $(seq 6 8 1024); do
scoutfs punch-offline $FILE -o $((h * 4096)) -l 4096 -V $vers
done
# again @4
scoutfs get-fiemap -Lb $FILE | tail -n 1
for h in $(seq 4 8 1024); do
scoutfs punch-offline $FILE -o $((h * 4096)) -l 4096 -V $vers
done
scoutfs get-fiemap -Lb $FILE | tail -n 1
# punching a large hole from 127 to 175, removing 12 extents
scoutfs punch-offline $FILE -o $((127 * 4096)) -l $((48 * 4096)) -V $vers
scoutfs get-fiemap -Lb $FILE
# again @2
for h in $(seq 2 8 1024); do
scoutfs punch-offline $FILE -o $((h * 4096)) -l 4096 -V $vers
done
scoutfs get-fiemap -L $FILE
# and again @0, punching away every remaining extent
for h in $(seq 0 8 1024); do
scoutfs punch-offline $FILE -o $((h * 4096)) -l 4096 -V $vers
done
scoutfs get-fiemap -Lb $FILE
t_pass

View File

@@ -62,7 +62,7 @@ test_timeout()
sleep 1
# tear down the current server/leader
t_force_umount $sv
t_force_umount $sv &
# see how long it takes for the next leader to start
start=$(time_ms)
@@ -73,6 +73,7 @@ test_timeout()
echo "to $to delay $delay" >> $T_TMP.delay
# restore the mount that we tore down
wait
t_mount $sv
# make sure the new leader delay was reasonable, allowing for some slack

View File

@@ -8,19 +8,19 @@ t_require_mounts 2
echo "=== renameat2 noreplace flag test"
# give each mount their own dir (lock group) to minimize create contention
mkdir $T_M0/dir0
mkdir $T_M1/dir1
mkdir $T_D0/dir0
mkdir $T_D1/dir1
echo "=== run two asynchronous calls to renameat2 NOREPLACE"
for i in $(seq 0 100); do
# prepare inputs in isolation
touch "$T_M0/dir0/old0"
touch "$T_M1/dir1/old1"
touch "$T_D0/dir0/old0"
touch "$T_D1/dir1/old1"
# race doing noreplace renames, both can't succeed
dumb_renameat2 -n "$T_M0/dir0/old0" "$T_M0/dir0/sharednew" 2> /dev/null &
dumb_renameat2 -n "$T_D0/dir0/old0" "$T_D0/dir0/sharednew" 2> /dev/null &
pid0=$!
dumb_renameat2 -n "$T_M1/dir1/old1" "$T_M1/dir0/sharednew" 2> /dev/null &
dumb_renameat2 -n "$T_D1/dir1/old1" "$T_D1/dir0/sharednew" 2> /dev/null &
pid1=$!
wait $pid0
@@ -31,7 +31,7 @@ for i in $(seq 0 100); do
test "$rc0" == 0 -a "$rc1" == 0 && t_fail "both renames succeeded"
# blow away possible files for either race outcome
rm -f "$T_M0/dir0/old0" "$T_M1/dir1/old1" "$T_M0/dir0/sharednew" "$T_M1/dir1/sharednew"
rm -f "$T_D0/dir0/old0" "$T_D1/dir1/old1" "$T_D0/dir0/sharednew" "$T_D1/dir1/sharednew"
done
t_pass

View File

@@ -19,8 +19,8 @@ df_free() {
}
same_totals() {
cur_meta_tot=$(statfs_total meta "$SCR")
cur_data_tot=$(statfs_total data "$SCR")
cur_meta_tot=$(statfs_total meta "$T_MSCR")
cur_data_tot=$(statfs_total data "$T_MSCR")
test "$cur_meta_tot" == "$exp_meta_tot" || \
t_fail "cur total_meta_blocks $cur_meta_tot != expected $exp_meta_tot"
@@ -34,10 +34,10 @@ same_totals() {
# some slop to account for reserved blocks and concurrent allocation.
#
devices_grew() {
cur_meta_tot=$(statfs_total meta "$SCR")
cur_data_tot=$(statfs_total data "$SCR")
cur_meta_df=$(df_free MetaData "$SCR")
cur_data_df=$(df_free Data "$SCR")
cur_meta_tot=$(statfs_total meta "$T_MSCR")
cur_data_tot=$(statfs_total data "$T_MSCR")
cur_meta_df=$(df_free MetaData "$T_MSCR")
cur_data_df=$(df_free Data "$T_MSCR")
local grow_meta_tot=$(echo "$exp_meta_tot * 2" | bc)
local grow_data_tot=$(echo "$exp_data_tot * 2" | bc)
@@ -70,19 +70,13 @@ size_data=$(blockdev --getsize64 "$T_EX_DATA_DEV")
quarter_meta=$(echo "$size_meta / 4" | bc)
quarter_data=$(echo "$size_data / 4" | bc)
# XXX this is all pretty manual, would be nice to have helpers
echo "== make initial small fs"
scoutfs mkfs -A -f -Q 0,127.0.0.1,$T_SCRATCH_PORT -m $quarter_meta -d $quarter_data \
"$T_EX_META_DEV" "$T_EX_DATA_DEV" > $T_TMP.mkfs.out 2>&1 || \
t_fail "mkfs failed"
SCR="$T_TMPDIR/mnt.scratch"
mkdir -p "$SCR"
mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 \
"$T_EX_DATA_DEV" "$SCR"
t_scratch_mkfs -A -m $quarter_meta -d $quarter_data
t_scratch_mount
# then calculate sizes based on blocks that mkfs used
quarter_meta=$(echo "$(statfs_total meta "$SCR") * 64 * 1024" | bc)
quarter_data=$(echo "$(statfs_total data "$SCR") * 4 * 1024" | bc)
quarter_meta=$(echo "$(statfs_total meta "$T_MSCR") * 64 * 1024" | bc)
quarter_data=$(echo "$(statfs_total data "$T_MSCR") * 4 * 1024" | bc)
whole_meta=$(echo "$quarter_meta * 4" | bc)
whole_data=$(echo "$quarter_data * 4" | bc)
outsize_meta=$(echo "$whole_meta * 2" | bc)
@@ -93,59 +87,58 @@ shrink_meta=$(echo "$quarter_meta / 2" | bc)
shrink_data=$(echo "$quarter_data / 2" | bc)
# and save expected values for checks
exp_meta_tot=$(statfs_total meta "$SCR")
exp_meta_df=$(df_free MetaData "$SCR")
exp_data_tot=$(statfs_total data "$SCR")
exp_data_df=$(df_free Data "$SCR")
exp_meta_tot=$(statfs_total meta "$T_MSCR")
exp_meta_df=$(df_free MetaData "$T_MSCR")
exp_data_tot=$(statfs_total data "$T_MSCR")
exp_data_df=$(df_free Data "$T_MSCR")
echo "== 0s do nothing"
scoutfs resize-devices -p "$SCR"
scoutfs resize-devices -p "$SCR" -m 0
scoutfs resize-devices -p "$SCR" -d 0
scoutfs resize-devices -p "$SCR" -m 0 -d 0
scoutfs resize-devices -p "$T_MSCR"
scoutfs resize-devices -p "$T_MSCR" -m 0
scoutfs resize-devices -p "$T_MSCR" -d 0
scoutfs resize-devices -p "$T_MSCR" -m 0 -d 0
echo "== shrinking fails"
scoutfs resize-devices -p "$SCR" -m $shrink_meta
scoutfs resize-devices -p "$SCR" -d $shrink_data
scoutfs resize-devices -p "$SCR" -m $shrink_meta -d $shrink_data
scoutfs resize-devices -p "$T_MSCR" -m $shrink_meta
scoutfs resize-devices -p "$T_MSCR" -d $shrink_data
scoutfs resize-devices -p "$T_MSCR" -m $shrink_meta -d $shrink_data
same_totals
echo "== existing sizes do nothing"
scoutfs resize-devices -p "$SCR" -m $quarter_meta
scoutfs resize-devices -p "$SCR" -d $quarter_data
scoutfs resize-devices -p "$SCR" -m $quarter_meta -d $quarter_data
scoutfs resize-devices -p "$T_MSCR" -m $quarter_meta
scoutfs resize-devices -p "$T_MSCR" -d $quarter_data
scoutfs resize-devices -p "$T_MSCR" -m $quarter_meta -d $quarter_data
same_totals
echo "== growing outside device fails"
scoutfs resize-devices -p "$SCR" -m $outsize_meta
scoutfs resize-devices -p "$SCR" -d $outsize_data
scoutfs resize-devices -p "$SCR" -m $outsize_meta -d $outsize_data
scoutfs resize-devices -p "$T_MSCR" -m $outsize_meta
scoutfs resize-devices -p "$T_MSCR" -d $outsize_data
scoutfs resize-devices -p "$T_MSCR" -m $outsize_meta -d $outsize_data
same_totals
echo "== resizing meta works"
scoutfs resize-devices -p "$SCR" -m $half_meta
scoutfs resize-devices -p "$T_MSCR" -m $half_meta
devices_grew meta
echo "== resizing data works"
scoutfs resize-devices -p "$SCR" -d $half_data
scoutfs resize-devices -p "$T_MSCR" -d $half_data
devices_grew data
echo "== shrinking back fails"
scoutfs resize-devices -p "$SCR" -m $quarter_meta
scoutfs resize-devices -p "$SCR" -m $quarter_data
scoutfs resize-devices -p "$T_MSCR" -m $quarter_meta
scoutfs resize-devices -p "$T_MSCR" -m $quarter_data
same_totals
echo "== resizing again does nothing"
scoutfs resize-devices -p "$SCR" -m $half_meta
scoutfs resize-devices -p "$SCR" -m $half_data
scoutfs resize-devices -p "$T_MSCR" -m $half_meta
scoutfs resize-devices -p "$T_MSCR" -m $half_data
same_totals
echo "== resizing to full works"
scoutfs resize-devices -p "$SCR" -m $whole_meta -d $whole_data
scoutfs resize-devices -p "$T_MSCR" -m $whole_meta -d $whole_data
devices_grew meta data
echo "== cleanup extra fs"
umount "$SCR"
rmdir "$SCR"
t_scratch_umount
t_pass

View File

@@ -32,7 +32,7 @@ echo "== dirs shouldn't appear in data_seq queries"
mkdir "$DIR"
ino=$(stat -c "%i" "$DIR")
t_sync_seq_index
query_index data_seq | grep "$ino\>"
query_index data_seq | awk '($4 == "'$ino'")'
echo "== two created files are present and come after each other"
touch "$DIR/first"
@@ -92,13 +92,13 @@ test "$before" -lt "$after" || \
# didn't skip past deleted dirty items
#
echo "== make sure dirtying doesn't livelock walk"
dd if=/dev/urandom of="$DIR/dirtying" bs=4K count=1 >> $seqres.full 2>&1
dd if=/dev/urandom of="$DIR/dirtying" bs=4K count=1 >> "$T_TMPDIR/seqres.full" 2>&1
nr=1
while [ "$nr" -lt 100 ]; do
echo "dirty/walk attempt $nr" >> $seqres.full
echo "dirty/walk attempt $nr" >> "$T_TMPDIR/seqres.full"
sync
dd if=/dev/urandom of="$DIR/dirtying" bs=4K count=1 conv=notrunc \
>> $seqres.full 2>&1
>> "$T_TMPDIR/seqres.full" 2>&1
scoutfs walk-inodes data_seq 0 -1 $DIR/dirtying >& /dev/null
((nr++))
done

View File

@@ -12,12 +12,12 @@ create_file() {
if [ "$blocks" != 0 ]; then
dd if=/dev/urandom bs=4096 count=$blocks of="$file" \
>> $seqres.full 2>&1
>> "$T_TMPDIR/seqres.full" 2>&1
fi
if [ "$remainder" != 0 ]; then
dd if=/dev/urandom bs="$remainder" count=1 of="$file" \
conv=notrunc oflag=append >> $seqres.full 2>&1
conv=notrunc oflag=append >> "$T_TMPDIR/seqres.full" 2>&1
fi
}
@@ -78,7 +78,7 @@ create_file "$FILE" $((4096 * 1024))
cp "$FILE" "$T_TMP"
nr=1
while [ "$nr" -lt 10 ]; do
echo "attempt $nr" >> $seqres.full 2>&1
echo "attempt $nr" >> "$T_TMPDIR/$seqres.full" 2>&1
release_vers "$FILE" stat 0 4096K
sync
echo 3 > /proc/sys/vm/drop_caches

View File

@@ -0,0 +1,50 @@
#
# Test that merge_read_item() correctly updates the sequence number when
# combining delta items from multiple finalized log trees. Each mount
# sets a totl value in its own 3-bit lane (powers of 8) so that any
# double-counting overflows the lane and is caught by: or(v, exp) != exp.
#
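# As a worked example of the check (values taken from this test): the
# five lanes hold 1, 8, 64, 512, and 4096, so the expected total is
# 4681 (binary 001 001 001 001 001).  If the first mount's delta were
# applied twice, the low lane would hold 2 and the total would be 4682
# (binary 001 001 001 001 010); or(4682, 4681) == 4683 != 4681, so the
# awk filter below prints the bad value.
#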
t_require_commands setfattr scoutfs
t_require_mounts 5
echo "== setup"
for nr in $(t_fs_nrs); do
d=$(eval echo \$T_D$nr)
for i in $(seq 1 2500); do : > "$d/f$nr$i"; done
done
sync
t_force_log_merge
vals=(1 8 64 512 4096)
expected=4681
n=0
for nr in $(t_fs_nrs); do
d=$(eval echo \$T_D$nr)
v=${vals[$((n++))]}
for i in $(seq 1 2500); do
setfattr -n "scoutfs.totl.t.$i.0.0" -v $v "$d/f$nr$i"
done
done
t_trigger_arm_silent log_merge_force_partial $(t_server_nr)
bad="$T_TMPDIR/bad"
for nr in $(t_fs_nrs); do
( while true; do
echo 1 > "$(t_debugfs_path $nr)/drop_weak_item_cache"
scoutfs read-xattr-totals -p "$(eval echo \$T_M$nr)" | \
awk -F'[ =,]+' -v e=$expected 'or($2+0,e) != e'
done ) >> "$bad" &
done
echo "expected $expected"
t_force_log_merge
t_silent_kill $(jobs -p)
test -s "$bad" && echo "double-counted:" && cat "$bad"
echo "== cleanup"
for nr in $(t_fs_nrs); do
find "$(eval echo \$T_D$nr)" -name "f$nr*" -delete
done
t_pass

View File

@@ -62,31 +62,27 @@ test -x "$SCOUTFS_FENCED_RUN" || \
# files disappear.
#
# generate failure messages to stderr while still echoing 0 for the caller
careful_cat()
# silence error messages
quiet_cat()
{
local path="$@"
cat "$@" || echo 0
cat "$@" 2>/dev/null
}
while sleep $SCOUTFS_FENCED_DELAY; do
shopt -s nullglob
for fence in /sys/fs/scoutfs/*/fence/*; do
# catches unmatched regex when no dirs
if [ ! -d "$fence" ]; then
continue
fi
# skip requests that have been handled
if [ "$(careful_cat $fence/fenced)" == 1 -o \
"$(careful_cat $fence/error)" == 1 ]; then
continue
fi
srv=$(basename $(dirname $(dirname $fence)))
rid="$(cat $fence/rid)"
ip="$(cat $fence/ipv4_addr)"
reason="$(cat $fence/reason)"
fenced="$(quiet_cat $fence/fenced)"
error="$(quiet_cat $fence/error)"
rid="$(quiet_cat $fence/rid)"
ip="$(quiet_cat $fence/ipv4_addr)"
reason="$(quiet_cat $fence/reason)"
# request dirs can linger then disappear after fenced/error is set
if [ ! -d "$fence" -o "$fenced" == "1" -o "$error" == "1" ]; then
continue
fi
log_message "server $srv fencing rid $rid at IP $ip for $reason"

View File

@@ -55,6 +55,30 @@ with initial sparse regions (perhaps by multiple threads writing to
different regions) and wasted space isn't an issue (perhaps because the
file population contains few small files).
.TP
.B ino_alloc_per_lock=<number>
This option determines how many inode numbers are allocated under the
same cluster lock. The default, and maximum, is 1024. The minimum is 1.
Allocating fewer inodes per lock can allow more parallelism between
mounts because there are more locks that cover the same number of
created files. This can be helpful when working with smaller numbers of
large files.
.TP
.B lock_idle_count=<number>
This option sets the number of locks that the client will allow to
remain idle after being granted. If the number of locks exceeds this
count then the client will try to free the oldest locks. This setting
is per-mount and only changes the behavior of that mount.
.sp
Idle locks are not reclaimed by memory pressure, so this option bounds
how much memory is likely to be pinned by granted idle locks. Setting
it too low can increase operation latency because a repeatedly used
working set of locks must be re-acquired over the network rather than
found among granted idle locks.
.sp
The count is not strictly enforced. Operations are allowed to use locks
while over the limit to avoid deadlocks under heavy concurrent load;
exceeding the count only triggers attempts to free idle locks.
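.sp
For example (a sketch; the device paths and values are illustrative and
assume these options are passed at mount time like the others in this
section), a mount creating many large files in parallel might lower the
per-lock inode batch and raise the idle lock limit:
.sp
.nf
mount -t scoutfs -o metadev_path=/dev/meta_dev,ino_alloc_per_lock=64,lock_idle_count=65536 /dev/data_dev /mnt/scoutfs
.fi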
.TP
.B log_merge_wait_timeout_ms=<number>
This option sets the amount of time, in milliseconds, that log merge
creation can wait before timing out. This setting is per-mount, only

utils/src/punch_offline.c (new file, 127 lines)
View File

@@ -0,0 +1,127 @@
#include <sys/ioctl.h>
#include <fcntl.h>
#include <errno.h>
#include <string.h>
#include <argp.h>
#include "sparse.h"
#include "parse.h"
#include "util.h"
#include "ioctl.h"
#include "cmd.h"
struct po_args {
char *path;
u64 offset;
u64 length;
u64 data_version;
unsigned offset_set:1,
length_set:1,
data_version_set:1;
};
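/*
 * Punch a hole in an already-offline region of a file.  Judging from
 * the accompanying test (the kernel side isn't shown here), the ioctl
 * rejects offsets or lengths that aren't block aligned, ranges covering
 * online or unwritten extents, and a data_version that doesn't match
 * the file's current version.
 */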
static int do_punch_offline(struct po_args *args)
{
struct scoutfs_ioctl_punch_offline ioctl_args;
int ret;
int fd;
fd = get_path(args->path, O_RDWR);
if (fd < 0)
return fd;
ioctl_args.offset = args->offset;
ioctl_args.len = args->length;
ioctl_args.data_version = args->data_version;
ioctl_args.flags = 0;
ret = ioctl(fd, SCOUTFS_IOC_PUNCH_OFFLINE, &ioctl_args);
if (ret < 0) {
ret = -errno;
fprintf(stderr, "punch_offline ioctl failed: %s (%d)\n",
strerror(errno), errno);
}
close(fd);
return ret;
}
static int parse_opt(int key, char *arg, struct argp_state *state)
{
struct po_args *args = state->input;
int ret = 0;
switch (key) {
case 'V':
ret = parse_u64(arg, &args->data_version);
if (ret)
return ret;
args->data_version_set = 1;
break;
case 'o': /* offset */
ret = parse_human(arg, &args->offset);
if (ret)
return ret;
args->offset_set = 1;
break;
case 'l': /* length */
ret = parse_human(arg, &args->length);
if (ret)
return ret;
args->length_set = 1;
break;
case ARGP_KEY_ARG:
if (!args->path)
args->path = strdup_or_error(state, arg);
else
argp_error(state, "unknown extra argument given");
break;
case ARGP_KEY_FINI:
if (!args->path)
argp_error(state, "must provide path to file");
if (!args->offset_set)
argp_error(state, "must provide offset");
if (!args->length_set)
argp_error(state, "must provide length");
if (!args->data_version_set)
argp_error(state, "must provide data_version");
break;
default:
break;
}
return 0;
}
static struct argp_option options[] = {
{ "data-version", 'V', "VERSION", 0, "Data version of the file [Required]"},
{ "offset", 'o', "OFFSET", 0, "Offset (bytes or KMGTP units) in file to stage [Required]"},
{ "length", 'l', "LENGTH", 0, "Length of range (bytes or KMGTP units) of file to stage. [Required]"},
{ NULL }
};
static struct argp argp = {
options,
parse_opt,
"PATH",
"Make a (sparse) hole in the file at offset and with length"
};
static int punch_offline_cmd(int argc, char **argv)
{
struct po_args po_args = {NULL};
int ret;
ret = argp_parse(&argp, argc, argv, 0, NULL, &po_args);
if (ret)
return ret;
return do_punch_offline(&po_args);
}
static void __attribute__((constructor)) punch_offline_ctor(void)
{
cmd_register_argp("punch-offline", &argp, GROUP_AGENT, punch_offline_cmd);
}