Remove unused orig keys from forest read items

These orig copies of the start and end keys serve no purpose. I think they were an editing mistake left over from a version where retries could happen within the read_items call. Signed-off-by: Zach Brown <zab@versity.com>
Make forest_read_items bloom key optional
2026-04-30 01:46:54 +00:00 · 2026-04-27 10:15:19 -07:00 · 2026-04-27 10:15:19 -07:00 · 2026-04-27 10:15:19 -07:00 · 2026-04-27 10:15:19 -07:00 · 2026-04-27 10:15:19 -07:00
32 changed files with 2293 additions and 146 deletions
--- a/ReleaseNotes.md
+++ b/ReleaseNotes.md
@@ -1,6 +1,23 @@
 Versity ScoutFS Release Notes
 =============================

+---
+v1.30
+\
+*Apr 21, 2026*
+
+Fix a problem reading the accumulated totals of contributing .totl.
+xattrs when log merging is in progress.  The problem would have readers
+of the totals calculate the sums incorrectly.
+
+Fix a problem updating quota rules.  There was a race where updates
+could be corrupted if they happened while a transaction was being
+written.
+
+Fix a problem deleting files with .indx. xattrs.  The internal indexing
+metadata wouldn't be properly deleted so the files would still claim to
+be present and visible in the index, though the file no longer existed.
+
 ---
 v1.29
 \
--- a/kmod/src/Makefile
+++ b/kmod/src/Makefile
@@ -13,6 +13,7 @@ scoutfs-y +=			\
 	avl.o			\
 	alloc.o			\
 	block.o			\
+	bsearch_index.o		\
 	btree.o			\
 	client.o		\
 	counters.o		\
@@ -36,6 +37,7 @@ scoutfs-y +=			\
 	per_task.o		\
 	quorum.o		\
 	quota.o			\
+	raw.o			\
 	recov.o			\
 	scoutfs_trace.o		\
 	server.o		\
--- a/kmod/src/block.h
+++ b/kmod/src/block.h
@@ -1,6 +1,8 @@
 #ifndef _SCOUTFS_BLOCK_H_
 #define _SCOUTFS_BLOCK_H_

+struct scoutfs_alloc;
+
 struct scoutfs_block_writer {
 	spinlock_t lock;
 	struct list_head dirty_list;
--- a/kmod/src/bsearch_index.c
+++ b/kmod/src/bsearch_index.c
@@ -0,0 +1,59 @@
+/*
+ * Copyright (C) 2026 Versity Software, Inc.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#include <linux/kernel.h>
+#include <linux/bsearch.h>
+
+#include "bsearch_index.h"
+
+struct bsearch_index_key {
+	int (*cmp)(const void *key, const void *elt);
+	/* the key has to be const, so we have to update the index through a pointer */
+	void **index_elt;
+	const void *key;
+	size_t size;
+};
+
+static int cmp_index(const void *key, const void *elt)
+{
+	const struct bsearch_index_key *bik = key;
+	int cmp = bik->cmp(bik->key, elt);
+
+	if (cmp > 0)
+		*(bik->index_elt) = (void *)elt + bik->size;
+	else
+		*(bik->index_elt) = (void *)elt;
+
+	return cmp;
+}
+
+/*
+ * A bsearch() wrapper that returns the index of the element of the
+ * array that the key would be stored in to maintain sort order.  It's
+ * the first element where the existing element is greater than the key.
+ * It returns the size of the array if the key is greater than the last
+ * element in the array.
+ */
+size_t bsearch_index(const void *key, const void *base, size_t num, size_t size,
+		     int (*cmp)(const void *key, const void *elt))
+{
+	void *index_elt = (void *)base;
+	struct bsearch_index_key bik = {
+		.cmp = cmp,
+		.index_elt = &index_elt,
+		.key = key,
+		.size = size,
+	};
+
+	bsearch(&bik, base, num, size, cmp_index);
+	return ((unsigned long)index_elt - (unsigned long)base) / size;
+}
--- a/kmod/src/bsearch_index.h
+++ b/kmod/src/bsearch_index.h
@@ -0,0 +1,7 @@
+#ifndef _SCOUTFS_BSEARCH_INDEX_H_
+#define _SCOUTFS_BSEARCH_INDEX_H_
+
+size_t bsearch_index(const void *key, const void *base, size_t num, size_t size,
+		     int (*cmp)(const void *key, const void *elt));
+
+#endif
--- a/kmod/src/btree.c
+++ b/kmod/src/btree.c
@@ -1816,6 +1816,11 @@ int scoutfs_btree_dirty(struct super_block *sb,
 * Call the users callback on all the items in the leaf that we find.
 * We also set the caller's keys for the first and last possible keys
 * that could exist in the leaf block.
+ *
+ * The callback can set a new key to continue reading from rather than
+ * iterating over all the items.  It modifies the key and returns
+ * -ESRCH, which performs a new avl search.  If the modified key falls
+ * outside of the range of keys in the block then we return.
 */
 int scoutfs_btree_read_items(struct super_block *sb,
 			     struct scoutfs_btree_root *root,
@@ -1829,6 +1834,7 @@ int scoutfs_btree_read_items(struct super_block *sb,
 	struct scoutfs_avl_node *next_node;
 	struct scoutfs_avl_node *node;
 	struct btree_walk_key_range kr;
+	struct scoutfs_key cb_key;
 	struct scoutfs_block *bl;
 	int ret;

@@ -1842,22 +1848,32 @@ int scoutfs_btree_read_items(struct super_block *sb,
 	if (scoutfs_key_compare(&kr.end, end) < 0)
 		*end = kr.end;

-	node = scoutfs_avl_search(&bt->item_root, cmp_key_item, start, NULL,
+	cb_key = *start;
+search:
+	node = scoutfs_avl_search(&bt->item_root, cmp_key_item, &cb_key, NULL,
 				  NULL, &next_node, NULL) ?: next_node;
 	while (node) {
 		item = node_item(node);
 		if (scoutfs_key_compare(&item->key, end) > 0)
 			break;

-		ret = cb(sb, item_key(item), le64_to_cpu(item->seq), item->flags,
+		cb_key = *item_key(item);
+		ret = cb(sb, &cb_key, le64_to_cpu(item->seq), item->flags,
 			 item_val(bt, item), item_val_len(item), arg);
-		if (ret < 0)
-			break;
+		if (ret < 0) {
+			if (ret == -ESRCH) {
+				if (scoutfs_key_compare(&cb_key, start) >= 0)
+					goto search;
+				ret = 0;
+			}
+			goto out;
+		}

 		node = scoutfs_avl_next(&bt->item_root, node);
 	}

 	scoutfs_block_put(sb, bl);
+	ret = 0;
 out:
 	return ret;
 }
@@ -2183,6 +2199,8 @@ static int merge_read_item(struct super_block *sb, struct scoutfs_key *key, u64
 		if (ret > 0) {
 			if (ret == SCOUTFS_DELTA_COMBINED) {
 				scoutfs_inc_counter(sb, btree_merge_delta_combined);
+				if (seq > found->seq)
+					found->seq = seq;
 			} else if (ret == SCOUTFS_DELTA_COMBINED_NULL) {
 				scoutfs_inc_counter(sb, btree_merge_delta_null);
 				free_mitem(rng, found);
@@ -2486,6 +2504,14 @@ int scoutfs_btree_merge(struct super_block *sb,
 			mitem = next_mitem(mitem);
 			free_mitem(&rng, tmp);
 		}
+
+		if (mitem && walk_val_len == 0 &&
+		    !(walk_flags & (BTW_INSERT | BTW_DELETE)) &&
+		    scoutfs_trigger(sb, LOG_MERGE_FORCE_PARTIAL)) {
+			ret = -ERANGE;
+			*next_ret = mitem->key;
+			goto out;
+		}
 	}

 	ret = 0;
--- a/kmod/src/forest.c
+++ b/kmod/src/forest.c
@@ -114,6 +114,42 @@ static struct scoutfs_block *read_bloom_ref(struct super_block *sb, struct scout
 	return bl;
 }

+/*
+ * Returns >0 if there was a bloom block and all the bits were present.
+ */
+static int all_bloom_bits_present(struct super_block *sb, struct scoutfs_block_ref *ref,
+				  struct forest_bloom_nrs *bloom)
+{
+	struct scoutfs_bloom_block *bb;
+	struct scoutfs_block *bl;
+	int i;
+
+	if (ref->blkno == 0)
+		return 0;
+
+	bl = read_bloom_ref(sb, ref);
+	if (IS_ERR(bl))
+		return PTR_ERR(bl);
+
+	bb = bl->data;
+
+	for (i = 0; i < ARRAY_SIZE(bloom->nrs); i++) {
+		if (!test_bit_le(bloom->nrs[i], bb->bits))
+			break;
+	}
+
+	scoutfs_block_put(sb, bl);
+
+	/* one of the bloom bits wasn't set */
+	if (i != ARRAY_SIZE(bloom->nrs)) {
+		scoutfs_inc_counter(sb, forest_bloom_fail);
+		return 0;
+	}
+
+	scoutfs_inc_counter(sb, forest_bloom_pass);
+	return 1;
+}
+
 /*
 * This is an unlocked iteration across all the btrees to find a hint at
 * the next key that the caller could read.  It's used to find out what
@@ -227,9 +263,13 @@ static int forest_read_items(struct super_block *sb, struct scoutfs_key *key, u6
 }

 /*
- * For each forest btree whose bloom block indicates that the lock might
- * have items stored, call the caller's callback for every item in the
- * leaf block in each tree which contains the key.
+ * Call the caller's callback for every item in the leaf blocks in each
+ * forest btree that contain the caller's key.
+ *
+ * If a bloom key is provided then each log tree's bloom block is
+ * checked and only trees with all the bloom key's bloom bits set will
+ * be read from.  When the bloom key is null all trees will be read
+ * from.
 *
 * The btree iter calls clamp the caller's range to the tightest range
 * that covers all the blocks.  Any keys outside of this range can't be
@@ -239,33 +279,26 @@ static int forest_read_items(struct super_block *sb, struct scoutfs_key *key, u6
 * to reset their state and retry with a newer version of the btrees.
 */
 int scoutfs_forest_read_items_roots(struct super_block *sb, struct scoutfs_net_roots *roots,
-				    struct scoutfs_key *key, struct scoutfs_key *bloom_key,
-				    struct scoutfs_key *start, struct scoutfs_key *end,
-				    scoutfs_forest_item_cb cb, void *arg)
+				    u64 merge_input_seq, struct scoutfs_key *key,
+				    struct scoutfs_key *bloom_key, struct scoutfs_key *start,
+				    struct scoutfs_key *end, scoutfs_forest_item_cb cb, void *arg)
 {
 	struct forest_read_items_data rid = {
 		.cb = cb,
 		.cb_arg = arg,
 	};
 	struct scoutfs_log_trees lt;
-	struct scoutfs_bloom_block *bb;
 	struct forest_bloom_nrs bloom;
 	SCOUTFS_BTREE_ITEM_REF(iref);
-	struct scoutfs_block *bl;
 	struct scoutfs_key ltk;
-	struct scoutfs_key orig_start = *start;
-	struct scoutfs_key orig_end = *end;
 	int ret;
-	int i;

 	scoutfs_inc_counter(sb, forest_read_items);
-	calc_bloom_nrs(&bloom, bloom_key);
+	if (bloom_key)
+		calc_bloom_nrs(&bloom, bloom_key);

 	trace_scoutfs_forest_using_roots(sb, &roots->fs_root, &roots->logs_root);

-	*start = orig_start;
-	*end = orig_end;
-
 	/* start with fs root items */
 	rid.fic |= FIC_FS_ROOT;
 	ret = scoutfs_btree_read_items(sb, &roots->fs_root, key, start, end,
@@ -292,40 +325,29 @@ int scoutfs_forest_read_items_roots(struct super_block *sb, struct scoutfs_net_r
 			goto out; /* including stale */
 		}

-		if (lt.bloom_ref.blkno == 0)
+		/* we're not expecting -ENOENT from _read_items */
+		if (lt.item_root.ref.blkno == 0)
 			continue;

-		bl = read_bloom_ref(sb, &lt.bloom_ref);
-		if (IS_ERR(bl)) {
-			ret = PTR_ERR(bl);
-			goto out;
-		}
-		bb = bl->data;
-
-		for (i = 0; i < ARRAY_SIZE(bloom.nrs); i++) {
-			if (!test_bit_le(bloom.nrs[i], bb->bits))
-				break;
+		if (bloom_key) {
+			ret = all_bloom_bits_present(sb, &lt.bloom_ref, &bloom);
+			if (ret < 0)
+				goto out;
+			if (ret == 0)
+				continue;
 		}

-		scoutfs_block_put(sb, bl);
-
-		/* one of the bloom bits wasn't set */
-		if (i != ARRAY_SIZE(bloom.nrs)) {
-			scoutfs_inc_counter(sb, forest_bloom_fail);
-			continue;
-		}
-
-		scoutfs_inc_counter(sb, forest_bloom_pass);
-
-		if ((le64_to_cpu(lt.flags) & SCOUTFS_LOG_TREES_FINALIZED))
-			rid.fic |= FIC_FINALIZED;
+		if ((le64_to_cpu(lt.flags) & SCOUTFS_LOG_TREES_FINALIZED) &&
+		    (merge_input_seq == 0 ||
+		     le64_to_cpu(lt.finalize_seq) < merge_input_seq))
+			rid.fic |= FIC_MERGE_INPUT;

 		ret = scoutfs_btree_read_items(sb, &lt.item_root, key, start,
 					       end, forest_read_items, &rid);
 		if (ret < 0)
 			goto out;

-		rid.fic &= ~FIC_FINALIZED;
+		rid.fic &= ~FIC_MERGE_INPUT;
 	}

 	ret = 0;
@@ -345,7 +367,7 @@ int scoutfs_forest_read_items(struct super_block *sb,

 	ret = scoutfs_client_get_roots(sb, &roots);
 	if (ret == 0)
-		ret = scoutfs_forest_read_items_roots(sb, &roots, key, bloom_key, start, end,
+		ret = scoutfs_forest_read_items_roots(sb, &roots, 0, key, bloom_key, start, end,
 						      cb, arg);
 	return ret;
 }
--- a/kmod/src/forest.h
+++ b/kmod/src/forest.h
@@ -11,7 +11,7 @@ struct scoutfs_lock;
 /* caller gives an item to the callback */
 enum {
 	FIC_FS_ROOT = (1 << 0),
-	FIC_FINALIZED = (1 << 1),
+	FIC_MERGE_INPUT = (1 << 1),
 };
 typedef int (*scoutfs_forest_item_cb)(struct super_block *sb, struct scoutfs_key *key, u64 seq,
 				      u8 flags, void *val, int val_len, int fic, void *arg);
@@ -25,9 +25,9 @@ int scoutfs_forest_read_items(struct super_block *sb,
 			      struct scoutfs_key *end,
 			      scoutfs_forest_item_cb cb, void *arg);
 int scoutfs_forest_read_items_roots(struct super_block *sb, struct scoutfs_net_roots *roots,
-				    struct scoutfs_key *key, struct scoutfs_key *bloom_key,
-				    struct scoutfs_key *start, struct scoutfs_key *end,
-				    scoutfs_forest_item_cb cb, void *arg);
+				    u64 merge_input_seq, struct scoutfs_key *key,
+				    struct scoutfs_key *bloom_key, struct scoutfs_key *start,
+				    struct scoutfs_key *end, scoutfs_forest_item_cb cb, void *arg);
 int scoutfs_forest_set_bloom_bits(struct super_block *sb,
 				  struct scoutfs_lock *lock);
 void scoutfs_forest_set_max_seq(struct super_block *sb, u64 max_seq);
--- a/kmod/src/ioctl.c
+++ b/kmod/src/ioctl.c
@@ -49,6 +49,7 @@
 #include "quota.h"
 #include "scoutfs_trace.h"
 #include "util.h"
+#include "raw.h"

 /*
 * We make inode index items coherent by locking fixed size regions of
@@ -1739,6 +1740,69 @@ out:
 	return ret;
 }

+static long scoutfs_ioc_raw_read_meta_seq(struct file *file, unsigned long arg)
+{
+	struct super_block *sb = file_inode(file)->i_sb;
+	struct scoutfs_ioctl_raw_read_meta_seq __user *urms = (void __user *)arg;
+	struct scoutfs_ioctl_raw_read_meta_seq rms;
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN)) {
+		ret = -EPERM;
+		goto out;
+	}
+
+	if (copy_from_user(&rms, urms, sizeof(rms))) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	if (rms.results_size == 0) {
+		ret = 0;
+		goto out;
+	}
+
+	if (rms.results_size < sizeof(struct scoutfs_ioctl_meta_seq) ||
+	    rms.results_size > INT_MAX) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = scoutfs_raw_read_meta_seq(sb, &rms, &rms.last);
+	if (ret >= 0 && copy_to_user(&urms->last, &rms.last, sizeof(rms.last)))
+		ret = -EFAULT;
+out:
+	return ret;
+}
+
+static long scoutfs_ioc_raw_read_inode_info(struct file *file, unsigned long arg)
+{
+	struct super_block *sb = file_inode(file)->i_sb;
+	struct scoutfs_ioctl_raw_read_inode_info __user *urii = (void __user *)arg;
+	struct scoutfs_ioctl_raw_read_inode_info rii;
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN)) {
+		ret = -EPERM;
+		goto out;
+	}
+
+	if (copy_from_user(&rii, urii, sizeof(rii))) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	if (rii.inos_count == 0 || rii.results_size > INT_MAX ||
+	    !IS_ALIGNED(rii.inos_ptr, __alignof__(__u64))) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = scoutfs_raw_read_inode_info(sb, &rii);
+out:
+	return ret;
+}
+
 long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	switch (cmd) {
@@ -1790,6 +1854,10 @@ long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 		return scoutfs_ioc_read_xattr_index(file, arg);
 	case SCOUTFS_IOC_PUNCH_OFFLINE:
 		return scoutfs_ioc_punch_offline(file, arg);
+	case SCOUTFS_IOC_RAW_READ_META_SEQ:
+		return scoutfs_ioc_raw_read_meta_seq(file, arg);
+	case SCOUTFS_IOC_RAW_READ_INODE_INFO:
+		return scoutfs_ioc_raw_read_inode_info(file, arg);
 	}

 	return -ENOTTY;
--- a/kmod/src/ioctl.h
+++ b/kmod/src/ioctl.h
@@ -15,20 +15,6 @@

 #define SCOUTFS_IOCTL_MAGIC 0xE8  /* arbitrarily chosen hole in ioctl-number.rst */

-/*
- * Packed scoutfs keys rarely cross the ioctl boundary so we have a
- * translation struct.
- */
-struct scoutfs_ioctl_key {
-	__le64	_sk_first;
-	__le64	_sk_second;
-	__le64	_sk_third;
-	__u8	_sk_fourth;
-	__u8	sk_type;
-	__u8	sk_zone;
-	__u8	_pad[5];
-};
-
 struct scoutfs_ioctl_walk_inodes_entry {
 	__u64 major;
 	__u64 ino;
@@ -876,4 +862,169 @@ struct scoutfs_ioctl_punch_offline {
 #define SCOUTFS_IOC_PUNCH_OFFLINE \
 	_IOW(SCOUTFS_IOCTL_MAGIC, 24, struct scoutfs_ioctl_punch_offline)

+/*
+ * Read meta_seq items without cluster locking.
+ *
+ * @start is the first meta_seq item value that could be returned.
+ * {0,0} is the minimum.
+ *
+ * @end is the last meta_seq item value that could be returned.
+ * {U64_MAX, U64_MAX} is the maximum.
+ *
+ * @last is only set on success from the call.  It's the last meta_seq
+ * item that could have been returned.  This lets the caller detect that
+ * the full input range wasn't explored.  Another call can be made with
+ * start set to just after this.
+ *
+ * @results_ptr is a pointer to an array of (struct
+ * scoutfs_ioctl_meta_seq) elements that were found in the input range.
+ *
+ * @results_size is the count of elements in the results_ptr array and
+ * the maximum number of results that can be returned.  There must be
+ * room for at least one result.
+ *
+ * Return existing meta_seq items starting from @start until @last.
+ * Partial results can be returned and is indicated by @last being set
+ * to an item before @last.
+ *
+ * The results are sorted first by increasing meta_seq and then by
+ * increasing ino.  All of the results are from one version of file
+ * system metadata.  This means that an inode can not be found multiple
+ * times within the results of one call.
+ *
+ * This call ignores currently dirty transactions and reads persistent
+ * items directly.  A transaction can be written after this call and
+ * cause meta_seq items to appear before or within the results from this
+ * call.
+ *
+ * The number of meta_seq items stored in the results buffer is returned
+ * and @last is updated.  0 items can be returned if none are found
+ * within the input range.
+ *
+ * Unique errors:
+ *
+ *  -EINVAL: The result count was 0 or greater than INT_MAX.
+ *
+ *  -ESTALE: The results could not be read from one stable version of
+ *    file system metadata.  Decrease the number of inodes requested.
+ */
+struct scoutfs_ioctl_meta_seq {
+	__u64 meta_seq;
+	__u64 ino;
+};
+struct scoutfs_ioctl_raw_read_meta_seq {
+	struct scoutfs_ioctl_meta_seq start;
+	struct scoutfs_ioctl_meta_seq end;
+	struct scoutfs_ioctl_meta_seq last;
+	__u64 results_ptr;
+	__u32 results_size;
+	__u32 _pad;
+};
+#define SCOUTFS_IOC_RAW_READ_META_SEQ \
+	_IOR(SCOUTFS_IOCTL_MAGIC, 25, struct scoutfs_ioctl_raw_read_meta_seq)
+
+
+/*
+ * Read inode metadata without cluster locking.
+ *
+ * @inos_ptr is a pointer to an aligned array of 64bit inode numbers.
+ *
+ * @inos_count is the number of elements in the array.  The inode
+ * numbers must not be zero, must strictly increase, and must not
+ * contain any duplicates.
+ *
+ * @names_ptr is a pointer to a byte array of xattr names to return with
+ * each inode.  The names are identical to those used in
+ * {get,set}xattr(2).  The names must be null terminated and no two
+ * names may be equal.
+ *
+ * @names_count is the number of names that will be found in the
+ * names_ptr buffer.
+ *
+ * @results_ptr is a pointer to a buffer that will be filled by the read
+ * inode info results.  The result structs and payloads are not aligned.
+ * Callers will almost certainly need to copy them into aligned
+ * addresses before referencing their contents.
+ *
+ * @results_size is the number of bytes available in the results_ptr
+ * buffer.
+ *
+ * For each inode an _INODE result will always be returned.  Then a
+ * _XATTR result will be returned for each xattr on the inode that
+ * matches one of the given input names.
+ *
+ * Each call will not return partial results. -ERANGE is returned if the
+ * results for the requested inodes do not fit in the results buffer.
+ *
+ * The info for one call is from one consistent version of the file
+ * system metadata.  The call can have to retry if it sees metadata
+ * change during its call.  -ESTALE will be returned if it was not able
+ * to read all the inodes info from one metadata version.  The number of
+ * inodes being read can be decreased to avoid this.
+ *
+ * Inodes with an nlink of 0 are not returned.
+ *
+ * The size in bytes of filled results is returned.  A non-zero return
+ * will always include at least one full
+ * (struct scoutfs_ioctl_raw_read_result) header.
+ *
+ * Unique errors:
+ *
+ *  -EINVAL: The inode count can't be zero. The inos ptr must be aligned
+ *    to __u64 alignment.  The results buffer size can't be larger than
+ *    INT_MAX.  Inode numbers can't be zero, must be sorted, and can't
+ *    have duplicates.  The xattr names must be unique, null terminated,
+ *    and less than 256 bytes long.
+ *
+ *  -ERANGE: The results for the requested inodes do not fit in the
+ *    results buffer.  Increase the buffer size (perhaps allowing for all
+ *    xattrs with large values) or decrease the number of inodes per call.
+ *
+ *  -ESTALE: The results could not be read from one stable version of
+ *    file system metadata.  Decrease the number of inodes requested.
+ *
+ *  -EUCLEAN: Internal xattr metadata is inconsistent.
+ */
+
+struct scoutfs_ioctl_raw_read_inode_info {
+	__u64 inos_ptr;
+	__u32 inos_count;
+	__u32 names_count;
+	__u64 names_ptr;
+	__u64 results_ptr;
+	__u32 results_size;
+	__u8  _pad[4];
+};
+
+/*
+ * @type is one of the enums that determines the type of the following
+ * result payload.
+ *
+ * @size is the number of bytes of result payload immediately following
+ * the result struct.  It does not include the size of the result struct
+ * header.
+ */
+struct scoutfs_ioctl_raw_read_result {
+	__u32 size;
+	__u8  _pad[7];
+	__u8 type;
+};
+
+/*
+ * The _INODE result contains an initial 64bit inode number followed by a
+ * struct scoutfs_inode as defined in format.h.  The size includes the
+ * 8byte initial inode number.  With that subtracted the size of the
+ * inode struct defines its version (and so the fields it supports).
+ */
+#define SCOUTFS_IOC_RAW_READ_RESULT_INODE	1
+/*
+ * The result payload contains the null terminated name and the value.
+ * The value size can be found by subtracting the null terminated name
+ * length from the result size.
+ */
+#define SCOUTFS_IOC_RAW_READ_RESULT_XATTR	2
+
+#define SCOUTFS_IOC_RAW_READ_INODE_INFO \
+	_IOR(SCOUTFS_IOCTL_MAGIC, 25, struct scoutfs_ioctl_raw_read_inode_info)
+
 #endif
--- a/kmod/src/lock.c
+++ b/kmod/src/lock.c
@@ -1093,19 +1093,24 @@ out_unlock:
 	return ret;
 }

+void scoutfs_lock_get_fs_item_range(u64 ino, struct scoutfs_key *start, struct scoutfs_key *end)
+{
+	scoutfs_key_set_zeros(start);
+	start->sk_zone = SCOUTFS_FS_ZONE;
+	start->ski_ino = cpu_to_le64(ino & ~(u64)SCOUTFS_LOCK_INODE_GROUP_MASK);
+
+	scoutfs_key_set_ones(end);
+	end->sk_zone = SCOUTFS_FS_ZONE;
+	end->ski_ino = cpu_to_le64(ino | SCOUTFS_LOCK_INODE_GROUP_MASK);
+}
+
 int scoutfs_lock_ino(struct super_block *sb, enum scoutfs_lock_mode mode, int flags, u64 ino,
 		     struct scoutfs_lock **ret_lock)
 {
 	struct scoutfs_key start;
 	struct scoutfs_key end;

-	scoutfs_key_set_zeros(&start);
-	start.sk_zone = SCOUTFS_FS_ZONE;
-	start.ski_ino = cpu_to_le64(ino & ~(u64)SCOUTFS_LOCK_INODE_GROUP_MASK);
-
-	scoutfs_key_set_ones(&end);
-	end.sk_zone = SCOUTFS_FS_ZONE;
-	end.ski_ino = cpu_to_le64(ino | SCOUTFS_LOCK_INODE_GROUP_MASK);
+	scoutfs_lock_get_fs_item_range(ino, &start, &end);

 	return lock_key_range(sb, mode, flags, &start, &end, ret_lock);
 }
--- a/kmod/src/lock.h
+++ b/kmod/src/lock.h
@@ -65,6 +65,7 @@ int scoutfs_lock_invalidate_request(struct super_block *sb, u64 net_id,
 int scoutfs_lock_recover_request(struct super_block *sb, u64 net_id,
 				 struct scoutfs_key *key);

+void scoutfs_lock_get_fs_item_range(u64 ino, struct scoutfs_key *start, struct scoutfs_key *end);
 int scoutfs_lock_inode(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
 		       struct inode *inode, struct scoutfs_lock **ret_lock);
 int scoutfs_lock_ino(struct super_block *sb, enum scoutfs_lock_mode mode, int flags, u64 ino,
--- a/kmod/src/quota.c
+++ b/kmod/src/quota.c
@@ -34,6 +34,7 @@
 #include "totl.h"
 #include "util.h"
 #include "quota.h"
+#include "trans.h"
 #include "counters.h"
 #include "scoutfs_trace.h"

@@ -1086,6 +1087,10 @@ int scoutfs_quota_mod_rule(struct super_block *sb, bool is_add,
 	if (ret < 0)
 		goto out;

+	ret = scoutfs_hold_trans(sb, true);
+	if (ret < 0)
+		goto out;
+
 	down_write(&qtinf->rwsem);

 	if (is_add) {
@@ -1095,28 +1100,30 @@ int scoutfs_quota_mod_rule(struct super_block *sb, bool is_add,
 		else if (ret == 0)
 			ret = -EEXIST;
 		if (ret < 0)
-			goto unlock;
+			goto release;

 		rule_to_rule_val(&rv, &rule);
 		ret = scoutfs_item_create(sb, &key, &rv, sizeof(rv), lock);
 		if (ret < 0)
-			goto unlock;
+			goto release;

 	} else {
 		ret = find_rule(sb, &rule, &key, lock) ?:
 		      scoutfs_item_delete(sb, &key, lock);
 		if (ret < 0)
-			goto unlock;
+			goto release;
 	}

 	scoutfs_quota_invalidate(sb);
 	ret = 0;

-unlock:
+release:
 	up_write(&qtinf->rwsem);
-	scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
+	scoutfs_release_trans(sb);

 out:
+	scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
+
 	if (is_add)
 		trace_scoutfs_quota_add_rule(sb, &rule, ret);
 	else
--- a/kmod/src/raw.c
+++ b/kmod/src/raw.c
@@ -0,0 +1,744 @@
+/*
+ * Copyright (C) 2026 Versity Software, Inc.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/list_sort.h>
+#include <linux/sort.h>
+
+#include "format.h"
+#include "key.h"
+#include "block.h"
+#include "inode.h"
+#include "forest.h"
+#include "client.h"
+#include "ioctl.h"
+#include "lock.h"
+#include "xattr.h"
+#include "attr_x.h"
+#include "bsearch_index.h"
+#include "raw.h"
+
+struct fs_item {
+	struct list_head head;
+	struct scoutfs_key key;
+	u64 seq;
+	int val_len;
+	bool deletion;
+	/* val is aligned so we can deref structs in vals */
+	u8 val[0] __aligned(ARCH_KMALLOC_MINALIGN);
+};
+
+static int save_fs_item(struct list_head *list, struct scoutfs_key *key, u64 seq, u8 flags,
+			void *val, int val_len)
+{
+	struct fs_item *fsi;
+
+	/* max btree val len is hundreds of bytes */
+	fsi = kmalloc(offsetof(struct fs_item, val[val_len]), GFP_NOFS);
+	if (!fsi)
+		return -ENOMEM;
+
+	fsi->key = *key;
+	fsi->seq = seq;
+	fsi->val_len = val_len;
+	fsi->deletion = !!(flags & SCOUTFS_ITEM_FLAG_DELETION);
+	if (val_len > 0)
+		memcpy(fsi->val, val, val_len);
+	list_add_tail(&fsi->head, list);
+
+	return 0;
+}
+
+static void free_fs_item(struct fs_item *fsi)
+{
+	if (!list_empty(&fsi->head))
+		list_del_init(&fsi->head);
+	kfree(fsi);
+}
+
+static void free_fs_items(struct list_head *list)
+{
+	struct fs_item *fsi;
+	struct fs_item *tmp;
+
+	list_for_each_entry_safe(fsi, tmp, list, head)
+		free_fs_item(fsi);
+}
+
+static struct fs_item *next_fs_item(struct list_head *list, struct fs_item *fsi)
+{
+	list_for_each_entry_continue(fsi, list, head)
+		return fsi;
+	return NULL;
+}
+
+static int cmp_fs_items(void *priv, KC_LIST_CMP_CONST struct list_head *A,
+			KC_LIST_CMP_CONST struct list_head *B)
+{
+	KC_LIST_CMP_CONST struct fs_item *a =
+		container_of(A, KC_LIST_CMP_CONST struct fs_item, head);
+	KC_LIST_CMP_CONST struct fs_item *b =
+		container_of(B, KC_LIST_CMP_CONST struct fs_item, head);
+
+	return scoutfs_key_compare(&a->key, &b->key) ?: -scoutfs_cmp(a->seq, b->seq);
+}
+
+static void sort_and_remove(struct list_head *list, struct scoutfs_key *end)
+{
+	struct fs_item *prev;
+	struct fs_item *fsi;
+	struct fs_item *tmp;
+
+	list_sort(NULL, list, cmp_fs_items);
+
+	/* start by removing any items read before end was decreased by later blocks */
+	list_for_each_entry_safe_reverse(fsi, tmp, list, head) {
+		if (scoutfs_key_compare(&fsi->key, end) > 0)
+			free_fs_item(fsi);
+		else
+			break;
+	}
+
+	prev = NULL;
+	list_for_each_entry_safe(fsi, tmp, list, head) {
+		/* remove this item if it's an older version of previous item */
+		if (prev && scoutfs_key_compare(&prev->key, &fsi->key) == 0) {
+			free_fs_item(fsi);
+			continue;
+		}
+
+		/* remove previous deletion item once it has removed all older versions */
+		if (prev && prev->deletion)
+			free_fs_item(prev);
+
+		/* next item might match this, record to compare */
+		prev = fsi;
+	}
+
+	/* remove the last item if it's a deletion */
+	list_for_each_entry_reverse(fsi, list, head) {
+		if (fsi->deletion)
+			free_fs_item(fsi);
+		break;
+	}
+}
+
+static int save_all_items(struct super_block *sb, struct scoutfs_key *key, u64 seq, u8 flags,
+			  void *val, int val_len, int fic, void *arg)
+{
+	struct list_head *list = arg;
+
+	return save_fs_item(list, key, seq, flags, val, val_len);
+}
+
+/* -------------- */
+
+static void ms_from_key(struct scoutfs_ioctl_meta_seq *ms, struct scoutfs_key *key)
+{
+	ms->meta_seq = le64_to_cpu(key->skii_major);
+	ms->ino = le64_to_cpu(key->skii_ino);
+}
+
+/*
+ * Increment the key's ino->meta_seq so that we don't land between items.
+ */
+static void inc_meta_seq(struct scoutfs_key *key)
+{
+	le64_add_cpu(&key->skii_ino, 1);
+	if (key->skii_ino == 0)
+		le64_add_cpu(&key->skii_major, 1);
+}
+
+int scoutfs_raw_read_meta_seq(struct super_block *sb,
+			      struct scoutfs_ioctl_raw_read_meta_seq *rms,
+			      struct scoutfs_ioctl_meta_seq *last_ret)
+{
+	struct scoutfs_ioctl_meta_seq __user *ums;
+	struct scoutfs_ioctl_meta_seq ms;
+	struct scoutfs_net_roots roots;
+	DECLARE_SAVED_REFS(saved);
+	struct scoutfs_key start;
+	struct scoutfs_key last;
+	struct scoutfs_key key;
+	struct scoutfs_key end;
+	struct fs_item *fsi;
+	struct fs_item *tmp;
+	LIST_HEAD(list);
+	int retries;
+	int copied;
+	int count;
+	int ret;
+
+	ums = (void __user *)rms->results_ptr;
+	count = rms->results_size / sizeof(struct scoutfs_ioctl_meta_seq);
+	retries = 10;
+	copied = 0;
+
+	scoutfs_inode_init_index_key(&last, SCOUTFS_INODE_INDEX_META_SEQ_TYPE,
+				     rms->end.meta_seq, 0, rms->end.ino);
+
+retry:
+	ret = scoutfs_client_get_roots(sb, &roots);
+	if (ret)
+		goto out;
+
+	scoutfs_inode_init_index_key(&key, SCOUTFS_INODE_INDEX_META_SEQ_TYPE,
+				     rms->start.meta_seq, 0, rms->start.ino);
+
+	for (;;) {
+		start = key;
+		end = last;
+		ret = scoutfs_forest_read_items_roots(sb, &roots, 0, &key, NULL, &start, &end,
+						      save_all_items, &list);
+		if (ret < 0)
+			goto out;
+
+		sort_and_remove(&list, &end);
+
+		list_for_each_entry_safe(fsi, tmp, &list, head) {
+
+			if (copied == count) {
+				/* results are full, set end to before item can't return */
+				end = fsi->key;
+				le64_add_cpu(&end.skii_ino, -1ULL);
+				ret = 0;
+				goto out;
+			}
+
+			ms_from_key(&ms, &fsi->key);
+			if (copy_to_user(&ums[copied], &ms, sizeof(ms))) {
+				ret = -EFAULT;
+				goto out;
+			}
+
+			free_fs_item(fsi);
+			copied++;
+		}
+
+		if (scoutfs_key_compare(&end, &last) >= 0) {
+			end = last;
+			break;
+		}
+
+		key = end;
+		inc_meta_seq(&key);
+	}
+
+	ret = 0;
+out:
+	free_fs_items(&list);
+
+	ret = scoutfs_block_check_stale(sb, ret, &saved, &roots.fs_root.ref, &roots.logs_root.ref);
+	if (ret == -ESTALE && copied == 0 && retries-- > 0)
+		goto retry;
+
+	ms_from_key(last_ret, &end);
+
+	return ret ?: copied;
+}
+
+/* -------------- */
+
+struct inode_info_context {
+	size_t nr_inos;
+	u64 *inos;
+
+	size_t nr_names;
+	struct xattr_name {
+		u64 hash;
+		char *name;
+		u8 name_len; /* no null */
+	} *names;
+
+	struct list_head fs_items;
+};
+
+static int cmp_u64(const void *A, const void *B)
+{
+	const u64 *a = A;
+	const u64 *b = B;
+
+	return scoutfs_cmp(*a, *b);
+}
+
+static int cmp_name_hash(const void *A, const void *B)
+{
+	const struct xattr_name *a = A;
+	const struct xattr_name *b = B;
+
+	return scoutfs_cmp(a->hash, b->hash);
+}
+
+static int cmp_name_string(const void *A, const void *B)
+{
+	const struct xattr_name *a = A;
+	const struct xattr_name *b = B;
+
+	return scoutfs_cmp(a->name_len, b->name_len) ?: memcmp(a->name, b->name, a->name_len);
+}
+
+static int setup_context(struct inode_info_context *ctx,
+			 struct scoutfs_ioctl_raw_read_inode_info *rii)
+{
+	__u64 __user *uinos = (void __user *)rii->inos_ptr;
+	char __user *uname;
+	long len_null;
+	long len;
+	int ret;
+	u32 i;
+
+	ctx->nr_inos = rii->inos_count;
+	ctx->nr_names = rii->names_count;
+	INIT_LIST_HEAD(&ctx->fs_items);
+
+	ctx->inos = kvmalloc_array(ctx->nr_inos, sizeof(ctx->inos[0]), GFP_KERNEL);
+	ctx->names = kvcalloc(ctx->nr_names, sizeof(ctx->names[0]), GFP_KERNEL);
+	if (!ctx->inos || !ctx->names) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	if (copy_from_user(ctx->inos, uinos, ctx->nr_inos * sizeof(ctx->inos[0]))) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	/* inos must not be 0 and must increase and contain no duplicates */
+	if (ctx->inos[0] == 0) {
+		ret = -EINVAL;
+		goto out;
+	}
+	for (i = 1; i < ctx->nr_inos; i++) {
+		if (ctx->inos[i] <= ctx->inos[i - 1]) {
+			ret = -EINVAL;
+			goto out;
+		}
+	}
+
+	uname = (void __user *)rii->names_ptr;
+	for (i = 0; i < ctx->nr_names; i++) {
+		len_null = SCOUTFS_XATTR_MAX_NAME_LEN + 1;
+		ret = strnlen_user(uname, len_null);
+		if (ret <= 1 || ret > len_null) {
+			if (ret >= 0)
+				ret = -EINVAL;
+			goto out;
+		}
+		len_null = ret;
+		len = len_null - 1;
+
+		ctx->names[i].name_len = len;
+		ctx->names[i].name = kmalloc(len_null, GFP_KERNEL);
+		if (!ctx->names[i].name) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		ret = strncpy_from_user(ctx->names[i].name, uname, len_null);
+		if (ret != len) {
+			if (ret >= 0)
+				ret = -EINVAL;
+			goto out;
+		}
+
+		ctx->names[i].hash = scoutfs_xattr_name_hash(ctx->names[i].name, len);
+		uname += len_null;
+	}
+
+	/* make sure all the names differ */
+	sort(ctx->names, ctx->nr_names, sizeof(ctx->names[0]), cmp_name_string, NULL);
+	for (i = 1; i < ctx->nr_names; i++) {
+		if (cmp_name_string(&ctx->names[i - 1], &ctx->names[i]) == 0) {
+			ret = -EINVAL;
+			goto out;
+		}
+	}
+
+	/* then leave them sorted by hash */
+	sort(ctx->names, ctx->nr_names, sizeof(ctx->names[0]), cmp_name_hash, NULL);
+
+	ret = 0;
+out:
+	return ret;
+}
+
+static void free_context(struct inode_info_context *ctx)
+{
+	int i;
+
+	kvfree(ctx->inos);
+
+	if (ctx->names) {
+		for (i = 0; i < ctx->nr_names; i++) {
+			if (!ctx->names[i].name)
+				break;
+			kfree(ctx->names[i].name);
+		}
+		kvfree(ctx->names);
+	}
+}
+
+/*
+ * Iterate over fs items and save any that we're interested in.  We want
+ * inode struct items and any xattr items whose hashes collide with the
+ * xattr names we're searching for.
+ *
+ * Our forest calls can be advancing through the key space as we see
+ * slices that intersect with blocks in trees.  And each forest caller
+ * can be resetting the key position to the start of each forest block
+ * it reads in an intersection.
+ *
+ * From this callback's perspective, the key can be jumping all over the
+ * place.  We don't have any iterative position state.  For each key we
+ * decide if we want to save it and then set the key to the next key we
+ * want after the current key.  We'll combine all the saved keys later.
+ */
+static int save_info_items(struct super_block *sb, struct scoutfs_key *key, u64 seq,
+			   u8 flags, void *val, int val_len, int fic, void *arg)
+{
+	u64 ino = le64_to_cpu(key->_sk_first);
+	struct inode_info_context *ctx = arg;
+	struct xattr_name name;
+	size_t name_ind;
+	size_t ino_ind;
+	bool hash_match;
+	bool ino_match;
+	int ret;
+
+	ino_ind = bsearch_index(&ino, ctx->inos, ctx->nr_inos, sizeof(ctx->inos[0]), cmp_u64);
+	ino_match = ino_ind < ctx->nr_inos && ctx->inos[ino_ind] == ino;
+
+	/* jump to to next ino, could be for this key if we're before the ino struct */
+	if (!ino_match || key->sk_type < SCOUTFS_INODE_TYPE)
+		goto next_inode;
+
+	/* find our search position in xattrs */
+	if (key->sk_type < SCOUTFS_XATTR_TYPE) {
+		name_ind = 0;
+		hash_match = false;
+
+	} else if (key->sk_type == SCOUTFS_XATTR_TYPE) {
+		name = (struct xattr_name) { .hash = le64_to_cpu(key->skx_name_hash) };
+		name_ind = bsearch_index(&name, ctx->names, ctx->nr_names, sizeof(ctx->names[0]),
+					 cmp_name_hash);
+		hash_match = name_ind < ctx->nr_names && ctx->names[name_ind].hash == name.hash;
+	} else {
+		name_ind = ctx->nr_names;
+		hash_match = false;
+	}
+
+	/* save inode items for our search and all xattr items that match search hashes */
+	if (key->sk_type == SCOUTFS_INODE_TYPE || hash_match) {
+		ret = save_fs_item(&ctx->fs_items, key,  seq, flags, val, val_len);
+		if (ret < 0)
+			goto out;
+	}
+
+	/* let the caller continue iterating through matching xattr items */
+	if (hash_match) {
+		ret = 0;
+		goto out;
+	}
+
+	/* jump to the next xattr */
+	if (name_ind < ctx->nr_names) {
+		scoutfs_xattr_init_key(key, ino, ctx->names[name_ind].hash, 0);
+		ret = -ESRCH;
+		goto out;
+	}
+
+	/* no more xattrs, must be done with this ino */
+	ino_ind++;
+
+next_inode:
+	/* now jump to next inode struct key, or we're done */
+	if (ino_ind < ctx->nr_inos)
+		scoutfs_inode_init_key(key, ctx->inos[ino_ind]);
+	else
+		scoutfs_key_set_ones(key);
+
+	ret = -ESRCH;
+out:
+	return ret;
+}
+
+static int copy_to_user_off(void __user *dst, size_t *dst_off, size_t dst_size,
+			    void *src, size_t copy_size)
+{
+	if (copy_size == 0)
+		return 0;
+	if (*dst_off + copy_size > dst_size)
+		return -ERANGE;
+	if (copy_to_user(dst + *dst_off, src, copy_size))
+		return -EFAULT;
+
+	*dst_off += copy_size;
+	return 0;
+}
+
+static int copy_result_to_user(void __user *ures, size_t *off, size_t size, u8 type,
+			       void *a_data, size_t a_len, void *b_data, size_t b_len,
+			       size_t extra_size)
+{
+	struct scoutfs_ioctl_raw_read_result res;
+	const size_t szof_res = sizeof(struct scoutfs_ioctl_raw_read_result);
+
+	memzero_explicit(&res, szof_res);
+	res = (struct scoutfs_ioctl_raw_read_result) {
+		.size = a_len + b_len + extra_size,
+		.type = type,
+	};
+
+	return copy_to_user_off(ures, off, size, &res, szof_res) ?:
+	       (a_len ? copy_to_user_off(ures, off, size, a_data, a_len) : 0) ?:
+	       (b_len ? copy_to_user_off(ures, off, size, b_data, b_len) : 0);
+}
+
+static int copy_item_results_to_user(struct super_block *sb, struct inode_info_context *ctx,
+				     void __user *ures, size_t *off, size_t size,
+				     struct fs_item *fsi)
+{
+	struct scoutfs_inode *cinode;
+	struct scoutfs_xattr *xat;
+	static char null = '\0';
+	size_t len;
+	u64 ino;
+	int ret = 0;
+
+	if (fsi->key.sk_type == SCOUTFS_INODE_TYPE) {
+		cinode = (void *)fsi->val;
+		ino = le64_to_cpu(fsi->key.ski_ino);
+
+		ret = copy_result_to_user(ures, off, size, SCOUTFS_IOC_RAW_READ_RESULT_INODE,
+					  &ino, sizeof(ino), cinode, sizeof(struct scoutfs_inode),
+					  0);
+
+	} else if (fsi->key.sk_type == SCOUTFS_XATTR_TYPE) {
+		if (fsi->key.skx_part == 0) {
+			xat = (void *)fsi->val;
+			ret = copy_result_to_user(ures, off, size,
+						  SCOUTFS_IOC_RAW_READ_RESULT_XATTR, xat->name,
+						  xat->name_len, &null, sizeof(null),
+						  le16_to_cpu(xat->val_len));
+			if (ret == 0 && xat->val_len != 0) {
+				/* then append the start of the value */
+				len = fsi->val_len -
+				      offsetof(struct scoutfs_xattr, name[xat->name_len]);
+				ret = copy_to_user_off(ures, off, size, xat->name + xat->name_len,
+						       len);
+			}
+		} else {
+			/* continue appending partial values */
+			ret = copy_to_user_off(ures, off, size, fsi->val, fsi->val_len);
+		}
+	}
+
+	return ret;
+}
+
+static bool ignore_zero_nlink(struct inode_info_context *ctx, struct fs_item *fsi)
+{
+	struct scoutfs_inode *cinode = (void *)fsi->val;
+
+	return cinode->nlink == 0;
+}
+
+static bool ignore_xattr_name(struct inode_info_context *ctx, struct fs_item *fsi)
+{
+	struct scoutfs_xattr *xat = (void *)fsi->val;
+	struct xattr_name name = {
+		.hash = le64_to_cpu(fsi->key.skx_name_hash),
+		.name = xat->name,
+		.name_len = xat->name_len,
+	};
+	size_t i;
+
+	for (i = bsearch_index(&name, ctx->names, ctx->nr_names, sizeof(ctx->names[0]),
+			       cmp_name_hash);
+	     i < ctx->nr_names && name.hash == ctx->names[i].hash; i++) {
+		if (cmp_name_string(&name, &ctx->names[i]) == 0)
+			return false;
+	}
+
+	return true;
+}
+
+static int copy_results_to_user(struct super_block *sb, struct inode_info_context *ctx,
+				struct scoutfs_ioctl_raw_read_inode_info *rii)
+{
+	void __user *ures = (void __user *)rii->results_ptr;
+	struct scoutfs_xattr *xat;
+	struct fs_item *next;
+	struct fs_item *fsi;
+	struct fs_item *tmp;
+	size_t xattr_end;
+	size_t off;
+	__le64 in_ino;
+	__le64 in_id;
+	int ret;
+
+	in_ino = 0;
+	xattr_end = 0;
+	in_id = 0;
+	off = 0;
+
+	list_for_each_entry_safe(fsi, tmp, &ctx->fs_items, head) {
+		/*
+		 * ignore:
+		 *  - inodes with an nlink of 0
+		 *  - all items for an ino after the inode struct that we're ignoring 
+		 *  - first xattr parts with a name we don't need
+		 *  - additional xattr parts when we ignored the first
+		 */
+		if ((fsi->key.sk_type == SCOUTFS_INODE_TYPE && ignore_zero_nlink(ctx, fsi)) ||
+		    (fsi->key.sk_type > SCOUTFS_INODE_TYPE && fsi->key._sk_first != in_ino) ||
+		    (fsi->key.sk_type == SCOUTFS_XATTR_TYPE &&
+		     ((fsi->key.skx_part == 0 && ignore_xattr_name(ctx, fsi)) ||
+		      (fsi->key.skx_part > 0 && fsi->key.skx_id != in_id)))) {
+			free_fs_item(fsi);
+			in_ino = 0;
+			in_id = 0;
+			continue;
+		}
+
+		/* advance ino/xattr stream context state machine */
+		if (fsi->key.sk_type == SCOUTFS_INODE_TYPE) {
+			in_ino = fsi->key.ski_ino;
+			in_id = 0;
+		} else if (fsi->key.sk_type == SCOUTFS_XATTR_TYPE && fsi->key.skx_part == 0) {
+			in_id = fsi->key.skx_id;
+			/* save the required offset after the complete xattr */
+			xat = (void *)fsi->val;
+			xattr_end = off + sizeof(struct scoutfs_ioctl_raw_read_result) +
+				    xat->name_len + 1 + le16_to_cpu(xat->val_len);
+		}
+
+		/* copy results, usually with header, but additional xattr parts copied raw */
+		ret = copy_item_results_to_user(sb, ctx, ures, &off, rii->results_size, fsi);
+		if (ret < 0)
+			goto out;
+
+		/* make sure we saw all xattr parts and copied the correct size */
+		if (xattr_end > 0 &&
+		    !((next = next_fs_item(&ctx->fs_items, fsi)) &&
+		      next->key.sk_type == SCOUTFS_XATTR_TYPE && next->key.skx_ino == in_ino &&
+		      next->key.skx_id == in_id)) {
+			if (off != xattr_end) {
+				ret = -EUCLEAN;
+				goto out;
+			}
+			xattr_end = 0;
+		}
+	}
+
+	ret = 0;
+out:
+	return ret ?: off;
+}
+
+/*
+ * If the key is for an inode we're not interested in, or if its past
+ * the xattr items, then advance to the next inode.  This is used
+ * between forest read items calls to avoid leaf blocks.  The callback
+ * takes care of iterating through the items for an inode across
+ * multiple leaves.
+ */
+static void advance_key_ino(struct scoutfs_key *key, struct inode_info_context *ctx)
+{
+	u64 ino = le64_to_cpu(key->_sk_first);
+	size_t ino_ind;
+
+	ino_ind = bsearch_index(&ino, ctx->inos, ctx->nr_inos, sizeof(ctx->inos[0]), cmp_u64);
+	if (ino_ind < ctx->nr_inos && ctx->inos[ino_ind] == ino) {
+		if (key->sk_type <= SCOUTFS_XATTR_TYPE)
+			return;
+		else
+			ino_ind++;
+	}
+
+	if (ino_ind < ctx->nr_inos)
+		scoutfs_inode_init_key(key, ctx->inos[ino_ind]);
+	else
+		scoutfs_key_set_ones(key);
+}
+
+int scoutfs_raw_read_inode_info(struct super_block *sb,
+				struct scoutfs_ioctl_raw_read_inode_info *rii)
+{
+	struct inode_info_context ctx = {0, };
+	struct scoutfs_net_roots roots;
+	DECLARE_SAVED_REFS(saved);
+	struct scoutfs_key lock_start;
+	struct scoutfs_key lock_end;
+	struct scoutfs_key start;
+	struct scoutfs_key last;
+	struct scoutfs_key key;
+	struct scoutfs_key end;
+	LIST_HEAD(list);
+	int retries = 10;
+	int ret;
+
+	ret = setup_context(&ctx, rii);
+	if (ret < 0)
+		goto out;
+
+	if (ctx.nr_names > 0)
+		scoutfs_xattr_init_key(&last, ctx.inos[ctx.nr_inos -1],
+				       ctx.names[ctx.nr_names - 1].hash, U64_MAX);
+	else
+		scoutfs_inode_init_key(&last, ctx.inos[ctx.nr_inos - 1]);
+
+retry:
+	ret = scoutfs_client_get_roots(sb, &roots);
+	if (ret)
+		goto out;
+
+	scoutfs_inode_init_key(&key, ctx.inos[0]);
+
+	while (scoutfs_key_compare(&key, &last) <= 0) {
+		scoutfs_lock_get_fs_item_range(le64_to_cpu(key._sk_first), &lock_start, &lock_end);
+
+		start = key;
+		end = last;
+		if (scoutfs_key_compare(&lock_end, &end) < 0)
+			end = lock_end;
+
+		ret = scoutfs_forest_read_items_roots(sb, &roots, 0, &key, &lock_start,
+						      &start, &end, save_info_items, &ctx);
+		if (ret < 0)
+			goto out;
+
+		/* save each sorted batch, might have partial results for an inode */
+		sort_and_remove(&ctx.fs_items, &end);
+		list_splice_tail_init(&ctx.fs_items, &list);
+
+		key = end;
+		if (!scoutfs_key_is_ones(&key)) {
+			scoutfs_key_inc(&key);
+			advance_key_ino(&key, &ctx);
+		}
+	}
+
+	list_splice_tail_init(&list, &ctx.fs_items);
+	ret = copy_results_to_user(sb, &ctx, rii);
+out:
+	free_fs_items(&list);
+	free_fs_items(&ctx.fs_items);
+
+	ret = scoutfs_block_check_stale(sb, ret, &saved, &roots.fs_root.ref, &roots.logs_root.ref);
+	if (ret == -ESTALE && retries-- > 0)
+		goto retry;
+
+	free_context(&ctx);
+	return ret;
+}
--- a/kmod/src/raw.h
+++ b/kmod/src/raw.h
@@ -0,0 +1,10 @@
+#ifndef _SCOUTFS_RAW_H_
+#define _SCOUTFS_RAW_H_
+
+int scoutfs_raw_read_meta_seq(struct super_block *sb,
+			      struct scoutfs_ioctl_raw_read_meta_seq *rms,
+			      struct scoutfs_ioctl_meta_seq *last_ret);
+int scoutfs_raw_read_inode_info(struct super_block *sb,
+				struct scoutfs_ioctl_raw_read_inode_info *rii);
+
+#endif
--- a/kmod/src/totl.c
+++ b/kmod/src/totl.c
@@ -30,6 +30,11 @@ void scoutfs_totl_merge_init(struct scoutfs_totl_merging *merg)
 	memset(merg, 0, sizeof(struct scoutfs_totl_merging));
 }

+/*
+ * bin the incoming merge inputs so that we can resolve delta items
+ * properly. Finalized logs that are merge inputs are kept separately
+ * from those that are not.
+ */
 void scoutfs_totl_merge_contribute(struct scoutfs_totl_merging *merg,
 				   u64 seq, u8 flags, void *val, int val_len, int fic)
 {
@@ -39,10 +44,10 @@ void scoutfs_totl_merge_contribute(struct scoutfs_totl_merging *merg,
 		merg->fs_seq = seq;
 		merg->fs_total = le64_to_cpu(tval->total);
 		merg->fs_count = le64_to_cpu(tval->count);
-	} else if (fic & FIC_FINALIZED) {
-		merg->fin_seq = seq;
-		merg->fin_total += le64_to_cpu(tval->total);
-		merg->fin_count += le64_to_cpu(tval->count);
+	} else if (fic & FIC_MERGE_INPUT) {
+		merg->inp_seq = seq;
+		merg->inp_total += le64_to_cpu(tval->total);
+		merg->inp_count += le64_to_cpu(tval->count);
 	} else {
 		merg->log_seq = seq;
 		merg->log_total += le64_to_cpu(tval->total);
@@ -53,15 +58,18 @@ void scoutfs_totl_merge_contribute(struct scoutfs_totl_merging *merg,
 /*
 * .totl. item merging has to be careful because the log btree merging
 * code can write partial results to the fs_root.  This means that a
- * reader can see both cases where new finalized logs should be applied
- * to the old fs items and where old finalized logs have already been
- * applied to the partially merged fs items.  Currently active logged
- * items are always applied on top of all cases.
+ * reader can see both cases where merge input deltas should be applied
+ * to the old fs items and where they have already been applied to the
+ * partially merged fs items.
+ *
+ * Only finalized log trees that are inputs to the current merge cycle
+ * are tracked in the inp_ bucket.  Finalized trees that aren't merge
+ * inputs and active log trees are always applied unconditionally since
+ * they cannot be in fs_root.
 *
 * These cases are differentiated with a combination of sequence numbers
- * in items, the count of contributing xattrs, and a flag
- * differentiating finalized and active logged items.  This lets us
- * recognize all cases, including when finalized logs were merged and
+ * in items and the count of contributing xattrs.  This lets us
+ * recognize all cases, including when merge inputs were merged and
 * deleted the fs item.
 */
 void scoutfs_totl_merge_resolve(struct scoutfs_totl_merging *merg, __u64 *total, __u64 *count)
@@ -75,14 +83,14 @@ void scoutfs_totl_merge_resolve(struct scoutfs_totl_merging *merg, __u64 *total,
 		*count = merg->fs_count;
 	}

-	/* apply finalized logs if they're newer or creating */
-	if (((merg->fs_seq != 0) && (merg->fin_seq > merg->fs_seq)) ||
-	    ((merg->fs_seq == 0) && (merg->fin_count > 0))) {
-		*total += merg->fin_total;
-		*count += merg->fin_count;
+	/* apply merge input deltas if they're newer or creating */
+	if (((merg->fs_seq != 0) && (merg->inp_seq > merg->fs_seq)) ||
+	    ((merg->fs_seq == 0) && (merg->inp_count > 0))) {
+		*total += merg->inp_total;
+		*count += merg->inp_count;
 	}

-	/* always apply active logs which must be newer than fs and finalized */
+	/* always apply non-input finalized and active logs */
 	if (merg->log_seq > 0) {
 		*total += merg->log_total;
 		*count += merg->log_count;
--- a/kmod/src/totl.h
+++ b/kmod/src/totl.h
@@ -7,9 +7,9 @@ struct scoutfs_totl_merging {
 	u64 fs_seq;
 	u64 fs_total;
 	u64 fs_count;
-	u64 fin_seq;
-	u64 fin_total;
-	s64 fin_count;
+	u64 inp_seq;
+	u64 inp_total;
+	s64 inp_count;
 	u64 log_seq;
 	u64 log_total;
 	s64 log_count;
--- a/kmod/src/triggers.c
+++ b/kmod/src/triggers.c
@@ -46,6 +46,7 @@ static char *names[] = {
 	[SCOUTFS_TRIGGER_SRCH_MERGE_STOP_SAFE] = "srch_merge_stop_safe",
 	[SCOUTFS_TRIGGER_STATFS_LOCK_PURGE] = "statfs_lock_purge",
 	[SCOUTFS_TRIGGER_RECLAIM_SKIP_FINALIZE] = "reclaim_skip_finalize",
+	[SCOUTFS_TRIGGER_LOG_MERGE_FORCE_PARTIAL] = "log_merge_force_partial",
 };

 bool scoutfs_trigger_test_and_clear(struct super_block *sb, unsigned int t)
--- a/kmod/src/triggers.h
+++ b/kmod/src/triggers.h
@@ -9,6 +9,7 @@ enum scoutfs_trigger {
 	SCOUTFS_TRIGGER_SRCH_MERGE_STOP_SAFE,
 	SCOUTFS_TRIGGER_STATFS_LOCK_PURGE,
 	SCOUTFS_TRIGGER_RECLAIM_SKIP_FINALIZE,
+	SCOUTFS_TRIGGER_LOG_MERGE_FORCE_PARTIAL,
 	SCOUTFS_TRIGGER_NR,
 };

--- a/kmod/src/wkic.c
+++ b/kmod/src/wkic.c
@@ -30,7 +30,6 @@
 #include "counters.h"
 #include "scoutfs_trace.h"
 #include "wkic.h"
-#include "msg.h"

 /*
 * This weaker item cache differs from the core item cache in item.c:
@@ -96,6 +95,7 @@ struct wkic_info {
 	/* block reading slow path */
 	struct mutex roots_mutex;
 	struct scoutfs_net_roots roots;
+	u64 merge_input_seq;
 	u64 roots_read_seq;
 	ktime_t roots_expire;

@@ -747,16 +747,6 @@ static void fill_page_items(struct super_block *sb, struct wkic_page *wpage, str
 				rb_erase(&witem->node, root);
 				kfree(witem);
 				continue;
-			} else if (tval->count == 0) {
-				/*
-				 * BUG: there are no contributing items but count != 0,
-				 * which shouldn't happen - we've gone off kilt.
-				 */
-				scoutfs_err(sb, "non-zero value for zero count totl "SK_FMT", dropping item",
-					    SK_ARG(&witem->key));
-				rb_erase(&witem->node, root);
-				kfree(witem);
-				continue;
 			}
 		}

@@ -816,29 +806,79 @@ static void free_page_list(struct super_block *sb, struct list_head *list)
 * read_seq number so that we can compare the age of the items in cached
 * pages.  Only one request to refresh the roots is in progress at a
 * time.  This is the slow path that's only used when the cache isn't
- * populated and the roots aren't cached.  The root request is fast
- * enough, especially compared to the resulting item reading IO, that we
- * don't mind hiding it behind a trivial mutex.
+ * populated and the roots aren't cached.
+ *
+ * We read roots directly from the on-disk superblock rather than
+ * requesting them from the server so that we can also read the
+ * log_merge btree from the same superblock.  The merge status item
+ * seq tells us which finalized log trees are inputs to the current
+ * merge, which is needed to correctly resolve totl delta items.
 */
-static int get_roots(struct super_block *sb, struct wkic_info *winf,
-		     struct scoutfs_net_roots *roots_ret, u64 *read_seq, bool force_new)
+static int refresh_roots(struct super_block *sb, struct wkic_info *winf)
+{
+	struct scoutfs_super_block *super;
+	struct scoutfs_log_merge_status *stat;
+	SCOUTFS_BTREE_ITEM_REF(iref);
+	struct scoutfs_key key;
+	int ret;
+
+	super = kmalloc(sizeof(*super), GFP_NOFS);
+	if (!super)
+		return -ENOMEM;
+
+	ret = scoutfs_read_super(sb, super);
+	if (ret < 0)
+		goto out;
+
+	winf->roots = (struct scoutfs_net_roots){
+		.fs_root = super->fs_root,
+		.logs_root = super->logs_root,
+		.srch_root = super->srch_root,
+	};
+
+	winf->merge_input_seq = 0;
+	if (super->log_merge.ref.blkno) {
+		scoutfs_key_set_zeros(&key);
+		key.sk_zone = SCOUTFS_LOG_MERGE_STATUS_ZONE;
+		ret = scoutfs_btree_lookup(sb, &super->log_merge, &key, &iref);
+		if (ret == 0) {
+			if (iref.val_len == sizeof(*stat)) {
+				stat = iref.val;
+				winf->merge_input_seq = le64_to_cpu(stat->seq);
+			} else {
+				ret = -EUCLEAN;
+			}
+			scoutfs_btree_put_iref(&iref);
+		} else if (ret == -ENOENT) {
+			ret = 0;
+		}
+		if (ret < 0)
+			goto out;
+	}
+
+	winf->roots_read_seq++;
+	winf->roots_expire = ktime_add_ms(ktime_get_raw(), WKIC_CACHE_LIFETIME_MS);
+out:
+	kfree(super);
+	return ret;
+}
+
+static int get_roots(struct super_block *sb, struct wkic_info *winf,
+		     struct scoutfs_net_roots *roots_ret, u64 *merge_input_seq,
+		     u64 *read_seq, bool force_new)
 {
-	struct scoutfs_net_roots roots;
 	int ret;

 	mutex_lock(&winf->roots_mutex);

 	if (force_new || ktime_before(winf->roots_expire, ktime_get_raw())) {
-		ret = scoutfs_client_get_roots(sb, &roots);
+		ret = refresh_roots(sb, winf);
 		if (ret < 0)
 			goto out;
-
-		winf->roots = roots;
-		winf->roots_read_seq++;
-		winf->roots_expire = ktime_add_ms(ktime_get_raw(), WKIC_CACHE_LIFETIME_MS);
 	}

 	*roots_ret = winf->roots;
+	*merge_input_seq = winf->merge_input_seq;
 	*read_seq = winf->roots_read_seq;
 	ret = 0;
 out:
@@ -881,24 +921,30 @@ static int insert_read_pages(struct super_block *sb, struct wkic_info *winf,
 	struct scoutfs_key end;
 	struct wkic_page *wpage;
 	LIST_HEAD(pages);
-	u64 read_seq;
+	u64 merge_input_seq;
+	u64 read_seq = 0;
 	int ret;

 	ret = 0;
 retry_stale:
-	ret = get_roots(sb, winf, &roots, &read_seq, ret == -ESTALE);
+	ret = get_roots(sb, winf, &roots, &merge_input_seq, &read_seq, ret == -ESTALE);
 	if (ret < 0)
-		goto out;
+		goto check_stale;

 	start = *range_start;
 	end = *range_end;
-	ret = scoutfs_forest_read_items_roots(sb, &roots, key, range_start, &start, &end,
-					      read_items_cb, &root);
+	ret = scoutfs_forest_read_items_roots(sb, &roots, merge_input_seq, key, range_start,
+					      &start, &end, read_items_cb, &root);
 	trace_scoutfs_wkic_read_items(sb, key, &start, &end);
+check_stale:
 	ret = scoutfs_block_check_stale(sb, ret, &saved, &roots.fs_root.ref, &roots.logs_root.ref);
 	if (ret < 0) {
-		if (ret == -ESTALE)
+		if (ret == -ESTALE) {
+			/* not safe to retry due to delta items, must restart clean */
+			free_item_tree(&root);
+			root = RB_ROOT;
 			goto retry_stale;
+		}
 		goto out;
 	}

--- a/kmod/src/xattr.c
+++ b/kmod/src/xattr.c
@@ -47,7 +47,7 @@
 *  - add acl support and call generic xattr->handlers for SYSTEM
 */

-static u32 xattr_name_hash(const char *name, unsigned int name_len)
+u32 scoutfs_xattr_name_hash(const char *name, unsigned int name_len)
 {
 	return crc32c(U32_MAX, name, name_len);
 }
@@ -65,8 +65,7 @@ static unsigned int xattr_nr_parts(struct scoutfs_xattr *xat)
 				      le16_to_cpu(xat->val_len));
 }

-static void init_xattr_key(struct scoutfs_key *key, u64 ino, u32 name_hash,
-			   u64 id)
+void scoutfs_xattr_init_key(struct scoutfs_key *key, u64 ino, u32 name_hash, u64 id)
 {
 	*key = (struct scoutfs_key) {
 		.sk_zone = SCOUTFS_FS_ZONE,
@@ -187,10 +186,10 @@ static int get_next_xattr(struct inode *inode, struct scoutfs_key *key,
 		return -EINVAL;

 	if (name_len)
-		name_hash = xattr_name_hash(name, name_len);
+		name_hash = scoutfs_xattr_name_hash(name, name_len);

-	init_xattr_key(key, scoutfs_ino(inode), name_hash, id);
-	init_xattr_key(&last, scoutfs_ino(inode), U32_MAX, U64_MAX);
+	scoutfs_xattr_init_key(key, scoutfs_ino(inode), name_hash, id);
+	scoutfs_xattr_init_key(&last, scoutfs_ino(inode), U32_MAX, U64_MAX);

 	for (;;) {
 		ret = scoutfs_item_next(sb, key, &last, xat, xat_bytes, lock);
@@ -335,8 +334,8 @@ static int create_xattr_items(struct inode *inode, u64 id, struct scoutfs_xattr
 	int len;
 	int i;

-	init_xattr_key(&key, scoutfs_ino(inode),
-		       xattr_name_hash(xat->name, xat->name_len), id);
+	scoutfs_xattr_init_key(&key, scoutfs_ino(inode),
+		       scoutfs_xattr_name_hash(xat->name, xat->name_len), id);

 	for (i = 0; i < new_parts; i++) {
 		key.skx_part = i;
@@ -365,7 +364,7 @@ static int delete_xattr_items(struct inode *inode, u32 name_hash, u64 id,
 	int ret = 0;
 	int i;

-	init_xattr_key(&key, scoutfs_ino(inode), name_hash, id);
+	scoutfs_xattr_init_key(&key, scoutfs_ino(inode), name_hash, id);

 	/* dirty additional existing old items */
 	for (i = 1; i < nr_parts; i++) {
@@ -407,8 +406,8 @@ static int change_xattr_items(struct inode *inode, u64 id,
 	int i;
 	int ret;

-	init_xattr_key(&key, scoutfs_ino(inode),
-		       xattr_name_hash(xat->name, xat->name_len), id);
+	scoutfs_xattr_init_key(&key, scoutfs_ino(inode),
+			       scoutfs_xattr_name_hash(xat->name, xat->name_len), id);

 	/* dirty existing old items */
 	for (i = 0; i < old_parts; i++) {
@@ -1224,8 +1223,8 @@ int scoutfs_xattr_drop(struct super_block *sb, u64 ino,
 		goto out;
 	}

-	init_xattr_key(&key, ino, 0, 0);
-	init_xattr_key(&last, ino, U32_MAX, U64_MAX);
+	scoutfs_xattr_init_key(&key, ino, 0, 0);
+	scoutfs_xattr_init_key(&last, ino, U32_MAX, U64_MAX);

 	for (;;) {
 		ret = scoutfs_item_next(sb, &key, &last, (void *)xat, bytes,
@@ -1265,6 +1264,7 @@ int scoutfs_xattr_drop(struct super_block *sb, u64 ino,
 			ret = parse_indx_key(&tag_key, xat->name, xat->name_len, ino);
 			if (ret < 0)
 				goto out;
+			scoutfs_xattr_set_indx_key_xid(&tag_key, le64_to_cpu(key.skx_id));
 		}

 		if ((tgs.totl || tgs.indx) && locked_zone != tag_key.sk_zone) {
--- a/kmod/src/xattr.h
+++ b/kmod/src/xattr.h
@@ -10,6 +10,9 @@ struct scoutfs_xattr_prefix_tags {

 extern const struct xattr_handler *scoutfs_xattr_handlers[];

+u32 scoutfs_xattr_name_hash(const char *name, unsigned int name_len);
+void scoutfs_xattr_init_key(struct scoutfs_key *key, u64 ino, u32 name_hash, u64 id);
+
 int scoutfs_xattr_get_locked(struct inode *inode, const char *name, void *buffer, size_t size,
 			     struct scoutfs_lock *lck);
 int scoutfs_xattr_set_locked(struct inode *inode, const char *name, size_t name_len,
--- a/tests/.gitignore
+++ b/tests/.gitignore
@@ -12,3 +12,4 @@ src/o_tmpfile_umask
 src/o_tmpfile_linkat
 src/mmap_stress
 src/mmap_validate
+src/watch_raw_inode_change
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -15,7 +15,8 @@ BIN := src/createmany			\
 	src/o_tmpfile_umask		\
 	src/o_tmpfile_linkat		\
 	src/mmap_stress			\
-	src/mmap_validate
+	src/mmap_validate		\
+	src/watch_raw_inode_change

 DEPS := $(wildcard src/*.d)

--- a/tests/funcs/filter.sh
+++ b/tests/funcs/filter.sh
@@ -20,9 +20,6 @@ t_filter_fs()
 # [ 2687.691366] BUG: KASAN: stack-out-of-bounds in get_reg+0x1bc/0x230
 # ...
 # [ 2687.706220] ==================================================================
-# [ 2687.707284] Disabling lock debugging due to kernel taint
-#
-# That final lock debugging message may not be included.
 #
 ignore_harmless_unwind_kasan_stack_oob()
 {
@@ -46,10 +43,6 @@ awk '
 		saved=""
        }
        ( in_soob == 2 && $0 ~ /==================================================================/ ) {
-                in_soob = 3
-                soob_nr = NR
-        }
-        ( in_soob == 3 && NR > soob_nr && $0 !~ /Disabling lock debugging/ ) {
                in_soob = 0
        }
        ( !in_soob ) { print $0 }
@@ -61,6 +54,58 @@ awk '
 '
 }

+#
+# in el97+, XFS can generate a spurious lockdep circular dependency
+# warning about reclaim. Fixed upstream in e.g. v5.7-rc4-129-g6dcde60efd94
+#
+ignore_harmless_xfs_lockdep_warning()
+{
+awk '
+	BEGIN {
+		in_block = 0
+		block_nr = 0
+		buf = ""
+	}
+	( !in_block && $0 ~ /======================================================/ ) {
+		in_block = 1
+		block_nr = NR
+		buf = $0 "\n"
+		next
+	}
+	( in_block == 1 && NR == (block_nr + 1) ) {
+		if (match($0, /WARNING: possible circular locking dependency detected/) != 0) {
+			in_block = 2
+			buf = buf $0 "\n"
+		} else {
+			in_block = 0
+			printf "%s", buf
+			print $0
+			buf = ""
+		}
+		next
+	}
+	( in_block == 2 ) {
+		buf = buf $0 "\n"
+		if ($0 ~ /<\/TASK>/) {
+			if (buf ~ /xfs_(nondir_|dir_)?ilock_class/ && buf ~ /fs_reclaim/) {
+				# known xfs lockdep false positive, discard
+			} else {
+				printf "%s", buf
+			}
+			in_block = 0
+			buf = ""
+		}
+		next
+	}
+	{ print $0 }
+	END {
+		if (buf) {
+			printf "%s", buf
+		}
+	}
+'
+}
+
 #
 # Filter out expected messages.  Putting messages here implies that
 # tests aren't relying on messages to discover failures.. they're
@@ -176,6 +221,10 @@ t_filter_dmesg()
 	# creating block devices may trigger this
 	re="$re|block device autoloading is deprecated and will be removed."

+	# lockdep or kasan warnings can cause this
+	re="$re|Disabling lock debugging due to kernel taint"
+
 	egrep -v "($re)" | \
-		ignore_harmless_unwind_kasan_stack_oob
+		ignore_harmless_unwind_kasan_stack_oob | \
+		ignore_harmless_xfs_lockdep_warning
 }
--- a/tests/golden/basic-xattr-indx
+++ b/tests/golden/basic-xattr-indx
@@ -0,0 +1,54 @@
+== testing invalid read-xattr-index arguments
+bad index position entry argument 'bad', it must be in the form "a.b.ino" where each value can be prefixed by '0' for octal or '0x' for hex
+scoutfs: read-xattr-index failed: Invalid argument (22)
+bad index position entry argument '1.2', it must be in the form "a.b.ino" where each value can be prefixed by '0' for octal or '0x' for hex
+scoutfs: read-xattr-index failed: Invalid argument (22)
+initial major index position '256' must be between 0 and 255, inclusive.
+scoutfs: read-xattr-index failed: Invalid argument (22)
+first index position 1.2.3 must be less than last index position 0.0.0
+scoutfs: read-xattr-index failed: Invalid argument (22)
+first index position 1.2.0 must be less than last index position 1.1.2
+scoutfs: read-xattr-index failed: Invalid argument (22)
+first index position 2.2.2 must be less than last index position 2.2.1
+scoutfs: read-xattr-index failed: Invalid argument (22)
+== testing invalid names
+setfattr: /mnt/test/test/basic-xattr-indx/invalid: Invalid argument
+setfattr: /mnt/test/test/basic-xattr-indx/invalid: Invalid argument
+setfattr: /mnt/test/test/basic-xattr-indx/invalid: Invalid argument
+setfattr: /mnt/test/test/basic-xattr-indx/invalid: Invalid argument
+setfattr: /mnt/test/test/basic-xattr-indx/invalid: Invalid argument
+setfattr: /mnt/test/test/basic-xattr-indx/invalid: Invalid argument
+setfattr: /mnt/test/test/basic-xattr-indx/invalid: Invalid argument
+setfattr: /mnt/test/test/basic-xattr-indx/invalid: Invalid argument
+setfattr: /mnt/test/test/basic-xattr-indx/invalid: Invalid argument
+setfattr: /mnt/test/test/basic-xattr-indx/invalid: Invalid argument
+setfattr: /mnt/test/test/basic-xattr-indx/invalid: Invalid argument
+setfattr: /mnt/test/test/basic-xattr-indx/invalid: Numerical result out of range
+== testing boundary values
+0.0 found
+255.max found
+== indx xattr must have no value
+setfattr: /mnt/test/test/basic-xattr-indx/noval: Invalid argument
+setfattr: /mnt/test/test/basic-xattr-indx/noval: Invalid argument
+== set indx xattr and verify index entry
+found
+== setting same indx xattr again is a no-op
+found
+== removing non-existent indx xattr succeeds
+setfattr: /mnt/test/test/basic-xattr-indx/file: No such attribute
+still found
+== explicit xattr removal cleans up index entry
+== file deletion cleans up index entry
+found before delete
+== multiple indx xattrs on one file cleaned up by deletion
+entries before delete: 2
+entries after delete: 0
+== partial removal leaves other entries
+300 found
+== multiple files at same index position
+files at same position: 2
+surviving file found
+== cross-mount visibility
+found on mount 1
+== duplicate position deduplication
+entries for same position: 1
--- a/tests/golden/totl-merge-read
+++ b/tests/golden/totl-merge-read
@@ -0,0 +1,3 @@
+== setup
+expected 4681
+== cleanup
--- a/tests/run-tests.sh
+++ b/tests/run-tests.sh
@@ -694,8 +694,8 @@ for t in $tests; do
 		if [ "$sts" == "$T_PASS_STATUS" ]; then
 			dmesg | t_filter_dmesg > "$T_TMPDIR/dmesg.after"
 			diff --old-line-format="" --unchanged-line-format="" \
-				"$T_TMPDIR/dmesg.before" "$T_TMPDIR/dmesg.after" > \
-				"$T_TMPDIR/dmesg.new"
+				"$T_TMPDIR/dmesg.before" "$T_TMPDIR/dmesg.after" | \
+				grep -v '^$' > "$T_TMPDIR/dmesg.new"

 			if [ -s "$T_TMPDIR/dmesg.new" ]; then
 				message="unexpected messages in dmesg"
--- a/tests/sequence
+++ b/tests/sequence
@@ -26,7 +26,9 @@ srch-basic-functionality.sh
 simple-xattr-unit.sh
 retention-basic.sh
 totl-xattr-tag.sh
+basic-xattr-indx.sh
 quota.sh
+totl-merge-read.sh
 lock-refleak.sh
 lock-shrink-consistency.sh
 lock-shrink-read-race.sh
--- a/tests/src/watch_raw_inode_change.c
+++ b/tests/src/watch_raw_inode_change.c
@@ -0,0 +1,664 @@
+/*
+ * Copyright (C) 2026 Versity Software, Inc.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <time.h>
+#include <linux/types.h>
+#include <assert.h>
+#include <stdbool.h>
+
+#include "../../utils/src/util.h"
+#include "ioctl.h"
+#include "format.h"
+
+/*
+ * This is a quick example of using the raw reading ioctls to get info
+ * on inodes as they change.  We maintain an array of meta_seq items for
+ * inodes that we've seen.  If we read the current meta_seq items and
+ * see differences then we get inode info and update our array with what
+ * we find.
+ *
+ * This only maintains one array and sorts it back and forth as we walk
+ * the meta_seq items and then search by inode number.  This will
+ * eventually use far too much cpu as the number of inodes increases.
+ */
+
+#define MSF		"%llu.%llu"
+#define MSA(ms)		(ms)->meta_seq, (ms)->ino
+#define NERRF		"nerr %d (\"%s\")"
+#define NERRA(nerr)	nerr, strerror(-nerr)
+
+#define prerror(fmt, args...) \
+	fprintf(stderr, "error: "fmt"\n", ##args)
+
+#define prdebug(fmt, args...) \
+do { \
+	if (opts.debug) \
+		printf(fmt"\n", ##args); \
+} while (0)
+
+static struct opts {
+	bool debug;
+	char *path;
+	char *names;
+	size_t names_size;
+	size_t names_count;
+} opts;
+
+struct stats {
+	__u64 start;
+	__u64 last;
+
+	struct per_call {
+		__u64 begin;
+		__u64 calls;
+		__u64 time;
+		__u64 inos;
+	} rms, rii;
+
+	__u64 inodes;
+	__u64 add;
+	__u64 remove;
+	__u64 update;
+
+	unsigned lines;
+} stats;
+
+struct meta_seq_array {
+	size_t nr;
+	size_t alloc;
+	struct scoutfs_ioctl_meta_seq *ms;
+};
+
+#define INO_BATCH	1000
+/* *2 for gratuitous allowance for struct expansion */
+#define RESULTS_SIZE	(INO_BATCH * 2 * (sizeof(struct scoutfs_ioctl_raw_read_result) + \
+		                          sizeof(__u64) + \
+		                          180 /* ~= sizeof(struct scoutfs_inode) */ + \
+		                          sizeof(struct scoutfs_ioctl_inode_attr_x)))
+
+#define NSEC_PER_SEC 1000000000
+
+static __u64 get_ns(void)
+{
+	struct timespec tp;
+	int ret;
+
+	ret = clock_gettime(CLOCK_MONOTONIC, &tp);
+	if (ret != 0) {
+		ret = -errno;
+		prerror("clock_gettime() error: "NERRF, NERRA(ret));
+		exit(2);
+	}
+
+	return ((__u64)tp.tv_sec * NSEC_PER_SEC) + (__u64)tp.tv_nsec;
+}
+static void begin_call(struct per_call *pc)
+{
+	pc->begin = get_ns();
+}
+
+static void end_call(struct per_call *pc)
+{
+	pc->calls++;
+	pc->time += get_ns() - pc->begin;
+}
+
+static int expand_array(struct meta_seq_array *arr, size_t additional)
+{
+#define ALLOC_BATCH	(1024 * 1024 / (sizeof(struct scoutfs_ioctl_meta_seq)))
+	struct scoutfs_ioctl_meta_seq *ms;
+	size_t expand;
+
+	if (arr->nr + additional <= arr->alloc)
+		return 0;
+
+	expand = arr->alloc + ALLOC_BATCH;
+	ms = reallocarray(arr->ms, expand, sizeof(arr->ms[0]));
+	if (!ms) {
+		prerror("allocating ms array with %zu elements failed", expand);
+		return -ENOMEM;
+	}
+
+	arr->alloc = expand;
+	arr->ms = ms;
+
+	return 0;
+}
+
+static void inc_ms(struct scoutfs_ioctl_meta_seq *ms)
+{
+	if (++ms->ino == 0)
+		ms->meta_seq++;
+}
+
+static void set_ms(struct scoutfs_ioctl_meta_seq *ms, __u64 meta_seq, __u64 ino)
+{
+	ms->meta_seq = meta_seq;
+	ms->ino = ino;
+}
+
+static int compar_ms_ino(const void *A, const void *B)
+{
+	const struct scoutfs_ioctl_meta_seq *a = A;
+	const struct scoutfs_ioctl_meta_seq *b = B;
+
+	return a->ino < b->ino ? -1 : a->ino > b->ino ? 1 : 0;
+}
+
+static int compar_ms_meta_seq(const void *A, const void *B)
+{
+	const struct scoutfs_ioctl_meta_seq *a = A;
+	const struct scoutfs_ioctl_meta_seq *b = B;
+
+	return a->meta_seq < b->meta_seq ? -1 : a->meta_seq > b->meta_seq ? 1 :
+	       compar_ms_ino(A, B);
+}
+
+static int compar_u64(const void *A, const void *B)
+{
+	const __u64 *a = A;
+	const __u64 *b = B;
+
+	return *a < *b ? -1 : *a > *b ? 1 : 0;
+}
+
+struct bsearch_ind_key {
+	int (*compar)(const void *a, const void *b);
+	void *key;
+	size_t size;
+	void **index;
+};
+
+static int bsearch_ind_compar(const void *a, const void *b)
+{
+	const struct bsearch_ind_key *bik = (const void *)((unsigned long)a ^ 1);
+	int cmp;
+
+	/* this key hack only works if compar is always called where a is key and b is &base[..] */
+	assert((unsigned long)a & 1);
+	assert(!((unsigned long)b & 1));
+
+	cmp = bik->compar(bik->key, b);
+	if (cmp > 0)
+		*(bik->index) = (void *)b + bik->size;
+	else
+		*(bik->index) = (void *)b;
+
+	return cmp;
+}
+
+static size_t bsearch_ind(const void *key, const void *base, size_t nmemb, size_t size,
+			  int (*compar)(const void *a, const void *b))
+{
+	void *index = (void *)base;
+	struct bsearch_ind_key bik = {
+		.compar = compar,
+		.key = (void *)key,
+		.size = size,
+		.index = &index,
+	};
+
+	bsearch((void *)(((unsigned long)&bik) | 1), base, nmemb, size, bsearch_ind_compar);
+
+	return (index - base) / size;
+}
+
+/*
+ * Generate a sorted list of inode numbers for the meta_seq items that
+ * differ between the results from raw_read_meta_seq and the items we
+ * have saved in our array. 
+ */
+static int differing_inos(__u64 *inos, struct meta_seq_array *arr,
+			  struct scoutfs_ioctl_meta_seq *start,
+			  struct scoutfs_ioctl_meta_seq *last,
+			  struct scoutfs_ioctl_meta_seq *ms, size_t nr)
+{
+	size_t arr_last;
+	size_t a;
+	size_t m;
+	int nr_inos;
+	int cmp;
+	int i;
+	int n;
+
+	/* find where we're going to stop in arr */
+	arr_last = bsearch_ind(last, arr->ms, arr->nr, sizeof(arr->ms[0]), compar_ms_meta_seq);
+	if (arr_last < arr->nr && compar_ms_meta_seq(&arr->ms[arr_last], last) == 0)
+		arr_last++;
+
+	a = bsearch_ind(start, arr->ms, arr->nr, sizeof(arr->ms[0]), compar_ms_meta_seq);
+
+	for (m = 0, nr_inos = 0; (a < arr_last || m < nr) && nr_inos < INO_BATCH; ) {
+
+		prdebug("diffing: m %zu nr %zu | a %zu arr_last %zu | nr_inos %d",
+			m, nr, a, arr_last, nr_inos);
+		if (a < arr_last)
+			prdebug("  arr->ms[%zu] = "MSF, a, MSA(&arr->ms[a]));
+		if (m < nr)
+			prdebug("  ms[%zu] = "MSF, m, MSA(&ms[m]));
+
+		/* setup comparison to copy lesser or only */
+		if (a < arr_last && m < nr)
+			cmp = compar_ms_meta_seq(&arr->ms[a], &ms[m]);
+		else if (a < arr_last)
+			cmp = -1;
+		else
+			cmp = 1;
+
+		prdebug("  cmp %d", cmp);
+
+		if (cmp == 0) {
+			/* ignore both when they match */
+			a++;
+			m++;
+		} else if (cmp < 0) {
+			inos[nr_inos++] = arr->ms[a++].ino;
+		} else { /* cmp > 0 */
+			inos[nr_inos++] = ms[m++].ino;
+		}
+	}
+
+	/* if we didn't consume all the read meta_seq then we might need to clamp last */
+	if (m < nr && compar_ms_meta_seq(&ms[m], last) <= 0) {
+		*last = ms[m];
+		last->ino--; /* must be non-zero, can't wrap */
+	}
+
+	/* sort and remove duplicate inode numbers */
+	if (nr_inos > 0) {
+		qsort(inos, nr_inos, sizeof(inos[0]), compar_u64);
+		for (i = 1, n = 1; i < nr_inos; i++) {
+			if (inos[i] != inos[n - 1])
+				inos[n++] = inos[i];
+		}
+		nr_inos = n;
+	}
+
+	return nr_inos;
+}
+
+/*
+ * We're not really validating the result stream.  We assume that the offset currently
+ * points at an inode.  We fill the caller's ms with its info then iterate through
+ * all its results until the next ino.
+ */
+static ssize_t read_inode_results(void *buf, size_t off, size_t size,
+				  struct scoutfs_ioctl_meta_seq *found)
+{
+	struct scoutfs_ioctl_raw_read_result res;
+	size_t len;
+	__le64 ms;
+
+	found->ino = 0;
+
+	while (off < size) {
+		memcpy(&res, buf + off, sizeof(res));
+		prdebug("res %u %u", res.type, res.size);
+
+		if (res.type == SCOUTFS_IOC_RAW_READ_RESULT_INODE && found->ino != 0)
+			break;
+
+		off += sizeof(res);
+
+		switch(res.type) {
+			case SCOUTFS_IOC_RAW_READ_RESULT_INODE:
+				memcpy(&found->ino, buf + off, sizeof(__u64));
+				memcpy(&ms, buf + off + sizeof(__u64) +
+				       offsetof(struct scoutfs_inode, meta_seq), sizeof(__le64));
+				found->meta_seq = le64_to_cpu(ms);
+				prdebug("res ino %llu ms %llu", found->ino, found->meta_seq);
+				break;
+
+			case SCOUTFS_IOC_RAW_READ_RESULT_XATTR:
+				len = strlen((char *)buf + off) + 1;
+				prdebug("res xattr '%s' len %d: '%.*s'",
+					(char *)buf + off, 
+					(int)(res.size - len),
+					(int)(res.size - len),
+					(char *)buf + off + len);
+				break;
+		};
+		off += res.size;
+	}
+
+	return off;
+}
+
+/*
+ * inos[] contains the inode numbers that we're interested in.  Get
+ * their info and update our array with what we find.
+ */
+static int read_inode_info(int fd, void *buf, struct meta_seq_array *arr, __u64 *inos, int nr_inos)
+{
+	struct scoutfs_ioctl_raw_read_inode_info rii;
+	struct scoutfs_ioctl_meta_seq found;
+	struct scoutfs_ioctl_meta_seq ms;
+	ssize_t off;
+	size_t size;
+	size_t ind;
+	size_t added;
+	int i;
+	int ret;
+
+	rii = (struct scoutfs_ioctl_raw_read_inode_info) {
+		.inos_ptr = (unsigned long)inos,
+		.inos_count = nr_inos,
+		.names_ptr = (unsigned long)opts.names,
+		.names_count = opts.names_count,
+		.results_ptr = (unsigned long)buf,
+		.results_size = RESULTS_SIZE,
+	};
+
+	begin_call(&stats.rii);
+	ret = ioctl(fd, SCOUTFS_IOC_RAW_READ_INODE_INFO, &rii);
+	if (ret < 0) {
+		ret = -errno;
+		prerror("READ_INODE_INFO ioctl failed: "NERRF, NERRA(ret));
+		goto out;
+	}
+	end_call(&stats.rii);
+
+	prdebug("gii ret %d", ret);
+
+	off = 0;
+	size = ret;
+	set_ms(&found, 0, 0);
+	added = 0;
+	i = 0;
+
+	/* sort by ino so we can search by ino for updates */
+	qsort(arr->ms, arr->nr, sizeof(arr->ms[0]), compar_ms_ino);
+
+	while (i < nr_inos) {
+		/* find next ino */
+		if (!found.ino && off < size) {
+			off = read_inode_results(buf, off, size, &found);
+			if (off < 0) {
+				ret = off;
+				goto out;
+			}
+			stats.rii.inos++;
+		}
+
+		if (i < nr_inos && (!found.ino || inos[i] < found.ino)) {
+			/* delete any record of inodes we didn't find */
+			set_ms(&ms, UINT64_MAX, inos[i]);
+			i++;
+
+		} else if (found.ino) {
+			/* update/add arr to match the found ino */
+			ms = found;
+			if (i < nr_inos && inos[i] == found.ino)
+				i++;
+			set_ms(&found, 0, 0);
+		}
+
+		/* find existing record */
+		ind = bsearch_ind(&ms, arr->ms, arr->nr, sizeof(arr->ms[0]), compar_ms_ino);
+		if (ind < arr->nr && arr->ms[ind].ino == ms.ino) {
+			/* update existing ino, can be marking for deletion */
+			prdebug("updating arr [%zu] ino %llu ms %llu -> %llu",
+					ind, ms.ino, arr->ms[ind].meta_seq, ms.meta_seq);
+			arr->ms[ind].meta_seq = ms.meta_seq;
+			if (ms.meta_seq == UINT64_MAX)
+				stats.remove++;
+			else
+				stats.update++;
+
+		} else if (ms.meta_seq != UINT64_MAX) {
+			/* append new found, maintaining existing sorting */
+			arr->ms[arr->nr + added] = ms;
+			prdebug("adding arr [%zu] ino %llu ms %llu",
+					arr->nr + added, ms.ino, ms.meta_seq);
+			added++;
+			stats.add++;
+		}
+	}
+
+	/* sort by seq again for next meta seq read */
+	arr->nr += added;
+	qsort(arr->ms, arr->nr, sizeof(arr->ms[0]), compar_ms_meta_seq);
+
+	/* and trim off any deletions */
+	while (arr->nr > 0 && arr->ms[arr->nr - 1].meta_seq == UINT64_MAX)
+		arr->nr--;
+
+	ret = 0;
+out:
+	return ret;
+}
+
+static double secs(u64 a_ns, u64 b_ns)
+{
+	return (double)(a_ns - b_ns) / NSEC_PER_SEC;
+}
+
+static double nr_per_sec(u64 nr, __u64 nsec)
+{
+	if (nsec == 0)
+		return 0;
+
+	return (double)nr / secs(nsec, 0);
+}
+
+static void print_stats(void)
+{
+	u64 now = get_ns();
+
+	if (secs(now, stats.last) < 1.0)
+		return;
+
+	if ((stats.lines++ % 16) == 0) {
+		printf("%6s | %-29s | %-23s | %-23s\n",
+			"", "inodes", "meta_seq", "inode_info");
+		printf("%6s | %8s %6s %6s %6s | %7s %7s %7s | %7s %7s %7s\n",
+			"now",
+			"total", "add", "remove", "update",
+			"calls", "inos", "inos/s",
+			"calls", "inos", "inos/s");
+	}
+
+	printf("%6.3lf | %8llu %6llu %6llu %6llu | %7llu %7llu %7.0lf | %7llu %7llu %7.0lf\n",
+		secs(now, stats.start),
+		stats.inodes, stats.add, stats.remove, stats.update,
+		stats.rms.calls, stats.rms.inos, nr_per_sec(stats.rms.inos, stats.rms.time),
+		stats.rii.calls, stats.rii.inos, nr_per_sec(stats.rms.inos, stats.rii.time));
+
+	stats.last = now;
+
+	{
+		struct stats save = stats;
+		stats = (struct stats) {
+			.start = save.start,
+			.last = save.last,
+			.lines = save.lines,
+		};
+	}
+}
+
+static void add_xattr(char *name)
+{
+	size_t len_null;
+	char *names;
+	int ret;
+
+	len_null = strlen(name) + 1;
+	names = realloc(opts.names, opts.names_size + len_null);
+	if (!names) {
+		ret = -errno;
+		prerror("allocation of xattr names buffer failed: "NERRF, NERRA(ret));
+		exit(3);
+	}
+
+	memcpy(names + opts.names_size, name, len_null);
+
+	opts.names = names;
+	opts.names_size += len_null;
+	opts.names_count++;
+}
+
+static bool parse_opts(int argc, char **argv)
+{
+	bool usage = false;
+	int c;
+
+	opts = (struct opts) {
+		.debug = false,
+	};
+
+        while ((c = getopt(argc, argv, "dp:x:")) != -1) {
+                switch(c) {
+                case 'd':
+                        opts.debug = true;
+                        break;
+                case 'p':
+                        opts.path = strdup(optarg);
+                        break;
+                case 'x':
+			add_xattr(optarg);
+                        break;
+                case '?':
+                        printf("Unknown option '%c'\n", optopt);
+			usage = true;
+                }
+	}
+
+	if (!usage) {
+		usage = true;
+		if (!opts.path)
+			printf("need -p path option\n");
+		else
+			usage = false;
+	}
+
+	if (usage) {
+		printf("\nusage:\n"
+		       " -d      | enable verbose debugging output\n"
+		       " -p PATH | path to file system to watch\n"
+		       " -x NAME | try to read named xattr with inodes, can be many\n"
+		      );
+		return false;
+	}
+
+	return true;
+}
+
+int main(int argc, char **argv)
+{
+	struct scoutfs_ioctl_raw_read_meta_seq rms = {0,};
+	struct scoutfs_ioctl_meta_seq *ms;
+	struct meta_seq_array arr = {0,};
+	__u64 *inos = NULL;
+	void *buf = NULL;
+	int fd = -1;
+	int nr_inos;
+	int nr;
+	int i;
+	int ret;
+
+	if (!parse_opts(argc, argv))
+		exit(1);
+
+	inos = calloc(INO_BATCH, sizeof(inos[0]));
+	buf = malloc(RESULTS_SIZE);
+	if (!inos || !buf) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	rms.results_ptr = (unsigned long)buf;
+	rms.results_size = min(RESULTS_SIZE, INO_BATCH * sizeof(struct scoutfs_ioctl_meta_seq));
+
+	fd = open(opts.path, O_RDONLY);
+	if (fd == -1) {
+		perror("error");
+		exit(1);
+	}
+
+	stats.start = get_ns();
+
+	for (;;) {
+		set_ms(&rms.start, 0, 0);
+		set_ms(&rms.end, UINT64_MAX, UINT64_MAX);
+
+		do {
+			begin_call(&stats.rms);
+			ret = ioctl(fd, SCOUTFS_IOC_RAW_READ_META_SEQ, &rms);
+			if (ret < 0) {
+				ret = -errno;
+				prerror("READ_META_SEQ ioctl failed, "
+					"start "MSF" end "MSF", "NERRF,
+					MSA(&rms.start), MSA(&rms.end), NERRA(ret));
+				goto out;
+			}
+			end_call(&stats.rms);
+			stats.rms.inos += ret;
+
+			prdebug("RMS last "MSF" ret %d:", MSA(&rms.last), ret);
+
+			nr = ret;
+			ms = buf;
+
+			if (opts.debug && nr > 0) {
+				for (i = 0; i < nr; i++)
+					prdebug(" [%u] "MSF"", i, MSA(&ms[i]));
+			}
+
+			nr_inos = differing_inos(inos, &arr, &rms.start, &rms.last, ms, nr);
+
+			if (nr_inos > 0) {
+				prdebug("diff inos %d:", nr_inos);
+				for (i = 0; i < nr_inos; i++)
+					prdebug(" [%u] %llu", i, inos[i]);
+
+				ret = expand_array(&arr, nr_inos) ?:
+				      read_inode_info(fd, buf, &arr, inos, nr_inos);
+				if (ret < 0)
+					goto out;
+			}
+
+			stats.inodes = arr.nr;
+			print_stats();
+
+			rms.start = rms.last;
+			inc_ms(&rms.start);
+
+		} while (rms.last.meta_seq != UINT64_MAX || rms.last.ino != UINT64_MAX);
+
+
+		sleep(1);
+	}
+
+	ret = 0;
+out:
+	if (fd >= 0)
+		close(fd);
+
+	free(inos);
+	free(buf);
+	free(arr.ms);
+	free(opts.names);
+
+	return ret;
+}
--- a/tests/tests/basic-xattr-indx.sh
+++ b/tests/tests/basic-xattr-indx.sh
@@ -0,0 +1,143 @@
+#
+# Test basic .indx. xattr tag functionality and index entry lifecycle
+#
+
+t_require_commands touch rm setfattr scoutfs stat
+t_require_mounts 2
+
+# query index from a specific mount, default mount 0
+read_xattr_index()
+{
+	local nr="${1:-0}"
+	local mnt="$(eval echo \$T_M$nr)"
+	shift
+
+	sync
+	echo 1 > $(t_debugfs_path $nr)/drop_weak_item_cache
+	scoutfs read-xattr-index -p "$mnt" "$@"
+}
+
+MAJOR=5
+MINOR=100
+
+echo "== testing invalid read-xattr-index arguments"
+scoutfs read-xattr-index -p "$T_M0" bad 2>&1
+scoutfs read-xattr-index -p "$T_M0" 1.2 2>&1
+scoutfs read-xattr-index -p "$T_M0" 1.2.3 256.0.0 2>&1
+scoutfs read-xattr-index -p "$T_M0" 1.2.3 0.0.0 2>&1
+scoutfs read-xattr-index -p "$T_M0" 1.2.0 1.1.2 2>&1
+scoutfs read-xattr-index -p "$T_M0" 2.2.2 2.2.1 2>&1
+
+echo "== testing invalid names"
+touch "$T_D0/invalid"
+setfattr -n scoutfs.hide.indx.test.$MAJOR "$T_D0/invalid" 2>&1 | t_filter_fs
+setfattr -n scoutfs.hide.indx.test "$T_D0/invalid" 2>&1 | t_filter_fs
+setfattr -n scoutfs.hide.indx.test.. "$T_D0/invalid" 2>&1 | t_filter_fs
+setfattr -n scoutfs.hide.indx.test..$MINOR "$T_D0/invalid" 2>&1 | t_filter_fs
+setfattr -n scoutfs.hide.indx.test.$MAJOR. "$T_D0/invalid" 2>&1 | t_filter_fs
+setfattr -n scoutfs.hide.indx.test.256.$MINOR "$T_D0/invalid" 2>&1 | t_filter_fs
+setfattr -n scoutfs.hide.indx.test.abc.$MINOR "$T_D0/invalid" 2>&1 | t_filter_fs
+setfattr -n scoutfs.hide.indx.test.$MAJOR.abc "$T_D0/invalid" 2>&1 | t_filter_fs
+setfattr -n scoutfs.hide.indx.test.-1.$MINOR "$T_D0/invalid" 2>&1 | t_filter_fs
+setfattr -n scoutfs.hide.indx.test.$MAJOR.-1 "$T_D0/invalid" 2>&1 | t_filter_fs
+setfattr -n scoutfs.hide.indx.test.18446744073709551616.$MINOR "$T_D0/invalid" 2>&1 | t_filter_fs
+setfattr -n scoutfs.hide.indx.$(printf 'x%.0s' $(seq 1 240)).$MAJOR.$MINOR "$T_D0/invalid" 2>&1 | t_filter_fs
+rm -f "$T_D0/invalid"
+
+echo "== testing boundary values"
+touch "$T_D0/boundary"
+INO=$(stat -c "%i" "$T_D0/boundary")
+setfattr -n scoutfs.hide.indx.test.0.0 "$T_D0/boundary"
+read_xattr_index 0 0.0.0 0.0.-1 | awk '($3 == "'$INO'") {print "0.0 found"}'
+setfattr -x scoutfs.hide.indx.test.0.0 "$T_D0/boundary"
+setfattr -n scoutfs.hide.indx.test.255.18446744073709551615 "$T_D0/boundary"
+read_xattr_index 0 255.0.0 255.-1.-1 | awk '($3 == "'$INO'") {print "255.max found"}'
+setfattr -x scoutfs.hide.indx.test.255.18446744073709551615 "$T_D0/boundary"
+rm -f "$T_D0/boundary"
+
+echo "== indx xattr must have no value"
+touch "$T_D0/noval"
+setfattr -n scoutfs.hide.indx.test.$MAJOR.$MINOR -v "" "$T_D0/noval" 2>&1 | t_filter_fs
+setfattr -n scoutfs.hide.indx.test.$MAJOR.$MINOR -v 0 "$T_D0/noval" 2>&1 | t_filter_fs
+setfattr -n scoutfs.hide.indx.test.$MAJOR.$MINOR -v 1 "$T_D0/noval" 2>&1 | t_filter_fs
+rm -f "$T_D0/noval"
+
+echo "== set indx xattr and verify index entry"
+touch "$T_D0/file"
+INO=$(stat -c "%i" "$T_D0/file")
+setfattr -n scoutfs.hide.indx.test.$MAJOR.$MINOR "$T_D0/file"
+read_xattr_index 0 $MAJOR.0.0 $MAJOR.-1.-1 | awk '($3 == "'$INO'") {print "found"}'
+
+echo "== setting same indx xattr again is a no-op"
+setfattr -n scoutfs.hide.indx.test.$MAJOR.$MINOR "$T_D0/file"
+read_xattr_index 0 $MAJOR.0.0 $MAJOR.-1.-1 | awk '($3 == "'$INO'") {print "found"}'
+
+echo "== removing non-existent indx xattr succeeds"
+setfattr -x scoutfs.hide.indx.nonexistent.$MAJOR.999 "$T_D0/file" 2>&1 | t_filter_fs
+read_xattr_index 0 $MAJOR.0.0 $MAJOR.-1.-1 | awk '($3 == "'$INO'") {print "still found"}'
+
+echo "== explicit xattr removal cleans up index entry"
+setfattr -x scoutfs.hide.indx.test.$MAJOR.$MINOR "$T_D0/file"
+read_xattr_index 0 $MAJOR.0.0 $MAJOR.-1.-1 | awk '($3 == "'$INO'") {print "found orphan"}'
+rm -f "$T_D0/file"
+
+echo "== file deletion cleans up index entry"
+touch "$T_D0/file2"
+INO=$(stat -c "%i" "$T_D0/file2")
+setfattr -n scoutfs.hide.indx.test.$MAJOR.$MINOR "$T_D0/file2"
+read_xattr_index 0 $MAJOR.0.0 $MAJOR.-1.-1 | awk '($3 == "'$INO'") {print "found before delete"}'
+rm -f "$T_D0/file2"
+read_xattr_index 0 $MAJOR.0.0 $MAJOR.-1.-1 | awk '($3 == "'$INO'") {print "found orphan after delete"}'
+
+echo "== multiple indx xattrs on one file cleaned up by deletion"
+touch "$T_D0/file3"
+INO=$(stat -c "%i" "$T_D0/file3")
+setfattr -n scoutfs.hide.indx.a.$MAJOR.200 "$T_D0/file3"
+setfattr -n scoutfs.hide.indx.b.$MAJOR.300 "$T_D0/file3"
+BEFORE=$(read_xattr_index 0 $MAJOR.0.0 $MAJOR.-1.-1 | awk '($3 == "'$INO'")' | wc -l)
+echo "entries before delete: $BEFORE"
+rm -f "$T_D0/file3"
+AFTER=$(read_xattr_index 0 $MAJOR.0.0 $MAJOR.-1.-1 | awk '($3 == "'$INO'")' | wc -l)
+echo "entries after delete: $AFTER"
+
+echo "== partial removal leaves other entries"
+touch "$T_D0/partial"
+INO=$(stat -c "%i" "$T_D0/partial")
+setfattr -n scoutfs.hide.indx.a.$MAJOR.200 "$T_D0/partial"
+setfattr -n scoutfs.hide.indx.b.$MAJOR.300 "$T_D0/partial"
+setfattr -x scoutfs.hide.indx.a.$MAJOR.200 "$T_D0/partial"
+read_xattr_index 0 $MAJOR.200.0 $MAJOR.200.-1 | awk '($3 == "'$INO'") {print "200 found"}'
+read_xattr_index 0 $MAJOR.300.0 $MAJOR.300.-1 | awk '($3 == "'$INO'") {print "300 found"}'
+rm -f "$T_D0/partial"
+
+echo "== multiple files at same index position"
+touch "$T_D0/multi_a" "$T_D0/multi_b"
+INO_A=$(stat -c "%i" "$T_D0/multi_a")
+INO_B=$(stat -c "%i" "$T_D0/multi_b")
+setfattr -n scoutfs.hide.indx.test.$MAJOR.$MINOR "$T_D0/multi_a"
+setfattr -n scoutfs.hide.indx.test.$MAJOR.$MINOR "$T_D0/multi_b"
+COUNT=$(read_xattr_index 0 $MAJOR.$MINOR.0 $MAJOR.$MINOR.-1 | wc -l)
+echo "files at same position: $COUNT"
+rm -f "$T_D0/multi_a"
+read_xattr_index 0 $MAJOR.$MINOR.0 $MAJOR.$MINOR.-1 | awk '($3 == "'$INO_A'") {print "deleted file still found"}'
+read_xattr_index 0 $MAJOR.$MINOR.0 $MAJOR.$MINOR.-1 | awk '($3 == "'$INO_B'") {print "surviving file found"}'
+rm -f "$T_D0/multi_b"
+
+echo "== cross-mount visibility"
+touch "$T_D0/file4"
+INO=$(stat -c "%i" "$T_D0/file4")
+setfattr -n scoutfs.hide.indx.test.$MAJOR.$MINOR "$T_D0/file4"
+read_xattr_index 1 $MAJOR.0.0 $MAJOR.-1.-1 | awk '($3 == "'$INO'") {print "found on mount 1"}'
+rm -f "$T_D0/file4"
+read_xattr_index 1 $MAJOR.0.0 $MAJOR.-1.-1 | awk '($3 == "'$INO'") {print "found orphan on mount 1"}'
+
+echo "== duplicate position deduplication"
+touch "$T_D0/file5"
+INO=$(stat -c "%i" "$T_D0/file5")
+setfattr -n scoutfs.hide.indx.aa.$MAJOR.$MINOR "$T_D0/file5"
+setfattr -n scoutfs.hide.indx.bb.$MAJOR.$MINOR "$T_D0/file5"
+COUNT=$(read_xattr_index 0 $MAJOR.0.0 $MAJOR.-1.-1 | awk '($3 == "'$INO'")' | wc -l)
+echo "entries for same position: $COUNT"
+rm -f "$T_D0/file5"
+
+t_pass
--- a/tests/tests/totl-merge-read.sh
+++ b/tests/tests/totl-merge-read.sh
@@ -0,0 +1,50 @@
+#
+# Test that merge_read_item() correctly updates the sequence number when
+# combining delta items from multiple finalized log trees.  Each mount
+# sets a totl value in its own 3-bit lane (powers of 8) so that any
+# double-counting overflows the lane and is caught by: or(v, exp) != exp.
+#
+
+t_require_commands setfattr scoutfs
+t_require_mounts 5
+
+echo "== setup"
+for nr in $(t_fs_nrs); do
+	d=$(eval echo \$T_D$nr)
+	for i in $(seq 1 2500); do : > "$d/f$nr$i"; done
+done
+sync
+t_force_log_merge
+
+vals=(1 8 64 512 4096)
+expected=4681
+n=0
+for nr in $(t_fs_nrs); do
+	d=$(eval echo \$T_D$nr)
+	v=${vals[$((n++))]}
+	for i in $(seq 1 2500); do
+		setfattr -n "scoutfs.totl.t.$i.0.0" -v $v "$d/f$nr$i"
+	done
+done
+
+t_trigger_arm_silent log_merge_force_partial $(t_server_nr)
+
+bad="$T_TMPDIR/bad"
+for nr in $(t_fs_nrs); do
+	( while true; do
+		echo 1 > "$(t_debugfs_path $nr)/drop_weak_item_cache"
+		scoutfs read-xattr-totals -p "$(eval echo \$T_M$nr)" | \
+			awk -F'[ =,]+' -v e=$expected 'or($2+0,e) != e'
+	done ) >> "$bad" &
+done
+
+echo "expected $expected"
+t_force_log_merge
+t_silent_kill $(jobs -p)
+test -s "$bad" && echo "double-counted:" && cat "$bad"
+
+echo "== cleanup"
+for nr in $(t_fs_nrs); do
+	find "$(eval echo \$T_D$nr)" -name "f$nr*" -delete
+done
+t_pass
Author	SHA1	Message	Date
Zach Brown	0c3085c6e8	Remove unused orig keys from forest read items These orig copies of the start and end keys serve no purpose. I think they were an editing mistake left over from a version where retries could happen within the read_items call. Signed-off-by: Zach Brown <zab@versity.com>	2026-04-27 10:15:19 -07:00
Zach Brown	97f2d1ce8d	Make forest_read_items bloom key optional Allow callers of scoutfs_forest_read_items() to not provide a bloom key if they don't want the bloom filter blocks checked. Signed-off-by: Zach Brown <zab@versity.com>	2026-04-27 10:15:19 -07:00
Zach Brown	bd14a369e3	Let btree read items callback modify key Let the callback for scoutfs_btree_read_items() specify a new key to iterate from in the block rather than always iterating over all the items. The callback returns a specific error to trigger this behavior and none of the current callers can return this error value. Signed-off-by: Zach Brown <zab@versity.com>	2026-04-27 10:15:19 -07:00
Zach Brown	448dd74663	Add test example for watching changing inodes Signed-off-by: Zach Brown <zab@versity.com>	2026-04-27 10:15:19 -07:00
Zach Brown	57785066c0	Add raw_read_inode_info ioctl Add an ioctl for reading inode metadata (inode struct and xattrs) in bulk and without cluster locking. Signed-off-by: Zach Brown <zab@versity.com>	2026-04-27 10:15:19 -07:00
Zach Brown	b23022444a	Add scoutfs_lock_get_fs_item_range() Add another lock call for getting lock ranges, this time for fs items. Signed-off-by: Zach Brown <zab@versity.com>	2026-04-27 10:15:19 -07:00
Zach Brown	64931c395d	Add bsearch_index() Add a wrapper around bsearch() that returns an index into the array that the key would occupy rather than only being able to return pointers to array members that match the key. Signed-off-by: Zach Brown <zab@versity.com>	2026-04-27 10:15:19 -07:00
Zach Brown	380442515e	Add scoutfs_alloc declaration to block.h block.h has an undeclared use of struct scoutfs_alloc that was relying on previous headers. Add a declaraion of the struct (that we don't dereference) so the header can be used on its own. Signed-off-by: Zach Brown <zab@versity.com>	2026-04-27 10:15:19 -07:00
Zach Brown	750e998e40	Add raw_read_meta_seq ioctl Add an ioctl for reading the meta_seq index without cluster locking. Signed-off-by: Zach Brown <zab@versity.com>	2026-04-27 10:15:19 -07:00
Zach Brown	747a8bc53d	Remove scoutfs_ioctl_key We've long since removed ioctls that exposed keys directly. Signed-off-by: Zach Brown <zab@versity.com>	2026-04-27 09:58:19 -07:00
Zach Brown	77327ae713	Export xattr key init and hash generation This is going to be used by a bulk metadata gathering operation. Signed-off-by: Zach Brown <zab@versity.com>	2026-04-27 09:58:19 -07:00
Zach Brown	af31b9f1e8	Merge pull request #306 from versity/zab/v1.30 v1.30 Release	2026-04-22 10:43:17 -07:00
Zach Brown	ad65116d8f	v1.30 Release Finish the release notes for the 1.30 release. Signed-off-by: Zach Brown <zab@versity.com>	2026-04-21 16:43:12 -07:00
Zach Brown	e20765a9c7	Merge pull request #300 from versity/auke/more_false_positive_failures Auke/more false positive failures: xfs lockdep miss, newline	2026-04-17 09:17:50 -07:00
Zach Brown	066da5c2a2	Merge pull request #297 from versity/auke/quota_mod_trans_hold Hold transaction in scoutfs_quota_mod_rule to prevent alloc corruption.	2026-04-17 09:16:41 -07:00
Auke Kok	7eacc7139c	Hold transaction in scoutfs_quota_mod_rule to prevent alloc corruption. scoutfs_quota_mod_rule calls scoutfs_item_create/delete which use the transaction allocator but it never held it. Without the hold, a concurrent transaction commit can call scoutfs_alloc_init to reinitialize the allocator while dirty_alloc_blocks is in the middle of setting up the freed list block. This overwrites alloc->freed with the server's fresh (empty) state, causing a blkno mismatch BUG_ON in list_block_add. Reproduced by stressing concurrent quota add/del operations across mounts. Crashdump analysis confirms dirty_list_block COW'd a freed block (fr_old=9842, new blkno=9852) but by the time list_block_add ran, freed.ref.blkno was 0 with first_nr=0 and total_nr=0: the freed list head had been zeroed by a concurrent alloc_init. Fix by adding scoutfs_hold_trans/scoutfs_release_trans around the item modification in scoutfs_quota_mod_rule, preventing transaction commit from racing with the allocator use. Rename the 'unlock' label to 'release' since 'out' now directly does the unlock. The unlock safely handles a NULL lock. Signed-off-by: Auke Kok <auke.kok@versity.com>	2026-04-16 16:20:47 -07:00
Auke Kok	9e3b01b3b4	Filter newlines out dmesg.new Without overly broad filtering empty lines from dmesg, filter them so dmesg.new doesn't trigger a test failure. I don't want to overly process dmesg, so do this as late as possible. The xfs lockdep patterns can forget a leading/trailing empty line, causing a failure despite the explicit removal of the lockdep false positive. Signed-off-by: Auke Kok <auke.kok@versity.com>	2026-04-15 10:36:28 -07:00
Auke Kok	876c233f06	Ignore another xfs lockdep class This already caught xfs_nondir_ilock_class, but recent CI runs have been hitting xfs_dir_ilock_class, too. Signed-off-by: Auke Kok <auke.kok@versity.com>	2026-04-15 10:36:28 -07:00
Zach Brown	6aa5876c71	Merge pull request #301 from versity/auke/el7_uninit_read_seq Squelch gcc uninitialized warning on el7	2026-04-15 09:58:23 -07:00
Auke Kok	7a9f9ec698	Squelch gcc uninitialized warning on el7 The gcc version in el7 can't determine that scoutfs_block_check_stale won't return ret = 0 when the input ret value is < 0, and errors because we might call alloc_wpage with an uninitialized read_seq. Initialize it to 0 to avoid it. Signed-off-by: Auke Kok <auke.kok@versity.com>	2026-04-14 15:09:20 -04:00
Zach Brown	fc0fc1427f	Merge pull request #296 from versity/auke/indx_key_delete Fix indx delete using wrong xid, leaving orphans. && Add basic-xattr-indx tests.	2026-04-13 14:34:37 -07:00
Zach Brown	ec68845201	Merge pull request #289 from versity/auke/merge_read_item_stale_seq Update seq when merging deltas from partial log merge.	2026-04-13 14:10:37 -07:00
Auke Kok	5e2009f939	Avoid double counting deltas from non-input finalized log trees. Readers currently accumulate all finalized log tree deltas into a single bucket for deciding whether they are already in fs_root or not, but, finalized trees that aren't inputs to a current merge will have higher seqs, and thus we may be double applying deltas already merged into fs_root. To distinguish, scoutfs_totl_merge_contribute() needs to know the merge status item seq. We change wkic's get_roots() from using the SCOUTFS_NET_CMD_GET_ROOTS RPC to reading the superblock directly. This is needed because totl merge resolution has to use the same data as the btree roots it is operating on, thus we can't grab it from a SCOUTFS_NET_CMD_GET_ROOTS packet - it likely is different. Signed-off-by: Auke Kok <auke.kok@versity.com>	2026-04-10 13:50:21 -07:00
Auke Kok	8bdc20af21	Rename/reword FINALIZED to MERGE_INPUT. These mislabeled members and enums were clearly not describing the actual data being handled and obfuscating the intent of avoiding mixing merge input items with non-merge input items. Signed-off-by: Auke Kok <auke.kok@versity.com>	2026-04-10 13:50:21 -07:00
Auke Kok	857a39579e	Clear roots when retrying due to stale btree blocks. Before deltas were added this code path was correct, but with deltas we can't just retry this without clearing &root, since it would potentially double count. The condition where this could happen is when there are deltas in several finalized log trees, and we've made progress towards reading some of them, and then encounter a stale btree block. The retry would not clear the collected trees, apply the same delta as was already applied before the retry, and thus double count. Signed-off-by: Auke Kok <auke.kok@versity.com>	2026-04-10 13:50:21 -07:00
Auke Kok	38d36c9f5c	Update seq when merging deltas from partial log merge. Two different clients can write delta's for totl indexes at the same time, recording their changes. When merged, a reader should apply both in order, and only once. To do so, the seq determines whether the delta has been applied already. The code fails to update the seq while walking the trees for deltas to apply. Subsequently, when processing subsequent trees, it could re-process deltas already applied. In case of a large negative delta (e.g. removal of large amounts of files), the totl value could become negative, resulting in quota lockout. The fix is simple: advance the seq when reading partial delta merges to avoid double counting. Signed-off-by: Auke Kok <auke.kok@versity.com>	2026-04-10 13:50:21 -07:00
Auke Kok	b724567b2a	Add log_merge_force_partial trigger for testing partial merges. Add a trigger that forces btree_merge() to return -ERANGE after modifying a leaf's worth of items, causing many small partial merges per merge cycle. This is used by tests to reliably reproduce races that depend on partial merges splicing items into fs_root while finalized logs still exist. The trigger check lives inside btree_merge() where it can observe actual item modification progress, rather than overriding the caller's dirty byte limit argument which applies to the whole writer context. Signed-off-by: Auke Kok <auke.kok@versity.com>	2026-04-10 12:25:30 -07:00
Auke Kok	add1da10dc	Add test for stale seq in merge delta combining. merge_read_item() fails to update found->seq when combining delta items from multiple finalized log trees. Add a test case to replicate the conditions of this issue. Each of 5 mounts sets totl value 1 on 2500 shared keys, giving an expected total of 5 per key. Any total > 5 proves double-counting from a stale seq. The log_merge_force_partial trigger forces many partial merges per cycle, creating the conditions where stale-seq items get spliced into fs_root while finalized logs still exist. Parallel readers on all mounts race against this window to detect double-counted values. Signed-off-by: Auke Kok <auke.kok@versity.com>	2026-04-10 12:25:30 -07:00
Auke Kok	b9c49629a2	Add basic-xattr-indx tests. We had no basic testing for `scoutfs read-xattr-index` whatsoever. This adds your basic negative argument tests, lifecycle tests, the deduplicated reads, and partial removal. This exposes a bug in deletion where the indx entry isn't cleaned up on inode delete. Signed-off-by: Auke Kok <auke.kok@versity.com>	2026-04-08 13:45:56 -07:00
Auke Kok	9737009437	Fix indx delete using wrong xid, leaving orphans. During inode deletion, scoutfs_xattr_drop forgot to set the xid of the xattr after calling parse_indx_key, which hardcodes xid=0, and it is the callers' responsibility. delete_force then deletes the wrong key, and returns no errors on nonexistant keys. So now there is a pending deletion for a non-existant indx and an orphan indx entry in the tree. Subsequent calls to `scoutfs read-xattr-index` will thus return entries for deleted inodes. Signed-off-by: Auke Kok <auke.kok@versity.com>	2026-04-08 11:48:47 -07:00
Zach Brown	3d54ae03e6	Merge pull request #295 from versity/auke/xfs_lockdep_ignore Avoid xfs lockdep false positive dmesg errors.	2026-04-03 09:46:44 -07:00
Auke Kok	e27ec0add6	Avoid xfs lockdep false positive dmesg errors. This xfs lockdep stack trace has at least 2 variants around fs_reclaim, so try and capture it not too precisely here. We can remove "lockdep disabled" in the $re grep -v, because it can affect both this and the kasan one. Signed-off-by: Auke Kok <auke.kok@versity.com>	2026-04-01 14:25:48 -07:00