WIP

2026-06-09 21:22:36 +00:00 · 2024-10-28 15:50:47 -07:00 · 2024-10-28 15:35:10 -07:00 · 2024-10-28 14:34:30 -07:00 · 2024-10-28 14:21:08 -07:00 · 2024-10-25 14:45:52 -07:00
127 changed files with 11237 additions and 303 deletions
@@ -13,8 +13,7 @@

 %if 0%{?el7}
 %global kernel_source() /usr/src/kernels/%{kernel_version}.$(arch)
-%endif
-%if 0%{?el8}
+%else
 %global kernel_source() /usr/src/kernels/%{kernel_version}
 %endif

@@ -22,8 +21,7 @@

 %if 0%{?el7}
 Name:           %{kmod_name}
-%endif
-%if 0%{?el8}
+%else
 Name:           kmod-%{kmod_name}
 %endif
 Summary:        %{kmod_name} kernel module
@@ -35,8 +33,7 @@ URL:            http://scoutfs.org/

 %if 0%{?el7}
 BuildRequires:  %{kernel_module_package_buildreqs}
-%endif
-%if 0%{?el8}
+%else
 BuildRequires:  elfutils-libelf-devel
 %endif
 BuildRequires:  kernel-devel-uname-r = %{kernel_version}
@@ -54,7 +51,8 @@ Source:		%{kmod_name}-kmod-%{kmod_version}.tar
 %endif

 %global install_mod_dir extra/%{kmod_name}
-%if 0%{?el8}
+
+%if ! 0%{?el7}
 %global flavors_to_build x86_64
 %endif

@@ -93,7 +91,7 @@ done
 # mark modules executable so that strip-to-file can strip them
 find %{buildroot} -type f -name \*.ko -exec %{__chmod} u+x \{\} \;

-%if 0%{?el8}
+%if ! 0%{?el7}
 %files
 /lib/modules

@@ -111,8 +109,5 @@ SCOUTFS_RPM_NAME=$(rpm -q %{name} | grep "%{version}-%{release}")
 rpm -ql $SCOUTFS_RPM_NAME | grep '\.ko$' > /var/run/%{name}-modules-%{version}-%{release} || true

 %postun
-if [ -x /sbin/weak-modules ]; then
-    cat /var/run/%{name}-modules-%{version}-%{release} | /sbin/weak-modules --remove-modules --no-initramfs
-fi
-
+cat /var/run/%{name}-modules-%{version}-%{release} | weak-modules --remove-modules --no-initramfs
 rm /var/run/%{name}-modules-%{version}-%{release} || true
@@ -78,8 +78,9 @@ endif
 # v4.8-rc1-29-g31051c85b5e2
 #
 # inode_change_ok() removed - replace with setattr_prepare()
+# v5.11-rc4-7-g2f221d6f7b88 removes extern attribute
 #
-ifneq (,$(shell grep 'extern int setattr_prepare' include/linux/fs.h))
+ifneq (,$(shell grep 'int setattr_prepare' include/linux/fs.h))
 ccflags-y += -DKC_SETATTR_PREPARE
 endif

@@ -258,3 +259,157 @@ endif
 ifneq (,$(shell grep 'static inline const char .xattr_prefix' include/linux/xattr.h))
 ccflags-y += -DKC_XATTR_HANDLER_NAME=1
 endif
+
+#
+# v5.19-rc4-96-g342a72a33407
+#
+# Adds `typedef __u32 __bitwise blk_opf_t` to aid flag checking
+ifneq (,$(shell grep 'typedef __u32 __bitwise blk_opf_t' include/linux/blk_types.h))
+ccflags-y += -DKC_HAVE_BLK_OPF_T=1
+endif
+
+#
+# v5.12-rc6-9-g4f0f586bf0c8
+#
+# list_sort cmp function takes const list_head args
+ifneq (,$(shell grep 'const struct list_head ., const struct list_head .' include/linux/list_sort.h))
+ccflags-y += -DKC_LIST_CMP_CONST_ARG_LIST_HEAD
+endif
+
+# v5.7-523-g88dca4ca5a93
+#
+# The pgprot argument to vmalloc is always PAGE_KERNEL, so it is removed.
+ifneq (,$(shell grep 'extern void .__vmalloc.unsigned long size, gfp_t gfp_mask, pgprot_t prot' include/linux/vmalloc.h))
+ccflags-y += -DKC_VMALLOC_PGPROT_T
+endif
+
+# v6.2-rc1-18-g01beba7957a2
+#
+# fs: port inode_owner_or_capable() to mnt_idmap
+ifneq (,$(shell grep 'bool inode_owner_or_capable.struct user_namespace .mnt_userns' include/linux/fs.h))
+ccflags-y += -DKC_INODE_OWNER_OR_CAPABLE_USERNS
+endif
+
+#
+# v5.11-rc4-5-g47291baa8ddf
+#
+# namei: make permission helpers idmapped mount aware
+ifneq (,$(shell grep 'int inode_permission.struct user_namespace' include/linux/fs.h))
+ccflags-y += -DKC_INODE_PERMISSION_USERNS
+endif
+
+#
+# v5.11-rc4-24-g549c7297717c
+#
+# fs: make helpers idmap mount aware
+# Enlarges the VFS API methods to include user namespace argument.
+ifneq (,$(shell grep 'int ..mknod. .struct user_namespace' include/linux/fs.h))
+ccflags-y += -DKC_VFS_METHOD_USER_NAMESPACE_ARG
+endif
+
+#
+# v5.17-rc2-21-g07888c665b40
+#
+# Detect new style bio_alloc - pass bdev and opf.
+ifneq (,$(shell grep 'struct bio .bio_alloc.struct block_device .bdev' include/linux/bio.h))
+ccflags-y += -DKC_BIO_ALLOC_DEV_OPF_ARGS
+endif
+
+#
+# v5.7-rc4-53-gcddf8a2c4a82
+#
+# fiemap_prep() replaces fiemap_check_flags()
+ifneq (,$(shell grep -s 'int fiemap_prep.struct inode' include/linux/fiemap.h))
+ccflags-y += -DKC_FIEMAP_PREP
+endif
+
+#
+# v5.17-13043-g800ba29547e1
+#
+# generic_perform_write args use kiocb for passing filp and pos
+ifneq (,$(shell grep 'ssize_t generic_perform_write.struct kiocb ., struct iov_iter' include/linux/fs.h))
+ccflags-y += -DKC_GENERIC_PERFORM_WRITE_KIOCB_IOV_ITER
+endif
+
+#
+# v5.7-rc6-2496-g76ee0785f42a
+#
+# net: add sock_set_sndtimeo
+ifneq (,$(shell grep 'void sock_set_sndtimeo.struct sock' include/net/sock.h))
+ccflags-y += -DKC_SOCK_SET_SNDTIMEO
+endif
+
+#
+# v5.8-rc4-1931-gba423fdaa589
+#
+# setsockopt functions are now passed a sockptr_t value instead of char*
+ifneq (,$(shell grep -s 'include .linux/sockptr.h.' include/linux/net.h))
+ccflags-y += -DKC_SETSOCKOPT_SOCKPTR_T
+endif
+
+#
+# v5.7-rc6-2507-g71c48eb81c9e
+#
+# Adds a bunch of low level TCP sock parameter functions that we want to use.
+ifneq (,$(shell grep 'int tcp_sock_set_keepintvl' include/linux/tcp.h))
+ccflags-y += -DKC_HAVE_TCP_SET_SOCKFN
+endif
+
+#
+# v4.16-rc3-13-ga84d1169164b
+#
+# Fixes y2038 issues with struct timeval.
+ifneq (,$(shell grep -s '^struct __kernel_old_timeval .' include/uapi/linux/time_types.h))
+ccflags-y += -DKC_KERNEL_OLD_TIMEVAL_STRUCT
+endif
+
+#
+# v5.19-rc4-52-ge33c267ab70d
+#
+# register_shrinker now requires a name, used for debug stats etc.
+ifneq (,$(shell grep 'int __printf.*register_shrinker.struct shrinker .shrinker,' include/linux/shrinker.h))
+ccflags-y += -DKC_SHRINKER_NAME
+endif
+
+#
+# v5.18-rc5-246-gf132ab7d3ab0
+#
+# mpage_readpage() is now replaced with mpage_read_folio.
+ifneq (,$(shell grep 'int mpage_read_folio.struct folio .folio' include/linux/mpage.h))
+ccflags-y += -DKC_MPAGE_READ_FOLIO
+endif
+
+#
+# v5.18-rc5-219-gb3992d1e2ebc
+#
+# block_write_begin() is no longer passed aop_flags
+ifneq (,$(shell grep -C1 'int block_write_begin' include/linux/buffer_head.h | tail -n 2 | grep 'unsigned flags'))
+ccflags-y += -DKC_BLOCK_WRITE_BEGIN_AOP_FLAGS
+endif
+
+#
+# v6.0-rc6-9-g863f144f12ad
+#
+# The .tmpfile() vfs method calling convention changed and now a struct
+# file* is passed to this method instead of a dentry. The function should
+# also open the created file and call finish_open_simple() before returning.
+ifneq (,$(shell grep 'extern void d_tmpfile.struct dentry' include/linux/dcache.h))
+ccflags-y += -DKC_D_TMPFILE_DENTRY
+endif
+
+#
+# v6.4-rc2-201-g0733ad800291
+#
+# New blk_mode_t replaces abuse of fmode_t
+ifneq (,$(shell grep 'typedef unsigned int __bitwise blk_mode_t' include/linux/blkdev.h))
+ccflags-y += -DKC_HAVE_BLK_MODE_T
+endif
+
+#
+# v6.4-rc2-186-g2736e8eeb0cc
+#
+# Reworks FMODE_EXCL kludge and instead modifies the blkdev_put() call to pass in
+# the (exclusive) holder to implement FMODE_EXCL handling.
+ifneq (,$(shell grep 'blkdev_put.struct block_device .bdev, void .holder' include/linux/blkdev.h))
+ccflags-y += -DKC_BLKDEV_PUT_HOLDER_ARG
+endif
@@ -98,11 +98,9 @@ struct posix_acl *scoutfs_get_acl_locked(struct inode *inode, int type, struct s
 		acl = ERR_PTR(ret);
 	}

-#ifndef KC___POSIX_ACL_CREATE
 	/* can set null negative cache */
 	if (!IS_ERR(acl))
 		set_cached_acl(inode, type, acl);
-#endif

 	kfree(value);

@@ -155,7 +153,8 @@ int scoutfs_set_acl_locked(struct inode *inode, struct posix_acl *acl, int type,
 	switch (type) {
 	case ACL_TYPE_ACCESS:
 		if (acl) {
-			ret = posix_acl_update_mode(inode, &new_mode, &acl);
+			ret = posix_acl_update_mode(KC_VFS_INIT_NS
+						    inode, &new_mode, &acl);
 			if (ret < 0)
 				goto out;
 			set_mode = true;
@@ -194,10 +193,8 @@ int scoutfs_set_acl_locked(struct inode *inode, struct posix_acl *acl, int type,
 	}

 out:
-#ifndef KC___POSIX_ACL_CREATE
 	if (!ret)
 		set_cached_acl(inode, type, acl);
-#endif

 	kfree(value);

@@ -256,7 +253,9 @@ int scoutfs_acl_get_xattr(struct dentry *dentry, const char *name, void *value,
 }

 #ifdef KC_XATTR_STRUCT_XATTR_HANDLER
-int scoutfs_acl_set_xattr(const struct xattr_handler *handler, struct dentry *dentry,
+int scoutfs_acl_set_xattr(const struct xattr_handler *handler,
+			  KC_VFS_NS_DEF
+			  struct dentry *dentry,
 			  struct inode *inode, const char *name, const void *value,
 			  size_t size, int flags)
 {
@@ -269,7 +268,7 @@ int scoutfs_acl_set_xattr(struct dentry *dentry, const char *name, const void *v
 	struct posix_acl *acl = NULL;
 	int ret;

-	if (!inode_owner_or_capable(dentry->d_inode))
+	if (!inode_owner_or_capable(KC_VFS_INIT_NS dentry->d_inode))
 		return -EPERM;

 	if (!IS_POSIXACL(dentry->d_inode))
@@ -10,7 +10,9 @@ int scoutfs_set_acl_locked(struct inode *inode, struct posix_acl *acl, int type,
 int scoutfs_acl_get_xattr(const struct xattr_handler *, struct dentry *dentry,
 			  struct inode *inode, const char *name, void *value,
 			  size_t size);
-int scoutfs_acl_set_xattr(const struct xattr_handler *, struct dentry *dentry,
+int scoutfs_acl_set_xattr(const struct xattr_handler *,
+			  KC_VFS_NS_DEF
+			  struct dentry *dentry,
 			  struct inode *inode, const char *name, const void *value,
 			  size_t size, int flags);
 #else
@@ -14,6 +14,7 @@
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/slab.h>
+#include <linux/blkdev.h>
 #include <linux/sort.h>
 #include <linux/random.h>

@@ -120,8 +120,7 @@ do {												\

 static __le32 block_calc_crc(struct scoutfs_block_header *hdr, u32 size)
 {
-	int off = offsetof(struct scoutfs_block_header, crc) +
-		  FIELD_SIZEOF(struct scoutfs_block_header, crc);
+	int off = offsetofend(struct scoutfs_block_header, crc);
 	u32 calc = crc32c(~0, (char *)hdr + off, size - off);

 	return cpu_to_le32(calc);
@@ -159,7 +158,7 @@ static struct block_private *block_alloc(struct super_block *sb, u64 blkno)
 		 */
 		lockdep_off();
 		nofs_flags = memalloc_nofs_save();
-		bp->virt = __vmalloc(SCOUTFS_BLOCK_LG_SIZE, GFP_NOFS | __GFP_HIGHMEM, PAGE_KERNEL);
+		bp->virt = kc__vmalloc(SCOUTFS_BLOCK_LG_SIZE, GFP_NOFS | __GFP_HIGHMEM);
 		memalloc_nofs_restore(nofs_flags);
 		lockdep_on();

@@ -438,7 +437,7 @@ static void block_remove_all(struct super_block *sb)
 * possible.  Final freeing, verifying checksums, and unlinking errored
 * blocks are all done by future users of the blocks.
 */
-static void block_end_io(struct super_block *sb, unsigned int opf,
+static void block_end_io(struct super_block *sb, blk_opf_t opf,
 			 struct block_private *bp, int err)
 {
 	DECLARE_BLOCK_INFO(sb, binf);
@@ -478,7 +477,7 @@ static void KC_DECLARE_BIO_END_IO(block_bio_end_io, struct bio *bio)
 * Kick off IO for a single block.
 */
 static int block_submit_bio(struct super_block *sb, struct block_private *bp,
-			    unsigned int opf)
+			    blk_opf_t opf)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct bio *bio = NULL;
@@ -505,15 +504,13 @@ static int block_submit_bio(struct super_block *sb, struct block_private *bp,

 	for (off = 0; off < SCOUTFS_BLOCK_LG_SIZE; off += PAGE_SIZE) {
 		if (!bio) {
-			bio = bio_alloc(GFP_NOFS, SCOUTFS_BLOCK_LG_PAGES_PER);
+			bio = kc_bio_alloc(sbi->meta_bdev, SCOUTFS_BLOCK_LG_PAGES_PER, opf, GFP_NOFS);
 			if (!bio) {
 				ret = -ENOMEM;
 				break;
 			}

-			kc_bio_set_opf(bio, opf);
 			kc_bio_set_sector(bio, sector + (off >> 9));
-			bio_set_dev(bio, sbi->meta_bdev);
 			bio->bi_end_io = block_bio_end_io;
 			bio->bi_private = bp;

@@ -1201,7 +1198,7 @@ static void KC_DECLARE_BIO_END_IO(sm_block_bio_end_io, struct bio *bio)
 * only layer that sees the full block buffer so we pass the calculated
 * crc to the caller for them to check in their context.
 */
-static int sm_block_io(struct super_block *sb, struct block_device *bdev, unsigned int opf,
+static int sm_block_io(struct super_block *sb, struct block_device *bdev, blk_opf_t opf,
 		       u64 blkno, struct scoutfs_block_header *hdr, size_t len, __le32 *blk_crc)
 {
 	struct scoutfs_block_header *pg_hdr;
@@ -1233,15 +1230,13 @@ static int sm_block_io(struct super_block *sb, struct block_device *bdev, unsign
 		pg_hdr->crc = block_calc_crc(pg_hdr, SCOUTFS_BLOCK_SM_SIZE);
 	}

-	bio = bio_alloc(GFP_NOFS, 1);
+	bio = kc_bio_alloc(bdev, 1, opf, GFP_NOFS);
 	if (!bio) {
 		ret = -ENOMEM;
 		goto out;
 	}

-	kc_bio_set_opf(bio, opf | REQ_SYNC);
 	kc_bio_set_sector(bio, blkno << (SCOUTFS_BLOCK_SM_SHIFT - 9));
-	bio_set_dev(bio, bdev);
 	bio->bi_end_io = sm_block_bio_end_io;
 	bio->bi_private = &sbc;
 	bio_add_page(bio, page, SCOUTFS_BLOCK_SM_SIZE, 0);
@@ -1302,7 +1297,7 @@ int scoutfs_block_setup(struct super_block *sb)
 	init_waitqueue_head(&binf->waitq);
 	KC_INIT_SHRINKER_FUNCS(&binf->shrinker, block_count_objects,
 			       block_scan_objects);
-	KC_REGISTER_SHRINKER(&binf->shrinker);
+	KC_REGISTER_SHRINKER(&binf->shrinker, "scoutfs-block:" SCSBF, SCSB_ARGS(sb));
 	INIT_WORK(&binf->free_work, block_free_work);
 	init_llist_head(&binf->free_llist);

@@ -20,6 +20,7 @@
 #include <net/sock.h>
 #include <net/tcp.h>
 #include <asm/barrier.h>
+#include <linux/overflow.h>

 #include "format.h"
 #include "counters.h"
@@ -68,6 +69,7 @@ int scoutfs_client_alloc_inodes(struct super_block *sb, u64 count,
 	struct client_info *client = SCOUTFS_SB(sb)->client_info;
 	struct scoutfs_net_inode_alloc ial;
 	__le64 lecount = cpu_to_le64(count);
+	u64 tmp;
 	int ret;

 	ret = scoutfs_net_sync_request(sb, client->conn,
@@ -80,7 +82,7 @@ int scoutfs_client_alloc_inodes(struct super_block *sb, u64 count,

 		if (*nr == 0)
 			ret = -ENOSPC;
-		else if (*ino + *nr < *ino)
+		else if (check_add_overflow(*ino, *nr - 1, &tmp))
 			ret = -EINVAL;
 	}

@@ -162,6 +162,8 @@
 	EXPAND_COUNTER(orphan_scan_error)			\
 	EXPAND_COUNTER(orphan_scan_item)			\
 	EXPAND_COUNTER(orphan_scan_omap_set)			\
+	EXPAND_COUNTER(quota_info_count_objects)		\
+	EXPAND_COUNTER(quota_info_scan_objects)			\
 	EXPAND_COUNTER(quorum_candidate_server_stopping)	\
 	EXPAND_COUNTER(quorum_elected)				\
 	EXPAND_COUNTER(quorum_fence_error)			\
@@ -206,10 +208,12 @@
 	EXPAND_COUNTER(trans_commit_meta_alloc_low)		\
 	EXPAND_COUNTER(trans_commit_sync_fs)			\
 	EXPAND_COUNTER(trans_commit_timer)			\
-	EXPAND_COUNTER(trans_commit_written)
+	EXPAND_COUNTER(trans_commit_written)			\
+	EXPAND_COUNTER(wkic_count_objects)			\
+	EXPAND_COUNTER(wkic_scan_objects)

 #define FIRST_COUNTER	alloc_alloc_data
-#define LAST_COUNTER	trans_commit_written
+#define LAST_COUNTER	wkic_scan_objects

 #undef EXPAND_COUNTER
 #define EXPAND_COUNTER(which) struct percpu_counter which;
@@ -20,7 +20,9 @@
 #include <linux/hash.h>
 #include <linux/log2.h>
 #include <linux/falloc.h>
+#include <linux/fiemap.h>
 #include <linux/writeback.h>
+#include <linux/overflow.h>

 #include "format.h"
 #include "super.h"
@@ -679,8 +681,14 @@ int scoutfs_get_block_write(struct inode *inode, sector_t iblock, struct buffer_
 * We can return errors from locking and checking offline extents.  The
 * page is unlocked if we return an error.
 */
+#ifdef KC_MPAGE_READ_FOLIO
+static int scoutfs_read_folio(struct file *file, struct folio *folio)
+{
+	struct page *page = &folio->page;
+#else
 static int scoutfs_readpage(struct file *file, struct page *page)
 {
+#endif
 	struct inode *inode = file->f_inode;
 	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
 	struct super_block *sb = inode->i_sb;
@@ -727,7 +735,11 @@ static int scoutfs_readpage(struct file *file, struct page *page)
 			return ret;
 	}

+#ifdef KC_MPAGE_READ_FOLIO
+	ret = mpage_read_folio(folio, scoutfs_get_block_read);
+#else
 	ret = mpage_readpage(page, scoutfs_get_block_read);
+#endif

 	scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_READ);
 	scoutfs_per_task_del(&si->pt_data_lock, &pt_ent);
@@ -825,7 +837,10 @@ struct write_begin_data {

 static int scoutfs_write_begin(struct file *file,
 			       struct address_space *mapping, loff_t pos,
-			       unsigned len, unsigned flags,
+			       unsigned len,
+#ifdef KC_BLOCK_WRITE_BEGIN_AOP_FLAGS
+			       unsigned flags,
+#endif
 			       struct page **pagep, void **fsdata)
 {
 	struct inode *inode = mapping->host;
@@ -860,13 +875,18 @@ retry:
 	if (ret < 0)
 		goto out;

+#ifdef KC_BLOCK_WRITE_BEGIN_AOP_FLAGS
 	/* can't re-enter fs, have trans */
 	flags |= AOP_FLAG_NOFS;
+#endif

 	/* generic write_end updates i_size and calls dirty_inode */
 	ret = scoutfs_dirty_inode_item(inode, wbd->lock) ?:
-	      block_write_begin(mapping, pos, len, flags, pagep,
-				scoutfs_get_block_write);
+	      block_write_begin(mapping, pos, len,
+#ifdef KC_BLOCK_WRITE_BEGIN_AOP_FLAGS
+				flags,
+#endif
+				pagep, scoutfs_get_block_write);
 	if (ret < 0) {
 		scoutfs_release_trans(sb);
 		scoutfs_inode_index_unlock(sb, &wbd->ind_locks);
@@ -1068,6 +1088,7 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 	loff_t end;
 	u64 iblock;
 	u64 last;
+	loff_t tmp;
 	s64 ret;

 	/* XXX support more flags */
@@ -1076,14 +1097,14 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 		goto out;
 	}

-	/* catch wrapping */
-	if (offset + len < offset) {
-		ret = -EINVAL;
+	if (len == 0) {
+		ret = 0;
 		goto out;
 	}

-	if (len == 0) {
-		ret = 0;
+	/* catch wrapping */
+	if (check_add_overflow(offset, len - 1, &tmp)) {
+		ret = -EINVAL;
 		goto out;
 	}

@@ -1304,8 +1325,8 @@ int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
 		goto out;
 	}

-	ret = inode_permission(from, MAY_WRITE) ?:
-	      inode_permission(to, MAY_WRITE);
+	ret = inode_permission(KC_VFS_INIT_NS from, MAY_WRITE) ?:
+	      inode_permission(KC_VFS_INIT_NS to, MAY_WRITE);
 	if (ret < 0)
 		goto out;

@@ -1543,7 +1564,7 @@ int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		goto out;
 	}

-	ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
+	ret = fiemap_prep(inode, fieinfo, start, &len, FIEMAP_FLAG_SYNC);
 	if (ret)
 		goto out;

@@ -1709,12 +1730,16 @@ int scoutfs_data_wait_check(struct inode *inode, loff_t pos, loff_t len,
 	u64 last_block;
 	u64 on;
 	u64 off;
+	loff_t tmp;
 	int ret = 0;

+	if (len == 0)
+		goto out;
+
 	if (WARN_ON_ONCE(sef & SEF_UNKNOWN) ||
 	    WARN_ON_ONCE(op & SCOUTFS_IOC_DWO_UNKNOWN) ||
 	    WARN_ON_ONCE(dw && !RB_EMPTY_NODE(&dw->node)) ||
-	    WARN_ON_ONCE(pos + len < pos)) {
+	    WARN_ON_ONCE(check_add_overflow(pos, len - 1, &tmp))) {
 		ret = -EINVAL;
 		goto out;
 	}
@@ -1890,7 +1915,13 @@ int scoutfs_data_waiting(struct super_block *sb, u64 ino, u64 iblock,
 }

 const struct address_space_operations scoutfs_file_aops = {
+#ifdef KC_MPAGE_READ_FOLIO
+	.dirty_folio		= block_dirty_folio,
+	.invalidate_folio	= block_invalidate_folio,
+	.read_folio		= scoutfs_read_folio,
+#else
 	.readpage		= scoutfs_readpage,
+#endif
 #ifndef KC_FILE_AOPS_READAHEAD
 	.readpages		= scoutfs_readpages,
 #else
@@ -1911,6 +1942,8 @@ const struct file_operations scoutfs_file_fops = {
 #else
 	.read_iter	= scoutfs_file_read_iter,
 	.write_iter	= scoutfs_file_write_iter,
+	.splice_read	= generic_file_splice_read,
+	.splice_write	= iter_file_splice_write,
 #endif
 	.unlocked_ioctl	= scoutfs_ioctl,
 	.fsync		= scoutfs_file_fsync,
@@ -703,8 +703,9 @@ out_unlock:
 	return inode;
 }

-static int scoutfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
-		       dev_t rdev)
+static int scoutfs_mknod(KC_VFS_NS_DEF
+			 struct inode *dir,
+			 struct dentry *dentry, umode_t mode, dev_t rdev)
 {
 	struct super_block *sb = dir->i_sb;
 	struct inode *inode = NULL;
@@ -773,15 +774,20 @@ out:
 }

 /* XXX hmm, do something with excl? */
-static int scoutfs_create(struct inode *dir, struct dentry *dentry,
-			  umode_t mode, bool excl)
+static int scoutfs_create(KC_VFS_NS_DEF
+			  struct inode *dir,
+			  struct dentry *dentry, umode_t mode, bool excl)
 {
-	return scoutfs_mknod(dir, dentry, mode | S_IFREG, 0);
+	return scoutfs_mknod(KC_VFS_NS
+			     dir, dentry, mode | S_IFREG, 0);
 }

-static int scoutfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int scoutfs_mkdir(KC_VFS_NS_DEF
+			 struct inode *dir,
+			 struct dentry *dentry, umode_t mode)
 {
-	return scoutfs_mknod(dir, dentry, mode | S_IFDIR, 0);
+	return scoutfs_mknod(KC_VFS_NS
+			     dir, dentry, mode | S_IFDIR, 0);
 }

 static int scoutfs_link(struct dentry *old_dentry,
@@ -1176,7 +1182,8 @@ static const char *scoutfs_get_link(struct dentry *dentry, struct inode *inode,
 * Symlink target paths can be annoyingly large.  We store relatively
 * rare large paths in multiple items.
 */
-static int scoutfs_symlink(struct inode *dir, struct dentry *dentry,
+static int scoutfs_symlink(KC_VFS_NS_DEF
+			   struct inode *dir, struct dentry *dentry,
 			   const char *symname)
 {
 	struct super_block *sb = dir->i_sb;
@@ -1563,7 +1570,8 @@ static int verify_ancestors(struct super_block *sb, u64 p1, u64 p2,
 * from using parent/child locking orders as two groups can have both
 * parent and child relationships to each other.
 */
-static int scoutfs_rename_common(struct inode *old_dir,
+static int scoutfs_rename_common(KC_VFS_NS_DEF
+				 struct inode *old_dir,
 				 struct dentry *old_dentry, struct inode *new_dir,
 				 struct dentry *new_dentry, unsigned int flags)
 {
@@ -1840,18 +1848,21 @@ static int scoutfs_rename(struct inode *old_dir,
 			  struct dentry *old_dentry, struct inode *new_dir,
 			  struct dentry *new_dentry)
 {
-	return scoutfs_rename_common(old_dir, old_dentry, new_dir, new_dentry, 0);
+	return scoutfs_rename_common(KC_VFS_INIT_NS
+				     old_dir, old_dentry, new_dir, new_dentry, 0);
 }
 #endif

-static int scoutfs_rename2(struct inode *old_dir,
+static int scoutfs_rename2(KC_VFS_NS_DEF
+			  struct inode *old_dir,
 			  struct dentry *old_dentry, struct inode *new_dir,
 			  struct dentry *new_dentry, unsigned int flags)
 {
 	if (flags & ~RENAME_NOREPLACE)
 		return -EINVAL;

-	return scoutfs_rename_common(old_dir, old_dentry, new_dir, new_dentry, flags);
+	return scoutfs_rename_common(KC_VFS_NS
+				     old_dir, old_dentry, new_dir, new_dentry, flags);
 }

 #ifdef KC_FMODE_KABI_ITERATE
@@ -1863,8 +1874,18 @@ static int scoutfs_dir_open(struct inode *inode, struct file *file)
 }
 #endif

-static int scoutfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int scoutfs_tmpfile(KC_VFS_NS_DEF
+			   struct inode *dir,
+#ifdef KC_D_TMPFILE_DENTRY
+			   struct dentry *dentry,
+#else
+			   struct file *file,
+#endif
+			   umode_t mode)
 {
+#ifndef KC_D_TMPFILE_DENTRY
+	struct dentry *dentry = file->f_path.dentry;
+#endif
 	struct super_block *sb = dir->i_sb;
 	struct inode *inode = NULL;
 	struct scoutfs_lock *dir_lock = NULL;
@@ -1891,7 +1912,11 @@ static int scoutfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mod
 	si->crtime = inode->i_mtime;
 	insert_inode_hash(inode);
 	ihold(inode); /* need to update inode modifications in d_tmpfile */
+#ifdef KC_D_TMPFILE_DENTRY
 	d_tmpfile(dentry, inode);
+#else
+	d_tmpfile(file, inode);
+#endif
 	inode_inc_iversion(inode);
 	scoutfs_forest_inc_inode_count(sb);

@@ -1899,6 +1924,10 @@ static int scoutfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mod
 	scoutfs_update_inode_item(dir, dir_lock, &ind_locks);
 	scoutfs_inode_index_unlock(sb, &ind_locks);

+#ifndef KC_D_TMPFILE_DENTRY
+	ret = finish_open_simple(file, 0);
+#endif
+
 out:
 	scoutfs_release_trans(sb);
 	scoutfs_inode_index_unlock(sb, &ind_locks);
@@ -105,12 +105,12 @@ static ssize_t elapsed_secs_show(struct kobject *kobj,
 {
 	DECLARE_FENCE_FROM_KOBJ(fence, kobj);
 	ktime_t now = ktime_get();
-	struct timeval tv = { 0, };
+	ktime_t t = ns_to_ktime(0);

 	if (ktime_after(now, fence->start_kt))
-		tv = ktime_to_timeval(ktime_sub(now, fence->start_kt));
+		t = ktime_sub(now, fence->start_kt);

-	return snprintf(buf, PAGE_SIZE, "%llu", (long long)tv.tv_sec);
+	return snprintf(buf, PAGE_SIZE, "%llu", (long long)ktime_divns(t, NSEC_PER_SEC));
 }
 SCOUTFS_ATTR_RO(elapsed_secs);

@@ -267,7 +267,8 @@ out:
 }
 #endif

-int scoutfs_permission(struct inode *inode, int mask)
+int scoutfs_permission(KC_VFS_NS_DEF
+		       struct inode *inode, int mask)
 {
 	struct super_block *sb = inode->i_sb;
 	struct scoutfs_lock *inode_lock = NULL;
@@ -281,7 +282,8 @@ int scoutfs_permission(struct inode *inode, int mask)
 	if (ret)
 		return ret;

-	ret = generic_permission(inode, mask);
+	ret = generic_permission(KC_VFS_INIT_NS
+				 inode, mask);

 	scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_READ);

@@ -10,7 +10,8 @@ ssize_t scoutfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 ssize_t scoutfs_file_read_iter(struct kiocb *, struct iov_iter *);
 ssize_t scoutfs_file_write_iter(struct kiocb *, struct iov_iter *);
 #endif
-int scoutfs_permission(struct inode *inode, int mask);
+int scoutfs_permission(KC_VFS_NS_DEF
+		       struct inode *inode, int mask);
 loff_t scoutfs_file_llseek(struct file *file, loff_t offset, int whence);

 #endif	/* _SCOUTFS_FILE_H_ */
@@ -373,7 +373,8 @@ int scoutfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 {
 	struct inode *inode = dentry->d_inode;
 #else
-int scoutfs_getattr(const struct path *path, struct kstat *stat,
+int scoutfs_getattr(KC_VFS_NS_DEF
+		    const struct path *path, struct kstat *stat,
 		    u32 request_mask, unsigned int query_flags)
 {
 	struct inode *inode = d_inode(path->dentry);
@@ -385,7 +386,8 @@ int scoutfs_getattr(const struct path *path, struct kstat *stat,
 	ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ,
 				 SCOUTFS_LKF_REFRESH_INODE, inode, &lock);
 	if (ret == 0) {
-		generic_fillattr(inode, stat);
+		generic_fillattr(KC_VFS_INIT_NS
+				 inode, stat);
 		scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ);
 	}
 	return ret;
@@ -483,7 +485,8 @@ int scoutfs_complete_truncate(struct inode *inode, struct scoutfs_lock *lock)
 * re-acquire it.  Ideally we'd fix this so that we can acquire the lock
 * instead of the caller.
 */
-int scoutfs_setattr(struct dentry *dentry, struct iattr *attr)
+int scoutfs_setattr(KC_VFS_NS_DEF
+		    struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = dentry->d_inode;
 	struct super_block *sb = inode->i_sb;
@@ -501,7 +504,8 @@ retry:
 				 SCOUTFS_LKF_REFRESH_INODE, inode, &lock);
 	if (ret)
 		return ret;
-	ret = setattr_prepare(dentry, attr);
+	ret = setattr_prepare(KC_VFS_INIT_NS
+			      dentry, attr);
 	if (ret)
 		goto out;

@@ -565,7 +569,8 @@ retry:
 	if (ret < 0)
 		goto release;

-	setattr_copy(inode, attr);
+	setattr_copy(KC_VFS_INIT_NS
+		     inode, attr);
 	inode_inc_iversion(inode);
 	scoutfs_update_inode_item(inode, lock, &ind_locks);

@@ -979,10 +984,10 @@ static bool inode_has_index(umode_t mode, u8 type)
 	}
 }

-static int cmp_index_lock(void *priv, struct list_head *A, struct list_head *B)
+static int cmp_index_lock(void *priv, KC_LIST_CMP_CONST struct list_head *A, KC_LIST_CMP_CONST struct list_head *B)
 {
-	struct index_lock *a = list_entry(A, struct index_lock, head);
-	struct index_lock *b = list_entry(B, struct index_lock, head);
+	KC_LIST_CMP_CONST struct index_lock *a = list_entry(A, KC_LIST_CMP_CONST struct index_lock, head);
+	KC_LIST_CMP_CONST struct index_lock *b = list_entry(B, KC_LIST_CMP_CONST struct index_lock, head);

 	return ((int)a->type - (int)b->type) ?:
 	       scoutfs_cmp_u64s(a->major, b->major) ?:
@@ -1562,7 +1567,8 @@ int scoutfs_new_inode(struct super_block *sb, struct inode *dir, umode_t mode, d
 	scoutfs_inode_set_data_seq(inode);

 	inode->i_ino = ino; /* XXX overflow */
-	inode_init_owner(inode, dir, mode);
+	inode_init_owner(KC_VFS_INIT_NS
+			 inode, dir, mode);
 	inode_set_bytes(inode, 0);
 	inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
 	inode->i_rdev = rdev;
@@ -135,10 +135,12 @@ int scoutfs_inode_refresh(struct inode *inode, struct scoutfs_lock *lock);
 int scoutfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		    struct kstat *stat);
 #else
-int scoutfs_getattr(const struct path *path, struct kstat *stat,
+int scoutfs_getattr(KC_VFS_NS_DEF
+		    const struct path *path, struct kstat *stat,
 		    u32 request_mask, unsigned int query_flags);
 #endif
-int scoutfs_setattr(struct dentry *dentry, struct iattr *attr);
+int scoutfs_setattr(KC_VFS_NS_DEF
+		    struct dentry *dentry, struct iattr *attr);

 int scoutfs_inode_orphan_create(struct super_block *sb, u64 ino, struct scoutfs_lock *lock,
 				struct scoutfs_lock *primary);
@@ -23,6 +23,7 @@
 #include <linux/aio.h>
 #include <linux/list_sort.h>
 #include <linux/backing-dev.h>
+#include <linux/overflow.h>

 #include "format.h"
 #include "key.h"
@@ -47,6 +48,7 @@
 #include "wkic.h"
 #include "quota.h"
 #include "scoutfs_trace.h"
+#include "util.h"

 /*
 * We make inode index items coherent by locking fixed size regions of
@@ -288,6 +290,7 @@ static long scoutfs_ioc_release(struct file *file, unsigned long arg)
 	u64 online;
 	u64 offline;
 	u64 isize;
+	__u64 tmp;
 	int ret;

 	if (copy_from_user(&args, (void __user *)arg, sizeof(args)))
@@ -297,12 +300,11 @@ static long scoutfs_ioc_release(struct file *file, unsigned long arg)

 	if (args.length == 0)
 		return 0;
-	if (((args.offset + args.length) < args.offset) ||
+	if ((check_add_overflow(args.offset, args.length - 1, &tmp)) ||
 	    (args.offset & SCOUTFS_BLOCK_SM_MASK) ||
 	    (args.length & SCOUTFS_BLOCK_SM_MASK))
 		return -EINVAL;

-
 	ret = mnt_want_write_file(file);
 	if (ret)
 		return ret;
@@ -674,8 +676,8 @@ static long scoutfs_ioc_setattr_more(struct file *file, unsigned long arg)
 		goto out;
 	}

-	iax->x_mask = SCOUTFS_IOC_IAX_DATA_VERSION | SCOUTFS_IOC_IAX_CTIME |
-		      SCOUTFS_IOC_IAX_CRTIME | SCOUTFS_IOC_IAX_SIZE;
+	iax->x_mask = SCOUTFS_IOC_IAX_CTIME | SCOUTFS_IOC_IAX_CRTIME |
+		      SCOUTFS_IOC_IAX_SIZE;
 	iax->data_version = sm.data_version;
 	iax->ctime_sec = sm.ctime_sec;
 	iax->ctime_nsec = sm.ctime_nsec;
@@ -686,6 +688,9 @@ static long scoutfs_ioc_setattr_more(struct file *file, unsigned long arg)
 	if (sm.flags & SCOUTFS_IOC_SETATTR_MORE_OFFLINE)
 		iax->x_flags |= SCOUTFS_IOC_IAX_F_SIZE_OFFLINE;

+	if (sm.data_version != 0)
+		iax->x_mask |= SCOUTFS_IOC_IAX_DATA_VERSION;
+
 	ret = mnt_want_write_file(file);
 	if (ret < 0)
 		goto out;
@@ -713,7 +718,8 @@ static long scoutfs_ioc_listxattr_hidden(struct file *file, unsigned long arg)
 	int total = 0;
 	int ret;

-	ret = inode_permission(inode, MAY_READ);
+	ret = inode_permission(KC_VFS_INIT_NS
+			       inode, MAY_READ);
 	if (ret < 0)
 		goto out;

@@ -951,6 +957,7 @@ static long scoutfs_ioc_move_blocks(struct file *file, unsigned long arg)
 	struct scoutfs_ioctl_move_blocks mb;
 	struct file *from_file;
 	struct inode *from;
+	u64 tmp;
 	int ret;

 	if (copy_from_user(&mb, umb, sizeof(mb)))
@@ -959,8 +966,8 @@ static long scoutfs_ioc_move_blocks(struct file *file, unsigned long arg)
 	if (mb.len == 0)
 		return 0;

-	if (mb.from_off + mb.len < mb.from_off ||
-	    mb.to_off + mb.len < mb.to_off)
+	if ((check_add_overflow(mb.from_off, mb.len - 1, &tmp)) ||
+	    (check_add_overflow(mb.to_off, mb.len - 1, &tmp)))
 		return -EOVERFLOW;

 	from_file = fget(mb.from_fd);
@@ -2241,18 +2241,18 @@ u64 scoutfs_item_dirty_pages(struct super_block *sb)
 	return (u64)atomic_read(&cinf->dirty_pages);
 }

-static int cmp_pg_start(void *priv, struct list_head *A, struct list_head *B)
+static int cmp_pg_start(void *priv, KC_LIST_CMP_CONST struct list_head *A, KC_LIST_CMP_CONST struct list_head *B)
 {
-	struct cached_page *a = list_entry(A, struct cached_page, dirty_head);
-	struct cached_page *b = list_entry(B, struct cached_page, dirty_head);
+	KC_LIST_CMP_CONST struct cached_page *a = list_entry(A, KC_LIST_CMP_CONST struct cached_page, dirty_head);
+	KC_LIST_CMP_CONST struct cached_page *b = list_entry(B, KC_LIST_CMP_CONST struct cached_page, dirty_head);

 	return scoutfs_key_compare(&a->start, &b->start);
 }

-static int cmp_item_key(void *priv, struct list_head *A, struct list_head *B)
+static int cmp_item_key(void *priv, KC_LIST_CMP_CONST struct list_head *A, KC_LIST_CMP_CONST struct list_head *B)
 {
-	struct cached_item *a = list_entry(A, struct cached_item, dirty_head);
-	struct cached_item *b = list_entry(B, struct cached_item, dirty_head);
+	KC_LIST_CMP_CONST struct cached_item *a = list_entry(A, KC_LIST_CMP_CONST struct cached_item, dirty_head);
+	KC_LIST_CMP_CONST struct cached_item *b = list_entry(B, KC_LIST_CMP_CONST struct cached_item, dirty_head);

 	return scoutfs_key_compare(&a->key, &b->key);
 }
@@ -2693,7 +2693,7 @@ int scoutfs_item_setup(struct super_block *sb)

 	KC_INIT_SHRINKER_FUNCS(&cinf->shrinker, item_cache_count_objects,
 			       item_cache_scan_objects);
-	KC_REGISTER_SHRINKER(&cinf->shrinker);
+	KC_REGISTER_SHRINKER(&cinf->shrinker, "scoutfs-item:" SCSBF, SCSB_ARGS(sb));
 #ifdef KC_CPU_NOTIFIER
        cinf->notifier.notifier_call = item_cpu_callback;
        register_hotcpu_notifier(&cinf->notifier);
@@ -67,12 +67,11 @@ kc_generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 			       unsigned long nr_segs, loff_t pos, loff_t *ppos,
 			       size_t count, ssize_t written)
 {
-	struct file *file = iocb->ki_filp;
 	ssize_t status;
 	struct iov_iter i;

 	iov_iter_init(&i, WRITE, iov, nr_segs, count);
-	status = generic_perform_write(file, &i, pos);
+	status = kc_generic_perform_write(iocb, &i, pos);

 	if (likely(status >= 0)) {
 		written += status;
@@ -197,7 +197,11 @@ struct timespec64 kc_current_time(struct inode *inode);
 } while (0)

 #define KC_SHRINKER_CONTAINER_OF(ptr, type) container_of(ptr, type, shrinker)
-#define KC_REGISTER_SHRINKER(ptr) (register_shrinker(ptr))
+/*
+ * Callers always pass a name format and its arguments.  With
+ * KC_SHRINKER_NAME defined they are handed straight through to
+ * register_shrinker(); otherwise they are dropped and only the shrinker
+ * pointer is registered.
+ */
+#ifdef KC_SHRINKER_NAME
+#define KC_REGISTER_SHRINKER register_shrinker
+#else
+#define KC_REGISTER_SHRINKER(ptr, fmt, ...) (register_shrinker(ptr))
+#endif /* KC_SHRINKER_NAME */
 #define KC_UNREGISTER_SHRINKER(ptr) (unregister_shrinker(ptr))
 #define KC_SHRINKER_FN(ptr) (ptr)
 #else
@@ -224,7 +228,7 @@ struct kc_shrinker_wrapper {
 	_wrap->shrink.seeks = DEFAULT_SEEKS;			\
 } while (0)
 #define KC_SHRINKER_CONTAINER_OF(ptr, type) container_of(container_of(ptr, struct kc_shrinker_wrapper, shrink), type, shrinker)
-#define KC_REGISTER_SHRINKER(ptr) (register_shrinker(ptr.shrink))
+#define KC_REGISTER_SHRINKER(ptr, fmt, ...) (register_shrinker(ptr.shrink))
 #define KC_UNREGISTER_SHRINKER(ptr) (unregister_shrinker(ptr.shrink))
 #define KC_SHRINKER_FN(ptr) (ptr.shrink)

@@ -271,6 +275,167 @@ ssize_t kc_generic_file_buffered_write(struct kiocb *iocb, const struct iovec *i
               unsigned long nr_segs, loff_t pos, loff_t *ppos,
               size_t count, ssize_t written);
 #define generic_file_buffered_write kc_generic_file_buffered_write
+/*
+ * Call generic_perform_write() with whichever argument list this kernel
+ * provides.  The (iocb, iter) form has no position argument, so pos is
+ * stored in iocb->ki_pos before the call; the older form takes the file
+ * and pos directly.  The result is returned as ssize_t, the type the
+ * caller stores it in, so the byte count or error is never narrowed.
+ */
+#ifdef KC_GENERIC_PERFORM_WRITE_KIOCB_IOV_ITER
+static inline ssize_t kc_generic_perform_write(struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
+{
+	iocb->ki_pos = pos;
+	return generic_perform_write(iocb, iter);
+}
+#else
+static inline ssize_t kc_generic_perform_write(struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
+{
+	struct file *file = iocb->ki_filp;
+	return generic_perform_write(file, iter, pos);
+}
+#endif
+#endif // KC_GENERIC_FILE_BUFFERED_WRITE
+
+/*
+ * Stand-in for blk_opf_t when KC_HAVE_BLK_OPF_T is not defined: a plain
+ * unsigned int.  The commented-out typedef presumably mirrors the
+ * kernel's own definition.
+ */
+#ifndef KC_HAVE_BLK_OPF_T
+/* typedef __u32 __bitwise blk_opf_t; */
+typedef unsigned int blk_opf_t;
+#endif
+
+/*
+ * Qualifier placed on the list_head arguments (and the entries derived
+ * from them) of list comparison callbacks: const when
+ * KC_LIST_CMP_CONST_ARG_LIST_HEAD is defined, nothing otherwise.
+ */
+#ifdef KC_LIST_CMP_CONST_ARG_LIST_HEAD
+#define KC_LIST_CMP_CONST const
+#else
+#define KC_LIST_CMP_CONST
+#endif
+
+/*
+ * Two-argument __vmalloc().  With KC_VMALLOC_PGPROT_T defined the real
+ * __vmalloc() takes a third pgprot argument, which is always supplied
+ * as PAGE_KERNEL; otherwise kc__vmalloc is __vmalloc itself.
+ */
+#ifdef KC_VMALLOC_PGPROT_T
+#define kc__vmalloc(size, gfp_mask) __vmalloc(size, gfp_mask, PAGE_KERNEL)
+#else
+#define kc__vmalloc __vmalloc
+#endif
+
+/*
+ * Pieces for splicing a user namespace argument into VFS method
+ * prototypes and calls.  Each macro carries its own trailing comma so
+ * it can be dropped into an argument list, and each expands to nothing
+ * when KC_VFS_METHOD_USER_NAMESPACE_ARG is not defined:
+ *   KC_VFS_NS_DEF  - the parameter declaration
+ *   KC_VFS_NS      - forwards that parameter in a call
+ *   KC_VFS_INIT_NS - passes &init_user_ns in a call
+ */
+#ifdef KC_VFS_METHOD_USER_NAMESPACE_ARG
+#define KC_VFS_NS_DEF struct user_namespace *mnt_user_ns,
+#define KC_VFS_NS mnt_user_ns,
+#define KC_VFS_INIT_NS &init_user_ns,
+#else
+#define KC_VFS_NS_DEF
+#define KC_VFS_NS
+#define KC_VFS_INIT_NS
+#endif
+
+#ifdef KC_BIO_ALLOC_DEV_OPF_ARGS
+#define kc_bio_alloc bio_alloc
+#else
+#include <linux/bio.h>
+/*
+ * Offer the (bdev, nr_vecs, opf, gfp_mask) calling convention on top of
+ * a bio_alloc() that only takes (gfp_mask, nr_vecs): allocate the bio,
+ * then apply the op flags and the device to it by hand.  Returns NULL
+ * when the allocation fails.
+ */
+static inline struct bio *kc_bio_alloc(struct block_device *bdev, unsigned short nr_vecs,
+				       blk_opf_t opf, gfp_t gfp_mask)
+{
+	struct bio *bio;
+
+	bio = bio_alloc(gfp_mask, nr_vecs);
+	if (!bio)
+		return NULL;
+
+	kc_bio_set_opf(bio, opf);
+	bio_set_dev(bio, bdev);
+
+	return bio;
+}
+#endif
+
+/*
+ * Without KC_FIEMAP_PREP, fiemap_prep() maps onto fiemap_check_flags();
+ * the inode, start and len arguments are accepted but not used.
+ */
+#ifndef KC_FIEMAP_PREP
+#define fiemap_prep(inode, fieinfo, start, len, flags) fiemap_check_flags(fieinfo, flags)
+#endif
+
+/*
+ * Without KC_KERNEL_OLD_TIMEVAL_STRUCT, spell struct __kernel_old_timeval
+ * as struct timeval and build it with ns_to_timeval() from the ktime's
+ * tv64 member.  The argument is parenthesized so that any expression,
+ * not just a plain variable name, can be passed.
+ */
+#ifndef KC_KERNEL_OLD_TIMEVAL_STRUCT
+#define __kernel_old_timeval timeval
+#define ns_to_kernel_old_timeval(ktime) ns_to_timeval((ktime).tv64)
+#endif
+
+/*
+ * Socket timeout helpers.
+ *
+ * kc_sock_set_sndtimeo() sets the send timeout in whole seconds and
+ * kc_tcp_sock_set_rcvtimeo() sets the receive timeout from a ktime.
+ * Both return 0 or the error from the underlying setsockopt call.
+ *
+ * With KC_SOCK_SET_SNDTIMEO the in-kernel sock_set_sndtimeo() and
+ * sock_setsockopt() are used; otherwise both go through
+ * kernel_setsockopt().  The timevals are always declared as
+ * struct __kernel_old_timeval, which the compat define above maps to
+ * struct timeval where the newer name doesn't exist.
+ */
+#ifdef KC_SOCK_SET_SNDTIMEO
+#include <net/sock.h>
+static inline int kc_sock_set_sndtimeo(struct socket *sock, s64 secs)
+{
+	sock_set_sndtimeo(sock->sk, secs);
+	return 0;
+}
+static inline int kc_tcp_sock_set_rcvtimeo(struct socket *sock, ktime_t to)
+{
+	struct __kernel_old_timeval tv;
+	sockptr_t kopt;
+
+	tv = ns_to_kernel_old_timeval(to);
+
+	kopt = KERNEL_SOCKPTR(&tv);
+
+	/*
+	 * SO_RCVTIMEO_OLD is the option whose value is a
+	 * struct __kernel_old_timeval; SO_RCVTIMEO_NEW expects a
+	 * struct __kernel_sock_timeval, which only happens to have the
+	 * same layout on 64-bit.
+	 */
+	return sock_setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO_OLD,
+			       kopt, sizeof(tv));
+}
+#else
+#include <net/sock.h>
+static inline int kc_sock_set_sndtimeo(struct socket *sock, s64 secs)
+{
+	struct __kernel_old_timeval tv = { .tv_sec = secs, .tv_usec = 0 };
+	return kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO,
+				 (char *)&tv, sizeof(tv));
+}
+static inline int kc_tcp_sock_set_rcvtimeo(struct socket *sock, ktime_t to)
+{
+	struct __kernel_old_timeval tv;
+
+	tv = ns_to_kernel_old_timeval(to);
+	return kernel_setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO,
+				 (char *)&tv, sizeof(tv));
+}
+#endif
+
+/*
+ * Set an int-valued socket option from kernel context.
+ *
+ * optval points at the value and optlen is the size of that value; it
+ * is passed through as given rather than taking the size of the
+ * pointer.  Returns 0 or a negative errno from the setsockopt call.
+ */
+#ifdef KC_SETSOCKOPT_SOCKPTR_T
+static inline int kc_sock_setsockopt(struct socket *sock, int level, int op, int *optval, unsigned int optlen)
+{
+	sockptr_t kopt = KERNEL_SOCKPTR(optval);
+
+	/*
+	 * sock_setsockopt() only implements SOL_SOCKET options and never
+	 * looks at level, so an option at any other level (SOL_TCP, ...)
+	 * has to go to the protocol's own setsockopt method or its number
+	 * would be misread as a SOL_SOCKET option.
+	 */
+	if (level == SOL_SOCKET)
+		return sock_setsockopt(sock, level, op, kopt, optlen);
+
+	return sock->ops->setsockopt(sock, level, op, kopt, optlen);
+}
+#else
+static inline int kc_sock_setsockopt(struct socket *sock, int level, int op, int *optval, unsigned int optlen)
+{
+	return kernel_setsockopt(sock, level, op, (char *)optval, optlen);
+}
+#endif
+
+/*
+ * TCP option setters with one calling convention: each takes the struct
+ * socket and returns an int.  With KC_HAVE_TCP_SET_SOCKFN they wrap the
+ * tcp_sock_set_*() helpers, returning 0 for the ones whose result isn't
+ * returned; otherwise each one is a kernel_setsockopt() call at SOL_TCP.
+ */
+#include <linux/net.h>
+#include <net/tcp.h>
+#ifdef KC_HAVE_TCP_SET_SOCKFN
+static inline int kc_tcp_sock_set_keepintvl(struct socket *sock, int val)
+{
+	struct sock *sk = sock->sk;
+
+	return tcp_sock_set_keepintvl(sk, val);
+}
+static inline int kc_tcp_sock_set_keepidle(struct socket *sock, int val)
+{
+	struct sock *sk = sock->sk;
+
+	return tcp_sock_set_keepidle(sk, val);
+}
+static inline int kc_tcp_sock_set_user_timeout(struct socket *sock, int val)
+{
+	struct sock *sk = sock->sk;
+
+	tcp_sock_set_user_timeout(sk, val);
+	return 0;
+}
+static inline int kc_tcp_sock_set_nodelay(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+
+	tcp_sock_set_nodelay(sk);
+	return 0;
+}
+#else
+static inline int kc_tcp_sock_set_keepintvl(struct socket *sock, int val)
+{
+	return kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL,
+				 (char *)&val, sizeof(val));
+}
+static inline int kc_tcp_sock_set_keepidle(struct socket *sock, int val)
+{
+	return kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE,
+				 (char *)&val, sizeof(val));
+}
+static inline int kc_tcp_sock_set_user_timeout(struct socket *sock, int val)
+{
+	return kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT,
+				 (char *)&val, sizeof(val));
+}
+static inline int kc_tcp_sock_set_nodelay(struct socket *sock)
+{
+	int enable = 1;
+
+	return kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY,
+				 (char *)&enable, sizeof(enable));
+}
 #endif

 #endif
@@ -125,8 +125,8 @@ static inline bool scoutfs_key_is_ones(struct scoutfs_key *key)
 * other alternatives across keys that first differ in any of the
 * values.  Say maybe 20% faster than memcmp.
 */
-static inline int scoutfs_key_compare(struct scoutfs_key *a,
-				      struct scoutfs_key *b)
+static inline int scoutfs_key_compare(const struct scoutfs_key *a,
+				      const struct scoutfs_key *b)
 {
 	return scoutfs_cmp(a->sk_zone, b->sk_zone) ?:
 	  scoutfs_cmp(le64_to_cpu(a->_sk_first), le64_to_cpu(b->_sk_first)) ?:
@@ -142,10 +142,10 @@ static inline int scoutfs_key_compare(struct scoutfs_key *a,
 *       1: a_start > b_end
 *  else 0: ranges overlap
 */
-static inline int scoutfs_key_compare_ranges(struct scoutfs_key *a_start,
-				             struct scoutfs_key *a_end,
-				             struct scoutfs_key *b_start,
-				             struct scoutfs_key *b_end)
+static inline int scoutfs_key_compare_ranges(const struct scoutfs_key *a_start,
+				             const struct scoutfs_key *a_end,
+				             const struct scoutfs_key *b_start,
+				             const struct scoutfs_key *b_end)
 {
 	return scoutfs_key_compare(a_end, b_start) < 0 ? -1 :
 	       scoutfs_key_compare(a_start, b_end) > 0 ? 1 :
@@ -1732,7 +1732,7 @@ int scoutfs_lock_setup(struct super_block *sb)
 	linfo->lock_range_tree = RB_ROOT;
 	KC_INIT_SHRINKER_FUNCS(&linfo->shrinker, lock_count_objects,
 			       lock_scan_objects);
-	KC_REGISTER_SHRINKER(&linfo->shrinker);
+	KC_REGISTER_SHRINKER(&linfo->shrinker, "scoutfs-lock:" SCSBF, SCSB_ARGS(sb));
 	INIT_LIST_HEAD(&linfo->lru_list);
 	INIT_WORK(&linfo->inv_work, lock_invalidate_worker);
 	INIT_LIST_HEAD(&linfo->inv_list);
@@ -904,53 +904,44 @@ static void destroy_conn(struct scoutfs_net_connection *conn)
 static int sock_opts_and_names(struct scoutfs_net_connection *conn,
 			       struct socket *sock)
 {
-	struct timeval tv;
 	int optval;
 	int ret;

 	/* we use a keepalive timeout instead of send timeout */
-	tv.tv_sec = 0;
-	tv.tv_usec = 0;
-	ret = kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO,
-				(char *)&tv, sizeof(tv));
+	ret = kc_sock_set_sndtimeo(sock, 0);
 	if (ret)
 		goto out;

 	/* not checked when user_timeout != 0, but for clarity */
 	optval = UNRESPONSIVE_PROBES;
-	ret = kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT,
-				(char *)&optval, sizeof(optval));
+	ret = kc_sock_setsockopt(sock, SOL_TCP, TCP_KEEPCNT,
+				&optval, sizeof(optval));
 	if (ret)
 		goto out;

 	BUILD_BUG_ON(UNRESPONSIVE_PROBES >= UNRESPONSIVE_TIMEOUT_SECS);
 	optval = UNRESPONSIVE_TIMEOUT_SECS - (UNRESPONSIVE_PROBES);
-	ret = kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE,
-				(char *)&optval, sizeof(optval));
+	ret = kc_tcp_sock_set_keepidle(sock, optval);
 	if (ret)
 		goto out;

 	optval = 1;
-	ret = kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL,
-				(char *)&optval, sizeof(optval));
+	ret = kc_tcp_sock_set_keepintvl(sock, optval);
 	if (ret)
 		goto out;

 	optval = UNRESPONSIVE_TIMEOUT_SECS * MSEC_PER_SEC;
-	ret = kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT,
-				(char *)&optval, sizeof(optval));
+	ret = kc_tcp_sock_set_user_timeout(sock, optval);
 	if (ret)
 		goto out;

 	optval = 1;
-	ret = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
-				(char *)&optval, sizeof(optval));
+	ret = kc_sock_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
+				&optval, sizeof(optval));
 	if (ret)
 		goto out;

-	optval = 1;
-	ret = kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY,
-				(char *)&optval, sizeof(optval));
+	ret = kc_tcp_sock_set_nodelay(sock);
 	if (ret)
 		goto out;

@@ -1049,7 +1040,6 @@ static void scoutfs_net_connect_worker(struct work_struct *work)
 	DEFINE_CONN_FROM_WORK(conn, work, connect_work);
 	struct super_block *sb = conn->sb;
 	struct socket *sock;
-	struct timeval tv;
 	int ret;

 	trace_scoutfs_net_connect_work_enter(sb, 0, 0);
@@ -1060,11 +1050,8 @@ static void scoutfs_net_connect_worker(struct work_struct *work)

 	sock->sk->sk_allocation = GFP_NOFS;

-	/* caller specified connect timeout */
-	tv.tv_sec = conn->connect_timeout_ms / MSEC_PER_SEC;
-	tv.tv_usec = (conn->connect_timeout_ms % MSEC_PER_SEC) * USEC_PER_MSEC;
-	ret = kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO,
-				(char *)&tv, sizeof(tv));
+	/* caller specified connect timeout, defaults to 1 sec */
+	ret = kc_sock_set_sndtimeo(sock, conn->connect_timeout_ms / MSEC_PER_SEC);
 	if (ret) {
 		sock_release(sock);
 		goto out;
@@ -1462,8 +1449,8 @@ int scoutfs_net_bind(struct super_block *sb,
 	sock->sk->sk_allocation = GFP_NOFS;

 	optval = 1;
-	ret = kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
-				(char *)&optval, sizeof(optval));
+	ret = kc_sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
+				&optval, sizeof(optval));
 	if (ret)
 		goto out;

@@ -303,7 +303,6 @@ static int recv_msg(struct super_block *sb, struct quorum_host_msg *msg,
 	DECLARE_QUORUM_INFO(sb, qinf);
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct scoutfs_quorum_message qmes;
-	struct timeval tv;
 	ktime_t rel_to;
 	ktime_t now;
 	int ret;
@@ -328,14 +327,10 @@ static int recv_msg(struct super_block *sb, struct quorum_host_msg *msg,
 	else
 		rel_to = ns_to_ktime(0);

-	tv = ktime_to_timeval(rel_to);
-	if (tv.tv_sec == 0 && tv.tv_usec == 0) {
+	/* a remaining wait of no more than a microsecond becomes a non-blocking receive */
+	if (ktime_compare(rel_to, ns_to_ktime(NSEC_PER_USEC)) <= 0) {
 		mh.msg_flags |= MSG_DONTWAIT;
 	} else {
-		ret = kernel_setsockopt(qinf->sock, SOL_SOCKET, SO_RCVTIMEO,
-					(char *)&tv, sizeof(tv));
+		ret = kc_tcp_sock_set_rcvtimeo(qinf->sock, rel_to);
 		if (ret < 0)
 			return ret;
 	}

 #ifdef KC_MSGHDR_STRUCT_IOV_ITER
@@ -486,7 +481,7 @@ static void set_quorum_block_event(struct super_block *sb, struct scoutfs_quorum
 	if (WARN_ON_ONCE(event < 0 || event >= SCOUTFS_QUORUM_EVENT_NR))
 		return;

-	getnstimeofday64(&ts);
+	ktime_get_ts64(&ts);
 	le64_add_cpu(&blk->write_nr, 1);

 	ev = &blk->events[event];
@@ -1325,8 +1320,8 @@ int scoutfs_quorum_setup(struct super_block *sb)
 	qinf = kzalloc(sizeof(struct quorum_info), GFP_KERNEL);
 	super = kmalloc(sizeof(struct scoutfs_super_block), GFP_KERNEL);
 	if (qinf)
-		qinf->hb_delay = __vmalloc(HB_DELAY_NR * sizeof(struct count_recent),
-					   GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
+		qinf->hb_delay = kc__vmalloc(HB_DELAY_NR * sizeof(struct count_recent),
+					   GFP_KERNEL | __GFP_ZERO);
 	if (!qinf || !super || !qinf->hb_delay) {
 		if (qinf)
 			vfree(qinf->hb_delay);
@@ -34,6 +34,7 @@
 #include "totl.h"
 #include "util.h"
 #include "quota.h"
+#include "counters.h"
 #include "scoutfs_trace.h"

 /*
@@ -219,6 +220,8 @@ static unsigned long count_cached_checks(struct shrinker *shrink, struct shrink_
 {
 	struct squota_info *qtinf = KC_SHRINKER_CONTAINER_OF(shrink, struct squota_info);

+	scoutfs_inc_counter(qtinf->sb, quota_info_count_objects);
+
 	return shrinker_min_long(atomic64_read(&qtinf->nr_checks));
 }

@@ -237,6 +240,8 @@ static unsigned long scan_cached_checks(struct shrinker *shrink, struct shrink_c
 	struct squota_check *chk;
 	int err;

+	scoutfs_inc_counter(qtinf->sb, quota_info_scan_objects);
+
 	rcu_read_lock();

 	while (nr > 0 && retries > 0 && (chk = lookup_random_check(&qtinf->check_ht))) {
@@ -1221,7 +1226,7 @@ int scoutfs_quota_setup(struct super_block *sb)
 	init_waitqueue_head(&qtinf->waitq);

 	KC_INIT_SHRINKER_FUNCS(&qtinf->shrinker, count_cached_checks, scan_cached_checks);
-	KC_REGISTER_SHRINKER(&qtinf->shrinker);
+	KC_REGISTER_SHRINKER(&qtinf->shrinker, "scoutfs-quota:" SCSBF, SCSB_ARGS(sb));

 	sbi->squota_info = qtinf;

@@ -76,10 +76,10 @@ static struct recov_pending *lookup_pending(struct recov_info *recinf, u64 rid,
 * We keep the pending list sorted by rid so that we can iterate over
 * them.  The list should be small and shouldn't be used often.
+ *
+ * This is the ordering callback: it compares two recov_pending entries,
+ * linked through head, by rid using scoutfs_cmp_u64s().  priv is unused.
  */
-static int cmp_pending_rid(void *priv, struct list_head *A, struct list_head *B)
+static int cmp_pending_rid(void *priv, KC_LIST_CMP_CONST struct list_head *A, KC_LIST_CMP_CONST struct list_head *B)
 {
-	struct recov_pending *a = list_entry(A, struct recov_pending, head);
-	struct recov_pending *b = list_entry(B, struct recov_pending, head);
+	KC_LIST_CMP_CONST struct recov_pending *a = list_entry(A, KC_LIST_CMP_CONST struct recov_pending, head);
+	KC_LIST_CMP_CONST struct recov_pending *b = list_entry(B, KC_LIST_CMP_CONST struct recov_pending, head);

 	return scoutfs_cmp_u64s(a->rid, b->rid);
 }
@@ -24,7 +24,6 @@

 #include <linux/tracepoint.h>
 #include <linux/in.h>
-#include <linux/unaligned/access_ok.h>

 #include "key.h"
 #include "format.h"
@@ -298,7 +298,7 @@ static void check_holder_budget(struct super_block *sb, struct server_info *serv
 {
 	static bool exceeded_once = false;
 	struct commit_hold *hold;
-	struct timespec ts;
+	struct timespec64 ts;
 	u32 avail_used;
 	u32 freed_used;
 	u32 avail_now;
@@ -330,7 +330,7 @@ static void check_holder_budget(struct super_block *sb, struct server_info *serv
 		    cusers->freed_before, freed_now);

 	list_for_each_entry(hold, &cusers->holding, entry) {
-		ts = ktime_to_timespec(hold->start);
+		ts = ktime_to_timespec64(hold->start);
 		scoutfs_err(sb, "exceeding hold start %llu.%09llu av %u fr %u",
 			    (u64)ts.tv_sec, (u64)ts.tv_nsec, hold->avail, hold->freed);
 		hold->exceeded = true;
@@ -445,7 +445,7 @@ static int server_apply_commit(struct super_block *sb, struct commit_hold *hold,
 {
 	DECLARE_SERVER_INFO(sb, server);
 	struct commit_users *cusers = &server->cusers;
-	struct timespec ts;
+	struct timespec64 ts;

 	spin_lock(&cusers->lock);

@@ -454,7 +454,7 @@ static int server_apply_commit(struct super_block *sb, struct commit_hold *hold,
 	check_holder_budget(sb, server, cusers);

 	if (hold->exceeded) {
-		ts = ktime_to_timespec(hold->start);
+		ts = ktime_to_timespec64(hold->start);
 		scoutfs_err(sb, "exceeding hold start %llu.%09llu stack:",
 			    (u64)ts.tv_sec, (u64)ts.tv_nsec);
 		dump_stack();
@@ -18,6 +18,7 @@
 #include <linux/pagemap.h>
 #include <linux/vmalloc.h>
 #include <linux/sort.h>
+#include <asm/unaligned.h>

 #include "super.h"
 #include "format.h"
@@ -1588,8 +1589,7 @@ static int kway_merge(struct super_block *sb,
 	nr_parents = max_t(unsigned long, 1, roundup_pow_of_two(nr) - 1);
 	/* root at [1] for easy sib/parent index calc, final pad for odd sib */
 	nr_nodes = 1 + nr_parents + nr + 1;
-	tnodes = __vmalloc(nr_nodes * sizeof(struct tourn_node),
-			   GFP_NOFS, PAGE_KERNEL);
+	tnodes = kc__vmalloc(nr_nodes * sizeof(struct tourn_node), GFP_NOFS);
 	if (!tnodes)
 		return -ENOMEM;

@@ -160,7 +160,11 @@ static void scoutfs_metadev_close(struct super_block *sb)
 		 * from kill_sb->put_super.
 		 */
 		lockdep_off();
+#ifdef KC_BLKDEV_PUT_HOLDER_ARG
+		blkdev_put(sbi->meta_bdev, sb);
+#else
 		blkdev_put(sbi->meta_bdev, SCOUTFS_META_BDEV_MODE);
+#endif
 		lockdep_on();
 		sbi->meta_bdev = NULL;
 	}
@@ -523,7 +527,11 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
 		goto out;
 	}

+#ifdef KC_BLKDEV_PUT_HOLDER_ARG
+	meta_bdev = blkdev_get_by_path(opts.metadev_path, SCOUTFS_META_BDEV_MODE, sb, NULL);
+#else
 	meta_bdev = blkdev_get_by_path(opts.metadev_path, SCOUTFS_META_BDEV_MODE, sb);
+#endif
 	if (IS_ERR(meta_bdev)) {
 		scoutfs_err(sb, "could not open metadev: error %ld",
 			    PTR_ERR(meta_bdev));
@@ -101,7 +101,11 @@ static inline bool SCOUTFS_IS_META_BDEV(struct scoutfs_super_block *super_block)
 	return !!(le64_to_cpu(super_block->flags) & SCOUTFS_FLAG_IS_META_BDEV);
 }

+#ifdef KC_HAVE_BLK_MODE_T
+#define SCOUTFS_META_BDEV_MODE (BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_EXCL)
+#else
 #define SCOUTFS_META_BDEV_MODE (FMODE_READ | FMODE_WRITE | FMODE_EXCL)
+#endif

 static inline bool scoutfs_forcing_unmount(struct super_block *sb)
 {
@@ -13,6 +13,7 @@
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/fs.h>
+#include <linux/blkdev.h>

 #include "super.h"
 #include "sysfs.h"
@@ -93,13 +93,9 @@ int scoutfs_setup_triggers(struct super_block *sb)
 		goto out;
 	}

-	for (i = 0; i < ARRAY_SIZE(triggers->atomics); i++) {
-		if (!debugfs_create_atomic_t(names[i], 0644, triggers->dir,
-					     &triggers->atomics[i])) {
-			ret = -ENOMEM;
-			goto out;
-		}
-	}
+	for (i = 0; i < ARRAY_SIZE(triggers->atomics); i++)
+		debugfs_create_atomic_t(names[i], 0644, triggers->dir,
+					&triggers->atomics[i]);

 	ret = 0;
 out:
@@ -183,6 +183,13 @@ static void *scoutfs_tseq_seq_next(struct seq_file *m, void *v, loff_t *pos)
 	ent = tseq_rb_next(ent);
 	if (ent)
 		*pos = ent->pos;
+	else
+		/*
+		 * once we hit the end, *pos is never used, but it has to
+		 * be updated to avoid an error in bpf_seq_read()
+		 */
+		(*pos)++;
+
 	return ent;
 }

@@ -27,6 +27,7 @@
 #include "totl.h"
 #include "counters.h"
 #include "util.h"
+#include "counters.h"
 #include "scoutfs_trace.h"
 #include "wkic.h"

@@ -552,6 +553,8 @@ static unsigned long wkic_shrink_count(struct shrinker *shrink, struct shrink_co
 {
 	struct wkic_info *winf = KC_SHRINKER_CONTAINER_OF(shrink, struct wkic_info);

+	scoutfs_inc_counter(winf->sb, wkic_count_objects);
+
 	return shrinker_min_long(atomic64_read(&winf->shrink_count));
 }

@@ -564,6 +567,8 @@ static unsigned long wkic_shrink_scan(struct shrinker *shrink, struct shrink_con
 	unsigned long after;
 	LIST_HEAD(empty_list);

+	scoutfs_inc_counter(winf->sb, wkic_scan_objects);
+
 	if (sc->nr_to_scan > 0) {
 		before = wkic_shrink_count(shrink, sc);
 		update_trees(sb, winf, &empty_list, sc->nr_to_scan, NULL, NULL, true);
@@ -1108,7 +1113,7 @@ int scoutfs_wkic_setup(struct super_block *sb)

 	winf->sb = sb;
 	KC_INIT_SHRINKER_FUNCS(&winf->shrinker, wkic_shrink_count, wkic_shrink_scan);
-	KC_REGISTER_SHRINKER(&winf->shrinker);
+	KC_REGISTER_SHRINKER(&winf->shrinker, "scoutfs-weak_item:" SCSBF, SCSB_ARGS(sb));

 	sbi->wkic_info = winf;
 	return 0;
@@ -1026,7 +1026,9 @@ static int scoutfs_xattr_get_handler

 static int scoutfs_xattr_set_handler
 #ifdef KC_XATTR_STRUCT_XATTR_HANDLER
-		(const struct xattr_handler *handler, struct dentry *dentry,
+		(const struct xattr_handler *handler,
+		 KC_VFS_NS_DEF
+		 struct dentry *dentry,
 		 struct inode *inode, const char *name, const void *value,
 		 size_t size, int flags)
 {
@@ -0,0 +1,244 @@
+package main
+
+import (
+	"flag"
+	"fmt"
+	"log"
+	"os"
+	"path/filepath"
+	"sync"
+	"syscall"
+
+	"restore/pkg/restore"
+)
+
+// options holds the command-line settings parsed in main.
+type options struct {
+	metaPath   string // -m: path to metadata device
+	sourceDir  string // -s: path to source directory
+	numWorkers int    // -w: number of worker threads
+}
+
+// hardlinkTracker keeps track of inodes we've already processed.
+//
+// One tracker is shared by all of the worker goroutines; the embedded
+// mutex is held around every access to seen.
+type hardlinkTracker struct {
+	sync.Mutex
+	seen map[uint64]bool // keyed by inode number; present once processed
+}
+
+// newHardlinkTracker returns a tracker that hasn't seen any inodes yet.
+func newHardlinkTracker() *hardlinkTracker {
+	tracker := new(hardlinkTracker)
+	tracker.seen = map[uint64]bool{}
+	return tracker
+}
+
+// isNewInode reports whether ino still needs to be processed.  When
+// nlink is false the inode is always reported as new and is not
+// recorded.  Otherwise the inode is new only the first time it is
+// passed in; it is remembered so that later calls return false.
+func (h *hardlinkTracker) isNewInode(ino uint64, nlink bool) bool {
+	if nlink {
+		h.Lock()
+		defer h.Unlock()
+
+		// entries are only ever stored as true, so a plain lookup
+		// doubles as the presence check
+		if h.seen[ino] {
+			return false
+		}
+		h.seen[ino] = true
+	}
+
+	return true
+}
+
+// getFileInfo builds a restore.FileInfo from the stat data behind an
+// os.FileInfo.  The inode number recorded is the source file's own.
+// info.Sys() must hold a *syscall.Stat_t; the unchecked type assertion
+// panics otherwise.
+func getFileInfo(info os.FileInfo) restore.FileInfo {
+	st := info.Sys().(*syscall.Stat_t)
+
+	var fi restore.FileInfo
+
+	// identity, ownership and size
+	fi.Ino = uint64(st.Ino)
+	fi.Mode = uint32(st.Mode)
+	fi.Uid = uint32(st.Uid)
+	fi.Gid = uint32(st.Gid)
+	fi.Size = uint64(st.Size)
+	fi.Rdev = uint64(st.Rdev)
+
+	// timestamps
+	fi.AtimeSec, fi.AtimeNsec = st.Atim.Sec, st.Atim.Nsec
+	fi.MtimeSec, fi.MtimeNsec = st.Mtim.Sec, st.Mtim.Nsec
+	fi.CtimeSec, fi.CtimeNsec = st.Ctim.Sec, st.Ctim.Nsec
+
+	// file type
+	fi.IsDir = info.IsDir()
+	fi.IsRegular = st.Mode&syscall.S_IFMT == syscall.S_IFREG
+
+	return fi
+}
+
+// getXAttrs gets extended attributes for a file/directory.
+//
+// It returns nil and no error when the path has no xattrs, and the
+// listxattr error when the names can't be listed.  An xattr whose value
+// can't be read is skipped rather than failing the whole path.
+func getXAttrs(path string) ([]restore.XAttr, error) {
+	// first call with no buffer asks for the size of the name list
+	size, err := syscall.Listxattr(path, nil)
+	if err != nil || size == 0 {
+		return nil, err
+	}
+
+	buf := make([]byte, size)
+	size, err = syscall.Listxattr(path, buf)
+	if err != nil {
+		return nil, err
+	}
+
+	// buf[:size] is a run of NUL-terminated names
+	var xattrs []restore.XAttr
+	start := 0
+	for i := 0; i < size; i++ {
+		if buf[i] != 0 {
+			continue
+		}
+
+		name := string(buf[start:i])
+		// Step past this name before anything can fail, otherwise a
+		// skipped xattr would be glued onto the front of the next name.
+		start = i + 1
+
+		valueLen, err := syscall.Getxattr(path, name, nil)
+		if err != nil {
+			continue
+		}
+
+		valueBuf := make([]byte, valueLen)
+		n, err := syscall.Getxattr(path, name, valueBuf)
+		if err != nil {
+			continue
+		}
+
+		// the value may have shrunk between the two calls; keep only
+		// the bytes that were actually returned
+		xattrs = append(xattrs, restore.XAttr{
+			Name:  name,
+			Value: valueBuf[:n],
+		})
+	}
+
+	return xattrs, nil
+}
+
+// restorePath walks the directory at path depth-first and feeds what it
+// finds to writer.
+//
+// For every entry it creates a directory entry under parentIno, which is
+// therefore the inode number to record for the directory at path itself
+// (the recursive call passes the subdirectory's own inode number).  For
+// non-directories it also creates the inode and its xattrs, once per
+// inode when the file has more than one link.  After all entries are
+// handled it creates the inode for path with its subdirectory count and
+// the total length of its entry names.
+//
+// The first failure from reading, stat-ing or writing is returned.
+func restorePath(writer *restore.WorkerWriter, hlTracker *hardlinkTracker, path string, parentIno uint64) error {
+	entries, err := os.ReadDir(path)
+	if err != nil {
+		return fmt.Errorf("failed to read directory: %v", err)
+	}
+	log.Printf("Restoring path: %s", path)
+	var subdirs int
+	var nameBytes int
+
+	for pos, entry := range entries {
+		if entry.Name() == "." || entry.Name() == ".." {
+			continue
+		}
+
+		info, err := entry.Info()
+		if err != nil {
+			return fmt.Errorf("failed to get entry info: %v", err)
+		}
+
+		stat, ok := info.Sys().(*syscall.Stat_t)
+		if !ok {
+			return fmt.Errorf("failed to get stat_t")
+		}
+		nameBytes += len(entry.Name())
+		fullPath := filepath.Join(path, entry.Name())
+
+		// Recurse into directories
+		// (the subtree is written before this directory's entry for it)
+		if info.IsDir() {
+			subdirs++
+
+			if err := restorePath(writer, hlTracker, fullPath, uint64(stat.Ino)); err != nil {
+				return err
+			}
+
+		}
+
+		// pos is the entry's index in the ReadDir result.
+		// NOTE(review): the mode passed here is Go's os.FileMode, while
+		// getFileInfo passes the raw stat.Mode -- confirm which encoding
+		// CreateEntry expects.
+		err = writer.CreateEntry(parentIno, uint64(pos), uint64(stat.Ino), uint32(info.Mode()), entry.Name())
+		if err != nil {
+			return fmt.Errorf("failed to create entry: %v", err)
+		}
+
+		// Handle inode
+		// (directories are skipped here; their inode is created at the
+		// end of their own restorePath call)
+		isHardlink := stat.Nlink > 1
+		if !info.IsDir() && hlTracker.isNewInode(uint64(stat.Ino), isHardlink) {
+			fileInfo := getFileInfo(info)
+			err = writer.CreateInode(fileInfo)
+			if err != nil {
+				return fmt.Errorf("failed to create inode: %v", err)
+			}
+
+			// Handle xattrs
+			// NOTE(review): a failure to list xattrs is silently ignored,
+			// and xattrs are never read for directories -- confirm both
+			// are intended.
+			xattrs, err := getXAttrs(fullPath)
+			if err == nil {
+				for pos, xattr := range xattrs {
+					err = writer.CreateXAttr(uint64(stat.Ino), uint64(pos), xattr)
+					if err != nil {
+						return fmt.Errorf("failed to create xattr: %v", err)
+					}
+				}
+			}
+		}
+	}
+	// Get directory info
+	dirInfo, err := os.Stat(path)
+	if err != nil {
+		return fmt.Errorf("failed to stat directory: %v", err)
+	}
+
+	// Create directory inode
+	// NOTE(review): this uses the directory's real inode number, which for
+	// the top-level call differs from the parentIno its entries were
+	// created under -- confirm how the root is meant to be numbered.
+	dirFileInfo := getFileInfo(dirInfo)
+	dirFileInfo.NrSubdirs = uint64(subdirs)
+	dirFileInfo.NameBytes = uint64(nameBytes)
+
+	return writer.CreateInode(dirFileInfo)
+}
+
+// main parses the command line, creates the master and worker writers,
+// and runs one goroutine per worker that restores opts.sourceDir.  Any
+// failure is reported on stderr and ends the process with status 1; note
+// that os.Exit does not run the deferred master.Destroy().
+func main() {
+	opts := options{}
+	flag.StringVar(&opts.metaPath, "m", "", "path to metadata device")
+	flag.StringVar(&opts.sourceDir, "s", "", "path to source directory")
+	flag.IntVar(&opts.numWorkers, "w", 4, "number of worker threads")
+	flag.Parse()
+
+	if opts.metaPath == "" || opts.sourceDir == "" {
+		flag.Usage()
+		os.Exit(1)
+	}
+
+	// Create master and worker writers
+	master, workers, err := restore.NewWriters(opts.metaPath, opts.numWorkers)
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "Failed to create writers: %v\n", err)
+		os.Exit(1)
+	}
+	defer master.Destroy()
+
+	// Create hardlink tracker
+	hlTracker := newHardlinkTracker()
+
+	// Start workers
+	var wg sync.WaitGroup
+	for i, worker := range workers {
+		wg.Add(1)
+		go func(w *restore.WorkerWriter, workerNum int) {
+			defer wg.Done()
+
+			// NOTE(review): every worker is handed the same sourceDir, so
+			// as written each one walks the whole tree rather than a
+			// subset of it -- confirm how the work is meant to be split.
+			if err := restorePath(w, hlTracker, opts.sourceDir, 1); err != nil {
+				fmt.Fprintf(os.Stderr, "Worker %d failed: %v\n", workerNum, err)
+				os.Exit(1)
+			}
+
+			// Create root inode for source directory
+			// NOTE(review): restorePath has already created an inode for
+			// sourceDir by the time it returns -- confirm this second
+			// one is wanted.
+			rootInfo, err := os.Stat(opts.sourceDir)
+			if err != nil {
+				fmt.Fprintf(os.Stderr, "Failed to stat source directory: %v\n", err)
+				os.Exit(1)
+			}
+			// a failed write must not be reported as a successful restore
+			if err := w.CreateInode(getFileInfo(rootInfo)); err != nil {
+				fmt.Fprintf(os.Stderr, "Failed to create root inode: %v\n", err)
+				os.Exit(1)
+			}
+
+			if err := w.Destroy(); err != nil {
+				fmt.Fprintf(os.Stderr, "Failed to destroy worker: %v\n", err)
+				os.Exit(1)
+			}
+		}(worker, i)
+	}
+
+	// Wait for all workers to complete
+	wg.Wait()
+
+	fmt.Println("Restore completed successfully")
+}
@@ -0,0 +1,3 @@
+module restore
+
+go 1.21.11
@@ -0,0 +1,472 @@
+package restore
+
+/*
+#cgo CFLAGS: -I${SRCDIR}/../../../utils/src -I${SRCDIR}/../../../kmod/src
+#cgo LDFLAGS: -L${SRCDIR}/../../../utils/src -l:scoutfs_parallel_restore.a -lm
+
+#include <stdlib.h>
+#include <linux/types.h>
+#include <stdbool.h>
+#include <math.h>
+#include "sparse.h"
+#include "util.h"
+#include "format.h"
+#include "parallel_restore.h"
+
+// If there are any type conflicts, you might need to add:
+// #include "kernel_types.h"
+*/
+import "C"
+import (
+    "errors"
+    "fmt"
+    "sync"
+    "syscall"
+    "unsafe"
+)
+
+// batchSize is the number of entries InsertEntry accepts before it flushes
+// the worker's buffer and sends a progress update.
+const batchSize = 1000
+// bufSize is the size in bytes of each worker's aligned buffer; it is the
+// allocation size in NewWriters and the buffer length passed to the C
+// write_buf call in writeBuffer.
+const bufSize = 2 * 1024 * 1024
+
+// WorkerWriter wraps one C parallel-restore writer plus the Go-side state
+// this package keeps for it. Instances are created by NewWriters.
+type WorkerWriter struct {
+    // C writer handle created by scoutfs_parallel_restore_create_writer.
+    writer      *C.struct_scoutfs_parallel_restore_writer
+    // Channel shared with the master; progress updates are sent on it.
+    progressCh  chan *ScoutfsParallelWriterProgress
+    // Entries inserted since the last flush; reset to 0 after each flush.
+    fileCreated int64
+    // File descriptor that writeBuffer pwrites to.
+    devFd       int
+    // Buffer allocated with posix_memalign in NewWriters, freed in Destroy.
+    buf         unsafe.Pointer
+    // Points at the master's WaitGroup; Destroy calls Done on it.
+    wg          *sync.WaitGroup
+}
+
+// MasterWriter owns the device file descriptor, the superblock buffer and
+// the slice array, and collects progress sent by its workers.
+type MasterWriter struct {
+    // C writer handle created by scoutfs_parallel_restore_create_writer.
+    writer     *C.struct_scoutfs_parallel_restore_writer
+    // Receives worker progress; drained by aggregateProgress, closed by Destroy.
+    progressCh chan *ScoutfsParallelWriterProgress
+    // Workers created by NewWriters.
+    workers    []*WorkerWriter
+    // Counts outstanding workers; Destroy waits on it.
+    wg         sync.WaitGroup
+    slice      *C.struct_scoutfs_parallel_restore_slice // C array of slices malloc'd in NewWriters, freed in Destroy
+    // Tracks the aggregateProgress goroutine.
+    progressWg sync.WaitGroup
+    // Metadata device descriptor opened in NewWriters, closed in Destroy.
+    devFd      int
+    // Aligned buffer holding the superblock read in NewWriters.
+    super      *C.struct_scoutfs_super_block
+}
+
+// ScoutfsParallelWriterProgress is the message sent from a worker to the
+// master over progressCh. Both pointers refer to C-allocated memory that
+// aggregateProgress frees after handing them to the C library.
+type ScoutfsParallelWriterProgress struct {
+    // Progress structure filled in by getProgress.
+    Progress *C.struct_scoutfs_parallel_restore_progress
+    // Optional slice; aggregateProgress skips it when nil.
+    Slice    *C.struct_scoutfs_parallel_restore_slice
+}
+
+// aggregateProgress consumes updates from progressCh until the channel is
+// closed. Each update's progress (and slice, when present) is added to the
+// master writer and the C memory backing it is then freed. Failures from
+// the C library are printed and do not stop the loop.
+func (m *MasterWriter) aggregateProgress() {
+    defer m.progressWg.Done()
+
+    for update := range m.progressCh {
+        if rc := C.scoutfs_parallel_restore_add_progress(m.writer, update.Progress); rc != 0 {
+            fmt.Printf("Failed to add progress, error code: %d\n", rc)
+        }
+
+        if sl := update.Slice; sl != nil {
+            rc := C.scoutfs_parallel_restore_add_slice(m.writer, sl)
+            // The slice is released whether or not the add succeeded.
+            C.free(unsafe.Pointer(sl))
+            if rc != 0 {
+                fmt.Printf("Failed to add slice, error code: %d\n", rc)
+            }
+        }
+
+        C.free(unsafe.Pointer(update.Progress))
+    }
+}
+
+// Destroy waits for every worker to finish, closes the progress channel,
+// waits for the aggregation goroutine to drain it, and then releases the
+// slice array, the superblock buffer, the device descriptor and the C
+// writer.
+func (m *MasterWriter) Destroy() {
+    // Workers must be done before the channel they send on is closed.
+    m.wg.Wait()
+    close(m.progressCh)
+    m.progressWg.Wait()
+
+    if sl := m.slice; sl != nil {
+        C.free(unsafe.Pointer(sl))
+    }
+    if sb := m.super; sb != nil {
+        C.free(unsafe.Pointer(sb))
+    }
+    if fd := m.devFd; fd != 0 {
+        syscall.Close(fd)
+    }
+
+    C.scoutfs_parallel_restore_destroy_writer(&m.writer)
+}
+
+// NewWriters opens the metadata device at path, validates its superblock and
+// builds one master writer plus numWriters-1 worker writers.
+//
+// numWriters counts the master too, so it must be at least 2. Slice 0 of
+// the initialised slice array is given to the master and slice i to worker i.
+// Every worker shares the master's device descriptor (which the master
+// closes in Destroy) and the master's progress channel.
+//
+// On success the progress aggregation goroutine is already running and the
+// master's WaitGroup counts one outstanding Done per worker, so each worker
+// must eventually call Destroy. On failure everything acquired so far is
+// released and a non-nil error is returned.
+func NewWriters(path string, numWriters int) (*MasterWriter, []*WorkerWriter, error) {
+    if numWriters <= 1 {
+        return nil, nil, errors.New("number of writers must be at least 2 (one master plus one or more workers)")
+    }
+
+    devFd, err := syscall.Open(path, syscall.O_DIRECT|syscall.O_RDWR|syscall.O_EXCL, 0)
+    if err != nil {
+        return nil, nil, fmt.Errorf("failed to open metadata device '%s': %v", path, err)
+    }
+
+    var masterWriter MasterWriter
+    masterWriter.progressCh = make(chan *ScoutfsParallelWriterProgress, numWriters*2)
+    masterWriter.workers = make([]*WorkerWriter, 0, numWriters-1)
+    masterWriter.devFd = devFd
+
+    // fail releases the workers built so far and then the master. The
+    // master's WaitGroup is only incremented once every worker exists, so
+    // Destroy cannot block here waiting for workers that will never run.
+    fail := func(err error) (*MasterWriter, []*WorkerWriter, error) {
+        for _, worker := range masterWriter.workers {
+            C.free(worker.buf)
+            C.scoutfs_parallel_restore_destroy_writer(&worker.writer)
+        }
+        masterWriter.workers = nil
+        masterWriter.Destroy()
+        return nil, nil, err
+    }
+
+    var ret C.int
+    // Allocate aligned memory for superblock
+    var super unsafe.Pointer
+    ret = C.posix_memalign(&super, 4096, C.SCOUTFS_BLOCK_SM_SIZE)
+    if ret != 0 {
+        return fail(fmt.Errorf("failed to allocate aligned memory for superblock: %d", ret))
+    }
+    masterWriter.super = (*C.struct_scoutfs_super_block)(super)
+
+    // Read the superblock from devFd
+    superOffset := C.SCOUTFS_SUPER_BLKNO << C.SCOUTFS_BLOCK_SM_SHIFT
+    count, err := syscall.Pread(devFd, (*[1 << 30]byte)(super)[:C.SCOUTFS_BLOCK_SM_SIZE], int64(superOffset))
+    if err != nil {
+        return fail(fmt.Errorf("failed to read superblock: %v", err))
+    }
+    if count != int(C.SCOUTFS_BLOCK_SM_SIZE) {
+        return fail(fmt.Errorf("failed to read superblock, bytes read: %d", count))
+    }
+
+    // Check if the superblock is valid.
+    if C.le64_to_cpu(masterWriter.super.flags)&C.SCOUTFS_FLAG_IS_META_BDEV == 0 {
+        return fail(errors.New("superblock is not a metadata device"))
+    }
+
+    // Create master writer
+    ret = C.scoutfs_parallel_restore_create_writer(&masterWriter.writer)
+    if ret != 0 {
+        return fail(errors.New("failed to create master writer"))
+    }
+
+    ret = C.scoutfs_parallel_restore_import_super(masterWriter.writer, masterWriter.super, C.int(devFd))
+    if ret != 0 {
+        return fail(fmt.Errorf("failed to import superblock, error code: %d", ret))
+    }
+
+    // Initialize slices for each worker
+    masterWriter.slice = (*C.struct_scoutfs_parallel_restore_slice)(C.malloc(C.size_t(numWriters) *
+        C.size_t(unsafe.Sizeof(C.struct_scoutfs_parallel_restore_slice{}))))
+    if masterWriter.slice == nil {
+        return fail(errors.New("failed to allocate slices"))
+    }
+
+    ret = C.scoutfs_parallel_restore_init_slices(masterWriter.writer,
+        masterWriter.slice,
+        C.int(numWriters))
+    if ret != 0 {
+        return fail(errors.New("failed to initialize slices"))
+    }
+
+    ret = C.scoutfs_parallel_restore_add_slice(masterWriter.writer, masterWriter.slice)
+    if ret != 0 {
+        return fail(errors.New("failed to add slice to master writer"))
+    }
+
+    // Create worker writers
+    for i := 1; i < numWriters; i++ {
+        var bufPtr unsafe.Pointer
+        if ret := C.posix_memalign(&bufPtr, 4096, bufSize); ret != 0 {
+            return fail(fmt.Errorf("failed to allocate aligned worker buffer: %d", ret))
+        }
+
+        worker := &WorkerWriter{
+            progressCh: masterWriter.progressCh,
+            // Without this the worker would pwrite to descriptor 0.
+            devFd: devFd,
+            buf:   bufPtr,
+            wg:    &masterWriter.wg,
+        }
+        ret = C.scoutfs_parallel_restore_create_writer(&worker.writer)
+        if ret != 0 {
+            C.free(bufPtr)
+            return fail(errors.New("failed to create worker writer"))
+        }
+
+        // Use each slice for the corresponding worker
+        slice := (*C.struct_scoutfs_parallel_restore_slice)(unsafe.Pointer(uintptr(unsafe.Pointer(masterWriter.slice)) +
+            uintptr(i)*unsafe.Sizeof(C.struct_scoutfs_parallel_restore_slice{})))
+        ret = C.scoutfs_parallel_restore_add_slice(worker.writer, slice)
+        if ret != 0 {
+            C.scoutfs_parallel_restore_destroy_writer(&worker.writer)
+            C.free(bufPtr)
+            return fail(errors.New("failed to add slice to worker writer"))
+        }
+
+        masterWriter.workers = append(masterWriter.workers, worker)
+    }
+
+    // Only now that nothing can fail do the workers count against the
+    // master's WaitGroup.
+    masterWriter.wg.Add(len(masterWriter.workers))
+    masterWriter.progressWg.Add(1)
+    go masterWriter.aggregateProgress()
+
+    return &masterWriter, masterWriter.workers, nil
+}
+
+func (w *WorkerWriter) getProgress(withSlice bool) (*ScoutfsParallelWriterProgress, error) {
+    progress := (*C.struct_scoutfs_parallel_restore_progress)(
+        C.malloc(C.size_t(unsafe.Sizeof(C.struct_scoutfs_parallel_restore_progress{}))),
+    )
+    if progress == nil {
+        return nil, errors.New("failed to allocate memory for progress")
+    }
+
+    // Fetch the current progress from the C library
+    ret := C.scoutfs_parallel_restore_get_progress(w.writer, progress)
+    if ret != 0 {
+        C.free(unsafe.Pointer(progress))
+        return nil, fmt.Errorf("failed to get progress, error code: %d", ret)
+    }
+
+    var slice *C.struct_scoutfs_parallel_restore_slice
+    if withSlice {
+        slice = (*C.struct_scoutfs_parallel_restore_slice)(
+            C.malloc(C.size_t(unsafe.Sizeof(C.struct_scoutfs_parallel_restore_slice{}))),
+        )
+        if slice == nil {
+            C.free(unsafe.Pointer(progress))
+            return nil, errors.New("failed to allocate memory for slice")
+        }
+
+        // Optionally fetch the slice information
+        ret = C.scoutfs_parallel_restore_get_slice(w.writer, slice)
+        if ret != 0 {
+            C.free(unsafe.Pointer(progress))
+            C.free(unsafe.Pointer(slice))
+            return nil, fmt.Errorf("failed to get slice, error code: %d", ret)
+        }
+    }
+
+    return &ScoutfsParallelWriterProgress{
+        Progress: progress,
+        Slice:    slice,
+    }, nil
+}
+
+// writeBuffer repeatedly asks scoutfs_parallel_restore_write_buf to fill the
+// worker's buffer and pwrites each filled region to devFd at the offset the
+// library reports. It stops when the library reports a zero-length region
+// and returns the total number of bytes written. A short pwrite is treated
+// as an error.
+func (w *WorkerWriter) writeBuffer() (int64, error) {
+    var (
+        written int64
+        length  int64
+        offset  int64
+    )
+
+    // The C call writes its results straight into the Go variables above.
+    offPtr := (*C.off_t)(unsafe.Pointer(&offset))
+    lenPtr := (*C.size_t)(unsafe.Pointer(&length))
+
+    for {
+        rc := C.scoutfs_parallel_restore_write_buf(w.writer, w.buf,
+            C.size_t(bufSize), offPtr, lenPtr)
+        if rc != 0 {
+            return written, fmt.Errorf("failed to write buffer: error code %d", rc)
+        }
+
+        switch {
+        case length == 0:
+            return written, nil
+        case length > 0:
+            n, err := syscall.Pwrite(w.devFd, unsafe.Slice((*byte)(w.buf), length), offset)
+            if err != nil {
+                return written, fmt.Errorf("pwrite failed: %v", err)
+            }
+            if n != int(length) {
+                return written, fmt.Errorf("pwrite wrote %d bytes; expected %d", n, length)
+            }
+            written += int64(n)
+        }
+    }
+}
+
+// InsertEntry adds entry to the worker's C writer. Once batchSize entries
+// have accumulated it flushes the buffer to the device, sends a progress
+// update (without a slice) on the shared channel and resets the counter.
+func (w *WorkerWriter) InsertEntry(entry *C.struct_scoutfs_parallel_restore_entry) error {
+    if rc := C.scoutfs_parallel_restore_add_entry(w.writer, entry); rc != 0 {
+        return fmt.Errorf("failed to add entry, error code: %d", rc)
+    }
+
+    w.fileCreated++
+    if w.fileCreated < batchSize {
+        // Batch not full yet; nothing to flush.
+        return nil
+    }
+
+    if _, err := w.writeBuffer(); err != nil {
+        return fmt.Errorf("error writing buffers: %v", err)
+    }
+
+    update, err := w.getProgress(false)
+    if err != nil {
+        return err
+    }
+
+    w.progressCh <- update
+    w.fileCreated = 0
+    return nil
+}
+
+// InsertXattr hands xattr to the worker's C writer and reports a non-zero
+// return code as an error.
+func (w *WorkerWriter) InsertXattr(xattr *C.struct_scoutfs_parallel_restore_xattr) error {
+    if rc := C.scoutfs_parallel_restore_add_xattr(w.writer, xattr); rc != 0 {
+        return fmt.Errorf("failed to add xattr, error code: %d", rc)
+    }
+    return nil
+}
+
+// InsertInode hands inode to the worker's C writer and reports a non-zero
+// return code as an error.
+func (w *WorkerWriter) InsertInode(inode *C.struct_scoutfs_parallel_restore_inode) error {
+    if rc := C.scoutfs_parallel_restore_add_inode(w.writer, inode); rc != 0 {
+        return fmt.Errorf("failed to add inode, error code: %d", rc)
+    }
+    return nil
+}
+
+// should only be called once
+func (w *WorkerWriter) Destroy() error {
+    defer w.wg.Done()
+    // Send final progress if there are remaining entries
+    if w.fileCreated > 0 {
+        _, err := w.writeBuffer()
+        if err != nil {
+            return err
+        }
+        progress := &ScoutfsParallelWriterProgress{
+            Progress: (*C.struct_scoutfs_parallel_restore_progress)(C.malloc(C.size_t(unsafe.Sizeof(C.struct_scoutfs_parallel_restore_progress{})))),
+            Slice:    (*C.struct_scoutfs_parallel_restore_slice)(C.malloc(C.size_t(unsafe.Sizeof(C.struct_scoutfs_parallel_restore_slice{})))),
+        }
+        w.progressCh <- progress
+        w.fileCreated = 0
+    }
+
+    if w.buf != nil {
+        C.free(w.buf)
+        w.buf = nil
+    }
+
+    C.scoutfs_parallel_restore_destroy_writer(&w.writer)
+    return nil
+}
+
+// FileInfo and XAttr describe inodes and extended attributes in Go terms; the Create* methods below translate them into the C structures passed to the library.
+
+// FileInfo carries the inode attributes that CreateInode copies into the C
+// inode structure.
+type FileInfo struct {
+    Ino       uint64
+    Mode      uint32
+    Uid       uint32
+    Gid       uint32
+    Size      uint64
+    Rdev      uint64
+    AtimeSec  int64
+    AtimeNsec int64
+    MtimeSec  int64
+    MtimeNsec int64
+    // CreateInode also uses the ctime values for the inode's crtime.
+    CtimeSec  int64
+    CtimeNsec int64
+    // NrSubdirs and NameBytes are only copied when IsDir is true.
+    NrSubdirs uint64
+    NameBytes uint64
+    IsDir     bool
+    // A regular file with Size > 0 is marked offline by CreateInode.
+    IsRegular bool
+}
+
+// XAttr is one extended attribute; CreateXAttr passes its name and value,
+// with their lengths, to the C library.
+type XAttr struct {
+    Name  string
+    Value []byte
+}
+
+// CreateInode builds a C inode structure from info and adds it to the
+// worker's writer via InsertInode.
+//
+// The structure is zero-allocated so that every field not assigned below
+// (offline for non-regular or empty files, the directory counters for
+// non-directories, and any field this function does not know about) is
+// handed to the library as zero rather than as uninitialised heap contents.
+// crtime is set to the same value as ctime. The temporary structure is
+// freed before returning.
+func (w *WorkerWriter) CreateInode(info FileInfo) error {
+    inode := (*C.struct_scoutfs_parallel_restore_inode)(C.calloc(1, C.size_t(unsafe.Sizeof(C.struct_scoutfs_parallel_restore_inode{}))))
+    if inode == nil {
+        return fmt.Errorf("failed to allocate inode")
+    }
+    defer C.free(unsafe.Pointer(inode))
+
+    inode.ino = C.__u64(info.Ino)
+    inode.mode = C.__u32(info.Mode)
+    inode.uid = C.__u32(info.Uid)
+    inode.gid = C.__u32(info.Gid)
+    inode.size = C.__u64(info.Size)
+    inode.rdev = C.uint(info.Rdev)
+
+    inode.atime.tv_sec = C.__time_t(info.AtimeSec)
+    inode.atime.tv_nsec = C.long(info.AtimeNsec)
+    inode.mtime.tv_sec = C.__time_t(info.MtimeSec)
+    inode.mtime.tv_nsec = C.long(info.MtimeNsec)
+    inode.ctime.tv_sec = C.__time_t(info.CtimeSec)
+    inode.ctime.tv_nsec = C.long(info.CtimeNsec)
+    inode.crtime = inode.ctime
+
+    // Regular files with contents are restored as offline.
+    if info.IsRegular && info.Size > 0 {
+        inode.offline = C.bool(true)
+    }
+
+    if info.IsDir {
+        inode.nr_subdirs = C.__u64(info.NrSubdirs)
+        inode.total_entry_name_bytes = C.__u64(info.NameBytes)
+    }
+
+    return w.InsertInode(inode)
+}
+
+// CreateEntry builds a C directory entry linking name in directory dirIno
+// to inode ino, copies the name into C memory, and adds the entry through
+// InsertEntry. Both temporary C allocations are freed before returning.
+func (w *WorkerWriter) CreateEntry(dirIno uint64, pos uint64, ino uint64, mode uint32, name string) error {
+    nameLen := len(name)
+
+    allocSize := C.size_t(unsafe.Sizeof(C.struct_scoutfs_parallel_restore_entry{})) + C.size_t(nameLen)
+    ent := (*C.struct_scoutfs_parallel_restore_entry)(C.malloc(allocSize))
+    if ent == nil {
+        return fmt.Errorf("failed to allocate entry")
+    }
+    defer C.free(unsafe.Pointer(ent))
+
+    ent.dir_ino = C.__u64(dirIno)
+    ent.pos = C.__u64(pos)
+    ent.ino = C.__u64(ino)
+    ent.mode = C.__u32(mode)
+    ent.name_len = C.uint(nameLen)
+
+    // The name lives in its own C buffer referenced from the entry.
+    nameBuf := C.malloc(C.size_t(nameLen))
+    ent.name = (*C.char)(nameBuf)
+    if nameBuf == nil {
+        return fmt.Errorf("failed to allocate entry name")
+    }
+    defer C.free(nameBuf)
+    copy(unsafe.Slice((*byte)(nameBuf), nameLen), name)
+
+    return w.InsertEntry(ent)
+}
+
+// CreateXAttr builds a C xattr structure for inode ino at position pos and
+// adds it to the worker's writer via InsertXattr.
+//
+// The name and value are both copied into C memory for the duration of the
+// call: storing a pointer into the Go slice inside a C structure breaks the
+// cgo pointer-passing rules, and indexing Value[0] panics for an empty
+// value. An empty value is passed as a nil pointer with value_len 0. The
+// structure is zero-allocated so unassigned fields are zero. All temporary
+// C allocations are freed before returning.
+func (w *WorkerWriter) CreateXAttr(ino uint64, pos uint64, xattr XAttr) error {
+    xattrC := (*C.struct_scoutfs_parallel_restore_xattr)(C.calloc(1, C.size_t(unsafe.Sizeof(C.struct_scoutfs_parallel_restore_xattr{}))))
+    if xattrC == nil {
+        return fmt.Errorf("failed to allocate xattr")
+    }
+    defer C.free(unsafe.Pointer(xattrC))
+
+    xattrC.ino = C.__u64(ino)
+    xattrC.pos = C.__u64(pos)
+    xattrC.name_len = C.uint(len(xattr.Name))
+    xattrC.value_len = C.__u32(len(xattr.Value))
+
+    // C.CString appends a NUL byte; name_len above still carries the real
+    // length.
+    xattrC.name = C.CString(xattr.Name)
+    defer C.free(unsafe.Pointer(xattrC.name))
+
+    if len(xattr.Value) > 0 {
+        valueC := C.CBytes(xattr.Value)
+        defer C.free(valueC)
+        xattrC.value = valueC
+    }
+
+    return w.InsertXattr(xattrC)
+}
@@ -0,0 +1,10 @@
+package restore
+
+import "testing"
+
+// TestNewWriters pins the argument checks of NewWriters that can be
+// exercised without a real metadata device.
+//
+// The previous version expected NewWriters("/tmp", 2) to succeed, which it
+// cannot: /tmp is a directory and opening it read-write fails.
+// NOTE(review): a success-path test needs a formatted scoutfs metadata
+// device; add one once the harness can provide it.
+func TestNewWriters(t *testing.T) {
+	// A single writer leaves no worker next to the master.
+	if _, _, err := NewWriters("/tmp", 1); err == nil {
+		t.Fatalf("expected an error for fewer than two writers")
+	}
+
+	// A directory is not a metadata device.
+	master, workers, err := NewWriters("/tmp", 2)
+	if err == nil {
+		t.Fatalf("expected an error when opening a directory as the metadata device")
+	}
+	if master != nil || workers != nil {
+		t.Fatalf("expected nil writers on failure, got %v and %v", master, workers)
+	}
+}
@@ -0,0 +1 @@
+v2022.05.01-2-g787cd20
@@ -13,7 +13,9 @@ BIN := src/createmany			\
 	src/create_xattr_loop		\
 	src/fragmented_data_extents	\
 	src/o_tmpfile_umask		\
-	src/o_tmpfile_linkat
+	src/o_tmpfile_linkat		\
+	src/parallel_restore		\
+	src/restore_copy

 DEPS := $(wildcard src/*.d)

@@ -23,8 +25,12 @@ ifneq ($(DEPS),)
 -include $(DEPS)
 endif

+src/parallel_restore_cflags := ../utils/src/scoutfs_parallel_restore.a -lm
+src/restore_copy_cflags := ../utils/src/scoutfs_parallel_restore.a -lm
+
 $(BIN): %: %.c Makefile
-	gcc $(CFLAGS) -MD -MP -MF $*.d $< -o $@
+	gcc $(CFLAGS) -MD -MP -MF $*.d $< -o $@ $($(@)_cflags)
+

 .PHONY: clean
 clean:
@@ -7,8 +7,9 @@ t_status_msg()
 export T_PASS_STATUS=100
 export T_SKIP_STATUS=101
 export T_FAIL_STATUS=102
+export T_SKIP_PERMITTED_STATUS=103
 export T_FIRST_STATUS="$T_PASS_STATUS"
-export T_LAST_STATUS="$T_FAIL_STATUS"
+export T_LAST_STATUS="$T_SKIP_PERMITTED_STATUS"

 t_pass()
 {
@@ -21,6 +22,17 @@ t_skip()
 	exit $T_SKIP_STATUS
 }

+#
+# Print a status message via t_status_msg and exit the test with
+# T_SKIP_PERMITTED_STATUS.
+#
+# This exit code is *reserved* for tests that are up-front never going to work
+# in certain cases. This should be expressly documented per-case and made
+# abundantly clear before merging. The test itself should document its case.
+#
+t_skip_permitted()
+{
+	t_status_msg "$@"
+	exit $T_SKIP_PERMITTED_STATUS
+}
+
 t_fail()
 {
 	t_status_msg "$@"
@@ -143,14 +143,23 @@ t_filter_dmesg()
 	# change-devices causes loop device resizing
 	re="$re|loop: module loaded"
 	re="$re|loop[0-9].* detected capacity change from.*"
+	re="$re|dm-[0-9].* detected capacity change from.*"

 	# ignore systemd-journal rotating
 	re="$re|systemd-journald.*"

+	# process accounting can be noisy
+	re="$re|Process accounting resumed.*"
+
 	# format vers back/compat tries bad mounts
 	re="$re|scoutfs .* error.*outside of supported version.*"
 	re="$re|scoutfs .* error.*could not get .*super.*"

+	# ignore "unsafe core pattern" when xfstests tries to disable cores
+	re="$re|Unsafe core_pattern used with fs.suid_dumpable=2.*"
+	re="$re|Pipe handler or fully qualified core dump path required.*"
+	re="$re|Set kernel.core_pattern before fs.suid_dumpable.*"
+
 	egrep -v "($re)" | \
 		ignore_harmless_unwind_kasan_stack_oob
 }
@@ -0,0 +1,155 @@
+== setup test directory
+== getfacl
+directory drwxr-xr-x 0 0 0 '.'
+# file: .
+# owner: root
+# group: root
+user::rwx
+group::r-x
+other::r-x
+
+== basic non-acl access through permissions
+directory drwxr-xr-x 0 44444 0 'dir-testuid'
+touch: cannot touch 'dir-testuid/file-group-write': Permission denied
+touch: cannot touch 'symlinkdir-testuid/symlink-file-group-write': Permission denied
+regular empty file -rw-r--r-- 22222 44444 0 'dir-testuid/file-group-write'
+regular empty file -rw-r--r-- 22222 44444 0 'symlinkdir-testuid/symlink-file-group-write'
+== basic acl access
+directory drwxr-xr-x 0 0 0 'dir-root'
+touch: cannot touch 'dir-root/file-group-write': Permission denied
+touch: cannot touch 'symlinkdir-root/file-group-write': Permission denied
+# file: dir-root
+# owner: root
+# group: root
+user::rwx
+user:22222:rwx
+group::r-x
+mask::rwx
+other::r-x
+
+regular empty file -rw-r--r-- 22222 0 0 'dir-root/file-group-write'
+regular empty file -rw-r--r-- 22222 0 0 'symlinkdir-root/file-group-write'
+== directory exec
+Success
+Success
+# file: dir-root
+# owner: root
+# group: root
+user::rwx
+user:22222:rw-
+group::r-x
+mask::rwx
+other::r-x
+
+Failed
+Failed
+# file: dir-root
+# owner: root
+# group: root
+user::rwx
+user:22222:rw-
+group::r-x
+group:44444:rwx
+mask::rwx
+other::r-x
+
+Success
+Success
+== get/set attr
+regular empty file -rw-r--r-- 0 0 0 'file-root'
+setfattr: file-root: Permission denied
+# file: file-root
+# owner: root
+# group: root
+user::rw-
+user:22222:rw-
+group::r--
+mask::rw-
+other::r--
+
+# file: file-root
+user.test2="Success"
+
+# file: file-root
+# owner: root
+# group: root
+user::rw-
+group::r--
+mask::r--
+other::r--
+
+setfattr: file-root: Permission denied
+# file: file-root
+user.test2="Success"
+
+# file: file-root
+# owner: root
+# group: root
+user::rw-
+group::r--
+group:44444:rw-
+mask::rw-
+other::r--
+
+# file: file-root
+user.test2="Success"
+user.test4="Success"
+
+== inheritance / default acl
+directory drwxr-xr-x 0 0 0 'dir-root2'
+mkdir: cannot create directory 'dir-root2/dir': Permission denied
+touch: cannot touch 'dir-root2/dir/file': No such file or directory
+# file: dir-root2
+# owner: root
+# group: root
+user::rwx
+group::r-x
+other::r-x
+default:user::rwx
+default:user:22222:rwx
+default:group::r-x
+default:mask::rwx
+default:other::r-x
+
+mkdir: cannot create directory 'dir-root2/dir': Permission denied
+touch: cannot touch 'dir-root2/dir/file': No such file or directory
+# file: dir-root2
+# owner: root
+# group: root
+user::rwx
+user:22222:rwx
+group::r-x
+mask::rwx
+other::r-x
+default:user::rwx
+default:user:22222:rwx
+default:group::r-x
+default:mask::rwx
+default:other::r-x
+
+directory drwxrwxr-x 22222 0 4 'dir-root2/dir'
+# file: dir-root2/dir
+# owner: 22222
+# group: root
+user::rwx
+user:22222:rwx
+group::r-x
+mask::rwx
+other::r-x
+default:user::rwx
+default:user:22222:rwx
+default:group::r-x
+default:mask::rwx
+default:other::r-x
+
+regular empty file -rw-rw-r-- 22222 0 0 'dir-root2/dir/file'
+# file: dir-root2/dir/file
+# owner: 22222
+# group: root
+user::rw-
+user:22222:rwx	#effective:rw-
+group::r-x	#effective:r--
+mask::rw-
+other::r--
+
+== cleanup
@@ -56,3 +56,4 @@ mv: cannot move '/mnt/test/test/basic-posix-consistency/dir/c/clobber' to '/mnt/
 == inode indexes match after removing and syncing
 == concurrent creates make one file
 one-file
+== cleanup
@@ -25,3 +25,4 @@ rc: 0
 equal_prepared
 large_prepared
 resized larger test rc: 0
+== cleanup
@@ -1,29 +1,29 @@
 == initial writes smaller than prealloc grow to prealloc size
-/mnt/test/test/data-prealloc/file-1: 7 extents found
-/mnt/test/test/data-prealloc/file-2: 7 extents found
+/mnt/test/test/data-prealloc/file-1: extents: 7
+/mnt/test/test/data-prealloc/file-2: extents: 7
 == larger files get full prealloc extents
-/mnt/test/test/data-prealloc/file-1: 9 extents found
-/mnt/test/test/data-prealloc/file-2: 9 extents found
+/mnt/test/test/data-prealloc/file-1: extents: 9
+/mnt/test/test/data-prealloc/file-2: extents: 9
 == non-streaming writes with contig have per-block extents
-/mnt/test/test/data-prealloc/file-1: 32 extents found
-/mnt/test/test/data-prealloc/file-2: 32 extents found
+/mnt/test/test/data-prealloc/file-1: extents: 32
+/mnt/test/test/data-prealloc/file-2: extents: 32
 == any writes to region prealloc get full extents
-/mnt/test/test/data-prealloc/file-1: 4 extents found
-/mnt/test/test/data-prealloc/file-2: 4 extents found
-/mnt/test/test/data-prealloc/file-1: 4 extents found
-/mnt/test/test/data-prealloc/file-2: 4 extents found
+/mnt/test/test/data-prealloc/file-1: extents: 4
+/mnt/test/test/data-prealloc/file-2: extents: 4
+/mnt/test/test/data-prealloc/file-1: extents: 4
+/mnt/test/test/data-prealloc/file-2: extents: 4
 == streaming offline writes get full extents either way
-/mnt/test/test/data-prealloc/file-1: 4 extents found
-/mnt/test/test/data-prealloc/file-2: 4 extents found
-/mnt/test/test/data-prealloc/file-1: 4 extents found
-/mnt/test/test/data-prealloc/file-2: 4 extents found
+/mnt/test/test/data-prealloc/file-1: extents: 4
+/mnt/test/test/data-prealloc/file-2: extents: 4
+/mnt/test/test/data-prealloc/file-1: extents: 4
+/mnt/test/test/data-prealloc/file-2: extents: 4
 == goofy preallocation amounts work
-/mnt/test/test/data-prealloc/file-1: 5 extents found
-/mnt/test/test/data-prealloc/file-2: 5 extents found
-/mnt/test/test/data-prealloc/file-1: 5 extents found
-/mnt/test/test/data-prealloc/file-2: 5 extents found
-/mnt/test/test/data-prealloc/file-1: 3 extents found
-/mnt/test/test/data-prealloc/file-2: 3 extents found
+/mnt/test/test/data-prealloc/file-1: extents: 6
+/mnt/test/test/data-prealloc/file-2: extents: 6
+/mnt/test/test/data-prealloc/file-1: extents: 6
+/mnt/test/test/data-prealloc/file-2: extents: 6
+/mnt/test/test/data-prealloc/file-1: extents: 3
+/mnt/test/test/data-prealloc/file-2: extents: 3
 == block writes into region allocs hole
 wrote blk 24
 wrote blk 32
@@ -0,0 +1,28 @@
+== simple mkfs/restore/mount
+committed_seq     1120
+total_meta_blocks 163840
+total_data_blocks 15728640
+   1440    1440   57120
+     80      80     400
+0: offset: 0 length: 1 flags: O.L
+extents: 1
+0: offset: 0 length: 1 flags: O.L
+extents: 1
+0: offset: 0 length: 1 flags: O.L
+extents: 1
+0: offset: 0 length: 1 flags: O.L
+extents: 1
+    Type  Size     Total   Used      Free  Use%  
+MetaData  64KB    163840  34722    129118    21  
+    Data   4KB  15728640     64  15728576     0  
+  7 13,L,- 15,L,- 17,L,- I 33 -
+== just under ENOSPC
+    Type  Size     Total    Used      Free  Use%  
+MetaData  64KB    163840  155666      8174    95  
+    Data   4KB  15728640      64  15728576     0  
+== just over ENOSPC
+== ENOSPC
+== attempt to restore data device
+== attempt format_v1 restore
+== test if previously mounted
+== cleanup
@@ -38,3 +38,4 @@ dd: error writing '/mnt/test/test/quota/dir/file': Disk quota exceeded
 fallocate: fallocate failed: Disk quota exceeded
 == added rules work after bulk restore
 touch: cannot touch '/mnt/test/test/quota/dir/file': Disk quota exceeded
+== cleanup
@@ -22,10 +22,8 @@ scoutfs: setattr failed: Invalid argument (22)
 == large ctime is set
 1972-02-19 00:06:25.999999999 +0000
 == large offline extents are created
-Filesystem type is: 554f4353
-File size of /mnt/test/test/setattr_more/file is 40988672 (10007 blocks of 4096 bytes)
- ext:     logical_offset:        physical_offset: length:   expected: flags:
-   0:        0..   10006:          0..     10006:  10007:             unknown,eof
-/mnt/test/test/setattr_more/file: 1 extent found
+0: offset: 0 0 length: 10007 flags: O.L
+extents: 1
 == correct offline extent length
 976563
+== omitting data_version should not fail
@@ -1,5 +1,9 @@
 == create/release/stage single block file
+0: offset: 0 0 length: 1 flags: O.L
+extents: 1
 == create/release/stage larger file
+0: offset: 0 0 length: 4096 flags: O.L
+extents: 1
 == multiple release,drop_cache,stage cycles
 == release+stage shouldn't change stat, data seq or vers
 == stage does change meta_seq
@@ -7,16 +11,22 @@
 stage: must provide file version with --data-version
 Try `stage --help' or `stage --usage' for more information.
 == wrapped region fails
-stage returned -1, not 4096: error Invalid argument (22)
+stage returned -1, not 8192: error Invalid argument (22)
 scoutfs: stage failed: Input/output error (5)
 == non-block aligned offset fails
 stage returned -1, not 4095: error Invalid argument (22)
 scoutfs: stage failed: Input/output error (5)
+0: offset: 0 0 length: 1 flags: O.L
+extents: 1
 == non-block aligned len within block fails
 stage returned -1, not 1024: error Invalid argument (22)
 scoutfs: stage failed: Input/output error (5)
+0: offset: 0 0 length: 1 flags: O.L
+extents: 1
 == partial final block that writes to i_size does work
 == zero length stage doesn't bring blocks online
+0: offset: 0 0 length: 100 flags: O.L
+extents: 1
 == stage of non-regular file fails
 ioctl failed: Inappropriate ioctl for device (25)
 stage: must provide file version with --data-version
@@ -1,2 +1,3 @@
 == create initial files
 == race stage and release
+== cleanup
@@ -5,24 +5,44 @@ generic/004
 generic/005
 generic/006
 generic/007
+generic/008
+generic/009
 generic/011
+generic/012
 generic/013
 generic/014
+generic/015
+generic/016
+generic/018
 generic/020
+generic/021
+generic/022
 generic/023
 generic/024
+generic/025
+generic/026
 generic/028
+generic/031
 generic/032
+generic/033
 generic/034
 generic/035
 generic/037
 generic/039
 generic/040
 generic/041
+generic/050
+generic/052
 generic/053
 generic/056
 generic/057
+generic/058
+generic/059
+generic/060
+generic/061
 generic/062
+generic/063
+generic/064
 generic/065
 generic/066
 generic/067
@@ -31,42 +51,193 @@ generic/070
 generic/071
 generic/073
 generic/076
+generic/078
+generic/079
+generic/081
+generic/082
 generic/084
 generic/086
 generic/087
 generic/088
 generic/090
+generic/091
 generic/092
+generic/094
+generic/096
+generic/097
 generic/098
+generic/099
 generic/101
 generic/104
 generic/105
 generic/106
 generic/107
+generic/110
+generic/111
+generic/113
+generic/114
+generic/115
+generic/116
 generic/117
+generic/118
+generic/119
+generic/121
+generic/122
+generic/123
 generic/124
+generic/128
 generic/129
+generic/130
 generic/131
+generic/134
+generic/135
+generic/136
+generic/138
+generic/139
+generic/140
+generic/142
+generic/143
+generic/144
+generic/145
+generic/146
+generic/147
+generic/148
+generic/149
+generic/150
+generic/151
+generic/152
+generic/153
+generic/154
+generic/155
+generic/156
+generic/157
+generic/158
+generic/159
+generic/160
+generic/161
+generic/162
+generic/163
 generic/169
+generic/171
+generic/172
+generic/173
+generic/174
+generic/177
+generic/178
+generic/179
+generic/180
+generic/181
+generic/182
+generic/183
 generic/184
+generic/185
+generic/188
+generic/189
+generic/190
+generic/191
+generic/193
+generic/194
+generic/195
+generic/196
+generic/197
+generic/198
+generic/199
+generic/200
+generic/201
+generic/202
+generic/203
+generic/205
+generic/206
+generic/207
+generic/210
+generic/211
+generic/212
+generic/214
+generic/216
+generic/217
+generic/218
+generic/219
+generic/220
 generic/221
+generic/222
+generic/223
+generic/225
+generic/227
 generic/228
+generic/229
+generic/230
+generic/235
 generic/236
 generic/237
+generic/238
+generic/240
+generic/244
 generic/245
 generic/249
+generic/250
+generic/252
+generic/253
+generic/254
+generic/255
+generic/256
 generic/257
 generic/258
+generic/259
+generic/260
+generic/261
+generic/262
+generic/263
+generic/264
+generic/265
+generic/266
+generic/267
+generic/268
+generic/271
+generic/272
+generic/276
+generic/277
+generic/278
+generic/279
+generic/281
+generic/282
+generic/283
+generic/284
 generic/286
+generic/287
+generic/288
+generic/289
+generic/290
+generic/291
+generic/292
+generic/293
 generic/294
+generic/295
+generic/296
+generic/301
+generic/302
+generic/303
+generic/304
+generic/305
 generic/306
 generic/307
 generic/308
 generic/309
+generic/312
 generic/313
+generic/314
 generic/315
+generic/316
+generic/317
 generic/319
 generic/322
+generic/324
+generic/326
+generic/327
+generic/328
+generic/329
+generic/330
+generic/331
+generic/332
 generic/335
 generic/336
 generic/337
@@ -74,10 +245,255 @@ generic/341
 generic/342
 generic/343
 generic/348
+generic/353
+generic/355
+generic/358
+generic/359
 generic/360
+generic/361
+generic/362
+generic/363
+generic/364
+generic/365
+generic/366
+generic/367
+generic/368
+generic/369
+generic/370
+generic/371
+generic/372
+generic/373
+generic/374
 generic/375
 generic/376
 generic/377
+generic/378
+generic/379
+generic/380
+generic/381
+generic/382
+generic/383
+generic/384
+generic/385
+generic/386
+generic/389
+generic/391
+generic/392
+generic/393
+generic/394
+generic/395
+generic/396
+generic/397
+generic/398
+generic/400
+generic/401
+generic/402
+generic/403
+generic/404
+generic/406
+generic/407
+generic/408
+generic/412
+generic/413
+generic/414
+generic/417
+generic/419
+generic/420
+generic/421
+generic/422
+generic/424
+generic/425
+generic/426
+generic/427
+generic/436
+generic/439
+generic/440
+generic/443
+generic/445
+generic/446
+generic/448
+generic/449
+generic/450
+generic/451
+generic/453
+generic/454
+generic/456
+generic/458
+generic/460
+generic/462
+generic/463
+generic/465
+generic/466
+generic/468
+generic/469
+generic/470
+generic/471
+generic/474
+generic/477
+generic/478
+generic/479
+generic/480
+generic/481
+generic/483
+generic/485
+generic/486
+generic/487
+generic/488
+generic/489
+generic/490
+generic/491
+generic/492
+generic/498
+generic/499
+generic/501
+generic/502
+generic/503
+generic/504
+generic/505
+generic/506
+generic/507
+generic/508
+generic/509
+generic/510
+generic/511
+generic/512
+generic/513
+generic/514
+generic/515
+generic/516
+generic/517
+generic/518
+generic/519
+generic/520
+generic/523
+generic/524
+generic/525
+generic/526
+generic/527
+generic/528
+generic/529
+generic/530
+generic/531
+generic/533
+generic/534
+generic/535
+generic/536
+generic/537
+generic/538
+generic/539
+generic/540
+generic/541
+generic/542
+generic/543
+generic/544
+generic/545
+generic/546
+generic/547
+generic/548
+generic/549
+generic/550
+generic/552
+generic/553
+generic/555
+generic/556
+generic/557
+generic/566
+generic/567
+generic/571
+generic/572
+generic/573
+generic/574
+generic/575
+generic/576
+generic/577
+generic/578
+generic/580
+generic/581
+generic/582
+generic/583
+generic/584
+generic/586
+generic/587
+generic/588
+generic/591
+generic/592
+generic/593
+generic/594
+generic/595
+generic/596
+generic/597
+generic/598
+generic/599
+generic/600
+generic/601
+generic/602
+generic/603
+generic/604
+generic/605
+generic/606
+generic/607
+generic/608
+generic/609
+generic/610
+generic/611
+generic/612
+generic/613
+generic/618
+generic/621
+generic/623
+generic/624
+generic/625
+generic/626
+generic/628
+generic/629
+generic/630
+generic/632
+generic/634
+generic/635
+generic/637
+generic/639
+generic/640
+generic/644
+generic/645
+generic/646
+generic/647
+generic/651
+generic/652
+generic/653
+generic/654
+generic/655
+generic/657
+generic/658
+generic/659
+generic/660
+generic/661
+generic/662
+generic/663
+generic/664
+generic/665
+generic/666
+generic/667
+generic/668
+generic/669
+generic/673
+generic/674
+generic/675
+generic/676
+generic/677
+generic/678
+generic/679
+generic/680
+generic/681
+generic/682
+generic/683
+generic/684
+generic/685
+generic/686
+generic/687
+generic/688
+generic/689
+shared/002
+shared/032
 Not
 run:
 generic/008
@@ -251,8 +667,6 @@ generic/331
 generic/332
 generic/353
 generic/355
-generic/356
-generic/357
 generic/358
 generic/359
 generic/361
@@ -278,11 +692,174 @@ generic/383
 generic/384
 generic/385
 generic/386
-shared/001
+generic/391
+generic/392
+generic/395
+generic/396
+generic/397
+generic/398
+generic/400
+generic/402
+generic/404
+generic/406
+generic/407
+generic/408
+generic/412
+generic/413
+generic/414
+generic/417
+generic/419
+generic/420
+generic/421
+generic/422
+generic/424
+generic/425
+generic/427
+generic/439
+generic/440
+generic/446
+generic/449
+generic/450
+generic/451
+generic/453
+generic/454
+generic/456
+generic/458
+generic/462
+generic/463
+generic/465
+generic/466
+generic/468
+generic/469
+generic/470
+generic/471
+generic/474
+generic/485
+generic/487
+generic/488
+generic/491
+generic/492
+generic/499
+generic/501
+generic/503
+generic/505
+generic/506
+generic/507
+generic/508
+generic/511
+generic/513
+generic/514
+generic/515
+generic/516
+generic/517
+generic/518
+generic/519
+generic/520
+generic/528
+generic/530
+generic/536
+generic/537
+generic/538
+generic/539
+generic/540
+generic/541
+generic/542
+generic/543
+generic/544
+generic/545
+generic/546
+generic/548
+generic/549
+generic/550
+generic/552
+generic/553
+generic/555
+generic/556
+generic/566
+generic/567
+generic/572
+generic/573
+generic/574
+generic/575
+generic/576
+generic/577
+generic/578
+generic/580
+generic/581
+generic/582
+generic/583
+generic/584
+generic/586
+generic/587
+generic/588
+generic/591
+generic/592
+generic/593
+generic/594
+generic/595
+generic/596
+generic/597
+generic/598
+generic/599
+generic/600
+generic/601
+generic/602
+generic/603
+generic/605
+generic/606
+generic/607
+generic/608
+generic/609
+generic/610
+generic/612
+generic/613
+generic/621
+generic/623
+generic/624
+generic/625
+generic/626
+generic/628
+generic/629
+generic/630
+generic/635
+generic/644
+generic/645
+generic/646
+generic/647
+generic/651
+generic/652
+generic/653
+generic/654
+generic/655
+generic/657
+generic/658
+generic/659
+generic/660
+generic/661
+generic/662
+generic/663
+generic/664
+generic/665
+generic/666
+generic/667
+generic/668
+generic/669
+generic/673
+generic/674
+generic/675
+generic/677
+generic/678
+generic/679
+generic/680
+generic/681
+generic/682
+generic/683
+generic/684
+generic/685
+generic/686
+generic/687
+generic/688
+generic/689
 shared/002
-shared/003
-shared/004
 shared/032
-shared/051
-shared/289
-Passed all 79 tests
+Passed all 495 tests
@@ -515,6 +515,7 @@ msg "running tests"
 passed=0
 skipped=0
 failed=0
+skipped_permitted=0
 for t in $tests; do
 	# tests has basenames from sequence, get path and name
 	t="tests/$t"
@@ -557,6 +558,9 @@ for t in $tests; do

 	printf "  %-30s $stats" "$test_name"

+	# mark in dmesg as to what test we are running
+	echo "run scoutfs test $test_name" > /dev/kmsg
+
 	# record dmesg before
 	dmesg | t_filter_dmesg > "$T_TMPDIR/dmesg.before"

@@ -618,6 +622,10 @@ for t in $tests; do
 		grep -s -v "^$test_name " "$last" > "$last.tmp"
 		echo "$test_name $stats" >> "$last.tmp"
 		mv -f "$last.tmp" "$last"
+	elif [ "$sts" == "$T_SKIP_PERMITTED_STATUS" ]; then
+		echo "  [ skipped (permitted): $message ]"
+		echo "$test_name skipped (permitted) $message " >> "$T_RESULTS/skip.log"
+		((skipped_permitted++))
 	elif [ "$sts" == "$T_SKIP_STATUS" ]; then
 		echo "  [ skipped: $message ]"
 		echo "$test_name $message" >> "$T_RESULTS/skip.log"
@@ -631,7 +639,7 @@ for t in $tests; do
 	fi
 done

-msg "all tests run: $passed passed, $skipped skipped, $failed failed"
+msg "all tests run: $passed passed, $skipped skipped, $skipped_permitted skipped (permitted), $failed failed"


 if [ -n "$T_TRACE_GLOB" -o -n "$T_TRACE_PRINTK" ]; then
@@ -1,6 +1,7 @@
 export-get-name-parent.sh
 basic-block-counts.sh
 basic-bad-mounts.sh
+basic-posix-acl.sh
 inode-items-updated.sh
 simple-inode-index.sh
 simple-staging.sh
@@ -53,4 +54,5 @@ archive-light-cycle.sh
 block-stale-reads.sh
 inode-deletion.sh
 renameat2-noreplace.sh
+parallel_restore.sh
 xfstests.sh
@@ -0,0 +1,838 @@
+#define _GNU_SOURCE /* O_DIRECT */
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/xattr.h>
+#include <ctype.h>
+#include <string.h>
+#include <errno.h>
+#include <limits.h>
+#include <time.h>
+#include <sys/prctl.h>
+#include <signal.h>
+#include <sys/socket.h>
+
+#include "../../utils/src/sparse.h"
+#include "../../utils/src/util.h"
+#include "../../utils/src/list.h"
+#include "../../utils/src/parse.h"
+#include "../../kmod/src/format.h"
+#include "../../utils/src/parallel_restore.h"
+
+/*
+ * XXX:
+ *  - add a nice description of what's going on
+ *  - mention allocator contention
+ *  - test child process dying handling
+ *  - root dir entry name length is wrong
+ */
+
+#define ERRF " errno %d (%s)"
+#define ERRA errno, strerror(errno)
+
+#define error_exit(cond, fmt, args...)			\
+do {							\
+	if (cond) {					\
+		printf("error: "fmt"\n", ##args);	\
+		exit(1);				\
+	}						\
+} while (0)
+
+#define dprintf(fmt, args...)		\
+do {					\
+	if (0)				\
+		printf(fmt, ##args);	\
+} while (0)
+
+#define REG_MODE (S_IFREG | 0644)
+#define DIR_MODE (S_IFDIR | 0755)
+
+/*
+ * Run-time configuration.  main() fills this in from built-in defaults
+ * and then from the command line; the option letters below are the ones
+ * parsed in main().
+ */
+struct opts {
+	unsigned long long buf_size;	/* bytes in each aligned write buffer, no option sets it */
+
+	unsigned long long write_batch;	/* -b, files created between partial writes */
+	unsigned long long low_dirs;	/* -d LOW, fewest subdirs per directory */
+	unsigned long long high_dirs;	/* -d HIGH, most subdirs per directory */
+	unsigned long long low_files;	/* -f LOW, fewest files per directory */
+	unsigned long long high_files;	/* -f HIGH, most files per directory */
+	char *meta_path;		/* -m, strdup()ed path of the metadata device */
+	unsigned long long total_files;	/* -n, divided evenly between the writers */
+	bool read_only;			/* -r, skip the pwrite() calls */
+	unsigned long long seed;	/* -s */
+	unsigned long long nr_writers;	/* -w, number of writer processes forked */
+};
+
+/*
+ * Print a summary of the command line options.  The values in
+ * parentheses are the defaults that main() starts with.
+ */
+static void usage(void)
+{
+	printf("usage:\n"
+	       " -b NR       | files each writer creates between partial writes (1000000)\n"
+	       " -d LOW:HIGH | range of subdirs per directory (5:10)\n"
+	       " -f LOW:HIGH | range of files per directory (10:20)\n"
+	       " -m PATH     | path to metadata device\n"
+	       " -n NR       | total number of files to create (100)\n"
+	       " -r          | read-only, all work except writing, measure cpu cost\n"
+	       " -s NR       | randomization seed (random)\n"
+	       " -w NR       | number of writing processes to fork (online cpus)\n"
+	       );
+}
+
+/*
+ * Drain the writer: keep asking it to fill buf and write each filled
+ * region to the device at the offset the writer chose, until it reports
+ * that nothing is left.  In read-only mode the pwrite() is skipped but the
+ * bytes are still counted.  Returns the total number of bytes handled.
+ */
+static size_t write_bufs(struct opts *opts, struct scoutfs_parallel_restore_writer *wri,
+			 void *buf, size_t buf_size, int dev_fd)
+{
+	size_t written = 0;
+	size_t count;
+	off_t off;
+	int ret;
+
+	for (;;) {
+		ret = scoutfs_parallel_restore_write_buf(wri, buf, buf_size, &off, &count);
+		error_exit(ret, "write buf %d", ret);
+
+		/* an empty buffer means the writer has nothing more to give us */
+		if (count == 0)
+			break;
+
+		if (opts->read_only)
+			ret = count;
+		else
+			ret = pwrite(dev_fd, buf, count, off);
+		error_exit(ret != count, "pwrite count %zu ret %d", count, ret);
+
+		written += ret;
+	}
+
+	return written;
+}
+
+/*
+ * A generated inode along with the xattrs and directory entries that
+ * were generated for it.  free_gino() releases every element of both
+ * arrays as well as the struct itself.
+ */
+struct gen_inode {
+	struct scoutfs_parallel_restore_inode inode;
+	struct scoutfs_parallel_restore_xattr **xattrs;		/* nr_xattrs pointers */
+	u64 nr_xattrs;
+	struct scoutfs_parallel_restore_entry **entries;	/* nr_entries pointers */
+	u64 nr_files;		/* number of non-directory entries still in entries[] */
+	u64 nr_entries;
+};
+
+/*
+ * Free a generated inode and the entries and xattrs it still owns.
+ * A NULL gino is ignored.
+ */
+static void free_gino(struct gen_inode *gino)
+{
+	u64 n;
+
+	if (!gino)
+		return;
+
+	if (gino->entries) {
+		for (n = 0; n < gino->nr_entries; n++)
+			free(gino->entries[n]);
+		free(gino->entries);
+	}
+
+	if (gino->xattrs) {
+		for (n = 0; n < gino->nr_xattrs; n++)
+			free(gino->xattrs[n]);
+		free(gino->xattrs);
+	}
+
+	free(gino);
+}
+
+/*
+ * Allocate an xattr with room for its name and value directly after the
+ * struct.  The name is copied first and the value follows it; neither is
+ * null terminated.  The caller frees the single allocation with free().
+ */
+static struct scoutfs_parallel_restore_xattr *
+generate_xattr(struct opts *opts, u64 ino, u64 pos, char *name, int name_len, void *value,
+		int value_len)
+{
+	struct scoutfs_parallel_restore_xattr *xat;
+
+	xat = malloc(sizeof(*xat) + name_len + value_len);
+	error_exit(!xat, "error allocating generated xattr");
+
+	*xat = (struct scoutfs_parallel_restore_xattr) {
+		.ino = ino,
+		.pos = pos,
+		.name_len = name_len,
+		.value_len = value_len,
+	};
+
+	/* both strings live in the tail of the allocation */
+	xat->name = (void *)(xat + 1);
+	xat->value = (void *)(xat->name + name_len);
+
+	memcpy(xat->name, name, name_len);
+	if (value_len != 0)
+		memcpy(xat->value, value, value_len);
+
+	return xat;
+}
+
+/*
+ * Allocate and initialize a generated inode with the given number and
+ * mode.  All four timestamps are set to the current time and meta_seq is
+ * set to the inode number.
+ *
+ * Regular files are additionally marked offline with a size of 4096 and
+ * are given a fixed set of xattrs.  The caller frees the result with
+ * free_gino().  Allocation failures exit the process.
+ */
+static struct gen_inode *generate_inode(struct opts *opts, u64 ino, mode_t mode)
+{
+	struct gen_inode *gino;
+	struct timespec now;
+
+	clock_gettime(CLOCK_REALTIME, &now);
+
+	gino = calloc(1, sizeof(struct gen_inode));
+	error_exit(!gino, "failure allocating generated inode");
+
+	gino->inode = (struct scoutfs_parallel_restore_inode) {
+		.ino = ino,
+		.meta_seq = ino,
+		.data_seq = 0,
+		.mode = mode,
+		.atime = now,
+		.ctime = now,
+		.mtime = now,
+		.crtime = now,
+	};
+
+	/*
+	 * hacky creation of a bunch of xattrs for now.
+	 */
+	if ((mode & S_IFMT) == S_IFREG) {
+		/* lengths exclude the terminating null of the string literals */
+		#define NV(n, v) { n, sizeof(n) - 1, v, sizeof(v) - 1, }
+		struct name_val {
+			char *name;
+			int len;
+			char *value;
+			int value_len;
+		} nv[] = {
+			NV("scoutfs.hide.totl.acct.8314611887310466424.2.0", "1"),
+			NV("scoutfs.hide.srch.sam_vol_E01001L6_4", ""),
+			NV("scoutfs.hide.sam_reqcopies", ""),
+			NV("scoutfs.hide.sam_copy_2", ""),
+			NV("scoutfs.hide.totl.acct.F01030L6.8314611887310466424.7.30", "1"),
+			NV("scoutfs.hide.sam_copy_1", ""),
+			NV("scoutfs.hide.srch.sam_vol_F01030L6_4", ""),
+			NV("scoutfs.hide.srch.sam_release_cand", ""),
+			NV("scoutfs.hide.sam_restime", ""),
+			NV("scoutfs.hide.sam_uuid", ""),
+			NV("scoutfs.hide.totl.acct.8314611887310466424.3.0", "1"),
+			NV("scoutfs.hide.srch.sam_vol_F01030L6", ""),
+			NV("scoutfs.hide.srch.sam_uuid_865939b7-24d6-472f-b85c-7ce7afeb813a", ""),
+			NV("scoutfs.hide.srch.sam_vol_E01001L6", ""),
+			NV("scoutfs.hide.totl.acct.E01001L6.8314611887310466424.7.1", "1"),
+			NV("scoutfs.hide.totl.acct.8314611887310466424.4.0", "1"),
+			NV("scoutfs.hide.totl.acct.8314611887310466424.11.0", "1"),
+			NV("scoutfs.hide.totl.acct.8314611887310466424.1.0", "1"),
+		};
+		unsigned int nr = array_size(nv);
+		unsigned int i;
+
+		gino->xattrs = calloc(nr, sizeof(struct scoutfs_parallel_restore_xattr *));
+		error_exit(!gino->xattrs, "failure allocating generated inode xattrs");
+
+		/* the xattr position is simply its index in the table */
+		for (i = 0; i < nr; i++)
+			gino->xattrs[i] = generate_xattr(opts, ino, i, nv[i].name, nv[i].len,
+							 nv[i].value, nv[i].value_len);
+
+		gino->nr_xattrs = nr;
+		gino->inode.nr_xattrs = nr;
+
+		gino->inode.size = 4096;
+		gino->inode.offline = true;
+	}
+
+	return gino;
+}
+
+/*
+ * Allocate a directory entry named "<prefix>-<nr>" that links ino into
+ * dir_ino at the given position.  The name is stored directly after the
+ * struct without a terminating null.  The caller frees the entry with
+ * free().
+ */
+static struct scoutfs_parallel_restore_entry *
+generate_entry(struct opts *opts, char *prefix, u64 nr, u64 dir_ino, u64 pos, u64 ino, mode_t mode)
+{
+	struct scoutfs_parallel_restore_entry *ent;
+	char name[PATH_MAX];
+	int len;
+
+	len = snprintf(name, sizeof(name), "%s-%llu", prefix, nr);
+
+	ent = malloc(sizeof(*ent) + len);
+	error_exit(!ent, "error allocating generated entry");
+
+	*ent = (struct scoutfs_parallel_restore_entry) {
+		.dir_ino = dir_ino,
+		.pos = pos,
+		.ino = ino,
+		.mode = mode,
+		.name = (void *)(ent + 1),
+		.name_len = len,
+	};
+
+	memcpy(ent->name, name, len);
+
+	return ent;
+}
+
+/*
+ * since the _parallel_restore_quota_rule mimics the squota_rule found in the
+ * kernel we can also mimic its rule_to_irule function
+ */
+
+#define TEST_RULE_STR "7 13,L,- 15,L,- 17,L,- I 33 -"
+
+/*
+ * Allocate a quota rule and fill it in by parsing TEST_RULE_STR.  All 13
+ * fields must be converted or the process exits.  The caller frees the
+ * rule with free().
+ */
+static struct scoutfs_parallel_restore_quota_rule *
+generate_quota(struct opts *opts)
+{
+	struct scoutfs_parallel_restore_quota_rule *rule;
+	int nr_fields;
+
+	rule = calloc(1, sizeof(*rule));
+	error_exit(!rule, "Quota rule alloc failed");
+
+	/* sscanf() returns the number of fields it managed to convert */
+	nr_fields = sscanf(TEST_RULE_STR, " %hhu %llu,%c,%c %llu,%c,%c %llu,%c,%c %c %llu %c",
+			   &rule->prio,
+			   &rule->names[0].val, &rule->names[0].source, &rule->names[0].flags,
+			   &rule->names[1].val, &rule->names[1].source, &rule->names[1].flags,
+			   &rule->names[2].val, &rule->names[2].source, &rule->names[2].flags,
+			   &rule->op, &rule->limit, &rule->rule_flags);
+	error_exit(nr_fields != 13, "invalid quota rule, missing fields. nr fields: %d rule str: %s\n", nr_fields, TEST_RULE_STR);
+
+	return rule;
+}
+
+/*
+ * Build a 64bit value from two lrand48() results.  lrand48() only
+ * returns 31 random bits so bits 31 and 63 of the result are always 0.
+ *
+ * The two calls are made in separate statements because the operands of
+ * | may be evaluated in either order, which would let the same seed
+ * produce different values when built with different compilers.
+ */
+static u64 random64(void)
+{
+	u64 high = lrand48();
+	u64 low = lrand48();
+
+	return (high << 32) | low;
+}
+
+/*
+ * Return a random value in the inclusive range [low, high].  The caller
+ * must ensure that low <= high.
+ */
+static u64 random_range(u64 low, u64 high)
+{
+	u64 span = high - low + 1;
+
+	/* the span wraps to 0 when the range covers every u64, avoid dividing by it */
+	if (span == 0)
+		return random64();
+
+	return low + (random64() % span);
+}
+
+/*
+ * Generate a directory inode and the entries it contains.  The numbers
+ * of subdirs and files are chosen at random from the configured ranges
+ * and are then reduced if the dir and its entries wouldn't fit in the
+ * ino_len inode numbers that remain.  Entries are given consecutive
+ * inode numbers starting at ino_start, subdirs first and then files.
+ * When no_dirs is set only file entries are generated.
+ */
+static struct gen_inode *generate_dir(struct opts *opts, u64 dir_ino, u64 ino_start, u64 ino_len,
+				      bool no_dirs)
+{
+	struct scoutfs_parallel_restore_entry *entry;
+	struct gen_inode *dir;
+	u64 nr_entries;
+	u64 nr_files;
+	u64 nr_dirs;
+	bool is_dir;
+	u64 i;
+
+	/* the dir count is drawn before the file count, keep that order */
+	nr_dirs = 0;
+	if (!no_dirs)
+		nr_dirs = random_range(opts->low_dirs, opts->high_dirs);
+	nr_files = random_range(opts->low_files, opts->high_files);
+
+	/* one inode for the dir itself and one for each entry */
+	if (1 + nr_dirs + nr_files > ino_len) {
+		nr_dirs = no_dirs ? 0 : (ino_len - 1) / 2;
+		nr_files = (ino_len - 1) - nr_dirs;
+	}
+
+	nr_entries = nr_dirs + nr_files;
+
+	dir = generate_inode(opts, dir_ino, DIR_MODE);
+	error_exit(!dir, "error allocating generated inode");
+
+	dir->inode.nr_subdirs = nr_dirs;
+	dir->nr_files = nr_files;
+
+	if (nr_entries) {
+		dir->entries = calloc(nr_entries, sizeof(struct scoutfs_parallel_restore_entry *));
+		error_exit(!dir->entries, "error allocating generated inode entries");
+
+		dir->nr_entries = nr_entries;
+	}
+
+	for (i = 0; i < nr_entries; i++) {
+		/* the first nr_dirs entries are subdirs, the rest are files */
+		is_dir = i < nr_dirs;
+
+		entry = generate_entry(opts, is_dir ? "dir" : "file", ino_start + i,
+				       dir->inode.ino, SCOUTFS_DIRENT_FIRST_POS + i,
+				       ino_start + i, is_dir ? DIR_MODE : REG_MODE);
+
+		dir->entries[i] = entry;
+		dir->inode.total_entry_name_bytes += entry->name_len;
+	}
+
+	return dir;
+}
+
+/*
+ * Restore a generated inode.  If it's a directory then we also restore
+ * all its entries.  The caller is going to descend into subdir entries and generate
+ * those dir inodes.  We have to generate and restore all non-dir inodes referenced
+ * by this inode's entries.
+ *
+ * Non-dir entries are freed and removed from gino->entries once their
+ * inode has been restored, so on return the array only holds subdir
+ * entries and nr_entries and nr_files have been reduced to match.  The
+ * inode's xattrs are restored last.  Any failure exits the process.
+ */
+static void restore_inode(struct opts *opts, struct scoutfs_parallel_restore_writer *wri,
+			  struct gen_inode *gino)
+{
+	struct scoutfs_parallel_restore_entry *entry;
+	struct gen_inode *nondir;
+	int ret;
+	u64 i;
+
+	ret = scoutfs_parallel_restore_add_inode(wri, &gino->inode);
+	error_exit(ret, "thread add inode %d", ret);
+
+	i = 0;
+	while (i < gino->nr_entries) {
+		entry = gino->entries[i];
+
+		ret = scoutfs_parallel_restore_add_entry(wri, entry);
+		error_exit(ret, "thread add entry %d", ret);
+
+		/* caller only needs subdir entries, leave them in place */
+		if ((entry->mode & S_IFMT) == S_IFDIR) {
+			i++;
+			continue;
+		}
+
+		/* non-dir inodes have no entries so this only recurses once */
+		nondir = generate_inode(opts, entry->ino, entry->mode);
+		restore_inode(opts, wri, nondir);
+		free_gino(nondir);
+
+		/*
+		 * Remove the entry by moving the last entry into its slot.
+		 * i isn't advanced so that the moved entry is visited next.
+		 */
+		free(entry);
+		gino->nr_entries--;
+		if (i != gino->nr_entries)
+			gino->entries[i] = gino->entries[gino->nr_entries];
+		gino->nr_files--;
+	}
+
+	for (i = 0; i < gino->nr_xattrs; i++) {
+		ret = scoutfs_parallel_restore_add_xattr(wri, gino->xattrs[i]);
+		error_exit(ret, "thread add xattr %d", ret);
+	}
+}
+
+/*
+ * Everything a forked writer needs to do its share of the work.
+ * do_restore() allocates one per writer and links it on its writers list
+ * before forking.
+ */
+struct writer_args {
+	struct list_head head;
+
+	int dev_fd;	/* metadata device, opened by the parent */
+	int pair_fd;	/* socket end that results are written to */
+
+	struct scoutfs_parallel_restore_slice slice;
+	u64 writer_nr;	/* 0 based, writer 0 also restores the root inode */
+	u64 dir_height;	/* number of directory levels below the writer's top dir */
+	u64 ino_start;	/* next unused inode number in this writer's range */
+	u64 ino_len;	/* how many inode numbers remain in the range */
+};
+
+/*
+ * The message a writer sends to the parent over the socket pair after
+ * each round of writes.  The counters cover the work done since the
+ * previous message because the struct is zeroed after every send.  The
+ * slice is only filled in by the writer's final message; the parent
+ * treats a non-zero slice.meta_len as that writer being finished.
+ */
+struct write_result {
+	struct scoutfs_parallel_restore_progress prog;
+	struct scoutfs_parallel_restore_slice slice;
+	__le64 files_created;
+	__le64 bytes_written;
+};
+
+/*
+ * Write out everything the writer has buffered, then send the parent a
+ * result message with the writer's progress and the counters accumulated
+ * in res.  The slice is only fetched and sent when get_slice is set,
+ * which the caller does for its final message.  res is zeroed once it has
+ * been sent so that the next message starts counting from nothing.
+ */
+static void write_bufs_and_send(struct opts *opts, struct scoutfs_parallel_restore_writer *wri,
+				  void *buf, size_t buf_size, int dev_fd,
+				  struct write_result *res, bool get_slice, int pair_fd)
+{
+	size_t bytes;
+	int ret;
+
+	bytes = write_bufs(opts, wri, buf, buf_size, dev_fd);
+	le64_add_cpu(&res->bytes_written, bytes);
+
+	ret = scoutfs_parallel_restore_get_progress(wri, &res->prog);
+	error_exit(ret, "get prog %d", ret);
+
+	if (get_slice) {
+		ret = scoutfs_parallel_restore_get_slice(wri, &res->slice);
+		error_exit(ret, "thread get slice %d", ret);
+	}
+
+	ret = write(pair_fd, res, sizeof(*res));
+	error_exit(ret != sizeof(*res), "result send error");
+
+	memset(res, 0, sizeof(*res));
+}
+
+/*
+ * Calculate the total number of bytes in the names of the root
+ * directory's entries for the given number of writers.  Each writer adds
+ * one "top-%llu" entry named after its writer number, counting from 0, so
+ * every name has 4 bytes of "top-" followed by the digits of the number.
+ */
+static u64 topdir_entry_bytes(u64 nr_writers)
+{
+	u64 bytes = (3 + 1) * nr_writers;
+	u64 digits = 1;
+	u64 limit = 10;
+	u64 done = 0;
+	u64 nr;
+
+	/* each pass counts the writer numbers below limit that have this many digits */
+	while (done < nr_writers) {
+		nr = min(limit - done, nr_writers - done);
+		bytes += nr * digits;
+
+		done += nr;
+		digits++;
+		limit *= 10;
+	}
+
+	return bytes;
+}
+
+/*
+ * Per-level state for writer_proc()'s walk of the directories it
+ * generates: the directory currently open at a level and the index of
+ * the next entry in it to descend into.
+ */
+struct dir_pos {
+	struct gen_inode *gino;	/* NULL until a dir is generated for the level */
+	u64 pos;		/* next index in gino->entries */
+};
+
+/*
+ * The body of a forked writer process.  It restores its own top level
+ * directory under the root and then walks a randomly generated tree of
+ * directories beneath it, depth first, until it has consumed all the
+ * inode numbers it was given.  Results are sent to the parent over the
+ * socket pair after each batch of files and once more, with the writer's
+ * slice, when all the work is done.
+ *
+ * dirs[] has one slot per level.  The top dir is at the highest index
+ * and level 0 directories only contain files.  Any failure exits the
+ * process.
+ */
+static void writer_proc(struct opts *opts, struct writer_args *args)
+{
+	struct scoutfs_parallel_restore_writer *wri = NULL;
+	struct scoutfs_parallel_restore_entry *entry;
+	struct dir_pos *dirs = NULL;
+	struct write_result res;
+	struct gen_inode *gino;
+	void *buf = NULL;
+	u64 level;
+	u64 ino;
+	int ret;
+
+	memset(&res, 0, sizeof(res));
+
+	/* calloc() reports failure with NULL, errno could be stale or unset */
+	dirs = calloc(args->dir_height, sizeof(struct dir_pos));
+	error_exit(!dirs, "error allocating parent dirs "ERRF, ERRA);
+
+	/* posix_memalign() returns the error rather than setting errno */
+	errno = posix_memalign((void **)&buf, 4096, opts->buf_size);
+	error_exit(errno, "error allocating block buf "ERRF, ERRA);
+
+	ret = scoutfs_parallel_restore_create_writer(&wri);
+	error_exit(ret, "create writer %d", ret);
+
+	ret = scoutfs_parallel_restore_add_slice(wri, &args->slice);
+	error_exit(ret, "add slice %d", ret);
+
+	/* writer 0 creates the root dir */
+	if (args->writer_nr == 0) {
+		gino = generate_inode(opts, SCOUTFS_ROOT_INO, DIR_MODE);
+		gino->inode.nr_subdirs = opts->nr_writers;
+		gino->inode.total_entry_name_bytes = topdir_entry_bytes(opts->nr_writers);
+
+		ret = scoutfs_parallel_restore_add_inode(wri, &gino->inode);
+		error_exit(ret, "thread add root inode %d", ret);
+		free_gino(gino);
+	}
+
+	/* the top dir needs an inode number, decrementing 0 would wrap to a huge range */
+	error_exit(args->ino_len == 0, "writer %llu was given no inode numbers",
+		   args->writer_nr);
+
+	/* create root entry for our top level dir */
+	ino = args->ino_start++;
+	args->ino_len--;
+
+	entry = generate_entry(opts, "top", args->writer_nr,
+			       SCOUTFS_ROOT_INO, SCOUTFS_DIRENT_FIRST_POS + args->writer_nr,
+			       ino, DIR_MODE);
+
+	ret = scoutfs_parallel_restore_add_entry(wri, entry);
+	error_exit(ret, "thread top entry %d", ret);
+	free(entry);
+
+	level = args->dir_height - 1;
+
+	/* ascending past the top dir makes level reach dir_height and ends the walk */
+	while (args->ino_len > 0 && level < args->dir_height) {
+		gino = dirs[level].gino;
+
+		/* generate and restore if we follow entries */
+		if (!gino) {
+			gino = generate_dir(opts, ino, args->ino_start, args->ino_len, level == 0);
+			args->ino_start += gino->nr_entries;
+			args->ino_len -= gino->nr_entries;
+			le64_add_cpu(&res.files_created, gino->nr_files);
+
+			/* this strips the file entries, only subdirs remain to visit */
+			restore_inode(opts, wri, gino);
+			dirs[level].gino = gino;
+		}
+
+		if (dirs[level].pos == gino->nr_entries) {
+			/* ascend if we're done with this dir */
+			dirs[level].gino = NULL;
+			dirs[level].pos = 0;
+			free_gino(gino);
+			level++;
+
+		} else {
+			/* otherwise descend into subdir entry */
+			ino = gino->entries[dirs[level].pos]->ino;
+			dirs[level].pos++;
+			level--;
+		}
+
+		/* do a partial write at batch intervals when there's still more to do */
+		if (le64_to_cpu(res.files_created) >= opts->write_batch && args->ino_len > 0)
+			write_bufs_and_send(opts, wri, buf, opts->buf_size, args->dev_fd,
+					    &res, false, args->pair_fd);
+	}
+
+	/* the final message carries the slice, which tells the parent we're done */
+	write_bufs_and_send(opts, wri, buf, opts->buf_size, args->dev_fd,
+			    &res, true, args->pair_fd);
+
+	scoutfs_parallel_restore_destroy_writer(&wri);
+
+	free(dirs);
+	free(buf);
+}
+
+/*
+ * SIGCHLD handler installed by main().
+ *
+ * If any of our children exited with an error code, we hard exit.
+ * The child processes should themselves report out any errors
+ * encountered. Any remaining children will receive SIGHUP and
+ * terminate.
+ *
+ * A zero si_status is ignored so that the parent keeps running as
+ * writers finish successfully.
+ *
+ * NOTE(review): exit() is not on the POSIX list of async-signal-safe
+ * functions because it flushes stdio and runs atexit handlers.  _exit()
+ * is safe here but would discard any buffered stdout, so confirm which
+ * behavior is wanted before changing this.
+ */
+static void sigchld_handler(int signo, siginfo_t *info, void *context)
+{
+	if (info->si_status)
+		exit(EXIT_FAILURE);
+}
+
+/*
+ * Fork a writer process.  The parent returns as soon as the child has
+ * been created.  The child asks to be sent SIGHUP when its parent dies,
+ * runs writer_proc() with the given args, and exits without returning.
+ */
+static void fork_writer(struct opts *opts, struct writer_args *args)
+{
+	pid_t parent = getpid();
+	pid_t pid;
+	int ret;
+
+	/*
+	 * The child inherits a copy of any stdio output that hasn't been
+	 * written yet and would write it again when it calls exit().
+	 */
+	fflush(stdout);
+
+	pid = fork();
+	error_exit(pid == -1, "fork error");
+
+	if (pid != 0)
+		return;
+
+	ret = prctl(PR_SET_PDEATHSIG, SIGHUP);
+	error_exit(ret < 0, "failed to set parent death sig");
+
+	printf("pid %u getpid() %u parent %u getppid() %u\n",
+		pid, getpid(), parent, getppid());
+	/* the parent could have died before the death signal was set up */
+	error_exit(getppid() != parent, "child parent already changed");
+
+	writer_proc(opts, args);
+	exit(0);
+}
+
+/*
+ * Drive the whole restore from the parent process:
+ *
+ *  - read the super block from the metadata device and import it
+ *  - add the test quota rule
+ *  - split the device into slices, keep slices[0] and give each forked
+ *    writer one of the others along with an equal share of the inodes
+ *  - merge the progress and slices that the writers send back
+ *  - write out the parent's own blocks and the updated super block
+ *
+ * Any failure exits the process, so this only ever returns 0.
+ *
+ * NOTE(review): the writer_args structs are never freed and the
+ * children are never waited for; both are cleaned up when the process
+ * exits.
+ */
+static int do_restore(struct opts *opts)
+{
+	struct scoutfs_parallel_restore_writer *wri = NULL;
+	struct scoutfs_parallel_restore_slice *slices = NULL;
+	struct scoutfs_parallel_restore_quota_rule *rule = NULL;
+	struct scoutfs_super_block *super = NULL;
+	struct write_result res;
+	struct writer_args *args;
+	struct timespec begin;
+	struct timespec end;
+	LIST_HEAD(writers);
+	u64 next_ino;
+	u64 ino_per;
+	u64 avg_dirs;
+	u64 avg_files;
+	u64 dir_height;
+	u64 tot_files;
+	u64 tot_bytes;
+	int pair[2] = {-1, -1};
+	double secs;
+	void *buf = NULL;
+	int dev_fd = -1;
+	int ret;
+	u64 i;
+
+	/* the inodes are divided by the number of writers below */
+	error_exit(opts->nr_writers == 0, "number of writers can't be 0");
+
+	ret = socketpair(PF_LOCAL, SOCK_STREAM, 0, pair);
+	error_exit(ret, "socketpair error "ERRF, ERRA);
+
+	dev_fd = open(opts->meta_path, O_DIRECT | (opts->read_only ? O_RDONLY : (O_RDWR|O_EXCL)));
+	error_exit(dev_fd < 0, "error opening '%s': "ERRF, opts->meta_path, ERRA);
+
+	/* posix_memalign() returns the error rather than setting errno */
+	errno = posix_memalign((void **)&super, 4096, SCOUTFS_BLOCK_SM_SIZE) ?:
+		posix_memalign((void **)&buf, 4096, opts->buf_size);
+	error_exit(errno, "error allocating block bufs "ERRF, ERRA);
+
+	ret = pread(dev_fd, super, SCOUTFS_BLOCK_SM_SIZE,
+		    SCOUTFS_SUPER_BLKNO << SCOUTFS_BLOCK_SM_SHIFT);
+	error_exit(ret != SCOUTFS_BLOCK_SM_SIZE, "error reading super, ret %d", ret);
+
+	ret = scoutfs_parallel_restore_create_writer(&wri);
+	error_exit(ret, "create writer %d", ret);
+
+	ret = scoutfs_parallel_restore_import_super(wri, super, dev_fd);
+	error_exit(ret, "import super %d", ret);
+
+	rule = generate_quota(opts);
+	ret = scoutfs_parallel_restore_add_quota_rule(wri, rule);
+	free(rule);
+	error_exit(ret, "add quotas %d", ret);
+
+	/* slices[0] is for the parent, the rest are handed to the writers */
+	slices = calloc(1 + opts->nr_writers, sizeof(struct scoutfs_parallel_restore_slice));
+	error_exit(!slices, "alloc slices");
+
+	scoutfs_parallel_restore_init_slices(wri, slices, 1 + opts->nr_writers);
+
+	ret = scoutfs_parallel_restore_add_slice(wri, &slices[0]);
+	error_exit(ret, "add slices[0] %d", ret);
+
+	next_ino = (SCOUTFS_ROOT_INO | SCOUTFS_LOCK_INODE_GROUP_MASK) + 1;
+	ino_per = opts->total_files / opts->nr_writers;
+	avg_dirs = (opts->low_dirs + opts->high_dirs) / 2;
+	avg_files = (opts->low_files + opts->high_files) / 2;
+
+	/* add directory levels until the average tree would hold all the files */
+	dir_height = 1;
+	tot_files = avg_files * opts->nr_writers;
+
+	while (tot_files < opts->total_files) {
+		dir_height++;
+		tot_files *= avg_dirs;
+	}
+
+	dprintf("height %llu tot %llu total %llu\n", dir_height, tot_files, opts->total_files);
+
+	clock_gettime(CLOCK_MONOTONIC_RAW, &begin);
+
+	/* start each writing process */
+	for (i = 0; i < opts->nr_writers; i++) {
+		args = calloc(1, sizeof(struct writer_args));
+		error_exit(!args, "alloc writer args");
+
+		args->dev_fd = dev_fd;
+		args->pair_fd = pair[1];
+		args->slice = slices[1 + i];
+		args->writer_nr = i;
+		args->dir_height = dir_height;
+		args->ino_start = next_ino;
+		args->ino_len = ino_per;
+
+		list_add_tail(&args->head, &writers);
+		next_ino += ino_per;
+
+		fork_writer(opts, args);
+	}
+
+	/* read results and watch for writers to finish */
+	tot_files = 0;
+	tot_bytes = 0;
+	i = 0;
+	while (i < opts->nr_writers) {
+		ret = read(pair[0], &res, sizeof(struct write_result));
+		error_exit(ret != sizeof(struct write_result), "result read error %d", ret);
+
+		ret = scoutfs_parallel_restore_add_progress(wri, &res.prog);
+		error_exit(ret, "add thr prog %d", ret);
+
+		/* only a writer's final message includes its slice */
+		if (res.slice.meta_len != 0) {
+			ret = scoutfs_parallel_restore_add_slice(wri, &res.slice);
+			error_exit(ret, "add thr slice %d", ret);
+			i++;
+		}
+
+		tot_files += le64_to_cpu(res.files_created);
+		tot_bytes += le64_to_cpu(res.bytes_written);
+	}
+
+	tot_bytes += write_bufs(opts, wri, buf, opts->buf_size, dev_fd);
+
+	ret = scoutfs_parallel_restore_export_super(wri, super);
+	error_exit(ret, "update super %d", ret);
+
+	if (!opts->read_only) {
+		ret = pwrite(dev_fd, super, SCOUTFS_BLOCK_SM_SIZE,
+			     SCOUTFS_SUPER_BLKNO << SCOUTFS_BLOCK_SM_SHIFT);
+		error_exit(ret != SCOUTFS_BLOCK_SM_SIZE, "error writing super, ret %d", ret);
+	}
+
+	clock_gettime(CLOCK_MONOTONIC_RAW, &end);
+
+	scoutfs_parallel_restore_destroy_writer(&wri);
+
+	/*
+	 * Subtract the two times before converting to floating point.  A
+	 * float can't hold a large tv_sec precisely enough for the
+	 * difference between two of them to mean anything.
+	 */
+	secs = (double)(end.tv_sec - begin.tv_sec) +
+	       ((double)(end.tv_nsec - begin.tv_nsec) / NSEC_PER_SEC);
+	printf("created %llu files in %llu bytes and %f secs => %f bytes/file, %f files/sec\n",
+		tot_files, tot_bytes, secs,
+		(double)tot_bytes / tot_files, (double)tot_files / secs);
+
+	if (dev_fd >= 0)
+		close(dev_fd);
+	if (pair[0] >= 0)
+		close(pair[0]);
+	if (pair[1] >= 0)
+		close(pair[1]);
+	free(super);
+	free(slices);
+	free(buf);
+
+	return 0;
+}
+
+/*
+ * Parse a "LOW:HIGH" option argument.  The string is split at the first
+ * ':' while each half is parsed and is then put back as it was.  Without
+ * a ':' the whole string is parsed as LOW and *high_ret is left alone.
+ *
+ * Returns 0 on success or the non-zero result of the first parse_u64()
+ * call that failed.
+ */
+static int parse_low_high(char *str, u64 *low_ret, u64 *high_ret)
+{
+	char *sep;
+	int ret = 0;
+
+	/* strchr() is the standard equivalent of the legacy index() */
+	sep = strchr(str, ':');
+	if (sep) {
+		*sep = '\0';
+		ret = parse_u64(sep + 1, high_ret);
+	}
+
+	if (ret == 0)
+		ret = parse_u64(str, low_ret);
+
+	if (sep)
+		*sep = ':';
+
+	return ret;
+}
+
+/*
+ * Parse the command line, seed the random number generator, install
+ * the SIGCHLD handler that aborts the run if a writer fails, and perform
+ * the restore.  Returns 0 on success and 1 on failure.
+ */
+int main(int argc, char **argv)
+{
+	struct opts opts = {
+		.buf_size = (32 * 1024 * 1024),
+
+		.write_batch = 1000000,
+		.low_dirs = 5,
+		.high_dirs = 10,
+		.low_files = 10,
+		.high_files = 20,
+		.total_files = 100,
+	};
+	struct sigaction act = { 0 };
+	long cpus;
+	int ret;
+	int c;
+
+	/*
+	 * NOTE(review): the generator hasn't been seeded yet so this default
+	 * is the same on every run, even though usage() calls it random.
+	 */
+	opts.seed = random64();
+
+	/* sysconf() returns -1 on failure, which would become a huge count */
+	cpus = sysconf(_SC_NPROCESSORS_ONLN);
+	opts.nr_writers = cpus > 0 ? cpus : 1;
+
+	while ((c = getopt(argc, argv, "b:d:f:m:n:rs:w:")) != -1) {
+		switch(c) {
+		case 'b':
+			ret = parse_u64(optarg, &opts.write_batch);
+			error_exit(ret, "error parsing -b '%s'\n", optarg);
+			error_exit(opts.write_batch == 0, "-b can't be 0");
+			break;
+		case 'd':
+			ret = parse_low_high(optarg, &opts.low_dirs, &opts.high_dirs);
+			error_exit(ret, "error parsing -d '%s'\n", optarg);
+			break;
+		case 'f':
+			ret = parse_low_high(optarg, &opts.low_files, &opts.high_files);
+			error_exit(ret, "error parsing -f '%s'\n", optarg);
+			break;
+		case 'm':
+			opts.meta_path = strdup(optarg);
+			break;
+		case 'n':
+			ret = parse_u64(optarg, &opts.total_files);
+			error_exit(ret, "error parsing -n '%s'\n", optarg);
+			break;
+		case 'r':
+			opts.read_only = true;
+			break;
+		case 's':
+			ret = parse_u64(optarg, &opts.seed);
+			error_exit(ret, "error parsing -s '%s'\n", optarg);
+			break;
+		case 'w':
+			ret = parse_u64(optarg, &opts.nr_writers);
+			error_exit(ret, "error parsing -w '%s'\n", optarg);
+			break;
+		case '?':
+		default:
+			printf("Unknown option '%c'\n", optopt);
+			usage();
+			exit(1);
+		}
+	}
+
+	error_exit(opts.low_dirs > opts.high_dirs, "LOW > HIGH in -d %llu:%llu",
+		   opts.low_dirs, opts.high_dirs);
+	error_exit(opts.low_files > opts.high_files, "LOW > HIGH in -f %llu:%llu",
+		   opts.low_files, opts.high_files);
+	error_exit(!opts.meta_path, "must specify metadata device path with -m");
+	error_exit(opts.nr_writers == 0, "-w can't be 0");
+
+	printf("recreate with: -d %llu:%llu -f %llu:%llu -n %llu -s %llu -w %llu\n",
+		opts.low_dirs, opts.high_dirs, opts.low_files, opts.high_files,
+		opts.total_files, opts.seed, opts.nr_writers);
+
+	/* without this the seed has no effect and can't recreate a run */
+	srand48(opts.seed);
+
+	act.sa_flags = SA_SIGINFO | SA_RESTART;
+	act.sa_sigaction = &sigchld_handler;
+	ret = sigaction(SIGCHLD, &act, NULL);
+	error_exit(ret == -1, "error setting up signal handler"ERRF, ERRA);
+
+	ret = do_restore(&opts);
+
+	free(opts.meta_path);
+
+	return ret == 0 ? 0 : 1;
+}
@@ -0,0 +1,817 @@
+#define _GNU_SOURCE /* O_DIRECT */
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/xattr.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <ctype.h>
+#include <string.h>
+#include <errno.h>
+#include <limits.h>
+#include <time.h>
+#include <sys/prctl.h>
+#include <sys/socket.h>
+#include <sys/signal.h>
+#include <sys/statfs.h>
+#include <dirent.h>
+
+#include "../../utils/src/sparse.h"
+#include "../../utils/src/util.h"
+#include "../../utils/src/list.h"
+#include "../../utils/src/parse.h"
+#include "../../kmod/src/format.h"
+#include "../../kmod/src/ioctl.h"
+#include "../../utils/src/parallel_restore.h"
+
+/*
+ * XXX:
+ */
+
+/*
+ * printf helpers for reporting errno: append ERRF to a format string
+ * literal and pass ERRA as the matching arguments.
+ */
+#define ERRF " errno %d (%s)"
+#define ERRA errno, strerror(errno)
+
+/*
+ * If cond is true, print "error: ", the formatted message and a newline
+ * to stdout and exit with status 1.  fmt must be a string literal because
+ * it is pasted between two other literals.  The format arguments are only
+ * evaluated when cond is true.
+ */
+#define error_exit(cond, fmt, args...)			\
+do {							\
+	if (cond) {					\
+		printf("error: "fmt"\n", ##args);	\
+		exit(1);				\
+	}						\
+} while (0)
+
+#define REG_MODE (S_IFREG | 0644)
+#define DIR_MODE (S_IFDIR | 0755)
+#define LNK_MODE (S_IFLNK | 0777)
+
+/*
+ * At about 1k files we seem to be writing about 1MB of data, so
+ * set buffer sizes adequately above that.
+ *
+ * BUF_SIZ is parenthesized so that it still means 2MiB when it is used
+ * inside a larger expression such as "x / BUF_SIZ" or "x % BUF_SIZ".
+ */
+#define BATCH_FILES 1024
+#define BUF_SIZ (2 * 1024 * 1024)
+
+/*
+ * We can't make duplicate inodes for hardlinked files, so we
+ * will need to track these as we generate them. Not too costly
+ * to do, since it's just an integer, and sorting shouldn't matter
+ * until we get into the millions of entries, hopefully.
+ *
+ * 'hardlinks' is the list of inode numbers that have already been
+ * added for files with more than one link; see is_new_inode_item().
+ */
+static struct list_head hardlinks;
+struct hardlink_head {
+	struct list_head head;	/* entry in the 'hardlinks' list */
+	u64 ino;		/* inode number that was already added */
+};
+
+/* Command line options; both strings are strdup()ed in main(). */
+struct opts {
+	char *meta_path;	/* -m: path to the metadata device */
+	char *source_dir;	/* -s: path to the source directory */
+};
+
+/* Set once the "non-scoutfs source" warning was printed, to print it only once. */
+static bool warn_scoutfs = false;
+
+/* Print the summary of supported command line options to stdout. */
+static void usage(void)
+{
+	static const char text[] =
+		"usage:\n"
+		" -m PATH     | path to metadata device\n"
+		" -s PATH     | path to source directory\n";
+
+	fputs(text, stdout);
+}
+
+/*
+ * Drain the writer's pending blocks to the device.
+ *
+ * The writer is asked to fill 'buf' (BUF_SIZ bytes) again and again; each
+ * filled region is written to dev_fd at the offset the writer returned.
+ * The loop ends when the writer reports a zero byte count.  Any failure,
+ * including a short pwrite(), exits the process.
+ *
+ * Returns the total number of bytes written to the device.
+ */
+static size_t write_bufs(struct scoutfs_parallel_restore_writer *wri,
+			 void *buf, int dev_fd)
+{
+	size_t written = 0;
+	size_t len;
+	off_t pos;
+	int ret;
+
+	for (;;) {
+		ret = scoutfs_parallel_restore_write_buf(wri, buf, BUF_SIZ, &pos, &len);
+		error_exit(ret, "write buf %d", ret);
+
+		/* nothing left to write out */
+		if (len == 0)
+			break;
+
+		ret = pwrite(dev_fd, buf, len, pos);
+		error_exit(ret != len, "pwrite count %zu ret %d", len, ret);
+		written += ret;
+	}
+
+	return written;
+}
+
+/*
+ * Message sent from the writer process to the parent over the socketpair
+ * (see write_bufs_and_send() and the read loop in do_restore()).  The
+ * sender zeroes the struct after every send, so the counters describe only
+ * the work done since the previous message.
+ */
+struct write_result {
+	struct scoutfs_parallel_restore_progress prog;	/* from .._get_progress() */
+	struct scoutfs_parallel_restore_slice slice;	/* only filled when a slice was requested */
+	__le64 files_created;	/* updated with le64_add_cpu() */
+	__le64 dirs_created;
+	__le64 bytes_written;	/* bytes written to the device by write_bufs() */
+	bool complete;		/* set by the writer on its final message */
+};
+
+/*
+ * Flush the writer's pending blocks to the device and report to the parent.
+ *
+ * The bytes written are added to res->bytes_written, the writer's progress
+ * (and, when get_slice is set, its slice) is stored in 'res', and the whole
+ * struct is written to pair_fd.  'res' is zeroed afterwards so that the
+ * next message starts counting from scratch.  Any failure exits.
+ */
+static void write_bufs_and_send(struct scoutfs_parallel_restore_writer *wri,
+				void *buf, int dev_fd,
+				struct write_result *res, bool get_slice, int pair_fd)
+{
+	size_t nbytes;
+	ssize_t sent;
+	int err;
+
+	nbytes = write_bufs(wri, buf, dev_fd);
+	le64_add_cpu(&res->bytes_written, nbytes);
+
+	err = scoutfs_parallel_restore_get_progress(wri, &res->prog);
+	error_exit(err, "get prog %d", err);
+
+	if (get_slice) {
+		err = scoutfs_parallel_restore_get_slice(wri, &res->slice);
+		error_exit(err, "thread get slice %d", err);
+	}
+
+	sent = write(pair_fd, res, sizeof(*res));
+	error_exit(sent != sizeof(*res), "result send error");
+
+	memset(res, 0, sizeof(*res));
+}
+
+/*
+ * Read the value of one xattr from the source and add it to the writer.
+ *
+ * The value is read with fgetxattr() when fd is valid and with getxattr()
+ * on 'path' otherwise.  'name' points at a NUL terminated name of 'len'
+ * bytes.  'what' names the listing the name came from and is only used in
+ * error messages.  Any failure exits the process.
+ */
+static void add_one_xattr(struct scoutfs_parallel_restore_writer *wri, char *path, int fd,
+			  u64 ino, int pos, char *name, int len, const char *what)
+{
+	struct scoutfs_parallel_restore_xattr *xattr;
+	int value_len;
+	int ret;
+
+	error_exit(len == 0, "%s empty name", what);
+	error_exit(len > SCOUTFS_XATTR_MAX_NAME_LEN, "%s long name", what);
+
+	/* first ask for the size of the value */
+	if (fd >= 0)
+		value_len = fgetxattr(fd, name, NULL, 0);
+	else
+		value_len = getxattr(path, name, NULL, 0);
+	error_exit(value_len < 0, "%s getxattr size of '%s'"ERRF, what, name, ERRA);
+
+	/* the struct, the name and the value share one allocation */
+	xattr = malloc(sizeof(struct scoutfs_parallel_restore_xattr) + len + value_len);
+	error_exit(!xattr, "error allocating generated xattr");
+
+	*xattr = (struct scoutfs_parallel_restore_xattr) {
+		.ino = ino,
+		.pos = pos,
+		.name_len = len,
+		.value_len = value_len,
+	};
+	xattr->name = (void *)(xattr + 1);
+	xattr->value = (void *)(xattr->name + len);
+
+	/* read the value straight into its place behind the name */
+	if (fd >= 0)
+		ret = fgetxattr(fd, name, (void *)(xattr->name + len), value_len);
+	else
+		ret = getxattr(path, name, (void *)(xattr->name + len), value_len);
+	error_exit(ret != value_len, "%s getxattr value of '%s'"ERRF, what, name, ERRA);
+
+	memcpy(xattr->name, name, len);
+
+	ret = scoutfs_parallel_restore_add_xattr(wri, xattr);
+	error_exit(ret, "add xattr %d", ret);
+
+	free(xattr);
+}
+
+/*
+ * Add every xattr named in a listxattr style buffer: 'bytes' bytes of
+ * back to back NUL terminated names starting at 'names'.  Positions are
+ * handed out starting at 'pos'; the next unused position is returned.
+ */
+static int add_xattr_list(struct scoutfs_parallel_restore_writer *wri, char *path, int fd,
+			  u64 ino, int pos, char *names, int bytes, const char *what)
+{
+	int len;
+
+	while (bytes > 0) {
+		len = strlen(names);
+		add_one_xattr(wri, path, fd, ino, pos++, names, len, what);
+
+		names += len + 1;
+		bytes -= len + 1;
+	}
+
+	return pos;
+}
+
+/*
+ * Adding xattrs is supported for files and directories only.
+ *
+ * If the filesystem on which the path resides isn't scoutfs, we omit the
+ * scoutfs specific ioctl to fetch hidden xattrs.
+ *
+ * Untested if the hidden xattr ioctl works on directories or symlinks.
+ *
+ * Hidden xattrs are added first and the regular xattrs after them; both
+ * share one increasing position counter.  Any failure exits the process.
+ */
+static void add_xattrs(struct scoutfs_parallel_restore_writer *wri, char *path, u64 ino, bool is_scoutfs)
+{
+	struct scoutfs_ioctl_listxattr_hidden lxh;
+	char *buf = NULL;
+	int fd = -1;
+	int bytes;
+	int ret;
+	int pos = 0;
+
+	if (is_scoutfs) {
+		fd = open(path, O_RDONLY);
+		error_exit(fd < 0, "open"ERRF, ERRA);
+
+		memset(&lxh, 0, sizeof(lxh));
+		lxh.id_pos = 0;
+		lxh.hash_pos = 0;
+		lxh.buf_bytes = 256 * 1024;
+
+		buf = malloc(lxh.buf_bytes);
+		error_exit(!buf, "alloc xattr_hidden buf");
+		lxh.buf_ptr = (unsigned long)buf;
+
+		/* each call returns the next batch of names, 0 when done */
+		for (;;) {
+			ret = ioctl(fd, SCOUTFS_IOC_LISTXATTR_HIDDEN, &lxh);
+			if (ret == 0) /* done */
+				break;
+			error_exit(ret < 0, "listxattr_hidden"ERRF, ERRA);
+			error_exit(ret > lxh.buf_bytes, "listxattr_hidden overflow");
+			error_exit(buf[ret - 1] != '\0', "listxattr_hidden didn't term");
+
+			pos = add_xattr_list(wri, path, fd, ino, pos, buf, ret,
+					     "listxattr_hidden");
+		}
+
+		free(buf);
+		close(fd);
+	}
+
+	/* regular xattrs, by path */
+	bytes = listxattr(path, NULL, 0);
+	error_exit(bytes < 0, "listxattr size"ERRF, ERRA);
+	if (bytes == 0)
+		return;
+
+	buf = calloc(1, bytes);
+	error_exit(!buf, "alloc listxattr buf");
+
+	ret = listxattr(path, buf, bytes);
+	error_exit(ret < 0, "listxattr"ERRF, ERRA);
+	error_exit(ret > 0 && buf[ret - 1] != '\0', "listxattr didn't term");
+
+	/* ret can be 0 if the xattrs were removed since the size query */
+	add_xattr_list(wri, path, -1, ino, pos, buf, ret, "listxattr");
+
+	free(buf);
+}
+
+/*
+ * The same inode must not be stored more than once, so hardlinked files
+ * need special care.  The 'hardlinks' list remembers the inode number of
+ * every multiply linked file whose inode was already added.
+ *
+ * 'nlink' tells whether the file has more than one link.  Returns true if
+ * the caller has to add the inode item (single link, or first sighting of
+ * this inode number) and false if it was added before and only the
+ * directory entry is needed.
+ *
+ * The lookup is a linear list search, which is pretty awful; it should be
+ * a binary tree.
+ *
+ * XXX
+ *
+ * As long as we don't traverse filesystems we could drop an inode from the
+ * list once N entries of an N-linked inode were created, which would keep
+ * the list far more manageable.  That only needs a counter per item that
+ * is compared to the inode's link count.
+ */
+static bool is_new_inode_item(bool nlink, u64 ino)
+{
+	struct hardlink_head *next;
+	struct hardlink_head *cur;
+	struct hardlink_head *item;
+
+	/* an inode with a single link can't show up twice */
+	if (!nlink)
+		return true;
+
+	list_for_each_entry_safe(cur, next, &hardlinks, head) {
+		if (cur->ino == ino)
+			return false;
+	}
+
+	/* first sighting: remember it for the links still to come */
+	item = malloc(sizeof(*item));
+	error_exit(!item, "malloc");
+	item->ino = ino;
+	list_add_tail(&item->head, &hardlinks);
+
+	return true;
+}
+
+/*
+ * Create the inode data for a given path, duplicating the data of the
+ * source path as closely as possible.
+ *
+ * The path is lstat()ed, so a symlink describes itself rather than its
+ * target.  The inode number is taken from the source, except that the
+ * caller can force SCOUTFS_ROOT_INO by passing it as 'ino'.  Regular files
+ * with a non-zero size are marked offline.  For regular files on scoutfs
+ * the sequence numbers and the creation time come from
+ * SCOUTFS_IOC_STAT_MORE; otherwise the sequences are 0 and the creation
+ * time falls back to the ctime.
+ *
+ * '*nlink' is set to whether the path has more than one link.
+ *
+ * Returns a calloc()ed inode that the caller must free().  Any failure
+ * exits the process.
+ */
+static struct scoutfs_parallel_restore_inode *read_inode_data(char *path, u64 ino, bool *nlink, bool is_scoutfs)
+{
+	struct scoutfs_parallel_restore_inode *inode = NULL;
+	struct scoutfs_ioctl_stat_more stm;
+	struct stat st;
+	int ret;
+	int fd;
+
+	inode = calloc(1, sizeof(struct scoutfs_parallel_restore_inode));
+	error_exit(!inode, "failure allocating inode");
+
+	ret = lstat(path, &st);
+	error_exit(ret, "failure stat inode");
+
+	/* use exact inode numbers from path, except for root ino */
+	if (ino != SCOUTFS_ROOT_INO)
+		inode->ino = st.st_ino;
+	else
+		inode->ino = SCOUTFS_ROOT_INO;
+
+	inode->mode = st.st_mode;
+	inode->uid = st.st_uid;
+	inode->gid = st.st_gid;
+	inode->atime = st.st_atim;
+	inode->ctime = st.st_ctim;
+	inode->mtime = st.st_mtim;
+	inode->size = st.st_size;
+
+	inode->rdev = st.st_rdev;
+
+	/* scoutfs specific */
+	inode->meta_seq = 0;
+	inode->data_seq = 0;
+	inode->crtime = st.st_ctim;
+
+	if (S_ISREG(inode->mode)) {
+		if (inode->size > 0)
+			inode->offline = true;
+
+		if (is_scoutfs) {
+			fd = open(path, O_RDONLY);
+			/* open() reports failure with -1; 0 is a valid descriptor */
+			error_exit(fd < 0, "open failure"ERRF, ERRA);
+
+			ret = ioctl(fd, SCOUTFS_IOC_STAT_MORE, &stm);
+			error_exit(ret, "failure SCOUTFS_IOC_STAT_MORE inode");
+
+			inode->meta_seq = stm.meta_seq;
+			inode->data_seq = stm.data_seq;
+			inode->crtime = (struct timespec){.tv_sec = stm.crtime_sec, .tv_nsec = stm.crtime_nsec};
+
+			close(fd);
+		}
+
+	}
+
+	/* pass whether item is hardlinked or not */
+	*nlink = (st.st_nlink > 1);
+
+	return inode;
+}
+
+/* Everything the forked writer process needs; set up in do_restore(). */
+struct writer_args {
+	struct list_head head;	/* entry in do_restore()'s 'writers' list */
+
+	int dev_fd;		/* open metadata device */
+	int pair_fd;		/* socketpair end that results are written to */
+
+	struct scoutfs_parallel_restore_slice slice;	/* slice added to the writer */
+};
+
+/*
+ * Allocate and add the directory entry for 'ent', which was found at
+ * readdir position 'pos' of directory 'dir_ino'.  The entry struct and its
+ * name share one allocation that is freed again before returning.
+ *
+ * An allocation failure exits.  The result of
+ * scoutfs_parallel_restore_add_entry() is returned for the caller to
+ * report.
+ */
+static int add_dirent(struct scoutfs_parallel_restore_writer *wri, u64 dir_ino, size_t pos,
+		      struct dirent *ent, u64 mode)
+{
+	struct scoutfs_parallel_restore_entry *entry;
+	size_t name_len = strlen(ent->d_name);
+	int ret;
+
+	entry = malloc(sizeof(struct scoutfs_parallel_restore_entry) + name_len);
+	error_exit(!entry, "error allocating generated entry");
+
+	*entry = (struct scoutfs_parallel_restore_entry) {
+		.dir_ino = dir_ino,
+		.pos = pos,
+		.ino = ent->d_ino,
+		.mode = mode,
+		.name = (void *)(entry + 1),
+		.name_len = name_len,
+	};
+
+	memcpy(entry->name, ent->d_name, name_len);
+	ret = scoutfs_parallel_restore_add_entry(wri, entry);
+	free(entry);
+
+	return ret;
+}
+
+/*
+ * Recursively restore the directory tree rooted at 'path' into the writer.
+ *
+ * Every readdir entry gets a directory entry item in directory 'ino'.
+ * Subdirectories are recursed into, which also adds their inode.  Regular
+ * and special files get an inode unless is_new_inode_item() says it was
+ * already added through another hardlink.  Symlinks always get an inode
+ * that carries their target.  xattrs are copied for directories and
+ * regular files.  The inode of 'path' itself is added last, once the
+ * number of subdirectories and the total entry name length are known.
+ *
+ * Whenever more than BATCH_FILES files were created since the last send,
+ * the writer's blocks are flushed and a result is sent to the parent.
+ *
+ * Entries of type DT_UNKNOWN are not handled and make the tool exit, as
+ * does any other failure.
+ */
+static void restore_path(struct scoutfs_parallel_restore_writer *wri, struct writer_args *args, struct write_result *res, void *buf, char *path, u64 ino)
+{
+	struct scoutfs_parallel_restore_inode *inode;
+	DIR *dirp = NULL;
+	char *subdir = NULL;
+	char link[PATH_MAX + 1];
+	struct dirent *ent;
+	struct statfs stf;
+	int ret = 0;
+	int subdir_count = 0;
+	size_t ent_len = 0;
+	size_t pos = 0;
+	bool nlink = false;
+	char ind = '?';
+	u64 mode = 0;
+	bool is_scoutfs = false;
+
+	/* get fs info once per path */
+	ret = statfs(path, &stf);
+	error_exit(ret != 0, "statfs"ERRF, ERRA);
+	/* NOTE(review): presumably the scoutfs super magic - confirm */
+	is_scoutfs = (stf.f_type == 0x554f4353);
+
+	if (!is_scoutfs && !warn_scoutfs) {
+		warn_scoutfs = true;
+		fprintf(stderr, "Non-scoutfs source path detected: scoutfs specific features disabled\n");
+	}
+
+	/* traverse the entire tree */
+	dirp = opendir(path);
+	error_exit(!dirp, "opendir '%s'"ERRF, path, ERRA);
+
+	for (;;) {
+		/* a NULL from readdir() is only an error if it set errno */
+		errno = 0;
+		ent = readdir(dirp);
+		if (!ent)
+			break;
+
+		if (ent->d_type == DT_DIR) {
+			if ((strcmp(ent->d_name, ".") == 0) ||
+			    (strcmp(ent->d_name, "..") == 0)) {
+				/* position still matters */
+				pos++;
+				continue;
+			}
+
+			/* recurse into subdir */
+			ret = asprintf(&subdir, "%s/%s", path, ent->d_name);
+			error_exit(ret == -1, "asprintf subdir"ERRF, ERRA);
+			restore_path(wri, args, res, buf, subdir, ent->d_ino);
+
+			subdir_count++;
+
+			ent_len += strlen(ent->d_name);
+
+			ret = add_dirent(wri, ino, pos++, ent, DIR_MODE);
+			error_exit(ret, "add entry %d", ret);
+
+			add_xattrs(wri, subdir, ent->d_ino, is_scoutfs);
+
+			free(subdir);
+
+			le64_add_cpu(&res->dirs_created, 1);
+		} else if (ent->d_type == DT_REG) {
+			ent_len += strlen(ent->d_name);
+
+			ret = add_dirent(wri, ino, pos++, ent, REG_MODE);
+			error_exit(ret, "add entry %d", ret);
+
+			ret = asprintf(&subdir, "%s/%s", path, ent->d_name);
+			error_exit(ret == -1, "asprintf subdir"ERRF, ERRA);
+
+			/* file inode */
+			inode = read_inode_data(subdir, ent->d_ino, &nlink, is_scoutfs);
+			fprintf(stdout, "f %s/%s\n", path, ent->d_name);
+			if (is_new_inode_item(nlink, ent->d_ino)) {
+				ret = scoutfs_parallel_restore_add_inode(wri, inode);
+				error_exit(ret, "add reg file inode %d", ret);
+
+				/* xattrs */
+				add_xattrs(wri, subdir, ent->d_ino, is_scoutfs);
+			}
+			free(inode);
+
+			free(subdir);
+
+			le64_add_cpu(&res->files_created, 1);
+		} else if (ent->d_type == DT_LNK) {
+			ret = asprintf(&subdir, "%s/%s", path, ent->d_name);
+			error_exit(ret == -1, "asprintf subdir"ERRF, ERRA);
+
+			ent_len += strlen(ent->d_name);
+
+			ret = readlink(subdir, link, PATH_MAX);
+			error_exit(ret < 0, "readlink %d", ret);
+			/* must 0-terminate if we want to print it */
+			link[ret] = 0;
+
+			ret = add_dirent(wri, ino, pos++, ent, LNK_MODE);
+			error_exit(ret, "add symlink entry %d", ret);
+
+			/*
+			 * link inode
+			 *
+			 * NOTE(review): unlike the other file types this is
+			 * not filtered through is_new_inode_item(), so a
+			 * hardlinked symlink would add its inode twice -
+			 * confirm whether that can happen in practice.
+			 */
+			inode = read_inode_data(subdir, ent->d_ino, &nlink, is_scoutfs);
+
+			fprintf(stdout, "l %s/%s -> %s\n", path, ent->d_name, link);
+
+			inode->mode = LNK_MODE;
+			inode->target = link;
+			inode->target_len = strlen(link) + 1; /* scoutfs null terminates symlinks */
+
+			ret = scoutfs_parallel_restore_add_inode(wri, inode);
+			error_exit(ret, "add syml inode %d", ret);
+
+			free(inode);
+			free(subdir);
+
+			le64_add_cpu(&res->files_created, 1);
+		} else {
+			/* odd stuff */
+			switch(ent->d_type) {
+			case DT_CHR:
+				ind = 'c';
+				mode = S_IFCHR;
+				break;
+			case DT_BLK:
+				ind = 'b';
+				mode = S_IFBLK;
+				break;
+			case DT_FIFO:
+				ind = 'p';
+				mode = S_IFIFO;
+				break;
+			case DT_SOCK:
+				ind = 's';
+				mode = S_IFSOCK;
+				break;
+			default:
+				error_exit(true, "Unknown readdir entry type");
+			}
+
+			ent_len += strlen(ent->d_name);
+
+			ret = add_dirent(wri, ino, pos++, ent, mode);
+			error_exit(ret, "add entry %d", ret);
+
+			ret = asprintf(&subdir, "%s/%s", path, ent->d_name);
+			error_exit(ret == -1, "asprintf subdir"ERRF, ERRA);
+
+			/* file inode */
+			inode = read_inode_data(subdir, ent->d_ino, &nlink, is_scoutfs);
+			fprintf(stdout, "%c %lld %s/%s\n", ind, inode->ino, path, ent->d_name);
+			if (is_new_inode_item(nlink, ent->d_ino)) {
+				ret = scoutfs_parallel_restore_add_inode(wri, inode);
+				error_exit(ret, "add reg file inode %d", ret);
+			}
+			free(inode);
+
+			free(subdir);
+
+			le64_add_cpu(&res->files_created, 1);
+		}
+
+		/* batch out changes, will be about 1M */
+		if (le64_to_cpu(res->files_created) > BATCH_FILES) {
+			write_bufs_and_send(wri, buf, args->dev_fd, res, false, args->pair_fd);
+		}
+
+	}
+	/* the loop only ends on a NULL readdir(), errno tells why */
+	error_exit(errno, "readdir"ERRF, ERRA);
+	closedir(dirp);
+
+	/* create the dir itself */
+	inode = read_inode_data(path, ino, &nlink, is_scoutfs);
+	inode->nr_subdirs = subdir_count;
+	inode->total_entry_name_bytes = ent_len;
+	fprintf(stdout, "d %s\n", path);
+
+	ret = scoutfs_parallel_restore_add_inode(wri, inode);
+	error_exit(ret, "add dir inode %d", ret);
+
+	free(inode);
+
+	/* No need to send, we'll send final after last directory is complete */
+}
+
+/*
+ * Restore the tree at opts->source_dir into the metadata device at
+ * opts->meta_path.
+ *
+ * The parent reads the super block, creates a writer and splits the work
+ * into two slices.  It keeps slices[0] for itself and forks one writer
+ * process that gets slices[1], walks the source tree and writes its blocks
+ * to the device.  The child reports progress over a socketpair; the parent
+ * merges those reports, writes out its own blocks once the child sent its
+ * final message and then writes the updated super block.
+ *
+ * Returns 0.  Any failure exits the process.
+ */
+static int do_restore(struct opts *opts)
+{
+	struct scoutfs_parallel_restore_writer *pwri, *wri = NULL;
+	struct scoutfs_parallel_restore_slice *slices = NULL;
+	struct scoutfs_super_block *super = NULL;
+	struct writer_args *args;
+	struct write_result res;
+	int pair[2] = {-1, -1};
+	LIST_HEAD(writers);
+	void *buf = NULL;
+	void *bufp = NULL;
+	int dev_fd = -1;
+	pid_t pid;
+	int ret;
+	u64 tot_bytes;
+	u64 tot_dirs;
+	u64 tot_files;
+
+	ret = socketpair(PF_LOCAL, SOCK_STREAM, 0, pair);
+	error_exit(ret, "socketpair error "ERRF, ERRA);
+
+	dev_fd = open(opts->meta_path, O_DIRECT | (O_RDWR|O_EXCL));
+	error_exit(dev_fd < 0, "error opening '%s': "ERRF, opts->meta_path, ERRA);
+
+	/* O_DIRECT needs aligned buffers */
+	errno = posix_memalign((void **)&super, 4096, SCOUTFS_BLOCK_SM_SIZE) ?:
+		posix_memalign((void **)&buf, 4096, BUF_SIZ);
+	error_exit(errno, "error allocating block bufs "ERRF, ERRA);
+
+	ret = pread(dev_fd, super, SCOUTFS_BLOCK_SM_SIZE,
+		    SCOUTFS_SUPER_BLKNO << SCOUTFS_BLOCK_SM_SHIFT);
+	error_exit(ret != SCOUTFS_BLOCK_SM_SIZE, "error reading super, ret %d", ret);
+
+	error_exit((super->flags & SCOUTFS_FLAG_IS_META_BDEV) == 0, "super block is not meta dev");
+
+	ret = scoutfs_parallel_restore_create_writer(&wri);
+	error_exit(ret, "create writer %d", ret);
+
+	ret = scoutfs_parallel_restore_import_super(wri, super, dev_fd);
+	error_exit(ret, "import super %d", ret);
+
+	slices = calloc(2, sizeof(struct scoutfs_parallel_restore_slice));
+	error_exit(!slices, "alloc slices");
+
+	scoutfs_parallel_restore_init_slices(wri, slices, 2);
+
+	ret = scoutfs_parallel_restore_add_slice(wri, &slices[0]);
+	error_exit(ret, "add slices[0] %d", ret);
+
+	args = calloc(1, sizeof(struct writer_args));
+	error_exit(!args, "alloc writer args");
+
+	args->dev_fd = dev_fd;
+	args->slice = slices[1];
+	args->pair_fd = pair[1];
+	list_add_tail(&args->head, &writers);
+
+	/* fork writer process */
+	pid = fork();
+	error_exit(pid == -1, "fork error");
+
+	if (pid == 0) {
+		/* the child only ever writes to its end of the pair */
+		close(pair[0]);
+
+		ret = prctl(PR_SET_PDEATHSIG, SIGHUP);
+		error_exit(ret < 0, "failed to set parent death sig");
+
+		errno = posix_memalign((void **)&bufp, 4096, BUF_SIZ);
+		error_exit(errno, "error allocating block bufp "ERRF, ERRA);
+
+		ret = scoutfs_parallel_restore_create_writer(&pwri);
+		error_exit(ret, "create pwriter %d", ret);
+
+		ret = scoutfs_parallel_restore_add_slice(pwri, &args->slice);
+		error_exit(ret, "add pslice %d", ret);
+
+		memset(&res, 0, sizeof(res));
+
+		restore_path(pwri, args, &res, bufp, opts->source_dir, SCOUTFS_ROOT_INO);
+
+		res.complete = true;
+
+		/* final flush, this time also handing the slice back */
+		write_bufs_and_send(pwri, bufp, args->dev_fd, &res, true, args->pair_fd);
+
+		scoutfs_parallel_restore_destroy_writer(&pwri);
+		free(bufp);
+
+		exit(0);
+	}
+
+	/*
+	 * Drop our copy of the child's end.  If the child exits early the
+	 * read() below then sees EOF and fails instead of blocking forever.
+	 */
+	close(pair[1]);
+	pair[1] = -1;
+
+	/* read results and wait for writer to finish */
+	tot_bytes = 0;
+	tot_dirs = 1;
+	tot_files = 0;
+	for (;;) {
+		ret = read(pair[0], &res, sizeof(struct write_result));
+		error_exit(ret != sizeof(struct write_result), "result read error %d", ret);
+
+		ret = scoutfs_parallel_restore_add_progress(wri, &res.prog);
+		error_exit(ret, "add thr prog %d", ret);
+
+		/* count every message, including the final one */
+		tot_bytes += le64_to_cpu(res.bytes_written);
+		tot_files += le64_to_cpu(res.files_created);
+		tot_dirs += le64_to_cpu(res.dirs_created);
+
+		if (res.slice.meta_len != 0) {
+			ret = scoutfs_parallel_restore_add_slice(wri, &res.slice);
+			error_exit(ret, "add thr slice %d", ret);
+		}
+
+		if (res.complete)
+			break;
+	}
+
+	tot_bytes += write_bufs(wri, buf, args->dev_fd);
+
+	fprintf(stdout, "Wrote %lld directories, %lld files, %lld bytes total\n",
+		tot_dirs, tot_files, tot_bytes);
+
+	/* write super to finalize */
+	ret = scoutfs_parallel_restore_export_super(wri, super);
+	error_exit(ret, "update super %d", ret);
+
+	ret = pwrite(dev_fd, super, SCOUTFS_BLOCK_SM_SIZE,
+		     SCOUTFS_SUPER_BLKNO << SCOUTFS_BLOCK_SM_SHIFT);
+	error_exit(ret != SCOUTFS_BLOCK_SM_SIZE, "error writing super, ret %d", ret);
+
+	scoutfs_parallel_restore_destroy_writer(&wri);
+
+	if (dev_fd >= 0)
+		close(dev_fd);
+	if (pair[0] >= 0)
+		close(pair[0]);
+	if (pair[1] >= 0)
+		close(pair[1]);
+	free(super);
+	free(args);
+	free(slices);
+	free(buf);
+
+	return 0;
+}
+
+/*
+ * Parse the command line, run the restore and release what main() owns.
+ *
+ * -m and -s are both required.  -b takes an argument and is accepted by
+ * getopt() but otherwise ignored.  Exits with status 1 on an unknown
+ * option or a missing required option; otherwise returns 0 if do_restore()
+ * returned 0 and 1 if not.
+ */
+int main(int argc, char **argv)
+{
+	struct opts opts = { 0 };
+	struct hardlink_head *next;
+	struct hardlink_head *cur;
+	int status;
+	int opt;
+
+	INIT_LIST_HEAD(&hardlinks);
+
+	for (;;) {
+		opt = getopt(argc, argv, "b:m:s:");
+		if (opt == -1)
+			break;
+
+		if (opt == 'm') {
+			opts.meta_path = strdup(optarg);
+		} else if (opt == 's') {
+			opts.source_dir = strdup(optarg);
+		} else if (opt == '?') {
+			printf("Unknown option '%c'\n", optopt);
+			usage();
+			exit(1);
+		}
+	}
+
+	error_exit(!opts.meta_path, "must specify metadata device path with -m");
+	error_exit(!opts.source_dir, "must specify source directory path with -s");
+
+	status = do_restore(&opts);
+
+	free(opts.meta_path);
+	free(opts.source_dir);
+
+	/* drop the hardlink tracking list */
+	list_for_each_entry_safe(cur, next, &hardlinks, head) {
+		list_del_init(&cur->head);
+		free(cur);
+	}
+
+	return status != 0;
+}
@@ -0,0 +1,110 @@
+
+#
+# test basic POSIX acl functionality.
+#
+
+t_require_commands stat rm touch mkdir getfacl setfacl id sudo
+t_require_mounts 2
+
+# from quota.sh
+TEST_UID=22222
+TEST_GID=44444
+
+# sys_setre[ug]id() sets fs[ug]id to the new e[ug]id
+SET_UID="--ruid=$TEST_UID --euid=$TEST_UID"
+SET_GID="--rgid=$TEST_GID --egid=$TEST_GID --clear-groups"
+
+# helper to avoid capturing dates from ls output; "$@" is quoted so that
+# each path is passed to stat as one argument, whatever it contains
+L() {
+	stat -c "%F %A %u %g %s %N" "$@"
+}
+
+# All paths below are relative to the first mount's test directory.
+echo "== setup test directory"
+cd "$T_D0"
+
+echo "== getfacl"
+L .
+getfacl .
+
+# The same two touches run before and after "chmod g+w", directly and
+# through a symlink to the directory.  Presumably the first pair is expected
+# to fail and the second to succeed - confirm against the golden output.
+echo "== basic non-acl access through permissions"
+rm -rf dir-testuid
+mkdir dir-testuid
+ln -sf dir-testuid symlinkdir-testuid
+chown root:44444 dir-testuid
+L dir-testuid
+setpriv $SET_UID $SET_GID touch dir-testuid/file-group-write
+setpriv $SET_UID $SET_GID touch symlinkdir-testuid/symlink-file-group-write
+chmod g+w dir-testuid
+setpriv $SET_UID $SET_GID touch dir-testuid/file-group-write
+setpriv $SET_UID $SET_GID touch symlinkdir-testuid/symlink-file-group-write
+L dir-testuid/file-group-write
+L symlinkdir-testuid/symlink-file-group-write
+
+# Same pattern, but access is granted with a user ACL entry instead of
+# the mode bits; only the uid is switched here.
+echo "== basic acl access"
+rm -rf dir-root
+mkdir dir-root
+ln -sf dir-root symlinkdir-root
+L dir-root
+setpriv $SET_UID touch dir-root/file-group-write
+setpriv $SET_UID touch symlinkdir-root/file-group-write
+setfacl -m u:22222:rwx dir-root
+getfacl dir-root
+setpriv $SET_UID touch dir-root/file-group-write
+setpriv $SET_UID touch symlinkdir-root/file-group-write
+L dir-root/file-group-write
+L symlinkdir-root/file-group-write
+
+# "2>&-" closes stderr so that only the Success/Failed markers are output.
+# The user entry is reduced from rwx to rw in between, then a group entry
+# with rwx is added and only the gid is switched.
+echo "== directory exec"
+setpriv $SET_UID bash -c "cd dir-root 2>&- && echo Success"
+setpriv $SET_UID bash -c "cd symlinkdir-root 2>&- && echo Success"
+setfacl -m u:22222:rw dir-root
+getfacl dir-root
+setpriv $SET_UID bash -c "cd dir-root 2>&- || echo Failed"
+setpriv $SET_UID bash -c "cd symlinkdir-root 2>&- || echo Failed"
+setfacl -m g:44444:rwx dir-root
+getfacl dir-root
+setpriv $SET_GID bash -c "cd dir-root 2>&- && echo Success"
+setpriv $SET_GID bash -c "cd symlinkdir-root 2>&- && echo Success"
+
+# user.* xattrs are set and dumped as the test user while its ACL entry is
+# absent, added (rw) and removed again, and finally through a group entry.
+echo "== get/set attr"
+rm -rf file-root
+touch file-root
+L file-root
+setpriv $SET_UID getfattr -d file-root
+setpriv $SET_UID setfattr -n "user.test1" -v "Success" file-root
+setpriv $SET_UID getfattr -d file-root
+setfacl -m u:22222:rw file-root
+getfacl file-root
+setpriv $SET_UID setfattr -n "user.test2" -v "Success" file-root
+setpriv $SET_UID getfattr -d file-root
+setfacl -x u:22222 file-root
+getfacl file-root
+setpriv $SET_UID setfattr -n "user.test3" -v "Success" file-root
+setpriv $SET_UID getfattr -d file-root
+setfacl -m g:44444:rw file-root
+getfacl file-root
+setpriv $SET_GID setfattr -n "user.test4" -v "Success" file-root
+setpriv $SET_GID getfattr -d file-root
+
+# mkdir/touch are attempted with no ACL, with only a default ("d:") entry
+# and finally with an additional access entry on the parent directory.
+echo "== inheritance / default acl"
+rm -rf dir-root2
+mkdir dir-root2
+L dir-root2
+setpriv $SET_UID mkdir dir-root2/dir
+setpriv $SET_UID touch dir-root2/dir/file
+setfacl -m d:u:22222:rwx dir-root2
+getfacl dir-root2
+setpriv $SET_UID mkdir dir-root2/dir
+setpriv $SET_UID touch dir-root2/dir/file
+setfacl -m u:22222:rwx dir-root2
+getfacl dir-root2
+setpriv $SET_UID mkdir dir-root2/dir
+setpriv $SET_UID touch dir-root2/dir/file
+L dir-root2/dir
+getfacl dir-root2/dir
+L dir-root2/dir/file
+getfacl dir-root2/dir/file
+
+# only the section header is printed, nothing is removed here
+echo "== cleanup"
+
+t_pass
@@ -3,13 +3,13 @@
 # operations in one mount and verify the results in another.
 #

-t_require_commands getfattr setfattr dd filefrag diff touch stat scoutfs
+t_require_commands getfattr setfattr dd diff touch stat scoutfs
 t_require_mounts 2

 GETFATTR="getfattr --absolute-names"
 SETFATTR="setfattr"
 DD="dd status=none"
-FILEFRAG="filefrag -v -b4096"
+FIEMAP="scoutfs get-fiemap"

 echo "== root inode updates flow back and forth"
 sleep 1
@@ -55,8 +55,8 @@ for i in $(seq 1 10); do
 		conv=notrunc oflag=append &
 	wait
 done
-$FILEFRAG "$T_D0/file" | t_filter_fs > "$T_TMP.0"
-$FILEFRAG "$T_D1/file" | t_filter_fs > "$T_TMP.1"
+$FIEMAP "$T_D0/file" > "$T_TMP.0"
+$FIEMAP "$T_D1/file" > "$T_TMP.1"
 diff -u "$T_TMP.0" "$T_TMP.1"

 echo "== unlinked file isn't found"
@@ -210,4 +210,7 @@ done
 wait
 ls "$T_D0/concurrent"

+echo "== cleanup"
+rm -f "$T_TMP.0" "$T_TMP.1"
+
 t_pass
@@ -73,4 +73,7 @@ test "$large_tot" -gt "$equal_tot" ; echo "resized larger test rc: $?"
 umount "$SCR"
 losetup -d "$scr_loop"

+echo "== cleanup"
+rm -f "$T_TMP.small" "$T_TMP.equal" "$T_TMP.large"
+
 t_pass
@@ -28,7 +28,7 @@ while [ "$SECONDS" -lt "$END" ]; do
        for i in $(t_fs_nrs); do
                if [ "$i" -ge "$quorum_nr" ]; then
                        t_umount $i &
-                        echo "umount $i pid $pid quo $quorum_nr" \
+                        echo "umount $i rid $rid quo $quorum_nr" \
                                >> $T_TMP.log
                        mounted[$i]=0
                fi
@@ -53,6 +53,9 @@ while [ "$SECONDS" -lt "$END" ]; do

        for i in "${lock_arr[@]}"; do
                if [[ ! " ${rid_arr[*]} " =~ " $i " ]]; then
+                        echo -e "RID($i) exists" >> $T_TMP.log
+                        echo -e "rid_arr:\n${rid_arr[@]}" >> $T_TMP.log
+                        echo -e "lock_arr:\n${lock_arr[@]}" >> $T_TMP.log
                        t_fail "RID($i): exists when not mounted"
                fi
        done
@@ -2,7 +2,7 @@
 # Test clustered parallel createmany
 #

-t_require_commands mkdir createmany
+t_require_commands mkdir createmany bc
 t_require_mounts 2

 COUNT=50000
@@ -17,14 +17,14 @@ mkdir -p $T_D0/dir/0
 mkdir $T_D1/dir/1

 echo "== measure initial createmany"
-START=$SECONDS
+START=$(date +%s.%N)
 createmany -o "$T_D0/file_" $COUNT >> $T_TMP.full
 sync
-SINGLE=$((SECONDS - START))
-echo single $SINGLE >> $T_TMP.full
+END=$(date +%s.%N)
+SINGLE=$(echo "$END - $START" | bc)

 echo "== measure two concurrent createmany runs"
-START=$SECONDS
+START=$(date +%s.%N)
 (cd $T_D0/dir/0; createmany -o ./file_ $COUNT > /dev/null) &
 pids="$!"
 (cd $T_D1/dir/1; createmany -o ./file_ $COUNT > /dev/null) &
@@ -33,7 +33,9 @@ for p in $pids; do
        wait $p
 done
 sync
-BOTH=$((SECONDS - START))
+END=$(date +%s.%N)
+BOTH=$(echo "$END - $START" | bc)
+
 echo both $BOTH >> $T_TMP.full

 # Multi node still adds significant overhead, even with our CW locks
@@ -44,7 +46,7 @@ echo both $BOTH >> $T_TMP.full
 # exceed this factor should the CW locked items go back to fully
 # synchronized operation.
 FACTOR=200
-if [ "$BOTH" -gt $(($SINGLE*$FACTOR)) ]; then
+if [ $(echo "$BOTH > ( $SINGLE * $FACTOR )" | bc) == "1" ]; then
 	t_fail "both createmany took $BOTH sec, more than $FACTOR x single $SINGLE sec"
 fi

@@ -4,7 +4,16 @@
 # merge adjacent consecutive allocations.  (we don't have multiple
 # allocation cursors)
 #
-t_require_commands scoutfs stat filefrag dd touch truncate
+t_require_commands scoutfs stat dd touch truncate
+
+#
+# Print one "<field 3>.. <field 6>: <flags>" line per line of
+# "scoutfs get-fiemap" output for file $1, skipping lines whose first
+# field is "extents:".  The flags are taken from field 8: a "U" as its
+# second character prints "unwritten", an "L" as its third prints "eof".
+# NOTE(review): presumably fields 3 and 6 are the logical start and the
+# length of the extent - confirm against the get-fiemap output format.
+#
+get_fiemap()
+{
+	scoutfs get-fiemap "$1" | awk '($1 != "extents:") {
+			unwritten = (substr($8, 2, 1) == "U") ? "unwritten" : "";
+			eof = (substr($8, 3, 1) == "L") ? "eof" : "";
+			print $3 ".. " $6 ": " unwritten eof;
+			};'
+}

 write_block()
 {
@@ -76,26 +85,9 @@ print_extents_found()
 {
 	local prefix="$1"

-	filefrag "$prefix"* 2>&1 | grep "extent.*found" | t_filter_fs
-}
-
-#
-# print the logical start, len, and flags if they're there.
-#
-print_logical_extents()
-{
-	local file="$1"
-
-	filefrag -v -b4096 "$file" 2>&1 | t_filter_fs | awk '
-		($1 ~ /[0-9]+:/) {
-			if ($NF !~  /[0-9]+:/) {
-				flags=$NF
-			} else {
-				flags=""
-			}
-			print $2, $6, flags
-		}
-	' | sed 's/last,eof/eof/'
+	for f in "$prefix"-*; do
+		echo "$f: $(scoutfs get-fiemap "$f" | tail -n 1)" | t_filter_fs
+	done
 }

 t_save_all_sysfs_mount_options data_prealloc_blocks
@@ -197,7 +189,7 @@ for sides in 0 1 2 3; do
 done

 echo before:
-print_logical_extents "$prefix"
+get_fiemap "$prefix"

 # now write into the first, middle, and last empty block of each
 t_set_sysfs_mount_option 0 data_prealloc_contig_only 0
@@ -223,7 +215,7 @@ for sides in 0 1 2 3; do
 			# mid (both has 6 blocks internally)
 			2) write_block $prefix $((left + 3)) ;;
 		esac
-		print_logical_extents "$prefix"
+		get_fiemap "$prefix"
 		((base+=8))
 	done
 done
@@ -11,6 +11,11 @@
 # format version.
 #

+# not supported on el9 and later! (compare the whole major version, not just its first digit)
+if [ "$(source /etc/os-release ; echo "${VERSION_ID%%.*}")" -gt 8 ]; then
+	t_skip_permitted "Unsupported OS version"
+fi
+
 mount_has_format_version()
 {
 	local mnt="$1"
@@ -83,9 +83,9 @@ touch "$OTHER"
 ln "$FROM" "$HARD"

 echo "== wrapped offsets should fail"
-HUGE=0x8000000000000000
-scoutfs move-blocks "$FROM" -f "$HUGE" -l "$HUGE" "$TO" -t 0 2>&1 | t_filter_fs
-scoutfs move-blocks "$FROM" -f 0 -l "$HUGE" "$TO" -t "$HUGE" 2>&1 | t_filter_fs
+HUGE=0xfffffffffffff000
+scoutfs move-blocks "$FROM" -f "$HUGE" -l "8192" "$TO" -t 0 2>&1 | t_filter_fs
+scoutfs move-blocks "$FROM" -f 0 -l "$HUGE" "$TO" -t "8192" 2>&1 | t_filter_fs

 echo "== specifying same file fails"
 scoutfs move-blocks "$FROM" -f 0 -l "$BS" "$HARD" -t 0 2>&1 | t_filter_fs
@@ -0,0 +1,78 @@
+#
+# validate parallel restore library
+#
+
+t_require_commands scoutfs parallel_restore find xargs
+
+SCR="$T_TMPDIR/mnt.scratch"
+mkdir -p "$SCR"
+
+# mkfs the scratch devices; caller arguments are passed through to mkfs intact
+scratch_mkfs() {
+	scoutfs mkfs "$@" \
+		-A -f -Q 0,127.0.0.1,53000 "$T_EX_META_DEV" "$T_EX_DATA_DEV"
+}
+
+# run the offline check against the scratch devices, saving debug output
+scratch_check() {
+	# give ample time for writes to commit
+	sleep 1
+	sync
+	scoutfs check -d "${T_TMPDIR}/check.debug" "$T_EX_META_DEV" "$T_EX_DATA_DEV"
+}
+
+# mount the scratch file system on $SCR as quorum slot 0
+scratch_mount() {
+	mount -t scoutfs -o "metadev_path=$T_EX_META_DEV,quorum_slot_nr=0" \
+		"$T_EX_DATA_DEV" "$SCR"
+}
+
+
+echo "== simple mkfs/restore/mount"
+# meta device just big enough for reserves and the metadata we'll fill
+scratch_mkfs -V 2 -m 10G -d 60G > $T_TMP.mkfs.out 2>&1 || t_fail "mkfs failed"
+parallel_restore -m "$T_EX_META_DEV" > /dev/null || t_fail "parallel_restore"
+scratch_check || t_fail "check failed"
+scratch_mount
+
+scoutfs statfs -p "$SCR" | grep -v -e 'fsid' -e 'rid'
+find "$SCR" -exec scoutfs list-hidden-xattrs {} \; | wc
+scoutfs search-xattrs -p "$SCR" scoutfs.hide.srch.sam_vol_F01030L6 -p "$SCR" | wc
+find "$SCR" -type f -name "file-*" | head -n 4 | xargs -n 1 scoutfs get-fiemap -L
+scoutfs df -p "$SCR"
+scoutfs quota-list -p "$SCR"
+umount "$SCR"
+scratch_check || t_fail "check after mount failed"
+
+echo "== just under ENOSPC"
+scratch_mkfs -V 2 -m 10G -d 60G > $T_TMP.mkfs.out 2>&1 || t_fail "mkfs failed"
+parallel_restore -m "$T_EX_META_DEV" -n 3000000 > /dev/null || t_fail "parallel_restore"
+scratch_check || t_fail "check failed"
+scratch_mount
+scoutfs df -p "$SCR"
+umount "$SCR"
+scratch_check || t_fail "check after mount failed"
+
+echo "== just over ENOSPC"
+scratch_mkfs -V 2 -m 10G -d 60G > $T_TMP.mkfs.out 2>&1 || t_fail "mkfs failed"
+parallel_restore -m "$T_EX_META_DEV" -n 3500000 | grep died 2>&1 && t_fail "parallel_restore"
+scratch_check || t_fail "check failed"
+
+echo "== ENOSPC"
+scratch_mkfs -V 2 -m 10G -d 60G > $T_TMP.mkfs.out 2>&1 || t_fail "mkfs failed"
+parallel_restore -m "$T_EX_META_DEV" -d 600:1000 -f 600:1000 -n 4000000 | grep died 2>&1 && t_fail "parallel_restore"
+
+echo "== attempt to restore data device"
+scratch_mkfs -V 2 -m 10G -d 60G > $T_TMP.mkfs.out 2>&1 || t_fail "mkfs failed"
+parallel_restore -m "$T_EX_DATA_DEV" | grep died 2>&1 && t_fail "parallel_restore"
+
+echo "== attempt format_v1 restore"
+scratch_mkfs -V 1 -m 10G -d 60G > $T_TMP.mkfs.out 2>&1 || t_fail "mkfs failed"
+parallel_restore -m "$T_EX_META_DEV" | grep died 2>&1 && t_fail "parallel_restore"
+
+echo "== test if previously mounted"
+scratch_mkfs -V 2 -m 10G -d 60G > $T_TMP.mkfs.out 2>&1 || t_fail "mkfs failed"
+mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 \
+	"$T_EX_DATA_DEV" "$SCR"
+umount "$SCR"
+parallel_restore -m "$T_EX_META_DEV" | grep died 2>&1 && t_fail "parallel_restore"
+
+echo "== cleanup"
+rmdir "$SCR"
+t_pass
@@ -22,7 +22,7 @@ reset_all()
 	getfattr --absolute-names -d -m - "$T_D0" | \
 		grep "^scoutfs.totl." | \
 		cut -d '=' -f 1 | \
-		xargs -n 1 -I'{}'  setfattr -x '{}' "$T_D0"
+		xargs -I'{}'  setfattr -x '{}' "$T_D0"
 }

 echo "== prepare dir with write perm for test ids"
@@ -147,4 +147,7 @@ sync_and_drop
 setpriv $SET_UID touch "$FILE" 2>&1 | t_filter_fs
 reset_all

+echo "== cleanup"
+rm -f "$T_TMP.lots" "$T_TMP.list"
+
 t_pass
@@ -2,7 +2,7 @@
 # Test correctness of the setattr_more ioctl.
 #

-t_require_commands filefrag scoutfs touch mkdir rm stat mknod
+t_require_commands scoutfs touch mkdir rm stat mknod

 FILE="$T_D0/file"

@@ -55,17 +55,10 @@ scoutfs setattr -t 67305985.999999999 -V 1 -s 1 "$FILE" 2>&1 | t_filter_fs
 TZ=GMT stat -c "%z" "$FILE"
 rm "$FILE"

-#
-# With e2fsprogs-v1.42.10-10-g29758d2f, the output of filefrag 'flags' changes
-# significantly. First, the _LAST flag is now output. Second, the 'unknown'
-# flag is now printed out as 'unknown_loc'. To compensate for this, we check
-# and replace the "correct" output for new versions here with the expected
-# value.
-#
 echo "== large offline extents are created"
 touch "$FILE"
 scoutfs setattr -V 1 -o -s $((10007 * 4096)) "$FILE" 2>&1 | t_filter_fs
-filefrag -v -b4096 "$FILE" 2>&1 | sed 's/last,unknown_loc,eof$/unknown,eof/' | t_filter_fs
+scoutfs get-fiemap "$FILE"
 rm "$FILE"

 # had a bug where we were creating extents that were too long
@@ -75,4 +68,11 @@ scoutfs setattr -V 1 -o -s 4000000000 "$FILE" 2>&1 | t_filter_fs
 scoutfs stat -s offline_blocks "$FILE"
 rm "$FILE"

+# Do not fail if data_version is unset - the unset `0` value should not
+# be passed down to attr_x handling code which will -EINVAL on that.
+echo "== omitting data_version should not fail"
+touch "$FILE"
+scoutfs setattr -s 0 -t 1725670311.0 -r 1725670311.0 "$FILE"
+rm "$FILE"
+
 t_pass
@@ -7,6 +7,7 @@ t_require_commands xfs_io filefrag scoutfs mknod
 # this test wants to ignore unwritten extents
 fiemap_file() {
 	filefrag -v -b4096 "$1" | grep -v "unwritten"
+	scoutfs get-fiemap "$1" | grep -v 'flags:.*U'
 }

 create_file() {
@@ -61,7 +62,10 @@ echo "== release past i_size is fine"
 release_vers "$FILE" stat 400K 4K

 echo "== wrapped blocks fails"
-release_vers "$FILE" stat $vers 0x8000000000000000 0x8000000000000000
+# just under!
+release_vers "$FILE" stat $vers 0xfffffffffffff000 4096
+# this goes over
+release_vers "$FILE" stat $vers 0xfffffffffffff000 8192

 echo "== releasing non-file fails"
 mknod "$CHAR" c 1 3
@@ -105,25 +109,20 @@ for c in $(seq 0 4); do
 		fi
 	done

-	start=$(fiemap_file "$FILE" | \
-		awk '($1 == "0:"){print substr($4, 0, length($4)- 2)}')
-
 	release_vers "$FILE" stat $(($a * 4))K 4K
 	release_vers "$FILE" stat $(($b * 4))K 4K
 	release_vers "$FILE" stat $(($c * 4))K 4K

 	echo -n "$a $b $c:"

-	fiemap_file "$FILE" | \
-		awk 'BEGIN{ORS=""}($1 == (NR - 4)":") {
-			off=substr($2, 0, length($2)- 2);
-			phys=substr($4, 0, length($4)- 2);
-			if (phys > 100) {
-				phys = phys - phys + 100 + off;
-			}
-			len=substr($6, 0, length($6)- 1);
-			print "  (" off, phys, len ")";
-		}'
+	scoutfs get-fiemap "$FILE" | \
+		awk 'BEGIN{ORS=""}($1 != "extents:") {
+			off=$3;
+			len=$6;
+			phys=substr($8, 0, 1);
+			phys = (phys == ".") ? off + 100 : 0;
+			print "  (" off, phys, len ")"
+		};'
 	echo

 	rm "$FILE"
@@ -2,11 +2,7 @@
 # Test correctness of the staging operation
 #

-t_require_commands filefrag dd scoutfs cp cmp rm
-
-fiemap_file() {
-	filefrag -v -b4096 "$1"
-}
+t_require_commands dd scoutfs cp cmp rm

 create_file() {
 	local file="$1"
@@ -62,7 +58,7 @@ create_file "$FILE" 4096
 cp "$FILE"  "$T_TMP"
 release_vers "$FILE" stat 0 4K
 # make sure there only offline extents
-fiemap_file "$FILE" | grep "^[ 0-9]*:" | grep -v "unknown"
+scoutfs get-fiemap "$FILE"
 stage_vers "$FILE" stat 0 4096 "$T_TMP"
 cmp "$FILE" "$T_TMP"
 rm -f "$FILE"
@@ -72,7 +68,7 @@ create_file "$FILE" $((4096 * 4096))
 cp "$FILE"  "$T_TMP"
 release_vers "$FILE" stat 0 16M
 # make sure there only offline extents
-fiemap_file "$FILE" | grep "^[ 0-9]*:" | grep -v "unknown"
+scoutfs get-fiemap "$FILE"
 stage_vers "$FILE" stat 0 $((4096 * 4096)) "$T_TMP"
 cmp "$FILE" "$T_TMP"
 rm -f "$FILE"
@@ -143,8 +139,8 @@ hexdump -C "$FILE"
 rm -f "$FILE"

 echo "== wrapped region fails"
-create_file "$FILE" 4096
-stage_vers "$FILE" stat 0xFFFFFFFFFFFFF000 4096 /dev/zero
+create_file "$FILE" 8192
+stage_vers "$FILE" stat 0xFFFFFFFFFFFFF000 8192 /dev/zero
 rm -f "$FILE"

 echo "== non-block aligned offset fails"
@@ -152,7 +148,7 @@ create_file "$FILE" 4096
 cp "$FILE"  "$T_TMP"
 release_vers "$FILE" stat 0 4K
 stage_vers "$FILE" stat 1 4095 "$T_TMP"
-fiemap_file "$FILE" | grep "^[ 0-9]*:" | grep -v "unknown"
+scoutfs get-fiemap "$FILE"
 rm -f "$FILE"

 echo "== non-block aligned len within block fails"
@@ -160,7 +156,7 @@ create_file "$FILE" 4096
 cp "$FILE"  "$T_TMP"
 release_vers "$FILE" stat 0 4K
 stage_vers "$FILE" stat 0 1024 "$T_TMP"
-fiemap_file "$FILE" | grep "^[ 0-9]*:" | grep -v "unknown"
+scoutfs get-fiemap "$FILE"
 rm -f "$FILE"

 echo "== partial final block that writes to i_size does work"
@@ -175,7 +171,7 @@ echo "== zero length stage doesn't bring blocks online"
 create_file "$FILE" $((4096 * 100))
 release_vers "$FILE" stat 0 400K
 stage_vers "$FILE" stat 4096 0 /dev/zero
-fiemap_file "$FILE" | grep "^[ 0-9]*:" | grep -v "unknown"
+scoutfs get-fiemap "$FILE"
 rm -f "$FILE"

 # XXX yup, needs to be updated for demand staging
@@ -62,4 +62,7 @@ for r in $(seq 1 1000); do
 	done
 done

+echo "== cleanup"
+rm -f "$T_TMP.log"
+
 t_pass
@@ -30,8 +30,13 @@ t_quiet mkdir -p "$T_TMPDIR/mnt.scratch"
 t_quiet cd "$T_XFSTESTS_REPO"
 if [ -z "$T_SKIP_CHECKOUT" ]; then
 	t_quiet git fetch
+	# if we're passed a tag instead of a branch, we can't --track.
+	# match the whole tag name literally so a branch whose name is only a
+	# substring (or regex match) of some tag isn't mistaken for a tag.
+	TRACK="--track"
+	if git tag -l | grep -qxF -- "$T_XFSTESTS_BRANCH" ; then
+		TRACK=""
+	fi
 	# this remote use is bad, do better
-	t_quiet git checkout -B "$T_XFSTESTS_BRANCH" --track "origin/$T_XFSTESTS_BRANCH"
+	t_quiet git checkout -B "$T_XFSTESTS_BRANCH" ${TRACK} "origin/$T_XFSTESTS_BRANCH"
 fi
 t_quiet make
 t_quiet sync
@@ -81,6 +86,45 @@ generic/325	# mmap missing
 generic/338	# BUG_ON update inode error handling
 generic/346	# mmap missing
 generic/347	# _dmthin_mount doesn't work?
+generic/356	# swap
+generic/357	# swap
+generic/409	# bind mounts not scripted yet
+generic/410	# bind mounts not scripted yet
+generic/411	# bind mounts not scripted yet
+generic/423	# symlink inode size is strlen() + 1 on scoutfs
+generic/428	# mmap missing
+generic/430	# xfs_io copy_range missing in el7
+generic/431	# xfs_io copy_range missing in el7
+generic/432	# xfs_io copy_range missing in el7
+generic/433	# xfs_io copy_range missing in el7
+generic/434	# xfs_io copy_range missing in el7
+generic/437	# mmap missing
+generic/441	# dm-mapper
+generic/444	# el9's posix_acl_update_mode is buggy ?
+generic/452	# exec test - no mmap
+generic/467	# open_by_handle ESTALE
+generic/472	# swap
+generic/484	# dm-mapper
+generic/493	# swap
+generic/494	# swap
+generic/495	# swap
+generic/496	# swap
+generic/497	# swap
+generic/532	# xfs_io statx attrib_mask missing in el7
+generic/554	# swap
+generic/563	# cgroup+loopdev
+generic/564	# xfs_io copy_range missing in el7
+generic/565	# xfs_io copy_range missing in el7
+generic/568	# falloc not resulting in block count increase
+generic/569	# swap
+generic/570	# swap
+generic/614	# mmap missing
+generic/620	# dm-hugedisk
+generic/633	# mmap, id-mapped mounts missing in el7
+generic/636	# swap
+generic/638	# mmap missing
+generic/641	# swap
+generic/643	# swap
 EOF

 t_restore_output
@@ -7,7 +7,7 @@ FMTIOC_H := format.h ioctl.h
 FMTIOC_KMOD := $(addprefix ../kmod/src/,$(FMTIOC_H))

 CFLAGS := -Wall -O2 -Werror -D_FILE_OFFSET_BITS=64 -g -msse4.2 \
-	-fno-strict-aliasing \
+	-I src/ -fno-strict-aliasing \
 	-DSCOUTFS_FORMAT_HASH=0x$(SCOUTFS_FORMAT_HASH)LLU

 ifneq ($(wildcard $(firstword $(FMTIOC_KMOD))),)
@@ -15,10 +15,13 @@ CFLAGS += -I../kmod/src
 endif

 BIN := src/scoutfs
-OBJ := $(patsubst %.c,%.o,$(wildcard src/*.c))
-DEPS := $(wildcard */*.d)
+OBJ_DIRS := src src/check
+OBJ := $(foreach dir,$(OBJ_DIRS),$(patsubst %.c,%.o,$(wildcard $(dir)/*.c)))
+DEPS := $(foreach dir,$(OBJ_DIRS),$(wildcard $(dir)/*.d))

-all: $(BIN)
+AR := src/scoutfs_parallel_restore.a
+
+all: $(BIN) $(AR)

 ifneq ($(DEPS),)
 -include $(DEPS)
@@ -36,6 +39,10 @@ $(BIN): $(OBJ)
 	$(QU)  [BIN $@]
 	$(VE)gcc -o $@ $^ -luuid -lm -lcrypto -lblkid

+$(AR): $(OBJ)
+	$(QU)  [AR $@]
+	$(VE)ar rcs $@ $^
+
 %.o %.d: %.c Makefile sparse.sh
 	$(QU)  [CC $<]
 	$(VE)gcc $(CFLAGS) -MD -MP -MF $*.d -c $< -o $*.o
@@ -76,6 +76,41 @@ run when the file system will not be mounted.
 .RE
 .PD

+.TP
+.BI "check META-DEVICE DATA-DEVICE [-d|--debug FILE]"
+.sp
+Performs an offline file system check. The program iterates through all the
+data structures on disk directly - the filesystem must not be mounted while
+this operation is running.
+.RS 1.0i
+.PD 0
+.sp
+.TP
+.B "-d, --debug FILE"
+An output file where the program can output debug information about the
+state of the filesystem as it performs the check. If
+.B FILE
+is "-", the debug output is written to the Standard Error output.
+.TP
+.RE
+.sp
+.B RETURN VALUE
+The check function can return the following exit codes:
+.RS
+.TP
+\fB 0 \fR - no filesystem issues detected
+.TP
+\fB 1 \fR - file system issues were detected
+.TP
+\fB 8 \fR - operational error
+.TP
+\fB 16 \fR - usage error
+.TP
+\fB 32 \fR - cancelled by user (SIGINT)
+.TP
+.RE
+.PD
+
 .TP
 .BI "counters [-t|--table] SYSFS-DIR"
 .sp
@@ -54,6 +54,8 @@ cp man/*.8.gz $RPM_BUILD_ROOT%{_mandir}/man8/.
 install -m 755 -D src/scoutfs $RPM_BUILD_ROOT%{_sbindir}/scoutfs
 install -m 644 -D src/ioctl.h $RPM_BUILD_ROOT%{_includedir}/scoutfs/ioctl.h
 install -m 644 -D src/format.h $RPM_BUILD_ROOT%{_includedir}/scoutfs/format.h
+install -m 644 -D src/parallel_restore.h $RPM_BUILD_ROOT%{_includedir}/scoutfs/parallel_restore.h
+install -m 644 -D src/scoutfs_parallel_restore.a $RPM_BUILD_ROOT%{_libdir}/scoutfs/libscoutfs_parallel_restore.a
 install -m 755 -D fenced/scoutfs-fenced $RPM_BUILD_ROOT%{_libexecdir}/scoutfs-fenced/scoutfs-fenced
 install -m 644 -D fenced/scoutfs-fenced.service $RPM_BUILD_ROOT%{_unitdir}/scoutfs-fenced.service
 install -m 644 -D fenced/scoutfs-fenced.conf.example $RPM_BUILD_ROOT%{_sysconfdir}/scoutfs/scoutfs-fenced.conf.example
@@ -70,6 +72,7 @@ install -m 644 -D fenced/scoutfs-fenced.conf.example $RPM_BUILD_ROOT%{_sysconfdi
 %files -n scoutfs-devel
 %defattr(644,root,root,755)
 %{_includedir}/scoutfs
+%{_libdir}/scoutfs

 %clean
 rm -rf %{buildroot}
@@ -37,10 +37,23 @@ search=$(gcc -print-search-dirs | awk '($1 == "install:"){print "-I" $2}')
 #
 # We're trying to use sparse against glibc headers which go wild trying to
 # use internal compiler macros to test features.  We copy gcc's and give
-# them to sparse.  But not __SIZE_TYPE__ 'cause sparse defines that one.
+# them to sparse, but not the ones that sparse already has.
 #
-defines=".sparse.gcc-defines.h"
-gcc -dM -E -x c - < /dev/null | grep -v __SIZE_TYPE__ > $defines
+defines=".sparse.gcc-defines.$$.h"
+awk '
+	# save defines from gcc
+	( FNR == NR ) { lines[$2]=$0 }
+
+	# delete defines that sparse also has
+	( FNR < NR ) { delete lines[$2] }
+
+	# dump remaining lines unique to gcc
+	END {
+		for (a in lines) {
+			print lines[a]
+		}
+	}
+' <(gcc -dM -E -x c - < /dev/null) <(sparse -dM -E -x c - < /dev/null) > $defines
 include="-include $defines"

 #
@@ -54,6 +67,9 @@ else
 fi

 sparse $m64 $include $search/include "$@" 2>&1 | egrep -v "($RE)" | tee .sparse.output
+
+rm -f $defines
+
 if  [ -s .sparse.output ]; then
 	exit 1
 else
@@ -10,6 +10,11 @@
 * Just a quick simple native bitmap.
 */

+/* Return 1 if bit nr is set in the bitmap and 0 if it is clear. */
+int test_bit(unsigned long *bits, u64 nr)
+{
+	unsigned long mask = 1UL << (nr & (BITS_PER_LONG - 1));
+
+	return (bits[nr / BITS_PER_LONG] & mask) != 0;
+}
+
 void set_bit(unsigned long *bits, u64 nr)
 {
 	bits[nr / BITS_PER_LONG] |= 1UL << (nr & (BITS_PER_LONG - 1));
@@ -1,6 +1,7 @@
 #ifndef _BITMAP_H_
 #define _BITMAP_H_

+int test_bit(unsigned long *bits, u64 nr);
 void set_bit(unsigned long *bits, u64 nr);
 void clear_bit(unsigned long *bits, u64 nr);
 u64 find_next_set_bit(unsigned long *start, u64 from, u64 total);
@@ -0,0 +1,20 @@
+#include <errno.h>
+
+#include "sparse.h"
+#include "util.h"
+#include "format.h"
+#include "hash.h"
+#include "bloom.h"
+
+/*
+ * Fill nrs[] with the SCOUTFS_FOREST_BLOOM_NRS bloom bit numbers for the
+ * key.  The key is hashed once with scoutfs_hash64() and each number is
+ * the low 32 bits of the hash modulo SCOUTFS_FOREST_BLOOM_BITS, with the
+ * hash shifted down by SCOUTFS_FOREST_BLOOM_FUNC_BITS between numbers.
+ */
+void calc_bloom_nrs(struct scoutfs_key *key, unsigned int *nrs)
+{
+	u64 bits = scoutfs_hash64(key, sizeof(*key));
+	unsigned int *nr;
+
+	for (nr = nrs; nr < nrs + SCOUTFS_FOREST_BLOOM_NRS; nr++) {
+		*nr = (u32)bits % SCOUTFS_FOREST_BLOOM_BITS;
+		bits >>= SCOUTFS_FOREST_BLOOM_FUNC_BITS;
+	}
+}
@@ -0,0 +1,6 @@
+#ifndef _BLOOM_H_
+#define _BLOOM_H_
+
+void calc_bloom_nrs(struct scoutfs_key *key, unsigned int *nrs);
+
+#endif
@@ -8,7 +8,7 @@
 #include "leaf_item_hash.h"
 #include "btree.h"

-static void init_block(struct scoutfs_btree_block *bt, int level)
+void btree_init_block(struct scoutfs_btree_block *bt, int level)
 {
 	int free;

@@ -33,7 +33,7 @@ void btree_init_root_single(struct scoutfs_btree_root *root,

 	memset(bt, 0, SCOUTFS_BLOCK_LG_SIZE);

-	init_block(bt, 0);
+	btree_init_block(bt, 0);
 }

 static void *alloc_val(struct scoutfs_btree_block *bt, int len)
@@ -1,6 +1,7 @@
 #ifndef _BTREE_H_
 #define _BTREE_H_

+void btree_init_block(struct scoutfs_btree_block *bt, int level);
 void btree_init_root_single(struct scoutfs_btree_root *root,
 			    struct scoutfs_btree_block *bt,
 			    u64 seq, u64 blkno);
@@ -94,6 +94,18 @@ static int do_change_fmt_vers(struct change_fmt_vers_args *args)
 		goto out;
 	}

+	if ((le64_to_cpu(meta_super->flags) & SCOUTFS_FLAG_IS_META_BDEV) == 0) {
+		printf("device argument #1 is not a meta device (swap arguments?)\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if ((le64_to_cpu(data_super->flags) & SCOUTFS_FLAG_IS_META_BDEV) != 0) {
+		printf("device argument #2 is not a data device (swap arguments?)\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
 	if (le64_to_cpu(meta_super->fmt_vers) < SCOUTFS_FORMAT_VERSION_MIN ||
 	    le64_to_cpu(meta_super->fmt_vers) > SCOUTFS_FORMAT_VERSION_MAX) {
 		fprintf(stderr, "meta super block has format version %llu outside of supported version range %u-%u",
@@ -0,0 +1,166 @@
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <sys/mman.h>
+#include <errno.h>
+
+#include "sparse.h"
+#include "util.h"
+#include "format.h"
+#include "bitmap.h"
+#include "key.h"
+
+#include "alloc.h"
+#include "block.h"
+#include "btree.h"
+#include "extent.h"
+#include "iter.h"
+#include "sns.h"
+
+/*
+ * Walk the chain of alloc list blocks that starts at lhead->ref and
+ * call cb with the blkno of each list block itself (length 1) before
+ * the block is read.  We check the list blocks serially.
+ *
+ * Returns 0 once a zero next ref ends the chain.  A negative return
+ * from cb is passed through xlate_iter_errno(); errors from block_get()
+ * and block_hdr_valid() are returned as they are.
+ *
+ * XXX:
+ *  - compare ref seqs
+ *  - detect cycles?
+ */
+int alloc_list_meta_iter(struct scoutfs_alloc_list_head *lhead, extent_cb_t cb, void *cb_arg)
+{
+	struct scoutfs_alloc_list_block *lblk;
+	struct scoutfs_block_ref ref;
+	struct block *blk = NULL;
+	u64 blkno;
+	int ret;
+
+	ref = lhead->ref;
+
+	while (ref.blkno) {
+		blkno = le64_to_cpu(ref.blkno);
+
+		ret = cb(blkno, 1, cb_arg);
+		if (ret < 0) {
+			ret = xlate_iter_errno(ret);
+			goto out;
+		}
+
+		ret = block_get(&blk, blkno, 0);
+		if (ret < 0)
+			goto out;
+
+		lblk = block_buf(blk);
+		/* XXX verify block */
+		ret = block_hdr_valid(blk, blkno, 0, SCOUTFS_BLOCK_MAGIC_ALLOC_LIST);
+		if (ret < 0)
+			goto out;
+
+		/* XXX sort?   maybe */
+
+		ref = lblk->next;
+
+		block_put(&blk);
+	}
+
+	ret = 0;
+out:
+	/*
+	 * Drop the ref still held when the header check bailed out of the
+	 * loop.  blk is NULL on every other path and block_put() ignores it.
+	 */
+	block_put(&blk);
+	return ret;
+}
+
+/*
+ * Iterate over the metadata of the alloc root's btree by handing
+ * root->root, cb, and cb_arg to btree_meta_iter() and returning its
+ * result.
+ */
+int alloc_root_meta_iter(struct scoutfs_alloc_root *root, extent_cb_t cb, void *cb_arg)
+{
+	return btree_meta_iter(&root->root, cb, cb_arg);
+}
+
+/*
+ * Walk the chain of alloc list blocks that starts at lhead->ref and
+ * call cb with each blkno stored in the blocks (length 1).  Each block
+ * contributes the nr entries of its blknos[] array that start at index
+ * start.
+ *
+ * Returns 0 once a zero next ref ends the chain.  A negative return
+ * from cb stops the walk and is passed through xlate_iter_errno();
+ * errors from block_get() and block_hdr_valid() are returned as they
+ * are.
+ */
+int alloc_list_extent_iter(struct scoutfs_alloc_list_head *lhead, extent_cb_t cb, void *cb_arg)
+{
+	struct scoutfs_alloc_list_block *lblk;
+	struct scoutfs_block_ref ref;
+	struct block *blk = NULL;
+	u64 blkno;
+	int ret;
+	int i;
+
+	ref = lhead->ref;
+
+	while (ref.blkno) {
+		blkno = le64_to_cpu(ref.blkno);
+
+		ret = block_get(&blk, blkno, 0);
+		if (ret < 0)
+			goto out;
+
+		sns_push("alloc_list_block", blkno, 0);
+
+		lblk = block_buf(blk);
+		/* XXX verify block */
+		ret = block_hdr_valid(blk, blkno, 0, SCOUTFS_BLOCK_MAGIC_ALLOC_LIST);
+		if (ret < 0) {
+			/* balance the push above, the block ref is dropped at out */
+			sns_pop();
+			goto out;
+		}
+		/* XXX sort?   maybe */
+
+		ret = 0;
+		for (i = 0; i < le32_to_cpu(lblk->nr); i++) {
+			blkno = le64_to_cpu(lblk->blknos[le32_to_cpu(lblk->start) + i]);
+
+			ret = cb(blkno, 1, cb_arg);
+			if (ret < 0)
+				break;
+		}
+
+		/* read the next ref before the put can release the buffer */
+		ref = lblk->next;
+
+		block_put(&blk);
+		sns_pop();
+		if (ret < 0) {
+			ret = xlate_iter_errno(ret);
+			goto out;
+		}
+	}
+
+	ret = 0;
+out:
+	/* blk is only still set after a failed header check; NULL is ignored */
+	block_put(&blk);
+	return ret;
+}
+
+/*
+ * A free extent key must be in one of the two free extent zones with a
+ * zero type and zero fourth field.  Keys in the BLKNO zone must also
+ * have a zero third field.
+ */
+static bool valid_free_extent_key(struct scoutfs_key *key)
+{
+	bool order = key->sk_zone == SCOUTFS_FREE_EXTENT_ORDER_ZONE;
+
+	if (!order && key->sk_zone != SCOUTFS_FREE_EXTENT_BLKNO_ZONE)
+		return false;
+
+	if (key->_sk_fourth || key->sk_type)
+		return false;
+
+	return order || !key->_sk_third;
+}
+
+/*
+ * Btree item callback for free extent items.  Items with a value or an
+ * invalid key return -EIO.  The first ORDER zone key ends the iteration
+ * with -ECHECK_ITER_DONE.  BLKNO zone keys are turned into a start and
+ * length and given to the extent callback stashed in cb_arg.
+ */
+static int free_item_cb(struct scoutfs_key *key, void *val, u16 val_len, void *cb_arg)
+{
+	struct extent_cb_arg_t *ecba = cb_arg;
+	u64 end;
+	u64 len;
+
+	/* XXX not sure these eios are what we want */
+	if (val_len != 0 || !valid_free_extent_key(key))
+		return -EIO;
+
+	if (key->sk_zone == SCOUTFS_FREE_EXTENT_ORDER_ZONE)
+		return -ECHECK_ITER_DONE;
+
+	/* the key stores the last block of the extent and its length */
+	end = le64_to_cpu(key->skfb_end);
+	len = le64_to_cpu(key->skfb_len);
+
+	return ecba->cb(end - len + 1, len, ecba->cb_arg);
+}
+
+/*
+ * Call the callback with each of the primary BLKNO free extents stored
+ * in items in the given alloc root.  It doesn't visit the secondary
+ * ORDER extents: free_item_cb() ends the iteration at the first ORDER
+ * zone key.
+ *
+ * The caller's cb and cb_arg are wrapped in an extent_cb_arg_t on the
+ * stack so that the btree item callback can reach them.  Returns the
+ * result of btree_item_iter().
+ */
+int alloc_root_extent_iter(struct scoutfs_alloc_root *root, extent_cb_t cb, void *cb_arg)
+{
+	struct extent_cb_arg_t ecba = { .cb = cb, .cb_arg = cb_arg };
+
+	return btree_item_iter(&root->root, free_item_cb, &ecba);
+}
@@ -0,0 +1,12 @@
+#ifndef _SCOUTFS_UTILS_CHECK_ALLOC_H
+#define _SCOUTFS_UTILS_CHECK_ALLOC_H
+
+#include "extent.h"
+
+int alloc_list_meta_iter(struct scoutfs_alloc_list_head *lhead, extent_cb_t cb, void *cb_arg);
+int alloc_root_meta_iter(struct scoutfs_alloc_root *root, extent_cb_t cb, void *cb_arg);
+
+int alloc_list_extent_iter(struct scoutfs_alloc_list_head *lhead, extent_cb_t cb, void *cb_arg);
+int alloc_root_extent_iter(struct scoutfs_alloc_root *root, extent_cb_t cb, void *cb_arg);
+
+#endif
@@ -0,0 +1,613 @@
+#define _ISOC11_SOURCE /* aligned_alloc */
+#define _DEFAULT_SOURCE /* syscall() */
+#include <stdlib.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <errno.h>
+#include <sys/syscall.h>
+#include <linux/aio_abi.h>
+
+#include "sparse.h"
+#include "util.h"
+#include "format.h"
+#include "list.h"
+#include "cmp.h"
+#include "hash.h"
+
+#include "block.h"
+#include "debug.h"
+#include "super.h"
+#include "eno.h"
+#include "crc.h"
+#include "sns.h"
+
+/*
+ * Global state of the block cache and its async IO context.  There is a
+ * single instance, global_bdat, which block_setup() initializes.
+ */
+static struct block_data {
+	/* array of hash_nr buckets of cached blocks, indexed by hashed blkno */
+	struct list_head *hash_lists;
+	size_t hash_nr;
+
+	/* the two lru heads are linked into one list, see rebalance_cache() */
+	struct list_head active_head;
+	struct list_head inactive_head;
+	/* blocks that will be written by block_try_commit() */
+	struct list_head dirty_list;
+	size_t nr_active;
+	size_t nr_inactive;
+	size_t nr_dirty;
+
+	/* fd that all block IO is submitted against */
+	int meta_fd;
+	/* rebalance_cache() frees unused blocks once this many are cached */
+	size_t max_cached;
+	/* size of the three arrays below and of each submitted batch */
+	size_t nr_events;
+
+	aio_context_t ctx;
+	struct iocb *iocbs;
+	struct iocb **iocbps;
+	struct io_event *events;
+} global_bdat;
+
+/*
+ * A cached block.  Each list_head links the block into one of the
+ * cache's lists and is empty when the block isn't on that list.
+ */
+struct block {
+	struct list_head hash_head;	/* hash bucket for lookup by blkno */
+	struct list_head lru_head;	/* active or inactive lru list */
+	struct list_head dirty_head;	/* on the dirty list when non-empty */
+	struct list_head submit_head;	/* caller's list during submit_and_wait() */
+	unsigned long refcount;		/* rebalance_cache() won't free if > 0 */
+	unsigned long uptodate:1,	/* buf contents are valid, no read needed */
+		      active:1;		/* counted in nr_active, else nr_inactive */
+	u64 blkno;
+	void *buf;			/* aligned buffer of size bytes */
+	size_t size;
+};
+
+/*
+ * debug_blk() appends the block's blkno, refcount, dirty state, and
+ * active bit to a debug message.  BLK_FMT and BLK_ARG must stay in sync:
+ * refcount is an unsigned long and is printed as one.
+ */
+#define BLK_FMT \
+	"blkno %llu rc %lu d %u a %u"
+#define BLK_ARG(blk) \
+	(blk)->blkno, (blk)->refcount, !list_empty(&(blk)->dirty_head), (blk)->active
+#define debug_blk(blk, fmt, args...) \
+	debug(fmt " " BLK_FMT, ##args, BLK_ARG(blk))
+
+/*
+ * This just allocates and initializes the block.  The caller is
+ * responsible for putting it on the appropriate initial lists and
+ * managing refcounts.
+ *
+ * Returns NULL if either the block or its buffer can't be allocated.
+ */
+static struct block *alloc_block(struct block_data *bdat, u64 blkno, size_t size)
+{
+	struct block *blk = calloc(1, sizeof(struct block));
+
+	if (!blk)
+		return NULL;
+
+	blk->buf = aligned_alloc(4096, size); /* XXX static alignment :/ */
+	if (!blk->buf) {
+		free(blk);
+		return NULL;
+	}
+
+	INIT_LIST_HEAD(&blk->hash_head);
+	INIT_LIST_HEAD(&blk->lru_head);
+	INIT_LIST_HEAD(&blk->dirty_head);
+	INIT_LIST_HEAD(&blk->submit_head);
+	blk->blkno = blkno;
+	blk->size = size;
+
+	return blk;
+}
+
+/*
+ * Remove the block from every list it's on, fixing up the cache's
+ * counts, and free it and its buffer.  The refcount isn't checked.
+ */
+static void free_block(struct block_data *bdat, struct block *blk)
+{
+	size_t *nr_lru;
+
+	debug_blk(blk, "free");
+
+	if (!list_empty(&blk->hash_head))
+		list_del(&blk->hash_head);
+
+	if (!list_empty(&blk->submit_head))
+		list_del(&blk->submit_head);
+
+	if (!list_empty(&blk->dirty_head)) {
+		bdat->nr_dirty--;
+		list_del(&blk->dirty_head);
+	}
+
+	if (!list_empty(&blk->lru_head)) {
+		/* the active bit says which of the lru counts holds the block */
+		nr_lru = blk->active ? &bdat->nr_active : &bdat->nr_inactive;
+		(*nr_lru)--;
+		list_del(&blk->lru_head);
+	}
+
+	free(blk->buf);
+	free(blk);
+}
+
+/* A block is dirty while its dirty_head is linked into a list. */
+static bool blk_is_dirty(struct block *blk)
+{
+	return !list_empty(&blk->dirty_head);
+}
+
+/*
+ * Rebalance the cache.
+ *
+ * First we shrink the cache until fewer than max_cached blocks remain.
+ * Logically, we walk from oldest to newest in the inactive list and
+ * then in the active list.  Since these lists are physically one
+ * list_head list we achieve this with a reverse walk starting from the
+ * active head.  Blocks that are referenced or dirty are skipped rather
+ * than freed.
+ *
+ * Then we rebalance the size of the two lists.  The constraint is that
+ * we don't let the active list grow larger than the inactive list.  We
+ * move blocks from the oldest tail of the active list to the newest
+ * head of the inactive list.
+ *
+ * <- [active head] <-> [ .. active list .. ] <-> [inactive head] <-> [ .. inactive list .. ] ->
+ */
+static void rebalance_cache(struct block_data *bdat)
+{
+	struct block *blk;
+	struct block *blk_;
+
+	list_for_each_entry_safe_reverse(blk, blk_, &bdat->active_head, lru_head) {
+		if ((bdat->nr_active + bdat->nr_inactive) < bdat->max_cached)
+			break;
+
+		/*
+		 * The inactive head is a bare list_head in the middle of
+		 * the walk, not a block.  It's tested first so that the
+		 * bogus blk pointer is never dereferenced.
+		 */
+		if (&blk->lru_head == &bdat->inactive_head || blk->refcount > 0 ||
+		    blk_is_dirty(blk))
+			continue;
+
+		free_block(bdat, blk);
+	}
+
+	/* walking backwards from the inactive head starts at the oldest active block */
+	list_for_each_entry_safe_reverse(blk, blk_, &bdat->inactive_head, lru_head) {
+		if (bdat->nr_active <= bdat->nr_inactive || &blk->lru_head == &bdat->active_head)
+			break;
+
+		list_move(&blk->lru_head, &bdat->inactive_head);
+		blk->active = 0;
+		bdat->nr_active--;
+		bdat->nr_inactive++;
+	}
+}
+
+/*
+ * Put an inactive block at the head of the active list and move its
+ * count from the inactive total to the active total.  Blocks that are
+ * already active are left where they are.
+ */
+static void make_active(struct block_data *bdat, struct block *blk)
+{
+	if (blk->active)
+		return;
+
+	if (list_empty(&blk->lru_head)) {
+		/* not on either lru list yet, nothing to uncount */
+		list_add(&blk->lru_head, &bdat->active_head);
+	} else {
+		list_move(&blk->lru_head, &bdat->active_head);
+		bdat->nr_inactive--;
+	}
+
+	blk->active = 1;
+	bdat->nr_active++;
+}
+
+/* qsort() comparison of an array of iocb pointers by their aio_offset. */
+static int compar_iocbp(const void *A, const void *B)
+{
+	struct iocb **ap = (struct iocb **)A;
+	struct iocb **bp = (struct iocb **)B;
+
+	return scoutfs_cmp((*ap)->aio_offset, (*bp)->aio_offset);
+}
+
+/*
+ * Submit an async read or write for every block on the caller's list,
+ * linked by submit_head, and wait for all of them to complete.  Dirty
+ * blocks are written and all other blocks are read.  Blocks are
+ * submitted in batches of at most nr_events iocbs, each batch is sorted
+ * by device offset and is fully reaped before the next is built.
+ *
+ * Completed reads mark the block uptodate and completed writes take the
+ * block off the dirty list.  Returns 0 on success, including an empty
+ * list, or a negative errno from the first submission, reaping, or IO
+ * failure.  Short IOs are treated as -EIO.
+ */
+static int submit_and_wait(struct block_data *bdat, struct list_head *list)
+{
+	struct io_event *event;
+	struct block *done;
+	struct iocb *iocb;
+	struct block *blk;
+	int ret;
+	int nr;
+	int i;
+
+	nr = 0;
+	list_for_each_entry(blk, list, submit_head) {
+		iocb = &bdat->iocbs[nr];
+		bdat->iocbps[nr] = iocb;
+
+		memset(iocb, 0, sizeof(struct iocb));
+
+		iocb->aio_data = (intptr_t)blk;
+		iocb->aio_lio_opcode = blk_is_dirty(blk) ? IOCB_CMD_PWRITE : IOCB_CMD_PREAD;
+		iocb->aio_fildes = bdat->meta_fd;
+		iocb->aio_buf = (intptr_t)blk->buf;
+		iocb->aio_nbytes = blk->size;
+		iocb->aio_offset = blk->blkno * blk->size;
+
+		nr++;
+
+		debug_blk(blk, "submit");
+
+		/* keep filling the batch until it's full or the list ends */
+		if ((nr < bdat->nr_events) && blk->submit_head.next != list)
+			continue;
+
+		qsort(bdat->iocbps, nr, sizeof(bdat->iocbps[0]), compar_iocbp);
+
+		ret = syscall(__NR_io_submit, bdat->ctx, nr, bdat->iocbps);
+		if (ret != nr) {
+			if (ret >= 0)
+				errno = EIO;
+			ret = -errno;
+			fprintf(stderr, "fatal system error submitting async IO: "ENO_FMT"\n",
+				ENO_ARG(-ret));
+			goto out;
+		}
+
+		ret = syscall(__NR_io_getevents, bdat->ctx, nr, nr, bdat->events, NULL);
+		if (ret != nr) {
+			if (ret >= 0)
+				errno = EIO;
+			ret = -errno;
+			fprintf(stderr, "fatal system error getting IO events: "ENO_FMT"\n",
+				ENO_ARG(-ret));
+			goto out;
+		}
+
+		ret = 0;
+		for (i = 0; i < nr; i++) {
+			event = &bdat->events[i];
+			iocb = (struct iocb *)(intptr_t)event->obj;
+			/*
+			 * Events aren't returned in list order.  The
+			 * completed block gets its own variable so that
+			 * blk, the list cursor, still points at the last
+			 * block of this batch when the walk resumes.
+			 */
+			done = (struct block *)(intptr_t)event->data;
+
+			debug_blk(done, "complete res %lld", (long long)event->res);
+
+			if (event->res >= 0 && event->res != done->size)
+				event->res = -EIO;
+
+			/* io errors are fatal */
+			if (event->res < 0) {
+				ret = event->res;
+				goto out;
+			}
+
+			if (iocb->aio_lio_opcode == IOCB_CMD_PREAD) {
+				done->uptodate = 1;
+			} else {
+				list_del_init(&done->dirty_head);
+				bdat->nr_dirty--;
+			}
+		}
+		nr = 0;
+	}
+
+	ret = 0;
+out:
+	return ret;
+}
+
+/* Take a reference on the block; block_put() drops one. */
+static void inc_refcount(struct block *blk)
+{
+	blk->refcount++;
+}
+
+/*
+ * Drop the caller's reference to the block and clear their pointer.
+ * The cache is rebalanced afterwards.  A NULL *blkp is ignored.
+ */
+void block_put(struct block **blkp)
+{
+	struct block *blk = *blkp;
+
+	if (!blk)
+		return;
+
+	blk->refcount--;
+	*blkp = NULL;
+
+	rebalance_cache(&global_bdat);
+}
+
+/*
+ * Return the hash list that a block with the given blkno is stored on.
+ * The bucket index is the 32bit hash of the blkno modulo hash_nr.
+ */
+static struct list_head *hash_bucket(struct block_data *bdat, u64 blkno)
+{
+	u32 hash = scoutfs_hash32(&blkno, sizeof(blkno));
+
+	return &bdat->hash_lists[hash % bdat->hash_nr];
+}
+
+/*
+ * Check the header of the block at blkno: its crc, magic, and fsid.
+ *
+ * The block is looked up by blkno and bf with block_get(), which takes
+ * a temporary reference that is dropped before returning.  The pointer
+ * passed in blk is overwritten by that lookup and isn't otherwise used.
+ *
+ * Returns 0 if the header is valid, -EIO if only the crc is wrong,
+ * -EINVAL if the magic or fsid don't match, or the error from
+ * block_get().
+ */
+int block_hdr_valid(struct block *blk, u64 blkno, int bf, u32 magic)
+{
+	struct scoutfs_block_header *hdr;
+	size_t size = (bf & BF_SM) ? SCOUTFS_BLOCK_SM_SIZE : SCOUTFS_BLOCK_LG_SIZE;
+	int ret;
+	u32 crc;
+
+	ret = block_get(&blk, blkno, bf);
+	if (ret < 0) {
+		fprintf(stderr, "error reading block %llu\n", blkno);
+		goto out;
+	}
+
+	hdr = block_buf(blk);
+
+	crc = crc_block(hdr, size);
+
+	/*
+	 * a bad CRC is easy to repair, so we pass a different error code
+	 * back. Unless the other data is also wrong - then it's EINVAL
+	 * to signal that this isn't a valid block hdr at all.
+	 */
+	if (le32_to_cpu(hdr->crc) != crc)
+		ret = -EIO; /* keep checking other fields */
+
+	if (le32_to_cpu(hdr->magic) != magic)
+		ret = -EINVAL;
+
+	/*
+	 * Our first caller fills in global_super. Until this completes,
+	 * we can't do this check.
+	 */
+	if ((blkno != SCOUTFS_SUPER_BLKNO) &&
+	    (hdr->fsid != global_super->hdr.fsid))
+		ret = -EINVAL;
+
+	/* read the header for the message while our reference still pins it */
+	debug("%s blk_hdr_valid blkno %llu size %zu crc 0x%08x magic 0x%08x ret %d",
+	      sns_str(), blkno, size, le32_to_cpu(hdr->crc), le32_to_cpu(hdr->magic),
+	      ret);
+
+	block_put(&blk);
+
+out:
+	return ret;
+}
+
+/*
+ * Return the cached block with the given blkno and size, allocating it
+ * and adding it to the hash and the inactive list if it isn't cached.
+ * The BF_SM flag selects the small block size.  The returned block has
+ * had its refcount incremented.  Returns NULL if allocation fails.
+ */
+static struct block *get_or_alloc(struct block_data *bdat, u64 blkno, int bf)
+{
+	struct list_head *bucket = hash_bucket(bdat, blkno);
+	size_t size = (bf & BF_SM) ? SCOUTFS_BLOCK_SM_SIZE : SCOUTFS_BLOCK_LG_SIZE;
+	struct block *blk;
+
+	list_for_each_entry(blk, bucket, hash_head) {
+		if (blk->blkno == blkno && blk->size == size) {
+			inc_refcount(blk);
+			return blk;
+		}
+	}
+
+	blk = alloc_block(bdat, blkno, size);
+	if (!blk)
+		return NULL;
+
+	list_add(&blk->hash_head, bucket);
+	list_add(&blk->lru_head, &bdat->inactive_head);
+	bdat->nr_inactive++;
+	inc_refcount(blk);
+
+	return blk;
+}
+
+/*
+ * Get a block.
+ *
+ * The caller holds a refcount to the block while it's in use that
+ * prevents it from being removed from the cache.  It must be dropped
+ * with block_put();
+ *
+ * bf is a mask of BF_ flags.  BF_ZERO and BF_OVERWRITE mark the block
+ * uptodate without reading it, BF_ZERO also clears the buffer.  Blocks
+ * that aren't uptodate are read synchronously.  BF_DIRTY adds the block
+ * to the tail of the dirty list.
+ *
+ * Returns 0 with *blk_ret set to the block, or a negative errno with
+ * *blk_ret set to NULL.
+ */
+int block_get(struct block **blk_ret, u64 blkno, int bf)
+{
+	struct block_data *bdat = &global_bdat;
+	struct block *blk;
+	LIST_HEAD(list);
+	int ret;
+
+	blk = get_or_alloc(bdat, blkno, bf);
+	if (!blk) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	if ((bf & BF_ZERO)) {
+		memset(blk->buf, 0, blk->size);
+		blk->uptodate = 1;
+	}
+
+	if (bf & BF_OVERWRITE)
+		blk->uptodate = 1;
+
+	if (!blk->uptodate) {
+		list_add(&blk->submit_head, &list);
+		ret = submit_and_wait(bdat, &list);
+		list_del_init(&blk->submit_head);
+		if (ret < 0)
+			goto out;
+	}
+
+	if ((bf & BF_DIRTY) && !blk_is_dirty(blk)) {
+		/*
+		 * The new entry comes first and the list head second.
+		 * Passing them the other way around re-links the head
+		 * around only this block and orphans every block that
+		 * was already dirty.
+		 */
+		list_add_tail(&blk->dirty_head, &bdat->dirty_list);
+		bdat->nr_dirty++;
+	}
+
+	make_active(bdat, blk);
+
+	rebalance_cache(bdat);
+	ret = 0;
+out:
+	/* block_put() clears blk so errors hand NULL back to the caller */
+	if (ret < 0)
+		block_put(&blk);
+	*blk_ret = blk;
+	return ret;
+}
+
+/* Return the block's data buffer, which is block_size() bytes long. */
+void *block_buf(struct block *blk)
+{
+	return blk->buf;
+}
+
+/* Return the size in bytes of the block's buffer. */
+size_t block_size(struct block *blk)
+{
+	return blk->size;
+}
+
+/*
+ * Drop the block from the cache, regardless of if it was free or not.
+ * This is used to avoid writing blocks which were dirtied but then
+ * later freed.
+ *
+ * The block is immediately freed and can't be referenced after this
+ * returns.  The refcount isn't consulted, so any other holder of a
+ * reference is left with a stale pointer.  The caller's pointer is
+ * cleared and the cache is rebalanced.
+ *
+ * NOTE(review): unlike block_put(), *blkp must not be NULL here.
+ */
+void block_drop(struct block **blkp)
+{
+	struct block_data *bdat = &global_bdat;
+
+	free_block(bdat, *blkp);
+	*blkp = NULL;
+	rebalance_cache(bdat);
+}
+
+/*
+ * qsort() comparison of u64 block numbers.
+ *
+ * This doesn't quite work for mixing large and small blocks, but that's
+ * fine, we never do that.
+ */
+static int compar_u64(const void *A, const void *B)
+{
+	u64 *ap = (u64 *)A;
+	u64 *bp = (u64 *)B;
+
+	return scoutfs_cmp(*ap, *bp);
+}
+
+/*
+ * This read-ahead is synchronous and errors are ignored.  If any of the
+ * blknos aren't present in the cache then we issue concurrent reads for
+ * them and wait.  Any existing cached blocks will be left as is.
+ *
+ * We might be trying to read a lot more than the number of events so we
+ * sort the caller's blknos before iterating over them rather than
+ * relying on submission sorting the blocks in each submitted set.
+ *
+ * The caller's array is sorted in place.  The blknos can come from
+ * unverified blocks so they may contain duplicates, each block is only
+ * queued for reading once.
+ */
+void block_readahead(u64 *blknos, size_t nr)
+{
+	struct block_data *bdat = &global_bdat;
+	struct block *blk;
+	struct block *blk_;
+	LIST_HEAD(list);
+	size_t i;
+
+	if (nr == 0)
+		return;
+
+	qsort(blknos, nr, sizeof(blknos[0]), compar_u64);
+
+	for (i = 0; i < nr; i++) {
+		blk = get_or_alloc(bdat, blknos[i], 0);
+		if (blk) {
+			/*
+			 * A repeated blkno finds the block already on our
+			 * list.  Adding it again would corrupt the list,
+			 * so the extra reference is just dropped.
+			 */
+			if (!blk->uptodate && list_empty(&blk->submit_head))
+				list_add_tail(&blk->submit_head, &list);
+			else
+				block_put(&blk);
+		}
+	}
+
+	(void)submit_and_wait(bdat, &list);
+
+	list_for_each_entry_safe(blk, blk_, &list, submit_head) {
+		list_del_init(&blk->submit_head);
+		block_put(&blk);
+	}
+
+	rebalance_cache(bdat);
+}
+
+/*
+ * The caller's block changes form a consistent transaction.  If the amount of dirty
+ * blocks is large enough we issue a write.
+ *
+ * Nothing is written unless force is set or at least nr_events blocks
+ * are dirty.  All the dirty blocks are written and waited for first and
+ * only then is the super block written on its own to commit them.
+ *
+ * Returns 0 on success, or if there was nothing to do, and a negative
+ * errno if any of the writes failed.
+ */
+int block_try_commit(bool force)
+{
+	struct block_data *bdat = &global_bdat;
+	struct block *blk;
+	struct block *blk_;
+	LIST_HEAD(list);
+	int ret;
+
+	if (!force && bdat->nr_dirty < bdat->nr_events)
+		return 0;
+
+	/* hold a reference to each dirty block while it's being written */
+	list_for_each_entry(blk, &bdat->dirty_list, dirty_head) {
+		list_add_tail(&blk->submit_head, &list);
+		inc_refcount(blk);
+	}
+
+	ret = submit_and_wait(bdat, &list);
+
+	list_for_each_entry_safe(blk, blk_, &list, submit_head) {
+		list_del_init(&blk->submit_head);
+		block_put(&blk);
+	}
+
+	if (ret < 0) {
+		fprintf(stderr, "error writing dirty transaction blocks\n");
+		goto out;
+	}
+
+	/* a failure here keeps block_get()'s own error rather than assuming ENOMEM */
+	ret = block_get(&blk, SCOUTFS_SUPER_BLKNO, BF_SM | BF_OVERWRITE | BF_DIRTY);
+	if (ret == 0) {
+		list_add(&blk->submit_head, &list);
+		ret = submit_and_wait(bdat, &list);
+		list_del_init(&blk->submit_head);
+		block_put(&blk);
+	}
+	if (ret < 0)
+		fprintf(stderr, "error writing super block to commit transaction\n");
+
+out:
+	rebalance_cache(bdat);
+	return ret;
+}
+
+/*
+ * Initialize the global block cache and its aio context.
+ *
+ * meta_fd is the fd that all block IO is submitted against.  The two
+ * byte limits are rounded up to counts of large blocks: max_cached_bytes
+ * sets how many blocks are cached before unused blocks are freed and
+ * max_dirty_bytes sets the size of each batch of submitted IO.
+ *
+ * Returns 0 on success or a negative errno.  On failure everything that
+ * was allocated is freed and the pointers are cleared so that a later
+ * block_shutdown() doesn't free them again.
+ */
+int block_setup(int meta_fd, size_t max_cached_bytes, size_t max_dirty_bytes)
+{
+	struct block_data *bdat = &global_bdat;
+	size_t i;
+	int ret;
+
+	bdat->max_cached = DIV_ROUND_UP(max_cached_bytes, SCOUTFS_BLOCK_LG_SIZE);
+	/* hash_bucket() takes a modulus of hash_nr so it must not be zero */
+	bdat->hash_nr = bdat->max_cached / 4;
+	if (bdat->hash_nr == 0)
+		bdat->hash_nr = 1;
+	bdat->nr_events = DIV_ROUND_UP(max_dirty_bytes, SCOUTFS_BLOCK_LG_SIZE);
+
+	bdat->iocbs = calloc(bdat->nr_events, sizeof(bdat->iocbs[0]));
+	bdat->iocbps = calloc(bdat->nr_events, sizeof(bdat->iocbps[0]));
+	bdat->events = calloc(bdat->nr_events, sizeof(bdat->events[0]));
+	bdat->hash_lists = calloc(bdat->hash_nr, sizeof(bdat->hash_lists[0]));
+	if (!bdat->iocbs || !bdat->iocbps || !bdat->events || !bdat->hash_lists) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	INIT_LIST_HEAD(&bdat->active_head);
+	INIT_LIST_HEAD(&bdat->inactive_head);
+	INIT_LIST_HEAD(&bdat->dirty_list);
+	bdat->meta_fd = meta_fd;
+	/* the two lru heads share one list, inactive follows active */
+	list_add(&bdat->inactive_head, &bdat->active_head);
+
+	for (i = 0; i < bdat->hash_nr; i++)
+		INIT_LIST_HEAD(&bdat->hash_lists[i]);
+
+	/* syscall() returns -1 and sets errno, callers get the errno */
+	ret = syscall(__NR_io_setup, bdat->nr_events, &bdat->ctx);
+	if (ret < 0)
+		ret = -errno;
+
+out:
+	if (ret < 0) {
+		free(bdat->iocbs);
+		free(bdat->iocbps);
+		free(bdat->events);
+		free(bdat->hash_lists);
+		bdat->iocbs = NULL;
+		bdat->iocbps = NULL;
+		bdat->events = NULL;
+		bdat->hash_lists = NULL;
+	}
+
+	return ret;
+}
+
+/*
+ * Destroy the aio context and free the arrays that block_setup()
+ * allocated.  The pointers are cleared after they're freed so that
+ * calling this again, or after block_setup() failed, is harmless.
+ *
+ * NOTE(review): blocks that are still cached aren't freed here.
+ */
+void block_shutdown(void)
+{
+	struct block_data *bdat = &global_bdat;
+
+	syscall(__NR_io_destroy, bdat->ctx);
+
+	free(bdat->iocbs);
+	free(bdat->iocbps);
+	free(bdat->events);
+	free(bdat->hash_lists);
+	bdat->iocbs = NULL;
+	bdat->iocbps = NULL;
+	bdat->events = NULL;
+	bdat->hash_lists = NULL;
+}
@@ -0,0 +1,34 @@
+#ifndef _SCOUTFS_UTILS_CHECK_BLOCK_H_
+#define _SCOUTFS_UTILS_CHECK_BLOCK_H_
+
+#include <unistd.h>
+#include <stdbool.h>
+
+struct block;
+
+#include "sparse.h"
+
+/* block flags passed to block_get() */
+enum {
+	BF_ZERO      = (1 << 0), /* zero contents buf as block is returned */
+	BF_DIRTY     = (1 << 1), /* block will be written with transaction */
+	BF_SM        = (1 << 2), /* small 4k block instead of large 64k block */
+	BF_OVERWRITE = (1 << 3), /* caller will overwrite contents, don't read */
+};
+
+/*
+ * Get a reference to the block at blkno, returned through blk_ret; bf is
+ * a mask of the BF_ flags above.  Callers treat a negative return as an
+ * error and release the reference with block_put().
+ */
+int block_get(struct block **blk_ret, u64 blkno, int bf);
+void block_put(struct block **blkp);
+
+/* accessors for a held block; callers cast block_buf() to the on-disk block struct */
+void *block_buf(struct block *blk);
+size_t block_size(struct block *blk);
+void block_drop(struct block **blkp);
+
+/* NOTE(review): presumably hints that the nr block numbers in blknos will be read soon -- confirm */
+void block_readahead(u64 *blknos, size_t nr);
+int block_try_commit(bool force);
+
+/*
+ * block_setup() sizes the cache from max_cached_bytes and the aio
+ * context from max_dirty_bytes, returning 0 or a negative errno.
+ * block_shutdown() destroys the aio context and frees the arrays that
+ * block_setup() allocated.
+ */
+int block_setup(int meta_fd, size_t max_cached_bytes, size_t max_dirty_bytes);
+void block_shutdown(void);
+
+/* callers treat a negative return as a failed header check for the expected magic */
+int block_hdr_valid(struct block *blk, u64 blkno, int bf, u32 magic);
+
+#endif
@@ -0,0 +1,217 @@
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <errno.h>
+
+#include "sparse.h"
+#include "util.h"
+#include "format.h"
+#include "key.h"
+#include "avl.h"
+
+#include "block.h"
+#include "btree.h"
+#include "extent.h"
+#include "iter.h"
+#include "sns.h"
+#include "meta.h"
+#include "problem.h"
+
+/* Return a pointer to an item's value, which lives val_off bytes into its btree block. */
+static inline void *item_val(struct scoutfs_btree_block *bt, struct scoutfs_btree_item *item)
+{
+	unsigned int val_off = le16_to_cpu(item->val_off);
+	char *base = (char *)bt;
+
+	return base + val_off;
+}
+
+/*
+ * Start read-ahead of the blocks referenced by the items in a parent
+ * btree block.  Each item value is treated as a block ref and only
+ * block numbers that pass valid_meta_blkno() are submitted.  This is
+ * best effort: allocation failure silently skips the read-ahead.
+ */
+static void readahead_refs(struct scoutfs_btree_block *bt)
+{
+	struct scoutfs_btree_item *item;
+	struct scoutfs_avl_node *node;
+	struct scoutfs_block_ref *ref;
+	u64 *blknos;
+	u64 blkno;
+	u16 valid = 0;
+	u16 nr = le16_to_cpu(bt->nr_items);
+	int i;
+
+	if (nr == 0)
+		return;
+
+	blknos = calloc(nr, sizeof(blknos[0]));
+	if (!blknos)
+		return;
+
+	node = avl_first(&bt->item_root);
+
+	/*
+	 * nr_items comes from disk and may claim more items than the avl
+	 * tree holds, so stop once the walk runs out of nodes (assumes
+	 * avl_first()/avl_next() return NULL at the end -- TODO confirm).
+	 */
+	for (i = 0; i < nr && node; i++) {
+		item = container_of(node, struct scoutfs_btree_item, node);
+		ref = item_val(bt, item);
+		blkno = le64_to_cpu(ref->blkno);
+
+		if (valid_meta_blkno(blkno))
+			blknos[valid++] = blkno;
+
+		node = avl_next(&bt->item_root, &item->node);
+	}
+
+	if (valid > 0)
+		block_readahead(blknos, valid);
+	free(blknos);
+}
+
+/*
+ * Call the callback on the referenced block.  Then if the block
+ * contains references read it and recurse into all its references.
+ *
+ * level is the expected btree level of the referenced block; level 0
+ * blocks are leaves and are reported to the callback but not read.
+ *
+ * Returns 0 on success, including when the ref is empty or when the
+ * callback asks to stop with -ECHECK_ITER_DONE, or a negative errno
+ * from the callback, from reading the block, or from validating it.
+ */
+static int btree_ref_meta_iter(struct scoutfs_block_ref *ref, unsigned level, extent_cb_t cb,
+			       void *cb_arg)
+{
+	struct scoutfs_btree_item *item;
+	struct scoutfs_btree_block *bt;
+	struct scoutfs_avl_node *node;
+	struct block *blk = NULL;
+	u64 blkno;
+	int ret;
+	int i;
+
+	blkno = le64_to_cpu(ref->blkno);
+	if (!blkno)
+		return 0;
+
+	/* callback failures must reach the caller rather than being dropped */
+	ret = cb(blkno, 1, cb_arg);
+	if (ret < 0)
+		return xlate_iter_errno(ret);
+
+	if (level == 0)
+		return 0;
+
+	ret = block_get(&blk, blkno, 0);
+	if (ret < 0)
+		return ret;
+
+	ret = block_hdr_valid(blk, blkno, 0, SCOUTFS_BLOCK_MAGIC_BTREE);
+	if (ret < 0) {
+		/* nothing has been pushed on the sns stack yet, only drop the block */
+		block_put(&blk);
+		return ret;
+	}
+
+	sns_push("btree_parent", blkno, 0);
+
+	bt = block_buf(blk);
+
+	/* XXX integrate verification with block cache */
+	if (bt->level != level) {
+		problem(PB_BTREE_BLOCK_BAD_LEVEL, "expected %u level %u", level, bt->level);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* read-ahead last level of parents */
+	if (level == 2)
+		readahead_refs(bt);
+
+	node = avl_first(&bt->item_root);
+
+	/*
+	 * Stop early if the avl walk ends before nr_items is reached
+	 * (assumes avl_first()/avl_next() return NULL at the end -- TODO
+	 * confirm); nr_items comes from disk and can't be trusted.
+	 */
+	for (i = 0; node && i < le16_to_cpu(bt->nr_items); i++) {
+		item = container_of(node, struct scoutfs_btree_item, node);
+		ref = item_val(bt, item);
+
+		ret = btree_ref_meta_iter(ref, level - 1, cb, cb_arg);
+		if (ret < 0)
+			goto out;
+
+		node = avl_next(&bt->item_root, &item->node);
+	}
+
+	ret = 0;
+out:
+	block_put(&blk);
+	sns_pop();
+
+	return ret;
+}
+
+/*
+ * Call cb on every metadata block that makes up the btree at root.
+ * An empty tree (height 0) has no blocks and succeeds immediately.
+ */
+int btree_meta_iter(struct scoutfs_btree_root *root, extent_cb_t cb, void *cb_arg)
+{
+	int ret = 0;
+
+	/* XXX check root */
+	if (root->height != 0)
+		ret = btree_ref_meta_iter(&root->ref, root->height - 1, cb, cb_arg);
+
+	return ret;
+}
+
+/*
+ * Read the referenced btree block and either recurse into each child
+ * ref (parent levels) or call the callback on each item (level 0).
+ *
+ * Returns 0 on success or a negative errno.  Every exit after the block
+ * is acquired goes through "out" so that the block reference and the
+ * sns stack entry are always released.
+ *
+ * NOTE(review): xlate_iter_errno() is applied at every level, so a
+ * callback's -ECHECK_ITER_DONE ends only the current leaf's items and
+ * the parent continues with its next child -- confirm that is intended.
+ */
+static int btree_ref_item_iter(struct scoutfs_block_ref *ref, unsigned level,
+			       btree_item_cb_t cb, void *cb_arg)
+{
+	struct scoutfs_btree_item *item;
+	struct scoutfs_btree_block *bt;
+	struct scoutfs_avl_node *node;
+	struct block *blk = NULL;
+	u64 blkno;
+	int ret;
+	int i;
+
+	blkno = le64_to_cpu(ref->blkno);
+	if (!blkno)
+		return 0;
+
+	ret = block_get(&blk, blkno, 0);
+	if (ret < 0)
+		return ret;
+
+	if (level)
+		sns_push("btree_parent", blkno, 0);
+	else
+		sns_push("btree_leaf", blkno, 0);
+
+	/* the block is held and sns is pushed: unwind both on failure */
+	ret = block_hdr_valid(blk, blkno, 0, SCOUTFS_BLOCK_MAGIC_BTREE);
+	if (ret < 0)
+		goto out;
+
+	bt = block_buf(blk);
+
+	/* XXX integrate verification with block cache */
+	if (bt->level != level) {
+		problem(PB_BTREE_BLOCK_BAD_LEVEL, "expected %u level %u", level, bt->level);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* read-ahead leaves that contain items */
+	if (level == 1)
+		readahead_refs(bt);
+
+	node = avl_first(&bt->item_root);
+
+	/*
+	 * Stop early if the avl walk ends before nr_items is reached
+	 * (assumes avl_first()/avl_next() return NULL at the end -- TODO
+	 * confirm); nr_items comes from disk and can't be trusted.
+	 */
+	for (i = 0; node && i < le16_to_cpu(bt->nr_items); i++) {
+		item = container_of(node, struct scoutfs_btree_item, node);
+
+		if (level) {
+			ref = item_val(bt, item);
+			ret = btree_ref_item_iter(ref, level - 1, cb, cb_arg);
+		} else {
+			ret = cb(&item->key, item_val(bt, item),
+				 le16_to_cpu(item->val_len), cb_arg);
+			debug("free item key "SK_FMT" ret %d", SK_ARG(&item->key), ret);
+		}
+		if (ret < 0) {
+			ret = xlate_iter_errno(ret);
+			goto out;
+		}
+
+		node = avl_next(&bt->item_root, &item->node);
+	}
+
+	ret = 0;
+out:
+	block_put(&blk);
+	sns_pop();
+
+	return ret;
+}
+
+/*
+ * Call cb on every item stored in the btree at root.  An empty tree
+ * (height 0) has no items and succeeds immediately.
+ */
+int btree_item_iter(struct scoutfs_btree_root *root, btree_item_cb_t cb, void *cb_arg)
+{
+	int ret = 0;
+
+	/* XXX check root */
+	if (root->height != 0)
+		ret = btree_ref_item_iter(&root->ref, root->height - 1, cb, cb_arg);
+
+	return ret;
+}
@@ -0,0 +1,14 @@
+#ifndef _SCOUTFS_UTILS_CHECK_BTREE_H_
+#define _SCOUTFS_UTILS_CHECK_BTREE_H_
+
+#include "util.h"
+#include "format.h"
+
+#include "extent.h"
+
+/*
+ * Called for each leaf item with its key, a pointer to its value inside
+ * the block buffer, and the value length.  A negative return stops the
+ * iteration of the leaf that holds the item.
+ */
+typedef int (*btree_item_cb_t)(struct scoutfs_key *key, void *val, u16 val_len, void *cb_arg);
+
+/* btree_meta_iter() visits the tree's blocks, btree_item_iter() visits its items */
+int btree_meta_iter(struct scoutfs_btree_root *root, extent_cb_t cb, void *cb_arg);
+int btree_item_iter(struct scoutfs_btree_root *root, btree_item_cb_t cb, void *cb_arg);
+
+#endif
@@ -0,0 +1,184 @@
+#define _GNU_SOURCE /* O_DIRECT */
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <string.h>
+#include <assert.h>
+#include <stdbool.h>
+#include <argp.h>
+
+#include "sparse.h"
+#include "parse.h"
+#include "util.h"
+#include "format.h"
+#include "ioctl.h"
+#include "cmd.h"
+#include "dev.h"
+
+#include "alloc.h"
+#include "block.h"
+#include "debug.h"
+#include "meta.h"
+#include "super.h"
+#include "problem.h"
+
+/* Command line arguments for the check command, filled in by parse_opt(). */
+struct check_args {
+	char *meta_device;	/* first positional argument */
+	char *data_device;	/* second positional argument */
+	char *debug_path;	/* -d argument, "-" means stderr; NULL disables debug output */
+};
+
+/*
+ * Open the devices, set up the block cache, and run the checks in
+ * order, stopping at the first one that returns nonzero.
+ *
+ * Returns 0 when the checks ran, whether or not they found problems
+ * (problems are reported through problems_count()), or a negative
+ * errno on an operational failure.
+ */
+static int do_check(struct check_args *args)
+{
+	bool block_ready = false;
+	int debug_fd = -1;
+	int meta_fd = -1;
+	int data_fd = -1;
+	int ret;
+
+	if (args->debug_path) {
+		if (strcmp(args->debug_path, "-") == 0)
+			debug_fd = dup(STDERR_FILENO);
+		else
+			debug_fd = open(args->debug_path, O_WRONLY | O_CREAT | O_TRUNC, 0644);
+		if (debug_fd < 0) {
+			ret = -errno;
+			fprintf(stderr, "error opening debug output file '%s': %s (%d)\n",
+				args->debug_path, strerror(errno), errno);
+			goto out;
+		}
+
+		debug_enable(debug_fd);
+	}
+
+	meta_fd = open(args->meta_device, O_DIRECT | O_RDWR | O_EXCL);
+	if (meta_fd < 0) {
+		ret = -errno;
+		fprintf(stderr, "failed to open meta device '%s': %s (%d)\n",
+			args->meta_device, strerror(errno), errno);
+		goto out;
+	}
+
+	data_fd = open(args->data_device, O_DIRECT | O_RDWR | O_EXCL);
+	if (data_fd < 0) {
+		ret = -errno;
+		fprintf(stderr, "failed to open data device '%s': %s (%d)\n",
+			args->data_device, strerror(errno), errno);
+		goto out;
+	}
+
+	ret = block_setup(meta_fd, 128 * 1024 * 1024, 32 * 1024 * 1024);
+	if (ret < 0)
+		goto out;
+	block_ready = true;
+
+	/*
+	 * At some point we may convert this to a multi-pass system where we may
+	 * try and repair items, and, as long as repairs are made, we will rerun
+	 * the checks more times. We may need to start counting how many problems we
+	 * fix in the process of these loops, so that we don't stall on unrepairable
+	 * problems and are making actual repair progress. IOW - when we do a full
+	 * check loop without any problems fixed, we stop trying.
+	 */
+	ret = check_supers(data_fd) ?:
+	      check_super_in_use(meta_fd) ?:
+	      check_meta_alloc() ?:
+	      check_super_crc();
+
+	if (ret < 0)
+		goto out;
+
+	debug("problem count %lu", problems_count());
+	if (problems_count() > 0)
+		printf("Problems detected.\n");
+
+out:
+	/*
+	 * and tear it all down.  block_setup() frees its own allocations
+	 * when it fails, so only shut the block cache down once setup has
+	 * succeeded or its arrays would be freed twice.
+	 */
+	if (block_ready)
+		block_shutdown();
+	super_shutdown();
+	debug_disable();
+
+	if (meta_fd >= 0)
+		close(meta_fd);
+	if (data_fd >= 0)
+		close(data_fd);
+	if (debug_fd >= 0)
+		close(debug_fd);
+
+	return ret;
+}
+
+/*
+ * argp parser callback for the check command: records the debug path
+ * and the two positional device paths, and complains at the end of
+ * parsing if either device is missing.
+ */
+static int parse_opt(int key, char *arg, struct argp_state *state)
+{
+	struct check_args *args = state->input;
+	char **slot;
+
+	switch (key) {
+	case 'd':
+		args->debug_path = strdup_or_error(state, arg);
+		break;
+	case 'e':
+	case ARGP_KEY_ARG:
+		/* positional arguments fill the meta device first, then data */
+		if (!args->meta_device)
+			slot = &args->meta_device;
+		else if (!args->data_device)
+			slot = &args->data_device;
+		else
+			slot = NULL;
+
+		if (slot)
+			*slot = strdup_or_error(state, arg);
+		else
+			argp_error(state, "more than two device arguments given");
+		break;
+	case ARGP_KEY_FINI:
+		if (!args->meta_device)
+			argp_error(state, "no metadata device argument given");
+		if (!args->data_device)
+			argp_error(state, "no data device argument given");
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+/* options accepted by the check command */
+static struct argp_option options[] = {
+	{
+		.name = "debug",
+		.key = 'd',
+		.arg = "FILE_PATH",
+		.flags = 0,
+		.doc = "Path to debug output file, will be created or truncated",
+	},
+	{ NULL }
+};
+
+/* argp description of the check command */
+static struct argp argp = {
+	.options = options,
+	.parser = parse_opt,
+	.args_doc = "META-DEVICE DATA-DEVICE",
+	.doc = "Check filesystem consistency",
+};
+
+/* Exit codes used by fsck-type programs */
+#define FSCK_EX_NONDESTRUCT	1	/* File system errors corrected */
+#define FSCK_EX_UNCORRECTED	4	/* File system errors left uncorrected */
+#define FSCK_EX_ERROR		8	/* Operational error */
+#define FSCK_EX_USAGE		16	/* Usage or syntax error */
+
+/*
+ * Entry point of the check command.  This never returns: the process
+ * exits with an fsck-style status built from the FSCK_EX_ bits.
+ */
+static int check_cmd(int argc, char **argv)
+{
+	struct check_args check_args = {NULL};
+	int status;
+
+	if (argp_parse(&argp, argc, argv, 0, NULL, &check_args))
+		exit(FSCK_EX_USAGE);
+
+	status = do_check(&check_args);
+	if (status < 0)
+		status = FSCK_EX_ERROR;
+
+	/* problems may have been found even when the run itself failed */
+	if (problems_count() > 0)
+		status |= FSCK_EX_UNCORRECTED;
+
+	exit(status);
+}
+
+/* Runs before main() via the constructor attribute and registers the "check" command. */
+static void __attribute__((constructor)) check_ctor(void)
+{
+	cmd_register_argp("check", &argp, GROUP_CORE, check_cmd);
+}
@@ -0,0 +1,16 @@
+#include <stdlib.h>
+
+#include "debug.h"
+
+/* descriptor that debug() writes to; debug() does nothing while it is negative */
+int debug_fd = -1;
+
+/* Send debug() output to the given descriptor from now on. */
+void debug_enable(int fd)
+{
+	debug_fd = fd;
+}
+
+/* Turn debug() output off.  The descriptor is not closed here. */
+void debug_disable(void)
+{
+	if (debug_fd < 0)
+		return;
+
+	debug_fd = -1;
+}
@@ -0,0 +1,17 @@
+#ifndef _SCOUTFS_UTILS_CHECK_DEBUG_H_
+#define _SCOUTFS_UTILS_CHECK_DEBUG_H_
+
+#include <stdio.h>
+
+/*
+ * printf-style debug output.  The message, with a newline appended, is
+ * written to debug_fd with dprintf() only while debug_fd is
+ * non-negative.  fmt must be a string literal because "\n" is pasted
+ * onto it.
+ */
+#define debug(fmt, args...)				\
+do {							\
+	if (debug_fd >= 0)				\
+		dprintf(debug_fd, fmt"\n", ##args);	\
+} while (0)
+
+/* destination of debug() output, negative while disabled */
+extern int debug_fd;
+
+void debug_enable(int fd);
+void debug_disable(void);
+
+#endif
@@ -0,0 +1,9 @@
+#ifndef _SCOUTFS_UTILS_CHECK_ENO_H_
+#define _SCOUTFS_UTILS_CHECK_ENO_H_
+
+#include <errno.h>
+#include <string.h>
+
+/*
+ * printf helpers for reporting an errno as "number (description)":
+ *	printf("failed: "ENO_FMT"\n", ENO_ARG(err));
+ * ENO_ARG() expands its argument twice and calls strerror(), so the
+ * argument should have no side effects.
+ */
+#define ENO_FMT		"%d (%s)"
+#define ENO_ARG(eno)	eno, strerror(eno)
+
+#endif
@@ -0,0 +1,313 @@
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <errno.h>
+
+#include "util.h"
+#include "lk_rbtree_wrapper.h"
+
+#include "debug.h"
+#include "extent.h"
+
+/*
+ * In-memory extent management in rbtree nodes.
+ */
+
+/* True if the ranges [a_start, a_start + a_len) and [b_start, b_start + b_len) share any block. */
+bool extents_overlap(u64 a_start, u64 a_len, u64 b_start, u64 b_len)
+{
+	u64 a_end = a_start + a_len;
+	u64 b_end = b_start + b_len;
+
+	/* they overlap unless one ends at or before the start of the other */
+	return a_end > b_start && b_end > a_start;
+}
+
+/* True if the range lies entirely within the extent, edges included. */
+static int ext_contains(struct extent_node *ext, u64 start, u64 len)
+{
+	u64 ext_end = ext->start + ext->len;
+	u64 end = start + len;
+
+	return ext->start <= start && ext_end >= end;
+}
+
+/*
+ * True if the range sits strictly inside the extent, so that removing
+ * it would leave a piece of the extent on both its left and its right.
+ */
+static int ext_bisected(struct extent_node *ext, u64 start, u64 len)
+{
+	u64 ext_end = ext->start + ext->len;
+	u64 end = start + len;
+
+	return ext->start < start && ext_end > end;
+}
+
+/* Map an rbtree node to the extent embedding it, passing NULL through. */
+static struct extent_node *ext_from_rbnode(struct rb_node *rbnode)
+{
+	if (!rbnode)
+		return NULL;
+
+	return container_of(rbnode, struct extent_node, rbnode);
+}
+
+/* The extent following ext in the tree, or NULL if ext is NULL or last. */
+static struct extent_node *next_ext(struct extent_node *ext)
+{
+	if (!ext)
+		return NULL;
+
+	return ext_from_rbnode(rb_next(&ext->rbnode));
+}
+
+/* The extent preceding ext in the tree, or NULL if ext is NULL or first. */
+static struct extent_node *prev_ext(struct extent_node *ext)
+{
+	if (!ext)
+		return NULL;
+
+	return ext_from_rbnode(rb_prev(&ext->rbnode));
+}
+
+/*
+ * Input and output of walk_extents().  bisect_to_leaf is set by the
+ * caller; the remaining fields are written by the walk.
+ */
+struct walk_results {
+	unsigned bisect_to_leaf:1;	/* in: keep descending right past a bisected match */
+	struct extent_node *found;	/* an extent overlapping the range, or NULL */
+	struct extent_node *next;	/* last extent the walk went left of, or NULL */
+	struct rb_node *parent;		/* last node visited, parent for rb_link_node() */
+	struct rb_node **node;		/* link the walk stopped at, insertion point */
+};
+
+/*
+ * Descend the tree looking for an extent that overlaps the given range
+ * and record what was seen in wlk.  The walk normally stops at the
+ * first overlapping extent.  When the caller set bisect_to_leaf and
+ * the range bisects that extent, the walk instead keeps descending to
+ * the right so that wlk->parent and wlk->node describe where a new
+ * extent following the match can be linked.
+ */
+static void walk_extents(struct extent_root *root, u64 start, u64 len, struct walk_results *wlk)
+{
+	struct rb_node **link = &root->rbroot.rb_node;
+	struct extent_node *cur;
+	u64 end = start + len;
+
+	wlk->found = NULL;
+	wlk->next = NULL;
+	wlk->parent = NULL;
+
+	while (*link) {
+		cur = ext_from_rbnode(*link);
+		wlk->parent = *link;
+
+		if (end <= cur->start) {
+			/* range is entirely to the left of this extent */
+			wlk->next = cur;
+			link = &cur->rbnode.rb_left;
+			continue;
+		}
+
+		if (start >= cur->start + cur->len) {
+			/* range is entirely to the right of this extent */
+			link = &cur->rbnode.rb_right;
+			continue;
+		}
+
+		wlk->found = cur;
+		if (!wlk->bisect_to_leaf || !ext_bisected(cur, start, len))
+			break;
+
+		/* walk right so we can insert greater right from bisection */
+		link = &cur->rbnode.rb_right;
+	}
+
+	wlk->node = link;
+}
+
+/*
+ * Find an extent that overlaps with the given range.  On success the
+ * extent's start and len are copied into the caller's zeroed node and
+ * 0 is returned; -ENOENT is returned when nothing overlaps.
+ */
+int extent_lookup(struct extent_root *root, u64 start, u64 len, struct extent_node *found)
+{
+	struct walk_results wlk = { 0, };
+
+	walk_extents(root, start, len, &wlk);
+	if (!wlk.found)
+		return -ENOENT;
+
+	memset(found, 0, sizeof(*found));
+	found->start = wlk.found->start;
+	found->len = wlk.found->len;
+
+	return 0;
+}
+
+/*
+ * Return the first extent in the tree, or NULL if it is empty.  The
+ * returned pointer refers directly to the tree's node; callers who
+ * iterate through node references are entirely responsible for
+ * consistency while doing so.
+ */
+struct extent_node *extent_first(struct extent_root *root)
+{
+	struct walk_results wlk = { 0, };
+
+	/* look for block 0: either an extent covers it or the walk passes the first extent */
+	walk_extents(root, 0, 1, &wlk);
+	if (wlk.found)
+		return wlk.found;
+
+	return wlk.next;
+}
+
+/* The extent after ext in the tree, or NULL if ext is NULL or the last extent. */
+struct extent_node *extent_next(struct extent_node *ext)
+{
+	return ext ? ext_from_rbnode(rb_next(&ext->rbnode)) : NULL;
+}
+
+/* The extent before ext in the tree, or NULL if ext is NULL or the first extent. */
+struct extent_node *extent_prev(struct extent_node *ext)
+{
+	return ext ? ext_from_rbnode(rb_prev(&ext->rbnode)) : NULL;
+}
+
+/*
+ * Insert a new extent into the tree.  We can extend existing nodes,
+ * merge with neighbours, or remove existing extents entirely if we
+ * insert a range that fully spans existing nodes.
+ *
+ * If an existing extent overlaps the range and found_err is nonzero
+ * then nothing is modified and found_err is returned as is.  Otherwise
+ * returns 0, or -ENOMEM if a new node could not be allocated.
+ */
+static int walk_insert(struct extent_root *root, u64 start, u64 len, int found_err)
+{
+	struct walk_results wlk = { 0, };
+	struct extent_node *ext;
+	struct extent_node *nei;
+	int ret;
+
+	walk_extents(root, start, len, &wlk);
+
+	ext = wlk.found;
+	if (ext && found_err) {
+		ret = found_err;
+		goto out;
+	}
+
+	/* nothing overlaps: link a new node at the position the walk stopped at */
+	if (!ext) {
+		ext = malloc(sizeof(struct extent_node));
+		if (!ext) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		ext->start = start;
+		ext->len = len;
+
+		rb_link_node(&ext->rbnode, wlk.parent, wlk.node);
+		rb_insert_color(&ext->rbnode, &root->rbroot);
+	}
+
+	/* start by expanding an existing extent if our range is larger */
+	if (start < ext->start) {
+		ext->len += ext->start - start;
+		ext->start = start;
+	}
+	if (ext->start + ext->len < start + len)
+		ext->len += (start + len) - (ext->start + ext->len);
+
+	/* drop any fully spanned neighbors, possibly merging with a final adjacent one */
+
+	/* previous neighbours that touch or overlap ext are absorbed and freed */
+	while ((nei = prev_ext(ext))) {
+		if (nei->start + nei->len < ext->start)
+			break;
+
+		if (nei->start < ext->start) {
+			ext->len += ext->start - nei->start;
+			ext->start = nei->start;
+		}
+
+		rb_erase(&nei->rbnode, &root->rbroot);
+		free(nei);
+	}
+
+	/* and the same for following neighbours, growing ext's end instead */
+	while ((nei = next_ext(ext))) {
+		if (ext->start + ext->len < nei->start)
+			break;
+
+		if (ext->start + ext->len < nei->start + nei->len)
+			ext->len += (nei->start + nei->len) - (ext->start + ext->len);
+
+		rb_erase(&nei->rbnode, &root->rbroot);
+		free(nei);
+	}
+
+	ret = 0;
+out:
+	if (ret < 0)
+		debug("start %llu len %llu ret %d", start, len, ret);
+	return ret;
+}
+
+/*
+ * Insert a new extent.  The specified extent must not overlap with any
+ * existing extents or -EEXIST is returned.
+ */
+int extent_insert_new(struct extent_root *root, u64 start, u64 len)
+{
+	/* walk_insert() returns found_err verbatim, so pass the real errno */
+	return walk_insert(root, start, len, -EEXIST);
+}
+
+/*
+ * Insert an extent, extending any existing extents that may overlap.
+ */
+int extent_insert_extend(struct extent_root *root, u64 start, u64 len)
+{
+	return walk_insert(root, start, len, 0);
+}
+
+/*
+ * Remove the specified extent from an existing node.  The given extent must be fully
+ * contained in a single node or -ENOENT is returned.
+ *
+ * Removing from the middle of a node splits it, which allocates a new
+ * node for the right remainder and can return -ENOMEM.  Removing a
+ * whole node erases it from the tree and frees it.
+ */
+int extent_remove(struct extent_root *root, u64 start, u64 len)
+{
+	struct extent_node *ext;
+	struct extent_node *ins;
+	struct walk_results wlk = {
+		.bisect_to_leaf = 1,
+	};
+	int ret;
+
+	walk_extents(root, start, len, &wlk);
+
+	ext = wlk.found;
+	if (!ext || !ext_contains(ext, start, len)) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	if (ext_bisected(ext, start, len)) {
+		debug("found bisected start %llu len %llu", ext->start, ext->len);
+		ins = malloc(sizeof(struct extent_node));
+		if (!ins) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		/* the new node takes the remainder to the right of the removed range */
+		ins->start = start + len;
+		ins->len = (ext->start + ext->len) - ins->start;
+
+		/* the bisect walk left parent/node at the position following ext */
+		rb_link_node(&ins->rbnode, wlk.parent, wlk.node);
+		rb_insert_color(&ins->rbnode, &root->rbroot);
+	}
+
+	if (start > ext->start) {
+		/* keep the left remainder */
+		ext->len = start - ext->start;
+	} else if (len < ext->len) {
+		/* trim the front of the extent */
+		ext->start += len;
+		ext->len -= len;
+	} else {
+		/* the whole node is removed, don't leak it */
+		rb_erase(&ext->rbnode, &root->rbroot);
+		free(ext);
+	}
+
+	ret = 0;
+out:
+	debug("start %llu len %llu ret %d", start, len, ret);
+
+	return ret;
+}
+
+/* Initialize a root to an empty tree with a zero total. */
+void extent_root_init(struct extent_root *root)
+{
+	root->rbroot = RB_ROOT;
+	root->total = 0;
+}
+
+/* Erase and free every extent in the tree, leaving the root empty. */
+void extent_root_free(struct extent_root *root)
+{
+	struct rb_node *node = rb_first(&root->rbroot);
+	struct extent_node *ext;
+	struct rb_node *next;
+
+	while (node) {
+		/* find the successor before the node is erased and freed */
+		next = rb_next(node);
+
+		ext = rb_entry(node, struct extent_node, rbnode);
+		rb_erase(&ext->rbnode, &root->rbroot);
+		free(ext);
+
+		node = next;
+	}
+}
+
+/* Write every extent in the tree, in order, to the debug output. */
+void extent_root_print(struct extent_root *root)
+{
+	struct extent_node *ext;
+	struct rb_node *node;
+
+	for (node = rb_first(&root->rbroot); node; node = rb_next(node)) {
+		ext = rb_entry(node, struct extent_node, rbnode);
+		debug("  start %llu len %llu", ext->start, ext->len);
+	}
+}
@@ -0,0 +1,38 @@
+#ifndef _SCOUTFS_UTILS_CHECK_EXTENT_H_
+#define _SCOUTFS_UTILS_CHECK_EXTENT_H_
+
+#include "lk_rbtree_wrapper.h"
+
+/* A set of extents held in an rbtree. */
+struct extent_root {
+	struct rb_root rbroot;
+	u64 total;	/* zeroed by extent_root_init(); NOTE(review): presumably maintained by callers */
+};
+
+/* One extent covering the blocks [start, start + len). */
+struct extent_node {
+	struct rb_node rbnode;
+	u64 start;
+	u64 len;
+};
+
+/* Callback given a block range; the iterators that call it treat a negative return as a failure. */
+typedef int (*extent_cb_t)(u64 start, u64 len, void *arg);
+
+/* An extent callback bundled with its argument. */
+struct extent_cb_arg_t {
+	extent_cb_t cb;
+	void *cb_arg;
+};
+
+/* true if the two ranges share at least one block */
+bool extents_overlap(u64 a_start, u64 a_len, u64 b_start, u64 b_len);
+
+/* copies an overlapping extent's start and len into found, or returns -ENOENT */
+int extent_lookup(struct extent_root *root, u64 start, u64 len, struct extent_node *found);
+/* these return direct references to tree nodes, or NULL when there is no such extent */
+struct extent_node *extent_first(struct extent_root *root);
+struct extent_node *extent_next(struct extent_node *ext);
+struct extent_node *extent_prev(struct extent_node *ext);
+/* _new fails if the range overlaps an existing extent, _extend merges with it instead */
+int extent_insert_new(struct extent_root *root, u64 start, u64 len);
+int extent_insert_extend(struct extent_root *root, u64 start, u64 len);
+/* the range must be fully contained in one existing extent or -ENOENT is returned */
+int extent_remove(struct extent_root *root, u64 start, u64 len);
+
+void extent_root_init(struct extent_root *root);
+void extent_root_free(struct extent_root *root);
+void extent_root_print(struct extent_root *root);
+
+#endif
@@ -0,0 +1,540 @@
+#define _GNU_SOURCE /* O_DIRECT */
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <string.h>
+#include <stdbool.h>
+#include <argp.h>
+
+#include "sparse.h"
+#include "bitmap.h"
+#include "parse.h"
+#include "util.h"
+#include "format.h"
+#include "crc.h"
+#include "cmd.h"
+#include "dev.h"
+
+#include "alloc.h"
+#include "block.h"
+#include "btree.h"
+#include "log_trees.h"
+#include "super.h"
+
+/* huh. */
+#define OFF_MAX (off_t)((u64)((off_t)~0ULL) >> 1)
+
+#define SCOUTFS_META_IMAGE_HEADER_MAGIC		0x8aee00d098fa60c5ULL
+#define SCOUTFS_META_IMAGE_BLOCK_HEADER_MAGIC	0x70bd5e9269effd86ULL
+
+/*
+ * The image stream starts with one of these headers.  total_bytes is
+ * the size of the whole stream: this header plus every block header
+ * and block payload that follows it.
+ */
+struct scoutfs_meta_image_header {
+	__le64 magic;
+	__le64 total_bytes;
+	__le32 version;
+} __packed;
+
+/*
+ * Each block in the stream is preceded by one of these headers.  offset
+ * is the byte offset of the block on the device and size the length of
+ * the payload that follows.  crc covers this header, with its crc field
+ * zeroed, followed by the payload.
+ */
+struct scoutfs_meta_image_block_header {
+	__le64 magic;
+	__le64 offset;
+	__le32 size;
+	__le32 crc;
+} __packed;
+
+/* Arguments shared by the read and write image commands. */
+struct image_args {
+	char *meta_device;
+	bool is_read;		/* true for read-metadata-image, false for write */
+	bool show_header;	/* print the image header and stop */
+	u64 ra_window;		/* read-ahead window, consumed in bytes by read_image() */
+};
+
+/* Bitmap of the large blocks that make up the image. */
+struct block_bitmaps {
+	unsigned long *bits;
+	u64 size;	/* number of bits, one for each large block of the device */
+	u64 count;	/* number of bits currently set */
+};
+
+#define errf(fmt, args...) \
+	dprintf(STDERR_FILENO, fmt, ##args)
+
+/*
+ * extent_cb_t callback that records one referenced large block in the
+ * bitmap, counting each block only the first time it's seen.
+ *
+ * Returns 0 on success.  Returns -EINVAL if len isn't 1 or if the block
+ * number is outside of the bitmap.  Block numbers come from refs read
+ * off of the device and must never index past the allocated bits.
+ */
+static int set_meta_bit(u64 start, u64 len, void *arg)
+{
+	struct block_bitmaps *bm = arg;
+
+	if (len != 1)
+		return -EINVAL;
+
+	if (start >= bm->size) {
+		errf("referenced meta block %llu is beyond the device's %llu blocks\n",
+		     start, bm->size);
+		return -EINVAL;
+	}
+
+	if (!test_bit(bm->bits, start)) {
+		set_bit(bm->bits, start);
+		bm->count++;
+	}
+
+	return 0;
+}
+
+/*
+ * Set a bit in the bitmap for every metadata block that the super block
+ * references: the super and quorum blocks themselves, then the blocks
+ * of each allocator root and list, each btree, and the log trees.
+ *
+ * The iterators run in order and the first nonzero return stops the
+ * chain and is returned.  The return values of the set_meta_bit() calls
+ * for the super and quorum blocks are not checked.
+ */
+static int get_ref_bits(struct block_bitmaps *bm)
+{
+	struct scoutfs_super_block *super = global_super;
+	int ret;
+	u64 i;
+
+	/*
+	 * There are almost no small blocks we need to read, so we read
+	 * them as the large blocks that contain them to simplify the
+	 * block reading process.
+	 */
+	set_meta_bit(SCOUTFS_SUPER_BLKNO >> SCOUTFS_BLOCK_SM_LG_SHIFT, 1, bm);
+
+	for (i = 0; i < SCOUTFS_QUORUM_BLOCKS; i++)
+		set_meta_bit((SCOUTFS_QUORUM_BLKNO + i) >> SCOUTFS_BLOCK_SM_LG_SHIFT, 1, bm);
+
+	ret = alloc_root_meta_iter(&super->meta_alloc[0], set_meta_bit, bm) ?:
+	      alloc_root_meta_iter(&super->meta_alloc[1], set_meta_bit, bm) ?:
+	      alloc_root_meta_iter(&super->data_alloc, set_meta_bit, bm) ?:
+	      alloc_list_meta_iter(&super->server_meta_avail[0], set_meta_bit, bm) ?:
+	      alloc_list_meta_iter(&super->server_meta_avail[1], set_meta_bit, bm) ?:
+	      alloc_list_meta_iter(&super->server_meta_freed[0], set_meta_bit, bm) ?:
+	      alloc_list_meta_iter(&super->server_meta_freed[1], set_meta_bit, bm) ?:
+	      btree_meta_iter(&super->fs_root, set_meta_bit, bm) ?:
+	      btree_meta_iter(&super->logs_root, set_meta_bit, bm) ?:
+	      btree_meta_iter(&super->log_merge, set_meta_bit, bm) ?:
+	      btree_meta_iter(&super->mounted_clients, set_meta_bit, bm) ?:
+	      btree_meta_iter(&super->srch_root, set_meta_bit, bm) ?:
+	      log_trees_meta_iter(set_meta_bit, bm);
+
+	return ret;
+}
+
+/*
+ * Calculate the crc of a block header and its payload.  The header's
+ * crc field is treated as zero for the calculation: it is temporarily
+ * cleared in place and then restored before returning.
+ */
+static __le32 calc_crc(struct scoutfs_meta_image_block_header *bh, void *buf, size_t size)
+{
+	const __le32 orig_crc = bh->crc;
+	u32 sum = ~0;
+
+	bh->crc = 0;
+	sum = crc32c(crc32c(sum, bh, sizeof(*bh)), buf, size);
+	bh->crc = orig_crc;
+
+	return cpu_to_le32(sum);
+}
+
+/* Print the fields of an image header to stderr, one per line, in cpu byte order. */
+static void printf_header(struct scoutfs_meta_image_header *hdr)
+{
+	errf("magic: 0x%016llx\n"
+	     "total_bytes: %llu\n"
+	     "version: %u\n",
+	       le64_to_cpu(hdr->magic),
+	       le64_to_cpu(hdr->total_bytes),
+	       le32_to_cpu(hdr->version));
+}
+
+/*
+ * Common signature for the io wrappers below so that rw_full_count()
+ * can drive any of them.  The streaming read()/write() wrappers ignore
+ * the offset argument.
+ */
+typedef ssize_t (*rw_func_t)(int fd, void *buf, size_t count, off_t offset);
+
+/* read() from the descriptor's current position, offset is unused */
+static inline ssize_t rw_read(int fd, void *buf, size_t count, off_t offset)
+{
+	return read(fd, buf, count);
+}
+
+/* pread() at the given offset */
+static inline ssize_t rw_pread(int fd, void *buf, size_t count, off_t offset)
+{
+	return pread(fd, buf, count, offset);
+}
+
+/* write() at the descriptor's current position, offset is unused */
+static inline ssize_t rw_write(int fd, void *buf, size_t count, off_t offset)
+{
+	return write(fd, buf, count);
+}
+
+/* pwrite() at the given offset */
+static inline ssize_t rw_pwrite(int fd, void *buf, size_t count, off_t offset)
+{
+	return pwrite(fd, buf, count, offset);
+}
+
+/*
+ * Call the io function until all count bytes have been transferred,
+ * continuing after short reads and writes.  Both the buffer position
+ * and the file offset advance by what was transferred so that the
+ * positional wrappers continue where the short transfer stopped rather
+ * than repeating the same offset.
+ *
+ * If tot is not NULL the number of bytes transferred is added to it.
+ *
+ * Returns 0 once everything was transferred, -errno if the io function
+ * failed, and -EIO if it returned 0 (EOF) or more than was asked for.
+ */
+static int rw_full_count(rw_func_t func, u64 *tot, int fd, void *buf, size_t count, off_t offset)
+{
+	ssize_t sret;
+
+	while (count > 0) {
+		sret = func(fd, buf, count, offset);
+		if (sret < 0)
+			return -errno;
+		if (sret == 0 || (size_t)sret > count)
+			return -EIO;
+
+		if (tot)
+			*tot += sret;
+		buf = (char *)buf + sret;
+		count -= sret;
+		offset += sret;
+	}
+
+	return 0;
+}
+
+/*
+ * Write the image stream to stdout: the image header followed by a
+ * block header and the block contents for every bit set in the bitmap,
+ * read from the device in ascending order.
+ *
+ * Read-ahead hints are issued ahead of the block being read.  The
+ * first pass issues hints until args->ra_window bytes have been
+ * requested and then each later block read issues one more hint to
+ * keep the window open.
+ *
+ * With args->show_header set only the header is printed to stderr.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int read_image(struct image_args *args, int fd, struct block_bitmaps *bm)
+{
+	struct scoutfs_meta_image_block_header bh;
+	struct scoutfs_meta_image_header hdr;
+	u64 opening;
+	void *buf;
+	off_t off;
+	u64 bit;
+	u64 ra;
+	int ret;
+
+	buf = malloc(SCOUTFS_BLOCK_LG_SIZE);
+	if (!buf) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	hdr.magic = cpu_to_le64(SCOUTFS_META_IMAGE_HEADER_MAGIC);
+	hdr.total_bytes = cpu_to_le64(sizeof(hdr) +
+				      (bm->count * (SCOUTFS_BLOCK_LG_SIZE + sizeof(bh))));
+	hdr.version = cpu_to_le32(1);
+
+	if (args->show_header) {
+		printf_header(&hdr);
+		ret = 0;
+		goto out;
+	}
+
+	ret = rw_full_count(rw_write, NULL, STDOUT_FILENO, &hdr, sizeof(hdr), 0);
+	if (ret < 0)
+		goto out;
+
+	opening = args->ra_window;
+	ra = 0;
+
+	for (bit = 0; (bit = find_next_set_bit(bm->bits, bit, bm->size)) < bm->size; bit++) {
+
+		/* readahead to open the full window, then a block at a time */
+		do {
+			if (ra < bm->size)
+				ra = find_next_set_bit(bm->bits, ra, bm->size);
+			if (ra >= bm->size) {
+				/*
+				 * No blocks are left to read ahead.  Stop trying
+				 * to open the window or we'd spin here forever
+				 * when the image is smaller than the window.
+				 */
+				opening = 0;
+				break;
+			}
+
+			off = ra << SCOUTFS_BLOCK_LG_SHIFT;
+			posix_fadvise(fd, off, SCOUTFS_BLOCK_LG_SIZE, POSIX_FADV_WILLNEED);
+			ra++;
+			if (opening)
+				opening -= min(opening, SCOUTFS_BLOCK_LG_SIZE);
+		} while (opening > 0);
+
+		off = bit << SCOUTFS_BLOCK_LG_SHIFT;
+		ret = rw_full_count(rw_pread, NULL, fd, buf, SCOUTFS_BLOCK_LG_SIZE, off);
+		if (ret < 0)
+			goto out;
+
+		/*
+		 * Might as well try to drop the pages we've used to
+		 * reduce memory pressure on our read-ahead pages that
+		 * are waiting.
+		 */
+		posix_fadvise(fd, off, SCOUTFS_BLOCK_LG_SIZE, POSIX_FADV_DONTNEED);
+
+		bh.magic = cpu_to_le64(SCOUTFS_META_IMAGE_BLOCK_HEADER_MAGIC);
+		bh.offset = cpu_to_le64(off);
+		bh.size = cpu_to_le32(SCOUTFS_BLOCK_LG_SIZE);
+		bh.crc = calc_crc(&bh, buf, SCOUTFS_BLOCK_LG_SIZE);
+
+		ret = rw_full_count(rw_write, NULL, STDOUT_FILENO, &bh, sizeof(bh), 0) ?:
+		      rw_full_count(rw_write, NULL, STDOUT_FILENO, buf, SCOUTFS_BLOCK_LG_SIZE, 0);
+		if (ret < 0)
+			goto out;
+	}
+
+out:
+	free(buf);
+
+	return ret;
+}
+
+/*
+ * Check the magic and version of an image header.  Returns 0 if the
+ * header is acceptable, otherwise prints the reason to stderr and
+ * returns -EIO.
+ */
+static int invalid_header(struct scoutfs_meta_image_header *hdr)
+{
+	if (le64_to_cpu(hdr->magic) != SCOUTFS_META_IMAGE_HEADER_MAGIC) {
+		errf("bad image header magic 0x%016llx (!= expected %016llx)\n",
+		       le64_to_cpu(hdr->magic), SCOUTFS_META_IMAGE_HEADER_MAGIC);
+		return -EIO;
+	}
+
+	if (le32_to_cpu(hdr->version) != 1) {
+		errf("unknown image header version %u\n", le32_to_cpu(hdr->version));
+		return -EIO;
+	}
+
+	return 0;
+}
+
+/*
+ * Check a block header read from the stream.  Returns 0 if it's
+ * acceptable, otherwise prints the reason to stderr and returns -EIO.
+ *
+ * Doesn't catch offset+size overflowing, presumes pwrite() will return
+ * an error.
+ */
+static int invalid_block_header(struct scoutfs_meta_image_block_header *bh)
+{
+	if (le64_to_cpu(bh->magic) != SCOUTFS_META_IMAGE_BLOCK_HEADER_MAGIC) {
+		errf("bad block header magic 0x%016llx (!= expected %016llx)\n",
+		       le64_to_cpu(bh->magic), SCOUTFS_META_IMAGE_BLOCK_HEADER_MAGIC);
+		return -EIO;
+	}
+
+	if (le32_to_cpu(bh->size) == 0) {
+		errf("invalid block header size %u\n", le32_to_cpu(bh->size));
+		return -EIO;
+	}
+
+	if (le32_to_cpu(bh->size) > SIZE_MAX) {
+		errf("block header size %u too large for size_t (> %zu)\n",
+		       le32_to_cpu(bh->size), (size_t)SIZE_MAX);
+		return -EIO;
+	}
+
+	if (le64_to_cpu(bh->offset) > OFF_MAX) {
+		errf("block header offset %llu too large for off_t (> %llu)\n",
+		       le64_to_cpu(bh->offset), (u64)OFF_MAX);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+/*
+ * Read an image stream from stdin and write each of its blocks to the
+ * device at the offset recorded in the block's header, then fsync.
+ *
+ * Each block header is validated and the payload's crc is verified
+ * before it's written.  With args->show_header set only the stream's
+ * header is read and printed to stderr.
+ *
+ * Returns 0 on success or a negative errno.  The payload buffer is
+ * freed on every exit path.
+ */
+static int write_image(struct image_args *args, int fd, struct block_bitmaps *bm)
+{
+	struct scoutfs_meta_image_block_header bh;
+	struct scoutfs_meta_image_header hdr;
+	size_t writeback_batch = (2 * 1024 * 1024);
+	size_t buf_size = 0;
+	size_t dirty = 0;
+	size_t size;
+	off_t first = OFF_MAX;
+	off_t last = 0;
+	off_t end;
+	off_t off;
+	__le32 calc;
+	void *buf = NULL;
+	void *tmp;
+	u64 tot = 0;
+	int ret;
+
+	ret = rw_full_count(rw_read, &tot, STDIN_FILENO, &hdr, sizeof(hdr), 0);
+	if (ret < 0)
+		goto out;
+
+	if (args->show_header) {
+		printf_header(&hdr);
+		ret = 0;
+		goto out;
+	}
+
+	ret = invalid_header(&hdr);
+	if (ret < 0)
+		goto out;
+
+	while (tot < le64_to_cpu(hdr.total_bytes)) {
+
+		ret = rw_full_count(rw_read, &tot, STDIN_FILENO, &bh, sizeof(bh), 0);
+		if (ret < 0)
+			goto out;
+
+		ret = invalid_block_header(&bh);
+		if (ret < 0)
+			goto out;
+
+		size = le32_to_cpu(bh.size);
+		if (buf_size < size) {
+			/* keep the old buffer reachable so that it's freed if realloc fails */
+			tmp = realloc(buf, size);
+			if (!tmp) {
+				ret = -ENOMEM;
+				goto out;
+			}
+
+			buf = tmp;
+			buf_size = size;
+		}
+
+		ret = rw_full_count(rw_read, &tot, STDIN_FILENO, buf, size, 0);
+		if (ret < 0)
+			goto out;
+
+		calc = calc_crc(&bh, buf, size);
+		if (calc != bh.crc) {
+			errf("crc err");
+			ret = -EIO;
+			goto out;
+		}
+
+		off = le64_to_cpu(bh.offset);
+
+		ret = rw_full_count(rw_pwrite, NULL, fd, buf, size, off);
+		if (ret < 0)
+			goto out;
+
+		/* track the byte range written since the last batch: [first, last) */
+		end = off + (off_t)size;
+		dirty += size;
+		first = min(first, off);
+		last = max(last, end);
+		if (dirty >= writeback_batch) {
+			/* posix_fadvise() takes a length, not an end offset */
+			posix_fadvise(fd, first, last - first, POSIX_FADV_DONTNEED);
+			dirty = 0;
+			first = OFF_MAX;
+			last = 0;
+		}
+	}
+
+	ret = fsync(fd);
+	if (ret < 0) {
+		ret = -errno;
+		goto out;
+	}
+
+out:
+	free(buf);
+
+	return ret;
+}
+
+/*
+ * Open the metadata device and either stream its referenced metadata
+ * blocks to stdout (read) or restore a stream from stdin onto it
+ * (write).
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int do_image(struct image_args *args)
+{
+	struct block_bitmaps bm = { .bits = NULL };
+	int meta_fd = -1;
+	u64 dev_size;
+	int flags;
+	int ret;
+
+	flags = args->is_read ? O_RDONLY : O_RDWR;
+
+	meta_fd = open(args->meta_device, flags);
+	if (meta_fd < 0) {
+		ret = -errno;
+		errf("failed to open meta device '%s': %s (%d)\n",
+		     args->meta_device, strerror(errno), errno);
+		goto out;
+	}
+
+	if (args->is_read) {
+		ret = flush_device(meta_fd);
+		if (ret < 0)
+			goto out;
+
+		ret = get_device_size(args->meta_device, meta_fd, &dev_size);
+		if (ret < 0)
+			goto out;
+
+		/* one bit for each large block of the device, rounded up to whole longs */
+		bm.size = DIV_ROUND_UP(dev_size, SCOUTFS_BLOCK_LG_SIZE);
+		bm.bits = calloc(1, round_up(bm.size, BITS_PER_LONG) / 8);
+		if (!bm.bits) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		/*
+		 * block_setup() frees its own allocations when it fails, so
+		 * only call block_shutdown() once it has succeeded or its
+		 * arrays would be freed twice.
+		 */
+		ret = block_setup(meta_fd, 128 * 1024 * 1024, 32 * 1024 * 1024);
+		if (ret)
+			goto out;
+
+		ret = check_supers(-1) ?:
+		      get_ref_bits(&bm) ?:
+		      read_image(args, meta_fd, &bm);
+		block_shutdown();
+	} else {
+		ret = write_image(args, meta_fd, &bm);
+	}
+out:
+	free(bm.bits);
+
+	if (meta_fd >= 0)
+		close(meta_fd);
+
+	return ret;
+}
+
+/*
+ * argp parser callback shared by the read and write image commands:
+ * records the flags and the single positional device path, and
+ * complains at the end of parsing if the device is missing.
+ */
+static int parse_opt(int key, char *arg, struct argp_state *state)
+{
+	struct image_args *args = state->input;
+	int ret;
+
+	switch (key) {
+	case 'h':
+		args->show_header = true;
+		break;
+	case 'r':
+		ret = parse_u64(arg, &args->ra_window);
+		if (ret)
+			argp_error(state, "readahead window parse error");
+		break;
+	case ARGP_KEY_ARG:
+		/* these commands take exactly one device argument */
+		if (!args->meta_device)
+			args->meta_device = strdup_or_error(state, arg);
+		else
+			argp_error(state, "more than one device argument given");
+		break;
+	case ARGP_KEY_FINI:
+		if (!args->meta_device)
+			argp_error(state, "no metadata device argument given");
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+/*
+ * Options shared by the read and write image commands.  The readahead
+ * value is consumed in bytes: read_image() subtracts the large block
+ * size from the window for each read-ahead hint it issues.
+ */
+static struct argp_option options[] = {
+	{ "show-header", 'h', NULL, 0, "Print image header and exit without processing stream" },
+	{ "readahead", 'r', "NR", 0, "Maintain read-ahead window of NR bytes" },
+	{ NULL }
+};
+
+/* argp description of the read-metadata-image command */
+static struct argp read_image_argp = {
+	.options = options,
+	.parser = parse_opt,
+	.args_doc = "META-DEVICE",
+	.doc = "Read metadata image stream from metadata device file",
+};
+
+#define DEFAULT_RA_WINDOW (512 * 1024)
+
+/*
+ * Entry point of the read-metadata-image command: parse the arguments
+ * and stream the device's metadata to stdout.  Returns the argp error
+ * if parsing failed, otherwise the result of do_image().
+ */
+static int read_image_cmd(int argc, char **argv)
+{
+	struct image_args image_args = {
+		.is_read = true,
+		.ra_window = DEFAULT_RA_WINDOW,
+	};
+	int err = argp_parse(&read_image_argp, argc, argv, 0, NULL, &image_args);
+
+	if (err == 0)
+		err = do_image(&image_args);
+
+	return err;
+}
+
+/* argp description of the write-metadata-image command */
+static struct argp write_image_argp = {
+	.options = options,
+	.parser = parse_opt,
+	.args_doc = "META-DEVICE",
+	.doc = "Write metadata image stream to metadata device file",
+};
+
+/*
+ * Entry point of the write-metadata-image command: parse the arguments
+ * and restore the stream on stdin to the device.  Returns the argp
+ * error if parsing failed, otherwise the result of do_image().
+ */
+static int write_image_cmd(int argc, char **argv)
+{
+	struct image_args image_args = {
+		.is_read = false,
+		.ra_window = DEFAULT_RA_WINDOW,
+	};
+	int err = argp_parse(&write_image_argp, argc, argv, 0, NULL, &image_args);
+
+	if (err == 0)
+		err = do_image(&image_args);
+
+	return err;
+}
+
+/* Runs before main() via the constructor attribute and registers both image commands. */
+static void __attribute__((constructor)) image_ctor(void)
+{
+	cmd_register_argp("read-metadata-image", &read_image_argp, GROUP_CORE, read_image_cmd);
+	cmd_register_argp("write-metadata-image", &write_image_argp, GROUP_CORE, write_image_cmd);
+}
@@ -0,0 +1,15 @@
+#ifndef _SCOUTFS_UTILS_CHECK_ITER_H_
+#define _SCOUTFS_UTILS_CHECK_ITER_H_
+
+/* EL2HLT comes from errno.h; include it so this header stands on its own */
+#include <errno.h>
+
+/*
+ * Callbacks can return a weird -errno that we'll never use to indicate
+ * that iteration can stop and return 0 for success.
+ */
+#define ECHECK_ITER_DONE EL2HLT
+
+/*
+ * Translate a callback's return value for the caller of an iteration:
+ * -ECHECK_ITER_DONE becomes 0 (success), every other value, including
+ * other negative errnos, is returned unchanged.
+ */
+static inline int xlate_iter_errno(int ret)
+{
+	return ret == -ECHECK_ITER_DONE ? 0 : ret;
+}
+
+#endif
@@ -0,0 +1,98 @@
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+
+#include "sparse.h"
+#include "util.h"
+#include "format.h"
+#include "key.h"
+
+#include "alloc.h"
+#include "btree.h"
+#include "debug.h"
+#include "extent.h"
+#include "iter.h"
+#include "sns.h"
+#include "log_trees.h"
+#include "super.h"
+
+/*
+ * Bundles the caller's extent callback and its argument so that both can
+ * be passed through the single cb_arg pointer of btree_item_iter().
+ */
+struct iter_args {
+	extent_cb_t cb;		/* called with each referenced metadata extent */
+	void *cb_arg;		/* passed through to cb unchanged */
+};
+
+/*
+ * btree_item_iter() callback for one logs_root item.  The item value is
+ * treated as a struct scoutfs_log_trees and the caller's extent callback
+ * (carried in the struct iter_args passed as cb_arg) is run over every
+ * metadata structure it references, in this order: the meta_avail and
+ * meta_freed alloc lists, the item_root btree, the single bloom block if
+ * one is referenced, and the data_avail and data_freed alloc roots.
+ *
+ * Returns 0 once all structures were walked, or the first negative value
+ * returned by a walk.  A negative return from the bloom block callback
+ * is passed through xlate_iter_errno() first, so -ECHECK_ITER_DONE there
+ * ends this item with 0.
+ */
+static int lt_meta_iter(struct scoutfs_key *key, void *val, u16 val_len, void *cb_arg)
+{
+	struct iter_args *ia = cb_arg;
+	struct scoutfs_log_trees *lt;
+	int ret;
+
+	/* a value of the wrong size is not handled yet and is still walked */
+	if (val_len != sizeof(struct scoutfs_log_trees))
+		; /* XXX */
+
+	lt = val;
+
+	/* everything below is reported under this log_trees name scope */
+	sns_push("log_trees", le64_to_cpu(lt->rid), le64_to_cpu(lt->nr));
+
+	/* zero pad the rid, a bare width would put spaces after the 0x */
+	debug("lt rid 0x%016llx nr %llu", le64_to_cpu(lt->rid), le64_to_cpu(lt->nr));
+
+	sns_push("meta_avail", 0, 0);
+	ret = alloc_list_meta_iter(&lt->meta_avail, ia->cb, ia->cb_arg);
+	sns_pop();
+	if (ret < 0)
+		goto out;
+
+	sns_push("meta_freed", 0, 0);
+	ret = alloc_list_meta_iter(&lt->meta_freed, ia->cb, ia->cb_arg);
+	sns_pop();
+	if (ret < 0)
+		goto out;
+
+	sns_push("item_root", 0, 0);
+	ret = btree_meta_iter(&lt->item_root, ia->cb, ia->cb_arg);
+	sns_pop();
+	if (ret < 0)
+		goto out;
+
+	/* the bloom block is a single block reference, call the callback directly */
+	if (lt->bloom_ref.blkno) {
+		sns_push("bloom_ref", 0, 0);
+		ret = ia->cb(le64_to_cpu(lt->bloom_ref.blkno), 1, ia->cb_arg);
+		sns_pop();
+		if (ret < 0) {
+			ret = xlate_iter_errno(ret);
+			goto out;
+		}
+	}
+
+	sns_push("data_avail", 0, 0);
+	ret = alloc_root_meta_iter(&lt->data_avail, ia->cb, ia->cb_arg);
+	sns_pop();
+	if (ret < 0)
+		goto out;
+
+	sns_push("data_freed", 0, 0);
+	ret = alloc_root_meta_iter(&lt->data_freed, ia->cb, ia->cb_arg);
+	sns_pop();
+	if (ret < 0)
+		goto out;
+
+	ret = 0;
+out:
+	/* balances the log_trees push on every exit path */
+	sns_pop();
+
+	return ret;
+}
+
+/*
+ * Call the caller's callback with the extent of every metadata block
+ * reference contained in the log btrees.  Each item in the super block's
+ * logs_root btree is handed to lt_meta_iter(), which walks all the
+ * metadata structures that the item references.  The return value is
+ * whatever btree_item_iter() returns.
+ */
+int log_trees_meta_iter(extent_cb_t cb, void *cb_arg)
+{
+	struct iter_args ia = {
+		.cb = cb,
+		.cb_arg = cb_arg,
+	};
+
+	return btree_item_iter(&global_super->logs_root, lt_meta_iter, &ia);
+}
@@ -0,0 +1,8 @@
+#ifndef _SCOUTFS_UTILS_CHECK_LOG_TREES_H_
+#define _SCOUTFS_UTILS_CHECK_LOG_TREES_H_
+
+#include "extent.h"
+
+/*
+ * Calls cb, with cb_arg, for the metadata block references found in the
+ * log trees.
+ */
+int log_trees_meta_iter(extent_cb_t cb, void *cb_arg);
+
+#endif
@@ -0,0 +1,367 @@
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <sys/mman.h>
+#include <errno.h>
+
+#include "sparse.h"
+#include "util.h"
+#include "format.h"
+#include "bitmap.h"
+#include "key.h"
+
+#include "alloc.h"
+#include "btree.h"
+#include "debug.h"
+#include "extent.h"
+#include "sns.h"
+#include "log_trees.h"
+#include "meta.h"
+#include "problem.h"
+#include "super.h"
+
+/*
+ * State of the metadata allocator check, kept in one file-scope instance.
+ */
+static struct meta_data {
+	/* metadata blocks found to be referenced, filled by get_meta_refs() */
+	struct extent_root meta_refed;
+	/* free metadata extents found in the allocators, filled by get_meta_free() */
+	struct extent_root meta_free;
+	/* counters that are only reported through debug() output */
+	struct {
+		u64 ref_blocks;
+		u64 free_extents;
+		u64 free_blocks;
+	} stats;
+} global_mdat;
+
+/*
+ * Return true if blkno is at or after the first usable metadata device
+ * block and before the super block's total_meta_blocks.
+ */
+bool valid_meta_blkno(u64 blkno)
+{
+	if (blkno < SCOUTFS_META_DEV_START_BLKNO)
+		return false;
+
+	return blkno < le64_to_cpu(global_super->total_meta_blocks);
+}
+
+/*
+ * Return true if the extent is not empty, starts at or after the first
+ * usable metadata device block, and ends at or before the super block's
+ * total_meta_blocks without wrapping.  An invalid extent is recorded as
+ * a PB_META_EXTENT_INVALID problem before false is returned.
+ */
+static bool valid_meta_extent(u64 start, u64 len)
+{
+	u64 tot = le64_to_cpu(global_super->total_meta_blocks);
+	u64 end = start + len;	/* can wrap, which the last test catches */
+	bool valid = true;
+
+	if (len == 0 || len > tot)
+		valid = false;
+	else if (start < SCOUTFS_META_DEV_START_BLKNO || start >= tot)
+		valid = false;
+	else if (end > tot || end <= start)
+		valid = false;
+
+	debug("start %llu len %llu valid %u", start, len, !!valid);
+
+	if (!valid)
+		problem(PB_META_EXTENT_INVALID, "start %llu len %llu", start, len);
+
+	return valid;
+}
+
+/*
+ * Track references to individual metadata blocks.  This uses the extent
+ * callback type but only accepts single block references, any other
+ * length returns -EINVAL.  Block numbers that fail valid_meta_blkno()
+ * are skipped and return 0.
+ *
+ * A reference to a block that has already been referenced is recorded
+ * as a PB_META_REF_OVERLAPS_EXISTING problem and isn't inserted again.
+ * The -EEXIST from the insertion is still returned to the caller, as is
+ * any other insertion error.
+ *
+ * NOTE(review): presumably later repair resolves duplicate references,
+ * confirm whether callers are meant to see -EEXIST here.
+ */
+static int insert_meta_ref(u64 start, u64 len, void *arg)
+{
+	struct extent_root *root = arg;
+	int ret;
+
+	/* this is tracking single metadata block references */
+	if (len != 1)
+		return -EINVAL;
+
+	if (!valid_meta_blkno(start))
+		return 0;
+
+	ret = extent_insert_new(root, start, len);
+	switch (ret) {
+	case 0:
+		global_mdat.stats.ref_blocks++;
+		break;
+	case -EEXIST:
+		problem(PB_META_REF_OVERLAPS_EXISTING, "blkno %llu", start);
+		break;
+	default:
+		break;
+	}
+
+	return ret;
+}
+
+/*
+ * Extent callback that tracks free metadata extents in the extent root
+ * passed as arg.  Extents that fail valid_meta_extent() are skipped and
+ * return 0.  A newly inserted extent adds one to the free extent count
+ * and its length to the free block count.  An extent that overlaps one
+ * already inserted is recorded as a PB_META_FREE_OVERLAPS_EXISTING
+ * problem.  The result of the insertion, including -EEXIST, is returned.
+ */
+static int insert_meta_free(u64 start, u64 len, void *arg)
+{
+	struct meta_data *mdat = &global_mdat;
+	struct extent_root *root = arg;
+	int ret = 0;
+
+	if (valid_meta_extent(start, len)) {
+		ret = extent_insert_new(root, start, len);
+		if (ret == 0) {
+			mdat->stats.free_extents++;
+			/* count every block in the extent, not one per extent */
+			mdat->stats.free_blocks += len;
+		} else if (ret == -EEXIST) {
+			problem(PB_META_FREE_OVERLAPS_EXISTING,
+				"start %llu len %llu", start, len);
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * Walk all metadata references in the system.  This walk doesn't need
+ * to read metadata that doesn't contain any metadata references so it
+ * can skip the bulk of metadata blocks.  This gives us the set of
+ * referenced metadata blocks which we can then use to repair metadata
+ * allocator structures.
+ *
+ * Every structure rooted in the super block is walked with
+ * insert_meta_ref() filling global_mdat.meta_refed, each under its own
+ * pushed name so that reports can identify where a reference came from.
+ * Returns 0 after all walks finish or the first negative value returned
+ * by a walk.
+ */
+static int get_meta_refs(void)
+{
+	struct meta_data *mdat = &global_mdat;
+	struct scoutfs_super_block *super = global_super;
+	int ret;
+
+	extent_root_init(&mdat->meta_refed);
+
+	/* XXX record reserved blocks around super as referenced */
+
+	sns_push("meta_alloc", 0, 0);
+	ret = alloc_root_meta_iter(&super->meta_alloc[0], insert_meta_ref, &mdat->meta_refed);
+	sns_pop();
+	if (ret < 0)
+		goto out;
+
+	sns_push("meta_alloc", 1, 0);
+	ret = alloc_root_meta_iter(&super->meta_alloc[1], insert_meta_ref, &mdat->meta_refed);
+	sns_pop();
+	if (ret < 0)
+		goto out;
+
+	/* data_alloc is a single root, not an array, so it has no index */
+	sns_push("data_alloc", 0, 0);
+	ret = alloc_root_meta_iter(&super->data_alloc, insert_meta_ref, &mdat->meta_refed);
+	sns_pop();
+	if (ret < 0)
+		goto out;
+
+	sns_push("server_meta_avail", 0, 0);
+	ret = alloc_list_meta_iter(&super->server_meta_avail[0],
+				   insert_meta_ref, &mdat->meta_refed);
+	sns_pop();
+	if (ret < 0)
+		goto out;
+
+	sns_push("server_meta_avail", 1, 0);
+	ret = alloc_list_meta_iter(&super->server_meta_avail[1],
+				   insert_meta_ref, &mdat->meta_refed);
+	sns_pop();
+	if (ret < 0)
+		goto out;
+
+	sns_push("server_meta_freed", 0, 0);
+	ret = alloc_list_meta_iter(&super->server_meta_freed[0],
+				   insert_meta_ref, &mdat->meta_refed);
+	sns_pop();
+	if (ret < 0)
+		goto out;
+
+	sns_push("server_meta_freed", 1, 0);
+	ret = alloc_list_meta_iter(&super->server_meta_freed[1],
+				   insert_meta_ref, &mdat->meta_refed);
+	sns_pop();
+	if (ret < 0)
+		goto out;
+
+	sns_push("fs_root", 0, 0);
+	ret = btree_meta_iter(&super->fs_root, insert_meta_ref, &mdat->meta_refed);
+	sns_pop();
+	if (ret < 0)
+		goto out;
+
+	sns_push("logs_root", 0, 0);
+	ret = btree_meta_iter(&super->logs_root, insert_meta_ref, &mdat->meta_refed);
+	sns_pop();
+	if (ret < 0)
+		goto out;
+
+	sns_push("log_merge", 0, 0);
+	ret = btree_meta_iter(&super->log_merge, insert_meta_ref, &mdat->meta_refed);
+	sns_pop();
+	if (ret < 0)
+		goto out;
+
+	sns_push("mounted_clients", 0, 0);
+	ret = btree_meta_iter(&super->mounted_clients, insert_meta_ref, &mdat->meta_refed);
+	sns_pop();
+	if (ret < 0)
+		goto out;
+
+	sns_push("srch_root", 0, 0);
+	ret = btree_meta_iter(&super->srch_root, insert_meta_ref, &mdat->meta_refed);
+	sns_pop();
+	if (ret < 0)
+		goto out;
+
+	/* no name is pushed here for the walk of the log trees items */
+	ret = log_trees_meta_iter(insert_meta_ref, &mdat->meta_refed);
+	if (ret < 0)
+		goto out;
+
+	debug("found %llu referenced metadata blocks", mdat->stats.ref_blocks);
+	ret = 0;
+out:
+	return ret;
+}
+
+/*
+ * Collect the free metadata extents stored in the super block's
+ * allocators into global_mdat.meta_free using insert_meta_free().  Both
+ * meta_alloc roots are walked first, then both server_meta_avail lists,
+ * then both server_meta_freed lists, each under its own pushed name.
+ * Returns 0 after all walks finish or the first negative value returned
+ * by a walk.
+ */
+static int get_meta_free(void)
+{
+	struct scoutfs_super_block *super = global_super;
+	struct extent_root *root = &global_mdat.meta_free;
+	int ret;
+	int i;
+
+	extent_root_init(root);
+
+	for (i = 0; i < 2; i++) {
+		sns_push("meta_alloc", i, 0);
+		ret = alloc_root_extent_iter(&super->meta_alloc[i], insert_meta_free, root);
+		sns_pop();
+		if (ret < 0)
+			goto out;
+	}
+
+	for (i = 0; i < 2; i++) {
+		sns_push("server_meta_avail", i, 0);
+		ret = alloc_list_extent_iter(&super->server_meta_avail[i],
+					     insert_meta_free, root);
+		sns_pop();
+		if (ret < 0)
+			goto out;
+	}
+
+	for (i = 0; i < 2; i++) {
+		sns_push("server_meta_freed", i, 0);
+		ret = alloc_list_extent_iter(&super->server_meta_freed[i],
+					     insert_meta_free, root);
+		sns_pop();
+		if (ret < 0)
+			goto out;
+	}
+
+	debug("found %llu free metadata blocks in %llu extents",
+	       global_mdat.stats.free_blocks, global_mdat.stats.free_extents);
+	ret = 0;
+out:
+	return ret;
+}
+
+/*
+ * All the space between referenced blocks must be recorded in the free
+ * extents.  The free extent walk didn't check whether the free extents
+ * overlapped with references, we do that here.  Remember that metadata
+ * block references were merged into extents here, the refed extents
+ * aren't necessarily all a single block.
+ *
+ * Both extent sets are walked together in block order.  A free region
+ * that overlaps a referenced extent is removed from meta_free.  Regions
+ * covered by neither set are only reported through debug(), no problem
+ * is recorded for them, and nothing after the last extent of either set
+ * is examined.  This always returns 0.
+ */
+static int compare_refs_and_free(void)
+{
+	struct meta_data *mdat = &global_mdat;
+	struct extent_node *ref;
+	struct extent_node *free;
+	struct extent_node *next;
+	struct extent_node *prev;
+	u64 expect;
+	u64 start;
+	u64 end;
+
+	/* expect is the first block not yet covered by an extent we've passed */
+	expect = 0;
+	ref = extent_first(&mdat->meta_refed);
+	free = extent_first(&mdat->meta_free);
+	while (ref || free) {
+
+		debug("exp %llu ref %llu.%llu free %llu.%llu",
+			expect, ref ? ref->start : 0, ref ? ref->len : 0,
+			free ? free->start : 0, free ? free->len : 0);
+
+		/* referenced marked free, remove ref from free and continue from same point */
+		if (ref && free && extents_overlap(ref->start, ref->len, free->start, free->len)) {
+			debug("ref extent %llu.%llu overlaps free %llu %llu",
+				ref->start, ref->len, free->start, free->len);
+
+			/* [start, end) is the intersection of the two extents */
+			start = max(ref->start, free->start);
+			end = min(ref->start + ref->len, free->start + free->len);
+
+			/*
+			 * Remember the preceding free extent so the walk can
+			 * be resumed after the removal; presumably the free
+			 * node itself can't be trusted once extent_remove()
+			 * has run.
+			 */
+			prev = extent_prev(free);
+
+			extent_remove(&mdat->meta_free, start, end - start);
+
+			if (prev)
+				free = extent_next(prev);
+			else
+				free = extent_first(&mdat->meta_free);
+			continue;
+		}
+
+		/* see which extent starts earlier */
+		if (!free || (ref && ref->start <= free->start))
+			next = ref;
+		else
+			next = free;
+
+		/* untracked region before next extent */
+		if (expect < next->start) {
+			debug("missing free extent %llu.%llu", expect, next->start - expect);
+			expect = next->start;
+			continue;
+		}
+
+
+		/* didn't overlap, advance past next extent */
+		expect = next->start + next->len;
+		if (next == ref)
+			ref = extent_next(ref);
+		else
+			free = extent_next(free);
+	}
+
+	return 0;
+}
+
+/*
+ * Check the metadata allocators by comparing the set of referenced
+ * blocks with the set of free blocks that are stored in free btree
+ * items and alloc list blocks.  The three phases run in order and the
+ * first one to return a negative value stops the check and has that
+ * value returned; otherwise 0 is returned.
+ */
+int check_meta_alloc(void)
+{
+	int ret;
+
+	ret = get_meta_refs();
+	if (ret >= 0)
+		ret = get_meta_free();
+	if (ret >= 0)
+		ret = compare_refs_and_free();
+
+	return ret < 0 ? ret : 0;
+}
--- a/Show More
+++ b/Show More