Disable mount-unmount-race test

The mount-unmount-race test is occasionally hanging.  Disable it while we debug it so that we still have test coverage for unrelated work. Signed-off-by: Zach Brown <zab@versity.com>
Merge pull request #14 from agrover/fix-jira-202
2026-06-09 21:22:36 +00:00 · 2021-02-01 10:07:47 -08:00 · 2021-02-01 09:46:01 -08:00 · 2021-02-01 09:24:59 -08:00 · 2021-01-29 09:30:57 -08:00 · 2021-01-26 16:07:05 -08:00
79 changed files with 3565 additions and 2074 deletions
@@ -31,15 +31,9 @@ functionality hasn't been implemented.  It's appropriate for early
 adopters and interested developers, not for production use.

 In that vein, expect significant incompatible changes to both the format
-of network messages and persistent structures.  To avoid mistakes the
-implementation currently calculates a hash of the format and ioctl
-header files in the source tree.  The kernel module will refuse to mount
-a volume created by userspace utilities with a mismatched hash, and it
-will refuse to connect to a remote node with a mismatched hash.  This
-means having to unmount, mkfs, and remount everything across many
-functional changes.  Once the format is nailed down we'll wire up
-forward and back compat machinery and remove this temporary safety
-measure. 
+of network messages and persistent structures.  The format hash check was
+removed in preparation for release, so if there is any doubt about a
+volume's format, running mkfs again is strongly recommended.

 The current kernel module is developed against the RHEL/CentOS 7.x
 kernel to minimize the friction of developing and testing with partners'
@@ -16,11 +16,7 @@ SCOUTFS_GIT_DESCRIBE := \
 	$(shell git describe --all --abbrev=6 --long 2>/dev/null || \
 		echo no-git)

-SCOUTFS_FORMAT_HASH := \
-	$(shell cat src/format.h src/ioctl.h | md5sum | cut -b1-16)
-
 SCOUTFS_ARGS := SCOUTFS_GIT_DESCRIBE=$(SCOUTFS_GIT_DESCRIBE) \
-		SCOUTFS_FORMAT_HASH=$(SCOUTFS_FORMAT_HASH) \
 		CONFIG_SCOUTFS_FS=m -C $(SK_KSRC) M=$(CURDIR)/src \
 		EXTRA_CFLAGS="-Werror"

@@ -1,7 +1,6 @@
 obj-$(CONFIG_SCOUTFS_FS) := scoutfs.o

-CFLAGS_super.o = -DSCOUTFS_GIT_DESCRIBE=\"$(SCOUTFS_GIT_DESCRIBE)\" \
-		 -DSCOUTFS_FORMAT_HASH=0x$(SCOUTFS_FORMAT_HASH)LLU
+CFLAGS_super.o = -DSCOUTFS_GIT_DESCRIBE=\"$(SCOUTFS_GIT_DESCRIBE)\"

 CFLAGS_scoutfs_trace.o = -I$(src) # define_trace.h double include

@@ -657,6 +657,60 @@ out:
 	return ret;
 }

+/*
+ * Set up a run-time data allocator from the caller's avail root.  The
+ * root is copied in, the cached extent starts out empty, and the
+ * sampled total_len is seeded from the root's total_len.
+ */
+void scoutfs_dalloc_init(struct scoutfs_data_alloc *dalloc,
+			 struct scoutfs_alloc_root *data_avail)
+{
+	u64 total;
+
+	dalloc->root = *data_avail;
+	total = le64_to_cpu(dalloc->root.total_len);
+
+	/* nothing is cached yet so the root alone makes up the total */
+	memset(&dalloc->cached, 0, sizeof(dalloc->cached));
+	atomic64_set(&dalloc->total_len, total);
+}
+
+/*
+ * Copy the allocator's current avail root out to the caller's
+ * data_avail.  The cached extent and total_len are not touched.
+ */
+void scoutfs_dalloc_get_root(struct scoutfs_data_alloc *dalloc,
+			     struct scoutfs_alloc_root *data_avail)
+{
+	*data_avail = dalloc->root;
+}
+
+/*
+ * Refresh the sampled total so that it covers both the extent items
+ * tracked by the root and whatever remains in the cached extent.
+ */
+static void dalloc_update_total_len(struct scoutfs_data_alloc *dalloc)
+{
+	u64 total = le64_to_cpu(dalloc->root.total_len);
+
+	total += dalloc->cached.len;
+	atomic64_set(&dalloc->total_len, total);
+}
+
+/*
+ * Sample the allocator's total_len.  This is only an atomic read of
+ * the value last stored by init or by a successful allocation.
+ */
+u64 scoutfs_dalloc_total_len(struct scoutfs_data_alloc *dalloc)
+{
+	u64 total = atomic64_read(&dalloc->total_len);
+
+	return total;
+}
+
+/*
+ * Give the in-memory cached free extent back to the avail root by
+ * inserting it as extent items.  The cached extent is only cleared
+ * once the insertion has succeeded; on error it is left as it was and
+ * the error is returned.  Returns 0 when there was nothing cached.
+ *
+ * The caller is expected to hold the same locking that protects
+ * _alloc_data and _free_data.
+ */
+int scoutfs_dalloc_return_cached(struct super_block *sb,
+				 struct scoutfs_alloc *alloc,
+				 struct scoutfs_block_writer *wri,
+				 struct scoutfs_data_alloc *dalloc)
+{
+	struct alloc_ext_args args = {
+		.alloc = alloc,
+		.wri = wri,
+		.root = &dalloc->root,
+		.type = SCOUTFS_FREE_EXTENT_BLKNO_TYPE,
+	};
+	int err;
+
+	/* an empty cached extent has nothing to hand back */
+	if (dalloc->cached.len == 0)
+		return 0;
+
+	err = scoutfs_ext_insert(sb, &alloc_ext_ops, &args,
+				 dalloc->cached.start, dalloc->cached.len,
+				 0, 0);
+	if (err != 0)
+		return err;
+
+	memset(&dalloc->cached, 0, sizeof(dalloc->cached));
+	return 0;
+}
+
+
 /*
 * Allocate a data extent.  An extent that's smaller than the requested
 * size can be returned.
@@ -671,14 +725,13 @@ out:
 */
 int scoutfs_alloc_data(struct super_block *sb, struct scoutfs_alloc *alloc,
 		       struct scoutfs_block_writer *wri,
-		       struct scoutfs_alloc_root *root,
-		       struct scoutfs_extent *cached, u64 count,
+		       struct scoutfs_data_alloc *dalloc, u64 count,
 		       u64 *blkno_ret, u64 *count_ret)
 {
 	struct alloc_ext_args args = {
 		.alloc = alloc,
 		.wri = wri,
-		.root = root,
+		.root = &dalloc->root,
 		.type = SCOUTFS_FREE_EXTENT_LEN_TYPE,
 	};
 	struct scoutfs_extent ext;
@@ -699,27 +752,35 @@ int scoutfs_alloc_data(struct super_block *sb, struct scoutfs_alloc *alloc,
 	}

 	/* smaller allocations come from a cached extent */
-	if (cached->len == 0) {
+	if (dalloc->cached.len == 0) {
 		ret = scoutfs_ext_alloc(sb, &alloc_ext_ops, &args, 0, 0,
-					SCOUTFS_ALLOC_DATA_LG_THRESH, cached);
+					SCOUTFS_ALLOC_DATA_LG_THRESH,
+					&dalloc->cached);
 		if (ret < 0)
 			goto out;
 	}

-	len = min(count, cached->len);
+	len = min(count, dalloc->cached.len);

-	*blkno_ret = cached->start;
+	*blkno_ret = dalloc->cached.start;
 	*count_ret = len;

-	cached->start += len;
-	cached->len -= len;
+	dalloc->cached.start += len;
+	dalloc->cached.len -= len;
 	ret = 0;
 out:
 	if (ret < 0) {
+		/*
+		 * Special retval meaning there wasn't space to alloc from
+		 * this txn. Doesn't mean filesystem is completely full.
+		 * Maybe upper layers want to try again.
+		 */
 		if (ret == -ENOENT)
-			ret = -ENOSPC;
+			ret = -ENOBUFS;
 		*blkno_ret = 0;
 		*count_ret = 0;
+	} else {
+		dalloc_update_total_len(dalloc);
 	}

 	scoutfs_inc_counter(sb, alloc_alloc_data);
@@ -80,6 +80,18 @@ struct scoutfs_alloc {
 	struct scoutfs_alloc_list_head freed;
 };

+/*
+ * A run-time data allocator.  We have a cached extent in memory that is
+ * a lot cheaper to work with than the extent items, and we have a
+ * consistent record of the total_len that can be sampled outside of the
+ * usual heavy serialization of the extent modifications.
+ */
+struct scoutfs_data_alloc {
+	/* copy of the data avail root, set by scoutfs_dalloc_init() */
+	struct scoutfs_alloc_root root;
+	/* in-memory free extent that smaller allocations are carved from */
+	struct scoutfs_extent cached;
+	/* root.total_len plus cached.len, read by scoutfs_dalloc_total_len() */
+	atomic64_t total_len;
+};
+
 void scoutfs_alloc_init(struct scoutfs_alloc *alloc,
 			struct scoutfs_alloc_list_head *avail,
 			struct scoutfs_alloc_list_head *freed);
@@ -92,10 +104,18 @@ int scoutfs_alloc_meta(struct super_block *sb, struct scoutfs_alloc *alloc,
 int scoutfs_free_meta(struct super_block *sb, struct scoutfs_alloc *alloc,
 		      struct scoutfs_block_writer *wri, u64 blkno);

+void scoutfs_dalloc_init(struct scoutfs_data_alloc *dalloc,
+			 struct scoutfs_alloc_root *data_avail);
+void scoutfs_dalloc_get_root(struct scoutfs_data_alloc *dalloc,
+			     struct scoutfs_alloc_root *data_avail);
+u64 scoutfs_dalloc_total_len(struct scoutfs_data_alloc *dalloc);
+int scoutfs_dalloc_return_cached(struct super_block *sb,
+				 struct scoutfs_alloc *alloc,
+				 struct scoutfs_block_writer *wri,
+				 struct scoutfs_data_alloc *dalloc);
 int scoutfs_alloc_data(struct super_block *sb, struct scoutfs_alloc *alloc,
 		       struct scoutfs_block_writer *wri,
-		       struct scoutfs_alloc_root *root,
-		       struct scoutfs_extent *cached, u64 count,
+		       struct scoutfs_data_alloc *dalloc, u64 count,
 		       u64 *blkno_ret, u64 *count_ret);
 int scoutfs_free_data(struct super_block *sb, struct scoutfs_alloc *alloc,
 		      struct scoutfs_block_writer *wri,
@@ -121,16 +121,14 @@ int scoutfs_client_get_roots(struct super_block *sb,
 int scoutfs_client_advance_seq(struct super_block *sb, u64 *seq)
 {
 	struct client_info *client = SCOUTFS_SB(sb)->client_info;
-	__le64 before = cpu_to_le64p(seq);
-	__le64 after;
+	__le64 leseq;
 	int ret;

 	ret = scoutfs_net_sync_request(sb, client->conn,
 				       SCOUTFS_NET_CMD_ADVANCE_SEQ,
-				       &before, sizeof(before),
-				       &after, sizeof(after));
+				       NULL, 0, &leseq, sizeof(leseq));
 	if (ret == 0)
-		*seq = le64_to_cpu(after);
+		*seq = le64_to_cpu(leseq);

 	return ret;
 }
@@ -282,10 +280,10 @@ static int client_greeting(struct super_block *sb,
 		goto out;
 	}

-	if (gr->format_hash != super->format_hash) {
+	if (gr->version != super->version) {
 		scoutfs_warn(sb, "server sent format 0x%llx, client has 0x%llx",
-			     le64_to_cpu(gr->format_hash),
-			     le64_to_cpu(super->format_hash));
+			     le64_to_cpu(gr->version),
+			     le64_to_cpu(super->version));
 		ret = -EINVAL;
 		goto out;
 	}
@@ -394,7 +392,7 @@ static void scoutfs_client_connect_worker(struct work_struct *work)

 	/* send a greeting to verify endpoints of each connection */
 	greet.fsid = super->hdr.fsid;
-	greet.format_hash = super->format_hash;
+	greet.version = super->version;
 	greet.server_term = cpu_to_le64(client->server_term);
 	greet.unmount_barrier = cpu_to_le64(client->greeting_umb);
 	greet.rid = cpu_to_le64(sbi->rid);
@@ -1,315 +0,0 @@
-#ifndef _SCOUTFS_COUNT_H_
-#define _SCOUTFS_COUNT_H_
-
-/*
- * Our estimate of the space consumed while dirtying items is based on
- * the number of items and the size of their values.
- *
- * The estimate is still a read-only input to entering the transaction.
- * We'd like to use it as a clean rhs arg to hold_trans.  We define SIC_
- * functions which return the count struct.  This lets us have a single
- * arg and avoid bugs in initializing and passing in struct pointers
- * from callers.  The internal __count functions are used compose an
- * estimate out of the sets of items it manipulates.  We program in much
- * clearer C instead of in the preprocessor.
- *
- * Compilers are able to collapse the inlines into constants for the
- * constant estimates.
- */
-
-struct scoutfs_item_count {
-	signed items;
-	signed vals;
-};
-
-/* The caller knows exactly what they're doing. */
-static inline const struct scoutfs_item_count SIC_EXACT(signed items,
-							signed vals)
-{
-	struct scoutfs_item_count cnt = {
-		.items = items,
-		.vals = vals,
-	};
-
-	return cnt;
-}
-
-/*
- * Allocating an inode creates a new set of indexed items.
- */
-static inline void __count_alloc_inode(struct scoutfs_item_count *cnt)
-{
-	const int nr_indices = SCOUTFS_INODE_INDEX_NR;
-
-	cnt->items += 1 + nr_indices;
-	cnt->vals += sizeof(struct scoutfs_inode);
-}
-
-/*
- * Dirtying an inode dirties the inode item and can delete and create
- * the full set of indexed items.
- */
-static inline void __count_dirty_inode(struct scoutfs_item_count *cnt)
-{
-	const int nr_indices = 2 * SCOUTFS_INODE_INDEX_NR;
-
-	cnt->items += 1 + nr_indices;
-	cnt->vals += sizeof(struct scoutfs_inode);
-}
-
-static inline const struct scoutfs_item_count SIC_ALLOC_INODE(void)
-{
-	struct scoutfs_item_count cnt = {0,};
-
-	__count_alloc_inode(&cnt);
-
-	return cnt;
-}
-
-static inline const struct scoutfs_item_count SIC_DIRTY_INODE(void)
-{
-	struct scoutfs_item_count cnt = {0,};
-
-	__count_dirty_inode(&cnt);
-
-	return cnt;
-}
-
-/*
- * Directory entries are stored in three items.
- */
-static inline void __count_dirents(struct scoutfs_item_count *cnt,
-				   unsigned name_len)
-{
-	cnt->items += 3;
-	cnt->vals += 3 * offsetof(struct scoutfs_dirent, name[name_len]);
-}
-
-static inline void __count_sym_target(struct scoutfs_item_count *cnt,
-				      unsigned size)
-{
-	unsigned nr = DIV_ROUND_UP(size, SCOUTFS_MAX_VAL_SIZE);
-
-	cnt->items += nr;
-	cnt->vals += size;
-}
-
-static inline void __count_orphan(struct scoutfs_item_count *cnt)
-{
-
-	cnt->items += 1;
-}
-
-static inline void __count_mknod(struct scoutfs_item_count *cnt,
-				 unsigned name_len)
-{
-	__count_alloc_inode(cnt);
-	__count_dirents(cnt, name_len);
-	__count_dirty_inode(cnt);
-}
-
-static inline const struct scoutfs_item_count SIC_MKNOD(unsigned name_len)
-{
-	struct scoutfs_item_count cnt = {0,};
-
-	__count_mknod(&cnt, name_len);
-
-	return cnt;
-}
-
-/*
- * Dropping the inode deletes all its items.  Potentially enormous numbers
- * of items (data mapping, xattrs) are deleted in their own transactions.
- */
-static inline const struct scoutfs_item_count SIC_DROP_INODE(int mode,
-							     u64 size)
-{
-	struct scoutfs_item_count cnt = {0,};
-
-	if (S_ISLNK(mode))
-		__count_sym_target(&cnt, size);
-	__count_dirty_inode(&cnt);
-	__count_orphan(&cnt);
-
-	cnt.vals = 0;
-	return cnt;
-}
-
-static inline const struct scoutfs_item_count SIC_LINK(unsigned name_len)
-{
-	struct scoutfs_item_count cnt = {0,};
-
-	__count_dirents(&cnt, name_len);
-	__count_dirty_inode(&cnt);
-	__count_dirty_inode(&cnt);
-
-	return cnt;
-}
-
-/*
- * Unlink can add orphan items.
- */
-static inline const struct scoutfs_item_count SIC_UNLINK(unsigned name_len)
-{
-	struct scoutfs_item_count cnt = {0,};
-
-	__count_dirents(&cnt, name_len);
-	__count_dirty_inode(&cnt);
-	__count_dirty_inode(&cnt);
-	__count_orphan(&cnt);
-
-	return cnt;
-}
-
-static inline const struct scoutfs_item_count SIC_SYMLINK(unsigned name_len,
-							  unsigned size)
-{
-	struct scoutfs_item_count cnt = {0,};
-
-	__count_mknod(&cnt, name_len);
-	__count_sym_target(&cnt, size);
-
-	return cnt;
-}
-
-/*
- * This assumes the worst case of a rename between directories that
- * unlinks an existing target.  That'll be worse than the common case
- * by a few hundred bytes.
- */
-static inline const struct scoutfs_item_count SIC_RENAME(unsigned old_len,
-							 unsigned new_len)
-{
-	struct scoutfs_item_count cnt = {0,};
-
-	/* dirty dirs and inodes */
-	__count_dirty_inode(&cnt);
-	__count_dirty_inode(&cnt);
-	__count_dirty_inode(&cnt);
-	__count_dirty_inode(&cnt);
-
-	/* unlink old and new, link new */
-	__count_dirents(&cnt, old_len);
-	__count_dirents(&cnt, new_len);
-	__count_dirents(&cnt, new_len);
-
-	/* orphan the existing target */
-	__count_orphan(&cnt);
-
-	return cnt;
-}
-
-/*
- * Creating an xattr results in a dirty set of items with values that
- * store the xattr header, name, and value.  There's always at least one
- * item with the header and name.  Any previously existing items are
- * deleted which dirties their key but removes their value.  The two
- * sets of items are indexed by different ids so their items don't
- * overlap.
- */
-static inline const struct scoutfs_item_count SIC_XATTR_SET(unsigned old_parts,
-							    bool creating,
-							    unsigned name_len,
-							    unsigned size)
-{
-	struct scoutfs_item_count cnt = {0,};
-	unsigned int new_parts;
-
-	__count_dirty_inode(&cnt);
-
-	if (old_parts)
-		cnt.items += old_parts;
-
-	if (creating) {
-		new_parts = SCOUTFS_XATTR_NR_PARTS(name_len, size);
-
-		cnt.items += new_parts;
-		cnt.vals += sizeof(struct scoutfs_xattr) + name_len + size;
-	}
-
-	return cnt;
-}
-
-/*
- * write_begin can have to allocate all the blocks in the page and can
- * have to add a big allocation from the server to do so:
- *  - merge added free extents from the server
- *  - remove a free extent per block
- *  - remove an offline extent for every other block
- *  - add a file extent per block
- */
-static inline const struct scoutfs_item_count SIC_WRITE_BEGIN(void)
-{
-	struct scoutfs_item_count cnt = {0,};
-	unsigned nr_free = (1 + SCOUTFS_BLOCK_SM_PER_PAGE) * 3;
-	unsigned nr_file = (DIV_ROUND_UP(SCOUTFS_BLOCK_SM_PER_PAGE, 2) +
-			    SCOUTFS_BLOCK_SM_PER_PAGE) * 3;
-
-	__count_dirty_inode(&cnt);
-
-	cnt.items += nr_free + nr_file;
-	cnt.vals += nr_file;
-
-	return cnt;
-}
-
-/*
- * Truncating an extent can:
- *  - delete existing file extent,
- *  - create two surrounding file extents,
- *  - add an offline file extent,
- *  - delete two existing free extents
- *  - create a merged free extent
- */
-static inline const struct scoutfs_item_count
-SIC_TRUNC_EXTENT(struct inode *inode)
-{
-	struct scoutfs_item_count cnt = {0,};
-	unsigned int nr_file = 1 + 2 + 1;
-	unsigned int nr_free = (2 + 1) * 2;
-
-	if (inode)
-		__count_dirty_inode(&cnt);
-
-	cnt.items += nr_file + nr_free;
-	cnt.vals += nr_file;
-
-	return cnt;
-}
-
-/*
- * Fallocating an extent can, at most:
- *  - allocate from the server: delete two free and insert merged
- *  - free an allocated extent: delete one and create two split
- *  - remove an unallocated file extent: delete one and create two split
- *  - add an fallocated flie extent: delete two and inset one merged
- */
-static inline const struct scoutfs_item_count SIC_FALLOCATE_ONE(void)
-{
-	struct scoutfs_item_count cnt = {0,};
-	unsigned int nr_free = ((1 + 2) * 2) * 2;
-	unsigned int nr_file = (1 + 2) * 2;
-
-	__count_dirty_inode(&cnt);
-
-	cnt.items += nr_free + nr_file;
-	cnt.vals += nr_file;
-
-	return cnt;
-}
-
-/*
- * ioc_setattr_more can dirty the inode and add a single offline extent.
- */
-static inline const struct scoutfs_item_count SIC_SETATTR_MORE(void)
-{
-	struct scoutfs_item_count cnt = {0,};
-
-	__count_dirty_inode(&cnt);
-
-	cnt.items++;
-
-	return cnt;
-}
-
-#endif
@@ -58,6 +58,8 @@
 	EXPAND_COUNTER(corrupt_symlink_inode_size)		\
 	EXPAND_COUNTER(corrupt_symlink_missing_item)		\
 	EXPAND_COUNTER(corrupt_symlink_not_null_term)		\
+	EXPAND_COUNTER(data_fallocate_enobufs_retry)		\
+	EXPAND_COUNTER(data_write_begin_enobufs_retry)		\
 	EXPAND_COUNTER(dentry_revalidate_error)			\
 	EXPAND_COUNTER(dentry_revalidate_invalid)		\
 	EXPAND_COUNTER(dentry_revalidate_locked)		\
@@ -37,8 +37,8 @@
 #include "lock.h"
 #include "file.h"
 #include "msg.h"
-#include "count.h"
 #include "ext.h"
+#include "util.h"

 /*
 * We want to amortize work done after dirtying the shared transaction
@@ -53,9 +53,8 @@ struct data_info {
 	struct mutex mutex;
 	struct scoutfs_alloc *alloc;
 	struct scoutfs_block_writer *wri;
-	struct scoutfs_alloc_root data_avail;
 	struct scoutfs_alloc_root data_freed;
-	struct scoutfs_extent cached_ext;
+	struct scoutfs_data_alloc dalloc;
 };

 #define DECLARE_DATA_INFO(sb, name) \
@@ -93,6 +92,16 @@ static void ext_from_item(struct scoutfs_extent *ext,
 	ext->flags = dv->flags;
 }

+/*
+ * Warn, once, if a data extent op runs on an inode whose extent_sem
+ * isn't held by anyone.  Ops that aren't given an inode are not
+ * checked.
+ */
+static void data_ext_op_warn(struct inode *inode)
+{
+	if (!inode)
+		return;
+
+	WARN_ON_ONCE(!rwsem_is_locked(&SCOUTFS_I(inode)->extent_sem));
+}
+
 static int data_ext_next(struct super_block *sb, void *arg, u64 start, u64 len,
 			 struct scoutfs_extent *ext)
 {
@@ -102,6 +111,8 @@ static int data_ext_next(struct super_block *sb, void *arg, u64 start, u64 len,
 	struct scoutfs_key last;
 	int ret;

+	data_ext_op_warn(args->inode);
+
 	item_from_extent(&last, &dv, args->ino, U64_MAX, 1, 0, 0);
 	item_from_extent(&key, &dv, args->ino, start, len, 0, 0);

@@ -139,6 +150,8 @@ static int data_ext_insert(struct super_block *sb, void *arg, u64 start,
 	struct scoutfs_key key;
 	int ret;

+	data_ext_op_warn(args->inode);
+
 	item_from_extent(&key, &dv, args->ino, start, len, map, flags);
 	ret = scoutfs_item_create(sb, &key, &dv, sizeof(dv), args->lock);
 	if (ret == 0 && args->inode)
@@ -154,6 +167,8 @@ static int data_ext_remove(struct super_block *sb, void *arg, u64 start,
 	struct scoutfs_key key;
 	int ret;

+	data_ext_op_warn(args->inode);
+
 	item_from_extent(&key, &dv, args->ino, start, len, map, flags);
 	ret = scoutfs_item_delete(sb, &key, args->lock);
 	if (ret == 0 && args->inode)
@@ -275,7 +290,7 @@ int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode,
 				u64 ino, u64 iblock, u64 last, bool offline,
 				struct scoutfs_lock *lock)
 {
-	struct scoutfs_item_count cnt = SIC_TRUNC_EXTENT(inode);
+	struct scoutfs_inode_info *si = NULL;
 	LIST_HEAD(ind_locks);
 	s64 ret = 0;

@@ -290,12 +305,17 @@ int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode,
 	if (WARN_ON_ONCE(last < iblock))
 		return -EINVAL;

+	if (inode) {
+		si = SCOUTFS_I(inode);
+		down_write(&si->extent_sem);
+	}
+
 	while (iblock <= last) {
 		if (inode)
 			ret = scoutfs_inode_index_lock_hold(inode, &ind_locks,
-							    true, cnt);
+							    true);
 		else
-			ret = scoutfs_hold_trans(sb, cnt);
+			ret = scoutfs_hold_trans(sb);
 		if (ret)
 			break;

@@ -321,6 +341,9 @@ int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode,
 		ret = 0;
 	}

+	if (si)
+		up_write(&si->extent_sem);
+
 	return ret;
 }

@@ -407,8 +430,7 @@ static int alloc_block(struct super_block *sb, struct inode *inode,
 		count = 1;

 	ret = scoutfs_alloc_data(sb, datinf->alloc, datinf->wri,
-				 &datinf->data_avail, &datinf->cached_ext,
-				 count, &blkno, &count);
+				 &datinf->dalloc, count, &blkno, &count);
 	if (ret < 0)
 		goto out;

@@ -533,6 +555,38 @@ out:
 	return ret;
 }

+/*
+ * Extent item users are usually serialized by i_mutex, but page
+ * readers hold nothing more than the page lock.  Writers working on
+ * other pages can split and merge neighbouring extents underneath
+ * them, so the read path maps blocks with extent_sem held for reading.
+ */
+static int scoutfs_get_block_read(struct inode *inode, sector_t iblock,
+				  struct buffer_head *bh, int create)
+{
+	struct rw_semaphore *sem = &SCOUTFS_I(inode)->extent_sem;
+	int err;
+
+	down_read(sem);
+	err = scoutfs_get_block(inode, iblock, bh, create);
+	up_read(sem);
+
+	return err;
+}
+
+/*
+ * Map a block on the write paths.  extent_sem is held for writing
+ * around scoutfs_get_block() and dropped before returning its result.
+ */
+static int scoutfs_get_block_write(struct inode *inode, sector_t iblock,
+				   struct buffer_head *bh, int create)
+{
+	struct rw_semaphore *sem = &SCOUTFS_I(inode)->extent_sem;
+	int err;
+
+	down_write(sem);
+	err = scoutfs_get_block(inode, iblock, bh, create);
+	up_write(sem);
+
+	return err;
+}
+
 /*
 * This is almost never used.  We can't block on a cluster lock while
 * holding the page lock because lock invalidation gets the page lock
@@ -598,7 +652,7 @@ static int scoutfs_readpage(struct file *file, struct page *page)
 			return ret;
 	}

-	ret = mpage_readpage(page, scoutfs_get_block);
+	ret = mpage_readpage(page, scoutfs_get_block_read);

 	scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_READ);
 	scoutfs_per_task_del(&si->pt_data_lock, &pt_ent);
@@ -646,7 +700,7 @@ static int scoutfs_readpages(struct file *file, struct address_space *mapping,
 		}
 	}

-	ret = mpage_readpages(mapping, pages, nr_pages, scoutfs_get_block);
+	ret = mpage_readpages(mapping, pages, nr_pages, scoutfs_get_block_read);
 out:
 	scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_READ);
 	BUG_ON(!list_empty(pages));
@@ -655,13 +709,13 @@ out:

 static int scoutfs_writepage(struct page *page, struct writeback_control *wbc)
 {
-	return block_write_full_page(page, scoutfs_get_block, wbc);
+	return block_write_full_page(page, scoutfs_get_block_write, wbc);
 }

 static int scoutfs_writepages(struct address_space *mapping,
 			      struct writeback_control *wbc)
 {
-	return mpage_writepages(mapping, wbc, scoutfs_get_block);
+	return mpage_writepages(mapping, wbc, scoutfs_get_block_write);
 }

 /* fsdata allocated in write_begin and freed in write_end */
@@ -697,13 +751,13 @@ static int scoutfs_write_begin(struct file *file,
 		goto out;
 	}

+retry:
 	do {
 		ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
 		      scoutfs_inode_index_prepare(sb, &wbd->ind_locks, inode,
 						  true) ?:
 		      scoutfs_inode_index_try_lock_hold(sb, &wbd->ind_locks,
-							ind_seq,
-							SIC_WRITE_BEGIN());
+							ind_seq);
 	} while (ret > 0);
 	if (ret < 0)
 		goto out;
@@ -712,17 +766,22 @@ static int scoutfs_write_begin(struct file *file,
 	flags |= AOP_FLAG_NOFS;

 	/* generic write_end updates i_size and calls dirty_inode */
-	ret = scoutfs_dirty_inode_item(inode, wbd->lock);
-	if (ret == 0)
-		ret = block_write_begin(mapping, pos, len, flags, pagep,
-					scoutfs_get_block);
-	if (ret)
+	ret = scoutfs_dirty_inode_item(inode, wbd->lock) ?:
+	      block_write_begin(mapping, pos, len, flags, pagep,
+				scoutfs_get_block_write);
+	if (ret < 0) {
 		scoutfs_release_trans(sb);
-out:
-	if (ret) {
 		scoutfs_inode_index_unlock(sb, &wbd->ind_locks);
-		kfree(wbd);
+		if (ret == -ENOBUFS) {
+			/* Retry with a new transaction. */
+			scoutfs_inc_counter(sb, data_write_begin_enobufs_retry);
+			goto retry;
+		}
 	}
+
+out:
+	if (ret < 0)
+		kfree(wbd);
        return ret;
 }

@@ -859,9 +918,8 @@ static s64 fallocate_extents(struct super_block *sb, struct inode *inode,
 		mutex_lock(&datinf->mutex);

 		ret = scoutfs_alloc_data(sb, datinf->alloc, datinf->wri,
-					 &datinf->data_avail,
-					 &datinf->cached_ext,
-					 count, &blkno, &count);
+					 &datinf->dalloc, count,
+					 &blkno, &count);
 		if (ret == 0) {
 			ret = scoutfs_ext_set(sb, &data_ext_ops, &args, iblock,
 					      count, blkno,
@@ -869,7 +927,7 @@ static s64 fallocate_extents(struct super_block *sb, struct inode *inode,
 			if (ret < 0) {
 				err = scoutfs_free_data(sb, datinf->alloc,
 							datinf->wri,
-							&datinf->data_avail,
+							&datinf->data_freed,
 							blkno, count);
 				BUG_ON(err); /* inconsistent */
 			}
@@ -903,6 +961,7 @@ static s64 fallocate_extents(struct super_block *sb, struct inode *inode,
 long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 {
 	struct inode *inode = file_inode(file);
+	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
 	struct super_block *sb = inode->i_sb;
 	const u64 ino = scoutfs_ino(inode);
 	struct scoutfs_lock *lock = NULL;
@@ -913,6 +972,7 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 	s64 ret;

 	mutex_lock(&inode->i_mutex);
+	down_write(&si->extent_sem);

 	/* XXX support more flags */
        if (mode & ~(FALLOC_FL_KEEP_SIZE)) {
@@ -950,8 +1010,7 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)

 	while(iblock <= last) {

-		ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false,
-						    SIC_FALLOCATE_ONE());
+		ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false);
 		if (ret)
 			goto out;

@@ -969,6 +1028,12 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 		scoutfs_release_trans(sb);
 		scoutfs_inode_index_unlock(sb, &ind_locks);

+		/* txn couldn't meet the request. Let's try with a new txn */
+		if (ret == -ENOBUFS) {
+			scoutfs_inc_counter(sb, data_fallocate_enobufs_retry);
+			continue;
+		}
+
 		if (ret <= 0)
 			goto out;

@@ -978,6 +1043,7 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)

 out:
 	scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
+	up_write(&si->extent_sem);
 	mutex_unlock(&inode->i_mutex);

 	trace_scoutfs_data_fallocate(sb, ino, mode, offset, len, ret);
@@ -998,6 +1064,7 @@ int scoutfs_data_init_offline_extent(struct inode *inode, u64 size,
 				     struct scoutfs_lock *lock)

 {
+	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
 	struct super_block *sb = inode->i_sb;
 	struct data_ext_args args = {
 		.ino = scoutfs_ino(inode),
@@ -1019,8 +1086,7 @@ int scoutfs_data_init_offline_extent(struct inode *inode, u64 size,
 	}

 	/* we're updating meta_seq with offline block count */
-	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false,
-					    SIC_SETATTR_MORE());
+	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false);
 	if (ret < 0)
 		goto out;

@@ -1028,8 +1094,10 @@ int scoutfs_data_init_offline_extent(struct inode *inode, u64 size,
 	if (ret < 0)
 		goto unlock;

+	down_write(&si->extent_sem);
 	ret = scoutfs_ext_insert(sb, &data_ext_ops, &args,
 				 0, count, 0, SEF_OFFLINE);
+	up_write(&si->extent_sem);
 	if (ret < 0)
 		goto unlock;

@@ -1043,6 +1111,240 @@ out:
 	return ret;
 }

+/*
+ * Keep the page cache consistent with extents that were just changed
+ * by dropping the pages that cover them.  start and len are in small
+ * blocks.  truncate_inode_pages_range() must be handed fully aligned
+ * byte offsets, otherwise it assumes it should leave a zeroed partial
+ * page behind.
+ */
+static void truncate_inode_pages_extent(struct inode *inode, u64 start, u64 len)
+{
+	u64 first_byte = start << SCOUTFS_BLOCK_SM_SHIFT;
+	u64 last_byte = ((start + len) << SCOUTFS_BLOCK_SM_SHIFT) - 1;
+
+	truncate_inode_pages_range(&inode->i_data, first_byte, last_byte);
+}
+
+/*
+ * Move extents from one file to another.  The behaviour is more fully
+ * explained above the move_blocks ioctl argument structure definition.
+ *
+ * The caller has processed the ioctl args and performed the most basic
+ * inode checks, but we perform more detailed inode checks once we have
+ * the inode lock and refreshed inodes.  Our job is to safely lock the
+ * two files and move the extents.
+ *
+ * from_off, to_off and byte_len are in bytes.  The offsets must be
+ * aligned to the small block size; byte_len may only be unaligned when
+ * the region ends exactly at the source's i_size.
+ *
+ * Returns 0 on success or a negative errno: -EINVAL for misaligned
+ * arguments or non-regular files, -EISDIR for directories, -ENODATA if
+ * either file has offline blocks, or whatever the locking, transaction
+ * and extent calls return.
+ */
+#define MOVE_DATA_EXTENTS_PER_HOLD 16
+int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
+			     u64 byte_len, struct inode *to, u64 to_off)
+{
+	struct scoutfs_inode_info *from_si = SCOUTFS_I(from);
+	struct scoutfs_inode_info *to_si = SCOUTFS_I(to);
+	struct super_block *sb = from->i_sb;
+	struct scoutfs_lock *from_lock = NULL;
+	struct scoutfs_lock *to_lock = NULL;
+	struct data_ext_args from_args;
+	struct data_ext_args to_args;
+	struct scoutfs_extent ext;
+	LIST_HEAD(locks);
+	bool done = false;
+	loff_t from_size;
+	loff_t to_size;
+	u64 from_offline;
+	u64 to_offline;
+	u64 from_blocks;
+	u64 from_start;
+	u64 to_start;
+	u64 from_iblock;
+	u64 to_iblock;
+	u64 count;
+	u64 junk;
+	u64 seq;
+	u64 map;
+	u64 len;
+	int ret;
+	int err;
+	int i;
+
+	lock_two_nondirectories(from, to);
+
+	ret = scoutfs_lock_inodes(sb, SCOUTFS_LOCK_WRITE,
+				  SCOUTFS_LKF_REFRESH_INODE, from, &from_lock,
+				  to, &to_lock, NULL, NULL, NULL, NULL);
+	if (ret)
+		goto out;
+
+	if ((from_off & SCOUTFS_BLOCK_SM_MASK) ||
+	    (to_off & SCOUTFS_BLOCK_SM_MASK) ||
+	    ((byte_len & SCOUTFS_BLOCK_SM_MASK) &&
+	     (from_off + byte_len != i_size_read(from)))) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* from here on the region is tracked in small blocks */
+	from_iblock = from_off >> SCOUTFS_BLOCK_SM_SHIFT;
+	count = (byte_len + SCOUTFS_BLOCK_SM_MASK) >> SCOUTFS_BLOCK_SM_SHIFT;
+	to_iblock = to_off >> SCOUTFS_BLOCK_SM_SHIFT;
+
+	if (S_ISDIR(from->i_mode) || S_ISDIR(to->i_mode)) {
+		ret = -EISDIR;
+		goto out;
+	}
+
+	if (!S_ISREG(from->i_mode) || !S_ISREG(to->i_mode)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = inode_permission(from, MAY_WRITE) ?:
+	      inode_permission(to, MAY_WRITE);
+	if (ret < 0)
+		goto out;
+
+	/* can't stage once data_version changes */
+	scoutfs_inode_get_onoff(from, &junk, &from_offline);
+	scoutfs_inode_get_onoff(to, &junk, &to_offline);
+	if (from_offline || to_offline) {
+		ret = -ENODATA;
+		goto out;
+	}
+
+	from_args = (struct data_ext_args) {
+		.ino = scoutfs_ino(from),
+		.inode = from,
+		.lock = from_lock,
+	};
+
+	to_args = (struct data_ext_args) {
+		.ino = scoutfs_ino(to),
+		.inode = to,
+		.lock = to_lock,
+	};
+
+	inode_dio_wait(from);
+	inode_dio_wait(to);
+
+	ret = filemap_write_and_wait_range(&from->i_data, from_off,
+				   from_off + byte_len - 1);
+	if (ret < 0)
+		goto out;
+
+	for (;;) {
+		ret = scoutfs_inode_index_start(sb, &seq) ?:
+		      scoutfs_inode_index_prepare(sb, &locks, from, true) ?:
+		      scoutfs_inode_index_prepare(sb, &locks, to, true) ?:
+		      scoutfs_inode_index_try_lock_hold(sb, &locks, seq);
+		if (ret > 0)
+			continue;
+		if (ret < 0)
+			goto out;
+
+		ret = scoutfs_dirty_inode_item(from, from_lock) ?:
+		      scoutfs_dirty_inode_item(to, to_lock);
+		if (ret < 0) {
+			/*
+			 * The hold succeeded, so the transaction and
+			 * the index locks have to be dropped before
+			 * bailing out.
+			 */
+			scoutfs_release_trans(sb);
+			scoutfs_inode_index_unlock(sb, &locks);
+			goto out;
+		}
+
+		down_write_two(&from_si->extent_sem, &to_si->extent_sem);
+
+		/* arbitrarily limit the number of extents per trans hold */
+		for (i = 0; i < MOVE_DATA_EXTENTS_PER_HOLD; i++) {
+			/* find the next extent to move */
+			ret = scoutfs_ext_next(sb, &data_ext_ops, &from_args,
+					       from_iblock, 1, &ext);
+			if (ret < 0) {
+				if (ret == -ENOENT) {
+					done = true;
+					ret = 0;
+				}
+				break;
+			}
+
+			/*
+			 * i_size is in bytes while extents are in small
+			 * blocks, so convert it before comparing the two.
+			 */
+			from_blocks = round_up((u64)i_size_read(from),
+					       SCOUTFS_BLOCK_SM_SIZE) >>
+				      SCOUTFS_BLOCK_SM_SHIFT;
+
+			/* the found extent can start before our region */
+			from_start = max(ext.start, from_iblock);
+
+			/* only move extents within count and i_size */
+			if (ext.start >= from_iblock + count ||
+			    from_start >= from_blocks) {
+				done = true;
+				ret = 0;
+				break;
+			}
+
+			map = ext.map + (from_start - ext.start);
+			len = min3(from_iblock + count, from_blocks,
+				   ext.start + ext.len) - from_start;
+
+			to_start = to_iblock + (from_start - from_iblock);
+
+			/* insert the new, fails if it overlaps */
+			ret = scoutfs_ext_insert(sb, &data_ext_ops, &to_args,
+						 to_start, len,
+						 map, ext.flags);
+			if (ret < 0)
+				break;
+
+			/* remove the old, possibly splitting */
+			ret = scoutfs_ext_set(sb, &data_ext_ops, &from_args,
+					      from_start, len, 0, 0);
+			if (ret < 0) {
+				/* remove inserted new on err */
+				err = scoutfs_ext_remove(sb, &data_ext_ops,
+							 &to_args, to_start,
+							 len);
+				BUG_ON(err); /* XXX inconsistent */
+				break;
+			}
+
+			trace_scoutfs_data_move_blocks(sb, scoutfs_ino(from),
+						       from_start, len, map,
+						       ext.flags,
+						       scoutfs_ino(to),
+						       to_start);
+
+			/* moved extent might extend i_size */
+			to_size = (to_start + len) << SCOUTFS_BLOCK_SM_SHIFT;
+			if (to_size > i_size_read(to)) {
+				/* while maintaining final partial */
+				from_size = (from_start + len) <<
+						SCOUTFS_BLOCK_SM_SHIFT;
+				if (from_size > i_size_read(from))
+					to_size -= from_size -
+							i_size_read(from);
+				i_size_write(to, to_size);
+			}
+		}
+
+		up_write(&from_si->extent_sem);
+		up_write(&to_si->extent_sem);
+
+		from->i_ctime = from->i_mtime =
+			to->i_ctime = to->i_mtime = CURRENT_TIME;
+		scoutfs_inode_inc_data_version(from);
+		scoutfs_inode_inc_data_version(to);
+		scoutfs_inode_set_data_seq(from);
+		scoutfs_inode_set_data_seq(to);
+
+		scoutfs_update_inode_item(from, from_lock, &locks);
+		scoutfs_update_inode_item(to, to_lock, &locks);
+		scoutfs_release_trans(sb);
+		scoutfs_inode_index_unlock(sb, &locks);
+
+		if (ret < 0 || done)
+			break;
+	}
+
+	/* remove any cached pages from old extents */
+	truncate_inode_pages_extent(from, from_iblock, count);
+	truncate_inode_pages_extent(to, to_iblock, count);
+
+out:
+	scoutfs_unlock(sb, from_lock, SCOUTFS_LOCK_WRITE);
+	scoutfs_unlock(sb, to_lock, SCOUTFS_LOCK_WRITE);
+
+	unlock_two_nondirectories(from, to);
+
+	return ret;
+}
+
 /*
 * This copies to userspace :/
 */
@@ -1075,6 +1377,7 @@ static int fill_extent(struct fiemap_extent_info *fieinfo,
 int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 			u64 start, u64 len)
 {
+	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
 	struct super_block *sb = inode->i_sb;
 	const u64 ino = scoutfs_ino(inode);
 	struct scoutfs_lock *lock = NULL;
@@ -1095,8 +1398,8 @@ int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 	if (ret)
 		goto out;

-	/* XXX overkill? */
 	mutex_lock(&inode->i_mutex);
+	down_read(&si->extent_sem);

 	ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ, 0, inode, &lock);
 	if (ret)
@@ -1148,6 +1451,7 @@ int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		ret = fill_extent(fieinfo, &cur, last_flags);
 unlock:
 	scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ);
+	up_read(&si->extent_sem);
 	mutex_unlock(&inode->i_mutex);

 out:
@@ -1227,8 +1531,9 @@ static struct scoutfs_data_wait *dw_next(struct scoutfs_data_wait *dw)
 * Check if we should wait by looking for extents whose flags match.
 * Returns 0 if no extents were found or any error encountered.
 *
- * The caller must have locked the extents before calling, both across
- * mounts and within this mount.
+ * The caller must have acquired a cluster lock that covers the extent
+ * items.  We acquire the extent_sem to protect our read from writers in
+ * other tasks.
 *
 * Returns 1 if any file extents in the caller's region matched.  If the
 * wait struct is provided then it is initialized to be woken when the
@@ -1240,6 +1545,7 @@ int scoutfs_data_wait_check(struct inode *inode, loff_t pos, loff_t len,
 			    u8 sef, u8 op, struct scoutfs_data_wait *dw,
 			    struct scoutfs_lock *lock)
 {
+	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
 	struct super_block *sb = inode->i_sb;
 	const u64 ino = scoutfs_ino(inode);
 	struct data_ext_args args = {
@@ -1272,6 +1578,8 @@ int scoutfs_data_wait_check(struct inode *inode, loff_t pos, loff_t len,
 		}
 	}

+	down_read(&si->extent_sem);
+
 	iblock = pos >> SCOUTFS_BLOCK_SM_SHIFT;
 	last_block = (pos + len - 1) >> SCOUTFS_BLOCK_SM_SHIFT;

@@ -1308,6 +1616,8 @@ int scoutfs_data_wait_check(struct inode *inode, loff_t pos, loff_t len,
 		iblock = ext.start + ext.len;
 	}

+	up_read(&si->extent_sem);
+
 out:
 	trace_scoutfs_data_wait_check(sb, ino, pos, len, sef, op, &ext, ret);

@@ -1461,7 +1771,7 @@ void scoutfs_data_init_btrees(struct super_block *sb,

 	datinf->alloc = alloc;
 	datinf->wri = wri;
-	datinf->data_avail = lt->data_avail;
+	scoutfs_dalloc_init(&datinf->dalloc, &lt->data_avail);
 	datinf->data_freed = lt->data_freed;

 	mutex_unlock(&datinf->mutex);
@@ -1474,7 +1784,7 @@ void scoutfs_data_get_btrees(struct super_block *sb,

 	mutex_lock(&datinf->mutex);

-	lt->data_avail = datinf->data_avail;
+	scoutfs_dalloc_get_root(&datinf->dalloc, &lt->data_avail);
 	lt->data_freed = datinf->data_freed;

 	mutex_unlock(&datinf->mutex);
@@ -1490,31 +1800,20 @@ int scoutfs_data_prepare_commit(struct super_block *sb)
 	int ret;

 	mutex_lock(&datinf->mutex);
-	if (datinf->cached_ext.len) {
-		ret = scoutfs_free_data(sb, datinf->alloc, datinf->wri,
-					&datinf->data_avail,
-					datinf->cached_ext.start,
-					datinf->cached_ext.len);
-		if (ret == 0)
-			memset(&datinf->cached_ext, 0,
-			       sizeof(datinf->cached_ext));
-	} else {
-		ret = 0;
-	}
+	ret = scoutfs_dalloc_return_cached(sb, datinf->alloc, datinf->wri,
+					   &datinf->dalloc);
 	mutex_unlock(&datinf->mutex);

 	return ret;
 }

-/*
- * This isn't serializing with allocators so it can be a bit racey.
- */
 u64 scoutfs_data_alloc_free_bytes(struct super_block *sb)
 {
 	DECLARE_DATA_INFO(sb, datinf);

-	return le64_to_cpu(datinf->data_avail.total_len) <<
-			SCOUTFS_BLOCK_SM_SHIFT;
+	return scoutfs_dalloc_total_len(&datinf->dalloc) <<
+		SCOUTFS_BLOCK_SM_SHIFT;
+
 }

 int scoutfs_data_setup(struct super_block *sb)
@@ -58,6 +58,8 @@ int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len);
 int scoutfs_data_init_offline_extent(struct inode *inode, u64 size,
 				     struct scoutfs_lock *lock);
+int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
+			     u64 byte_len, struct inode *to, u64 to_off);

 int scoutfs_data_wait_check(struct inode *inode, loff_t pos, loff_t len,
 			    u8 sef, u8 op, struct scoutfs_data_wait *ow,
@@ -463,7 +463,18 @@ out:
 	else
 		inode = scoutfs_iget(sb, ino);

-	return d_splice_alias(inode, dentry);
+	/*
+	 * We can't splice dir aliases into the dcache.  dir entries
+	 * might have changed on other nodes so our dcache could still
+	 * contain them, rather than having been moved in rename.  For
+	 * dirs, we use d_materialise_unique to remove any existing
+	 * aliases which must be stale.  Our inode numbers aren't reused
+	 * so inodes pointed to by entries can't change types.
+	 */
+	if (!IS_ERR_OR_NULL(inode) && S_ISDIR(inode->i_mode))
+		return d_materialise_unique(dentry, inode);
+	else
+		return d_splice_alias(inode, dentry);
 }

 /*
@@ -655,7 +666,6 @@ static int del_entry_items(struct super_block *sb, u64 dir_ino, u64 hash,
 */
 static struct inode *lock_hold_create(struct inode *dir, struct dentry *dentry,
 				      umode_t mode, dev_t rdev,
-				      const struct scoutfs_item_count cnt,
 				      struct scoutfs_lock **dir_lock,
 				      struct scoutfs_lock **inode_lock,
 				      struct list_head *ind_locks)
@@ -694,7 +704,7 @@ retry:
 	ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
 	      scoutfs_inode_index_prepare(sb, ind_locks, dir, true) ?:
 	      scoutfs_inode_index_prepare_ino(sb, ind_locks, ino, mode) ?:
-	      scoutfs_inode_index_try_lock_hold(sb, ind_locks, ind_seq, cnt);
+	      scoutfs_inode_index_try_lock_hold(sb, ind_locks, ind_seq);
 	if (ret > 0)
 		goto retry;
 	if (ret)
@@ -741,7 +751,6 @@ static int scoutfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,

 	hash = dirent_name_hash(dentry->d_name.name, dentry->d_name.len);
 	inode = lock_hold_create(dir, dentry, mode, rdev,
-				 SIC_MKNOD(dentry->d_name.len),
 				 &dir_lock, &inode_lock, &ind_locks);
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
@@ -836,8 +845,7 @@ retry:
 	ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
 	      scoutfs_inode_index_prepare(sb, &ind_locks, dir, false) ?:
 	      scoutfs_inode_index_prepare(sb, &ind_locks, inode, false) ?:
-	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq,
-						SIC_LINK(dentry->d_name.len));
+	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq);
 	if (ret > 0)
 		goto retry;
 	if (ret)
@@ -918,8 +926,7 @@ retry:
 	ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
 	      scoutfs_inode_index_prepare(sb, &ind_locks, dir, false) ?:
 	      scoutfs_inode_index_prepare(sb, &ind_locks, inode, false) ?:
-	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq,
-						SIC_UNLINK(dentry->d_name.len));
+	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq);
 	if (ret > 0)
 		goto retry;
 	if (ret)
@@ -1154,7 +1161,6 @@ static int scoutfs_symlink(struct inode *dir, struct dentry *dentry,
 		return ret;

 	inode = lock_hold_create(dir, dentry, S_IFLNK|S_IRWXUGO, 0,
-				 SIC_SYMLINK(dentry->d_name.len, name_len),
 				 &dir_lock, &inode_lock, &ind_locks);
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
@@ -1586,9 +1592,7 @@ retry:
 	       scoutfs_inode_index_prepare(sb, &ind_locks, new_dir, false)) ?:
 	      (new_inode == NULL ? 0 :
 	       scoutfs_inode_index_prepare(sb, &ind_locks, new_inode, false)) ?:
-	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq,
-					    SIC_RENAME(old_dentry->d_name.len,
-						       new_dentry->d_name.len));
+	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq);
 	if (ret > 0)
 		goto retry;
 	if (ret)
@@ -1,6 +1,9 @@
 #ifndef _SCOUTFS_FORMAT_H_
 #define _SCOUTFS_FORMAT_H_

+#define SCOUTFS_INTEROP_VERSION		0ULL
+#define SCOUTFS_INTEROP_VERSION_STR	__stringify(0)
+
 /* statfs(2) f_type */
 #define SCOUTFS_SUPER_MAGIC	0x554f4353		/* "SCOU" */

@@ -173,19 +176,6 @@ struct scoutfs_key {
 #define skfl_neglen	_sk_second
 #define skfl_blkno	_sk_third

-struct scoutfs_radix_block {
-	struct scoutfs_block_header hdr;
-	union {
-		struct scoutfs_radix_ref {
-			__le64 blkno;
-			__le64 seq;
-			__le64 sm_total;
-			__le64 lg_total;
-		} refs[0];
-		__le64 bits[0];
-	};
-};
-
 struct scoutfs_avl_root {
 	__le16 node;
 };
@@ -596,7 +586,7 @@ struct scoutfs_quorum_block {
 struct scoutfs_super_block {
 	struct scoutfs_block_header hdr;
 	__le64 id;
-	__le64 format_hash;
+	__le64 version;
 	__le64 flags;
 	__u8 uuid[SCOUTFS_UUID_BYTES];
 	__le64 next_ino;
@@ -759,7 +749,7 @@ enum scoutfs_dentry_type {
 */
 struct scoutfs_net_greeting {
 	__le64 fsid;
-	__le64 format_hash;
+	__le64 version;
 	__le64 server_term;
 	__le64 unmount_barrier;
 	__le64 rid;
@@ -71,29 +71,30 @@ static struct kmem_cache *scoutfs_inode_cachep;
 */
 static void scoutfs_inode_ctor(void *obj)
 {
-	struct scoutfs_inode_info *ci = obj;
+	struct scoutfs_inode_info *si = obj;

-	mutex_init(&ci->item_mutex);
-	seqcount_init(&ci->seqcount);
-	ci->staging = false;
-	scoutfs_per_task_init(&ci->pt_data_lock);
-	atomic64_set(&ci->data_waitq.changed, 0);
-	init_waitqueue_head(&ci->data_waitq.waitq);
-	init_rwsem(&ci->xattr_rwsem);
-	RB_CLEAR_NODE(&ci->writeback_node);
+	init_rwsem(&si->extent_sem);
+	mutex_init(&si->item_mutex);
+	seqcount_init(&si->seqcount);
+	si->staging = false;
+	scoutfs_per_task_init(&si->pt_data_lock);
+	atomic64_set(&si->data_waitq.changed, 0);
+	init_waitqueue_head(&si->data_waitq.waitq);
+	init_rwsem(&si->xattr_rwsem);
+	RB_CLEAR_NODE(&si->writeback_node);

-	inode_init_once(&ci->inode);
+	inode_init_once(&si->inode);
 }

 struct inode *scoutfs_alloc_inode(struct super_block *sb)
 {
-	struct scoutfs_inode_info *ci;
+	struct scoutfs_inode_info *si;

-	ci = kmem_cache_alloc(scoutfs_inode_cachep, GFP_NOFS);
-	if (!ci)
+	si = kmem_cache_alloc(scoutfs_inode_cachep, GFP_NOFS);
+	if (!si)
 		return NULL;

-	return &ci->inode;
+	return &si->inode;
 }

 static void scoutfs_i_callback(struct rcu_head *head)
@@ -221,7 +222,7 @@ static void set_item_info(struct scoutfs_inode_info *si,

 static void load_inode(struct inode *inode, struct scoutfs_inode *cinode)
 {
-	struct scoutfs_inode_info *ci = SCOUTFS_I(inode);
+	struct scoutfs_inode_info *si = SCOUTFS_I(inode);

 	i_size_write(inode, le64_to_cpu(cinode->size));
 	set_nlink(inode, le32_to_cpu(cinode->nlink));
@@ -236,23 +237,23 @@ static void load_inode(struct inode *inode, struct scoutfs_inode *cinode)
 	inode->i_ctime.tv_sec = le64_to_cpu(cinode->ctime.sec);
 	inode->i_ctime.tv_nsec = le32_to_cpu(cinode->ctime.nsec);

-	ci->meta_seq = le64_to_cpu(cinode->meta_seq);
-	ci->data_seq = le64_to_cpu(cinode->data_seq);
-	ci->data_version = le64_to_cpu(cinode->data_version);
-	ci->online_blocks = le64_to_cpu(cinode->online_blocks);
-	ci->offline_blocks = le64_to_cpu(cinode->offline_blocks);
-	ci->next_readdir_pos = le64_to_cpu(cinode->next_readdir_pos);
-	ci->next_xattr_id = le64_to_cpu(cinode->next_xattr_id);
-	ci->flags = le32_to_cpu(cinode->flags);
+	si->meta_seq = le64_to_cpu(cinode->meta_seq);
+	si->data_seq = le64_to_cpu(cinode->data_seq);
+	si->data_version = le64_to_cpu(cinode->data_version);
+	si->online_blocks = le64_to_cpu(cinode->online_blocks);
+	si->offline_blocks = le64_to_cpu(cinode->offline_blocks);
+	si->next_readdir_pos = le64_to_cpu(cinode->next_readdir_pos);
+	si->next_xattr_id = le64_to_cpu(cinode->next_xattr_id);
+	si->flags = le32_to_cpu(cinode->flags);

 	/*
 	 * i_blocks is initialized from online and offline and is then
 	 * maintained as blocks come and go.
 	 */
-	inode->i_blocks = (ci->online_blocks + ci->offline_blocks)
+	inode->i_blocks = (si->online_blocks + si->offline_blocks)
 				<< SCOUTFS_BLOCK_SM_SECTOR_SHIFT;

-	set_item_info(ci, cinode);
+	set_item_info(si, cinode);
 }

 static void init_inode_key(struct scoutfs_key *key, u64 ino)
@@ -334,7 +335,7 @@ int scoutfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 static int set_inode_size(struct inode *inode, struct scoutfs_lock *lock,
 			  u64 new_size, bool truncate)
 {
-	struct scoutfs_inode_info *ci = SCOUTFS_I(inode);
+	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
 	struct super_block *sb = inode->i_sb;
 	LIST_HEAD(ind_locks);
 	int ret;
@@ -342,8 +343,7 @@ static int set_inode_size(struct inode *inode, struct scoutfs_lock *lock,
 	if (!S_ISREG(inode->i_mode))
 		return 0;

-	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, true,
-					    SIC_DIRTY_INODE());
+	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, true);
 	if (ret)
 		return ret;

@@ -353,7 +353,7 @@ static int set_inode_size(struct inode *inode, struct scoutfs_lock *lock,
 	truncate_setsize(inode, new_size);
 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
 	if (truncate)
-		ci->flags |= SCOUTFS_INO_FLAG_TRUNCATE;
+		si->flags |= SCOUTFS_INO_FLAG_TRUNCATE;
 	scoutfs_inode_set_data_seq(inode);
 	scoutfs_update_inode_item(inode, lock, &ind_locks);

@@ -365,17 +365,16 @@ static int set_inode_size(struct inode *inode, struct scoutfs_lock *lock,

 static int clear_truncate_flag(struct inode *inode, struct scoutfs_lock *lock)
 {
-	struct scoutfs_inode_info *ci = SCOUTFS_I(inode);
+	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
 	struct super_block *sb = inode->i_sb;
 	LIST_HEAD(ind_locks);
 	int ret;

-	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false,
-					    SIC_DIRTY_INODE());
+	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false);
 	if (ret)
 		return ret;

-	ci->flags &= ~SCOUTFS_INO_FLAG_TRUNCATE;
+	si->flags &= ~SCOUTFS_INO_FLAG_TRUNCATE;
 	scoutfs_update_inode_item(inode, lock, &ind_locks);

 	scoutfs_release_trans(sb);
@@ -386,13 +385,13 @@ static int clear_truncate_flag(struct inode *inode, struct scoutfs_lock *lock)

 int scoutfs_complete_truncate(struct inode *inode, struct scoutfs_lock *lock)
 {
-	struct scoutfs_inode_info *ci = SCOUTFS_I(inode);
+	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
 	u64 start;
 	int ret, err;

-	trace_scoutfs_complete_truncate(inode, ci->flags);
+	trace_scoutfs_complete_truncate(inode, si->flags);

-	if (!(ci->flags & SCOUTFS_INO_FLAG_TRUNCATE))
+	if (!(si->flags & SCOUTFS_INO_FLAG_TRUNCATE))
 		return 0;

 	start = (i_size_read(inode) + SCOUTFS_BLOCK_SM_SIZE - 1) >>
@@ -486,8 +485,7 @@ retry:
 		}
 	}

-	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false,
-					    SIC_DIRTY_INODE());
+	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false);
 	if (ret)
 		goto out;

@@ -643,19 +641,19 @@ void scoutfs_inode_get_onoff(struct inode *inode, s64 *on, s64 *off)

 static int scoutfs_iget_test(struct inode *inode, void *arg)
 {
-	struct scoutfs_inode_info *ci = SCOUTFS_I(inode);
+	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
 	u64 *ino = arg;

-	return ci->ino == *ino;
+	return si->ino == *ino;
 }

 static int scoutfs_iget_set(struct inode *inode, void *arg)
 {
-	struct scoutfs_inode_info *ci = SCOUTFS_I(inode);
+	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
 	u64 *ino = arg;

 	inode->i_ino = *ino;
-	ci->ino = *ino;
+	si->ino = *ino;

 	return 0;
 }
@@ -705,7 +703,7 @@ out:

 static void store_inode(struct scoutfs_inode *cinode, struct inode *inode)
 {
-	struct scoutfs_inode_info *ci = SCOUTFS_I(inode);
+	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
 	u64 online_blocks;
 	u64 offline_blocks;

@@ -732,9 +730,9 @@ static void store_inode(struct scoutfs_inode *cinode, struct inode *inode)
 	cinode->data_version = cpu_to_le64(scoutfs_inode_data_version(inode));
 	cinode->online_blocks = cpu_to_le64(online_blocks);
 	cinode->offline_blocks = cpu_to_le64(offline_blocks);
-	cinode->next_readdir_pos = cpu_to_le64(ci->next_readdir_pos);
-	cinode->next_xattr_id = cpu_to_le64(ci->next_xattr_id);
-	cinode->flags = cpu_to_le32(ci->flags);
+	cinode->next_readdir_pos = cpu_to_le64(si->next_readdir_pos);
+	cinode->next_xattr_id = cpu_to_le64(si->next_xattr_id);
+	cinode->flags = cpu_to_le32(si->flags);
 }

 /*
@@ -1188,8 +1186,7 @@ int scoutfs_inode_index_start(struct super_block *sb, u64 *seq)
 * Returns > 0 if the seq changed and the locks should be retried.
 */
 int scoutfs_inode_index_try_lock_hold(struct super_block *sb,
-				      struct list_head *list, u64 seq,
-				      const struct scoutfs_item_count cnt)
+				      struct list_head *list, u64 seq)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct index_lock *ind_lock;
@@ -1205,7 +1202,7 @@ int scoutfs_inode_index_try_lock_hold(struct super_block *sb,
 			goto out;
 	}

-	ret = scoutfs_hold_trans(sb, cnt);
+	ret = scoutfs_hold_trans(sb);
 	if (ret == 0 && seq != sbi->trans_seq) {
 		scoutfs_release_trans(sb);
 		ret = 1;
@@ -1219,8 +1216,7 @@ out:
 }

 int scoutfs_inode_index_lock_hold(struct inode *inode, struct list_head *list,
-				  bool set_data_seq,
-				  const struct scoutfs_item_count cnt)
+				  bool set_data_seq)
 {
 	struct super_block *sb = inode->i_sb;
 	int ret;
@@ -1230,7 +1226,7 @@ int scoutfs_inode_index_lock_hold(struct inode *inode, struct list_head *list,
 		ret = scoutfs_inode_index_start(sb, &seq) ?:
 		      scoutfs_inode_index_prepare(sb, list, inode,
 						  set_data_seq) ?:
-		      scoutfs_inode_index_try_lock_hold(sb, list, seq, cnt);
+		      scoutfs_inode_index_try_lock_hold(sb, list, seq);
 	} while (ret > 0);

 	return ret;
@@ -1368,7 +1364,7 @@ struct inode *scoutfs_new_inode(struct super_block *sb, struct inode *dir,
 				umode_t mode, dev_t rdev, u64 ino,
 				struct scoutfs_lock *lock)
 {
-	struct scoutfs_inode_info *ci;
+	struct scoutfs_inode_info *si;
 	struct scoutfs_key key;
 	struct scoutfs_inode sinode;
 	struct inode *inode;
@@ -1378,16 +1374,16 @@ struct inode *scoutfs_new_inode(struct super_block *sb, struct inode *dir,
 	if (!inode)
 		return ERR_PTR(-ENOMEM);

-	ci = SCOUTFS_I(inode);
-	ci->ino = ino;
-	ci->data_version = 0;
-	ci->online_blocks = 0;
-	ci->offline_blocks = 0;
-	ci->next_readdir_pos = SCOUTFS_DIRENT_FIRST_POS;
-	ci->next_xattr_id = 0;
-	ci->have_item = false;
-	atomic64_set(&ci->last_refreshed, lock->refresh_gen);
-	ci->flags = 0;
+	si = SCOUTFS_I(inode);
+	si->ino = ino;
+	si->data_version = 0;
+	si->online_blocks = 0;
+	si->offline_blocks = 0;
+	si->next_readdir_pos = SCOUTFS_DIRENT_FIRST_POS;
+	si->next_xattr_id = 0;
+	si->have_item = false;
+	atomic64_set(&si->last_refreshed, lock->refresh_gen);
+	si->flags = 0;

 	scoutfs_inode_set_meta_seq(inode);
 	scoutfs_inode_set_data_seq(inode);
@@ -1498,8 +1494,7 @@ static int delete_inode_items(struct super_block *sb, u64 ino)
 retry:
 	ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
 	      prepare_index_deletion(sb, &ind_locks, ino, mode, &sinode) ?:
-	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq,
-						SIC_DROP_INODE(mode, size));
+	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq);
 	if (ret > 0)
 		goto retry;
 	if (ret)
@@ -4,7 +4,6 @@
 #include "key.h"
 #include "lock.h"
 #include "per_task.h"
-#include "count.h"
 #include "format.h"
 #include "data.h"

@@ -22,6 +21,14 @@ struct scoutfs_inode_info {
 	u64 offline_blocks;
 	u32 flags;

+	/*
+	 * Protects per-inode extent items, most particularly readers
+	 * who need to serialize with writers without holding i_mutex.
+	 * (Only used in data.c, which is the only place that
+	 * understands file extent items.)
+	 */
+	struct rw_semaphore extent_sem;
+
 	/*
 	 * The in-memory item info caches the current index item values
 	 * so that we can decide to update them with comparisons instead
@@ -75,11 +82,9 @@ int scoutfs_inode_index_prepare_ino(struct super_block *sb,
 				    struct list_head *list, u64 ino,
 				    umode_t mode);
 int scoutfs_inode_index_try_lock_hold(struct super_block *sb,
-				      struct list_head *list, u64 seq,
-				      const struct scoutfs_item_count cnt);
+				      struct list_head *list, u64 seq);
 int scoutfs_inode_index_lock_hold(struct inode *inode, struct list_head *list,
-				  bool set_data_seq,
-				  const struct scoutfs_item_count cnt);
+				  bool set_data_seq);
 void scoutfs_inode_index_unlock(struct super_block *sb, struct list_head *list);

 int scoutfs_dirty_inode_item(struct inode *inode, struct scoutfs_lock *lock);
@@ -12,6 +12,7 @@
 */
 #include <linux/kernel.h>
 #include <linux/fs.h>
+#include <linux/file.h>
 #include <linux/uaccess.h>
 #include <linux/compiler.h>
 #include <linux/uio.h>
@@ -274,8 +275,8 @@ static long scoutfs_ioc_release(struct file *file, unsigned long arg)
 	struct super_block *sb = inode->i_sb;
 	struct scoutfs_ioctl_release args;
 	struct scoutfs_lock *lock = NULL;
-	loff_t start;
-	loff_t end_inc;
+	u64 sblock;
+	u64 eblock;
 	u64 online;
 	u64 offline;
 	u64 isize;
@@ -286,9 +287,11 @@ static long scoutfs_ioc_release(struct file *file, unsigned long arg)

 	trace_scoutfs_ioc_release(sb, scoutfs_ino(inode), &args);

-	if (args.count == 0)
+	if (args.length == 0)
 		return 0;
-	if ((args.block + args.count) < args.block)
+	if (((args.offset + args.length) < args.offset) ||
+	    (args.offset & SCOUTFS_BLOCK_SM_MASK) ||
+	    (args.length & SCOUTFS_BLOCK_SM_MASK))
 		return -EINVAL;


@@ -321,23 +324,24 @@ static long scoutfs_ioc_release(struct file *file, unsigned long arg)
 	inode_dio_wait(inode);

 	/* drop all clean and dirty cached blocks in the range */
-	start = args.block << SCOUTFS_BLOCK_SM_SHIFT;
-	end_inc = ((args.block + args.count) << SCOUTFS_BLOCK_SM_SHIFT) - 1;
-	truncate_inode_pages_range(&inode->i_data, start, end_inc);
+	truncate_inode_pages_range(&inode->i_data, args.offset,
+				   args.offset + args.length - 1);

+	sblock = args.offset >> SCOUTFS_BLOCK_SM_SHIFT;
+	eblock = (args.offset + args.length - 1) >> SCOUTFS_BLOCK_SM_SHIFT;
 	ret = scoutfs_data_truncate_items(sb, inode, scoutfs_ino(inode),
-					  args.block,
-					  args.block + args.count - 1, true,
+					  sblock,
+					  eblock, true,
 					  lock);
 	if (ret == 0) {
 		scoutfs_inode_get_onoff(inode, &online, &offline);
 		isize = i_size_read(inode);
 		if (online == 0 && isize) {
-			start = (isize + SCOUTFS_BLOCK_SM_SIZE - 1)
+			sblock = (isize + SCOUTFS_BLOCK_SM_SIZE - 1)
 					>> SCOUTFS_BLOCK_SM_SHIFT;
 			ret = scoutfs_data_truncate_items(sb, inode,
 							  scoutfs_ino(inode),
-							  start, U64_MAX,
+							  sblock, U64_MAX,
 							  false, lock);
 		}
 	}
@@ -459,23 +463,24 @@ static long scoutfs_ioc_stage(struct file *file, unsigned long arg)

 	trace_scoutfs_ioc_stage(sb, scoutfs_ino(inode), &args);

-	end_size = args.offset + args.count;
+	end_size = args.offset + args.length;

 	/* verify arg constraints that aren't dependent on file */
-	if (args.count < 0 || (end_size < args.offset) ||
-	    args.offset & SCOUTFS_BLOCK_SM_MASK)
+	if (args.length < 0 || (end_size < args.offset) ||
+	    args.offset & SCOUTFS_BLOCK_SM_MASK) {
 		return -EINVAL;
+	}

-	if (args.count == 0)
+	if (args.length == 0)
 		return 0;

 	/* the iocb is really only used for the file pointer :P */
 	init_sync_kiocb(&kiocb, file);
 	kiocb.ki_pos = args.offset;
-	kiocb.ki_left = args.count;
-	kiocb.ki_nbytes = args.count;
+	kiocb.ki_left = args.length;
+	kiocb.ki_nbytes = args.length;
 	iov.iov_base = (void __user *)(unsigned long)args.buf_ptr;
-	iov.iov_len = args.count;
+	iov.iov_len = args.length;

 	ret = mnt_want_write_file(file);
 	if (ret)
@@ -514,11 +519,11 @@ static long scoutfs_ioc_stage(struct file *file, unsigned long arg)
 	written = 0;
 	do {
 		ret = generic_file_buffered_write(&kiocb, &iov, 1, pos, &pos,
-						  args.count, written);
+						  args.length, written);
 		BUG_ON(ret == -EIOCBQUEUED);
 		if (ret > 0)
 			written += ret;
-	} while (ret > 0 && written < args.count);
+	} while (ret > 0 && written < args.length);

 	si->staging = false;
 	current->backing_dev_info = NULL;
@@ -669,8 +674,7 @@ static long scoutfs_ioc_setattr_more(struct file *file, unsigned long arg)

 	/* setting only so we don't see 0 data seq with nonzero data_version */
 	set_data_seq = sm.data_version != 0 ? true : false;
-	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, set_data_seq,
-					    SIC_SETATTR_MORE());
+	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, set_data_seq);
 	if (ret)
 		goto unlock;

@@ -933,6 +937,54 @@ static long scoutfs_ioc_alloc_detail(struct file *file, unsigned long arg)
 	       args.copied;
 }

+/*
+ * Move extents from the file referenced by the caller's from_fd into
+ * the file that the ioctl was issued on.  This copies in the
+ * arguments, checks the ones that can be checked without looking at
+ * the files, resolves the source file, and then hands the inodes and
+ * byte ranges to scoutfs_data_move_blocks() to perform the move.
+ *
+ * Returns 0 if len is 0, -EFAULT if the arguments can't be copied from
+ * userspace, -EOVERFLOW if either offset + len wraps, -EBADF if from_fd
+ * doesn't refer to an open file, -EINVAL if both files are the same
+ * inode, -EXDEV if the files are on different super blocks, and
+ * otherwise whatever mnt_want_write_file() or
+ * scoutfs_data_move_blocks() return.
+ */
+static long scoutfs_ioc_move_blocks(struct file *file, unsigned long arg)
+{
+	struct inode *to = file_inode(file);
+	struct super_block *sb = to->i_sb;
+	struct scoutfs_ioctl_move_blocks __user *umb = (void __user *)arg;
+	struct scoutfs_ioctl_move_blocks mb;
+	struct file *from_file;
+	struct inode *from;
+	int ret;
+
+	if (copy_from_user(&mb, umb, sizeof(mb)))
+		return -EFAULT;
+
+	/* an empty region is trivially moved */
+	if (mb.len == 0)
+		return 0;
+
+	/* neither byte range may wrap past the end of a u64 */
+	if (mb.from_off + mb.len < mb.from_off ||
+	    mb.to_off + mb.len < mb.to_off)
+		return -EOVERFLOW;
+
+	/*
+	 * from_fd is a u64 in the ioctl struct but fget() takes an
+	 * unsigned int.  Reject values that would be truncated so that
+	 * they can't alias some other open descriptor.
+	 */
+	if (mb.from_fd > INT_MAX)
+		return -EBADF;
+
+	from_file = fget(mb.from_fd);
+	if (!from_file)
+		return -EBADF;
+	from = file_inode(from_file);
+
+	if (from == to) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (from->i_sb != sb) {
+		ret = -EXDEV;
+		goto out;
+	}
+
+	ret = mnt_want_write_file(file);
+	if (ret < 0)
+		goto out;
+
+	ret = scoutfs_data_move_blocks(from, mb.from_off, mb.len,
+				       to, mb.to_off);
+	mnt_drop_write_file(file);
+out:
+	/* drop the source file reference taken by fget() */
+	fput(from_file);
+
+	return ret;
+}
+
 long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	switch (cmd) {
@@ -960,6 +1012,8 @@ long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 		return scoutfs_ioc_data_wait_err(file, arg);
 	case SCOUTFS_IOC_ALLOC_DETAIL:
 		return scoutfs_ioc_alloc_detail(file, arg);
+	case SCOUTFS_IOC_MOVE_BLOCKS:
+		return scoutfs_ioc_move_blocks(file, arg);
 	}

 	return -ENOTTY;
@@ -176,8 +176,8 @@ struct scoutfs_ioctl_ino_path_result {
 * an offline record is left behind to trigger demand staging if the
 * file is read.
 *
- * The starting block offset and number of blocks to release are in
- * units 4KB blocks.
+ * The starting file offset and number of bytes to release must be in
+ * multiples of 4KB.
 *
 * The specified range can extend past i_size and can straddle sparse
 * regions or blocks that are already offline.  The only change it makes
@@ -193,8 +193,8 @@ struct scoutfs_ioctl_ino_path_result {
 * presentation of the data in the file.
 */
 struct scoutfs_ioctl_release {
-	__u64 block;
-	__u64 count;
+	__u64 offset;
+	__u64 length;
 	__u64 data_version;
 };

@@ -205,7 +205,7 @@ struct scoutfs_ioctl_stage {
 	__u64 data_version;
 	__u64 buf_ptr;
 	__u64 offset;
-	__s32 count;
+	__s32 length;
 	__u32 _pad;
 };

@@ -395,9 +395,6 @@ struct scoutfs_ioctl_data_wait_err {
 				       struct scoutfs_ioctl_data_wait_err)


-#define SCOUTFS_IOC_ALLOC_DETAIL _IOR(SCOUTFS_IOCTL_MAGIC, 12, \
-				     struct scoutfs_ioctl_alloc_detail)
-
 struct scoutfs_ioctl_alloc_detail {
 	__u64 entries_ptr;
 	__u64 entries_nr;
@@ -413,4 +410,58 @@ struct scoutfs_ioctl_alloc_detail_entry {
 	__u8 __pad[6];
 };

+#define SCOUTFS_IOC_ALLOC_DETAIL _IOR(SCOUTFS_IOCTL_MAGIC, 12, \
+				      struct scoutfs_ioctl_alloc_detail)
+
+/*
+ * Move extents from one regular file to another at a different offset,
+ * on the same file system.
+ *
+ * from_fd specifies the source file and the ioctl is called on the
+ * destination file.  Both files must have write access.  from_off
+ * specifies the byte offset in the source, to_off is the byte offset in
+ * the destination, and len is the number of bytes in the region to
+ * move.   All of the offsets and lengths must be in multiples of 4KB,
+ * except in the case where the from_off + len ends at the i_size of the
+ * source file.
+ *
+ * This interface only moves extents which are block granular, it does
+ * not perform RMW of sub-block byte extents and it does not overwrite
+ * existing extents in the destination.  It will split extents in the
+ * source.
+ *
+ * Only extents within i_size on the source are moved.  The destination
+ * i_size will be updated if extents are moved beyond its current
+ * i_size.  The i_size update will maintain final partial blocks in the
+ * source.
+ *
+ * It will return an error if either of the files has offline extents.
+ * It will return 0 when all of the extents in the source region have
+ * been moved to the destination.  Moving extents updates the ctime,
+ * mtime, meta_seq, data_seq, and data_version fields of both the source
+ * and destination inodes.  If an error is returned then partial
+ * progress may have been made and inode fields may have been updated.
+ *
+ * Errors specific to this interface include:
+ *
+ * EINVAL: from_off, len, or to_off aren't a multiple of 4KB; the source
+ *	   and destination files are the same inode; either the source or
+ *	   destination is not a regular file; the destination file has
+ *	   an existing overlapping extent.
+ * EOVERFLOW: either from_off + len or to_off + len overflowed 64 bits.
+ * EBADF: from_fd isn't a valid open file descriptor.
+ * EXDEV: the source and destination files are in different filesystems.
+ * EISDIR: either the source or destination is a directory.
+ * ENODATA: either the source or destination file has offline extents.
+ */
+struct scoutfs_ioctl_move_blocks {
+	__u64 from_fd;		/* file descriptor of the source file */
+	__u64 from_off;		/* byte offset of the region in the source */
+	__u64 len;		/* number of bytes in the region to move */
+	__u64 to_off;		/* byte offset in the destination file */
+};
+
+#define SCOUTFS_IOC_MOVE_BLOCKS _IOR(SCOUTFS_IOCTL_MAGIC, 13, \
+				     struct scoutfs_ioctl_move_blocks)
+
 #endif
@@ -1339,7 +1339,10 @@ static int read_page_item(struct super_block *sb, struct scoutfs_key *key,
 		/* split needs multiple items, sparse may not have enough */
 		if (!left)
 			return -ENOMEM;
+
 		compact_page_items(sb, pg, left);
+		found = item_rbtree_walk(&pg->item_root, key, NULL, &par,
+					 &pnode);
 	}

 	item = alloc_item(pg, key, liv, val, val_len);
@@ -1491,6 +1494,8 @@ retry:
 			rbtree_erase(&rd->node, &root);
 			rbtree_insert(&rd->node, par, pnode, &cinf->pg_root);
 			lru_accessed(sb, cinf, rd);
+			trace_scoutfs_item_read_page(sb, key, &rd->start,
+						     &rd->end);
 			continue;
 		}

@@ -2342,6 +2347,8 @@ retry:
 		write_lock(&pg->rwlock);

 		pgi = trim_page_intersection(sb, cinf, pg, right, start, end);
+		trace_scoutfs_item_invalidate_page(sb, start, end,
+						   &pg->start, &pg->end, pgi);
 		BUG_ON(pgi == PGI_DISJOINT); /* walk wouldn't ret disjoint */

 		if (pgi == PGI_INSIDE) {
@@ -2364,9 +2371,9 @@ retry:
 			/* inv was entirely inside page, done after bisect */
 			write_trylock_will_succeed(&right->rwlock);
 			rbtree_insert(&right->node, par, pnode, &cinf->pg_root);
+			lru_accessed(sb, cinf, right);
 			write_unlock(&right->rwlock);
 			write_unlock(&pg->rwlock);
-			lru_accessed(sb, cinf, right);
 			right = NULL;
 			break;
 		}
@@ -2396,7 +2403,6 @@ static int item_lru_shrink(struct shrinker *shrink,
 	struct active_reader *active;
 	struct cached_page *tmp;
 	struct cached_page *pg;
-	LIST_HEAD(list);
 	int nr;

 	if (sc->nr_to_scan == 0)
@@ -2433,21 +2439,17 @@ static int item_lru_shrink(struct shrinker *shrink,

 		__lru_remove(sb, cinf, pg);
 		rbtree_erase(&pg->node, &cinf->pg_root);
-		list_move_tail(&pg->lru_head, &list);
 		invalidate_pcpu_page(pg);
 		write_unlock(&pg->rwlock);

+		put_pg(sb, pg);
+
 		if (--nr == 0)
 			break;
 	}

 	write_unlock(&cinf->rwlock);
 	spin_unlock(&cinf->lru_lock);
-
-	list_for_each_entry_safe(pg, tmp, &list, lru_head) {
-		list_del_init(&pg->lru_head);
-		put_pg(sb, pg);
-	}
 out:
 	return min_t(unsigned long, cinf->lru_pages, INT_MAX);
 }
@@ -65,7 +65,7 @@
 * relative to that lock state we resend.
 */

-#define GRACE_PERIOD_KT	ms_to_ktime(2)
+#define GRACE_PERIOD_KT	ms_to_ktime(10)

 /*
 * allocated per-super, freed on unmount.
@@ -770,16 +770,6 @@ static void lock_invalidate_worker(struct work_struct *work)
 	list_for_each_entry_safe(lock, tmp, &linfo->inv_list, inv_head) {
 		nl = &lock->inv_nl;

-		/* skip if grace hasn't elapsed, record earliest */
-		deadline = lock->grace_deadline;
-		if (ktime_before(now, deadline)) {
-			delay = min(delay,
-				    nsecs_to_jiffies(ktime_to_ns(
-						ktime_sub(deadline, now))));
-			scoutfs_inc_counter(linfo->sb, lock_grace_wait);
-			continue;
-		}
-
 		/* wait for reordered grant to finish */
 		if (lock->mode != nl->old_mode)
 			continue;
@@ -788,6 +778,15 @@ static void lock_invalidate_worker(struct work_struct *work)
 		if (!lock_counts_match(nl->new_mode, lock->users))
 			continue;

+		/* skip if grace hasn't elapsed, record earliest */
+		deadline = lock->grace_deadline;
+		if (!linfo->shutdown && ktime_before(now, deadline)) {
+			delay = min(delay,
+				    nsecs_to_jiffies(ktime_to_ns(
+						ktime_sub(deadline, now))));
+			scoutfs_inc_counter(linfo->sb, lock_grace_wait);
+			continue;
+		}
 		/* set the new mode, no incompatible users during inval */
 		lock->mode = nl->new_mode;

@@ -31,7 +31,6 @@
 #include "lock.h"
 #include "super.h"
 #include "ioctl.h"
-#include "count.h"
 #include "export.h"
 #include "dir.h"
 #include "server.h"
@@ -169,6 +168,40 @@ TRACE_EVENT(scoutfs_data_fallocate,
 		__entry->len, __entry->ret)
 );

+/*
+ * Records the source inode and start, the length, map, and flags, and
+ * the destination inode and start given to the move blocks trace
+ * point.  The format has no trailing newline, matching the other
+ * events in this file.
+ */
+TRACE_EVENT(scoutfs_data_move_blocks,
+	TP_PROTO(struct super_block *sb, u64 from_ino, u64 from_start, u64 len,
+		 u64 map, u8 flags, u64 to_ino, u64 to_start),
+
+	TP_ARGS(sb, from_ino, from_start, len, map, flags, to_ino, to_start),
+
+	TP_STRUCT__entry(
+		SCSB_TRACE_FIELDS
+		__field(__u64, from_ino)
+		__field(__u64, from_start)
+		__field(__u64, len)
+		__field(__u64, map)
+		__field(__u8, flags)
+		__field(__u64, to_ino)
+		__field(__u64, to_start)
+	),
+
+	TP_fast_assign(
+		SCSB_TRACE_ASSIGN(sb);
+		__entry->from_ino = from_ino;
+		__entry->from_start = from_start;
+		__entry->len = len;
+		__entry->map = map;
+		__entry->flags = flags;
+		__entry->to_ino = to_ino;
+		__entry->to_start = to_start;
+	),
+
+	TP_printk(SCSBF" from_ino %llu from_start %llu len %llu map %llu flags 0x%x to_ino %llu to_start %llu",
+		SCSB_TRACE_ARGS, __entry->from_ino, __entry->from_start,
+		__entry->len, __entry->map, __entry->flags, __entry->to_ino,
+		__entry->to_start)
+);
+
 TRACE_EVENT(scoutfs_data_fiemap,
 	TP_PROTO(struct super_block *sb, __u64 start, __u64 len, int ret),

@@ -392,133 +425,59 @@ TRACE_EVENT(scoutfs_trans_write_func,

 TRACE_EVENT(scoutfs_release_trans,
 	TP_PROTO(struct super_block *sb, void *rsv, unsigned int rsv_holders,
-		 struct scoutfs_item_count *res,
-		 struct scoutfs_item_count *act, unsigned int tri_holders,
-		 unsigned int tri_writing, unsigned int tri_items,
-		 unsigned int tri_vals),
+		 unsigned int tri_holders,
+		 unsigned int tri_writing),

-	TP_ARGS(sb, rsv, rsv_holders, res, act, tri_holders, tri_writing,
-		tri_items, tri_vals),
+	TP_ARGS(sb, rsv, rsv_holders, tri_holders, tri_writing),

 	TP_STRUCT__entry(
 		SCSB_TRACE_FIELDS
 		__field(void *, rsv)
 		__field(unsigned int, rsv_holders)
-		__field(int, res_items)
-		__field(int, res_vals)
-		__field(int, act_items)
-		__field(int, act_vals)
 		__field(unsigned int, tri_holders)
 		__field(unsigned int, tri_writing)
-		__field(unsigned int, tri_items)
-		__field(unsigned int, tri_vals)
 	),

 	TP_fast_assign(
 		SCSB_TRACE_ASSIGN(sb);
 		__entry->rsv = rsv;
 		__entry->rsv_holders = rsv_holders;
-		__entry->res_items = res->items;
-		__entry->res_vals = res->vals;
-		__entry->act_items = act->items;
-		__entry->act_vals = act->vals;
 		__entry->tri_holders = tri_holders;
 		__entry->tri_writing = tri_writing;
-		__entry->tri_items = tri_items;
-		__entry->tri_vals = tri_vals;
 	),

-	TP_printk(SCSBF" rsv %p holders %u reserved %u.%u actual "
-		  "%d.%d, trans holders %u writing %u reserved "
-		  "%u.%u", SCSB_TRACE_ARGS, __entry->rsv, __entry->rsv_holders,
-		  __entry->res_items, __entry->res_vals, __entry->act_items,
-		  __entry->act_vals, __entry->tri_holders, __entry->tri_writing,
-		  __entry->tri_items, __entry->tri_vals)
+	TP_printk(SCSBF" rsv %p holders %u trans holders %u writing %u",
+		  SCSB_TRACE_ARGS, __entry->rsv, __entry->rsv_holders,
+		  __entry->tri_holders, __entry->tri_writing)
 );

 TRACE_EVENT(scoutfs_trans_acquired_hold,
-	TP_PROTO(struct super_block *sb, const struct scoutfs_item_count *cnt,
+	TP_PROTO(struct super_block *sb,
 		 void *rsv, unsigned int rsv_holders,
-		 struct scoutfs_item_count *res,
-		 struct scoutfs_item_count *act, unsigned int tri_holders,
-		 unsigned int tri_writing, unsigned int tri_items,
-		 unsigned int tri_vals),
+		 unsigned int tri_holders,
+		 unsigned int tri_writing),

-	TP_ARGS(sb, cnt, rsv, rsv_holders, res, act, tri_holders, tri_writing,
-		tri_items, tri_vals),
+	TP_ARGS(sb, rsv, rsv_holders, tri_holders, tri_writing),

 	TP_STRUCT__entry(
 		SCSB_TRACE_FIELDS
-		__field(int, cnt_items)
-		__field(int, cnt_vals)
 		__field(void *, rsv)
 		__field(unsigned int, rsv_holders)
-		__field(int, res_items)
-		__field(int, res_vals)
-		__field(int, act_items)
-		__field(int, act_vals)
 		__field(unsigned int, tri_holders)
 		__field(unsigned int, tri_writing)
-		__field(unsigned int, tri_items)
-		__field(unsigned int, tri_vals)
 	),

 	TP_fast_assign(
 		SCSB_TRACE_ASSIGN(sb);
-		__entry->cnt_items = cnt->items;
-		__entry->cnt_vals = cnt->vals;
 		__entry->rsv = rsv;
 		__entry->rsv_holders = rsv_holders;
-		__entry->res_items = res->items;
-		__entry->res_vals = res->vals;
-		__entry->act_items = act->items;
-		__entry->act_vals = act->vals;
 		__entry->tri_holders = tri_holders;
 		__entry->tri_writing = tri_writing;
-		__entry->tri_items = tri_items;
-		__entry->tri_vals = tri_vals;
 	),

-	TP_printk(SCSBF" cnt %u.%u, rsv %p holders %u reserved %u.%u "
-		  "actual %d.%d, trans holders %u writing %u reserved "
-		  "%u.%u", SCSB_TRACE_ARGS, __entry->cnt_items,
-		  __entry->cnt_vals, __entry->rsv, __entry->rsv_holders,
-		  __entry->res_items, __entry->res_vals, __entry->act_items,
-		  __entry->act_vals, __entry->tri_holders, __entry->tri_writing,
-		  __entry->tri_items, __entry->tri_vals)
-);
-
-TRACE_EVENT(scoutfs_trans_track_item,
-	TP_PROTO(struct super_block *sb, int delta_items, int delta_vals,
-		 int act_items, int act_vals, int res_items, int res_vals),
-
-	TP_ARGS(sb, delta_items, delta_vals, act_items, act_vals, res_items,
-		res_vals),
-
-	TP_STRUCT__entry(
-		SCSB_TRACE_FIELDS
-		__field(int, delta_items)
-		__field(int, delta_vals)
-		__field(int, act_items)
-		__field(int, act_vals)
-		__field(int, res_items)
-		__field(int, res_vals)
-	),
-
-	TP_fast_assign(
-		SCSB_TRACE_ASSIGN(sb);
-		__entry->delta_items = delta_items;
-		__entry->delta_vals = delta_vals;
-		__entry->act_items = act_items;
-		__entry->act_vals = act_vals;
-		__entry->res_items = res_items;
-		__entry->res_vals = res_vals;
-	),
-
-	TP_printk(SCSBF" delta_items %d delta_vals %d act_items %d act_vals %d res_items %d res_vals %d",
-		  SCSB_TRACE_ARGS, __entry->delta_items, __entry->delta_vals,
-		  __entry->act_items, __entry->act_vals, __entry->res_items,
-		  __entry->res_vals)
+	TP_printk(SCSBF" rsv %p holders %u trans holders %u writing %u",
+		  SCSB_TRACE_ARGS, __entry->rsv, __entry->rsv_holders,
+		  __entry->tri_holders, __entry->tri_writing)
 );

 TRACE_EVENT(scoutfs_ioc_release,
@@ -530,22 +489,22 @@ TRACE_EVENT(scoutfs_ioc_release,
 	TP_STRUCT__entry(
 		SCSB_TRACE_FIELDS
 		__field(__u64, ino)
-		__field(__u64, block)
-		__field(__u64, count)
+		__field(__u64, offset)
+		__field(__u64, length)
 		__field(__u64, vers)
 	),

 	TP_fast_assign(
 		SCSB_TRACE_ASSIGN(sb);
 		__entry->ino = ino;
-		__entry->block = args->block;
-		__entry->count = args->count;
+		__entry->offset = args->offset;
+		__entry->length = args->length;
 		__entry->vers = args->data_version;
 	),

-	TP_printk(SCSBF" ino %llu block %llu count %llu vers %llu",
-		  SCSB_TRACE_ARGS, __entry->ino, __entry->block,
-		  __entry->count, __entry->vers)
+	TP_printk(SCSBF" ino %llu offset %llu length %llu vers %llu",
+		  SCSB_TRACE_ARGS, __entry->ino, __entry->offset,
+		  __entry->length, __entry->vers)
 );

 DEFINE_EVENT(scoutfs_ino_ret_class, scoutfs_ioc_release_ret,
@@ -564,7 +523,7 @@ TRACE_EVENT(scoutfs_ioc_stage,
 		__field(__u64, ino)
 		__field(__u64, vers)
 		__field(__u64, offset)
-		__field(__s32, count)
+		__field(__s32, length)
 	),

 	TP_fast_assign(
@@ -572,12 +531,12 @@ TRACE_EVENT(scoutfs_ioc_stage,
 		__entry->ino = ino;
 		__entry->vers = args->data_version;
 		__entry->offset = args->offset;
-		__entry->count = args->count;
+		__entry->length = args->length;
 	),

-	TP_printk(SCSBF" ino %llu vers %llu offset %llu count %d",
+	TP_printk(SCSBF" ino %llu vers %llu offset %llu length %d",
 		  SCSB_TRACE_ARGS, __entry->ino, __entry->vers,
-		  __entry->offset, __entry->count)
+		  __entry->offset, __entry->length)
 );

 TRACE_EVENT(scoutfs_ioc_data_wait_err,
@@ -1979,31 +1938,27 @@ DEFINE_EVENT(scoutfs_clock_sync_class, scoutfs_recv_clock_sync,
 );

 TRACE_EVENT(scoutfs_trans_seq_advance,
-	TP_PROTO(struct super_block *sb, u64 rid, u64 prev_seq,
-		 u64 next_seq),
+	TP_PROTO(struct super_block *sb, u64 rid, u64 trans_seq),

-	TP_ARGS(sb, rid, prev_seq, next_seq),
+	TP_ARGS(sb, rid, trans_seq),

 	TP_STRUCT__entry(
 		SCSB_TRACE_FIELDS
 		__field(__u64, s_rid)
-		__field(__u64, prev_seq)
-		__field(__u64, next_seq)
+		__field(__u64, trans_seq)
 	),

 	TP_fast_assign(
 		SCSB_TRACE_ASSIGN(sb);
 		__entry->s_rid = rid;
-		__entry->prev_seq = prev_seq;
-		__entry->next_seq = next_seq;
+		__entry->trans_seq = trans_seq;
 	),

-	TP_printk(SCSBF" rid %016llx prev_seq %llu next_seq %llu",
-		  SCSB_TRACE_ARGS, __entry->s_rid, __entry->prev_seq,
-		  __entry->next_seq)
+	TP_printk(SCSBF" rid %016llx trans_seq %llu\n",
+		  SCSB_TRACE_ARGS, __entry->s_rid, __entry->trans_seq)
 );

-TRACE_EVENT(scoutfs_trans_seq_farewell,
+TRACE_EVENT(scoutfs_trans_seq_remove,
 	TP_PROTO(struct super_block *sb, u64 rid, u64 trans_seq),

 	TP_ARGS(sb, rid, trans_seq),
@@ -2462,6 +2417,53 @@ TRACE_EVENT(scoutfs_alloc_move,
 		  __entry->ret)
 );

+TRACE_EVENT(scoutfs_item_read_page,
+	TP_PROTO(struct super_block *sb, struct scoutfs_key *key,
+		 struct scoutfs_key *pg_start, struct scoutfs_key *pg_end),
+	TP_ARGS(sb, key, pg_start, pg_end),
+	TP_STRUCT__entry(
+		SCSB_TRACE_FIELDS
+		sk_trace_define(key)
+		sk_trace_define(pg_start)
+		sk_trace_define(pg_end)
+	),
+	TP_fast_assign(
+		SCSB_TRACE_ASSIGN(sb);
+		sk_trace_assign(key, key);
+		sk_trace_assign(pg_start, pg_start);
+		sk_trace_assign(pg_end, pg_end);
+	),
+	TP_printk(SCSBF" key "SK_FMT" pg_start "SK_FMT" pg_end "SK_FMT,
+		  SCSB_TRACE_ARGS, sk_trace_args(key), sk_trace_args(pg_start),
+		  sk_trace_args(pg_end))
+);
+
+TRACE_EVENT(scoutfs_item_invalidate_page,
+	TP_PROTO(struct super_block *sb, struct scoutfs_key *start,
+		 struct scoutfs_key *end, struct scoutfs_key *pg_start,
+		 struct scoutfs_key *pg_end, int pgi),
+	TP_ARGS(sb, start, end, pg_start, pg_end, pgi),
+	TP_STRUCT__entry(
+		SCSB_TRACE_FIELDS
+		sk_trace_define(start)
+		sk_trace_define(end)
+		sk_trace_define(pg_start)
+		sk_trace_define(pg_end)
+		__field(int, pgi)
+	),
+	TP_fast_assign(
+		SCSB_TRACE_ASSIGN(sb);
+		sk_trace_assign(start, start);
+		sk_trace_assign(end, end);
+		sk_trace_assign(pg_start, pg_start);
+		sk_trace_assign(pg_end, pg_end);
+		__entry->pgi = pgi;
+	),
+	TP_printk(SCSBF" start "SK_FMT" end "SK_FMT" pg_start "SK_FMT" pg_end "SK_FMT" pgi %d",
+		  SCSB_TRACE_ARGS, sk_trace_args(start), sk_trace_args(end),
+		  sk_trace_args(pg_start), sk_trace_args(pg_end), __entry->pgi)
+);
+
 #endif /* _TRACE_SCOUTFS_H */

 /* This part must be outside protection */
@@ -649,79 +649,10 @@ static void init_trans_seq_key(struct scoutfs_key *key, u64 seq, u64 rid)
 }

 /*
- * Give the client the next sequence number for their transaction.  They
- * provide their previous transaction sequence number that they've
- * committed.
- *
- * We track the sequence numbers of transactions that clients have open.
- * This limits the transaction sequence numbers that can be returned in
- * the index of inodes by meta and data transaction numbers.  We
- * communicate the largest possible sequence number to clients via an
- * rpc.
- *
- * The transaction sequence tracking is stored in a btree so it is
- * shared across servers.  Final entries are removed when processing a
- * client's farewell or when it's removed.
+ * Remove all trans_seq items owned by the client rid, the caller holds
+ * the seq_rwsem.
 */
-static int server_advance_seq(struct super_block *sb,
-			      struct scoutfs_net_connection *conn,
-			      u8 cmd, u64 id, void *arg, u16 arg_len)
-{
-	DECLARE_SERVER_INFO(sb, server);
-	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
-	struct scoutfs_super_block *super = &sbi->super;
-	__le64 their_seq;
-	__le64 next_seq;
-	u64 rid = scoutfs_net_client_rid(conn);
-	struct scoutfs_key key;
-	int ret;
-
-	if (arg_len != sizeof(__le64)) {
-		ret = -EINVAL;
-		goto out;
-	}
-	memcpy(&their_seq, arg, sizeof(their_seq));
-
-	ret = scoutfs_server_hold_commit(sb);
-	if (ret)
-		goto out;
-
-	down_write(&server->seq_rwsem);
-
-	if (their_seq != 0) {
-		init_trans_seq_key(&key, le64_to_cpu(their_seq), rid);
-		ret = scoutfs_btree_delete(sb, &server->alloc, &server->wri,
-					   &super->trans_seqs, &key);
-		if (ret < 0 && ret != -ENOENT)
-			goto unlock;
-	}
-
-	next_seq = super->next_trans_seq;
-	le64_add_cpu(&super->next_trans_seq, 1);
-
-	trace_scoutfs_trans_seq_advance(sb, rid, le64_to_cpu(their_seq),
-					le64_to_cpu(next_seq));
-
-	init_trans_seq_key(&key, le64_to_cpu(next_seq), rid);
-	ret = scoutfs_btree_insert(sb, &server->alloc, &server->wri,
-				   &super->trans_seqs, &key, NULL, 0);
-unlock:
-	up_write(&server->seq_rwsem);
-	ret = scoutfs_server_apply_commit(sb, ret);
-
-out:
-	return scoutfs_net_response(sb, conn, cmd, id, ret,
-				    &next_seq, sizeof(next_seq));
-}
-
-/*
- * Remove any transaction sequences owned by the client.  They must have
- * committed any final transaction by the time they get here via sending
- * their farewell message.  This can be called multiple times as the
- * client's farewell is retransmitted so it's OK to not find any
- * entries.  This is called with the server commit rwsem held.
- */
-static int remove_trans_seq(struct super_block *sb, u64 rid)
+static int remove_trans_seq_locked(struct super_block *sb, u64 rid)
 {
 	DECLARE_SERVER_INFO(sb, server);
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
@@ -730,8 +661,6 @@ static int remove_trans_seq(struct super_block *sb, u64 rid)
 	struct scoutfs_key key;
 	int ret = 0;

-	down_write(&server->seq_rwsem);
-
 	init_trans_seq_key(&key, 0, 0);

 	for (;;) {
@@ -746,17 +675,102 @@ static int remove_trans_seq(struct super_block *sb, u64 rid)
 		scoutfs_btree_put_iref(&iref);

 		if (le64_to_cpu(key.skts_rid) == rid) {
-			trace_scoutfs_trans_seq_farewell(sb, rid,
+			trace_scoutfs_trans_seq_remove(sb, rid,
 					le64_to_cpu(key.skts_trans_seq));
 			ret = scoutfs_btree_delete(sb, &server->alloc,
 						   &server->wri,
 						   &super->trans_seqs, &key);
-			break;
+			if (ret < 0)
+				break;
 		}

 		scoutfs_key_inc(&key);
 	}

+	return ret;
+}
+
+/*
+ * Give the client the next sequence number for the transaction that
+ * they're opening.
+ *
+ * We track the sequence numbers of transactions that clients have open.
+ * This limits the transaction sequence numbers that can be returned in
+ * the index of inodes by meta and data transaction numbers.  We
+ * communicate the largest possible sequence number to clients via an
+ * rpc.
+ *
+ * The transaction sequence tracking is stored in a btree so it is
+ * shared across servers.  Final entries are removed when processing a
+ * client's farewell or when it's removed.  We can be processing a
+ * resent request that was committed by a previous server before the
+ * reply was lost.  At this point the client has no transactions open
+ * and may or may not have just finished one.  To keep it simple we
+ * always remove any previous seq items, if there are any, and then
+ * insert a new item for the client at the next greatest seq.
+ *
+ * The request carries no payload; a non-zero arg_len is answered with
+ * -EINVAL.  The response payload is the little-endian seq, which is
+ * left as 0 unless the new item was inserted.  The removal and insert
+ * happen inside a server commit hold with seq_rwsem held for writing.
+ */
+static int server_advance_seq(struct super_block *sb,
+			      struct scoutfs_net_connection *conn,
+			      u8 cmd, u64 id, void *arg, u16 arg_len)
+{
+	DECLARE_SERVER_INFO(sb, server);
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
+	struct scoutfs_super_block *super = &sbi->super;
+	u64 rid = scoutfs_net_client_rid(conn);
+	struct scoutfs_key key;
+	__le64 leseq = 0;
+	u64 seq;
+	int ret;
+
+	if (arg_len != 0) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = scoutfs_server_hold_commit(sb);
+	if (ret)
+		goto out;
+
+	down_write(&server->seq_rwsem);
+
+	ret = remove_trans_seq_locked(sb, rid);	/* drop any stale items for rid */
+	if (ret < 0)
+		goto unlock;
+
+	seq = le64_to_cpu(super->next_trans_seq);
+	le64_add_cpu(&super->next_trans_seq, 1);
+
+	trace_scoutfs_trans_seq_advance(sb, rid, seq);
+
+	init_trans_seq_key(&key, seq, rid);
+	ret = scoutfs_btree_insert(sb, &server->alloc, &server->wri,
+				   &super->trans_seqs, &key, NULL, 0);
+	if (ret == 0)
+		leseq = cpu_to_le64(seq);	/* only report a seq that was inserted */
+unlock:
+	up_write(&server->seq_rwsem);
+	ret = scoutfs_server_apply_commit(sb, ret);
+
+out:
+	return scoutfs_net_response(sb, conn, cmd, id, ret,
+				    &leseq, sizeof(leseq));
+}
+
+/*
+ * Remove any transaction sequences owned by the client who's sent a
+ * farewell.  They must have committed any final transaction by the time
+ * they get here via sending their farewell message.  This can be called
+ * multiple times as the client's farewell is retransmitted so it's OK
+ * to not find any entries.  This is called with the server commit rwsem
+ * held.
+ */
+static int remove_trans_seq(struct super_block *sb, u64 rid)
+{
+	DECLARE_SERVER_INFO(sb, server);
+	int ret = 0;
+
+	down_write(&server->seq_rwsem);
+	ret = remove_trans_seq_locked(sb, rid);
 	up_write(&server->seq_rwsem);

 	return ret;
@@ -1096,6 +1110,20 @@ static int cancel_srch_compact(struct super_block *sb, u64 rid)
 	return ret;
 }

+/*
+ * Farewell processing runs asynchronously from the request processing
+ * work.  Shutdown waits for request processing to finish and then tears
+ * down the connection.  Once shutdown has started we stop queueing
+ * farewell work so that farewell processing can't race with the
+ * connection being shut down.  A farewell message that this server
+ * drops will be processed by the next server.
+ */
+static void queue_farewell_work(struct server_info *server)
+{
+	if (server->shutting_down)
+		return;
+
+	queue_work(server->wq, &server->farewell_work);
+}
+
 /*
 * Process an incoming greeting request in the server from the client.
 * We try to send responses to failed greetings so that the sender can
@@ -1141,10 +1169,10 @@ static int server_greeting(struct super_block *sb,
 		goto send_err;
 	}

-	if (gr->format_hash != super->format_hash) {
+	if (gr->version != super->version) {
 		scoutfs_warn(sb, "client sent format 0x%llx, server has 0x%llx",
-			     le64_to_cpu(gr->format_hash),
-			     le64_to_cpu(super->format_hash));
+			     le64_to_cpu(gr->version),
+			     le64_to_cpu(super->version));
 		ret = -EINVAL;
 		goto send_err;
 	}
@@ -1173,7 +1201,7 @@ send_err:
 	err = ret;

 	greet.fsid = super->hdr.fsid;
-	greet.format_hash = super->format_hash;
+	greet.version = super->version;
 	greet.server_term = cpu_to_le64(server->term);
 	greet.unmount_barrier = umb;
 	greet.rid = gr->rid;
@@ -1400,8 +1428,8 @@ out:

 	if (ret < 0)
 		stop_server(server);
-	else if (more_reqs && !server->shutting_down)
-		queue_work(server->wq, &server->farewell_work);
+	else if (more_reqs)
+		queue_farewell_work(server);
 }

 static void free_farewell_requests(struct super_block *sb, u64 rid)
@@ -1455,7 +1483,7 @@ static int server_farewell(struct super_block *sb,
 	list_add_tail(&fw->entry, &server->farewell_requests);
 	mutex_unlock(&server->farewell_mutex);

-	queue_work(server->wq, &server->farewell_work);
+	queue_farewell_work(server);

 	/* response will be sent later */
 	return 0;
@@ -1618,12 +1646,17 @@ static void scoutfs_server_worker(struct work_struct *work)

 shutdown:
 	scoutfs_info(sb, "server shutting down at "SIN_FMT, SIN_ARG(&sin));
-	/* wait for request processing */
+
+	/* wait for farewell to finish sending messages */
+	flush_work(&server->farewell_work);
+
+	/* wait for requests to finish, no more requests */
 	scoutfs_net_shutdown(sb, conn);
-	/* wait for commit queued by request processing */
-	flush_work(&server->commit_work);
 	server->conn = NULL;

+	/* wait for extra queues by requests, won't find waiters */
+	flush_work(&server->commit_work);
+
 	scoutfs_lock_server_destroy(sb);

 out:
@@ -1696,8 +1729,9 @@ void scoutfs_server_stop(struct super_block *sb)
 	DECLARE_SERVER_INFO(sb, server);

 	stop_server(server);
-	/* XXX not sure both are needed */
+
 	cancel_work_sync(&server->work);
+	cancel_work_sync(&server->farewell_work);
 	cancel_work_sync(&server->commit_work);
 }

@@ -1752,11 +1786,12 @@ void scoutfs_server_destroy(struct super_block *sb)

 		/* wait for server work to wait for everything to shut down */
 		cancel_work_sync(&server->work);
+		/* farewell work triggers commits */
+		cancel_work_sync(&server->farewell_work);
 		/* recv work/compaction could have left commit_work queued */
 		cancel_work_sync(&server->commit_work);

 		/* pending farewell requests are another server's problem */
-		cancel_work_sync(&server->farewell_work);
 		free_farewell_requests(sb, 0);

 		trace_scoutfs_server_workqueue_destroy(sb, 0, 0);
@@ -1198,14 +1198,10 @@ int scoutfs_srch_get_compact(struct super_block *sb,

 	for (;;scoutfs_key_inc(&key)) {
 		ret = scoutfs_btree_next(sb, root, &key, &iref);
-		if (ret == -ENOENT) {
-			ret = 0;
-			sc->nr = 0;
-			goto out;
-		}
-
 		if (ret == 0) {
-			if (iref.val_len == sizeof(struct scoutfs_srch_file)) {
+			if (iref.key->sk_type != type) {
+				ret = -ENOENT;
+			} else if (iref.val_len == sizeof(sfl)) {
 				key = *iref.key;
 				memcpy(&sfl, iref.val, iref.val_len);
 			} else {
@@ -1213,24 +1209,25 @@ int scoutfs_srch_get_compact(struct super_block *sb,
 			}
 			scoutfs_btree_put_iref(&iref);
 		}
-		if (ret < 0)
+		if (ret < 0) {
+			/* see if we ran out of log files or files entirely */
+			if (ret == -ENOENT) {
+				sc->nr = 0;
+				if (type == SCOUTFS_SRCH_LOG_TYPE) {
+					type = SCOUTFS_SRCH_BLOCKS_TYPE;
+					init_srch_key(&key, type, 0, 0);
+					continue;
+				} else {
+					ret = 0;
+				}
+			}
 			goto out;
+		}

 		/* skip any files already being compacted */
 		if (scoutfs_spbm_test(&busy, le64_to_cpu(sfl.ref.blkno)))
 			continue;

-		/* see if we ran out of log files or files entirely */
-		if (key.sk_type != type) {
-			sc->nr = 0;
-			if (key.sk_type == SCOUTFS_SRCH_BLOCKS_TYPE) {
-				type = SCOUTFS_SRCH_BLOCKS_TYPE;
-			} else {
-				ret = 0;
-				goto out;
-			}
-		}
-
 		/* reset if we iterated into the next size category */
 		if (type == SCOUTFS_SRCH_BLOCKS_TYPE) {
 			order = fls64(le64_to_cpu(sfl.blocks)) /
@@ -352,10 +352,10 @@ static int scoutfs_read_super_from_bdev(struct super_block *sb,
 	}


-	if (super->format_hash != cpu_to_le64(SCOUTFS_FORMAT_HASH)) {
-		scoutfs_err(sb, "super block has invalid format hash 0x%llx, expected 0x%llx",
-			    le64_to_cpu(super->format_hash),
-			    SCOUTFS_FORMAT_HASH);
+	if (super->version != cpu_to_le64(SCOUTFS_INTEROP_VERSION)) {
+		scoutfs_err(sb, "super block has invalid version %llu, expected %llu",
+			    le64_to_cpu(super->version),
+			    SCOUTFS_INTEROP_VERSION);
 		ret = -EINVAL;
 		goto out;
 	}
@@ -682,6 +682,10 @@ static int __init scoutfs_module_init(void)
 		".section	.note.git_describe,\"a\"\n"
 		".string	\""SCOUTFS_GIT_DESCRIBE"\\n\"\n"
 		".previous\n");
+	__asm__ __volatile__ (
+		".section	.note.scoutfs_interop_version,\"a\"\n"
+		".string	\""SCOUTFS_INTEROP_VERSION_STR"\\n\"\n"
+		".previous\n");

 	scoutfs_init_counters();

@@ -714,3 +718,4 @@ module_exit(scoutfs_module_exit)
 MODULE_AUTHOR("Zach Brown <zab@versity.com>");
 MODULE_LICENSE("GPL");
 MODULE_INFO(git_describe, SCOUTFS_GIT_DESCRIBE);
+MODULE_INFO(scoutfs_interop_version, SCOUTFS_INTEROP_VERSION_STR);
@@ -60,8 +60,6 @@
 */
 struct trans_info {
 	spinlock_t lock;
-	unsigned reserved_items;
-	unsigned reserved_vals;
 	unsigned holders;
 	bool writing;

@@ -318,12 +316,11 @@ void scoutfs_trans_restart_sync_deadline(struct super_block *sb)
 * Including nested holds avoids having to deal with writing out partial
 * transactions while a caller still holds the transaction.
 */
+
 #define SCOUTFS_RESERVATION_MAGIC 0xd57cd13b
 struct scoutfs_reservation {
 	unsigned magic;
 	unsigned holders;
-	struct scoutfs_item_count reserved;
-	struct scoutfs_item_count actual;
 };

 /*
@@ -340,22 +337,16 @@ struct scoutfs_reservation {
 * delaying or prematurely forcing commits.
 */
 static bool acquired_hold(struct super_block *sb,
-			  struct scoutfs_reservation *rsv,
-			  const struct scoutfs_item_count *cnt)
+			  struct scoutfs_reservation *rsv)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	DECLARE_TRANS_INFO(sb, tri);
 	bool acquired = false;
-	unsigned items;
-	unsigned vals;

 	spin_lock(&tri->lock);

-	trace_scoutfs_trans_acquired_hold(sb, cnt, rsv, rsv->holders,
-					  &rsv->reserved, &rsv->actual,
-					  tri->holders, tri->writing,
-					  tri->reserved_items,
-					  tri->reserved_vals);
+	trace_scoutfs_trans_acquired_hold(sb, rsv, rsv->holders,
+					  tri->holders, tri->writing);

 	/* use a caller's existing reservation */
 	if (rsv->holders)
@@ -365,10 +356,6 @@ static bool acquired_hold(struct super_block *sb,
 	if (tri->writing)
 		goto out;

-	/* see if we can reserve space for our item count */
-	items = tri->reserved_items + cnt->items;
-	vals = tri->reserved_vals + cnt->vals;
-
 	/*
 	 * In theory each dirty item page could be straddling two full
 	 * blocks, requiring 4 allocations for each item cache page.
@@ -405,12 +392,6 @@ static bool acquired_hold(struct super_block *sb,
 		goto out;
 	}

-	tri->reserved_items = items;
-	tri->reserved_vals = vals;
-
-	rsv->reserved.items = cnt->items;
-	rsv->reserved.vals = cnt->vals;
-
 hold:
 	rsv->holders++;
 	tri->holders++;
@@ -423,20 +404,12 @@ out:
 	return acquired;
 }

-int scoutfs_hold_trans(struct super_block *sb,
-		       const struct scoutfs_item_count cnt)
+int scoutfs_hold_trans(struct super_block *sb)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct scoutfs_reservation *rsv;
 	int ret;

-	/*
-	 * Caller shouldn't provide garbage counts, nor counts that
-	 * can't fit in segments by themselves.
-	 */
-	if (WARN_ON_ONCE(cnt.items <= 0 || cnt.vals < 0))
-		return -EINVAL;
-
 	if (current == sbi->trans_task)
 		return 0;

@@ -453,7 +426,7 @@ int scoutfs_hold_trans(struct super_block *sb,
 	BUG_ON(rsv->magic != SCOUTFS_RESERVATION_MAGIC);

 	ret = wait_event_interruptible(sbi->trans_hold_wq,
-				       acquired_hold(sb, rsv, &cnt));
+				       acquired_hold(sb, rsv));
 	if (ret && rsv->holders == 0) {
 		current->journal_info = NULL;
 		kfree(rsv);
@@ -473,38 +446,6 @@ bool scoutfs_trans_held(void)
 	return rsv && rsv->magic == SCOUTFS_RESERVATION_MAGIC;
 }

-/*
- * Record a transaction holder's individual contribution to the dirty
- * items in the current transaction.  We're making sure that the
- * reservation matches the possible item manipulations while they hold
- * the reservation.
- *
- * It is possible and legitimate for an individual contribution to be
- * negative if they delete dirty items.  The item cache makes sure that
- * the total dirty item count doesn't fall below zero.
- */
-void scoutfs_trans_track_item(struct super_block *sb, signed items,
-			      signed vals)
-{
-	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
-	struct scoutfs_reservation *rsv = current->journal_info;
-
-	if (current == sbi->trans_task)
-		return;
-
-	BUG_ON(!rsv || rsv->magic != SCOUTFS_RESERVATION_MAGIC);
-
-	rsv->actual.items += items;
-	rsv->actual.vals += vals;
-
-	trace_scoutfs_trans_track_item(sb, items, vals, rsv->actual.items,
-				       rsv->actual.vals, rsv->reserved.items,
-				       rsv->reserved.vals);
-
-	WARN_ON_ONCE(rsv->actual.items > rsv->reserved.items);
-	WARN_ON_ONCE(rsv->actual.vals > rsv->reserved.vals);
-}
-
 /*
 * As we drop the last hold in the reservation we try and wake other
 * hold attempts that were waiting for space.  As we drop the last trans
@@ -526,16 +467,12 @@ void scoutfs_release_trans(struct super_block *sb)

 	spin_lock(&tri->lock);

-	trace_scoutfs_release_trans(sb, rsv, rsv->holders, &rsv->reserved,
-				    &rsv->actual, tri->holders, tri->writing,
-				    tri->reserved_items, tri->reserved_vals);
+	trace_scoutfs_release_trans(sb, rsv, rsv->holders, tri->holders, tri->writing);

 	BUG_ON(rsv->holders <= 0);
 	BUG_ON(tri->holders <= 0);

 	if (--rsv->holders == 0) {
-		tri->reserved_items -= rsv->reserved.items;
-		tri->reserved_vals -= rsv->reserved.vals;
 		current->journal_info = NULL;
 		kfree(rsv);
 		wake = true;
@@ -6,21 +6,16 @@
 /* the client will force commits if data allocators get too low */
 #define SCOUTFS_TRANS_DATA_ALLOC_LWM	(256ULL * 1024 * 1024)

-#include "count.h"
-
 void scoutfs_trans_write_func(struct work_struct *work);
 int scoutfs_trans_sync(struct super_block *sb, int wait);
 int scoutfs_file_fsync(struct file *file, loff_t start, loff_t end,
 		       int datasync);
 void scoutfs_trans_restart_sync_deadline(struct super_block *sb);

-int scoutfs_hold_trans(struct super_block *sb,
-		       const struct scoutfs_item_count cnt);
+int scoutfs_hold_trans(struct super_block *sb);
 bool scoutfs_trans_held(void);
 void scoutfs_release_trans(struct super_block *sb);
 u64 scoutfs_trans_sample_seq(struct super_block *sb);
-void scoutfs_trans_track_item(struct super_block *sb, signed items,
-			      signed vals);

 int scoutfs_trans_get_log_trees(struct super_block *sb);
 bool scoutfs_trans_has_dirty(struct super_block *sb);
@@ -0,0 +1,20 @@
+#ifndef _SCOUTFS_UTIL_H_
+#define _SCOUTFS_UTIL_H_
+
+/*
+ * Little utility helpers that probably belong upstream.
+ */
+
+/*
+ * Write-lock two distinct rwsems.  They're always acquired in ascending
+ * address order, regardless of the order of the arguments, and the
+ * second is acquired with SINGLE_DEPTH_NESTING.  Passing the same rwsem
+ * twice is a bug.
+ */
+static inline void down_write_two(struct rw_semaphore *a,
+				  struct rw_semaphore *b)
+{
+	struct rw_semaphore *first;
+	struct rw_semaphore *second;
+
+	BUG_ON(a == b);
+
+	/* the lower address is locked first */
+	if (a > b) {
+		first = b;
+		second = a;
+	} else {
+		first = a;
+		second = b;
+	}
+
+	down_write(first);
+	down_write_nested(second, SINGLE_DEPTH_NESTING);
+}
+
+#endif
@@ -577,10 +577,7 @@ static int scoutfs_xattr_set(struct dentry *dentry, const char *name,
 retry:
 	ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
 	      scoutfs_inode_index_prepare(sb, &ind_locks, inode, false) ?:
-	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq,
-						SIC_XATTR_SET(found_parts,
-							      value != NULL,
-							      name_len, size));
+	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq);
 	if (ret > 0)
 		goto retry;
 	if (ret)
@@ -781,7 +778,7 @@ int scoutfs_xattr_drop(struct super_block *sb, u64 ino,
 					     &tgs) != 0)
 			memset(&tgs, 0, sizeof(tgs));

-		ret = scoutfs_hold_trans(sb, SIC_EXACT(2, 0));
+		ret = scoutfs_hold_trans(sb);
 		if (ret < 0)
 			break;
 		release = true;
@@ -59,5 +59,8 @@ t_filter_dmesg()
 	# some tests mount w/o options
 	re="$re|scoutfs .* error: Required mount option \"metadev_path\" not found"

+	# in debugging kernels we can slow things down a bit
+	re="$re|hrtimer: interrupt took .*"
+
 	egrep -v "($re)" 
 }
@@ -28,8 +28,8 @@ t_ident()
 	local fsid
 	local rid

-	fsid=$(scoutfs statfs -s fsid "$mnt")
-	rid=$(scoutfs statfs -s rid "$mnt")
+	fsid=$(scoutfs statfs -s fsid -p "$mnt")
+	rid=$(scoutfs statfs -s rid -p "$mnt")

 	echo "f.${fsid:0:6}.r.${rid:0:6}"
 }
@@ -21,5 +21,20 @@ t_require_mounts() {
 	local req="$1"

 	test "$T_NR_MOUNTS" -ge "$req" || \
-		t_fail "$req mounts required, only have $T_NR_MOUNTS"
+		t_skip "$req mounts required, only have $T_NR_MOUNTS"
+}
+
+#
+# Skip the test unless the meta device ($T_META_DEVICE) is at least the
+# size given in $1, an iec string as parsed by numfmt (64G, 1T, etc.).
+#
+t_require_meta_size() {
+	local dev="$T_META_DEVICE"
+	local req_iec="$1"
+	local req_bytes=$(numfmt --from=iec --to=none "$req_iec")
+	local dev_bytes=$(blockdev --getsize64 "$dev")
+	local dev_iec=$(numfmt --from=auto --to=iec "$dev_bytes")
+
+	test "$dev_bytes" -ge "$req_bytes" || \
+		t_skip "$dev must be at least $req_iec, is $dev_iec"
 }
@@ -0,0 +1,3 @@
+== create per mount files
+== 30s of racing random mount/umount
+== mounting any unmounted
@@ -0,0 +1,33 @@
+== build test files
+== wrapped offsets should fail
+ioctl failed on '/mnt/test/test/move-blocks/to': Value too large for defined data type (75)
+scoutfs: move-blocks failed: Value too large for defined data type (75)
+ioctl failed on '/mnt/test/test/move-blocks/to': Value too large for defined data type (75)
+scoutfs: move-blocks failed: Value too large for defined data type (75)
+== specifying same file fails
+ioctl failed on '/mnt/test/test/move-blocks/hardlink': Invalid argument (22)
+scoutfs: move-blocks failed: Invalid argument (22)
+== specifying files in other file systems fails
+ioctl failed on '/mnt/test/test/move-blocks/to': Invalid cross-device link (18)
+scoutfs: move-blocks failed: Invalid cross-device link (18)
+== offsets must be multiples of 4KB
+ioctl failed on '/mnt/test/test/move-blocks/to': Invalid argument (22)
+scoutfs: move-blocks failed: Invalid argument (22)
+ioctl failed on '/mnt/test/test/move-blocks/to': Invalid argument (22)
+scoutfs: move-blocks failed: Invalid argument (22)
+ioctl failed on '/mnt/test/test/move-blocks/to': Invalid argument (22)
+scoutfs: move-blocks failed: Invalid argument (22)
+== can't move onto existing extent
+ioctl failed on '/mnt/test/test/move-blocks/to': Invalid argument (22)
+scoutfs: move-blocks failed: Invalid argument (22)
+== can't move between files with offline extents
+ioctl failed on '/mnt/test/test/move-blocks/to': No data available (61)
+scoutfs: move-blocks failed: No data available (61)
+ioctl failed on '/mnt/test/test/move-blocks/to': No data available (61)
+scoutfs: move-blocks failed: No data available (61)
+== basic moves work
+== moving final partial block sets partial i_size
+123
+== moving updates inode fields
+== moving blocks backwards works
+== combine many files into one
@@ -1,6 +1,6 @@
 == create files
 == waiter shows up in ioctl
-offline wating should be empty:
+offline waiting should be empty:
 0
 offline waiting should now have one known entry:
 == multiple waiters on same block listed once
@@ -8,7 +8,7 @@ offline waiting still has one known entry:
 == different blocks show up
 offline waiting now has two known entries:
 == staging wakes everyone
-offline wating should be empty again:
+offline waiting should be empty again:
 0
 == interruption does no harm
 offline waiting should now have one known entry:
@@ -1,9 +1,9 @@
 == 0 data_version arg fails
-setattr_more ioctl failed on '/mnt/test/test/setattr_more/file': Invalid argument (22)
-scoutfs: setattr failed: Invalid argument (22)
+setattr: data version must not be 0
+Try `setattr --help' or `setattr --usage' for more information.
 == args must specify size and offline
-setattr_more ioctl failed on '/mnt/test/test/setattr_more/file': Invalid argument (22)
-scoutfs: setattr failed: Invalid argument (22)
+setattr: must provide size if using --offline option
+Try `setattr --help' or `setattr --usage' for more information.
 == only works on regular files
 failed to open '/mnt/test/test/setattr_more/dir': Is a directory (21)
 scoutfs: setattr failed: Is a directory (21)
@@ -8,16 +8,16 @@
 release ioctl failed: Invalid argument (22)
 scoutfs: release failed: Invalid argument (22)
 == releasing non-file fails
-ioctl failed on '/mnt/test/test/simple-release-extents/file-char': Inappropriate ioctl for device (25)
-release ioctl failed: Inappropriate ioctl for device (25)
-scoutfs: release failed: Inappropriate ioctl for device (25)
+ioctl failed: Inappropriate ioctl for device (25)
+release: must provide file version --data-version
+Try `release --help' or `release --usage' for more information.
 == releasing a non-scoutfs file fails
-ioctl failed on '/dev/null': Inappropriate ioctl for device (25)
-release ioctl failed: Inappropriate ioctl for device (25)
-scoutfs: release failed: Inappropriate ioctl for device (25)
+ioctl failed: Inappropriate ioctl for device (25)
+release: must provide file version --data-version
+Try `release --help' or `release --usage' for more information.
 == releasing bad version fails
-release ioctl failed: Stale file handle (116)
-scoutfs: release failed: Stale file handle (116)
+release: must provide file version --data-version
+Try `release --help' or `release --usage' for more information.
 == verify small release merging
 0 0 0:  (0 0 1)  (1 101 4)
 0 0 1:  (0 0 2)  (2 102 3)
@@ -4,8 +4,8 @@
 == release+stage shouldn't change stat, data seq or vers
 == stage does change meta_seq
 == can't use stage to extend online file
-stage returned -1, not 4096: error Invalid argument (22)
-scoutfs: stage failed: Input/output error (5)
+stage: must provide file version with --data-version
+Try `stage --help' or `stage --usage' for more information.
 == wrapped region fails
 stage returned -1, not 4096: error Invalid argument (22)
 scoutfs: stage failed: Input/output error (5)
@@ -18,6 +18,6 @@ scoutfs: stage failed: Input/output error (5)
 == partial final block that writes to i_size does work
 == zero length stage doesn't bring blocks online
 == stage of non-regular file fails
-ioctl failed on '/mnt/test/test/simple-staging/file-char': Inappropriate ioctl for device (25)
-stage returned -1, not 1: error Inappropriate ioctl for device (25)
-scoutfs: stage failed: Input/output error (5)
+ioctl failed: Inappropriate ioctl for device (25)
+stage: must provide file version with --data-version
+Try `stage --help' or `stage --usage' for more information.
@@ -53,7 +53,7 @@ $(basename $0) options:
    -m        | Run mkfs on the device before mounting and running
              | tests.  Implies unmounting existing mounts first.
    -n        | The number of devices and mounts to test.
-    -P        | Output trace events with printk as they're generated.
+    -P        | Enable trace_printk.
    -p        | Exit script after preparing mounts only, don't run tests.
    -q <nr>   | Specify the quorum count needed to mount.  This is
              | used when running mkfs and is needed by a few tests.
@@ -62,6 +62,7 @@ $(basename $0) options:
              | exist.  Previous results will be deleted as each test runs.
    -s        | Skip git repo checkouts.
    -t        | Enabled trace events that match the given glob argument.
+              | Multiple options enable multiple globbed events.
    -X        | xfstests git repo. Used by tests/xfstests.sh.
    -x        | xfstests git branch to checkout and track.
    -y        | xfstests ./check additional args
@@ -77,6 +78,9 @@ done
 T_TRACE_DUMP="0"
 T_TRACE_PRINTK="0"

+# array declarations to be able to use array ops
+declare -a T_TRACE_GLOB
+
 while true; do
 	case $1 in
 	-a)
@@ -147,7 +151,7 @@ while true; do
 		;;
 	-t)
 		test -n "$2" || die "-t must have trace glob argument"
-		T_TRACE_GLOB="$2"
+		T_TRACE_GLOB+=("$2")
 		shift
 		;;
 	-X)
@@ -304,7 +308,7 @@ if [ -n "$T_UNMOUNT" ]; then
 fi

 if [ -n "$T_MKFS" ]; then
-	cmd scoutfs mkfs -Q "$T_QUORUM" "$T_META_DEVICE" "$T_DATA_DEVICE"
+	cmd scoutfs mkfs -Q "$T_QUORUM" "$T_META_DEVICE" "$T_DATA_DEVICE" -f
 fi

 if [ -n "$T_INSMOD" ]; then
@@ -314,23 +318,37 @@ if [ -n "$T_INSMOD" ]; then
 	cmd insmod "$T_KMOD/src/scoutfs.ko"
 fi

-if [ -n "$T_TRACE_GLOB" ]; then
-	msg "enabling trace events"
+nr_globs=${#T_TRACE_GLOB[@]}
+if [ $nr_globs -gt 0 ]; then
 	echo 0 > /sys/kernel/debug/tracing/events/scoutfs/enable
-	for g in $T_TRACE_GLOB; do
+
+	for g in "${T_TRACE_GLOB[@]}"; do
 		for e in /sys/kernel/debug/tracing/events/scoutfs/$g/enable; do
-			echo 1 > $e
+			if test -w "$e"; then
+				echo 1 > "$e"
+			else
+				die "-t glob '$g' matched no scoutfs events"
+			fi
 		done
 	done

-	echo "$T_TRACE_DUMP" > /proc/sys/kernel/ftrace_dump_on_oops
-	echo "$T_TRACE_PRINTK" > /sys/kernel/debug/tracing/options/trace_printk
-
-	cmd cat /sys/kernel/debug/tracing/set_event
-	cmd grep .  /sys/kernel/debug/tracing/options/trace_printk \
-		    /proc/sys/kernel/ftrace_dump_on_oops
+	nr_events=$(cat /sys/kernel/debug/tracing/set_event | wc -l)
+	msg "enabled $nr_events trace events from $nr_globs -t globs"
 fi

+if [ -n "$T_TRACE_PRINTK" ]; then
+	echo "$T_TRACE_PRINTK" > /sys/kernel/debug/tracing/options/trace_printk
+fi
+
+if [ -n "$T_TRACE_DUMP" ]; then
+	echo "$T_TRACE_DUMP" > /proc/sys/kernel/ftrace_dump_on_oops
+fi
+
+# always describe tracing in the logs
+cmd cat /sys/kernel/debug/tracing/set_event
+cmd grep .  /sys/kernel/debug/tracing/options/trace_printk \
+	    /proc/sys/kernel/ftrace_dump_on_oops
+
 #
 # mount concurrently so that a quorum is present to elect the leader and
 # start a server.
@@ -434,7 +452,7 @@ for t in $tests; do

 	# get stats from previous pass
 	last="$T_RESULTS/last-passed-test-stats"
-	stats=$(grep -s "^$test_name" "$last" | cut -d " " -f 2-)
+	stats=$(grep -s "^$test_name " "$last" | cut -d " " -f 2-)
 	test -n "$stats" && stats="last: $stats"

 	printf "  %-30s $stats" "$test_name"
@@ -497,7 +515,7 @@ for t in $tests; do
 		echo "  passed: $stats"
 		((passed++))
 		# save stats for passed test
-		grep -s -v "^$test_name" "$last" > "$last.tmp"
+		grep -s -v "^$test_name " "$last" > "$last.tmp"
 		echo "$test_name $stats" >> "$last.tmp"
 		mv -f "$last.tmp" "$last"
 	elif [ "$sts" == "$T_SKIP_STATUS" ]; then
@@ -515,23 +533,24 @@ done

 msg "all tests run: $passed passed, $skipped skipped, $failed failed"

-unmount_all

-if [ -n "$T_TRACE_GLOB" ]; then
+if [ -n "$T_TRACE_GLOB" -o -n "$T_TRACE_PRINTK" ]; then
 	msg "saving traces and disabling tracing"
 	echo 0 > /sys/kernel/debug/tracing/events/scoutfs/enable
+	echo 0 > /sys/kernel/debug/tracing/options/trace_printk
 	cat /sys/kernel/debug/tracing/trace > "$T_RESULTS/traces"
 fi

 if [ "$skipped" == 0 -a "$failed" == 0 ]; then
 	msg "all tests passed"
+	unmount_all
 	exit 0
 fi

 if [ "$skipped" != 0 ]; then
-	msg "$skipped tests skipped, check skip.log"
+	msg "$skipped tests skipped, check skip.log, still mounted"
 fi
 if [ "$failed" != 0 ]; then
-	msg "$failed tests failed, check fail.log"
+	msg "$failed tests failed, check fail.log, still mounted"
 fi
 exit 1
@@ -6,6 +6,7 @@ simple-staging.sh
 simple-release-extents.sh
 setattr_more.sh
 offline-extent-waiting.sh
+move-blocks.sh
 srch-basic-functionality.sh
 simple-xattr-unit.sh
 lock-refleak.sh
@@ -16,6 +17,7 @@ createmany-parallel.sh
 createmany-large-names.sh
 createmany-rename-large-dir.sh
 stage-release-race-alloc.sh
+stage-multi-part.sh
 basic-posix-consistency.sh
 dirent-consistency.sh
 lock-ex-race-processes.sh
@@ -23,7 +25,8 @@ lock-conflicting-batch-commit.sh
 cross-mount-data-free.sh
 persistent-item-vers.sh
 setup-error-teardown.sh
-mount-unmount-race.sh
+# failing in jenkins pr runners, zab's working on it
+#mount-unmount-race.sh
 createmany-parallel-mounts.sh
 archive-light-cycle.sh
 stale-btree-read.sh
@@ -161,9 +161,9 @@ for n in $(t_fs_nrs); do
 			echo "bash $gen $blocks $n $p $f > $path" >> $create
 			echo "cmp $path <(bash $gen $blocks $n $p $f)" >> $verify
 			echo "vers=\$(scoutfs stat -s data_version $path)" >> $release
-			echo "scoutfs release $path \$vers 0 $blocks" >> $release
+			echo "scoutfs release $path -V \$vers -o 0 -l $bytes" >> $release
 			echo "vers=\$(scoutfs stat -s data_version $path)" >> $stage
-			echo "scoutfs stage $path \$vers 0 $bytes <(bash $gen $blocks $n $p $f)" >> $stage
+			echo "scoutfs stage <(bash $gen $blocks $n $p $f) $path -V \$vers -o 0 -l $bytes " >> $stage
 			echo "rm -f $path" >> $unlink

 			echo "x=\$(scoutfs stat -s online_blocks $path)" >> $online
@@ -9,14 +9,14 @@ t_require_commands scoutfs dd truncate touch mkdir rm rmdir
 release_vers() {
 	local file="$1"
 	local vers="$2"
-	local block="$3"
-	local count="$4"
+	local offset="$3"
+	local length="$4"

 	if [ "$vers" == "stat" ]; then
 		vers=$(scoutfs stat -s data_version "$file")
 	fi

-	scoutfs release "$file" "$vers" "$block" "$count"
+	scoutfs release "$file" -V "$vers" -o "$offset" -l "$length"
 }

 # if vers is "stat" then we ask stat_more for the data_version
@@ -24,14 +24,14 @@ stage_vers() {
 	local file="$1"
 	local vers="$2"
 	local offset="$3"
-	local count="$4"
+	local length="$4"
 	local contents="$5"

 	if [ "$vers" == "stat" ]; then
 		vers=$(scoutfs stat -s data_version "$file")
 	fi

-	scoutfs stage "$file" "$vers" "$offset" "$count" "$contents"
+	scoutfs stage "$contents" "$file" -V "$vers" -o "$offset" -l "$length"
 }

 echo_blocks()
@@ -57,15 +57,15 @@ dd if=/dev/zero of="$FILE" bs=4K count=1 conv=notrunc oflag=append status=none
 echo_blocks "$FILE"

 echo "== release"
-release_vers "$FILE" stat 0 2
+release_vers "$FILE" stat 0 8K
 echo_blocks "$FILE"

 echo "== duplicate release"
-release_vers "$FILE" stat 0 2
+release_vers "$FILE" stat 0 8K
 echo_blocks "$FILE"

 echo "== duplicate release past i_size"
-release_vers "$FILE" stat 0 16
+release_vers "$FILE" stat 0 64K
 echo_blocks "$FILE"

 echo "== stage"
@@ -160,8 +160,8 @@ for i in $(seq 1 1); do
 		mkdir -p $(dirname $lnk)
 		ln "$T_D0/file" $lnk

-		scoutfs ino-path $ino "$T_M0" > "$T_TMP.0"
-		scoutfs ino-path $ino "$T_M1" > "$T_TMP.1"
+		scoutfs ino-path -p "$T_M0" $ino > "$T_TMP.0"
+		scoutfs ino-path -p "$T_M1" $ino > "$T_TMP.1"
 		diff -u "$T_TMP.0" "$T_TMP.1"
 	done
 done
@@ -169,32 +169,32 @@ rm -rf "$T_D0/dir"

 echo "== inode indexes match after syncing existing"
 t_sync_seq_index
-scoutfs walk-inodes meta_seq 0 -1 "$T_M0" > "$T_TMP.0"
-scoutfs walk-inodes meta_seq 0 -1 "$T_M1" > "$T_TMP.1"
+scoutfs walk-inodes -p "$T_M0" -- meta_seq 0 -1 > "$T_TMP.0"
+scoutfs walk-inodes -p "$T_M1" -- meta_seq 0 -1  > "$T_TMP.1"
 diff -u "$T_TMP.0" "$T_TMP.1"
-scoutfs walk-inodes data_seq 0 -1 "$T_M0" > "$T_TMP.0"
-scoutfs walk-inodes data_seq 0 -1 "$T_M1" > "$T_TMP.1"
+scoutfs walk-inodes -p "$T_M0" -- data_seq 0 -1 > "$T_TMP.0"
+scoutfs walk-inodes -p "$T_M1" -- data_seq 0 -1 > "$T_TMP.1"
 diff -u "$T_TMP.0" "$T_TMP.1"

 echo "== inode indexes match after copying and syncing"
 mkdir "$T_D0/dir"
 cp -ar /boot/conf* "$T_D0/dir"
 t_sync_seq_index
-scoutfs walk-inodes meta_seq 0 -1 "$T_M0" > "$T_TMP.0"
-scoutfs walk-inodes meta_seq 0 -1 "$T_M1" > "$T_TMP.1"
+scoutfs walk-inodes -p "$T_M0" -- meta_seq 0 -1 > "$T_TMP.0"
+scoutfs walk-inodes -p "$T_M1" -- meta_seq 0 -1  > "$T_TMP.1"
 diff -u "$T_TMP.0" "$T_TMP.1"
-scoutfs walk-inodes data_seq 0 -1 "$T_M0" > "$T_TMP.0"
-scoutfs walk-inodes data_seq 0 -1 "$T_M1" > "$T_TMP.1"
+scoutfs walk-inodes -p "$T_M0" -- data_seq 0 -1 > "$T_TMP.0"
+scoutfs walk-inodes -p "$T_M1" -- data_seq 0 -1 > "$T_TMP.1"
 diff -u "$T_TMP.0" "$T_TMP.1"

 echo "== inode indexes match after removing and syncing"
 rm -f "$T_D1/dir/conf*"
 t_sync_seq_index
-scoutfs walk-inodes meta_seq 0 -1 "$T_M0" > "$T_TMP.0"
-scoutfs walk-inodes meta_seq 0 -1 "$T_M1" > "$T_TMP.1"
+scoutfs walk-inodes -p "$T_M0" -- meta_seq 0 -1 > "$T_TMP.0"
+scoutfs walk-inodes -p "$T_M1" -- meta_seq 0 -1  > "$T_TMP.1"
 diff -u "$T_TMP.0" "$T_TMP.1"
-scoutfs walk-inodes data_seq 0 -1 "$T_M0" > "$T_TMP.0"
-scoutfs walk-inodes data_seq 0 -1 "$T_M1" > "$T_TMP.1"
+scoutfs walk-inodes -p "$T_M0" -- data_seq 0 -1 > "$T_TMP.0"
+scoutfs walk-inodes -p "$T_M1" -- data_seq 0 -1 > "$T_TMP.1"
 diff -u "$T_TMP.0" "$T_TMP.1"

 t_pass
@@ -30,7 +30,7 @@ echo "== create files and sync"
 dd if=/dev/zero of="$DIR/truncate" bs=4096 count=1 status=none
 dd if=/dev/zero of="$DIR/stage" bs=4096 count=1 status=none
 vers=$(scoutfs stat -s data_version "$DIR/stage")
-scoutfs release "$DIR/stage" $vers 0 1
+scoutfs release "$DIR/stage" -V $vers -o 0 -l 4K
 dd if=/dev/zero of="$DIR/release" bs=4096 count=1 status=none
 touch "$DIR/write_end"
 mkdir "$DIR"/{mknod_dir,link_dir,unlink_dir,symlink_dir,rename_dir}
@@ -41,9 +41,9 @@ sync; sync
 echo "== modify files" 
 truncate -s 0 "$DIR/truncate"
 vers=$(scoutfs stat -s data_version "$DIR/stage")
-scoutfs stage "$DIR/stage" $vers 0 4096 /dev/zero
+scoutfs stage /dev/zero "$DIR/stage" -V $vers -o 0 -l 4096
 vers=$(scoutfs stat -s data_version "$DIR/release")
-scoutfs release "$DIR/release" $vers 0 1
+scoutfs release "$DIR/release" -V $vers -o 0 -l 4K
 dd if=/dev/zero of="$DIR/write_end" bs=4096 count=1 status=none conv=notrunc
 touch $DIR/mknod_dir/mknod_file
 touch $DIR/link_dir/link_targ
@@ -50,7 +50,7 @@ for m in 0 1; do
 done
 wait
 CONF="$((SECONDS - START))"
-echo "conf: $IND" >> $T_TMP.log
+echo "conf: $CONF" >> $T_TMP.log

 if [ "$CONF" -gt "$((IND * 5))" ]; then
 	t_fail "conflicting $CONF secs is more than 5x independent $IND secs"
@@ -9,7 +9,7 @@ FILE="$T_D0/file"
 echo "== race writing and index walking"
 for i in $(seq 1 10); do
 	dd if=/dev/zero of="$FILE" bs=4K count=1 status=none conv=notrunc &
-	scoutfs walk-inodes data_seq 0 -1 "$T_M0" > /dev/null &
+	scoutfs walk-inodes -p "$T_M0" -- data_seq 0 -1  > /dev/null &
 	wait
 done

@@ -0,0 +1,169 @@
+#
+# test MOVE_BLOCKS ioctl, mostly basic error testing and functionality,
+# but a bit of expected use.
+#
+
+t_require_commands scoutfs dd
+
+FROM="$T_D0/from"
+TO="$T_D0/to"
+HARD="$T_D0/hardlink"
+OTHER="$T_TMP.other"
+
+BLOCKS=8
+BS=4096
+PART=123
+LEN=$(((BS * BLOCKS) + PART))
+PIECES=8
+
+regenerate_files() {
+	rm -f "$FROM"
+	rm -f "$TO"
+	dd if=/dev/urandom of="$FROM" bs=$LEN count=1 status=none
+	touch "$TO"
+}
+
+set_updated_fields() {
+	local arr="$1"
+	local path="$2"
+
+	eval $arr["ctime"]="$(stat -c '%Z' "$path")"
+	eval $arr["mtime"]="$(stat -c '%Y' "$path")"
+	eval $arr["data_version"]="$(scoutfs stat -s data_version "$path")"
+	eval $arr["meta_seq"]="$(scoutfs stat -s meta_seq "$path")"
+	eval $arr["data_seq"]="$(scoutfs stat -s data_seq "$path")"
+}
+
+#
+# Before moving extents, manually copy the byte regions so that we have
+# known-good file contents to compare against.  We know that the byte
+# regions are 4KB block aligned (with an allowance for a len that ends
+# at the from file's i_size).
+#
+move_and_compare() {
+	local from="$1"
+	local from_off="$2"
+	local from_blk="$((from_off / BS))"
+	local len="$3"
+	local blocks="$(((len + BS - 1) / BS))"
+	local to="$4"
+	local to_off="$5"
+	local to_blk="$((to_off / BS))"
+
+	local right_start=$((from_blk + blocks))
+	local from_size=$(stat -c "%s" "$from")
+	local from_blocks=$(( (from_size + BS - 1) / BS ))
+	local right_len=$((from_blocks - right_start))
+
+	# copying around instead of punching hole
+	dd if="$from" of="$from.expected" bs="$BS" \
+		skip=0 seek=0 count="$from_blk" \
+		status=none
+	dd if="$from" of="$from.expected" bs="$BS" \
+		skip="$right_start" seek="$right_start" count="$right_len" \
+		status=none conv=notrunc
+	# moving doesn't truncate, expect full size when no data
+	truncate -s "$from_size" "$from.expected"
+
+	cp "$to" "$to.expected"
+	dd if="$from" of="$to.expected" bs="$BS" \
+		skip="$from_blk" seek="$to_blk" count="$blocks" \
+		status=none conv=notrunc
+
+	scoutfs move-blocks "$from" -f "$from_off" -l "$len" "$to" -t "$to_off" \
+		2>&1 | t_filter_fs
+
+	cmp "$from" "$from.expected"
+	cmp "$to" "$to.expected"
+}
+
+echo "== build test files"
+regenerate_files
+touch "$OTHER"
+ln "$FROM" "$HARD"
+
+echo "== wrapped offsets should fail"
+HUGE=0x8000000000000000
+scoutfs move-blocks "$FROM" -f "$HUGE" -l "$HUGE" "$TO" -t 0 2>&1 | t_filter_fs
+scoutfs move-blocks "$FROM" -f 0 -l "$HUGE" "$TO" -t "$HUGE" 2>&1 | t_filter_fs
+
+echo "== specifying same file fails"
+scoutfs move-blocks "$FROM" -f 0 -l "$BS" "$HARD" -t 0 2>&1 | t_filter_fs
+
+echo "== specifying files in other file systems fails"
+scoutfs move-blocks "$OTHER" -f 0 -l "$BS" "$TO" -t 0 2>&1 | t_filter_fs
+
+echo "== offsets must be multiples of 4KB"
+scoutfs move-blocks "$FROM" -f 1 -l "$BS" "$TO" -t 0 2>&1 | t_filter_fs
+scoutfs move-blocks "$FROM" -f 0 -l 1 "$TO" -t 0 2>&1 | t_filter_fs
+scoutfs move-blocks "$FROM" -f 0 -l "$BS" "$TO" -t 1 2>&1 | t_filter_fs
+
+echo "== can't move onto existing extent"
+dd if=/dev/urandom of="$TO" bs=$BS count=1 status=none
+scoutfs move-blocks "$FROM" -f 0 -l "$BS" "$TO" -t 0 2>&1 | t_filter_fs
+
+echo "== can't move between files with offline extents"
+dd if=/dev/zero of="$TO" bs=$BS count=1 status=none
+vers=$(scoutfs stat -s data_version "$TO")
+scoutfs release "$TO" -V "$vers" -o 0 -l $BS
+scoutfs move-blocks "$FROM" -f 0 -l "$BS" "$TO" -t 0 2>&1 | t_filter_fs
+regenerate_files
+vers=$(scoutfs stat -s data_version "$FROM")
+scoutfs release "$FROM" -V "$vers" -o 0 -l $BS
+scoutfs move-blocks "$FROM" -f 0 -l "$BS" "$TO" -t 0 2>&1 | t_filter_fs
+regenerate_files
+
+echo "== basic moves work"
+move_and_compare "$FROM" 0 "$BS" "$TO" 0
+regenerate_files
+move_and_compare "$FROM" 0 "$BS" "$TO" "$BS"
+regenerate_files
+move_and_compare "$FROM" 0 "$LEN" "$TO" 0
+regenerate_files
+
+echo "== moving final partial block sets partial i_size"
+move_and_compare "$FROM" $((LEN - PART)) "$PART" "$TO" 0
+stat -c '%s' "$TO"
+regenerate_files
+
+echo "== moving updates inode fields"
+declare -A from_before from_after to_before to_after
+set_updated_fields from_before "$FROM"
+set_updated_fields to_before "$TO"
+t_quiet sync
+sleep 1
+move_and_compare "$FROM" 0 "$BS" "$TO" 0
+set_updated_fields from_after "$FROM"
+set_updated_fields to_after "$TO"
+for k in ${!from_after[@]}; do
+	if [ "${from_before[$k]}" == "${from_after[$k]}" ]; then
+		echo "move didn't change from $k ${from_before[$k]}"
+	fi
+	if [ "${to_before[$k]}" == "${to_after[$k]}" ]; then
+		echo "move didn't change to $k ${to_before[$k]}"
+	fi
+done
+regenerate_files
+
+echo "== moving blocks backwards works"
+cp "$FROM" "$FROM.orig"
+move_and_compare "$FROM" $((LEN - PART)) "$PART" "$TO" $((LEN - PART))
+for i in $(seq $((BLOCKS - 1)) -1 0); do
+	move_and_compare "$FROM" $((i * BS)) "$BS" "$TO" $((i * BS))
+done
+cmp "$TO" "$FROM.orig"
+regenerate_files
+
+echo "== combine many files into one"
+for i in $(seq 0 $((PIECES - 1))); do
+	dd if=/dev/urandom of="$FROM.$i" bs=$BS count=$BLOCKS status=none
+	cat "$FROM.$i" >> "$TO.large"
+	move_and_compare "$FROM.$i" 0 "$((BS * BLOCKS))" \
+		"$TO" $((i * BS * BLOCKS))
+done
+((i++))
+cat "$FROM" >> "$TO.large"
+move_and_compare "$FROM" 0 "$LEN" "$TO" $((i * BS * BLOCKS))
+cmp "$TO.large" "$TO"
+
+t_pass
@@ -24,7 +24,7 @@ expect_wait()
 		shift
 	done

-	scoutfs data-waiting 0 0 "$file" > $T_TMP.wait.output
+	scoutfs data-waiting -B 0 -I 0 -p "$file" > $T_TMP.wait.output
 	diff -u $T_TMP.wait.expected $T_TMP.wait.output
 }

@@ -37,9 +37,9 @@ ino=$(stat -c "%i" "$DIR/file")
 vers=$(scoutfs stat -s data_version "$DIR/file")

 echo "== waiter shows up in ioctl"
-echo "offline wating should be empty:"
-scoutfs data-waiting 0 0 "$DIR" | wc -l
-scoutfs release "$DIR/file" "$vers" 0 $BLOCKS
+echo "offline waiting should be empty:"
+scoutfs data-waiting -B 0 -I 0 -p "$DIR" | wc -l
+scoutfs release "$DIR/file" -V "$vers" -o 0 -l $BYTES
 cat "$DIR/file" > /dev/null &
 sleep .1
 echo "offline waiting should now have one known entry:"
@@ -58,13 +58,13 @@ echo "offline waiting now has two known entries:"
 expect_wait "$DIR/file" "read" $ino 0 $ino 1

 echo "== staging wakes everyone"
-scoutfs stage "$DIR/file" "$vers" 0 $BYTES "$DIR/golden"
+scoutfs stage "$DIR/golden" "$DIR/file" -V "$vers" -o 0 -l $BYTES
 sleep .1
-echo "offline wating should be empty again:"
-scoutfs data-waiting 0 0 "$DIR" | wc -l
+echo "offline waiting should be empty again:"
+scoutfs data-waiting -B 0 -I 0 -p "$DIR" | wc -l

 echo "== interruption does no harm"
-scoutfs release "$DIR/file" "$vers" 0 $BLOCKS
+scoutfs release "$DIR/file" -V "$vers" -o 0 -l $BYTES
 cat "$DIR/file" > /dev/null 2>&1 &
 pid="$!"
 sleep .1
@@ -74,7 +74,7 @@ kill "$pid"
 # silence terminated message
 wait "$pid" 2> /dev/null
 echo "offline waiting should be empty again:"
-scoutfs data-waiting 0 0 "$DIR" | wc -l
+scoutfs data-waiting -B 0 -I 0 -p "$DIR" | wc -l

 echo "== EIO injection for waiting readers works"
 ino=$(stat -c "%i" "$DIR/file")
@@ -86,23 +86,23 @@ dd if="$DIR/file" bs=$BS skip=1 of=/dev/null 2>&1 | \
 pid2="$!"
 sleep .1
 echo "offline waiting should now have two known entries:"
-scoutfs data-waiting 0 0 "$DIR" | wc -l
+scoutfs data-waiting -B 0 -I 0 -p "$DIR" | wc -l
 expect_wait "$DIR/file" "read" $ino 0 $ino 1
-scoutfs data-wait-err "$DIR" "$ino" "$vers" 0 $((BS*2)) read -5
+scoutfs data-wait-err -p "$DIR" -I "$ino" -V "$vers" -F 0 -C $((BS*2)) -O read -E -5
 sleep .1
 echo "offline waiting should now have 0 known entries:"
-scoutfs data-waiting 0 0 "$DIR" | wc -l
+scoutfs data-waiting -B 0 -I 0 -p "$DIR" | wc -l
 # silence terminated message
 wait "$pid" 2> /dev/null
 wait "$pid2" 2> /dev/null
 cat $T_TMP.cat1
 cat $T_TMP.cat2
 echo "offline waiting should be empty again:"
-scoutfs data-waiting 0 0 "$DIR" | wc -l
+scoutfs data-waiting -B 0 -I 0 -p "$DIR" | wc -l

 echo "== readahead while offline does no harm"
 xfs_io -c "fadvise -w 0 $BYTES" "$DIR/file"
-scoutfs stage "$DIR/file" "$vers" 0 $BYTES "$DIR/golden"
+scoutfs stage "$DIR/golden" "$DIR/file" -V "$vers" -o 0 -l $BYTES
 cmp "$DIR/file" "$DIR/golden"

 echo "== waiting on interesting blocks works"
@@ -113,65 +113,65 @@ for base in $(echo 0 $(($BLOCKS / 2)) $(($BLOCKS - 2))); do
 	done
 done
 for b in $blocks; do
-	scoutfs release "$DIR/file" "$vers" 0 $BLOCKS
+	scoutfs release "$DIR/file" -V "$vers" -o 0 -l $BYTES
 	dd if="$DIR/file" of=/dev/null \
 		status=none bs=$BS count=1 skip=$b 2> /dev/null &
 	sleep .1
-	scoutfs stage "$DIR/file" "$vers" 0 $BYTES "$DIR/golden"
+	scoutfs stage "$DIR/golden" "$DIR/file" -V "$vers" -o 0 -l $BYTES
 	sleep .1
 	echo "offline waiting is empty at block $b"
-	scoutfs data-waiting 0 0 "$DIR" | wc -l
+	scoutfs data-waiting -B 0 -I 0 -p "$DIR" | wc -l
 done

 echo "== contents match when staging blocks forward"
-scoutfs release "$DIR/file" "$vers" 0 $BLOCKS
+scoutfs release "$DIR/file" -V "$vers" -o 0 -l $BYTES
 cat "$DIR/file" > "$DIR/forward" &
 for b in $(seq 0 1 $((BLOCKS - 1))); do
 	dd if="$DIR/golden" of="$DIR/block" status=none bs=$BS skip=$b count=1
-	scoutfs stage "$DIR/file" "$vers" $((b * $BS)) $BS "$DIR/block"
+	scoutfs stage "$DIR/block" "$DIR/file" -V "$vers" -o $((b * $BS)) -l $BS
 done
 sleep .1
 cmp "$DIR/golden" "$DIR/forward"

 echo "== contents match when staging blocks backwards"
-scoutfs release "$DIR/file" "$vers" 0 $BLOCKS
+scoutfs release "$DIR/file" -V "$vers" -o 0 -l $BYTES
 cat "$DIR/file" > "$DIR/backward" &
 for b in $(seq $((BLOCKS - 1)) -1 0); do
 	dd if="$DIR/golden" of="$DIR/block" status=none bs=$BS skip=$b count=1
-	scoutfs stage "$DIR/file" "$vers" $((b * $BS)) $BS "$DIR/block"
+	scoutfs stage "$DIR/block" "$DIR/file" -V "$vers" -o $((b * $BS)) -l $BS
 done
 sleep .1
 cmp "$DIR/golden" "$DIR/backward"

 echo "== truncate to same size doesn't wait"
-scoutfs release "$DIR/file" "$vers" 0 $BLOCKS
+scoutfs release "$DIR/file" -V "$vers" -o 0 -l $BYTES
 truncate -s "$BYTES" "$DIR/file" &
 sleep .1
 echo "offline wating should be empty:"
-scoutfs data-waiting 0 0 "$DIR" | wc -l
+scoutfs data-waiting -B 0 -I 0 -p "$DIR" | wc -l

 echo "== truncating does wait"
 truncate -s "$BS" "$DIR/file" &
 sleep .1
 echo "truncate should be waiting for first block:"
 expect_wait "$DIR/file" "change_size" $ino 0
-scoutfs stage "$DIR/file" "$vers" 0 $BYTES "$DIR/golden"
+scoutfs stage "$DIR/golden" "$DIR/file" -V "$vers" -o 0 -l $BYTES
 sleep .1
 echo "trunate should no longer be waiting:"
-scoutfs data-waiting 0 0 "$DIR" | wc -l
+scoutfs data-waiting -B 0 -I 0 -p "$DIR" | wc -l
 cat "$DIR/golden" > "$DIR/file"
 vers=$(scoutfs stat -s data_version "$DIR/file")

 echo "== writing waits"
 dd if=/dev/urandom of="$DIR/other" bs=$BS count=$BLOCKS status=none
-scoutfs release "$DIR/file" "$vers" 0 $BLOCKS
+scoutfs release "$DIR/file" -V "$vers" -o 0 -l $BYTES
 # overwrite, not truncate+write
 dd if="$DIR/other" of="$DIR/file" \
 	bs=$BS count=$BLOCKS conv=notrunc status=none &
 sleep .1
 echo "should be waiting for write"
 expect_wait "$DIR/file" "write" $ino 0
-scoutfs stage "$DIR/file" "$vers" 0 $BYTES "$DIR/golden"
+scoutfs stage "$DIR/golden" "$DIR/file" -V "$vers" -o 0 -l $BYTES
 cmp "$DIR/file" "$DIR/other"

 echo "== cleanup"
@@ -8,63 +8,63 @@ FILE="$T_D0/file"

 echo "== 0 data_version arg fails"
 touch "$FILE"
-scoutfs setattr -d 0 -s 1 -f "$FILE" 2>&1 | t_filter_fs
+scoutfs setattr -V 0 -s 1 "$FILE" 2>&1 | t_filter_fs
 rm "$FILE"

 echo "== args must specify size and offline"
 touch "$FILE"
-scoutfs setattr -d 1 -o -s 0 -f "$FILE" 2>&1 | t_filter_fs
+scoutfs setattr -V 1 -o -s 0 "$FILE" 2>&1 | t_filter_fs
 rm "$FILE"

 echo "== only works on regular files"
 mkdir "$T_D0/dir"
-scoutfs setattr -d 1 -s 1 -f "$T_D0/dir" 2>&1 | t_filter_fs
+scoutfs setattr -V 1 -s 1 "$T_D0/dir" 2>&1 | t_filter_fs
 rmdir "$T_D0/dir"
 mknod "$T_D0/char" c 1 3
-scoutfs setattr -d 1 -s 1 -f "$T_D0/char" 2>&1 | t_filter_fs
+scoutfs setattr -V 1 -s 1 "$T_D0/char" 2>&1 | t_filter_fs
 rm "$T_D0/char"

 echo "== non-zero file size fails"
 echo contents > "$FILE"
-scoutfs setattr -d 1 -s 1 -f "$FILE" 2>&1 | t_filter_fs
+scoutfs setattr -V 1 -s 1 "$FILE" 2>&1 | t_filter_fs
 rm "$FILE"

 echo "== non-zero file data_version fails"
 touch "$FILE"
 truncate -s 1M "$FILE"
 truncate -s 0 "$FILE"
-scoutfs setattr -d 1 -o -s 1 -f "$FILE" 2>&1 | t_filter_fs
+scoutfs setattr -V 1 -o -s 1 "$FILE" 2>&1 | t_filter_fs
 rm "$FILE"

 echo "== large size is set"
 touch "$FILE"
-scoutfs setattr -d 1 -s 578437695752307201 -f "$FILE" 2>&1 | t_filter_fs
+scoutfs setattr -V 1 -s 578437695752307201 "$FILE" 2>&1 | t_filter_fs
 stat -c "%s" "$FILE"
 rm "$FILE"

 echo "== large data_version is set"
 touch "$FILE"
-scoutfs setattr -d 578437695752307201 -s 1 -f "$FILE" 2>&1 | t_filter_fs
+scoutfs setattr -V 578437695752307201 -s 1 "$FILE" 2>&1 | t_filter_fs
 scoutfs stat -s data_version "$FILE"
 rm "$FILE"

 echo "== large ctime is set"
 touch "$FILE"
 # only doing 32bit sec 'cause stat gets confused
-scoutfs setattr -c 67305985.999999999 -d 1 -s 1 -f "$FILE" 2>&1 | t_filter_fs
+scoutfs setattr -t 67305985.999999999 -V 1 -s 1 "$FILE" 2>&1 | t_filter_fs
 TZ=GMT stat -c "%z" "$FILE"
 rm "$FILE"

 echo "== large offline extents are created"
 touch "$FILE"
-scoutfs setattr -d 1 -o -s $((10007 * 4096)) -f "$FILE" 2>&1 | t_filter_fs
+scoutfs setattr -V 1 -o -s $((10007 * 4096)) "$FILE" 2>&1 | t_filter_fs
 filefrag -v -b4096 "$FILE" 2>&1 | t_filter_fs
 rm "$FILE"

 # had a bug where we were creating extents that were too long
 echo "== correct offline extent length"
 touch "$FILE"
-scoutfs setattr -d 1 -o -s 4000000000 -f "$FILE" 2>&1 | t_filter_fs
+scoutfs setattr -V 1 -o -s 4000000000 "$FILE" 2>&1 | t_filter_fs
 scoutfs stat -s offline_blocks "$FILE"
 rm "$FILE"

@@ -14,7 +14,7 @@ query_index() {
 	local first="${2:-0}"
 	local last="${3:--1}"

-	scoutfs walk-inodes $which $first $last "$T_M0"
+	scoutfs walk-inodes -p "$T_M0" -- $which $first $last
 }

 # print the major in the index for the ino if it's found
@@ -22,7 +22,7 @@ ino_major() {
 	local which="$1"
 	local ino="$2"

-	scoutfs walk-inodes $which 0 -1 "$T_M0" | \
+	scoutfs walk-inodes -p "$T_M0" -- $which 0 -1 | \
 		awk '($4 == "'$ino'") {print $2}'
 }

@@ -23,14 +23,14 @@ create_file() {
 release_vers() {
 	local file="$1"
 	local vers="$2"
-	local block="$3"
-	local count="$4"
+	local offset="$3"
+	local length="$4"

 	if [ "$vers" == "stat" ]; then
 		vers=$(scoutfs stat -s data_version "$file")
 	fi

-	scoutfs release "$file" "$vers" "$block" "$count"
+	scoutfs release "$file" -V "$vers" -o "$offset" -l "$length"
 }

 FILE="$T_D0/file"
@@ -38,41 +38,41 @@ CHAR="$FILE-char"

 echo "== simple whole file multi-block releasing"
 create_file "$FILE" 65536
-release_vers "$FILE" stat 0 16
+release_vers "$FILE" stat 0 64K
 rm "$FILE"

 echo "== release last block that straddles i_size"
 create_file "$FILE" 6144
-release_vers "$FILE" stat 1 1
+release_vers "$FILE" stat 4K 4K
 rm "$FILE"

 echo "== release entire file past i_size"
 create_file "$FILE" 8192
-release_vers "$FILE" stat 0 100
+release_vers "$FILE" stat 0 400K
 # not deleting for the following little tests

 echo "== releasing offline extents is fine"
-release_vers "$FILE" stat 0 100
+release_vers "$FILE" stat 0 400K

 echo "== 0 count is fine"
 release_vers "$FILE" stat 0 0

 echo "== release past i_size is fine"
-release_vers "$FILE" stat 100 1
+release_vers "$FILE" stat 400K 4K

 echo "== wrapped blocks fails"
 release_vers "$FILE" stat $vers 0x8000000000000000 0x8000000000000000

 echo "== releasing non-file fails"
 mknod "$CHAR" c 1 3
-release_vers "$CHAR" stat 0 1 2>&1 | t_filter_fs
+release_vers "$CHAR" stat 0 4K 2>&1 | t_filter_fs
 rm "$CHAR"

 echo "== releasing a non-scoutfs file fails"
-release_vers "/dev/null" stat 0 1
+release_vers "/dev/null" stat 0 4K

 echo "== releasing bad version fails"
-release_vers "$FILE" 0 0 1
+release_vers "$FILE" 0 0 4K

 rm "$FILE"

@@ -108,9 +108,9 @@ for c in $(seq 0 4); do
 	start=$(fiemap_file "$FILE" | \
 		awk '($1 == "0:"){print substr($4, 0, length($4)- 2)}')

-	release_vers "$FILE" stat $a 1
-	release_vers "$FILE" stat $b 1
-	release_vers "$FILE" stat $c 1
+	release_vers "$FILE" stat $(($a * 4))K 4K
+	release_vers "$FILE" stat $(($b * 4))K 4K
+	release_vers "$FILE" stat $(($c * 4))K 4K

 	echo -n "$a $b $c:"

@@ -29,14 +29,14 @@ create_file() {
 release_vers() {
 	local file="$1"
 	local vers="$2"
-	local block="$3"
-	local count="$4"
+	local offset="$3"
+	local length="$4"

 	if [ "$vers" == "stat" ]; then
 		vers=$(scoutfs stat -s data_version "$file")
 	fi

-	scoutfs release "$file" "$vers" "$block" "$count"
+	scoutfs release "$file" -V "$vers" -o "$offset" -l "$length"
 }

 # if vers is "stat" then we ask stat_more for the data_version
@@ -44,14 +44,14 @@ stage_vers() {
 	local file="$1"
 	local vers="$2"
 	local offset="$3"
-	local count="$4"
+	local length="$4"
 	local contents="$5"

 	if [ "$vers" == "stat" ]; then
 		vers=$(scoutfs stat -s data_version "$file")
 	fi

-	scoutfs stage "$file" "$vers" "$offset" "$count" "$contents"
+	scoutfs stage "$contents" "$file" -V "$vers" -o "$offset" -l "$length"
 }

 FILE="$T_D0/file"
@@ -60,7 +60,7 @@ CHAR="$FILE-char"
 echo "== create/release/stage single block file"
 create_file "$FILE" 4096
 cp "$FILE"  "$T_TMP"
-release_vers "$FILE" stat 0 1
+release_vers "$FILE" stat 0 4K
 # make sure there only offline extents
 fiemap_file "$FILE" | grep "^[ 0-9]*:" | grep -v "unknown"
 stage_vers "$FILE" stat 0 4096 "$T_TMP"
@@ -70,7 +70,7 @@ rm -f "$FILE"
 echo "== create/release/stage larger file"
 create_file "$FILE" $((4096 * 4096))
 cp "$FILE"  "$T_TMP"
-release_vers "$FILE" stat 0 4096
+release_vers "$FILE" stat 0 16M
 # make sure there only offline extents
 fiemap_file "$FILE" | grep "^[ 0-9]*:" | grep -v "unknown"
 stage_vers "$FILE" stat 0 $((4096 * 4096)) "$T_TMP"
@@ -83,7 +83,7 @@ cp "$FILE"  "$T_TMP"
 nr=1
 while [ "$nr" -lt 10 ]; do
 	echo "attempt $nr" >> $seqres.full 2>&1
-	release_vers "$FILE" stat 0 1024
+	release_vers "$FILE" stat 0 4096K
 	sync
 	echo 3 > /proc/sys/vm/drop_caches
 	stage_vers "$FILE" stat 0 $((4096 * 1024)) "$T_TMP"
@@ -100,7 +100,7 @@ sync
 stat "$FILE" > "$T_TMP.before"
 scoutfs stat -s data_seq "$FILE" >> "$T_TMP.before"
 scoutfs stat -s data_version "$FILE" >> "$T_TMP.before"
-release_vers "$FILE" stat 0 1
+release_vers "$FILE" stat 0 4K
 stage_vers "$FILE" stat 0 4096 "$T_TMP"
 stat "$FILE" > "$T_TMP.after"
 scoutfs stat -s data_seq "$FILE" >> "$T_TMP.after"
@@ -110,7 +110,7 @@ rm -f "$FILE"

 echo "== stage does change meta_seq"
 create_file "$FILE" 4096
-release_vers "$FILE" stat 0 1
+release_vers "$FILE" stat 0 4K
 sync
 before=$(scoutfs stat -s meta_seq "$FILE")
 stage_vers "$FILE" stat 0 4096 "$T_TMP"
@@ -121,7 +121,7 @@ rm -f "$FILE"
 # XXX this now waits, demand staging should be own test
 #echo "== can't write to offline"
 #create_file "$FILE" 4096
-#release_vers "$FILE" stat 0 1
+#release_vers "$FILE" stat 0 4K
 ## make sure there only offline extents
 #fiemap_file "$FILE" | grep "^[ 0-9]*:" | grep -v "unknown"
 #dd if=/dev/zero of="$FILE" conv=notrunc bs=4096 count=1  2>&1 | t_filter_fs
@@ -144,13 +144,13 @@ rm -f "$FILE"

 echo "== wrapped region fails"
 create_file "$FILE" 4096
-stage_vers "$FILE" stat 0xFFFFFFFFFFFFFFFF 4096 /dev/zero
+stage_vers "$FILE" stat 0xFFFFFFFFFFFFF000 4096 /dev/zero
 rm -f "$FILE"

 echo "== non-block aligned offset fails"
 create_file "$FILE" 4096
 cp "$FILE"  "$T_TMP"
-release_vers "$FILE" stat 0 1
+release_vers "$FILE" stat 0 4K
 stage_vers "$FILE" stat 1 4095 "$T_TMP"
 fiemap_file "$FILE" | grep "^[ 0-9]*:" | grep -v "unknown"
 rm -f "$FILE"
@@ -158,7 +158,7 @@ rm -f "$FILE"
 echo "== non-block aligned len within block fails"
 create_file "$FILE" 4096
 cp "$FILE"  "$T_TMP"
-release_vers "$FILE" stat 0 1
+release_vers "$FILE" stat 0 4K
 stage_vers "$FILE" stat 0 1024 "$T_TMP"
 fiemap_file "$FILE" | grep "^[ 0-9]*:" | grep -v "unknown"
 rm -f "$FILE"
@@ -166,14 +166,14 @@ rm -f "$FILE"
 echo "== partial final block that writes to i_size does work"
 create_file "$FILE" 2048
 cp "$FILE"  "$T_TMP"
-release_vers "$FILE" stat 0 1
+release_vers "$FILE" stat 0 4K
 stage_vers "$FILE" stat 0 2048 "$T_TMP"
 cmp "$FILE" "$T_TMP"
 rm -f "$FILE"

 echo "== zero length stage doesn't bring blocks online"
 create_file "$FILE" $((4096 * 100))
-release_vers "$FILE" stat 0 100
+release_vers "$FILE" stat 0 400K
 stage_vers "$FILE" stat 4096 0 /dev/zero
 fiemap_file "$FILE" | grep "^[ 0-9]*:" | grep -v "unknown"
 rm -f "$FILE"
@@ -188,7 +188,7 @@ rm -f "$FILE"
 #create_file "$FILE" 4096
 #cp "$FILE"  "$T_TMP"
 #sync
-#release_vers "$FILE" stat 0 1
+#release_vers "$FILE" stat 0 4K
 #md5sum "$FILE" 2>&1 | t_filter_fs
 #stage_vers "$FILE" stat 0 4096 "$T_TMP"
 #cmp "$FILE" "$T_TMP"
@@ -17,7 +17,7 @@ diff_srch_find()
 	local n="$1"

 	sync
-	scoutfs search-xattrs -n "$n" -f "$T_M0" > "$T_TMP.srch"
+	scoutfs search-xattrs "$n" -p "$T_M0" > "$T_TMP.srch"
 	find_xattrs -d "$T_D0" -m "$T_M0" -n "$n" > "$T_TMP.find"

 	diff -u "$T_TMP.srch" "$T_TMP.find"
@@ -0,0 +1,69 @@
+#
+# Stage a large file in multiple parts and have a reader read it while
+# it's being staged.  This has found problems with extent access
+# locking.
+#
+
+t_require_commands scoutfs perl cmp rm
+
+FILE_BYTES=$((4 * 1024 * 1024 * 1024))
+FILE_BLOCKS=$((FILE_BYTES / 4096))
+FRAG_BYTES=$((128 * 1024 * 1024))
+FRAG_BLOCKS=$((FRAG_BYTES / 4096))
+NR_FRAGS=$((FILE_BLOCKS / FRAG_BLOCKS))
+
+#
+# high bandwidth way to generate file contents with predictable
+# contents.  We use ascii lines with the block identity, padded to 4KB
+# with spaces.
+#
+# $1 is number of 4k blocks to write, and each block gets its block
+# number in the line.  $2, $3, and $4 are fields that are put in every
+# block.
+#
+gen() {	# each line: 108 bytes of labelled fields + 3987 spaces + newline = 4096 bytes
+	perl -e 'for (my $i = 0; $i < '$1'; $i++) { printf("mount %020u process %020u file %020u blkno %020u%s\n", '$2', '$3', '$4', $i, " " x 3987); }'
+}
+
+release_file() {	# release the first FILE_BYTES of the file at $1
+        local path="$1"
+        local vers=$(scoutfs stat -s data_version "$path")	# current data_version, passed via -V
+
+        scoutfs release "$path" -V "$vers" -o 0 -l $FILE_BYTES
+}
+
+stage_file() {	# stage the file at $1 in NR_FRAGS pieces of FRAG_BYTES each
+        local path="$1"
+        local vers=$(scoutfs stat -s data_version "$path")	# sampled once, reused for every fragment
+        local off=0	# byte offset of the next fragment to stage
+
+	for a in $(seq 1 $NR_FRAGS); do
+		scoutfs stage <(gen $FRAG_BLOCKS $a $a $a) "$path" -V "$vers" \
+			-o $off -l $FRAG_BYTES
+		((off+=$FRAG_BYTES))
+	done
+}
+
+FILE="$T_D0/file"
+
+whole_file() {	# emit all NR_FRAGS fragments in order, same gen arguments as stage_file
+	for a in $(seq 1 $NR_FRAGS); do
+		gen $FRAG_BLOCKS $a $a $a
+	done
+}
+
+#
+# just one pass through the file.
+#
+
+whole_file > "$FILE"
+release_file "$FILE"
+
+cmp "$FILE" <(whole_file) &
+pid=$!
+
+stage_file "$FILE"
+
+wait $pid || t_fail "comparison failed"
+
+t_pass
@@ -15,7 +15,7 @@ release_file() {
 	local vers=$(scoutfs stat -s data_version "$path")

 	echo "releasing $path" >> "$T_TMP.log"
-	scoutfs release "$path" "$vers" 0 $BLOCKS
+	scoutfs release "$path" -V "$vers" -o 0 -l $BYTES
 	echo "released $path" >> "$T_TMP.log"
 }

@@ -24,8 +24,8 @@ stage_file() {
 	local vers=$(scoutfs stat -s data_version "$path")

 	echo "staging $path" >> "$T_TMP.log"
-	scoutfs stage "$path" "$vers" 0 $BYTES \
-		"$DIR/good/$(basename $path)"
+	scoutfs stage "$DIR/good/$(basename $path)" "$path" -V "$vers" -o 0 -l $BYTES
+
 	echo "staged $path" >> "$T_TMP.log"
 }

@@ -83,7 +83,7 @@ generic/375	# utils output change?  update branch?
 EOF

 t_restore_output
-echo "(showing output of xfstests)"
+echo "  (showing output of xfstests)"

 args="-E local.exclude ${T_XFSTESTS_ARGS:--g quick}"
 ./check $args
@@ -1,25 +1,12 @@
 #
-# The userspace utils and kernel module share definitions of physical
-# structures and ioctls.  If we're in the repo we include the kmod
-# headers directly, and hash them directly to calculate the format hash.
-#
 # If we're creating a standalone tarball for distribution we copy the
 # headers out of the kmod dir into the tarball.  And then when we're
 # building in that tarball we use the headers in src/ directly.
 #
 FMTIOC_H := format.h ioctl.h
-FMTIOC_DIST := $(addprefix src/,$(FMTIOC_H))
 FMTIOC_KMOD := $(addprefix ../kmod/src/,$(FMTIOC_H))

-ifneq ($(wildcard $(firstword $(FMTIOC_KMOD))),)
-HASH_FILES := $(FMTIOC_KMOD)
-else
-HASH_FILES := $(FMTIOC_DIST)
-endif
-SCOUTFS_FORMAT_HASH := $(shell cat $(HASH_FILES) | md5sum | cut -b1-16)
-
 CFLAGS := -Wall -O2 -Werror -D_FILE_OFFSET_BITS=64 -g -msse4.2 \
-	-Wpadded \
 	-fno-strict-aliasing \
 	-DSCOUTFS_FORMAT_HASH=0x$(SCOUTFS_FORMAT_HASH)LLU

@@ -47,7 +34,7 @@ endif

 $(BIN): $(OBJ)
 	$(QU)  [BIN $@]
-	$(VE)gcc -o $@ $^ -luuid -lm -lcrypto
+	$(VE)gcc -o $@ $^ -luuid -lm -lcrypto -lblkid

 %.o %.d: %.c Makefile sparse.sh
 	$(QU)  [CC $<]
@@ -3,51 +3,302 @@
 scoutfs \- scoutfs management utility
 .SH DESCRIPTION
 The
-.b
-scoutfs
-utility provides commands to manage a scoutfs filesystem.
+.B scoutfs
+utility provides commands to create and manage a ScoutFS filesystem.
 .SH COMMANDS
+
+Note: Commands taking the
+.B --path
+option will, when the option is omitted, fall back to using the value of the
+.I SCOUTFS_MOUNT_PATH
+environment variable. If that variable is also absent the current working
+directory will be used.
+
 .TP
-.BI "counters [\-t\] <sysfs topdir>"
+.BI "df [-h|--human-readable] [-p|--path PATH]"
 .sp
-Displays the counters and their values for a mounted scoutfs filesystem.
-Each counter and its value are printed on a line to stdout with
-sufficient spaces seperating the name and value to align the values
-after
+Display available and used space on the ScoutFS data and metadata devices.
 .RS 1.0i
 .PD 0
 .TP
 .sp
-.B "\-t"
-Format the counters into a table that fills the display instead of
-printing one counter per line.  The names and values are padded to
-create columns that fill the current width of the terminal.
+.B "-h, --human-readable"
+Output sizes in human-readable size units (e.g. 500G, 1.2P) rather than number
+of ScoutFS allocation blocks.
 .TP
-.B "sysfs topdir"
-Specify the mount's sysfs directory in which to find the
-.B counters/
-directory when then contains files for each counter.
-The sysfs directory is typically
-of the form
-.I /sys/fs/scoutfs/f.<fsid>.r.<rid>/
-\&.
+.B "-p, --path PATH"
+A path within a ScoutFS filesystem.
 .RE
 .PD

 .TP
-.BI "data-waiting <ino> <iblock> <path>"
+.BI "mkfs META-DEVICE DATA-DEVICE {-Q|--quorum-count} NUM [-m|--max-meta-size SIZE] [-d|--max-data-size SIZE] [-f|--force]"
 .sp
-Displays all the files and blocks for which there is a task blocked waiting on
+Initialize a new ScoutFS filesystem on the target devices. Since ScoutFS uses
+separate block devices for its metadata and data storage, two are required.
+.sp
+If the
+.B --force
+option is not given, mkfs will check for existing filesystem signatures. It is
+recommended to use
+.B wipefs(8)
+to remove non-ScoutFS filesystem signatures before proceeding, and
+.B --force
+to overwrite a previous ScoutFS filesystem.
+.RS 1.0i
+.PD 0
+.TP
+.sp
+.B META-DEVICE
+The path to the block device to be used for ScoutFS metadata.  If possible, use
+a faster block device for the metadata device.
+.TP
+.B DATA-DEVICE
+The path to the block device to be used for ScoutFS file data.  If possible, use
+a larger block device for the data device.
+.TP
+.B "-Q, --quorum-count NUM"
+The number of mounts needed to reach quorum and elect one
+to be the server.  Mounts of the filesystem will hang until a quorum of
+mounts are operational.
+.sp
+Mounts with the
+.B server_addr
+mount option participate in quorum.  The safest quorum number is the
+smallest majority of an odd number of participating mounts.  For
+example,
+two out of three total mounts.  This ensures that there can only be one
+set of mounts that can establish quorum.
+.TP
+.B "-m, --max-meta-size SIZE"
+Limit the space used by ScoutFS on the metadata device to the
+given size, rather than using the entire block device. Size is given as
+an integer followed by a units digit: "K", "M", "G", "T", "P", to denote
+kibibytes, mebibytes, etc.
+.TP
+.B "-d, --max-data-size SIZE"
+Same as previous, but for limiting the size of the data device.
+.TP
+.B "-f, --force"
+Ignore presence of existing data on the data and metadata devices.
+.RE
+.PD
+
+.TP
+.BI "stat FILE [-s|--single-field FIELD-NAME]"
+.sp
+Display ScoutFS-specific metadata fields for the given file.
+.RS 1.0i
+.PD 0
+.TP
+.sp
+.B "FILE"
+Path to the file.
+.TP
+.B "-s, --single-field FIELD-NAME"
+Only output a single field's value instead of the default: all the stats with
+one stat per line.
+.sp
+.TP
+.RE
+.PD
+The fields are:
+.RS 1.0i
+.PD 0
+.TP
+.B "meta_seq"
+The metadata change sequence.  This changes each time the inode's metadata
+is changed.
+.TP
+.B "data_seq"
+The data change sequence.  This changes each time the inode's data
+is changed.
+.TP
+.B "data_version"
+The data version changes every time the contents of the file changes,
+or the file grows or shrinks.
+.TP
+.B "online_blocks"
+The number of 4KB data blocks that contain data and can be read.
+.TP
+.B "offline_blocks"
+The number of 4KB data blocks that are offline and would need to be
+staged to be read.
+.RE
+.PD
+
+.TP
+.BI "statfs [-s|--single-field FIELD-NAME] [-p|--path PATH]"
+.sp
+Display ScoutFS-specific filesystem-wide metadata fields.
+.RS 1.0i
+.PD 0
+.TP
+.sp
+.B "-s, --single-field FIELD-NAME"
+Only output a single stat instead of all the stats with one stat per
+line.  The possible stat names are those given in the output.
+.TP
+.B "-p, --path PATH"
+A path within a ScoutFS filesystem.
+.sp
+.TP
+.RE
+.PD
+The fields are:
+.RS 1.0i
+.PD 0
+.TP
+.B "fsid"
+The unique 64bit filesystem identifier for this filesystem.
+.TP
+.B "rid"
+The unique 64bit random identifier for this mount of the filesystem.
+This is generated for every new mount of the file system.
+.TP
+.B "committed_seq"
+All seqs up to and including this seq have been
+committed.  Can be compared with meta_seq and data_seq from inodes in
+.B stat
+to discover if changes to a file have been committed to disk.
+.TP
+.B "total_meta_blocks"
+The total number of 64K metadata blocks in the filesystem.
+.TP
+.B "total_data_blocks"
+The total number of 4K data blocks in the filesystem.
+.RE
+.PD
+
+.TP
+.BI "counters [-t|--table] SYSFS-DIR"
+.sp
+Display the counters and their values for a mounted ScoutFS filesystem.
+.RS 1.0i
+.PD 0
+.sp
+.TP
+.B SYSFS-DIR
+The mount's sysfs directory in which to find the
+.B counters/
+directory which then contains files for each counter.
+The sysfs directory is
+of the form
+.I /sys/fs/scoutfs/f.<fsid>.r.<rid>/
+\&.
+.TP
+.B "-t, --table"
+Format the counters into a columnar table that fills the width of the display
+instead of printing one counter per line.
+.RE
+.PD
+
+.TP
+.BI "search-xattrs XATTR-NAME [-p|--path PATH]"
+.sp
+Display the inode numbers of inodes in the filesystem which may have
+an extended attribute with the given name.
+.sp
+The results may contain false positives.  The returned inode numbers
+should be checked to verify that the extended attribute is in fact
+present on the inode.
+.RS 1.0i
+.PD 0
+.TP
+.sp
+.B XATTR-NAME
+The full name of the extended attribute to search for as
+described in the
+.BR xattr (7)
+manual page.
+.TP
+.B "-p|--path PATH"
+A path within a ScoutFS filesystem.
+.RE
+.PD
+
+.TP
+.BI "list-hidden-xattrs FILE"
+.sp
+Display extended attributes starting with the
+.BR scoutfs.
+prefix and containing the
+.BR hide.
+tag
+which makes them invisible to
+.BR listxattr (2) .
+The names of each attribute are output, one per line.  Their order
+is not specified.
+.RS 1.0i
+.PD 0
+.TP
+.sp
+.B "FILE"
+The path to a file within a ScoutFS filesystem.  File permissions must allow
+reading.
+.RE
+.PD
+
+.TP
+.BI "walk-inodes {meta_seq|data_seq} FIRST-INODE LAST-INODE [-p|--path PATH]"
+.sp
+Walk an inode index in the file system and output the inode numbers
+that are found between the first and last positions in the index.
+.RS 1.0i
+.PD 0
+.sp
+.TP
+.BR meta_seq , data_seq
+Which index to walk.
+.TP
+.B "FIRST-INODE"
+An integer index value giving starting position of the index walk.
+.I 0
+is the first possible position.
+.TP
+.B "LAST-INODE"
+An integer index value giving the last position to include in the index walk.
+.I \-1
+can be given to indicate the last possible position.
+.TP
+.B "-p|--path PATH"
+A path within a ScoutFS filesystem.
+.RE
+.PD
+
+.TP
+.BI "ino-path INODE-NUM [-p|--path PATH]"
+.sp
+Display all paths that reference an inode number.
+.sp
+Ongoing filesystem changes, such as renaming a common parent of multiple paths,
+can cause displayed paths to be inconsistent.
+.RS 1.0i
+.PD 0
+.sp
+.TP
+.B "INODE-NUM"
+The inode number of the target inode.
+.TP
+.B "-p|--path PATH"
+A path within a ScoutFS filesystem.
+.RE
+.PD
+
+.TP
+.BI "data-waiting {-I|--inode} INODE-NUM {-B|--block} BLOCK-NUM [-p|--path PATH]"
+.sp
+Display all the files and blocks for which there is a task blocked waiting on
 offline data.
 .sp
 The results are sorted by the file's inode number and the
 logical block offset that is being waited on.
 .sp
-Each line of output specifies a block in a file that has a task waiting
+Each line of output describes a block in a file that has a task waiting
 and is formatted as:
 .I "ino <nr> iblock <nr> ops [str]"
 \&. The ops string indicates blocked operations seperated by commas and can
-include 
+include
 .B read
 for a read operation,
 .B write
@@ -58,156 +309,151 @@ for a truncate or extending write.
 .PD 0
 .sp
 .TP
-.B "ino"
+.B "-I, --inode INODE-NUM"
 Start iterating over waiting tasks from the given inode number.
-Specifying 0 will show all waiting tasks.
+Value of 0 will show all waiting tasks.
 .TP
-.B "iblock"
+.B "-B, --block BLOCK-NUM"
 Start iterating over waiting tasks from the given logical block number
-in the starting inode.  Specifying 0 will show blocks in the first inode
+in the starting inode.  Value of 0 will show blocks in the first inode
 and then continue to show all blocks with tasks waiting in all the
 remaining inodes.
 .TP
+.B "-p, --path PATH"
+A path within a ScoutFS filesystem.
+.RE
+.PD
+
+.TP
+.BI "data-wait-err {-I|--inode} INODE-NUM {-V|--version} VER-NUM {-F|--offset} OFF-NUM {-C|--count} COUNT {-O|--op} OP {-E|--err} ERR [-p|--path PATH]"
+.sp
+Return error from matching waiters.
+.RS 1.0i
+.PD 0
+.sp
+.TP
+.B "-C, --count COUNT"
+Count.
+.TP
+.B "-E, --err ERR"
+Error.
+.TP
+.B "-F, --offset OFF-NUM"
+Offset. May be expressed in bytes, or with KMGTP (Kibi, Mebi, etc.) size
+suffixes.
+.TP
+.B "-I, --inode INODE-NUM"
+Inode number.
+.TP
+.B "-O, --op OP"
+Operation. One of: "read", "write", "change_size".
+.TP
+.B "-p, --path PATH"
+A path within a ScoutFS filesystem.
+.RE
+.PD
+
+.TP
+.BI "stage ARCHIVE-FILE FILE {-V|--version} VERSION [-o, --offset OFF-NUM] [-l, --length LENGTH]"
+.sp
+.B Stage
+(i.e. return to online) the previously-offline contents of a file by copying a
+region from another file, the archive, and without updating regular inode
+metadata.  Any operations that are blocked by the existence of an offline
+region will proceed once the region has been staged.
+.RS 1.0i
+.PD 0
+.TP
+.sp
+.B "ARCHIVE-FILE"
+The source file for the file contents being staged.
+.TP
+.B "FILE"
+The regular file whose contents will be staged.
+.TP
+.B "-V, --version VERSION"
+The data_version of the contents to be staged.  It must match the
+current data_version of the file.
+.TP
+.B "-o, --offset OFF-NUM"
+The starting byte offset of the region to write.  May be expressed in bytes, or with
+KMGTP (Kibi, Mebi, etc.) size suffixes. Default is 0.
+.TP
+.B "-l, --length LENGTH"
+Length of range (bytes or KMGTP units) of file to stage. Default is the file's
+total size.
+.RE
+.PD
+
+.TP
+.BI "release FILE {-V|--version} VERSION [-o, --offset OFF-NUM] [-l, --length LENGTH]"
+.sp
+.B Release
+the given region of the file.  That is, remove the region's backing data and
+leave an offline data region. Future attempts to read or write the offline
+region will block until the region is restored by a
+.B stage
+write.  This is used by userspace archive managers to free data space in the
+ScoutFS filesystem once the file data has been archived.
+.sp
+Note: This only works on regular files with write permission.  Releasing regions
+that are already offline or sparse, including regions extending past the end of
+the file, will silently succeed.
+.RS 1.0i
+.PD 0
+.TP
+.sp
 .B "path"
-A path to any inode in the target filesystem, typically the root
-directory.
+The path to the regular file whose region will be released.
+.TP
+.B "-V, --version VERSION"
+The data_version of the contents to be released.  It must match the current
+data_version of the file. This ensures that a release operation is truncating
+the same version of the data that was archived. (Use the
+.BI "stat"
+subcommand to obtain data version for a file.)
+.TP
+.B "-o, --offset OFF-NUM"
+The starting byte offset of the region to release.  May be expressed in bytes, or with
+KMGTP (Kibi, Mebi, etc.) size suffixes. Default is 0.
+.TP
+.B "-l, --length LENGTH"
+Length of range (bytes or KMGTP units) of file to release. Default is the file's
+total size.
 .RE
 .PD

 .TP
-.BI "find-xattrs <\-n\ name> <\-f path>"
+.BI "setattr FILE [-V, --data-version=VERSION [-s, --size=SIZE [-o, --offline]]] [-t, --ctime=TIMESPEC]"
 .sp
-Displays the inode numbers of inodes in the filesystem which may have
-an extended attribute with the given name.
-.sp
-The results may contain false positives.  The returned inode numbers
-should be checked to verify that the extended attribute is in fact
-present on the inode.
-.RS 1.0i
-.PD 0
-.TP
-.sp
-.B "-n name"
-Specifies the full name of the extended attribute to search for as
-described in the
-.BR xattr (7)
-manual page.
-.TP
-.B "-f path"
-Specifies the path to any inode in the filesystem to search. 
-.RE
-.PD
-
-.TP
-.BI "ino-path <ino> <path>"
-.sp
-Displays all the paths to links to the given inode number.
-.sp
-All the relative paths from the root directory to each link of the
-target inode are output, one result per line.  Each output path is
-guaranteed to have been a valid path to a link at some point in the
-past.  An individual path won't be corrupted by a rename that occurs
-during the search.  The set of paths can be modified while the search is
-running.  A rename of a parent directory of all the paths, for example,
-can result in output where the parent directory name component changes
-in the middle of outputting all the paths.
+Set ScoutFS-specific attributes on a newly created zero-length file.
 .RS 1.0i
 .PD 0
 .sp
 .TP
-.B "ino"
-The inode number of the target inode to resolve.
+.B "-V, --data-version=VERSION"
+Set data version.
 .TP
-.B "path"
-A path to any inode in the target filesystem, typically the root
-directory.
+.B "-o, --offline"
+Set file contents as offline, not sparse. Requires
+.I --size
+option also be present.
+.TP
+.B "-s, --size=SIZE"
+Set file size. May be expressed in bytes, or with
+KMGTP (Kibi, Mebi, etc.) size suffixes. Requires
+.I --data-version
+option also be present.
+.TP
+.B "-t, --ctime=TIMESPEC"
+Set the inode change time (ctime) using
+.I "<seconds-since-epoch>.<nanoseconds>"
+format.
 .RE
 .PD

 .TP
-.BI "listxattr-hidden <\-f path>"
-.sp
-Displays all the extended attributes starting with the
-.BR scoutfs.
-prefix and which contain the
-.BR hide.
-tag
-which makes them invisible to 
-.BR listxattr (2)
-\&.
-The names of each attribute are output, one name per line.  Their order
-is determined by internal indexing implementation details and should not
-be relied on.
-.RS 1.0i
-.PD 0
-.TP
-.sp
-.B "-f path"
-The path to the file whose extended attributes will be listed.  The
-user must have read permission to the inode.
-.RE
-.PD
-
-.TP
-.BI "mkfs <\-Q nr> <meta_dev_path> <data_dev_path> [-M meta_size] [-D data_size]"
-.sp
-Initialize a new empty filesystem in the target devices by writing empty
-structures and a new superblock. Since ScoutFS uses separate block
-devices for its metadata and data storage, both must be given.
-.sp
-This 
-.B unconditionally destroys
-the contents of the devices, regardless of what they contain or who may be
-using them.  It simply writes new data structures into known offsets.
-.B Be very careful that the devices do not contain data and are not actively in use.
-.RS 1.0i
-.PD 0
-.TP
-.sp
-.B "-Q nr"
-Specify the number of mounts needed to reach quorum and elect a mount
-to start the server.  Mounts of the filesystem will hang until this many
-mounts are operational and can elect a server amongst themselves.
-.sp
-Mounts with the 
-.B server_addr
-mount option participate in quorum.  The safest quorum number is the
-smallest majority of an odd number of participating mounts.  For
-example,
-two out of three total mounts.  This ensures that there can only be one
-set of mounts that can establish quorum.
-.sp
-Degenerate quorums are possible, for example by specifying half of an
-even number of mounts or less than half of the mount count, down to even
-just one mount establishing quorum. These minority quorums carry the
-risk of multiple quorums being established concurrently.  Each quorum's
-elected servers race to fence each other and can have the unlikely
-outcome of continually racing to fence each other resulting in a
-persistent loss of service.
-.TP
-.B "meta_dev_path"
-The path to the device to be used for ScoutFS metadata.  If possible,
-use a faster block device for the metadata device.  Its contents will be
-unconditionally destroyed.
-.TP
-.B "data_dev_path"
-The path to the device to be used for ScoutFS file data.  If possible,
-use a larger block device for the data device.  Its contents will be
-unconditionally destroyed.
-.TP
-.B "-M meta_size"
-Limit the space used by the filesystem on the metadata device to the
-given size, rather than using the entire block device. Size is given as
-an integer followed by a units digit: "K", "M", "G", "T", "P", to denote
-kibibytes, mebibytes, etc.
-.TP
-.B "-D data_size"
-Same as previous, but for limiting the size of the data device.
-.RE
-.PD
-
-.TP
-.BI "print <path>"
+.BI "print META-DEVICE"
 .sp
 Prints out all of the metadata in the file system.  This makes no effort
 to ensure that the structures are consistent as they're traversed and
@@ -217,236 +463,21 @@ output.
 .PD 0
 .TP
 .sp
-.B "path"
-The path to the metadata device for filesystem whose metadata will
-be printed.  The command reads from the buffer cache of the device which
-may not reflect the current blocks in the filesystem that may have been
-written through another host or device.  The local device's cache can be
-manually flushed before printing, perhaps with the
-.B \--flushbufs
-command in the
-.BR blockdev (8)
-command.
-.RE
-.PD
-
-.TP
-.BI "release <path> <vers> <4KB block offset> <4KB block count>"
-.sp
-.B Release
-the given logical block region of the file.  That is, truncate away
-any data blocks but leave behind offline data regions and do not change
-the main inode metadata.  Future attempts to read or write the block
-region
-will block until the region is restored by a 
-.B stage
-write.  This is used by userspace archive managers to store file data
-in a remote archive tier.
-.sp
-This only works on regular files and with write permission.  Releasing
-regions that are already offline or are sparse, including past the end
-of the file, silently succeed.
-.RS 1.0i
-.PD 0
-.TP
-.sp
-.B "path"
-The path to the regular file whose region will be released.
-.TP
-.B "version"
-The current data version of the contents of the file.  This ensures
-that a release operation is truncating the version of the data that it
-expects.  It can't throw away data that was newly written while it was
-performing its release operation.  An inode's data_version is read
-by the SCOUTFS_IOC_STATFS_MORE
-ioctl.
-.TP
-.B "4KB block offset"
-The 64bit logical block offset of the start of the region in units of 4KB.
-.TP
-.B "4KB block count"
-The 64bit length of the region to release in units of 4KB blocks.
-.RE
-.PD
-
-.TP
-.BI "setattr <\-c ctime> <\-d data_version> -o <\-s i_size> <\-f path>
-.sp
-Set scoutfs specific metadata on a newly created inode without updating
-other inode metadata.
-.RS 1.0i
-.PD 0
-.TP
-.sp
-.B "-c ctime"
-Specify the inode's creation GMT timespec with 64bit seconds and 32bit
-nanoseconds formatted as 
-.B sec.nsec
-\&.
-.TP
-.B "-d data_version"
-Specify the inode's data version.  This can only be set on regular files whose
-current data_version is 0.
-.TP
-.B "-o"
-Create an offline region for all of the file's data up to the specified
-file size.  This can only be set on regular files whose data_version is
-0 and i_size must also be specified.
-.TP
-.B "-s i_size"
-Set the inode's i_size.  This can only be set on regular files whose
-data_version is 0.
-.TP
-.B "-f path"
-The file whose metadata will be set.
-.RE
-.PD
-
-.TP
-.BI "stage <file> <vers> <offset> <count> <archive file>"
-.sp
-.B Stage
-the contents of the file by reading a region of another archive file and writing it
-into the file region without updating regular inode metadata.  Any tasks
-that are blocked by the offline region will proceed once it has been
-staged.
-.RS 1.0i
-.PD 0
-.TP
-.sp
-.B "file"
-The regular file whose contents will be staged.
-.TP
-.B "vers"
-The data_version of the contents to be staged.  It must match the
-current data_version of the file.
-.TP
-.B "offset"
-The starting byte offset of the region to write.  This must be aligned
-to 4KB blocks.
-.TP
-.B "count"
-The length of the region to write in bytes.  A length of 0 is a noop
-and will immediately return success.  The length must be a multiple
-of 4KB blocks unless it is writing the final partial block in which
-case it must end at i_size.
-.TP
-.B "archive file"
-A file whose contents will be read and written as the staged region.
-The start of the archive file will be used as the start of the region.
-.RE
-.PD
-
-.TP
-.BI "stat [-s single] <path>"
-.sp
-Display scoutfs metadata fields for the given inode.
-.RS 1.0i
-.PD 0
-.TP
-.sp
-.B "-s single"
-Only ontput a single stat instead of all the stats with one stat per
-line.  The possible stat names are those given in the output.
-.TP
-.B "path"
-The path to the file whose inode field will be output.
-.sp
-.TP
-.RE
-.PD
-The fields are as follows:
-.RS 1.0i
-.PD 0
-.TP
-.B "meta_seq"
-The metadata change sequence.  This changes each time the inode's metadata
-is changed during a mount's transaction.
-.TP
-.B "data_seq"
-The data change sequence.  This changes each time the inode's data
-is changed during a mount's transaction.
-.TP
-.B "data_version"
-The data version changes every time any contents of the file changes,
-including size changes.  It can change many times during a syscall in a
-transactions.
-.TP
-.B "online_blocks"
-The number of 4Kb data blocks that contain data and can be read.
-.TP
-.B "online_blocks"
-The number of 4Kb data blocks that are offline and would need to be
-staged to be read.
-.RE
-.PD
-
-.TP
-.BI "statfs [-s single] <path>"
-.sp
-Display scoutfs metadata fields for a scoutfs filesystem.
-.RS 1.0i
-.PD 0
-.TP
-.sp
-.B "-s single"
-Only ontput a single stat instead of all the stats with one stat per
-line.  The possible stat names are those given in the output.
-.TP
-.B "path"
-The path to any inode in the filesystem.
-.sp
-.TP
-.RE
-.PD
-The fields are as follows:
-.RS 1.0i
-.PD 0
-.TP
-.B "fsid"
-The unique 64bit filesystem identifier for this filesystem.
-.TP
-.B "rid"
-The unique 64bit random identifier for this mount of the filesystem.
-This is generated for every new mount of the file system.
-.RE
-.PD
-
-.TP
-.BI "walk-inodes <index> <first> <last> <path>"
-.sp
-Walks an inode index in the file system and outputs the inode numbers
-that are found within the first and last positions in the index.
-.RS 1.0i
-.PD 0
-.sp
-.TP
-.B "index"
-Specifies the index to walk.  The currently supported indices are
-.B meta_seq
-and
-.B data_seq
-\&.
-.TP
-.B "first"
-The starting position of the index walk.
-.I 0
-is the first possible position in every index.
-.TP
-.B "last"
-The last position to include in the index walk.
-.I \-1
-can be given as shorthand for the U64_MAX last possible position in
-every index.
-.TP
-.B "path"
-A path to any inode in the filesystem, typically the root directory.
+.B "META-DEVICE"
+The path to the metadata device for the filesystem whose metadata will be
+printed.  Since this command reads via the host's buffer cache, it may not
+reflect the current blocks in the filesystem possibly written to the shared
+block devices from another host, unless
+.B blockdev \--flushbufs
+command is used first.
 .RE
 .PD

 .SH SEE ALSO
 .BR scoutfs (5),
-.BR xattr (7).
+.BR xattr (7),
+.BR blockdev (8),
+.BR wipefs (8)

 .SH AUTHORS
 Zach Brown <zab@versity.com>
@@ -16,6 +16,7 @@ BuildRequires:  git
 BuildRequires:  gzip
 BuildRequires:  libuuid-devel
 BuildRequires:  openssl-devel
+BuildRequires:  libblkid-devel

 #Requires:	kmod-scoutfs = %{version}

@@ -0,0 +1,108 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+
+#include <blkid/blkid.h>
+
+#include "util.h"
+#include "format.h"
+#include "blkid.h"
+
+/*
+ * Probe devname with libblkid and refuse a device on which it recognizes
+ * an existing superblock or partition table.
+ *
+ * Returns 0 if nothing was recognized, -1 if a signature was found or the
+ * probe itself failed.  fd and usage are not used here.
+ */
+static int check_bdev_blkid(int fd, char *devname, char *usage)
+{
+	blkid_probe pr;
+	int ret;
+
+	pr = blkid_new_probe_from_filename(devname);
+	if (!pr) {
+		fprintf(stderr, "%s: failed to create a new libblkid probe\n", devname);
+		/* nothing to free; a failed probe must not look like a clean device */
+		return -1;
+	}
+
+	/* enable partitions probing (superblocks are enabled by default) */
+	ret = blkid_probe_enable_partitions(pr, true);
+	if (ret == -1) {
+		fprintf(stderr, "%s: blkid_probe_enable_partitions() failed\n", devname);
+		goto out;
+	}
+
+	ret = blkid_do_fullprobe(pr);
+	if (ret == -1) {
+		fprintf(stderr, "%s: blkid_do_fullprobe() failed\n", devname);
+		goto out;
+	} else if (ret == 0) {
+		const char *type;
+
+		if (!blkid_probe_lookup_value(pr, "TYPE", &type, NULL)) {
+			fprintf(stderr, "%s: appears to contain an existing "
+					"%s superblock\n", devname, type);
+			ret = -1;
+			goto out;
+		}
+
+		if (!blkid_probe_lookup_value(pr, "PTTYPE", &type, NULL)) {
+			fprintf(stderr, "%s: appears to contain a partition "
+					"table (%s)\n", devname, type);
+			ret = -1;
+			goto out;
+		}
+	} else {
+		/* return 0 if ok */
+		ret = 0;
+	}
+
+out:
+	blkid_free_probe(pr);
+
+	return ret;
+}
+
+/*
+ * Read the block at SCOUTFS_SUPER_BLKNO from fd and refuse the device if
+ * it carries the ScoutFS super block magic.
+ *
+ * Returns 0 if the magic is absent, -EINVAL if it is present, or the
+ * error returned by read_block().  usage is not used here.
+ */
+static int check_bdev_scoutfs(int fd, char *devname, char *usage)
+{
+	struct scoutfs_super_block *super = NULL;
+	int ret;
+
+	ret = read_block(fd, SCOUTFS_SUPER_BLKNO, SCOUTFS_BLOCK_SM_SHIFT, (void **)&super);
+	if (ret)
+		return ret;
+
+	if (le32_to_cpu(super->hdr.magic) == SCOUTFS_BLOCK_MAGIC_SUPER) {
+		fprintf(stderr, "%s: appears to contain an existing "
+			"ScoutFS superblock\n", devname);
+		ret = -EINVAL;
+	}
+
+	free(super);
+
+	return ret;
+}
+
+/*
+ * Returns 0 if neither check objects to the device, nonzero otherwise.
+ */
+int check_bdev(int fd, char *devname, char *usage)
+{
+	return check_bdev_blkid(fd, devname, usage) ?:
+		/* Our sig is not in blkid (yet) so check explicitly for us. */
+		check_bdev_scoutfs(fd, devname, usage);
+}
@@ -0,0 +1,10 @@
+#ifndef _BLKID_H_
+#define _BLKID_H_
+
+/*
+ * Check a block device for an existing superblock or partition table
+ * signature.  Returns 0 when the checks pass and nonzero otherwise.
+ */
+int check_bdev(int fd, char *devname, char *usage);
+
+#endif
@@ -4,35 +4,37 @@
 #include <stdbool.h>
 #include <string.h>
 #include <assert.h>
+#include <argp.h>

 #include "cmd.h"
 #include "util.h"

-static struct command {
+static struct argp_command {
 	char *name;
-	char *opts;
-	char *summary;
+	struct argp *argp;
+	int group;
 	int (*func)(int argc, char **argv);
-} cmds[100], *next_cmd = cmds;
+} argp_cmds[100], *next_argp_cmd = argp_cmds;

-#define cmd_for_each(com) for (com = cmds; com->func; com++)
+#define cmd_for_each(com) for (com = argp_cmds; com->func; com++)

-void cmd_register(char *name, char *opts, char *summary,
+void cmd_register_argp(char *name, struct argp *argp, int group,
 		  int (*func)(int argc, char **argv))
 {
-	struct command *com = next_cmd++;
+	struct argp_command *com = next_argp_cmd++;

-	assert((com - cmds) < array_size(cmds));
+	assert((com - argp_cmds) < array_size(argp_cmds));

 	com->name = name;
-	com->opts = opts;
-	com->summary = summary;
+	com->argp = argp;
+	com->group = group;
 	com->func = func;
 }

-static struct command *find_command(char *name)
+
+static struct argp_command *find_command(char *name)
 {
-	struct command *com;
+	struct argp_command *com;

 	cmd_for_each(com) {
 		if (!strcmp(name, com->name))
@@ -42,28 +44,47 @@ static struct command *find_command(char *name)
 	return NULL;
 }

-static void usage(void)
+static void print_cmds_for_group(int group)
 {
-	struct command *com;
+	struct argp_command *com;
 	int largest = 0;

-	fprintf(stderr, "usage: scoutfs <command> [<args>]\n"
-	       "Commands:\n");
-
+	/* Base alignment on all groups */
 	cmd_for_each(com)
 		largest = max(strlen(com->name), largest);

 	cmd_for_each(com) {
-		fprintf(stderr, "  %*s %s\n  %*s %s\n",
-			largest, com->name, com->opts,
-			largest, "", com->summary);
+		if (com->group == group) {
+			fprintf(stderr, "  %*s %s\n  %*s %s\n",
+				largest, com->name, com->argp->args_doc,
+				largest, "", com->argp->doc);
+		}
 	}
+
+}
+
+static void usage(void)
+{
+	/* general invocation help first, then one command listing per group */
+	fputs("usage: scoutfs <command> [<args>]\n\n"
+	      "Selected fs defaults to current working directory.\n"
+	      "See <command> --help for more details.\n", stderr);
+	fputs("\nCore admin:\n", stderr);
+	print_cmds_for_group(GROUP_CORE);
+	fputs("\nAdditional Information:\n", stderr);
+	print_cmds_for_group(GROUP_INFO);
+	fputs("\nSearch Acceleration:\n", stderr);
+	print_cmds_for_group(GROUP_SEARCH);
+	fputs("\nArchival Agent Support:\n", stderr);
+	print_cmds_for_group(GROUP_AGENT);
+	fputs("\nDebugging commands:\n", stderr);
+	print_cmds_for_group(GROUP_DEBUG);
+}

 /* this returns a positive unix return code on error for some reason */
 char cmd_execute(int argc, char **argv)
 {
-	struct command *com = NULL;
+	struct argp_command *com = NULL;
 	int ret;

 	if (argc > 1) {
@@ -1,7 +1,13 @@
 #ifndef _CMD_H_
 #define _CMD_H_

-void cmd_register(char *name, char *opts, char *summary,
+#define GROUP_CORE 0
+#define GROUP_INFO 1
+#define GROUP_SEARCH 2
+#define GROUP_AGENT 3
+#define GROUP_DEBUG 4
+
+void cmd_register_argp(char *name, struct argp *argp, int group,
 		  int (*func)(int argc, char **argv));

 char cmd_execute(int argc, char **argv);
@@ -12,7 +12,10 @@
 #include <dirent.h>
 #include <sys/ioctl.h>
 #include <stdbool.h>
+#include <argp.h>

+#include "sparse.h"
+#include "parse.h"
 #include "util.h"
 #include "cmd.h"

@@ -37,7 +40,12 @@ static int cmp_counter_names(const void *A, const void *B)
 	return strcmp(a->name, b->name);
 }

-static int counters_cmd(int argc, char **argv)
+struct counters_args {
+	char *sysfs_path;	/* positional argument; "/counters" is appended to it */
+	bool tabular;		/* -t/--table; cleared again if no window size is available */
+};
+
+static int do_counters(struct counters_args *args)
 {
 	unsigned int *name_wid = NULL;
 	unsigned int *val_wid = NULL;
@@ -50,9 +58,7 @@ static int counters_cmd(int argc, char **argv)
 	unsigned int rows = 0;
 	unsigned int cols = 0;
 	unsigned int nr = 0;
-	char *dir_arg = NULL;
 	struct dirent *dent;
-	bool table = false;
 	struct winsize ws;
 	DIR *dirp = NULL;
 	int dir_fd = -1;
@@ -64,28 +70,16 @@ static int counters_cmd(int argc, char **argv)
 	int r;
 	int c;

-	for (i = 1; i < argc; i++) {
-		if (strcmp(argv[i], "-t") == 0)
-			table = true;
-		else
-			dir_arg = argv[i];
-	}
-
 	ret = ioctl(STDOUT_FILENO, TIOCGWINSZ, &ws);
 	if (ret < 0)
 		ret = ioctl(STDIN_FILENO, TIOCGWINSZ, &ws);
 	if (ret < 0)
-		table = false;
+		args->tabular = false;

-	if (dir_arg == NULL) {
-		printf("scoutfs counter-table: need mount sysfs dir (i.e. /sys/fs/scoutfs/$fr)\n");
-		return -EINVAL;
-	}
-
-	ret = snprintf(path, PATH_MAX, "%s/counters", dir_arg);
+	ret = snprintf(path, PATH_MAX, "%s/counters", args->sysfs_path);
 	if (ret < 1 || ret >= PATH_MAX) {
 		ret = -EINVAL;
-		fprintf(stderr, "invalid counter dir path '%s'\n", dir_arg);
+		fprintf(stderr, "invalid counter dir path '%s'\n", args->sysfs_path);
 		goto out;
 	}

@@ -120,6 +114,7 @@ static int counters_cmd(int argc, char **argv)
 				goto out;
 			}
 			memset(&ctrs[nr], 0, (alloced - nr) * sizeof(*ctrs));
+			memset(&name_wid[nr], 0, (alloced - nr) * sizeof(*name_wid));
 		}

 		ctr = &ctrs[nr];
@@ -191,7 +186,7 @@ static int counters_cmd(int argc, char **argv)
 	 * one column of counters and use the max field widths from the
 	 * initial counter reads.
 	 */
-	if (table) {
+	if (args->tabular) {
 		min_rows = 1;
 		cols = ws.ws_col / (name_wid[0] + 1 + val_wid[0] + 2);
 		max_rows = nr / cols;
@@ -276,9 +271,58 @@ out:
 	return ret;
 };

+/* argp callback: -t selects table output, the one argument is the sysfs dir */
+static int parse_opt(int key, char *arg, struct argp_state *state)
+{
+	struct counters_args *ca = state->input;
+
+	if (key == 't') {
+		ca->tabular = true;
+		return 0;
+	}
+
+	if (key == ARGP_KEY_ARG) {
+		/* only the first positional argument is accepted */
+		if (ca->sysfs_path != NULL)
+			argp_error(state, "more than one argument given");
+		else
+			ca->sysfs_path = strdup_or_error(state, arg);
+		return 0;
+	}
+
+	if (key == ARGP_KEY_FINI && ca->sysfs_path == NULL)
+		argp_error(state, "no sysfs path argument given");
+
+	return 0;
+}
+
+
+static struct argp_option options[] = {
+	{ "table", 't', NULL, 0, "Output in table format" },
+	{ NULL }
+};
+
+static struct argp argp = {
+	options,
+	parse_opt,
+	"SYSFS-DIR",
+	"Show counters for a mounted volume"
+};
+
+/* Entry point for "counters": parse argv, then run do_counters(). */
+static int counters_cmd(int argc, char *argv[])
+{
+	struct counters_args parsed = { .sysfs_path = NULL, .tabular = false };
+	int err = argp_parse(&argp, argc, argv, 0, NULL, &parsed);
+
+	if (err != 0)
+		return err;
+
+	return do_counters(&parsed);
+}
+
+
 static void __attribute__((constructor)) counters_ctor(void)
 {
-	cmd_register("counters", "[-t] <sysfs dir>",
-		     "show [tablular] counters for a given mounted volume",
-		     counters_cmd);
+	cmd_register_argp("counters", &argp, GROUP_INFO, counters_cmd);
 }
@@ -1,7 +1,7 @@
 #ifndef _DEV_H_
 #define _DEV_H_

-#define BASE_SIZE_FMT "%.2f %s"
+#define BASE_SIZE_FMT "%.2f%s"
 #define BASE_SIZE_ARGS(sz) size_flt(sz, 1), size_str(sz, 1)

 #define SIZE_FMT "%llu (%.2f %s)"
@@ -7,20 +7,28 @@
 #include <fcntl.h>
 #include <errno.h>
 #include <string.h>
-#include <getopt.h>
 #include <assert.h>
+#include <stdbool.h>
+#include <argp.h>

 #include "sparse.h"
+#include "parse.h"
 #include "util.h"
 #include "format.h"
 #include "ioctl.h"
 #include "cmd.h"
+#include "dev.h"

 #define ROWS 3
 #define COLS 6
 #define CHARS 20

-static int df_cmd(int argc, char **argv)
+struct df_args {
+	char *path;		/* -p/--path value, handed to get_path() */
+	bool human_readable;	/* -h: format sizes with BASE_SIZE_FMT instead of block counts */
+};
+
+static int do_df(struct df_args *args)
 {
 	struct scoutfs_ioctl_alloc_detail ad;
 	struct scoutfs_ioctl_alloc_detail_entry *ade = NULL;
@@ -36,18 +44,9 @@ static int df_cmd(int argc, char **argv)
 	int r;
 	int c;

-	if (argc != 2) {
-		fprintf(stderr, "must specify path\n");
-		return -EINVAL;
-	}
-
-	fd = open(argv[1], O_RDONLY);
-	if (fd < 0) {
-		ret = -errno;
-		fprintf(stderr, "failed to open '%s': %s (%d)\n",
-			argv[1], strerror(errno), errno);
-		return ret;
-	}
+	fd = get_path(args->path, O_RDONLY);
+	if (fd < 0)
+		return fd;

 	sfm.valid_bytes = sizeof(struct scoutfs_ioctl_statfs_more);
 	ret = ioctl(fd, SCOUTFS_IOC_STATFS_MORE, &sfm);
@@ -96,18 +95,38 @@ static int df_cmd(int argc, char **argv)

 	snprintf(cells[1][0], CHARS, "MetaData");
 	snprintf(cells[1][1], CHARS, "64KB");
-	snprintf(cells[1][2], CHARS, "%llu", sfm.total_meta_blocks);
-	snprintf(cells[1][3], CHARS, "%llu", sfm.total_meta_blocks - meta_free);
-	snprintf(cells[1][4], CHARS, "%llu", meta_free);
+	if (args->human_readable) {
+		snprintf(cells[1][2], CHARS, BASE_SIZE_FMT,
+			 BASE_SIZE_ARGS(sfm.total_meta_blocks * SCOUTFS_BLOCK_LG_SIZE));
+		snprintf(cells[1][3], CHARS, BASE_SIZE_FMT,
+			 BASE_SIZE_ARGS((sfm.total_meta_blocks - meta_free)
+					* SCOUTFS_BLOCK_LG_SIZE));
+		snprintf(cells[1][4], CHARS, BASE_SIZE_FMT,
+			 BASE_SIZE_ARGS(meta_free * SCOUTFS_BLOCK_LG_SIZE));
+	} else {
+		snprintf(cells[1][2], CHARS, "%llu", sfm.total_meta_blocks);
+		snprintf(cells[1][3], CHARS, "%llu", sfm.total_meta_blocks - meta_free);
+		snprintf(cells[1][4], CHARS, "%llu", meta_free);
+	}
 	snprintf(cells[1][5], CHARS, "%llu",
 		((sfm.total_meta_blocks - meta_free) * 100) /
 		sfm.total_meta_blocks);

 	snprintf(cells[2][0], CHARS, "Data");
 	snprintf(cells[2][1], CHARS, "4KB");
-	snprintf(cells[2][2], CHARS, "%llu", sfm.total_data_blocks);
-	snprintf(cells[2][3], CHARS, "%llu", sfm.total_data_blocks - data_free);
-	snprintf(cells[2][4], CHARS, "%llu", data_free);
+	if (args->human_readable) {
+		snprintf(cells[2][2], CHARS, BASE_SIZE_FMT,
+			 BASE_SIZE_ARGS(sfm.total_data_blocks * SCOUTFS_BLOCK_SM_SIZE));
+		snprintf(cells[2][3], CHARS, BASE_SIZE_FMT,
+			 BASE_SIZE_ARGS((sfm.total_data_blocks - data_free)
+					* SCOUTFS_BLOCK_SM_SIZE));
+		snprintf(cells[2][4], CHARS, BASE_SIZE_FMT,
+			 BASE_SIZE_ARGS(data_free * SCOUTFS_BLOCK_SM_SIZE));
+	} else {
+		snprintf(cells[2][2], CHARS, "%llu", sfm.total_data_blocks);
+		snprintf(cells[2][3], CHARS, "%llu", sfm.total_data_blocks - data_free);
+		snprintf(cells[2][4], CHARS, "%llu", data_free);
+	}
 	snprintf(cells[2][5], CHARS, "%llu",
 		((sfm.total_data_blocks - data_free) * 100) /
 		sfm.total_data_blocks);
@@ -131,8 +150,51 @@ out:
 	return ret;
 }

+/* df takes options only; unhandled keys, ARGP_KEY_ARG included, go back to argp. */
+static int parse_opt(int key, char *arg, struct argp_state *state)
+{
+	struct df_args *args = state->input;
+
+	switch (key) {
+	case 'p':
+		args->path = strdup_or_error(state, arg);
+		break;
+	case 'h':
+		args->human_readable = true;
+		break;
+	default:
+		return ARGP_ERR_UNKNOWN;
+	}
+	return 0;
+}
+
+static struct argp_option options[] = {
+	{ "path", 'p', "PATH", 0, "Path to ScoutFS filesystem"},
+	{ "human-readable", 'h', NULL, 0, "Print sizes in human readable format (e.g., 1KB 234MB 2GB)"},
+	{ NULL }
+};
+
+static struct argp argp = {
+	options,
+	parse_opt,
+	"",
+	"Show metadata and data block usage"
+};
+
+/* Entry point for the df command: parse argv into df_args, then run do_df(). */
+static int df_cmd(int argc, char **argv)
+{
+	struct df_args parsed = { .path = NULL, .human_readable = false };
+	int err;
+
+	err = argp_parse(&argp, argc, argv, 0, NULL, &parsed);
+	if (err != 0)
+		return err;
+
+	return do_df(&parsed);
+}
+
 static void __attribute__((constructor)) df_ctor(void)
 {
-	cmd_register("df", "<path>",
-		     "show metadata and data block usage", df_cmd);
+	cmd_register_argp("df", &argp, GROUP_CORE, df_cmd);
 }
@@ -8,44 +8,32 @@
 #include <errno.h>
 #include <string.h>
 #include <limits.h>
+#include <argp.h>

 #include "sparse.h"
+#include "parse.h"
 #include "util.h"
 #include "format.h"
 #include "ioctl.h"
+#include "parse.h"
 #include "cmd.h"

-static int ino_path_cmd(int argc, char **argv)
+struct ino_args {
+	char *path;	/* -p/--path value, handed to get_path() */
+	u64 ino;	/* inode number to look up; 0 is treated as "not given" */
+};
+
+static int do_ino_path(struct ino_args *args)
 {
-	struct scoutfs_ioctl_ino_path args;
+	struct scoutfs_ioctl_ino_path ioctl_args;
 	struct scoutfs_ioctl_ino_path_result *res;
 	unsigned int result_bytes;
-	char *endptr = NULL;
-	u64 ino;
 	int ret;
 	int fd;

-	if (argc != 3) {
-		fprintf(stderr, "must specify ino and path\n");
-		return -EINVAL;
-	}
-
-	ino = strtoull(argv[1], &endptr, 0);
-	if (*endptr != '\0' ||
-	    ((ino == LLONG_MIN || ino == LLONG_MAX) && errno == ERANGE)) {
-		fprintf(stderr, "error parsing inode number '%s'\n",
-			argv[1]);
-		return -EINVAL;
-	}
-
-
-	fd = open(argv[2], O_RDONLY);
-	if (fd < 0) {
-		ret = -errno;
-		fprintf(stderr, "failed to open '%s': %s (%d)\n",
-			argv[2], strerror(errno), errno);
-		return ret;
-	}
+	fd = get_path(args->path, O_RDONLY);
+	if (fd < 0)
+		return fd;

 	result_bytes = offsetof(struct scoutfs_ioctl_ino_path_result,
 				path[PATH_MAX]);
@@ -57,13 +45,13 @@ static int ino_path_cmd(int argc, char **argv)
 		goto out;
 	}

-	args.ino = ino;
-	args.dir_ino = 0;
-	args.dir_pos = 0;
-	args.result_ptr = (intptr_t)res;
-	args.result_bytes = result_bytes;
+	ioctl_args.ino = args->ino;
+	ioctl_args.dir_ino = 0;
+	ioctl_args.dir_pos = 0;
+	ioctl_args.result_ptr = (intptr_t)res;
+	ioctl_args.result_bytes = result_bytes;
 	for (;;) {
-		ret = ioctl(fd, SCOUTFS_IOC_INO_PATH, &args);
+		ret = ioctl(fd, SCOUTFS_IOC_INO_PATH, &ioctl_args);
 		if (ret < 0) {
 			ret = -errno;
 			if (ret == -ENOENT)
@@ -73,10 +61,10 @@ static int ino_path_cmd(int argc, char **argv)

 		printf("%.*s\n", res->path_bytes, res->path);

-		args.dir_ino = res->dir_ino;
-		args.dir_pos = res->dir_pos;
-		if (++args.dir_pos == 0) {
-			if (++args.dir_ino == 0)
+		ioctl_args.dir_ino = res->dir_ino;
+		ioctl_args.dir_pos = res->dir_pos;
+		if (++ioctl_args.dir_pos == 0) {
+			if (++ioctl_args.dir_ino == 0)
 				break;
 		}
 	}
@@ -92,8 +80,60 @@ out:
 	return ret;
 };

+/*
+ * argp callback: -p names the filesystem path, the argument is the inode.
+ */
+static int parse_opt(int key, char *arg, struct argp_state *state)
+{
+	struct ino_args *ia = state->input;
+
+	if (key == 'p') {
+		ia->path = strdup_or_error(state, arg);
+		return 0;
+	}
+
+	if (key == ARGP_KEY_ARG) {
+		/* a nonzero ino means an earlier argument was already parsed */
+		if (ia->ino != 0)
+			argp_error(state, "more than one argument given");
+		if (parse_u64(arg, &ia->ino) != 0)
+			argp_error(state, "inode parse error");
+		return 0;
+	}
+
+	/* parsing is done: an inode number is mandatory */
+	if (key == ARGP_KEY_FINI && ia->ino == 0)
+		argp_error(state, "must provide inode number");
+
+	return 0;
+}
+
+static struct argp_option options[] = {
+	{ "path", 'p', "PATH", 0, "Path to ScoutFS filesystem"},
+	{ NULL }
+};
+
+static struct argp argp = {
+	options,
+	parse_opt,
+	"INODE-NUM",
+	"Print paths that refer to inode number"
+};
+
+/* Entry point for ino-path: parse argv into ino_args, then run do_ino_path(). */
+static int ino_path_cmd(int argc, char **argv)
+{
+	struct ino_args parsed = { .path = NULL, .ino = 0 };
+	int err = argp_parse(&argp, argc, argv, 0, NULL, &parsed);
+
+	if (err != 0)
+		return err;
+
+	return do_ino_path(&parsed);
+}
+
+
 static void __attribute__((constructor)) ino_path_ctor(void)
 {
-	cmd_register("ino-path", "<ino> <path>",
-		     "print paths that refer to inode #", ino_path_cmd);
+	cmd_register_argp("ino-path", &argp, GROUP_SEARCH, ino_path_cmd);
 }
@@ -7,56 +7,31 @@
 #include <fcntl.h>
 #include <errno.h>
 #include <string.h>
-#include <getopt.h>
 #include <ctype.h>
+#include <argp.h>

 #include "sparse.h"
+#include "parse.h"
 #include "util.h"
 #include "format.h"
 #include "ioctl.h"
 #include "cmd.h"

-static struct option long_ops[] = {
-	{ "file", 1, NULL, 'f' },
-	{ NULL, 0, NULL, 0}
+struct list_hidden_xattr_args {
+	char *filename;
 };

-static int listxattr_hidden_cmd(int argc, char **argv)
+static int do_list_hidden_xattrs(struct list_hidden_xattr_args *args)
 {
 	struct scoutfs_ioctl_listxattr_hidden lxh;
-	char *path = NULL;
 	char *buf = NULL;
 	char *name;
 	int fd = -1;
 	int bytes;
 	int len;
 	int ret;
-	int c;
 	int i;

-	while ((c = getopt_long(argc, argv, "f:", long_ops, NULL)) != -1) {
-		switch (c) {
-		case 'f':
-			path = strdup(optarg);
-			if (!path) {
-				fprintf(stderr, "path mem alloc failed\n");
-				ret = -ENOMEM;
-				goto out;
-			}
-			break;
-		case '?':
-		default:
-			ret = -EINVAL;
-			goto out;
-		}
-	}
-
-	if (path == NULL) {
-		fprintf(stderr, "must specify -f path to file\n");
-		ret = -EINVAL;
-		goto out;
-	}
-
 	memset(&lxh, 0, sizeof(lxh));
 	lxh.id_pos = 0;
 	lxh.hash_pos = 0;
@@ -69,11 +44,11 @@ static int listxattr_hidden_cmd(int argc, char **argv)
 	}
 	lxh.buf_ptr = (unsigned long)buf;

-	fd = open(path, O_RDONLY);
+	fd = open(args->filename, O_RDONLY);
 	if (fd < 0) {
 		ret = -errno;
 		fprintf(stderr, "failed to open '%s': %s (%d)\n",
-			path, strerror(errno), errno);
+			args->filename, strerror(errno), errno);
 		goto out;
 	}

@@ -139,9 +114,50 @@ out:
 	return ret;
 };

+/*
+ * argp callback for list-hidden-xattrs: exactly one FILE argument.
+ */
+static int parse_opt(int key, char *arg, struct argp_state *state)
+{
+	struct list_hidden_xattr_args *lx = state->input;
+
+	if (key == ARGP_KEY_ARG) {
+		/* complain about a second filename before storing it */
+		if (lx->filename != NULL)
+			argp_error(state, "more than one filename argument given");
+
+		lx->filename = strdup_or_error(state, arg);
+		return 0;
+	}
+
+	/* parsing is done: a filename is mandatory */
+	if (key == ARGP_KEY_FINI && lx->filename == NULL)
+		argp_error(state, "must specify filename");
+
+	return 0;
+}
+
+static struct argp argp = {
+	NULL,
+	parse_opt,
+	"FILE",
+	"Print the names of hidden xattrs on a file"
+};
+
+/* Entry point for list-hidden-xattrs: parse argv, then run the listing. */
+static int list_hidden_xattrs_cmd(int argc, char **argv)
+{
+	struct list_hidden_xattr_args parsed = { .filename = NULL };
+	int err = argp_parse(&argp, argc, argv, 0, NULL, &parsed);
+
+	if (err != 0)
+		return err;
+
+	return do_list_hidden_xattrs(&parsed);
+}
+
+
 static void __attribute__((constructor)) listxattr_hidden_ctor(void)
 {
-	cmd_register("listxattr-hidden", "-f <path>",
-		     "print the names of hidden xattrs on the file",
-		     listxattr_hidden_cmd);
+	cmd_register_argp("list-hidden-xattrs", &argp, GROUP_INFO, list_hidden_xattrs_cmd);
 }
@@ -4,10 +4,20 @@
 #include <stdbool.h>
 #include <string.h>
 #include <assert.h>
+#include <argp.h>

 #include "cmd.h"
 #include "util.h"

+/*
+ * Ensure no compiler-added padding sneaks into structs defined in these
+ * headers.
+ */
+#pragma GCC diagnostic error "-Wpadded"
+#include "format.h"
+#include "ioctl.h"
+#pragma GCC diagnostic pop
+
 int main(int argc, char **argv)
 {
 	/*
@@ -11,12 +11,12 @@
 #include <sys/stat.h>
 #include <unistd.h>
 #include <assert.h>
-#include <getopt.h>
 #include <sys/socket.h>
 #include <netinet/in.h>
 #include <arpa/inet.h>
 #include <ctype.h>
 #include <inttypes.h>
+#include <argp.h>

 #include "sparse.h"
 #include "cmd.h"
@@ -30,6 +30,7 @@
 #include "bitops.h"
 #include "btree.h"
 #include "leaf_item_hash.h"
+#include "blkid.h"

 static int write_raw_block(int fd, u64 blkno, int shift, void *blk)
 {
@@ -99,6 +100,15 @@ static int write_alloc_root(struct scoutfs_super_block *super, int fd,
 	return write_raw_block(fd, blkno, SCOUTFS_BLOCK_LG_SHIFT, bt);
 }

+struct mkfs_args {
+	unsigned long long quorum_count;	/* -Q; written to super->quorum_count */
+	char *meta_device;	/* first positional argument */
+	char *data_device;	/* second positional argument */
+	unsigned long long max_meta_size;	/* -m; passed to device_size() for the meta device */
+	unsigned long long max_data_size;	/* -d; passed to device_size() for the data device */
+	bool force;	/* -f; skips the check_bdev() checks */
+};
+
 /*
 * Make a new file system by writing:
 *  - super blocks
@@ -108,19 +118,18 @@ static int write_alloc_root(struct scoutfs_super_block *super, int fd,
 * Superblock is written to both metadata and data devices, everything else is
 * written only to the metadata device.
 */
-static int write_new_fs(char *meta_path, char *data_path,
-			int meta_fd, int data_fd,
-			u8 quorum_count,
-			u64 max_meta_size, u64 max_data_size)
+static int do_mkfs(struct mkfs_args *args)
 {
-	struct scoutfs_super_block *super;
+	struct scoutfs_super_block *super = NULL;
 	struct scoutfs_inode inode;
 	struct scoutfs_alloc_list_block *lblk;
-	struct scoutfs_btree_block *bt;
+	struct scoutfs_btree_block *bt = NULL;
 	struct scoutfs_key key;
 	struct timeval tv;
+	int meta_fd = -1;
+	int data_fd = -1;
 	char uuid_str[37];
-	void *zeros;
+	void *zeros = NULL;
 	u64 blkno;
 	u64 meta_size;
 	u64 data_size;
@@ -135,6 +144,33 @@ static int write_new_fs(char *meta_path, char *data_path,

 	gettimeofday(&tv, NULL);

+	/* every failure below leaves through out: so the opened fds get closed */
+	meta_fd = open(args->meta_device, O_RDWR | O_EXCL);
+	if (meta_fd < 0) {
+		ret = -errno;
+		fprintf(stderr, "failed to open '%s': %s (%d)\n",
+			args->meta_device, strerror(errno), errno);
+		goto out;
+	}
+	if (!args->force) {
+		ret = check_bdev(meta_fd, args->meta_device, "meta");
+		if (ret)
+			goto out;
+	}
+
+	data_fd = open(args->data_device, O_RDWR | O_EXCL);
+	if (data_fd < 0) {
+		ret = -errno;
+		fprintf(stderr, "failed to open '%s': %s (%d)\n",
+			args->data_device, strerror(errno), errno);
+		goto out;
+	}
+	if (!args->force) {
+		ret = check_bdev(data_fd, args->data_device, "data");
+		if (ret)
+			goto out;
+	}
+
 	super = calloc(1, SCOUTFS_BLOCK_SM_SIZE);
 	bt = calloc(1, SCOUTFS_BLOCK_LG_SIZE);
 	zeros = calloc(1, SCOUTFS_BLOCK_SM_SIZE);
@@ -145,13 +181,13 @@ static int write_new_fs(char *meta_path, char *data_path,
 		goto out;
 	}

-	ret = device_size(meta_path, meta_fd, 2ULL * (1024 * 1024 * 1024),
-			  max_meta_size, "meta", &meta_size);
+	ret = device_size(args->meta_device, meta_fd, 2ULL * (1024 * 1024 * 1024),
+			  args->max_meta_size, "meta", &meta_size);
 	if (ret)
 		goto out;

-	ret = device_size(data_path, data_fd, 8ULL * (1024 * 1024 * 1024),
-			  max_data_size, "data", &data_size);
+	ret = device_size(args->data_device, data_fd, 8ULL * (1024 * 1024 * 1024),
+			  args->max_data_size, "data", &data_size);
 	if (ret)
 		goto out;

@@ -169,7 +205,7 @@ static int write_new_fs(char *meta_path, char *data_path,
 	pseudo_random_bytes(&super->hdr.fsid, sizeof(super->hdr.fsid));
 	super->hdr.magic = cpu_to_le32(SCOUTFS_BLOCK_MAGIC_SUPER);
 	super->hdr.seq = cpu_to_le64(1);
-	super->format_hash = cpu_to_le64(SCOUTFS_FORMAT_HASH);
+	super->version = cpu_to_le64(SCOUTFS_INTEROP_VERSION);
 	uuid_generate(super->uuid);
 	super->next_ino = cpu_to_le64(SCOUTFS_ROOT_INO + 1);
 	super->next_trans_seq = cpu_to_le64(1);
@@ -179,7 +215,7 @@ static int write_new_fs(char *meta_path, char *data_path,
 	super->total_data_blocks = cpu_to_le64(last_data - first_data + 1);
 	super->first_data_blkno = cpu_to_le64(first_data);
 	super->last_data_blkno = cpu_to_le64(last_data);
-	super->quorum_count = quorum_count;
+	super->quorum_count = args->quorum_count;

 	/* fs root starts with root inode and its index items */
 	blkno = next_meta++;
@@ -293,7 +329,7 @@ static int write_new_fs(char *meta_path, char *data_path,
 	if (fsync(data_fd)) {
 		ret = -errno;
 		fprintf(stderr, "failed to fsync '%s': %s (%d)\n",
-			data_path, strerror(errno), errno);
+			args->data_device, strerror(errno), errno);
 		goto out;
 	}

@@ -306,7 +342,7 @@ static int write_new_fs(char *meta_path, char *data_path,
 	if (fsync(meta_fd)) {
 		ret = -errno;
 		fprintf(stderr, "failed to fsync '%s': %s (%d)\n",
-			meta_path, strerror(errno), errno);
+			args->meta_device, strerror(errno), errno);
 		goto out;
 	}

@@ -316,15 +352,15 @@ static int write_new_fs(char *meta_path, char *data_path,
 	       "  meta device path:     %s\n"
 	       "  data device path:     %s\n"
 	       "  fsid:                 %llx\n"
-	       "  format hash:          %llx\n"
+	       "  version:              %llx\n"
 	       "  uuid:                 %s\n"
 	       "  64KB metadata blocks: "SIZE_FMT"\n"
 	       "  4KB data blocks:      "SIZE_FMT"\n"
 	       "  quorum count:         %u\n",
-		meta_path,
-		data_path,
+		args->meta_device,
+	        args->data_device,
 		le64_to_cpu(super->hdr.fsid),
-		le64_to_cpu(super->format_hash),
+		le64_to_cpu(super->version),
 		uuid_str,
 		SIZE_ARGS(le64_to_cpu(super->total_meta_blocks),
 			  SCOUTFS_BLOCK_LG_SIZE),
@@ -340,102 +376,106 @@ out:
 		free(bt);
 	if (zeros)
 		free(zeros);
+	if (meta_fd != -1)
+		close(meta_fd);
+	if (data_fd != -1)
+		close(data_fd);
 	return ret;
 }

-static struct option long_ops[] = {
-	{ "quorum_count", 1, NULL, 'Q' },
-	{ NULL, 0, NULL, 0}
+/* Parse mkfs options and the two device arguments.  Required values are
+ * checked once parsing finishes (ARGP_KEY_FINI). */
+static int parse_opt(int key, char *arg, struct argp_state *state)
+{
+	struct mkfs_args *args = state->input;
+	u64 prev_val;
+	int ret;
+
+	switch (key) {
+	case 'Q':
+		ret = parse_u64(arg, &args->quorum_count);
+		if (ret)
+			return ret;
+		break;
+	case 'f':
+		args->force = true;
+		break;
+	case 'm': /* max-meta-size */
+		ret = parse_human(arg, &args->max_meta_size);
+		if (ret)
+			return ret;
+		prev_val = args->max_meta_size;
+		args->max_meta_size = round_down(args->max_meta_size, SCOUTFS_BLOCK_LG_SIZE);
+		if (args->max_meta_size != prev_val)
+			fprintf(stderr, "Meta dev size %llu rounded down to %llu bytes\n",
+				prev_val, args->max_meta_size);
+		break;
+	case 'd': /* max-data-size */
+		ret = parse_human(arg, &args->max_data_size);
+		if (ret)
+			return ret;
+		prev_val = args->max_data_size;
+		args->max_data_size = round_down(args->max_data_size, SCOUTFS_BLOCK_SM_SIZE);
+		if (args->max_data_size != prev_val)
+			fprintf(stderr, "Data dev size %llu rounded down to %llu bytes\n",
+				prev_val, args->max_data_size);
+		break;
+	case ARGP_KEY_ARG:
+		if (!args->meta_device)
+			args->meta_device = strdup_or_error(state, arg);
+		else if (!args->data_device)
+			args->data_device = strdup_or_error(state, arg);
+		else
+			argp_error(state, "more than two arguments given");
+		break;
+	case ARGP_KEY_FINI:
+		/* as the getopt parser did, reject 0 and counts above the maximum */
+		if (!args->quorum_count)
+			argp_error(state, "must provide nonzero quorum count with --quorum-count|-Q option");
+		if (args->quorum_count > SCOUTFS_QUORUM_MAX_COUNT)
+			argp_error(state, "quorum count is larger than the supported maximum");
+		if (!args->meta_device)
+			argp_error(state, "no metadata device argument given");
+		if (!args->data_device)
+			argp_error(state, "no data device argument given");
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+static struct argp_option options[] = {
+	{ "quorum-count", 'Q', "NUM", 0, "Number of voters required to use the filesystem [Required]"},
+	{ "force", 'f', NULL, 0, "Overwrite existing data on block devices"},
+	{ "max-meta-size", 'm', "SIZE", 0, "Use a size less than the base metadata device size (bytes or KMGTP units)"},
+	{ "max-data-size", 'd', "SIZE", 0, "Use a size less than the base data device size (bytes or KMGTP units)"},
+	{ NULL }
 };

-static int mkfs_func(int argc, char *argv[])
+static struct argp argp = {
+	options,
+	parse_opt,
+	"META-DEVICE DATA-DEVICE",
+	"Initialize a new ScoutFS filesystem"
+};
+
+static int mkfs_cmd(int argc, char *argv[])
 {
-	unsigned long long ull;
-	u8 quorum_count = 0;
-	u64 max_data_size = 0;
-	u64 max_meta_size = 0;
-	char *end = NULL;
-	char *meta_path;
-	char *data_path;
-	int meta_fd;
-	int data_fd;
+	struct mkfs_args mkfs_args = {0};
 	int ret;
-	int c;

-	while ((c = getopt_long(argc, argv, "Q:D:M:", long_ops, NULL)) != -1) {
-		switch (c) {
-		case 'Q':
-			ull = strtoull(optarg, &end, 0);
-			if (*end != '\0' || ull == 0 ||
-			    ull > SCOUTFS_QUORUM_MAX_COUNT) {
-				printf("scoutfs: invalid quorum count '%s'\n",
-					optarg);
-				return -EINVAL;
-			}
-			quorum_count = ull;
-			break;
-		case 'D':
-			ret = parse_human(optarg, &max_data_size);
-			if (ret < 0) {
-				printf("scoutfs: invalid data device size '%s'\n",
-					optarg);
-				return ret;
-			}
-			break;
-		case 'M':
-			ret = parse_human(optarg, &max_meta_size);
-			if (ret < 0) {
-				printf("scoutfs: invalid meta device size '%s'\n",
-					optarg);
-				return ret;
-			}
-			break;
-		case '?':
-		default:
-			return -EINVAL;
-		}
-	}
-
-	if (optind + 2 != argc) {
-		printf("scoutfs: mkfs: paths to metadata and data devices are required\n");
-		return -EINVAL;
-	}
-
-	meta_path = argv[optind];
-	data_path = argv[optind + 1];
-
-	if (!quorum_count) {
-		printf("provide quorum count with --quorum_count|-Q option\n");
-		return -EINVAL;
-	}
-
-	meta_fd = open(meta_path, O_RDWR | O_EXCL);
-	if (meta_fd < 0) {
-		ret = -errno;
-		fprintf(stderr, "failed to open metadata device '%s': %s (%d)\n",
-			meta_path, strerror(errno), errno);
+	ret = argp_parse(&argp, argc, argv, 0, NULL, &mkfs_args);
+	if (ret)
 		return ret;
-	}

-	data_fd = open(data_path, O_RDWR | O_EXCL);
-	if (data_fd < 0) {
-		ret = -errno;
-		fprintf(stderr, "failed to open data device '%s': %s (%d)\n",
-			data_path, strerror(errno), errno);
-		return ret;
-	}
-
-	ret = write_new_fs(meta_path, data_path, meta_fd, data_fd,
-			   quorum_count, max_meta_size, max_data_size);
-	close(meta_fd);
-	close(data_fd);
-
-	return ret;
+	return do_mkfs(&mkfs_args);
 }

 static void __attribute__((constructor)) mkfs_ctor(void)
 {
-	cmd_register("mkfs", "<path>", "write a new file system", mkfs_func);
+	cmd_register_argp("mkfs", &argp, GROUP_CORE, mkfs_cmd);

 	/* for lack of some other place to put these.. */
 	build_assert(sizeof(uuid_t) == SCOUTFS_UUID_BYTES);
@@ -0,0 +1,161 @@
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <string.h>
+#include <getopt.h>
+#include <assert.h>
+#include <argp.h>
+
+#include "sparse.h"
+#include "util.h"
+#include "format.h"
+#include "ioctl.h"
+#include "cmd.h"
+#include "parse.h"
+
+struct move_blocks_args {
+	char *from_path;	/* first positional argument */
+	u64 from_offset;	/* --from-offset */
+	u64 length;		/* --length */
+	char *to_path;		/* second positional argument */
+	u64 to_offset;		/* --to-offset */
+	/* set when the matching option was parsed; ARGP_KEY_FINI requires all three */
+	unsigned from_off_set:1,
+	         len_set:1,
+	         to_off_set:1;
+};
+
+static int do_move_blocks(struct move_blocks_args *args)
+{
+	struct scoutfs_ioctl_move_blocks mb;
+	int from_fd = -1;
+	int to_fd = -1;
+	int ret;
+
+	from_fd = open(args->from_path, O_RDWR);
+	if (from_fd < 0) {
+		ret = -errno;
+		fprintf(stderr, "failed to open '%s': %s (%d)\n",
+			args->from_path, strerror(errno), errno);
+		goto out;
+	}
+
+	to_fd = open(args->to_path, O_RDWR);
+	if (to_fd < 0) {
+		ret = -errno;
+		fprintf(stderr, "failed to open '%s': %s (%d)\n",
+			args->to_path, strerror(errno), errno);
+		goto out;
+	}
+
+	mb.from_fd = from_fd;
+	mb.from_off = args->from_offset;
+	mb.len = args->length;
+	mb.to_off = args->to_offset;
+
+	ret = ioctl(to_fd, SCOUTFS_IOC_MOVE_BLOCKS, &mb);
+	if (ret < 0) {
+		ret = -errno;
+		fprintf(stderr, "ioctl failed on '%s': %s (%d)\n",
+			args->to_path, strerror(errno), errno);
+	}
+
+out:
+	if (from_fd >= 0)
+		close(from_fd);
+	if (to_fd >= 0)
+		close(to_fd);
+
+	return ret;
+}
+
+static int parse_move_blocks_opts(int key, char *arg, struct argp_state *state)
+{
+	struct move_blocks_args *args = state->input;
+	int ret;
+
+	switch (key) {
+	case 'f':
+		ret = parse_u64(arg, &args->from_offset);
+		if (ret)
+			return ret;
+		args->from_off_set = 1;
+		break;
+	case 'l':
+		ret = parse_human(arg, &args->length);
+		if (ret)
+			return ret;
+		args->len_set = 1;
+		break;
+	case 't':
+		ret = parse_human(arg, &args->to_offset);
+		if (ret)
+			return ret;
+		args->to_off_set = 1;
+		break;
+	case ARGP_KEY_ARG:
+		if (args->to_path)
+			argp_error(state, "more than two file path arguments given");
+		if (args->from_path)
+			args->to_path = strdup_or_error(state, arg);
+		else
+			args->from_path = strdup_or_error(state, arg);
+		break;
+	case ARGP_KEY_FINI:
+		if (!args->from_path)
+			argp_error(state, "must provide from file path");
+		if (!args->to_path)
+			argp_error(state, "must provide to file path");
+		if (!args->from_off_set)
+			argp_error(state, "must provide from file offset --from-offset");
+		if (!args->len_set)
+			argp_error(state, "must provide region length --length");
+		if (!args->to_off_set)
+			argp_error(state, "must provide to file offset --to-offset");
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+static struct argp_option move_blocks_options[] = {
+	{ "from-offset", 'f', "OFFSET", 0,
+	   "Byte offset in from file of region to move [Required]"},
+	{ "length", 'l', "LENGTH", 0,
+	   "Length in bytes of region to move between files [Required]"},
+	{ "to-offset", 't', "OFFSET", 0,
+	   "Byte offset in to file where region will be moved to [Required]"},
+	{ NULL }
+};
+
+static struct argp move_blocks_argp = {
+	move_blocks_options,
+	parse_move_blocks_opts,
+	"FROM_FILE --from-offset OFFSET --length LENGTH TO_FILE --to-offset OFFSET",
+	"Move a fixed-size region of extents from one regular file to another",
+};
+
+static int move_blocks_cmd(int argc, char **argv)
+{
+	struct move_blocks_args args = {NULL};
+	int ret;
+
+	ret = argp_parse(&move_blocks_argp, argc, argv, 0, NULL, &args);
+	if (ret)
+		return ret;
+
+	return do_move_blocks(&args);
+}
+
+static void __attribute__((constructor)) move_blocks_ctor(void)
+{
+	cmd_register_argp("move-blocks", &move_blocks_argp, GROUP_AGENT,
+			  move_blocks_cmd);
+}
@@ -2,6 +2,7 @@
 #define _PARSE_H_

 #include <sys/time.h>
+#include <argp.h>

 int parse_human(char* str, u64 *val_ret);
 int parse_u64(char *str, u64 *val_ret);
@@ -9,4 +10,13 @@ int parse_s64(char *str, s64 *val_ret);
 int parse_u32(char *str, u32 *val_ret);
 int parse_timespec(char *str, struct timespec *ts);

+static inline char* strdup_or_error(const struct argp_state *state, char *str)
+{
+	char *new = strdup(str);
+	if (!new)
+		argp_error(state, "memory allocation failed");
+
+	return new;
+}
+
 #endif
@@ -12,8 +12,10 @@
 #include <sys/socket.h>
 #include <netinet/in.h>
 #include <arpa/inet.h>
+#include <argp.h>

 #include "sparse.h"
+#include "parse.h"
 #include "util.h"
 #include "format.h"
 #include "bitmap.h"
@@ -24,27 +26,6 @@
 #include "srch.h"
 #include "leaf_item_hash.h"

-static void *read_block(int fd, u64 blkno, int shift)
-{
-	size_t size = 1ULL << shift;
-	ssize_t ret;
-	void *buf;
-
-	buf = malloc(size);
-	if (!buf)
-		return NULL;
-
-	ret = pread(fd, buf, size, blkno << shift);
-	if (ret != size) {
-		fprintf(stderr, "read blkno %llu returned %zd: %s (%d)\n",
-			blkno, ret, strerror(errno), errno);
-		free(buf);
-		buf = NULL;
-	}
-
-	return buf;
-}
-
 static void print_block_header(struct scoutfs_block_header *hdr, int size)
 {
 	u32 crc = crc_block(hdr, size);
@@ -465,9 +446,9 @@ static int print_btree_block(int fd, struct scoutfs_super_block *super,
 	int ret;
 	int i;

-	bt = read_block(fd, le64_to_cpu(ref->blkno), SCOUTFS_BLOCK_LG_SHIFT);
-	if (!bt)
-		return -ENOMEM;
+	ret = read_block(fd, le64_to_cpu(ref->blkno), SCOUTFS_BLOCK_LG_SHIFT, (void **)&bt);
+	if (ret)
+		return ret;

 	if (bt->level == level) {
 		printf("%s btree blkno %llu\n"
@@ -559,15 +540,16 @@ static int print_alloc_list_block(int fd, char *str,
 	u64 start;
 	u64 len;
 	int wid;
+	int ret;
 	int i;

 	blkno = le64_to_cpu(ref->blkno);
 	if (blkno == 0)
 		return 0;

-	lblk = read_block(fd, blkno, SCOUTFS_BLOCK_LG_SHIFT);
-	if (!lblk)
-		return -ENOMEM;
+	ret = read_block(fd, blkno, SCOUTFS_BLOCK_LG_SHIFT, (void **)&lblk);
+	if (ret)
+		return ret;

 	printf("%s alloc_list_block blkno %llu\n", str, blkno);
 	print_block_header(&lblk->hdr, SCOUTFS_BLOCK_LG_SIZE);
@@ -617,11 +599,10 @@ static int print_srch_block(int fd, struct scoutfs_srch_ref *ref, int level)
 	if (blkno == 0)
 		return 0;

-	srp = read_block(fd, blkno, SCOUTFS_BLOCK_LG_SHIFT);
-	if (!srp) {
-		ret = -ENOMEM;
+	ret = read_block(fd, blkno, SCOUTFS_BLOCK_LG_SHIFT, (void **)&srp);
+	if (ret)
 		goto out;
-	}
+
 	srb = (void *)srp;

 	printf("srch %sblock blkno %llu\n", level ? "parent " : "", blkno);
@@ -677,7 +658,6 @@ out:
 struct print_recursion_args {
 	struct scoutfs_super_block *super;
 	int fd;
-	u8 __pad[4];
 };

 /* same as fs item but with a small header in the value */
@@ -763,9 +743,9 @@ static int print_btree_leaf_items(int fd, struct scoutfs_super_block *super,
 	if (ref->blkno == 0)
 		return 0;

-	bt = read_block(fd, le64_to_cpu(ref->blkno), SCOUTFS_BLOCK_LG_SHIFT);
-	if (!bt)
-		return -ENOMEM;
+	ret = read_block(fd, le64_to_cpu(ref->blkno), SCOUTFS_BLOCK_LG_SHIFT, (void **)&bt);
+	if (ret)
+		return ret;

 	node = avl_first(&bt->item_root);
 	while (node) {
@@ -828,11 +808,9 @@ static int print_quorum_blocks(int fd, struct scoutfs_super_block *super)
 	for (i = 0; i < SCOUTFS_QUORUM_BLOCKS; i++) {
 		blkno = SCOUTFS_QUORUM_BLKNO + i;
 		free(blk);
-		blk = read_block(fd, blkno, SCOUTFS_BLOCK_SM_SHIFT);
-		if (!blk) {
-			ret = -ENOMEM;
+		ret = read_block(fd, blkno, SCOUTFS_BLOCK_SM_SHIFT, (void **)&blk);
+		if (ret)
 			goto out;
-		}

 		if (blk->voter_rid != 0) {
 			printf("quorum block blkno %llu\n"
@@ -876,11 +854,15 @@ static void print_super_block(struct scoutfs_super_block *super, u64 blkno)

 	uuid_unparse(super->uuid, uuid_str);

+	if (!(le64_to_cpu(super->flags) & SCOUTFS_FLAG_IS_META_BDEV))
+	    fprintf(stderr, /* meta-bdev flag bit is clear: warn, but still print */
+		    "**** Printing metadata from a data device! Did you mean to do this? ****\n");
+
 	printf("super blkno %llu\n", blkno);
 	print_block_header(&super->hdr, SCOUTFS_BLOCK_SM_SIZE);
-	printf("  format_hash %llx uuid %s\n",
-	       le64_to_cpu(super->format_hash), uuid_str);
-	printf("  flags: 0x%016llx\n", super->flags);
+	printf("  version %llx uuid %s\n",
+	       le64_to_cpu(super->version), uuid_str);
+	printf("  flags: 0x%016llx\n", le64_to_cpu(super->flags));

 	server_addr = alloc_addr_str(&super->server_addr);
 	if (!server_addr)
@@ -943,6 +925,10 @@ static void print_super_block(struct scoutfs_super_block *super, u64 blkno)
 	free(server_addr);
 }

+struct print_args {
+	char *meta_device;
+};
+
 static int print_volume(int fd)
 {
 	struct scoutfs_super_block *super = NULL;
@@ -952,9 +938,9 @@ static int print_volume(int fd)
 	int err;
 	int i;

-	super = read_block(fd, SCOUTFS_SUPER_BLKNO, SCOUTFS_BLOCK_SM_SHIFT);
-	if (!super)
-		return -ENOMEM;
+	ret = read_block(fd, SCOUTFS_SUPER_BLKNO, SCOUTFS_BLOCK_SM_SHIFT, (void **)&super);
+	if (ret)
+		return ret;

 	print_super_block(super, SCOUTFS_SUPER_BLKNO);

@@ -1034,23 +1020,16 @@ static int print_volume(int fd)
 	return ret;
 }

-static int print_cmd(int argc, char **argv)
+static int do_print(struct print_args *args)
 {
-	char *path;
 	int ret;
 	int fd;

-	if (argc != 2) {
-		printf("scoutfs print: a single path argument is required\n");
-		return -EINVAL;
-	}
-	path = argv[1];
-
-	fd = open(path, O_RDONLY);
+	fd = open(args->meta_device, O_RDONLY);
 	if (fd < 0) {
 		ret = -errno;
 		fprintf(stderr, "failed to open '%s': %s (%d)\n",
-			path, strerror(errno), errno);
+			args->meta_device, strerror(errno), errno);
 		return ret;
 	}

@@ -1059,8 +1038,49 @@ static int print_cmd(int argc, char **argv)
 	return ret;
 };

+static int parse_opt(int key, char *arg, struct argp_state *state)
+{
+	struct print_args *args = state->input;
+
+	switch (key) {
+	case ARGP_KEY_ARG:
+		if (!args->meta_device)
+			args->meta_device = strdup_or_error(state, arg);
+		else
+			argp_error(state, "more than one argument given");
+		break;
+	case ARGP_KEY_FINI:
+		if (!args->meta_device)
+			argp_error(state, "no metadata device argument given");
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+static struct argp argp = {
+	NULL,
+	parse_opt,
+	"META-DEV",
+	"Print metadata structures"
+};
+
+static int print_cmd(int argc, char **argv)
+{
+	struct print_args print_args = {NULL};
+	int ret;
+
+	ret = argp_parse(&argp, argc, argv, 0, NULL, &print_args);
+	if (ret)
+		return ret;
+
+	return do_print(&print_args);
+}
+
+
 static void __attribute__((constructor)) print_ctor(void)
 {
-	cmd_register("print", "<device>", "print metadata structures",
-			print_cmd);
+	cmd_register_argp("print", &argp, GROUP_DEBUG, print_cmd);
 }
@@ -7,38 +7,36 @@
 #include <fcntl.h>
 #include <errno.h>
 #include <string.h>
-#include <getopt.h>
+#include <argp.h>

 #include "sparse.h"
+#include "parse.h"
 #include "util.h"
 #include "format.h"
 #include "ioctl.h"
 #include "cmd.h"

-static struct option long_ops[] = {
-	{ "name", 1, NULL, 'n' },
-	{ "file", 1, NULL, 'f' },
-	{ NULL, 0, NULL, 0}
-};
-
 /*
 * There are significant constant costs to each search call, we
 * want to get the inodes in as few calls as possible.
 */
 #define BATCH_SIZE 1000000

-static int search_xattrs_cmd(int argc, char **argv)
+struct xattr_args {
+	char *name;
+	char *path;
+};
+
+static int do_search_xattrs(struct xattr_args *args)
 {
-	struct scoutfs_ioctl_search_xattrs sx;
-	char *path = NULL;
-	char *name = NULL;
+	struct scoutfs_ioctl_search_xattrs sx = {0};
 	u64 *inos = NULL;
 	int fd = -1;
 	int ret;
-	int c;
 	int i;

 	memset(&sx, 0, sizeof(sx));
+
 	inos = malloc(BATCH_SIZE * sizeof(inos[0]));
 	if (!inos) {
 		fprintf(stderr, "inos mem alloc failed\n");
@@ -46,56 +44,15 @@ static int search_xattrs_cmd(int argc, char **argv)
 		goto out;
 	}

-	while ((c = getopt_long(argc, argv, "f:n:", long_ops, NULL)) != -1) {
-		switch (c) {
-		case 'f':
-			path = strdup(optarg);
-			if (!path) {
-				fprintf(stderr, "path mem alloc failed\n");
-				ret = -ENOMEM;
-				goto out;
-			}
-			break;
-		case 'n':
-			name = strdup(optarg);
-			if (!name) {
-				fprintf(stderr, "name mem alloc failed\n");
-				ret = -ENOMEM;
-				goto out;
-			}
-			break;
-		case '?':
-		default:
-			ret = -EINVAL;
-			goto out;
-		}
-	}
-
-	if (path == NULL) {
-		fprintf(stderr, "must specify -f path to file\n");
-		ret = -EINVAL;
-		goto out;
-	}
-
-	if (name == NULL) {
-		fprintf(stderr, "must specify -n xattr name to search for\n");
-		ret = -EINVAL;
-		goto out;
-	}
-
-	fd = open(path, O_RDONLY);
-	if (fd < 0) {
-		ret = -errno;
-		fprintf(stderr, "failed to open '%s': %s (%d)\n",
-			path, strerror(errno), errno);
-		goto out;
-	}
+	ret = fd = get_path(args->path, O_RDONLY); /* ret carries the error to out: */
+	if (fd < 0)
+		goto out; /* inos is already allocated; out: frees it */

 	sx.next_ino = 0;
 	sx.last_ino = U64_MAX;
-	sx.name_ptr = (unsigned long)name;
+	sx.name_ptr = (unsigned long)args->name;
 	sx.inodes_ptr = (unsigned long)inos;
-	sx.name_bytes = strlen(name);
+	sx.name_bytes = strlen(args->name);
 	sx.nr_inodes = BATCH_SIZE;

 	do {
@@ -119,16 +76,63 @@ static int search_xattrs_cmd(int argc, char **argv)
 out:
 	if (fd >= 0)
 		close(fd);
-	free(path);
-	free(name);
 	free(inos);

 	return ret;
 };

+static int parse_opt(int key, char *arg, struct argp_state *state)
+{
+	struct xattr_args *args = state->input;
+
+	switch (key) {
+	case 'p':
+		args->path = strdup_or_error(state, arg);
+		break;
+	case ARGP_KEY_ARG:
+		if (args->name)
+			argp_error(state, "more than one name argument given");
+
+		args->name = strdup_or_error(state, arg);
+		break;
+	case ARGP_KEY_FINI:
+		if (!args->name) {
+			argp_error(state, "must provide xattr containing .srch. scoutfs tag");
+		}
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+static struct argp_option options[] = {
+	{ "path", 'p', "PATH", 0, "Path to ScoutFS filesystem"},
+	{ NULL }
+};
+
+static struct argp argp = {
+	options,
+	parse_opt,
+	"XATTR-NAME",
+	"Print inode numbers of inodes which may have given xattr"
+};
+
+static int search_xattrs_cmd(int argc, char **argv)
+{
+
+	struct xattr_args xattr_args = {NULL};
+	int ret;
+
+	ret = argp_parse(&argp, argc, argv, 0, NULL, &xattr_args);
+	if (ret)
+		return ret;
+
+	return do_search_xattrs(&xattr_args);
+}
+
 static void __attribute__((constructor)) search_xattrs_ctor(void)
 {
-	cmd_register("search-xattrs", "-n name -f <path>",
-		     "print inode numbers of inodes which may have given xattr",
-		     search_xattrs_cmd);
+	cmd_register_argp("search-xattrs", &argp, GROUP_INFO, search_xattrs_cmd);
 }
@@ -7,8 +7,9 @@
 #include <fcntl.h>
 #include <errno.h>
 #include <string.h>
-#include <getopt.h>
 #include <assert.h>
+#include <argp.h>
+#include <stdbool.h>

 #include "sparse.h"
 #include "util.h"
@@ -17,83 +18,40 @@
 #include "parse.h"
 #include "cmd.h"

-static struct option long_ops[] = {
-	{ "ctime", 1, NULL, 'c' },
-	{ "data_version", 1, NULL, 'd' },
-	{ "file", 1, NULL, 'f' },
-	{ "offline", 0, NULL, 'o' },
-	{ "i_size", 1, NULL, 's' },
-	{ NULL, 0, NULL, 0}
+struct setattr_args {
+	char *filename;
+	struct timespec ctime;
+	u64 data_version;
+	u64 i_size;
+	bool offline;
 };

-static int setattr_more_cmd(int argc, char **argv)
+static int do_setattr(struct setattr_args *args)
 {
-	struct scoutfs_ioctl_setattr_more sm;
-	struct timespec ctime;
-	char *path = NULL;
-	int ret;
+	struct scoutfs_ioctl_setattr_more sm = {0};
 	int fd = -1;
-	int c;
+	int ret;

-	memset(&sm, 0, sizeof(sm));
-
-	while ((c = getopt_long(argc, argv, "c:d:f:os:", long_ops, NULL)) != -1) {
-		switch (c) {
-		case 'c':
-			ret = parse_timespec(optarg, &ctime);
-			if (ret)
-				goto out;
-			break;
-		case 'd':
-			ret = parse_u64(optarg, &sm.data_version);
-			if (ret)
-				goto out;
-			break;
-		case 'f':
-			path = strdup(optarg);
-			if (!path) {
-				fprintf(stderr, "path mem alloc failed\n");
-				ret = -ENOMEM;
-				goto out;
-			}
-			break;
-		case 'o':
-			sm.flags |= SCOUTFS_IOC_SETATTR_MORE_OFFLINE;
-			break;
-		case 's':
-			ret = parse_u64(optarg, &sm.i_size);
-			if (ret)
-				goto out;
-			break;
-		case '?':
-		default:
-			ret = -EINVAL;
-			goto out;
-		}
-	}
-
-	if (path == NULL) {
-		fprintf(stderr, "must specify -f path to file\n");
-		ret = -EINVAL;
-		goto out;
-	}
-
-	fd = open(path, O_WRONLY);
+	fd = open(args->filename, O_WRONLY);
 	if (fd < 0) {
 		ret = -errno;
 		fprintf(stderr, "failed to open '%s': %s (%d)\n",
-			path, strerror(errno), errno);
+			args->filename, strerror(errno), errno);
 		goto out;
 	}

-	sm.ctime_sec = ctime.tv_sec;
-	sm.ctime_nsec = ctime.tv_nsec;
+	sm.ctime_sec = args->ctime.tv_sec;
+	sm.ctime_nsec = args->ctime.tv_nsec;
+	sm.data_version = args->data_version;
+	if (args->offline)
+		sm.flags |= SCOUTFS_IOC_SETATTR_MORE_OFFLINE;
+	sm.i_size = args->i_size;

 	ret = ioctl(fd, SCOUTFS_IOC_SETATTR_MORE, &sm);
 	if (ret < 0) {
 		ret = -errno;
 		fprintf(stderr, "setattr_more ioctl failed on '%s': "
-			"%s (%d)\n", path, strerror(errno), errno);
+			"%s (%d)\n", args->filename, strerror(errno), errno);
 		goto out;
 	}

@@ -104,9 +62,83 @@ out:
 	return ret;
 }

+static int parse_opt(int key, char *arg, struct argp_state *state)
+{
+	struct setattr_args *args = state->input;
+	int ret;
+
+	switch (key) {
+	case 't': /* timespec */
+		ret = parse_timespec(arg, &args->ctime);
+		if (ret)
+			return ret;
+		break;
+	case 'V': /* data version */
+		ret = parse_u64(arg, &args->data_version);
+		if (ret)
+			return ret;
+		if (args->data_version == 0)
+			argp_error(state, "data version must not be 0");
+		break;
+	case 's': /* size */
+		ret = parse_human(arg, &args->i_size);
+		if (ret)
+			return ret;
+		break;
+	case 'o': /* offline */
+		args->offline = true;
+		break;
+	case ARGP_KEY_ARG:
+		if (!args->filename)
+			args->filename = strdup_or_error(state, arg);
+		else
+			argp_error(state, "more than one argument given");
+		break;
+	case ARGP_KEY_FINI:
+		if (!args->filename)
+			argp_error(state, "no filename given");
+		if (args->i_size && !args->data_version) {
+			argp_error(state, "must provide data-version if using --size option");
+		}
+		if (!args->i_size && args->offline) {
+			argp_error(state, "must provide size if using --offline option");
+		}
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+static struct argp_option options[] = {
+	{ "ctime", 't', "TIMESPEC", 0, "Set creation time using \"<seconds-since-epoch>.<nanoseconds>\" format"},
+	{ "data-version", 'V', "VERSION", 0, "Set data version"},
+	{ "size", 's', "SIZE", 0, "Set file size (bytes or KMGTP units). Requires --data-version"},
+	{ "offline", 'o', NULL, 0, "Set file contents as offline, not sparse. Requires --size"},
+	{ NULL }
+};
+
+static struct argp argp = {
+	options,
+	parse_opt,
+	"FILE",
+	"Set attributes on newly-created zero-length file"
+};
+
+static int setattr_cmd(int argc, char **argv)
+{
+	struct setattr_args setattr_args = {NULL};
+	int ret;
+
+	ret = argp_parse(&argp, argc, argv, 0, NULL, &setattr_args);
+	if (ret)
+		return ret;
+
+	return do_setattr(&setattr_args);
+}
+
 static void __attribute__((constructor)) setattr_more_ctor(void)
 {
-	cmd_register("setattr", "-c ctime -d data_version -o -s i_size -f <path>",
-		     "set attributes on file with no data",  
-		     setattr_more_cmd);
+	cmd_register_argp("setattr", &argp, GROUP_AGENT, setattr_cmd);
 }
@@ -8,82 +8,49 @@
 #include <errno.h>
 #include <string.h>
 #include <limits.h>
+#include <argp.h>

 #include "sparse.h"
 #include "util.h"
 #include "format.h"
 #include "ioctl.h"
+#include "parse.h"
 #include "cmd.h"

-static int stage_cmd(int argc, char **argv)
+struct stage_args {
+	char *archive_path;
+	char *path;
+	u64 data_version;
+	u64 offset;
+	u64 length;
+};
+
+static int do_stage(struct stage_args *args)
 {
-	struct scoutfs_ioctl_stage args;
+	struct scoutfs_ioctl_stage ioctl_args = {0}; /* zeroed like do_release's ioctl args */
 	unsigned int buf_len = 1024 * 1024;
 	unsigned int bytes;
-	char *endptr = NULL;
 	char *buf = NULL;
 	int afd = -1;
 	int fd = -1;
-	u64 offset;
-	u64 count;
-	u64 vers;
 	int ret;

-	if (argc != 6) {
-		fprintf(stderr, "must specify moar args\n");
-		return -EINVAL;
-	}
-
-	fd = open(argv[1], O_RDWR);
-	if (fd < 0) {
-		ret = -errno;
-		fprintf(stderr, "failed to open '%s': %s (%d)\n",
-			argv[1], strerror(errno), errno);
-		return ret;
-	}
-
-	vers = strtoull(argv[2], &endptr, 0);
-	if (*endptr != '\0' ||
-	    ((vers == LLONG_MIN || vers == LLONG_MAX) && errno == ERANGE)) {
-		fprintf(stderr, "error parsing data version '%s'\n",
-			argv[2]);
-		ret = -EINVAL;
-		goto out;
-	}
-
-	offset = strtoull(argv[3], &endptr, 0);
-	if (*endptr != '\0' ||
-	    ((offset == LLONG_MIN || offset == LLONG_MAX) && errno == ERANGE)) {
-		fprintf(stderr, "error parsing offset '%s'\n",
-			argv[3]);
-		ret = -EINVAL;
-		goto out;
-	}
-
-	count = strtoull(argv[4], &endptr, 0);
-	if (*endptr != '\0' ||
-	    ((count == LLONG_MIN || count == LLONG_MAX) && errno == ERANGE)) {
-		fprintf(stderr, "error parsing count '%s'\n",
-			argv[4]);
-		ret = -EINVAL;
-		goto out;
-	}
-
-	if (count > INT_MAX) {
-		fprintf(stderr, "count %llu too large, limited to %d\n",
-			count, INT_MAX);
-		ret = -EINVAL;
-		goto out;
-	}
-
-	afd = open(argv[5], O_RDONLY);
+	afd = open(args->archive_path, O_RDONLY);
 	if (afd < 0) {
 		ret = -errno;
 		fprintf(stderr, "failed to open '%s': %s (%d)\n",
-			argv[5], strerror(errno), errno);
+			args->archive_path, strerror(errno), errno);
 		goto out;
 	}

+	fd = open(args->path, O_RDWR);
+	if (fd < 0) {
+		ret = -errno;
+		fprintf(stderr, "failed to open '%s': %s (%d)\n",
+			args->path, strerror(errno), errno);
+		goto out; /* afd is open by now; exit via out: like the other failure paths */
+	}
+
 	buf = malloc(buf_len);
 	if (!buf) {
 		fprintf(stderr, "couldn't allocate %u byte buffer\n", buf_len);
@@ -91,9 +58,9 @@ static int stage_cmd(int argc, char **argv)
 		goto out;
 	}

-	while (count) {
+	while (args->length) {

-		bytes = min(count, buf_len);
+		bytes = min(args->length, buf_len);

 		ret = read(afd, buf, bytes);
 		if (ret <= 0) {
@@ -105,15 +72,15 @@ static int stage_cmd(int argc, char **argv)

 		bytes = ret;

-		args.data_version = vers;
-		args.buf_ptr = (unsigned long)buf;
-		args.offset = offset;
-		args.count = bytes;
+		ioctl_args.data_version = args->data_version;
+		ioctl_args.buf_ptr = (unsigned long)buf;
+		ioctl_args.offset = args->offset;
+		ioctl_args.length = bytes;

-		count -= bytes;
-		offset += bytes;
+		args->length -= bytes;
+		args->offset += bytes;

-		ret = ioctl(fd, SCOUTFS_IOC_STAGE, &args);
+		ret = ioctl(fd, SCOUTFS_IOC_STAGE, &ioctl_args);
 		if (ret != bytes) {
 			fprintf(stderr, "stage returned %d, not %u: error %s (%d)\n",
 				ret, bytes, strerror(errno), errno);
@@ -132,79 +99,200 @@ out:
 	return ret;
 };

-static void __attribute__((constructor)) stage_ctor(void)
+static int parse_stage_opts(int key, char *arg, struct argp_state *state)
 {
-	cmd_register("stage", "<file> <vers> <offset> <count> <archive file>",
-		     "write archive file contents to offline region", stage_cmd);
+	struct stage_args *args = state->input;
+	int ret;
+
+	switch (key) {
+	case 'V':
+		ret = parse_u64(arg, &args->data_version);
+		if (ret)
+			return ret;
+		break;
+	case 'o': /* offset */
+		ret = parse_human(arg, &args->offset);
+		if (ret)
+			return ret;
+		break;
+	case 'l': /* length */
+		ret = parse_human(arg, &args->length);
+		if (ret)
+			return ret;
+		break;
+	case ARGP_KEY_ARG:
+		if (!args->archive_path)
+			args->archive_path = strdup_or_error(state, arg);
+		else if (!args->path)
+			args->path = strdup_or_error(state, arg);
+		else
+			argp_error(state, "more than two arguments given");
+		break;
+	case ARGP_KEY_FINI:
+		if (!args->archive_path) {
+			argp_error(state, "must provide archive file path");
+		}
+		if (!args->path) {
+			argp_error(state, "must provide to-stage file path");
+		}
+		if (!args->data_version) {
+			argp_error(state, "must provide file version with --data-version");
+		}
+		if (!args->length) {
+			struct stat statbuf = {0};
+			/* no --length given: default to the size of the archive file */
+			ret = stat(args->archive_path, &statbuf);
+			if (ret < 0)
+				argp_failure(state, 1, errno, "Could not get file size");
+
+			args->length = statbuf.st_size;
+		}
+		break;
+	default:
+		break;
+	}
+
+	return 0;
 }

-static int release_cmd(int argc, char **argv)
+static struct argp_option options[] = {
+	{ "data-version", 'V', "VERSION", 0, "Data version of the file [Required]"},
+	{ "offset", 'o', "OFFSET", 0, "Offset (bytes or KMGTP units) in file to stage (default: 0)"},
+	{ "length", 'l', "LENGTH", 0, "Length of range (bytes or KMGTP units) of file to stage. (default: size of ARCHIVE-FILE)"},
+	{ NULL }
+};
+
+static struct argp stage_argp = {
+		options,
+		parse_stage_opts,
+		"ARCHIVE-FILE STAGE-FILE --data-version VERSION",
+		"Write archive file contents to an offline file"
+	};
+
+static int stage_cmd(int argc, char **argv)
 {
-	struct scoutfs_ioctl_release args;
-	char *endptr = NULL;
-	u64 block;
-	u64 count;
-	u64 vers;
+	struct stage_args stage_args = {NULL};
+	int ret;
+
+	ret = argp_parse(&stage_argp, argc, argv, 0, NULL, &stage_args);
+	if (ret)
+		return ret;
+
+	return do_stage(&stage_args);
+}
+
+static void __attribute__((constructor)) stage_ctor(void)
+{
+	cmd_register_argp("stage", &stage_argp, GROUP_AGENT, stage_cmd);
+}
+
+struct release_args {
+	char *path;
+	u64 data_version;
+	u64 offset;
+	u64 length;
+};
+
+static int do_release(struct release_args *args)
+{
+	struct scoutfs_ioctl_release ioctl_args = {0};
 	int ret;
 	int fd;

-	if (argc != 5) {
-		fprintf(stderr, "must specify path, data version, offset, and count\n");
-		return -EINVAL;
-	}
-
-	fd = open(argv[1], O_RDWR);
+	fd = open(args->path, O_RDWR);
 	if (fd < 0) {
 		ret = -errno;
 		fprintf(stderr, "failed to open '%s': %s (%d)\n",
-			argv[1], strerror(errno), errno);
+			args->path, strerror(errno), errno);
 		return ret;
 	}

-	vers = strtoull(argv[2], &endptr, 0);
-	if (*endptr != '\0' ||
-	    ((vers == LLONG_MIN || vers == LLONG_MAX) && errno == ERANGE)) {
-		fprintf(stderr, "error parsing data version '%s'\n",
-			argv[2]);
-		ret = -EINVAL;
-		goto out;
-	}
+	ioctl_args.offset = args->offset;
+	ioctl_args.length = args->length;
+	ioctl_args.data_version = args->data_version;

-	block = strtoull(argv[3], &endptr, 0);
-	if (*endptr != '\0' ||
-	    ((block == LLONG_MIN || block == LLONG_MAX) && errno == ERANGE)) {
-		fprintf(stderr, "error parsing starting 4K block offset '%s'\n",
-			argv[3]);
-		ret = -EINVAL;
-		goto out;
-	}
-
-	count = strtoull(argv[4], &endptr, 0);
-	if (*endptr != '\0' ||
-	    ((count == LLONG_MIN || count == LLONG_MAX) && errno == ERANGE)) {
-		fprintf(stderr, "error parsing length '%s'\n",
-			argv[4]);
-		ret = -EINVAL;
-		goto out;
-	}
-
-	args.block = block;
-	args.count = count;
-	args.data_version = vers;
-
-	ret = ioctl(fd, SCOUTFS_IOC_RELEASE, &args);
+	ret = ioctl(fd, SCOUTFS_IOC_RELEASE, &ioctl_args);
 	if (ret < 0) {
 		ret = -errno;
 		fprintf(stderr, "release ioctl failed: %s (%d)\n",
 			strerror(errno), errno);
 	}
-out:
+
 	close(fd);
 	return ret;
 };

+static int parse_release_opts(int key, char *arg, struct argp_state *state)
+{
+	struct release_args *args = state->input;
+	int ret;
+
+	switch (key) {
+	case 'V':
+		ret = parse_u64(arg, &args->data_version);
+		if (ret)
+			return ret;
+		break;
+	case 'o': /* offset */
+		ret = parse_human(arg, &args->offset);
+		if (ret)
+			return ret;
+		break;
+	case 'l': /* length */
+		ret = parse_human(arg, &args->length);
+		if (ret)
+			return ret;
+		break;
+	case ARGP_KEY_ARG:
+		if (args->path)
+			argp_error(state, "more than one argument given");
+		args->path = strdup_or_error(state, arg);
+		break;
+	case ARGP_KEY_FINI:
+		if (!args->path) {
+			argp_error(state, "must provide file path");
+		}
+		if (!args->data_version) {
+			argp_error(state, "must provide file version --data-version");
+		}
+		if (!args->length) {
+			/* no --length given: use the file size rounded up to SCOUTFS_BLOCK_SM_SIZE */
+			struct stat statbuf = {0};
+
+			ret = stat(args->path, &statbuf);
+			if (ret < 0)
+				argp_failure(state, 1, errno, "Could not get file size");
+
+			args->length = round_up(statbuf.st_size, SCOUTFS_BLOCK_SM_SIZE);
+		}
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+static struct argp release_argp = {
+	options,
+	parse_release_opts,
+	"FILE --data-version VERSION",
+	"Mark file region offline and free extents"
+};
+
+static int release_cmd(int argc, char **argv)
+{
+	struct release_args release_args = {NULL};
+	int ret;
+
+	ret = argp_parse(&release_argp, argc, argv, 0, NULL, &release_args);
+	if (ret)
+		return ret;
+
+	return do_release(&release_args);
+}
+
 static void __attribute__((constructor)) release_ctor(void)
 {
-	cmd_register("release", "<path> <vers> <4K block offset> <block count>",
-		     "mark file region offline and free extents", release_cmd);
+	cmd_register_argp("release", &release_argp, GROUP_AGENT, release_cmd);
 }
@@ -7,10 +7,12 @@
 #include <fcntl.h>
 #include <errno.h>
 #include <string.h>
-#include <getopt.h>
 #include <assert.h>
+#include <argp.h>
+#include <stdbool.h>

 #include "sparse.h"
+#include "parse.h"
 #include "util.h"
 #include "format.h"
 #include "ioctl.h"
@@ -101,12 +103,13 @@ static void print_fs_field(void *st, size_t off)

 typedef void (*print_field_t)(void *st, size_t off);

-static struct option long_ops[] = {
-	{ "single_field", 1, NULL, 's' },
-	{ NULL, 0, NULL, 0}
+struct stat_args {
+	char *path;
+	char *single_field;
+	bool is_inode;
 };

-static int do_stat(int argc, char **argv, int is_inode)
+static int do_stat(struct stat_args *args)
 {
 	union {
 		struct scoutfs_ioctl_stat_more stm;
@@ -115,17 +118,13 @@ static int do_stat(int argc, char **argv, int is_inode)
 	struct stat_more_field *single = NULL;
 	struct stat_more_field *fields;
 	struct stat_more_field *fi;
-	char *single_name = NULL;
 	print_field_t pr = NULL;
-	char *path;
 	int cmd;
 	int ret;
 	int fd;
-	int i;
-	int c;

 	memset(&st, 0, sizeof(st));
-	if (is_inode) {
+	if (args->is_inode) {
 		cmd = SCOUTFS_IOC_STAT_MORE;
 		fields = inode_fields;
 		st.stm.valid_bytes = sizeof(struct scoutfs_ioctl_stat_more);
@@ -137,89 +136,141 @@ static int do_stat(int argc, char **argv, int is_inode)
 		pr = print_fs_field;
 	}

-	while ((c = getopt_long(argc, argv, "s:", long_ops, NULL)) != -1) {
-		switch (c) {
-		case 's':
-			single_name = strdup(optarg);
-			assert(single_name);
-			break;
-		case '?':
-		default:
-			return -EINVAL;
-		}
-	}
-
-	if (single_name) {
+	if (args->single_field) {
 		for_each_field(fi, fields) {
-			if (strcmp(fi->name, single_name) == 0) {
+			if (strcmp(fi->name, args->single_field) == 0) {
 				single = fi;
 				break;
 			}
 		}
 		if (!single) {
-			fprintf(stderr, "unknown field: '%s'\n", single_name);
+			fprintf(stderr, "unknown field: '%s'\n", args->single_field);
 			return -EINVAL;
 		}
 	}

-	if (optind >= argc) {
-		fprintf(stderr, "must specify at least one path argument\n");
-		return -EINVAL;
-	}
+	fd = get_path(args->path, O_RDONLY);
+	if (fd < 0)
+		return fd;

-	for (i = optind; i < argc; i++) {
-		path = argv[i];
-
-		fd = open(path, O_RDONLY);
-		if (fd < 0) {
-			ret = -errno;
-			fprintf(stderr, "failed to open '%s': %s (%d)\n",
-				path, strerror(errno), errno);
-			continue;
-		}
-
-		ret = ioctl(fd, cmd, &st);
-		if (ret < 0) {
-			ret = -errno;
-			fprintf(stderr, "ioctl failed on '%s': "
-				"%s (%d)\n", path, strerror(errno), errno);
-
-		} else if (single) {
-			pr(&st, single->offset);
+	ret = ioctl(fd, cmd, &st);
+	if (ret < 0) {
+		ret = -errno;
+		fprintf(stderr, "ioctl failed: %s (%d)\n", strerror(errno), errno);
+	} else if (single) {
+		pr(&st, single->offset);
+		printf("\n");
+	} else {
+		for_each_field(fi, fields) {
+			printf("%-17s ", fi->name);
+			pr(&st, fi->offset);
 			printf("\n");
-		} else {
-			printf("%-17s %s\n", "path", path);
-			for_each_field(fi, fields) {
-				printf("%-17s ", fi->name);
-				pr(&st, fi->offset);
-				printf("\n");
-			}
 		}
-
-		close(fd);
 	}

 	return 0;
 }

+/*
+ * argp callback for "stat".  Records -s FIELD-NAME and exactly one
+ * positional FILE in the stat_args passed as the parser input; a second
+ * positional or a missing FILE is reported through argp_error().
+ * Always returns 0.
+ */
+static int stat_parse_opt(int key, char *arg, struct argp_state *state)
+{
+	struct stat_args *sa = state->input;
+
+	if (key == 's') {
+		sa->single_field = strdup_or_error(state, arg);
+	} else if (key == ARGP_KEY_ARG) {
+		if (sa->path)
+			argp_error(state, "more than one argument");
+		else
+			sa->path = strdup_or_error(state, arg);
+	} else if (key == ARGP_KEY_FINI) {
+		/* all arguments seen: FILE is mandatory */
+		if (!sa->path)
+			argp_error(state, "missing operand");
+	}
+
+	return 0;
+}
+
+/* Options accepted by "stat"; the key 's' is handled in stat_parse_opt(). */
+static struct argp_option stat_options[] = {
+	{ "single-field", 's', "FIELD-NAME", 0, "Specify single field to print" },
+	{ NULL }	/* terminator */
+};
+
+/*
+ * argp definition for "stat": option table, parser callback, usage
+ * argument string, help description.
+ */
+static struct argp stat_argp = {
+	stat_options,
+	stat_parse_opt,
+	"FILE",
+	"Show ScoutFS extra inode information"
+};
+
 static int stat_more_cmd(int argc, char **argv)
 {
-	return do_stat(argc, argv, 1);
+	struct stat_args stat_args = {NULL};
+	int ret;
+
+	ret = argp_parse(&stat_argp, argc, argv, 0, NULL, &stat_args);
+	if (ret)
+		return ret;
+	stat_args.is_inode = true;
+
+	return do_stat(&stat_args);
 }

+/* Options accepted by "statfs"; keys are handled in statfs_parse_opt(). */
+static struct argp_option statfs_options[] = {
+	{ "path", 'p', "PATH", 0, "Path to ScoutFS filesystem"},
+	{ "single-field", 's', "FIELD-NAME", 0, "Specify single field to print" },
+	{ NULL }	/* terminator */
+};
+
+/*
+ * argp callback for "statfs".  Stores copies of the -p PATH and
+ * -s FIELD-NAME option values in the stat_args passed as the parser
+ * input.  Every other key, positional arguments included, is accepted
+ * without action.  Always returns 0.
+ */
+static int statfs_parse_opt(int key, char *arg, struct argp_state *state)
+{
+	struct stat_args *sa = state->input;
+
+	if (key == 'p')
+		sa->path = strdup_or_error(state, arg);
+	else if (key == 's')
+		sa->single_field = strdup_or_error(state, arg);
+
+	return 0;
+}
+
+/*
+ * argp definition for "statfs": option table, parser callback, usage
+ * argument string (empty: no positionals are documented), help
+ * description.
+ */
+static struct argp statfs_argp = {
+	statfs_options,
+	statfs_parse_opt,
+	"",
+	"Show ScoutFS file system information"
+};
+
 static int statfs_more_cmd(int argc, char **argv)
 {
-	return do_stat(argc, argv, 0);
+	struct stat_args stat_args = {NULL};
+	int ret;
+
+	ret = argp_parse(&statfs_argp, argc, argv, 0, NULL, &stat_args);
+	if (ret)
+		return ret;
+	stat_args.is_inode = false;
+
+	return do_stat(&stat_args);
 }

 static void __attribute__((constructor)) stat_more_ctor(void)
 {
-	cmd_register("stat", "<path>",
-		     "show scoutfs inode information", stat_more_cmd);
+	cmd_register_argp("stat", &stat_argp, GROUP_INFO, stat_more_cmd);
 }

 static void __attribute__((constructor)) statfs_more_ctor(void)
 {
-	cmd_register("statfs", "<path>",
-		     "show scoutfs file system information", statfs_more_cmd);
+	cmd_register_argp("statfs", &statfs_argp, GROUP_INFO, statfs_more_cmd);
 }
@@ -0,0 +1,100 @@
+#define _GNU_SOURCE
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <string.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <wordexp.h>
+
+#include "util.h"
+
+#define ENV_PATH "SCOUTFS_MOUNT_PATH"
+
+/*
+ * Expand path with wordexp() (command substitution refused, undefined
+ * variables rejected) and open the first resulting word with the given
+ * open(2) flags.
+ *
+ * Returns the open file descriptor, or a negative errno after printing
+ * a message to stderr.
+ */
+static int open_path(char *path, int flags)
+{
+	wordexp_t exp_result = {0};
+	int ret;
+
+	ret = wordexp(path, &exp_result, WRDE_NOCMD | WRDE_SHOWERR | WRDE_UNDEF);
+	if (ret) {
+		fprintf(stderr, "wordexp() failure for \"%s\": %d\n", path, ret);
+		/*
+		 * Only an out-of-memory failure can leave partially
+		 * expanded words behind.  After any other failure the
+		 * structure holds nothing that wordfree() may release.
+		 */
+		if (ret == WRDE_NOSPACE)
+			wordfree(&exp_result);
+		return -EINVAL;
+	}
+
+	/* a path such as "" expands to no words; don't hand NULL to open() */
+	if (exp_result.we_wordc == 0) {
+		ret = -ENOENT;
+		fprintf(stderr, "failed to open '%s': %s (%d)\n",
+			path, strerror(ENOENT), ENOENT);
+		goto out;
+	}
+
+	ret = open(exp_result.we_wordv[0], flags);
+	if (ret < 0) {
+		ret = -errno;
+		fprintf(stderr, "failed to open '%s': %s (%d)\n",
+			path, strerror(errno), errno);
+	}
+
+out:
+	wordfree(&exp_result);
+
+	return ret;
+}
+
+/*
+ * Open the path a command should operate on and return its file
+ * descriptor, or a negative errno on failure.  The path is chosen in
+ * this order:
+ *
+ * 1. if path option given, use that
+ * 2. if the SCOUTFS_MOUNT_PATH env var is set, use that
+ * 3. otherwise use the current working directory (not yet verified to
+ *    be inside a scoutfs mount, see the TODO below)
+ */
+int get_path(char *path, int flags)
+{
+	char *env_path;
+	char *cur_dir_path;
+	int ret;
+
+	if (path)
+		return open_path(path, flags);
+
+	/* path is NULL from here on, so open the environment value itself */
+	env_path = getenv(ENV_PATH);
+	if (env_path)
+		return open_path(env_path, flags);
+
+	cur_dir_path = get_current_dir_name();
+	if (!cur_dir_path) {
+		ret = -errno;
+		return ret;
+	}
+
+	ret = open_path(cur_dir_path, flags);
+	free(cur_dir_path);
+
+	// TODO: check this is within a scoutfs mount?
+
+	return ret;
+}
+
+/*
+ * Read one block of (1 << shift) bytes at block number blkno from fd
+ * into a newly malloc()ed buffer.
+ *
+ * On success 0 is returned and *ret_val points to the buffer, which the
+ * caller must free().  On failure *ret_val is NULL and a negative errno
+ * is returned: -ENOMEM if the buffer could not be allocated, the
+ * negated pread() errno if the read failed, -EINVAL on a short read.
+ */
+int read_block(int fd, u64 blkno, int shift, void **ret_val)
+{
+	size_t size = 1ULL << shift;
+	ssize_t got;
+	void *buf;
+	int err;
+
+	*ret_val = NULL;
+
+	buf = malloc(size);
+	if (!buf)
+		return -ENOMEM;
+
+	got = pread(fd, buf, size, blkno << shift);
+	if (got == -1) {
+		/* save errno before fprintf()/free() get a chance to change it */
+		err = errno;
+		fprintf(stderr, "read blkno %llu returned %zd: %s (%d)\n",
+			blkno, got, strerror(err), err);
+		free(buf);
+		return -err;
+	} else if ((size_t)got != size) {
+		fprintf(stderr, "incomplete pread\n");
+		free(buf);
+		return -EINVAL;
+	} else {
+		*ret_val = buf;
+		return 0;
+	}
+}
@@ -111,4 +111,7 @@ static inline int memcmp_lens(const void *a, int a_len,
 	return memcmp(a, b, len) ?: a_len - b_len;
 }

+int get_path(char *path, int flags);
+int read_block(int fd, u64 blkno, int shift, void **ret_val);
+
 #endif
@@ -8,6 +8,8 @@
 #include <errno.h>
 #include <string.h>
 #include <limits.h>
+#include <argp.h>
+#include <stdbool.h>

 #include "sparse.h"
 #include "util.h"
@@ -31,7 +33,16 @@
 	(((ops) & (bit)) ? (str) : ""),			\
 	(((ops) & (bit)) && ((ops) & ~(((bit) << 1) - 1)) ? "," : "")

-static int waiting_cmd(int argc, char **argv)
+
+/* Parsed command line for the "data-waiting" command. */
+struct waiting_args {
+	char *path;		/* path handed to get_path(); may be NULL */
+	bool inode_set;		/* true once an inode option has been parsed */
+	u64 inode;		/* copied into the ioctl's after_ino */
+	bool blkno_set;		/* true once a block option has been parsed */
+	u64 blkno;		/* copied into the ioctl's after_iblock */
+};
+
+static int do_waiting(struct waiting_args *args)
 {
 	struct scoutfs_ioctl_data_waiting_entry dwe[16];
 	struct scoutfs_ioctl_data_waiting idw;
@@ -39,25 +50,13 @@ static int waiting_cmd(int argc, char **argv)
 	int fd;
 	int i;

-	if (argc != 4) {
-		fprintf(stderr, "must specify ino, iblock, and path\n");
-		return -EINVAL;
-	}
-
-	ret = parse_u64(argv[1], &idw.after_ino) ?:
-	      parse_u64(argv[2], &idw.after_iblock);
-	if (ret)
-		return ret;
-
-	fd = open(argv[3], O_RDONLY);
-	if (fd < 0) {
-		ret = -errno;
-		fprintf(stderr, "failed to open '%s': %s (%d)\n",
-			argv[3], strerror(errno), errno);
-		return ret;
-	}
+	fd = get_path(args->path, O_RDONLY);
+	if (fd < 0)
+		return fd;

 	idw.flags = 0;
+	idw.after_ino = args->inode;
+	idw.after_iblock = args->blkno;
 	idw.ents_ptr = (unsigned long)dwe;
 	idw.ents_nr = array_size(dwe);

@@ -91,59 +90,114 @@ static int waiting_cmd(int argc, char **argv)
 	return ret;
 };

-static void __attribute__((constructor)) waiting_ctor(void)
+static int waiting_parse_opt(int key, char *arg, struct argp_state *state)
 {
-	cmd_register("data-waiting", "<ino> <iblock> <path>",
-		     "print ops waiting for data blocks", waiting_cmd);
-}
-
-static int data_wait_err_cmd(int argc, char **argv)
-{
-	struct scoutfs_ioctl_data_wait_err args;
-	int fd = -1;
+	struct waiting_args *args = state->input;
 	int ret;

-	memset(&args, 0, sizeof(args));
-
-	if (argc != 8) {
-		fprintf(stderr, "must specify path, ino, version, offset, count,op, and err\n");
-		return -EINVAL;
+	switch (key) {
+	case 'p':
+		args->path = strdup_or_error(state, arg);
+		break;
+	case 'I': /* inode */
+		ret = parse_u64(arg, &args->inode);
+		if (ret)
+			argp_error(state, "inode parse error");
+		args->inode_set = true;
+		break;
+	case 'B': /* blkno */
+		ret = parse_u64(arg, &args->blkno);
+		if (ret)
+			argp_error(state, "blkno parse error");
+		args->blkno_set = true;
+		break;
+	case ARGP_KEY_FINI:
+		if (!args->inode_set)
+			argp_error(state, "no inode given");
+		if (!args->blkno_set)
+			argp_error(state, "no blkno given");
+		break;
+	default:
+		break;
 	}

-	ret = parse_u64(argv[2], &args.ino) ?:
-	      parse_u64(argv[3], &args.data_version) ?:
-	      parse_u64(argv[4], &args.offset) ?:
-	      parse_u64(argv[5], &args.count) ?:
-	      parse_s64(argv[7], &args.err);
+	return 0;
+}
+
+/* Options accepted by "data-waiting"; keys are handled in waiting_parse_opt(). */
+static struct argp_option waiting_options[] = {
+	{ "path", 'p', "PATH", 0, "Path to ScoutFS filesystem"},
+	{ "inode", 'I', "INODE-NUM", 0, "Inode number [Required]"},
+	{ "block", 'B', "BLKNO-NUM", 0, "Block number [Required]"},
+	{ NULL }	/* terminator */
+};
+
+/*
+ * argp definition for "data-waiting": option table, parser callback,
+ * usage argument string, help description.
+ */
+static struct argp waiting_argp = {
+	waiting_options,
+	waiting_parse_opt,
+	"--inode INODE-NUM --block BLOCK-NUM",
+	"Print operations waiting for data blocks"
+};
+
+static int waiting_cmd(int argc, char **argv)
+{
+	struct waiting_args waiting_args = {NULL};
+	int ret;
+
+	ret = argp_parse(&waiting_argp, argc, argv, 0, NULL, &waiting_args);
 	if (ret)
 		return ret;

-	if ((args.err >= 0) || (args.err < -MAX_ERRNO)) {
-		fprintf(stderr, "err %lld invalid\n", args.err);
-		ret = -EINVAL;
-		goto out;
-	}
+	return do_waiting(&waiting_args);
+}

-	if (!strcmp(argv[6], "read")) {
-		args.op = SCOUTFS_IOC_DWO_READ;
-	} else if (!strcmp(argv[6], "write")) {
-		args.op = SCOUTFS_IOC_DWO_WRITE;
-	} else if (!strcmp(argv[6], "change_size")) {
-		args.op = SCOUTFS_IOC_DWO_CHANGE_SIZE;
+static void __attribute__((constructor)) waiting_ctor(void)
+{
+	cmd_register_argp("data-waiting", &waiting_argp, GROUP_AGENT, waiting_cmd);
+}
+
+/*
+ * Parsed command line for the "data-wait-err" command.  Each *_set flag
+ * records that the matching option was seen so that a missing option
+ * can be told apart from a value of zero.
+ */
+struct wait_err_args {
+	char *path;		/* path handed to get_path(); may be NULL */
+	bool inode_set;
+	u64 inode;		/* copied into the ioctl's ino */
+	bool version_set;
+	u64 version;		/* copied into the ioctl's data_version */
+	bool offset_set;
+	u64 offset;
+	bool count_set;
+	u64 count;
+	char *op;		/* "read", "write" or "change_size" */
+	bool err_set;
+	s64 err;		/* negative errno, no smaller than -MAX_ERRNO */
+};
+
+static int do_wait_err(struct wait_err_args *args)
+{
+	struct scoutfs_ioctl_data_wait_err dwe = {0};
+	int fd = -1;
+	int ret;
+
+
+	dwe.ino = args->inode;
+	dwe.data_version = args->version;
+	dwe.offset = args->offset;
+	dwe.count = args->count;
+	if (!strcmp(args->op, "read")) {
+		dwe.op = SCOUTFS_IOC_DWO_READ;
+	} else if (!strcmp(args->op, "write")) {
+		dwe.op = SCOUTFS_IOC_DWO_WRITE;
+	} else if (!strcmp(args->op, "change_size")) {
+		dwe.op = SCOUTFS_IOC_DWO_CHANGE_SIZE;
 	} else {
-		fprintf(stderr, "invalid data wait op: '%s'\n", argv[6]);
+		fprintf(stderr, "invalid data wait op: '%s'\n", args->op);
 		return -EINVAL;
 	}
+	dwe.err = args->err;

-	fd = open(argv[1], O_RDONLY);
-	if (fd < 0) {
-		ret = -errno;
-		fprintf(stderr, "failed to open '%s': %s (%d)\n",
-			argv[1], strerror(errno), errno);
-		return ret;
-	}
+	fd = get_path(args->path, O_RDONLY);
+	if (fd < 0)
+		return fd;

-	ret = ioctl(fd, SCOUTFS_IOC_DATA_WAIT_ERR, &args);
+	ret = ioctl(fd, SCOUTFS_IOC_DATA_WAIT_ERR, &dwe);
 	if (ret < 0) {
 		fprintf(stderr, "data_wait_err returned %d: error %s (%d)\n",
 			ret, strerror(errno), errno);
@@ -158,9 +212,104 @@ out:
 	return ret;
 };

+/*
+ * argp callback for "data-wait-err".  Parses each option into the
+ * wait_err_args passed as the parser input and marks it as seen.  Once
+ * all arguments have been processed (ARGP_KEY_FINI) every required
+ * option is checked and a missing one is reported with argp_error().
+ * Always returns 0; malformed values are also reported with
+ * argp_error().
+ */
+static int wait_err_parse_opt(int key, char *arg, struct argp_state *state)
+{
+	struct wait_err_args *args = state->input;
+	int ret;
+
+	switch (key) {
+	case 'p':
+		args->path = strdup_or_error(state, arg);
+		break;
+	case 'I': /* inode */
+		ret = parse_u64(arg, &args->inode);
+		if (ret)
+			argp_error(state, "inode parse error");
+		args->inode_set = true;
+		break;
+	case 'V': /* version */
+		ret = parse_u64(arg, &args->version);
+		if (ret)
+			argp_error(state, "version parse error");
+		args->version_set = true;
+		break;
+	case 'F': /* offset */
+		ret = parse_human(arg, &args->offset);
+		if (ret)
+			argp_error(state, "offset parse error");
+		args->offset_set = true;
+		break;
+	case 'C': /* count */
+		ret = parse_u64(arg, &args->count);
+		if (ret)
+			argp_error(state, "count parse error");
+		args->count_set = true;
+		break;
+	case 'O': /* op */
+		args->op = strdup_or_error(state, arg);
+		break;
+	case 'E': /* err */
+		ret = parse_s64(arg, &args->err);
+		if (ret)
+			argp_error(state, "error parse error");
+		/* only negative errno values down to -MAX_ERRNO are accepted */
+		if ((args->err >= 0) || (args->err < -MAX_ERRNO))
+			argp_error(state, "errno out of range");
+		args->err_set = true;
+		break;
+	case ARGP_KEY_FINI:
+		if (!args->inode_set)
+			argp_error(state, "no inode given");
+		if (!args->version_set)
+			argp_error(state, "no version given");
+		if (!args->offset_set)
+			argp_error(state, "no offset given");
+		if (!args->count_set)
+			argp_error(state, "no count given");
+		if (!args->op)
+			argp_error(state, "no operation given");
+		if (!args->err_set)
+			argp_error(state, "no error given");
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+/* Options accepted by "data-wait-err"; keys are handled in wait_err_parse_opt(). */
+static struct argp_option wait_err_options[] = {
+	{ "path", 'p', "PATH", 0, "Path to ScoutFS filesystem"},
+	{ "inode", 'I', "INODE-NUM", 0, "Inode number [Required]"},
+	{ "version", 'V', "VER-NUM", 0, "Version [Required]"},
+	{ "offset", 'F', "OFF-NUM", 0, "Offset (bytes or KMGTP units) [Required]"},
+	{ "count", 'C', "COUNT", 0, "Count [Required]"},
+	{ "op", 'O', "OP", 0, "Operation: \"read\", \"write\", \"change_size\" [Required]"},
+	{ "err", 'E', "ERR", 0, "Error [Required]"},
+	{ NULL }	/* terminator */
+};
+
+/*
+ * argp definition for "data-wait-err": option table, parser callback,
+ * usage argument string, help description.
+ */
+static struct argp wait_err_argp = {
+	wait_err_options,
+	wait_err_parse_opt,
+	"--inode INODE-NUM --version VER-NUM "
+	"--offset OFF-NUM --count COUNT --op OP --err ERR",
+	"Return error from matching waiters"
+};
+
+/*
+ * Entry point for the "data-wait-err" command: parse the command line
+ * into a wait_err_args and, if parsing succeeded, issue the request.
+ * Returns the argp_parse() error if there was one, otherwise the
+ * result of do_wait_err().
+ */
+static int wait_err_cmd(int argc, char **argv)
+{
+	struct wait_err_args parsed = {NULL};
+	int err = argp_parse(&wait_err_argp, argc, argv, 0, NULL, &parsed);
+
+	return err ? err : do_wait_err(&parsed);
+}
+
+
 static void __attribute__((constructor)) data_wait_err_ctor(void)
 {
-	cmd_register("data-wait-err", "<path> <ino> <vers> <offset> <count> <op> <err>",
-		     "return error from matching waiters",
-		     data_wait_err_cmd);
+	cmd_register_argp("data-wait-err", &wait_err_argp, GROUP_AGENT, wait_err_cmd);
 }
@@ -8,8 +8,10 @@
 #include <errno.h>
 #include <string.h>
 #include <limits.h>
+#include <argp.h>

 #include "sparse.h"
+#include "parse.h"
 #include "util.h"
 #include "format.h"
 #include "ioctl.h"
@@ -66,7 +68,14 @@ static int parse_walk_entry(struct scoutfs_ioctl_walk_inodes_entry *ent,
 	return 0;
 }

-static int walk_inodes_cmd(int argc, char **argv)
+/* Parsed command line for the "walk-inodes" command. */
+struct walk_inodes_args {
+	char *path;		/* path handed to get_path(); may be NULL */
+	char *index;		/* first positional: "meta_seq" or "data_seq" */
+	char *first_entry;	/* second positional, parsed by parse_walk_entry() */
+	char *last_entry;	/* third positional, parsed by parse_walk_entry() */
+};
+
+static int do_walk_inodes(struct walk_inodes_args *args)
 {
 	struct scoutfs_ioctl_walk_inodes_entry ents[128];
 	struct scoutfs_ioctl_walk_inodes walk;
@@ -75,44 +84,35 @@ static int walk_inodes_cmd(int argc, char **argv)
 	int fd;
 	int i;

-	if (argc != 5) {
-		fprintf(stderr, "must specify seq and path\n");
-		return -EINVAL;
-	}
-
-	if (!strcasecmp(argv[1], "meta_seq"))
+	if (!strcasecmp(args->index, "meta_seq"))
 		walk.index = SCOUTFS_IOC_WALK_INODES_META_SEQ;
-	else if (!strcasecmp(argv[1], "data_seq"))
+	else if (!strcasecmp(args->index, "data_seq"))
 		walk.index = SCOUTFS_IOC_WALK_INODES_DATA_SEQ;
 	else {
 		fprintf(stderr, "unknown index '%s', try 'meta_seq' or "
-				"'data_seq'\n", argv[1]);
+				"'data_seq'\n", args->index);
 		return -EINVAL;
 	}

-	ret = parse_walk_entry(&walk.first, argv[2]);
+	ret = parse_walk_entry(&walk.first, args->first_entry);
 	if (ret) {
 		fprintf(stderr, "invalid first position '%s', try '1.2.3' or "
-			"'-1'\n", argv[2]);
+			"'-1'\n", args->first_entry);
 		return -EINVAL;

 	}

-	ret = parse_walk_entry(&walk.last, argv[3]);
+	ret = parse_walk_entry(&walk.last, args->last_entry);
 	if (ret) {
 		fprintf(stderr, "invalid last position '%s', try '1.2.3' or "
-			"'-1'\n", argv[3]);
+			"'-1'\n", args->last_entry);
 		return -EINVAL;

 	}

-	fd = open(argv[4], O_RDONLY);
-	if (fd < 0) {
-		ret = -errno;
-		fprintf(stderr, "failed to open '%s': %s (%d)\n",
-			argv[4], strerror(errno), errno);
-		return ret;
-	}
+	fd = get_path(args->path, O_RDONLY);
+	if (fd < 0)
+		return fd;

 	walk.entries_ptr = (unsigned long)ents;
 	walk.nr_entries = array_size(ents);
@@ -149,8 +149,65 @@ static int walk_inodes_cmd(int argc, char **argv)
 	return ret;
 };

+/*
+ * argp callback for "walk-inodes".  Stores -p PATH and up to three
+ * positionals (index, first entry, last entry, in that order) in the
+ * walk_inodes_args passed as the parser input.  A fourth positional or
+ * a missing one is reported through argp_error().  Always returns 0.
+ */
+static int walk_inodes_parse_opt(int key, char *arg, struct argp_state *state)
+{
+	struct walk_inodes_args *wia = state->input;
+	char **slot = NULL;
+
+	if (key == 'p') {
+		wia->path = strdup_or_error(state, arg);
+	} else if (key == ARGP_KEY_ARG) {
+		/* pick the first positional that has not been filled yet */
+		if (!wia->index)
+			slot = &wia->index;
+		else if (!wia->first_entry)
+			slot = &wia->first_entry;
+		else if (!wia->last_entry)
+			slot = &wia->last_entry;
+
+		if (slot)
+			*slot = strdup_or_error(state, arg);
+		else
+			argp_error(state, "more than three arguments given");
+	} else if (key == ARGP_KEY_FINI) {
+		if (!wia->index)
+			argp_error(state, "no index given");
+		if (!wia->first_entry)
+			argp_error(state, "no first entry given");
+		if (!wia->last_entry)
+			argp_error(state, "no last entry given");
+	}
+
+	return 0;
+}
+
+/* Options accepted by "walk-inodes"; the key 'p' is handled in walk_inodes_parse_opt(). */
+static struct argp_option options[] = {
+	{ "path", 'p', "PATH", 0, "Path to ScoutFS filesystem"},
+	{ NULL }	/* terminator */
+};
+
+/*
+ * argp definition for "walk-inodes": option table, parser callback,
+ * usage argument string, help description.
+ */
+static struct argp argp = {
+	options,
+	walk_inodes_parse_opt,
+	"<meta_seq|data_seq> FIRST-ENTRY LAST-ENTRY",
+	"Print range of indexed inodes"
+};
+
+/*
+ * Entry point for the "walk-inodes" command: parse the command line
+ * into a walk_inodes_args and, if parsing succeeded, run the walk.
+ * Returns the argp_parse() error if there was one, otherwise the
+ * result of do_walk_inodes().
+ */
+static int walk_inodes_cmd(int argc, char **argv)
+{
+	struct walk_inodes_args parsed = {NULL};
+	int err = argp_parse(&argp, argc, argv, 0, NULL, &parsed);
+
+	return err ? err : do_walk_inodes(&parsed);
+}
+
+
 static void __attribute__((constructor)) walk_inodes_ctor(void)
 {
-	cmd_register("walk-inodes", "<index> <first> <last> <path>",
-		     "print range of indexed inodes", walk_inodes_cmd);
+	cmd_register_argp("walk-inodes", &argp, GROUP_SEARCH, walk_inodes_cmd);
 }