Add inode crtime creation time

Add an inode creation time field. It's created for all new inodes. It's visible to stat_more. setattr_more can set it during restore. Signed-off-by: Zach Brown <zab@versity.com>
2026-06-09 21:22:36 +00:00 · 2021-07-08 11:00:30 -07:00
38 changed files with 330 additions and 891 deletions
@@ -261,17 +261,20 @@ static bool invalid_extent(u64 start, u64 end, u64 first, u64 last)

+/*
+ * Check a single metadata block number against the metadata bounds
+ * stored in the super block held in the scoutfs sb info.  The block is
+ * handed to invalid_extent() as the one-block extent [blkno, blkno]
+ * together with the super's first_meta_blkno and last_meta_blkno, and
+ * invalid_extent()'s result is returned unchanged.
+ */
 static bool invalid_meta_blkno(struct super_block *sb, u64 blkno)
 {
-	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
-	u64 last_meta = (i_size_read(sbi->meta_bdev->bd_inode) >> SCOUTFS_BLOCK_LG_SHIFT) - 1;
+	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;

-	return invalid_extent(blkno, blkno, SCOUTFS_META_DEV_START_BLKNO, last_meta);
+	return invalid_extent(blkno, blkno,
+			      le64_to_cpu(super->first_meta_blkno),
+			      le64_to_cpu(super->last_meta_blkno));
 }

+/*
+ * Check a data extent of len blocks starting at start against the data
+ * bounds stored in the super block held in the scoutfs sb info.  The
+ * extent is handed to invalid_extent() with an inclusive last block of
+ * start + len - 1, together with the super's first_data_blkno and
+ * last_data_blkno, and invalid_extent()'s result is returned unchanged.
+ *
+ * NOTE(review): a len of 0 makes the inclusive end start - 1; presumably
+ * callers never pass 0 or invalid_extent() rejects it -- confirm.
+ */
 static bool invalid_data_extent(struct super_block *sb, u64 start, u64 len)
 {
-	u64 last_data = (i_size_read(sb->s_bdev->bd_inode) >> SCOUTFS_BLOCK_SM_SHIFT) - 1;
+	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;

-	return invalid_extent(start, start + len - 1, SCOUTFS_DATA_DEV_START_BLKNO, last_data);
+	return invalid_extent(start, start + len - 1,
+			      le64_to_cpu(super->first_data_blkno),
+			      le64_to_cpu(super->last_data_blkno));
 }

 void scoutfs_alloc_init(struct scoutfs_alloc *alloc,
@@ -977,39 +980,6 @@ int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
 	return ret;
 }

-/*
- * Add new free space to an allocator.  _ext_insert will make sure that it doesn't
- * overlap with any existing extents.  This is done by the server in a transaction that
- * also updates total_*_blocks in the super so we don't verify.
- */
-int scoutfs_alloc_insert(struct super_block *sb, struct scoutfs_alloc *alloc,
-			 struct scoutfs_block_writer *wri, struct scoutfs_alloc_root *root,
-			 u64 start, u64 len)
-{
-	struct alloc_ext_args args = {
-		.alloc = alloc,
-		.wri = wri,
-		.root = root,
-		.zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE,
-	};
-
-	return scoutfs_ext_insert(sb, &alloc_ext_ops, &args, start, len, 0, 0);
-}
-
-int scoutfs_alloc_remove(struct super_block *sb, struct scoutfs_alloc *alloc,
-			 struct scoutfs_block_writer *wri, struct scoutfs_alloc_root *root,
-			 u64 start, u64 len)
-{
-	struct alloc_ext_args args = {
-		.alloc = alloc,
-		.wri = wri,
-		.root = root,
-		.zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE,
-	};
-
-	return scoutfs_ext_remove(sb, &alloc_ext_ops, &args, start, len);
-}
-
 /*
 * We only trim one block, instead of looping trimming all, because the
 * caller is assuming that we do a fixed amount of work when they check
@@ -1056,31 +1026,18 @@ out:
 }

 /*
- * True if the allocator has enough blocks in the avail list and space
- * in the freed list to be able to perform the callers operations.  If
- * false the caller should back off and return partial progress rather
- * than completely exhausting the avail list or overflowing the freed
- * list.
+ * True if the allocator has enough free blocks to cow (alloc and free)
+ * a list block and all the btree blocks that store extent items.
 *
- * An extent modification dirties three distinct leaves of an allocator
- * btree as it adds and removes the blkno and size sorted items for the
- * old and new lengths of the extent.  Dirtying the paths to these
- * leaves can grow the tree and grow/shrink neighbours at each level.
- * We over-estimate the number of blocks allocated and freed (the paths
- * share a root, growth doesn't free) to err on the simpler and safer
- * side.  The overhead is minimal given the relatively large list blocks
- * and relatively short allocator trees.
- *
- * The caller tells us how many extents they're about to modify and how
- * many other additional blocks they may cow manually.  And finally, the
- * caller could be the first to dirty the avail and freed blocks in the
- * allocator,
+ * At most, an extent operation can dirty down three paths of the tree
+ * to modify a blkno item and two distant order items.  We can grow and
+ * split the root, and then those three paths could share blocks but each
+ * modify two leaf blocks.
 */
-static bool list_has_blocks(struct super_block *sb, struct scoutfs_alloc *alloc,
-			    struct scoutfs_alloc_root *root, u32 extents, u32 addl_blocks)
+static bool list_can_cow(struct super_block *sb, struct scoutfs_alloc *alloc,
+			 struct scoutfs_alloc_root *root)
 {
-	u32 tree_blocks = (((1 + root->root.height) * 2) * 3) * extents;
-	u32 most = 1 + tree_blocks + addl_blocks;
+	u32 most = 1 + (1 + 1 + (3 * (1 - root->root.height + 1)));

 	if (le32_to_cpu(alloc->avail.first_nr) < most) {
 		scoutfs_inc_counter(sb, alloc_list_avail_lo);
@@ -1144,7 +1101,8 @@ int scoutfs_alloc_fill_list(struct super_block *sb,
 		goto out;
 	lblk = bl->data;

-	while (le32_to_cpu(lblk->nr) < target && list_has_blocks(sb, alloc, root, 1, 0)) {
+	while (le32_to_cpu(lblk->nr) < target &&
+	       list_can_cow(sb, alloc, root)) {

 		ret = scoutfs_ext_alloc(sb, &alloc_ext_ops, &args, 0, 0,
 					target - le32_to_cpu(lblk->nr), &ext);
@@ -1188,7 +1146,7 @@ int scoutfs_alloc_empty_list(struct super_block *sb,
 	if (WARN_ON_ONCE(lhead_in_alloc(alloc, lhead)))
 		return -EINVAL;

-	while (lhead->ref.blkno && list_has_blocks(sb, alloc, args.root, 1, 1)) {
+	while (lhead->ref.blkno && list_can_cow(sb, alloc, args.root)) {

 		if (lhead->first_nr == 0) {
 			ret = trim_empty_first_block(sb, alloc, wri, lhead);
@@ -132,12 +132,6 @@ int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
 		       struct scoutfs_alloc_root *dst,
 		       struct scoutfs_alloc_root *src, u64 total,
 		       __le64 *exclusive, __le64 *vacant, u64 zone_blocks);
-int scoutfs_alloc_insert(struct super_block *sb, struct scoutfs_alloc *alloc,
-			 struct scoutfs_block_writer *wri, struct scoutfs_alloc_root *root,
-			 u64 start, u64 len);
-int scoutfs_alloc_remove(struct super_block *sb, struct scoutfs_alloc *alloc,
-			 struct scoutfs_block_writer *wri, struct scoutfs_alloc_root *root,
-			 u64 start, u64 len);

 int scoutfs_alloc_fill_list(struct super_block *sb,
 			    struct scoutfs_alloc *alloc,
@@ -645,11 +645,9 @@ static struct block_private *block_read(struct super_block *sb, u64 blkno)
 			goto out;
 	}

-	wait_event(binf->waitq, uptodate_or_error(bp));
-	if (test_bit(BLOCK_BIT_ERROR, &bp->bits))
+	ret = wait_event_interruptible(binf->waitq, uptodate_or_error(bp));
+	if (ret == 0 && test_bit(BLOCK_BIT_ERROR, &bp->bits))
 		ret = -EIO;
-	else
-		ret = 0;

 out:
 	if (ret < 0) {
@@ -297,14 +297,6 @@ int scoutfs_client_clear_volopt(struct super_block *sb, struct scoutfs_volume_op
 					volopt, sizeof(*volopt), NULL, 0);
 }

-int scoutfs_client_resize_devices(struct super_block *sb, struct scoutfs_net_resize_devices *nrd)
-{
-	struct client_info *client = SCOUTFS_SB(sb)->client_info;
-
-	return scoutfs_net_sync_request(sb, client->conn, SCOUTFS_NET_CMD_RESIZE_DEVICES,
-					nrd, sizeof(*nrd), NULL, 0);
-}
-
 /* The client is receiving a invalidation request from the server */
 static int client_lock(struct super_block *sb,
 		       struct scoutfs_net_connection *conn, u8 cmd, u64 id,
@@ -631,8 +623,10 @@ void scoutfs_client_destroy(struct super_block *sb)
 						 client_farewell_response,
 						 NULL, NULL);
 		if (ret == 0) {
-			wait_for_completion(&client->farewell_comp);
-			ret = client->farewell_error;
+			ret = wait_for_completion_interruptible(
+							&client->farewell_comp);
+			if (ret == 0)
+				ret = client->farewell_error;
 		}
 		if (ret) {
 			scoutfs_inc_counter(sb, client_farewell_error);
@@ -33,7 +33,6 @@ int scoutfs_client_open_ino_map(struct super_block *sb, u64 group_nr,
 int scoutfs_client_get_volopt(struct super_block *sb, struct scoutfs_volume_options *volopt);
 int scoutfs_client_set_volopt(struct super_block *sb, struct scoutfs_volume_options *volopt);
 int scoutfs_client_clear_volopt(struct super_block *sb, struct scoutfs_volume_options *volopt);
-int scoutfs_client_resize_devices(struct super_block *sb, struct scoutfs_net_resize_devices *nrd);

 int scoutfs_client_setup(struct super_block *sb);
 void scoutfs_client_destroy(struct super_block *sb);
@@ -88,6 +88,7 @@
 	EXPAND_COUNTER(forest_read_items)			\
 	EXPAND_COUNTER(forest_roots_next_hint)			\
 	EXPAND_COUNTER(forest_set_bloom_bits)			\
+	EXPAND_COUNTER(inode_evict_intr)			\
 	EXPAND_COUNTER(item_clear_dirty)			\
 	EXPAND_COUNTER(item_create)				\
 	EXPAND_COUNTER(item_delete)				\
@@ -207,7 +207,6 @@ static s64 truncate_extents(struct super_block *sb, struct inode *inode,
 	u64 offset;
 	s64 ret;
 	u8 flags;
-	int err;
 	int i;

 	flags = offline ? SEF_OFFLINE : 0;
@@ -247,18 +246,6 @@ static s64 truncate_extents(struct super_block *sb, struct inode *inode,
 		tr.len = min(ext.len - offset, last - iblock + 1);
 		tr.flags = ext.flags;

-		trace_scoutfs_data_extent_truncated(sb, ino, &tr);
-
-		ret = scoutfs_ext_set(sb, &data_ext_ops, &args,
-				      tr.start, tr.len, 0, flags);
-		if (ret < 0) {
-			if (WARN_ON_ONCE(ret == -EINVAL)) {
-				scoutfs_err(sb, "unexpected truncate inconsistency: ino %llu iblock %llu last %llu, start %llu len %llu",
-					    ino, iblock, last, tr.start, tr.len);
-			}
-			break;
-		}
-
 		if (tr.map) {
 			mutex_lock(&datinf->mutex);
 			ret = scoutfs_free_data(sb, datinf->alloc,
@@ -266,16 +253,16 @@ static s64 truncate_extents(struct super_block *sb, struct inode *inode,
 						&datinf->data_freed,
 						tr.map, tr.len);
 			mutex_unlock(&datinf->mutex);
-			if (ret < 0) {
-				err = scoutfs_ext_set(sb, &data_ext_ops, &args,
-						      tr.start, tr.len, tr.map, tr.flags);
-				if (err < 0)
-					scoutfs_err(sb, "truncate err %d restoring extent after error %lld: ino %llu start %llu len %llu",
-						    err, ret, ino, tr.start, tr.len);
+			if (ret < 0)
 				break;
-			}
 		}

+		trace_scoutfs_data_extent_truncated(sb, ino, &tr);
+
+		ret = scoutfs_ext_set(sb, &data_ext_ops, &args,
+				      tr.start, tr.len, 0, flags);
+		BUG_ON(ret);  /* inconsistent, could prealloc items */
+
 		iblock += tr.len;
 	}

@@ -1031,10 +1018,8 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 			end = (iblock + ret) << SCOUTFS_BLOCK_SM_SHIFT;
 			if (end > offset + len)
 				end = offset + len;
-			if (end > i_size_read(inode)) {
+			if (end > i_size_read(inode))
 				i_size_write(inode, end);
-				scoutfs_inode_inc_data_version(inode);
-			}
 		}
 		if (ret >= 0)
 			scoutfs_update_inode_item(inode, lock, &ind_locks);
@@ -253,7 +253,7 @@ static u64 dirent_name_hash(const char *name, unsigned int name_len)
              ((u64)dirent_name_fingerprint(name, name_len) << 32);
 }

-static bool dirent_names_equal(const char *a_name, unsigned int a_len,
+static u64 dirent_names_equal(const char *a_name, unsigned int a_len,
 			      const char *b_name, unsigned int b_len)
 {
 	return a_len == b_len && memcmp(a_name, b_name, a_len) == 0;
@@ -462,7 +462,7 @@ out:
 	else if (ino == 0)
 		inode = NULL;
 	else
-		inode = scoutfs_iget(sb, ino, 0);
+		inode = scoutfs_iget(sb, ino);

 	/*
 	 * We can't splice dir aliases into the dcache.  dir entries
@@ -753,6 +753,7 @@ static int scoutfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
 	struct inode *inode = NULL;
 	struct scoutfs_lock *dir_lock = NULL;
 	struct scoutfs_lock *inode_lock = NULL;
+	struct scoutfs_inode_info *si;
 	LIST_HEAD(ind_locks);
 	u64 hash;
 	u64 pos;
@@ -766,6 +767,7 @@ static int scoutfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
 				 &dir_lock, &inode_lock, NULL, &ind_locks);
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
+	si = SCOUTFS_I(inode);

 	pos = SCOUTFS_I(dir)->next_readdir_pos++;

@@ -781,6 +783,7 @@ static int scoutfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
 	i_size_write(dir, i_size_read(dir) + dentry->d_name.len);
 	dir->i_mtime = dir->i_ctime = CURRENT_TIME;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = dir->i_mtime;
+	si->crtime = inode->i_mtime;

 	if (S_ISDIR(mode)) {
 		inc_nlink(inode);
@@ -1185,6 +1188,7 @@ static int scoutfs_symlink(struct inode *dir, struct dentry *dentry,
 	struct inode *inode = NULL;
 	struct scoutfs_lock *dir_lock = NULL;
 	struct scoutfs_lock *inode_lock = NULL;
+	struct scoutfs_inode_info *si;
 	LIST_HEAD(ind_locks);
 	u64 hash;
 	u64 pos;
@@ -1205,6 +1209,7 @@ static int scoutfs_symlink(struct inode *dir, struct dentry *dentry,
 				 &dir_lock, &inode_lock, NULL, &ind_locks);
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
+	si = SCOUTFS_I(inode);

 	ret = symlink_item_ops(sb, SYM_CREATE, scoutfs_ino(inode), inode_lock,
 			       symname, name_len);
@@ -1226,6 +1231,7 @@ static int scoutfs_symlink(struct inode *dir, struct dentry *dentry,
 	dir->i_mtime = dir->i_ctime = CURRENT_TIME;

 	inode->i_ctime = dir->i_mtime;
+	si->crtime = inode->i_ctime;
 	i_size_write(inode, name_len);

 	scoutfs_update_inode_item(inode, inode_lock, &ind_locks);
@@ -1817,6 +1823,7 @@ static int scoutfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mod
 	struct scoutfs_lock *dir_lock = NULL;
 	struct scoutfs_lock *inode_lock = NULL;
 	struct scoutfs_lock *orph_lock = NULL;
+	struct scoutfs_inode_info *si;
 	LIST_HEAD(ind_locks);
 	int ret;

@@ -1827,6 +1834,7 @@ static int scoutfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mod
 				 &dir_lock, &inode_lock, &orph_lock, &ind_locks);
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
+	si = SCOUTFS_I(inode);

 	ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock);
 	if (ret < 0) {
@@ -1835,6 +1843,7 @@ static int scoutfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mod
 	}

 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+	si->crtime = inode->i_mtime;
 	insert_inode_hash(inode);
 	ihold(inode); /* need to update inode modifications in d_tmpfile */
 	d_tmpfile(dentry, inode);
@@ -81,7 +81,7 @@ static struct dentry *scoutfs_fh_to_dentry(struct super_block *sb,
 	trace_scoutfs_fh_to_dentry(sb, fh_type, sfid);

 	if (scoutfs_valid_fileid(fh_type))
-		inode = scoutfs_iget(sb, le64_to_cpu(sfid->ino), 0);
+		inode = scoutfs_iget(sb, le64_to_cpu(sfid->ino));

 	return d_obtain_alias(inode);
 }
@@ -100,7 +100,7 @@ static struct dentry *scoutfs_fh_to_parent(struct super_block *sb,

 	if (scoutfs_valid_fileid(fh_type) &&
 	    fh_type == FILEID_SCOUTFS_WITH_PARENT)
-		inode = scoutfs_iget(sb, le64_to_cpu(sfid->parent_ino), 0);
+		inode = scoutfs_iget(sb, le64_to_cpu(sfid->parent_ino));

 	return d_obtain_alias(inode);
 }
@@ -123,7 +123,7 @@ static struct dentry *scoutfs_get_parent(struct dentry *child)
 	scoutfs_dir_free_backref_path(sb, &list);
 	trace_scoutfs_get_parent(sb, inode, ino);

-	inode = scoutfs_iget(sb, ino, 0);
+	inode = scoutfs_iget(sb, ino);

 	return d_obtain_alias(inode);
 }
@@ -376,7 +376,7 @@ int scoutfs_fence_wait_fenced(struct super_block *sb, long timeout_jiffies)
 	bool error;
 	long ret;

-	ret = wait_event_timeout(fi->waitq, all_fenced(fi, &error), timeout_jiffies);
+	ret = wait_event_interruptible_timeout(fi->waitq, all_fenced(fi, &error), timeout_jiffies);
 	if (ret == 0)
 		ret = -ETIMEDOUT;
 	else if (ret > 0)
@@ -747,6 +747,9 @@ int scoutfs_forest_setup(struct super_block *sb)
 		goto out;
 	}

+	queue_delayed_work(finf->workq, &finf->log_merge_dwork,
+			   msecs_to_jiffies(LOG_MERGE_DELAY_MS));
+
 	ret = 0;
 out:
 	if (ret)
@@ -755,14 +758,6 @@ out:
 	return 0;
 }

-void scoutfs_forest_start(struct super_block *sb)
-{
-	DECLARE_FOREST_INFO(sb, finf);
-
-	queue_delayed_work(finf->workq, &finf->log_merge_dwork,
-			   msecs_to_jiffies(LOG_MERGE_DELAY_MS));
-}
-
 void scoutfs_forest_stop(struct super_block *sb)
 {
 	DECLARE_FOREST_INFO(sb, finf);
@@ -39,7 +39,6 @@ void scoutfs_forest_get_btrees(struct super_block *sb,
 			       struct scoutfs_log_trees *lt);

 int scoutfs_forest_setup(struct super_block *sb);
-void scoutfs_forest_start(struct super_block *sb);
 void scoutfs_forest_stop(struct super_block *sb);
 void scoutfs_forest_destroy(struct super_block *sb);

@@ -779,7 +779,11 @@ struct scoutfs_super_block {
 	__le64 seq;
 	__le64 next_ino;
 	__le64 total_meta_blocks;	/* both static and dynamic */
+	__le64 first_meta_blkno;	/* first dynamically allocated */
+	__le64 last_meta_blkno;
 	__le64 total_data_blocks;
+	__le64 first_data_blkno;
+	__le64 last_data_blkno;
 	struct scoutfs_quorum_config qconf;
 	struct scoutfs_alloc_root meta_alloc[2];
 	struct scoutfs_alloc_root data_alloc;
@@ -817,7 +821,6 @@ struct scoutfs_super_block {
 * online by staging.
 *
 * XXX
- *	- otime?
 *	- compat flags?
 *	- version?
 *	- generation?
@@ -841,6 +844,7 @@ struct scoutfs_inode {
 	struct scoutfs_timespec atime;
 	struct scoutfs_timespec ctime;
 	struct scoutfs_timespec mtime;
+	struct scoutfs_timespec crtime;
 };

 #define SCOUTFS_INO_FLAG_TRUNCATE 0x1
@@ -986,7 +990,6 @@ enum scoutfs_net_cmd {
 	SCOUTFS_NET_CMD_GET_VOLOPT,
 	SCOUTFS_NET_CMD_SET_VOLOPT,
 	SCOUTFS_NET_CMD_CLEAR_VOLOPT,
-	SCOUTFS_NET_CMD_RESIZE_DEVICES,
 	SCOUTFS_NET_CMD_FAREWELL,
 	SCOUTFS_NET_CMD_UNKNOWN,
 };
@@ -1029,11 +1032,6 @@ struct scoutfs_net_roots {
 	struct scoutfs_btree_root srch_root;
 };

-struct scoutfs_net_resize_devices {
-	__le64 new_total_meta_blocks;
-	__le64 new_total_data_blocks;
-};
-
 struct scoutfs_net_lock {
 	struct scoutfs_key key;
 	__le64 write_seq;
@@ -59,7 +59,7 @@ struct inode_sb_info {
 	bool stopped;

 	spinlock_t writeback_lock;
-	struct list_head writeback_list;
+	struct rb_root writeback_inodes;
 	struct inode_allocator dir_ino_alloc;
 	struct inode_allocator ino_alloc;

@@ -68,9 +68,6 @@ struct inode_sb_info {
 	/* serialize multiple inode ->evict trying to delete same ino's items */
 	spinlock_t deleting_items_lock;
 	struct list_head deleting_items_list;
-
-	struct work_struct iput_work;
-	struct llist_head iput_llist;
 };

 #define DECLARE_INODE_SB_INFO(sb, name) \
@@ -95,9 +92,9 @@ static void scoutfs_inode_ctor(void *obj)
 	atomic64_set(&si->data_waitq.changed, 0);
 	init_waitqueue_head(&si->data_waitq.waitq);
 	init_rwsem(&si->xattr_rwsem);
-	INIT_LIST_HEAD(&si->writeback_entry);
+	RB_CLEAR_NODE(&si->writeback_node);
 	scoutfs_lock_init_coverage(&si->ino_lock_cov);
-	atomic_set(&si->iput_count, 0);
+	atomic_set(&si->inv_iput_count, 0);

 	inode_init_once(&si->inode);
 }
@@ -121,14 +118,47 @@ static void scoutfs_i_callback(struct rcu_head *head)
 	kmem_cache_free(scoutfs_inode_cachep, SCOUTFS_I(inode));
 }

+/*
+ * Link an inode into the super's writeback rbtree, which is sorted by
+ * ino.  The inode must not already be present: finding a node with the
+ * same ino is treated as a fatal bug.
+ *
+ * NOTE(review): the caller visible in this change holds
+ * inf->writeback_lock and checks RB_EMPTY_NODE() before calling;
+ * presumably every caller must do the same -- confirm.
+ */
+static void insert_writeback_inode(struct inode_sb_info *inf,
+				   struct scoutfs_inode_info *ins)
+{
+	struct rb_root *root = &inf->writeback_inodes;
+	struct rb_node **node = &root->rb_node;
+	struct rb_node *parent = NULL;
+	struct scoutfs_inode_info *si;
+
+	/* descend by ino until we reach the empty link where ins belongs */
+	while (*node) {
+		parent = *node;
+		si = container_of(*node, struct scoutfs_inode_info,
+				  writeback_node);
+
+		if (ins->ino < si->ino)
+			node = &(*node)->rb_left;
+		else if (ins->ino > si->ino)
+			node = &(*node)->rb_right;
+		else
+			BUG();	/* an inode with this ino is already in the tree */
+	}
+
+	rb_link_node(&ins->writeback_node, parent, node);
+	rb_insert_color(&ins->writeback_node, root);
+}
+
+/*
+ * Remove an inode from the super's writeback rbtree if it is currently
+ * linked.  The node is cleared after it is erased so that
+ * RB_EMPTY_NODE() is true again, which makes a repeated removal a no-op
+ * and lets the inode be inserted again later.
+ *
+ * NOTE(review): the callers visible in this change hold
+ * inf->writeback_lock; presumably that is required -- confirm.
+ */
+static void remove_writeback_inode(struct inode_sb_info *inf,
+			       struct scoutfs_inode_info *si)
+{
+	if (!RB_EMPTY_NODE(&si->writeback_node)) {
+		rb_erase(&si->writeback_node, &inf->writeback_inodes);
+		RB_CLEAR_NODE(&si->writeback_node);
+	}
+}
+
 void scoutfs_destroy_inode(struct inode *inode)
 {
 	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
 	DECLARE_INODE_SB_INFO(inode->i_sb, inf);

 	spin_lock(&inf->writeback_lock);
-	if (!list_empty(&si->writeback_entry))
-		list_del_init(&si->writeback_entry);
+	remove_writeback_inode(inf, SCOUTFS_I(inode));
 	spin_unlock(&inf->writeback_lock);

 	scoutfs_lock_del_coverage(inode->i_sb, &si->ino_lock_cov);
@@ -232,6 +262,8 @@ static void load_inode(struct inode *inode, struct scoutfs_inode *cinode)
 	si->next_readdir_pos = le64_to_cpu(cinode->next_readdir_pos);
 	si->next_xattr_id = le64_to_cpu(cinode->next_xattr_id);
 	si->flags = le32_to_cpu(cinode->flags);
+	si->crtime.tv_sec = le64_to_cpu(cinode->crtime.sec);
+	si->crtime.tv_nsec = le32_to_cpu(cinode->crtime.nsec);

 	/*
 	 * i_blocks is initialized from online and offline and is then
@@ -662,14 +694,14 @@ struct inode *scoutfs_ilookup(struct super_block *sb, u64 ino)
 	return ilookup5(sb, ino, scoutfs_iget_test, &ino);
 }

-struct inode *scoutfs_iget(struct super_block *sb, u64 ino, int lkf)
+struct inode *scoutfs_iget(struct super_block *sb, u64 ino)
 {
 	struct scoutfs_lock *lock = NULL;
 	struct scoutfs_inode_info *si;
 	struct inode *inode;
 	int ret;

-	ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_READ, lkf, ino, &lock);
+	ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_READ, 0, ino, &lock);
 	if (ret)
 		return ERR_PTR(ret);

@@ -734,6 +766,9 @@ static void store_inode(struct scoutfs_inode *cinode, struct inode *inode)
 	cinode->next_readdir_pos = cpu_to_le64(si->next_readdir_pos);
 	cinode->next_xattr_id = cpu_to_le64(si->next_xattr_id);
 	cinode->flags = cpu_to_le32(si->flags);
+	cinode->crtime.sec = cpu_to_le64(si->crtime.tv_sec);
+	cinode->crtime.nsec = cpu_to_le32(si->crtime.tv_nsec);
+	memset(cinode->crtime.__pad, 0, sizeof(cinode->crtime.__pad));
 }

 /*
@@ -1627,6 +1662,11 @@ void scoutfs_evict_inode(struct inode *inode)
 		scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
 		scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);
 	}
+	if (ret == -ERESTARTSYS) {
+		/* pending signal in task, inode can still be found as orphan */
+		scoutfs_inc_counter(sb, inode_evict_intr);
+		ret = 0;
+	}
 	if (ret < 0) {
 		scoutfs_err(sb, "error %d while checking to delete inode nr %llu, it might linger.",
 			    ret, ino);
@@ -1664,49 +1704,6 @@ int scoutfs_drop_inode(struct inode *inode)
 	       generic_drop_inode(inode);
 }

-static void iput_worker(struct work_struct *work)
-{
-	struct inode_sb_info *inf = container_of(work, struct inode_sb_info, iput_work);
-	struct scoutfs_inode_info *si;
-	struct scoutfs_inode_info *tmp;
-	struct llist_node *inodes;
-	bool more;
-
-	inodes = llist_del_all(&inf->iput_llist);
-
-	llist_for_each_entry_safe(si, tmp, inodes, iput_llnode) {
-		do {
-			more = atomic_dec_return(&si->iput_count) > 0;
-			iput(&si->inode);
-		} while (more);
-	}
-}
-
-/*
- * Final iput can get into evict and perform final inode deletion which
- * can delete a lot of items spanning multiple cluster locks and
- * transactions.  It should be understood as a heavy high level
- * operation, more like file writing and less like dropping a refcount.
- *
- * Unfortunately we also have incentives to use igrab/iput from internal
- * contexts that have no business doing that work, like lock
- * invalidation or dirty inode writeback during transaction commit.
- *
- * In those cases we can kick iput off to background work context.
- * Nothing stops multiple puts of an inode before the work runs so we
- * can track multiple puts in flight.
- */
-void scoutfs_inode_queue_iput(struct inode *inode)
-{
-	DECLARE_INODE_SB_INFO(inode->i_sb, inf);
-	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
-
-	if (atomic_inc_return(&si->iput_count) == 1)
-		llist_add(&si->iput_llnode, &inf->iput_llist);
-	smp_wmb(); /* count and list visible before work executes */
-	schedule_work(&inf->iput_work);
-}
-
 /*
 * All mounts are performing this work concurrently.  We introduce
 * significant jitter between them to try and keep them from all
@@ -1822,7 +1819,7 @@ static void inode_orphan_scan_worker(struct work_struct *work)
 		}

 		/* try to cached and evict unused inode to delete, can be racing */
-		inode = scoutfs_iget(sb, ino, 0);
+		inode = scoutfs_iget(sb, ino);
 		if (IS_ERR(inode)) {
 			ret = PTR_ERR(inode);
 			if (ret == -ENOENT)
@@ -1851,33 +1848,30 @@ out:
 * ourselves in knots trying to call through the high level vfs sync
 * methods.
 *
- * File data block allocations tend to advance through free space so we
- * add the inode to the end of the list to roughly encourage sequential
- * IO.
- *
 * This is called by writers who hold the inode and transaction.  The
- * inode is removed from the list by evict->destroy if it's unlinked
- * during the transaction or by committing the transaction.  Pruning the
- * icache won't try to evict the inode as long as it has dirty buffers.
+ * inode can be removed from the rbtree by destroy_inode, which our
+ * inode hold prevents, or by committing the transaction, which our
+ * transaction hold prevents.  So while we're here the inode can only
+ * go from not being in the rbtree to being in it.
 */
 void scoutfs_inode_queue_writeback(struct inode *inode)
 {
 	DECLARE_INODE_SB_INFO(inode->i_sb, inf);
 	struct scoutfs_inode_info *si = SCOUTFS_I(inode);

-	if (list_empty(&si->writeback_entry)) {
+	/*
+	 * The first check is made without the lock so an inode that is
+	 * already queued doesn't take it; the check is repeated under
+	 * the lock before inserting.
+	 */
+	if (RB_EMPTY_NODE(&si->writeback_node)) {
 		spin_lock(&inf->writeback_lock);
-		if (list_empty(&si->writeback_entry))
-			list_add_tail(&si->writeback_entry, &inf->writeback_list);
+		if (RB_EMPTY_NODE(&si->writeback_node))
+			insert_writeback_inode(inf, si);
 		spin_unlock(&inf->writeback_lock);
 	}
 }

 /*
- * Walk our dirty inodes and either start dirty page writeback or wait
- * for writeback to complete.
+ * Walk our dirty inodes in ino order and either start dirty page
+ * writeback or wait for writeback to complete.
 *
 * This is called by transaction committing so other writers are
 * excluded.  We're still very careful to iterate over the tree while it
 * and the inodes could be changing.
 *
@@ -1890,19 +1884,29 @@ int scoutfs_inode_walk_writeback(struct super_block *sb, bool write)
 {
 	DECLARE_INODE_SB_INFO(sb, inf);
 	struct scoutfs_inode_info *si;
-	struct scoutfs_inode_info *tmp;
+	struct rb_node *node;
 	struct inode *inode;
+	struct inode *defer_iput = NULL;
 	int ret;

 	spin_lock(&inf->writeback_lock);

-	list_for_each_entry_safe(si, tmp, &inf->writeback_list, writeback_entry) {
+	node = rb_first(&inf->writeback_inodes);
+	while (node) {
+		si = container_of(node, struct scoutfs_inode_info,
+				  writeback_node);
+		node = rb_next(node);
 		inode = igrab(&si->inode);
 		if (!inode)
 			continue;

 		spin_unlock(&inf->writeback_lock);

+		if (defer_iput) {
+			iput(defer_iput);
+			defer_iput = NULL;
+		}
+
 		if (write)
 			ret = filemap_fdatawrite(inode->i_mapping);
 		else
@@ -1910,28 +1914,28 @@ int scoutfs_inode_walk_writeback(struct super_block *sb, bool write)
 		trace_scoutfs_inode_walk_writeback(sb, scoutfs_ino(inode),
 						   write, ret);
 		if (ret) {
-			scoutfs_inode_queue_iput(inode);
+			iput(inode);
 			goto out;
 		}

 		spin_lock(&inf->writeback_lock);

-		/* restore tmp after reacquiring lock */
-		if (WARN_ON_ONCE(list_empty(&si->writeback_entry)))
-			tmp = list_first_entry(&inf->writeback_list, struct scoutfs_inode_info,
-					       writeback_entry);
+		if (WARN_ON_ONCE(RB_EMPTY_NODE(&si->writeback_node)))
+			node = rb_first(&inf->writeback_inodes);
 		else
-			tmp = list_next_entry(si, writeback_entry);
+			node = rb_next(&si->writeback_node);

 		if (!write)
-			list_del_init(&si->writeback_entry);
+			remove_writeback_inode(inf, si);

-		scoutfs_inode_queue_iput(inode);
+		/* avoid iput->destroy lock deadlock */
+		defer_iput = inode;
 	}

 	spin_unlock(&inf->writeback_lock);
 out:
-
+	if (defer_iput)
+		iput(defer_iput);
 	return ret;
 }

@@ -1946,14 +1950,12 @@ int scoutfs_inode_setup(struct super_block *sb)

 	inf->sb = sb;
 	spin_lock_init(&inf->writeback_lock);
-	INIT_LIST_HEAD(&inf->writeback_list);
+	inf->writeback_inodes = RB_ROOT;
 	spin_lock_init(&inf->dir_ino_alloc.lock);
 	spin_lock_init(&inf->ino_alloc.lock);
 	INIT_DELAYED_WORK(&inf->orphan_scan_dwork, inode_orphan_scan_worker);
 	spin_lock_init(&inf->deleting_items_lock);
 	INIT_LIST_HEAD(&inf->deleting_items_list);
-	INIT_WORK(&inf->iput_work, iput_worker);
-	init_llist_head(&inf->iput_llist);

 	sbi->inode_sb_info = inf;

@@ -1965,11 +1967,12 @@ int scoutfs_inode_setup(struct super_block *sb)
 * many other subsystems like networking and the server.  We only kick
 * it off once everything is ready.
 */
-void scoutfs_inode_start(struct super_block *sb)
+int scoutfs_inode_start(struct super_block *sb)
 {
 	DECLARE_INODE_SB_INFO(sb, inf);

 	schedule_orphan_dwork(inf);
+	/* no failure is detected above, so this always returns 0 */
+	return 0;
 }

 void scoutfs_inode_stop(struct super_block *sb)
@@ -20,6 +20,7 @@ struct scoutfs_inode_info {
 	u64 online_blocks;
 	u64 offline_blocks;
 	u32 flags;
+	struct timespec crtime;

 	/*
 	 * Protects per-inode extent items, most particularly readers
@@ -49,14 +50,14 @@ struct scoutfs_inode_info {
 	struct scoutfs_per_task pt_data_lock;
 	struct scoutfs_data_waitq data_waitq;
 	struct rw_semaphore xattr_rwsem;
-	struct list_head writeback_entry;
+	struct rb_node writeback_node;

 	struct scoutfs_lock_coverage ino_lock_cov;

 	/* drop if i_count hits 0, allows drop while invalidate holds coverage */
 	bool drop_invalidated;
-	struct llist_node iput_llnode;
-	atomic_t iput_count;
+	struct llist_node inv_iput_llnode;
+	atomic_t inv_iput_count;

 	struct inode inode;
 };
@@ -75,9 +76,8 @@ struct inode *scoutfs_alloc_inode(struct super_block *sb);
 void scoutfs_destroy_inode(struct inode *inode);
 int scoutfs_drop_inode(struct inode *inode);
 void scoutfs_evict_inode(struct inode *inode);
-void scoutfs_inode_queue_iput(struct inode *inode);

-struct inode *scoutfs_iget(struct super_block *sb, u64 ino, int lkf);
+struct inode *scoutfs_iget(struct super_block *sb, u64 ino);
 struct inode *scoutfs_ilookup(struct super_block *sb, u64 ino);

 void scoutfs_inode_init_index_key(struct scoutfs_key *key, u8 type, u64 major,
@@ -132,7 +132,7 @@ void scoutfs_inode_exit(void);
 int scoutfs_inode_init(void);

 int scoutfs_inode_setup(struct super_block *sb);
-void scoutfs_inode_start(struct super_block *sb);
+int scoutfs_inode_start(struct super_block *sb);
 void scoutfs_inode_stop(struct super_block *sb);
 void scoutfs_inode_destroy(struct super_block *sb);

@@ -541,6 +541,7 @@ out:
 static long scoutfs_ioc_stat_more(struct file *file, unsigned long arg)
 {
 	struct inode *inode = file_inode(file);
+	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
 	struct scoutfs_ioctl_stat_more stm;

 	if (get_user(stm.valid_bytes, (__u64 __user *)arg))
@@ -552,6 +553,8 @@ static long scoutfs_ioc_stat_more(struct file *file, unsigned long arg)
 	stm.data_seq = scoutfs_inode_data_seq(inode);
 	stm.data_version = scoutfs_inode_data_version(inode);
 	scoutfs_inode_get_onoff(inode, &stm.online_blocks, &stm.offline_blocks);
+	stm.crtime_sec = si->crtime.tv_sec;
+	stm.crtime_nsec = si->crtime.tv_nsec;

 	if (copy_to_user((void __user *)arg, &stm, stm.valid_bytes))
 		return -EFAULT;
@@ -617,6 +620,7 @@ static long scoutfs_ioc_data_waiting(struct file *file, unsigned long arg)
 static long scoutfs_ioc_setattr_more(struct file *file, unsigned long arg)
 {
 	struct inode *inode = file->f_inode;
+	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
 	struct super_block *sb = inode->i_sb;
 	struct scoutfs_ioctl_setattr_more __user *usm = (void __user *)arg;
 	struct scoutfs_ioctl_setattr_more sm;
@@ -685,6 +689,8 @@ static long scoutfs_ioc_setattr_more(struct file *file, unsigned long arg)
 		i_size_write(inode, sm.i_size);
 	inode->i_ctime.tv_sec = sm.ctime_sec;
 	inode->i_ctime.tv_nsec = sm.ctime_nsec;
+	si->crtime.tv_sec = sm.crtime_sec;
+	si->crtime.tv_nsec = sm.crtime_nsec;

 	scoutfs_update_inode_item(inode, lock, &ind_locks);
 	ret = 0;
@@ -867,21 +873,13 @@ static long scoutfs_ioc_statfs_more(struct file *file, unsigned long arg)
 {
 	struct super_block *sb = file_inode(file)->i_sb;
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
-	struct scoutfs_super_block *super;
+	struct scoutfs_super_block *super = &sbi->super;
 	struct scoutfs_ioctl_statfs_more sfm;
 	int ret;

 	if (get_user(sfm.valid_bytes, (__u64 __user *)arg))
 		return -EFAULT;

-	super = kzalloc(sizeof(struct scoutfs_super_block), GFP_NOFS);
-	if (!super)
-		return -ENOMEM;
-
-	ret = scoutfs_read_super(sb, super);
-	if (ret)
-		goto out;
-
 	sfm.valid_bytes = min_t(u64, sfm.valid_bytes,
 				sizeof(struct scoutfs_ioctl_statfs_more));
 	sfm.fsid = le64_to_cpu(super->hdr.fsid);
@@ -892,15 +890,12 @@ static long scoutfs_ioc_statfs_more(struct file *file, unsigned long arg)

 	ret = scoutfs_client_get_last_seq(sb, &sfm.committed_seq);
 	if (ret)
-		goto out;
+		return ret;

 	if (copy_to_user((void __user *)arg, &sfm, sfm.valid_bytes))
-		ret = -EFAULT;
-	else
-		ret = 0;
-out:
-	kfree(super);
-	return ret;
+		return -EFAULT;
+
+	return 0;
 }

 struct copy_alloc_detail_args {
@@ -1004,37 +999,6 @@ out:
 	return ret;
 }

-static long scoutfs_ioc_resize_devices(struct file *file, unsigned long arg)
-{
-	struct super_block *sb = file_inode(file)->i_sb;
-	struct scoutfs_ioctl_resize_devices __user *urd = (void __user *)arg;
-	struct scoutfs_ioctl_resize_devices rd;
-	struct scoutfs_net_resize_devices nrd;
-	int ret;
-
-	if (!(file->f_mode & FMODE_READ)) {
-		ret = -EBADF;
-		goto out;
-	}
-
-	if (!capable(CAP_SYS_ADMIN)) {
-		ret = -EPERM;
-		goto out;
-	}
-
-	if (copy_from_user(&rd, urd, sizeof(rd))) {
-		ret = -EFAULT;
-		goto out;
-	}
-
-	nrd.new_total_meta_blocks = cpu_to_le64(rd.new_total_meta_blocks);
-	nrd.new_total_data_blocks = cpu_to_le64(rd.new_total_data_blocks);
-
-	ret = scoutfs_client_resize_devices(sb, &nrd);
-out:
-	return ret;
-}
-
 long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	switch (cmd) {
@@ -1064,8 +1028,6 @@ long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 		return scoutfs_ioc_alloc_detail(file, arg);
 	case SCOUTFS_IOC_MOVE_BLOCKS:
 		return scoutfs_ioc_move_blocks(file, arg);
-	case SCOUTFS_IOC_RESIZE_DEVICES:
-		return scoutfs_ioc_resize_devices(file, arg);
 	}

 	return -ENOTTY;
@@ -232,6 +232,9 @@ struct scoutfs_ioctl_stat_more {
 	__u64 data_version;
 	__u64 online_blocks;
 	__u64 offline_blocks;
+	__u64 crtime_sec;
+	__u32 crtime_nsec;
+	__u8  _pad[4];
 };

 #define SCOUTFS_IOC_STAT_MORE _IOR(SCOUTFS_IOCTL_MAGIC, 5, \
@@ -275,7 +278,8 @@ struct scoutfs_ioctl_setattr_more {
 	__u64 flags;
 	__u64 ctime_sec;
 	__u32 ctime_nsec;
-	__u8 _pad[4];
+	__u32 crtime_nsec;
+	__u64 crtime_sec;
 };

 #define SCOUTFS_IOC_SETATTR_MORE_OFFLINE		(1 << 0)
@@ -477,12 +481,4 @@ struct scoutfs_ioctl_move_blocks {
 #define SCOUTFS_IOC_MOVE_BLOCKS _IOR(SCOUTFS_IOCTL_MAGIC, 13, \
 				     struct scoutfs_ioctl_move_blocks)

-struct scoutfs_ioctl_resize_devices {
-	__u64 new_total_meta_blocks;
-	__u64 new_total_data_blocks;
-};
-
-#define SCOUTFS_IOC_RESIZE_DEVICES \
-	_IOR(SCOUTFS_IOCTL_MAGIC, 14, struct scoutfs_ioctl_resize_devices)
-
 #endif
@@ -89,6 +89,8 @@ struct lock_info {
 	struct work_struct shrink_work;
 	struct list_head shrink_list;
 	atomic64_t next_refresh_gen;
+	struct work_struct inv_iput_work;
+	struct llist_head inv_iput_llist;

 	struct dentry *tseq_dentry;
 	struct scoutfs_tseq_tree tseq_tree;
@@ -124,6 +126,34 @@ static bool lock_modes_match(int granted, int requested)
 		requested == SCOUTFS_LOCK_READ);
 }

+/*
+ * Final iput can get into evict and perform final inode deletion which
+ * can delete a lot of items under locks and transactions.  We really
+ * don't want to be doing all that in an iput during invalidation.  When
+ * invalidation sees that iput might perform final deletion it puts the
+ * inode on a list and queues this work.
+ *
+ * Nothing stops multiple puts for multiple invalidations of an inode
+ * before the work runs so we can track multiple puts in flight.
+ */
+static void lock_inv_iput_worker(struct work_struct *work)
+{
+	struct lock_info *linfo = container_of(work, struct lock_info, inv_iput_work);
+	struct scoutfs_inode_info *si;
+	struct scoutfs_inode_info *tmp;
+	struct llist_node *inodes;
+	bool more;
+
+	inodes = llist_del_all(&linfo->inv_iput_llist);
+
+	llist_for_each_entry_safe(si, tmp, inodes, inv_iput_llnode) {
+		do {
+			more = atomic_dec_return(&si->inv_iput_count) > 0;
+			iput(&si->inode);
+		} while (more);
+	}
+}
+
 /*
 * Invalidate cached data associated with an inode whose lock is going
 * away.
@@ -164,8 +194,11 @@ static void invalidate_inode(struct super_block *sb, u64 ino)
 		if (scoutfs_lock_is_covered(sb, &si->ino_lock_cov) && inode->i_nlink > 0) {
 			iput(inode);
 		} else {
-			/* defer iput to work context so we don't evict inodes from invalidation */
-			scoutfs_inode_queue_iput(inode);
+			/* defer iput to work context so we don't evict inodes from invalidation */ 
+			if (atomic_inc_return(&si->inv_iput_count) == 1)
+				llist_add(&si->inv_iput_llnode, &linfo->inv_iput_llist);
+			smp_wmb(); /* count and list visible before work executes */
+			queue_work(linfo->workq, &linfo->inv_iput_work);
 		}
 	}
 }
@@ -1095,14 +1128,8 @@ static int lock_key_range(struct super_block *sb, enum scoutfs_lock_mode mode, i

 		trace_scoutfs_lock_wait(sb, lock);

-		if (flags & SCOUTFS_LKF_INTERRUPTIBLE) {
-			ret = wait_event_interruptible(lock->waitq,
-						       lock_wait_cond(sb, lock, mode));
-		} else {
-			wait_event(lock->waitq, lock_wait_cond(sb, lock, mode));
-			ret = 0;
-		}
-
+		ret = wait_event_interruptible(lock->waitq,
+					       lock_wait_cond(sb, lock, mode));
 		spin_lock(&linfo->lock);
 		if (ret)
 			break;
@@ -1762,6 +1789,8 @@ int scoutfs_lock_setup(struct super_block *sb)
 	INIT_WORK(&linfo->shrink_work, lock_shrink_worker);
 	INIT_LIST_HEAD(&linfo->shrink_list);
 	atomic64_set(&linfo->next_refresh_gen, 0);
+	INIT_WORK(&linfo->inv_iput_work, lock_inv_iput_worker);
+	init_llist_head(&linfo->inv_iput_llist);
 	scoutfs_tseq_tree_init(&linfo->tseq_tree, lock_tseq_show);

 	sbi->lock_info = linfo;
@@ -6,8 +6,7 @@

 #define SCOUTFS_LKF_REFRESH_INODE	0x01 /* update stale inode from item */
 #define SCOUTFS_LKF_NONBLOCK		0x02 /* only use already held locks */
-#define SCOUTFS_LKF_INTERRUPTIBLE	0x04 /* pending signals return -ERESTARTSYS */
-#define SCOUTFS_LKF_INVALID		(~((SCOUTFS_LKF_INTERRUPTIBLE << 1) - 1))
+#define SCOUTFS_LKF_INVALID		(~((SCOUTFS_LKF_NONBLOCK << 1) - 1))

 #define SCOUTFS_LOCK_NR_MODES		SCOUTFS_LOCK_INVALID

@@ -1486,7 +1486,8 @@ int scoutfs_net_connect(struct super_block *sb,
 			struct scoutfs_net_connection *conn,
 			struct sockaddr_in *sin, unsigned long timeout_ms)
 {
-	int ret = 0;
+	int error = 0;
+	int ret;

 	spin_lock(&conn->lock);
 	conn->connect_sin = *sin;
@@ -1494,8 +1495,10 @@ int scoutfs_net_connect(struct super_block *sb,
 	spin_unlock(&conn->lock);

 	queue_work(conn->workq, &conn->connect_work);
-	wait_event(conn->waitq, connect_result(conn, &ret));
-	return ret;
+
+	ret = wait_event_interruptible(conn->waitq,
+				       connect_result(conn, &error));
+	return ret ?: error;
 }

 static void set_valid_greeting(struct scoutfs_net_connection *conn)
@@ -1631,10 +1634,10 @@ restart:
 		conn->next_send_id = reconn->next_send_id;
 		atomic64_set(&conn->recv_seq, atomic64_read(&reconn->recv_seq));

-		/* reconn should be idle while in reconn_wait  */
+		/* greeting response/ack will be on conn send queue */
 		BUG_ON(!list_empty(&reconn->send_queue));
-		/* queued greeting response is racing, can be in send or resend queue */
-		list_splice_tail_init(&reconn->resend_queue, &conn->resend_queue);
+		BUG_ON(!list_empty(&conn->resend_queue));
+		list_splice_init(&reconn->resend_queue, &conn->resend_queue);

 		/* new conn info is unused, swap, old won't call down */
 		swap(conn->info, reconn->info);
@@ -1798,10 +1801,13 @@ int scoutfs_net_sync_request(struct super_block *sb,
 	ret = scoutfs_net_submit_request(sb, conn, cmd, arg, arg_len,
 					 sync_response, &sreq, &id);
 
+	/* only wait if submission succeeded, otherwise nothing completes sreq */
 	if (ret == 0) {
-		wait_for_completion(&sreq.comp);
-		ret = sreq.error;
+		ret = wait_for_completion_interruptible(&sreq.comp);
+		if (ret == -ERESTARTSYS)
+			scoutfs_net_cancel_request(sb, conn, cmd, id);
+		else
+			ret = sreq.error;
 	}
 
 	return ret;
 }
@@ -97,7 +97,7 @@ struct quorum_host_msg {

 struct last_msg {
 	struct quorum_host_msg msg;
-	ktime_t ts;
+	struct timespec64 ts;
 };

 enum quorum_role { FOLLOWER, CANDIDATE, LEADER };
@@ -209,7 +209,7 @@ static void send_msg_members(struct super_block *sb, int type, u64 term,
 	DECLARE_QUORUM_INFO(sb, qinf);
 	struct mount_options *opts = &SCOUTFS_SB(sb)->opts;
 	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
-	ktime_t now;
+	struct timespec64 ts;
 	int i;

 	struct scoutfs_quorum_message qmes = {
@@ -235,6 +235,7 @@ static void send_msg_members(struct super_block *sb, int type, u64 term,

 	qmes.crc = quorum_message_crc(&qmes);

+	ts = ktime_to_timespec64(ktime_get());

 	for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
 		if (!quorum_slot_present(super, i) ||
@@ -242,13 +243,12 @@ static void send_msg_members(struct super_block *sb, int type, u64 term,
 			continue;

 		scoutfs_quorum_slot_sin(super, i, &sin);
-		now = ktime_get();
 		kernel_sendmsg(qinf->sock, &mh, &kv, 1, kv.iov_len);

 		spin_lock(&qinf->show_lock);
 		qinf->last_send[i].msg.term = term;
 		qinf->last_send[i].msg.type = type;
-		qinf->last_send[i].ts = now;
+		qinf->last_send[i].ts = ts;
 		spin_unlock(&qinf->show_lock);

 		if (i == only)
@@ -308,8 +308,6 @@ static int recv_msg(struct super_block *sb, struct quorum_host_msg *msg,
 	if (ret < 0)
 		return ret;

-	now = ktime_get();
-
 	if (ret != sizeof(qmes) ||
 	    qmes.crc != quorum_message_crc(&qmes) ||
 	    qmes.fsid != super->hdr.fsid ||
@@ -329,7 +327,7 @@ static int recv_msg(struct super_block *sb, struct quorum_host_msg *msg,

 	spin_lock(&qinf->show_lock);
 	qinf->last_recv[msg->from].msg = *msg;
-	qinf->last_recv[msg->from].ts = now;
+	qinf->last_recv[msg->from].ts = ktime_to_timespec64(ktime_get());
 	spin_unlock(&qinf->show_lock);

 	return 0;
@@ -558,8 +556,10 @@ out:
 			ret = err;
 	}

-	if (ret < 0)
+	if (ret < 0) {
+		scoutfs_err(sb, "error %d attempting to find and fence previous leaders", ret);
 		scoutfs_inc_counter(sb, quorum_fence_error);
+	}

 	return ret;
 }
@@ -610,7 +610,7 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 	if (ret < 0)
 		goto out;

-	while (!(qinf->shutdown || scoutfs_forcing_unmount(sb))) {
+	while (!qinf->shutdown) {

 		ret = recv_msg(sb, &msg, qst.timeout);
 		if (ret < 0) {
@@ -733,15 +733,13 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 			ret = scoutfs_server_start(sb, qst.term);
 			if (ret < 0) {
 				clear_bit(QINF_FLAG_SERVER, &qinf->flags);
+				scoutfs_err(sb, "server startup failed with %d", ret);
 				/* store our increased term */
 				err = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_STOP, qst.term,
 							  true);
-				if (err < 0) {
+				if (err < 0 && ret == 0)
 					ret = err;
-					goto out;
-				}
-				ret = 0;
-				continue;
+				goto out;
 			}
 		}

@@ -791,7 +789,7 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 	update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_END, qst.term, true);
 out:
 	if (ret < 0) {
-		scoutfs_err(sb, "quorum service saw error %d, shutting down.  This mount is no longer participating in quorum.  It should be remounted to restore service.",
+		scoutfs_err(sb, "quorum service saw error %d, shutting down.  Cluster will be degraded until this slot is remounted to restart the quorum service",
 			    ret);
 	}
 }
@@ -917,7 +915,6 @@ static ssize_t status_show(struct kobject *kobj, struct kobj_attribute *attr,
 	struct quorum_status qst;
 	struct last_msg last;
 	struct timespec64 ts;
-	const ktime_t now = ktime_get();
 	size_t size;
 	int ret;
 	int i;
@@ -939,9 +936,9 @@ static ssize_t status_show(struct kobject *kobj, struct kobj_attribute *attr,
 		     qst.vote_for);
 	snprintf_ret(buf, size, &ret, "vote_bits 0x%lx (count %lu)\n",
 		     qst.vote_bits, hweight_long(qst.vote_bits));
-	ts = ktime_to_timespec64(ktime_sub(qst.timeout, now));
-	snprintf_ret(buf, size, &ret, "timeout_in_secs %lld.%09u\n",
-		     (s64)ts.tv_sec, (int)ts.tv_nsec);
+	ts = ktime_to_timespec64(qst.timeout);
+	snprintf_ret(buf, size, &ret, "timeout %llu.%09u\n",
+		     (u64)ts.tv_sec, (int)ts.tv_nsec);

 	for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
 		spin_lock(&qinf->show_lock);
@@ -951,11 +948,10 @@ static ssize_t status_show(struct kobject *kobj, struct kobj_attribute *attr,
 		if (last.msg.term == 0)
 			continue;

-		ts = ktime_to_timespec64(ktime_sub(now, last.ts));
 		snprintf_ret(buf, size, &ret,
-			     "last_send to %u term %llu type %u secs_since %lld.%09u\n",
+			     "last_send to %u term %llu type %u ts %llu.%09u\n",
 			     i, last.msg.term, last.msg.type,
-			     (s64)ts.tv_sec, (int)ts.tv_nsec);
+			     (u64)last.ts.tv_sec, (int)last.ts.tv_nsec);
 	}

 	for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
@@ -965,12 +961,10 @@ static ssize_t status_show(struct kobject *kobj, struct kobj_attribute *attr,

 		if (last.msg.term == 0)
 			continue;
-
-		ts = ktime_to_timespec64(ktime_sub(now, last.ts));
 		snprintf_ret(buf, size, &ret,
-			     "last_recv from %u term %llu type %u secs_since %lld.%09u\n",
+			     "last_recv from %u term %llu type %u ts %llu.%09u\n",
 			     i, last.msg.term, last.msg.type,
-			     (s64)ts.tv_sec, (int)ts.tv_nsec);
+			     (u64)last.ts.tv_sec, (int)last.ts.tv_nsec);
 	}

 	return ret;
@@ -323,6 +323,7 @@ static void scoutfs_server_commit_func(struct work_struct *work)
 	struct commit_waiter *cw;
 	struct commit_waiter *pos;
 	struct llist_node *node;
+	u64 reserved;
 	int ret;

 	trace_scoutfs_server_commit_work_enter(sb, 0, 0);
@@ -388,19 +389,16 @@ static void scoutfs_server_commit_func(struct work_struct *work)
 	server->other_freed = &super->server_meta_freed[server->other_ind];

 	/*
-	 * get_log_trees sets ALLOC_LOW when its allocator drops below
-	 * the reserved blocks after having filled the log trees's avail
-	 * allocator during its transaction.  To avoid prematurely
-	 * setting the low flag and causing enospc we make sure that the
-	 * next transaction's meta_avail has 2x the reserved blocks so
-	 * that it can consume a full reserved amount and still have
-	 * enough to avoid enospc.  We swap to freed if avail is under
-	 * the buffer and freed is larger.
+	 * The reserved metadata blocks includes the max size of
+	 * outstanding allocators and a server transaction could be
+	 * asked to refill all those allocators from meta_avail.  If our
+	 * meta_avail falls below the reserved count, and freed is still
+	 * above it, then swap so that we don't start returning enospc
+	 * until we're truly low.
 	 */
-	if ((le64_to_cpu(server->meta_avail->total_len) <
-	     (scoutfs_server_reserved_meta_blocks(sb) * 2)) &&
-	    (le64_to_cpu(server->meta_freed->total_len) >
-	     le64_to_cpu(server->meta_avail->total_len)))
+	reserved = scoutfs_server_reserved_meta_blocks(sb);
+	if (le64_to_cpu(server->meta_avail->total_len) <= reserved &&
+	    le64_to_cpu(server->meta_freed->total_len) > reserved)
 		swap(server->meta_avail, server->meta_freed);

 	ret = 0;
@@ -2357,25 +2355,15 @@ static int open_ino_map_response(struct super_block *sb, struct scoutfs_net_conn
 	return scoutfs_omap_server_handle_response(sb, rid, resp);
 }

-/*
- * The server is sending an omap requests to all the clients it thought
- * were connected when it received a request from another client.
- * This send can race with the client's connection being removed.  We
- * can drop those sends on the floor and mask ENOTCONN.  The client's rid
- * will soon be removed from the request which will be correctly handled.
- */
+/* The server is sending an omap request to the client */
 int scoutfs_server_send_omap_request(struct super_block *sb, u64 rid,
 				     struct scoutfs_open_ino_map_args *args)
 {
 	struct server_info *server = SCOUTFS_SB(sb)->server_info;
-	int ret;

-	ret = scoutfs_net_submit_request_node(sb, server->conn, rid, SCOUTFS_NET_CMD_OPEN_INO_MAP,
+	return scoutfs_net_submit_request_node(sb, server->conn, rid, SCOUTFS_NET_CMD_OPEN_INO_MAP,
 					      args, sizeof(*args),
 					      open_ino_map_response, NULL, NULL);
-	if (ret == -ENOTCONN)
-		ret = 0;
-	return ret;
 }

 /*
@@ -2565,103 +2553,6 @@ out:
 	return scoutfs_net_response(sb, conn, cmd, id, ret, NULL, 0);
 }

-static u64 device_blocks(struct block_device *bdev, int shift)
-{
-	return i_size_read(bdev->bd_inode) >> shift;
-}
-
-static int server_resize_devices(struct super_block *sb, struct scoutfs_net_connection *conn,
-				 u8 cmd, u64 id, void *arg, u16 arg_len)
-{
-	DECLARE_SERVER_INFO(sb, server);
-	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
-	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
-	struct scoutfs_net_resize_devices *nrd;
-	u64 meta_tot;
-	u64 meta_start;
-	u64 meta_len;
-	u64 data_tot;
-	u64 data_start;
-	u64 data_len;
-	int ret;
-	int err;
-
-	if (arg_len != sizeof(struct scoutfs_net_resize_devices)) {
-		ret = -EINVAL;
-		goto out;
-	}
-	nrd = arg;
-
-	meta_tot = le64_to_cpu(nrd->new_total_meta_blocks);
-	data_tot = le64_to_cpu(nrd->new_total_data_blocks);
-
-	scoutfs_server_hold_commit(sb);
-	mutex_lock(&server->alloc_mutex);
-
-	if (meta_tot == le64_to_cpu(super->total_meta_blocks))
-		meta_tot = 0;
-	if (data_tot == le64_to_cpu(super->total_data_blocks))
-		data_tot = 0;
-
-	if (!meta_tot && !data_tot) {
-		ret = 0;
-		goto unlock;
-	}
-
-	/* we don't support shrinking */
-	if ((meta_tot && (meta_tot < le64_to_cpu(super->total_meta_blocks))) ||
-	    (data_tot && (data_tot < le64_to_cpu(super->total_data_blocks)))) {
-		ret = -EINVAL;
-		goto unlock;
-	}
-
-	/* must be within devices */
-	if ((meta_tot > device_blocks(sbi->meta_bdev, SCOUTFS_BLOCK_LG_SHIFT)) ||
-	    (data_tot > device_blocks(sb->s_bdev, SCOUTFS_BLOCK_SM_SHIFT))) {
-		ret = -EINVAL;
-		goto unlock;
-	}
-
-	/* extents are only used if _tot is set */
-	meta_start = le64_to_cpu(super->total_meta_blocks);
-	meta_len = meta_tot - meta_start;
-	data_start = le64_to_cpu(super->total_data_blocks);
-	data_len = data_tot - data_start;
-
-	if (meta_tot) {
-		ret = scoutfs_alloc_insert(sb, &server->alloc, &server->wri,
-					   server->meta_avail, meta_start, meta_len);
-		if (ret < 0)
-			goto unlock;
-	}
-
-	if (data_tot) {
-		ret = scoutfs_alloc_insert(sb, &server->alloc, &server->wri,
-					   &super->data_alloc, data_start, data_len);
-		if (ret < 0) {
-			if (meta_tot) {
-				err = scoutfs_alloc_remove(sb, &server->alloc, &server->wri,
-							   server->meta_avail, meta_start,
-							   meta_len);
-				WARN_ON_ONCE(err); /* btree blocks are dirty.. really unlikely? */
-			}
-			goto unlock;
-		}
-	}
-
-	if (meta_tot)
-		super->total_meta_blocks = cpu_to_le64(meta_tot);
-	if (data_tot)
-		super->total_data_blocks = cpu_to_le64(data_tot);
-
-	ret = 0;
-unlock:
-	mutex_unlock(&server->alloc_mutex);
-	ret = scoutfs_server_apply_commit(sb, ret);
-out:
-	return scoutfs_net_response(sb, conn, cmd, id, ret, NULL, 0);
-};
-
 static void init_mounted_client_key(struct scoutfs_key *key, u64 rid)
 {
 	*key = (struct scoutfs_key) {
@@ -3298,7 +3189,6 @@ static scoutfs_net_request_t server_req_funcs[] = {
 	[SCOUTFS_NET_CMD_GET_VOLOPT]		= server_get_volopt,
 	[SCOUTFS_NET_CMD_SET_VOLOPT]		= server_set_volopt,
 	[SCOUTFS_NET_CMD_CLEAR_VOLOPT]		= server_clear_volopt,
-	[SCOUTFS_NET_CMD_RESIZE_DEVICES]	= server_resize_devices,
 	[SCOUTFS_NET_CMD_FAREWELL]		= server_farewell,
 };

@@ -3573,15 +3463,13 @@ static void scoutfs_server_worker(struct work_struct *work)

 	trace_scoutfs_server_work_enter(sb, 0, 0);

-	scoutfs_quorum_slot_sin(super, opts->quorum_slot_nr, &sin);
-	scoutfs_info(sb, "server starting at "SIN_FMT, SIN_ARG(&sin));
-
 	/* first make sure no other servers are still running */
 	ret = scoutfs_quorum_fence_leaders(sb, server->term);
-	if (ret < 0) {
-		scoutfs_err(sb, "server error %d attempting to fence previous leaders", ret);
+	if (ret < 0)
 		goto out;
-	}
+
+	scoutfs_quorum_slot_sin(super, opts->quorum_slot_nr, &sin);
+	scoutfs_info(sb, "server setting up at "SIN_FMT, SIN_ARG(&sin));

 	conn = scoutfs_net_alloc_conn(sb, server_notify_up, server_notify_down,
 				      sizeof(struct server_client_info),
@@ -3602,10 +3490,8 @@ static void scoutfs_server_worker(struct work_struct *work)

 	/* start up the server subsystems before accepting */
 	ret = scoutfs_read_super(sb, super);
-	if (ret < 0) {
-		scoutfs_err(sb, "server error %d reading super block", ret);
+	if (ret < 0)
 		goto shutdown;
-	}

 	/* update volume options early, possibly for use during startup */
 	write_seqcount_begin(&server->volopt_seqcount);
@@ -3643,17 +3529,10 @@ static void scoutfs_server_worker(struct work_struct *work)
 	}
 	scoutfs_server_set_seq_if_greater(sb, max_seq);

-	ret = scoutfs_lock_server_setup(sb, &server->alloc, &server->wri);
-	if (ret) {
-		scoutfs_err(sb, "server error %d starting lock server", ret);
+	ret = scoutfs_lock_server_setup(sb, &server->alloc, &server->wri) ?:
+	      start_recovery(sb);
+	if (ret)
 		goto shutdown;
-	}
-
-	ret = start_recovery(sb);
-	if (ret) {
-		scoutfs_err(sb, "server error %d starting client recovery", ret);
-		goto shutdown;
-	}

 	/* start accepting connections and processing work */
 	server->conn = conn;
@@ -3664,7 +3543,7 @@ static void scoutfs_server_worker(struct work_struct *work)

 	queue_reclaim_work(server, 0);

-	/* interruptible mostly to avoid stuck messages */
+	/* wait_event/wake_up provide barriers */
 	wait_event_interruptible(server->waitq, test_shutting_down(server));

 shutdown:
@@ -230,15 +230,7 @@ static void scoutfs_metadev_close(struct super_block *sb)
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);

 	if (sbi->meta_bdev) {
-		/*
-		 * Some kernels have blkdev_reread_part which calls
-		 * fsync_bdev while holding the bd_mutex which inverts
-		 * the s_umount hold in deactivate_super and blkdev_put
-		 * from kill_sb->put_super.
-		 */
-		lockdep_off();
 		blkdev_put(sbi->meta_bdev, SCOUTFS_META_BDEV_MODE);
-		lockdep_on();
 		sbi->meta_bdev = NULL;
 	}
 }
@@ -336,16 +328,28 @@ int scoutfs_write_super(struct super_block *sb,
 				      sizeof(struct scoutfs_super_block));
 }

-static bool small_bdev(struct super_block *sb, char *which, u64 blocks,
-		       struct block_device *bdev, int shift)
+static bool invalid_blkno_limits(struct super_block *sb, char *which,
+				 u64 start, __le64 first, __le64 last,
+				 struct block_device *bdev, int shift)
 {
-	u64 size = (u64)i_size_read(bdev->bd_inode);
-	u64 count = size >> shift;
+	u64 blkno;

-	if (blocks > count) {
-		scoutfs_err(sb, "super block records %llu %s blocks, but device %u:%u size %llu only allows %llu blocks",
-			blocks, which, MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev), size, count);
+	if (le64_to_cpu(first) < start) {
+		scoutfs_err(sb, "super block first %s blkno %llu is within first valid blkno %llu",
+			which, le64_to_cpu(first), start);
+		return true;
+	}

+	if (le64_to_cpu(first) > le64_to_cpu(last)) {
+		scoutfs_err(sb, "super block first %s blkno %llu is greater than last %s blkno %llu",
+			which, le64_to_cpu(first), which, le64_to_cpu(last));
+		return true;
+	}
+
+	blkno = (i_size_read(bdev->bd_inode) >> shift) - 1;
+	if (le64_to_cpu(last) > blkno) {
+		scoutfs_err(sb, "super block last %s blkno %llu is beyond device size last blkno %llu",
+			which, le64_to_cpu(last), blkno);
 		return true;
 	}

@@ -405,10 +409,16 @@ static int scoutfs_read_super_from_bdev(struct super_block *sb,

 	/* XXX do we want more rigorous invalid super checking? */

-	if (small_bdev(sb, "metadata", le64_to_cpu(super->total_meta_blocks), sbi->meta_bdev,
-		       SCOUTFS_BLOCK_LG_SHIFT) ||
-	    small_bdev(sb, "data", le64_to_cpu(super->total_data_blocks), sb->s_bdev,
-		       SCOUTFS_BLOCK_SM_SHIFT)) {
+	if (invalid_blkno_limits(sb, "meta",
+				 SCOUTFS_META_DEV_START_BLKNO,
+				 super->first_meta_blkno,
+				 super->last_meta_blkno, sbi->meta_bdev,
+				 SCOUTFS_BLOCK_LG_SHIFT) ||
+	    invalid_blkno_limits(sb, "data",
+				 SCOUTFS_DATA_DEV_START_BLKNO,
+				 super->first_data_blkno,
+				 super->last_data_blkno, sb->s_bdev,
+				 SCOUTFS_BLOCK_SM_SHIFT)) {
 		ret = -EINVAL;
 	}

@@ -612,16 +622,15 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
 	      scoutfs_quorum_setup(sb) ?:
 	      scoutfs_client_setup(sb) ?:
 	      scoutfs_volopt_setup(sb) ?:
-	      scoutfs_srch_setup(sb);
+	      scoutfs_trans_get_log_trees(sb) ?:
+	      scoutfs_srch_setup(sb) ?:
+	      scoutfs_inode_start(sb);
 	if (ret)
 		goto out;

-	/* this interruptible iget lets hung mount be aborted with ctl-c */
-	inode = scoutfs_iget(sb, SCOUTFS_ROOT_INO, SCOUTFS_LKF_INTERRUPTIBLE);
+	inode = scoutfs_iget(sb, SCOUTFS_ROOT_INO);
 	if (IS_ERR(inode)) {
 		ret = PTR_ERR(inode);
-		if (ret == -ERESTARTSYS)
-			ret = -EINTR;
 		goto out;
 	}

@@ -631,15 +640,10 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
 		goto out;
 	}

-	/* send requests once iget progress shows we had a server */
-	ret = scoutfs_trans_get_log_trees(sb) ?:
-	      scoutfs_client_advance_seq(sb, &sbi->trans_seq);
+	ret = scoutfs_client_advance_seq(sb, &sbi->trans_seq);
 	if (ret)
 		goto out;

-	/* start up background services that use everything else */
-	scoutfs_inode_start(sb);
-	scoutfs_forest_start(sb);
 	scoutfs_trans_restart_sync_deadline(sb);
 	ret = 0;
 out:
@@ -291,7 +291,7 @@ static void queue_trans_work(struct scoutfs_sb_info *sbi)
 int scoutfs_trans_sync(struct super_block *sb, int wait)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
-	struct write_attempt attempt = { .ret = 0 };
+	struct write_attempt attempt;
 	int ret;


@@ -306,8 +306,10 @@ int scoutfs_trans_sync(struct super_block *sb, int wait)

 	queue_trans_work(sbi);

-	wait_event(sbi->trans_write_wq, write_attempted(sbi, &attempt));
-	ret = attempt.ret;
+	ret = wait_event_interruptible(sbi->trans_write_wq,
+				       write_attempted(sbi, &attempt));
+	if (ret == 0)
+		ret = attempt.ret;

 	return ret;
 }
@@ -494,7 +496,9 @@ int scoutfs_hold_trans(struct super_block *sb, bool allocing)
 		/* wait until the writer work is finished */
 		if (!inc_holders_unless_writer(tri)) {
 			dec_journal_info_holders();
-			wait_event(sbi->trans_hold_wq, holders_no_writer(tri));
+			ret = wait_event_interruptible(sbi->trans_hold_wq, holders_no_writer(tri));
+			if (ret < 0)
+				break;
 			continue;
 		}

@@ -510,7 +514,10 @@ int scoutfs_hold_trans(struct super_block *sb, bool allocing)
 			seq = scoutfs_trans_sample_seq(sb);
 			release_holders(sb);
 			queue_trans_work(sbi);
-			wait_event(sbi->trans_hold_wq, scoutfs_trans_sample_seq(sb) != seq);
+			ret = wait_event_interruptible(sbi->trans_hold_wq,
+						       scoutfs_trans_sample_seq(sb) != seq);
+			if (ret < 0)
+				break;
 			continue;
 		}

@@ -40,7 +40,7 @@ t_filter_dmesg()
 	# mount and unmount spew a bunch
 	re="$re|scoutfs.*client connected"
 	re="$re|scoutfs.*client disconnected"
-	re="$re|scoutfs.*server starting"
+	re="$re|scoutfs.*server setting up"
 	re="$re|scoutfs.*server ready"
 	re="$re|scoutfs.*server accepted"
 	re="$re|scoutfs.*server closing"
@@ -1,27 +0,0 @@
-== make initial small fs
-== 0s do nothing
-== shrinking fails
-resize_devices ioctl failed: Invalid argument (22)
-scoutfs: resize-devices failed: Invalid argument (22)
-resize_devices ioctl failed: Invalid argument (22)
-scoutfs: resize-devices failed: Invalid argument (22)
-resize_devices ioctl failed: Invalid argument (22)
-scoutfs: resize-devices failed: Invalid argument (22)
-== existing sizes do nothing
-== growing outside device fails
-resize_devices ioctl failed: Invalid argument (22)
-scoutfs: resize-devices failed: Invalid argument (22)
-resize_devices ioctl failed: Invalid argument (22)
-scoutfs: resize-devices failed: Invalid argument (22)
-resize_devices ioctl failed: Invalid argument (22)
-scoutfs: resize-devices failed: Invalid argument (22)
-== resizing meta works
-== resizing data works
-== shrinking back fails
-resize_devices ioctl failed: Invalid argument (22)
-scoutfs: resize-devices failed: Invalid argument (22)
-resize_devices ioctl failed: Invalid argument (22)
-scoutfs: resize-devices failed: Invalid argument (22)
-== resizing again does nothing
-== resizing to full works
-== cleanup extra fs
@@ -29,7 +29,6 @@ lock-conflicting-batch-commit.sh
 cross-mount-data-free.sh
 persistent-item-vers.sh
 setup-error-teardown.sh
-resize-devices.sh
 fence-and-reclaim.sh
 orphan-inodes.sh
 mount-unmount-race.sh
@@ -48,9 +48,8 @@ char buf[SZ];

 int main(int argc, char **argv)
 {
-	struct scoutfs_ioctl_release rel = {0};
+	struct scoutfs_ioctl_release ioctl_args = {0};
 	struct scoutfs_ioctl_move_blocks mb;
-	struct scoutfs_ioctl_stat_more stm;
 	struct sub_tmp_info sub_tmps[8];
 	int tot_size = 0;
 	char *dest_file;
@@ -112,20 +111,12 @@ int main(int argc, char **argv)
 		exit(1);
 	}

-	// get current data_version after fallocate's size extensions
-	stm.valid_bytes = sizeof(struct scoutfs_ioctl_stat_more);
-	ret = ioctl(dest_fd, SCOUTFS_IOC_STAT_MORE, &stm);
-	if (ret < 0) {
-		perror("stat_more ioctl error");
-		exit(1);
-	}
-
 	// release everything in dest file
-	rel.offset = 0;
-	rel.length = tot_size;
-	rel.data_version = stm.data_version;
+	ioctl_args.offset = 0;
+	ioctl_args.length = tot_size;
+	ioctl_args.data_version = 0;

-	ret = ioctl(dest_fd, SCOUTFS_IOC_RELEASE, &rel);
+	ret = ioctl(dest_fd, SCOUTFS_IOC_RELEASE, &ioctl_args);
 	if (ret < 0) {
 		perror("error");
 		exit(1);
@@ -139,7 +130,7 @@ int main(int argc, char **argv)
 		mb.from_off = 0;
 		mb.len = sub_tmp->length;
 		mb.to_off = sub_tmp->offset;
-		mb.data_version = stm.data_version;
+		mb.data_version = 0;
 		mb.flags = SCOUTFS_IOC_MB_STAGE;

 		ret = ioctl(dest_fd, SCOUTFS_IOC_MOVE_BLOCKS, &mb);
@@ -1,149 +0,0 @@
-#
-# Some basic tests of online resizing metadata and data devices.
-#
-
-statfs_total() {
-	local single="total_$1_blocks"
-	local mnt="$2"
-
-	scoutfs statfs -s $single -p "$mnt"
-}
-
-df_free() {
-	local md="$1"
-	local mnt="$2"
-
-	scoutfs df -p "$mnt" | awk '($1 == "'$md'") { print $5; exit }'
-}
-
-same_totals() {
-	cur_meta_tot=$(statfs_total meta "$SCR")
-	cur_data_tot=$(statfs_total data "$SCR")
-
-	test "$cur_meta_tot" == "$exp_meta_tot" || \
-		t_fail "cur total_meta_blocks $cur_meta_tot != expected $exp_meta_tot"
-	test "$cur_data_tot" == "$exp_data_tot" || \
-		t_fail "cur total_data_blocks $cur_data_tot != expected $exp_data_tot"
-}
-
-#
-# make sure that the specified devices have grown by doubling.   The
-# total blocks can be tested exactly but the df reported total needs
-# some slop to account for reserved blocks and concurrent allocation.
-#
-devices_grew() {
-	cur_meta_tot=$(statfs_total meta "$SCR")
-	cur_data_tot=$(statfs_total data "$SCR")
-	cur_meta_df=$(df_free MetaData "$SCR")
-	cur_data_df=$(df_free Data "$SCR")
-
-	local grow_meta_tot=$(echo "$exp_meta_tot * 2" | bc)
-	local grow_data_tot=$(echo "$exp_data_tot * 2" | bc)
-	local grow_meta_df=$(echo "($exp_meta_df * 1.95)/1" | bc)
-	local grow_data_df=$(echo "($exp_data_df * 1.95)/1" | bc)
-
-	if [ "$1" == "meta" ]; then
-		test "$cur_meta_tot" == "$grow_meta_tot" || \
-			t_fail "cur total_meta_blocks $cur_meta_tot != grown $grow_meta_tot"
-		test "$cur_meta_df" -lt "$grow_meta_df" && \
-			t_fail "cur meta df total $cur_meta_df < grown $grow_meta_df"
-		exp_meta_tot=$cur_meta_tot
-		exp_meta_df=$cur_meta_df
-		shift
-	fi
-
-	if [ "$1" == "data" ]; then
-		test "$cur_data_tot" == "$grow_data_tot" || \
-			t_fail "cur total_data_blocks $cur_data_tot != grown $grow_data_tot"
-		test "$cur_data_df" -lt "$grow_data_df" && \
-			t_fail "cur data df total $cur_data_df < grown $grow_data_df"
-		exp_data_tot=$cur_data_tot
-		exp_data_df=$cur_data_df
-	fi
-}
-
-# first calculate small mkfs based on device size
-size_meta=$(blockdev --getsize64 "$T_EX_META_DEV")
-size_data=$(blockdev --getsize64 "$T_EX_DATA_DEV")
-quarter_meta=$(echo "$size_meta / 4" | bc)
-quarter_data=$(echo "$size_data / 4" | bc)
-
-# XXX this is all pretty manual, would be nice to have helpers
-echo "== make initial small fs"
-scoutfs mkfs -A -f -Q 0,127.0.0.1,53000 -m $quarter_meta -d $quarter_data \
-	"$T_EX_META_DEV" "$T_EX_DATA_DEV" > $T_TMP.mkfs.out 2>&1 || \
-		t_fail "mkfs failed"
-SCR="/mnt/scoutfs.enospc"
-mkdir -p "$SCR"
-mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 \
-	"$T_EX_DATA_DEV" "$SCR"
-
-# then calculate sizes based on blocks that mkfs used
-quarter_meta=$(echo "$(statfs_total meta "$SCR") * 64 * 1024" | bc)
-quarter_data=$(echo "$(statfs_total data "$SCR") * 4 * 1024" | bc)
-whole_meta=$(echo "$quarter_meta * 4" | bc)
-whole_data=$(echo "$quarter_data * 4" | bc)
-outsize_meta=$(echo "$whole_meta * 2" | bc)
-outsize_data=$(echo "$whole_data * 2" | bc)
-half_meta=$(echo "$whole_meta / 2" | bc)
-half_data=$(echo "$whole_data / 2" | bc)
-shrink_meta=$(echo "$quarter_meta / 2" | bc)
-shrink_data=$(echo "$quarter_data / 2" | bc)
-
-# and save expected values for checks
-exp_meta_tot=$(statfs_total meta "$SCR")
-exp_meta_df=$(df_free MetaData "$SCR")
-exp_data_tot=$(statfs_total data "$SCR")
-exp_data_df=$(df_free Data "$SCR")
-
-echo "== 0s do nothing"
-scoutfs resize-devices -p "$SCR" 
-scoutfs resize-devices -p "$SCR" -m 0
-scoutfs resize-devices -p "$SCR" -d 0
-scoutfs resize-devices -p "$SCR" -m 0 -d 0
-
-echo "== shrinking fails"
-scoutfs resize-devices -p "$SCR" -m $shrink_meta
-scoutfs resize-devices -p "$SCR" -d $shrink_data
-scoutfs resize-devices -p "$SCR" -m $shrink_meta -d $shrink_data
-same_totals
-
-echo "== existing sizes do nothing"
-scoutfs resize-devices -p "$SCR" -m $quarter_meta
-scoutfs resize-devices -p "$SCR" -d $quarter_data
-scoutfs resize-devices -p "$SCR" -m $quarter_meta -d $quarter_data
-same_totals
-
-echo "== growing outside device fails"
-scoutfs resize-devices -p "$SCR" -m $outsize_meta
-scoutfs resize-devices -p "$SCR" -d $outsize_data
-scoutfs resize-devices -p "$SCR" -m $outsize_meta -d $outsize_data
-same_totals
-
-echo "== resizing meta works"
-scoutfs resize-devices -p "$SCR" -m $half_meta
-devices_grew meta
-
-echo "== resizing data works"
-scoutfs resize-devices -p "$SCR" -d $half_data
-devices_grew data
-
-echo "== shrinking back fails"
-scoutfs resize-devices -p "$SCR" -m $quarter_meta
-scoutfs resize-devices -p "$SCR" -m $quarter_data
-same_totals
-
-echo "== resizing again does nothing"
-scoutfs resize-devices -p "$SCR" -m $half_meta
-scoutfs resize-devices -p "$SCR" -m $half_data
-same_totals
-
-echo "== resizing to full works"
-scoutfs resize-devices -p "$SCR" -m $whole_meta -d $whole_data
-devices_grew meta data
-
-echo "== cleanup extra fs"
-umount "$SCR"
-rmdir "$SCR"
-
-t_pass
@@ -7,7 +7,7 @@ message_output()

 error_message()
 {
-	message_output "$@" >&2
+	message_output "$@" >> /dev/stderr
 }

 error_exit()
@@ -18,7 +18,7 @@ error_exit()

 log_message()
 {
-	message_output "$@"
+	message_output "$@" >> /dev/stdout
 }

 # restart if we catch hup to re-read the config
@@ -1,3 +0,0 @@
-SCOUTFS_FENCED_DELAY=1
-SCOUTFS_FENCED_RUN=/usr/libexec/scoutfs-fenced/run/local-force-unmount
-SCOUTFS_FENCED_RUN_ARGS=""
@@ -1,11 +0,0 @@
-[Unit]
-Description=ScoutFS fenced
-
-[Service]
-Restart=on-failure
-RestartSec=5s
-StartLimitBurst=5
-ExecStart=/usr/libexec/scoutfs-fenced/scoutfs-fenced
-
-[Install]
-WantedBy=default.target
@@ -103,63 +103,6 @@ Ignore presence of existing data on the data and metadata devices.
 .PD

 .TP
-.BI "resize-devices [-p|--path PATH] [-m|--meta-size SIZE] [-d|--data-size SIZE]"
-.sp
-Resize the metadata or data devices of a mounted ScoutFS filesystem.
-.sp
-ScoutFS metadata has free extent records and fields in the super block
-that reflect the size of the devices in use.  This command sends a
-request to the server to change the size of the device that can be used
-by updating free extents and setting the super block fields.
-.sp
-The specified sizes are in bytes and are translated into block counts.
-If the specified sizes are not a multiple of the metadata or data block
-sizes then a message is output and the resized size is truncated down to
-the next whole block.  Specifying either a size of 0 or the current
-device size makes no change.  The current size of the devices can be
-seen, in units of their respective block sizes, in the total_meta_blocks
-and total_data_blocks fields returned by the scoutfs statfs command (via
-the statfs_more ioctl).
-.sp
-Shrinking is not supported.  Specifying a smaller size for either device
-will return an error and neither device will be resized.
-.sp
-Specifying a larger size will expand the initial size of the device that
-will be used.  Free space records are added for the expanded region and
-can be used once the resizing transaction is complete.
-.sp
-The resizing action is performed in a transaction on the server.  This
-command will hang until a server is elected and running and can service
-the request.  The server serializes any concurrent requests to resize.
-.sp
-The new sizes must fit within the current sizes of the mounted devices.
-Presumably this command is being performed as part of a larger
-coordinated resize of the underlying devices.  The device must be
-expanded before ScoutFS can use the larger device and ScoutFS must stop
-using a region to shrink before it could be removed from the device
-(which is not currently supported).
-.sp
-The resize will be committed by the server before the response is sent
-to the client.  The system can be using the new device size before the
-result is communicated through the client and this command completes.
-The client could crash and the server could still have performed the
-resize.
-.RS 1.0i
-.PD 0
-.TP
-.sp
-.B "-p, --path PATH"
-A path in the mounted ScoutFS filesystem which will have its devices
-resized.
-.TP
-.B "-m, --meta-size SIZE"
-.B "-d, --data-size SIZE"
-The new size of the metadata or data device to use, in bytes.  Size is given as
-an integer followed by a units digit: "K", "M", "G", "T", "P", to denote
-kibibytes, mebibytes, etc.
-.RE
-.PD
-
 .BI "stat FILE [-s|--single-field FIELD-NAME]"
 .sp
 Display ScoutFS-specific metadata fields for the given file.
@@ -56,14 +56,10 @@ install -m 644 -D src/ioctl.h $RPM_BUILD_ROOT%{_includedir}/scoutfs/ioctl.h
 install -m 644 -D src/format.h $RPM_BUILD_ROOT%{_includedir}/scoutfs/format.h
 install -m 755 -D fenced/scoutfs-fenced $RPM_BUILD_ROOT%{_libexecdir}/scoutfs-fenced/scoutfs-fenced
 install -m 755 -D fenced/local-force-unmount $RPM_BUILD_ROOT%{_libexecdir}/scoutfs-fenced/run/local-force-unmount
-install -m 644 -D fenced/scoutfs-fenced.service $RPM_BUILD_ROOT%{_unitdir}/scoutfs-fenced.service
-install -m 644 -D fenced/scoutfs-fenced.conf.example $RPM_BUILD_ROOT%{_sysconfdir}/scoutfs/scoutfs-fenced.conf.example

 %files
 %defattr(644,root,root,755)
 %{_mandir}/man*/scoutfs*.gz
-%{_unitdir}/scoutfs-fenced.service
-%{_sysconfdir}/scoutfs
 %defattr(755,root,root,755)
 %{_sbindir}/scoutfs
 %{_libexecdir}/scoutfs-fenced
@@ -241,7 +241,11 @@ static int do_mkfs(struct mkfs_args *args)
 	super->next_ino = cpu_to_le64(SCOUTFS_ROOT_INO + 1);
 	super->seq = cpu_to_le64(1);
 	super->total_meta_blocks = cpu_to_le64(last_meta + 1);
-	super->total_data_blocks = cpu_to_le64(last_data + 1);
+	super->first_meta_blkno = cpu_to_le64(next_meta);
+	super->last_meta_blkno = cpu_to_le64(last_meta);
+	super->total_data_blocks = cpu_to_le64(last_data - first_data + 1);
+	super->first_data_blkno = cpu_to_le64(first_data);
+	super->last_data_blkno = cpu_to_le64(last_data);

 	assert(sizeof(args->slots) ==
 		     member_sizeof(struct scoutfs_super_block, qconf.slots));
@@ -316,7 +320,7 @@ static int do_mkfs(struct mkfs_args *args)
 	blkno = next_meta++;
 	ret = write_alloc_root(meta_fd, fsid, &super->data_alloc, bt,
 			       1, blkno, first_data,
-			       last_data - first_data + 1);
+			       le64_to_cpu(super->total_data_blocks));
 	if (ret < 0)
 		goto out;

@@ -951,7 +951,8 @@ static void print_super_block(struct scoutfs_super_block *super, u64 blkno)

 	/* XXX these are all in a crazy order */
 	printf("  next_ino %llu seq %llu\n"
-	       "  total_meta_blocks %llu total_data_blocks %llu\n"
+	       "  total_meta_blocks %llu first_meta_blkno %llu last_meta_blkno %llu\n"
+	       "  total_data_blocks %llu first_data_blkno %llu last_data_blkno %llu\n"
 	       "  meta_alloc[0]: "ALCROOT_F"\n"
 	       "  meta_alloc[1]: "ALCROOT_F"\n"
 	       "  data_alloc: "ALCROOT_F"\n"
@@ -968,7 +969,11 @@ static void print_super_block(struct scoutfs_super_block *super, u64 blkno)
 		le64_to_cpu(super->next_ino),
 		le64_to_cpu(super->seq),
 		le64_to_cpu(super->total_meta_blocks),
+		le64_to_cpu(super->first_meta_blkno),
+		le64_to_cpu(super->last_meta_blkno),
 		le64_to_cpu(super->total_data_blocks),
+		le64_to_cpu(super->first_data_blkno),
+		le64_to_cpu(super->last_data_blkno),
 		ALCROOT_A(&super->meta_alloc[0]),
 		ALCROOT_A(&super->meta_alloc[1]),
 		ALCROOT_A(&super->data_alloc),
@@ -1,120 +0,0 @@
-#include <unistd.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/ioctl.h>
-#include <fcntl.h>
-#include <errno.h>
-#include <string.h>
-#include <argp.h>
-
-#include "sparse.h"
-#include "parse.h"
-#include "util.h"
-#include "format.h"
-#include "ioctl.h"
-#include "cmd.h"
-
-struct resize_args {
-	char *path;
-	u64 meta_size;
-	u64 data_size;
-};
-
-static int do_resize_devices(struct resize_args *args)
-{
-	struct scoutfs_ioctl_resize_devices rd;
-	int ret;
-	int fd;
-
-	if (args->meta_size & SCOUTFS_BLOCK_LG_MASK) {
-		printf("metadata device size %llu is not a multiple of %u metadata block size, truncating down to %llu byte size\n",
-		args->meta_size, SCOUTFS_BLOCK_LG_SIZE,
-		args->meta_size & ~(u64)SCOUTFS_BLOCK_LG_MASK);
-	}
-
-	if (args->data_size & SCOUTFS_BLOCK_SM_MASK) {
-		printf("data device size %llu is not a multiple of %u data block size, truncating down to %llu byte size\n",
-		args->data_size, SCOUTFS_BLOCK_SM_SIZE,
-		args->data_size & ~(u64)SCOUTFS_BLOCK_SM_MASK);
-	}
-
-	fd = get_path(args->path, O_RDONLY);
-	if (fd < 0)
-		return fd;
-
-	rd.new_total_meta_blocks = args->meta_size >> SCOUTFS_BLOCK_LG_SHIFT;
-	rd.new_total_data_blocks = args->data_size >> SCOUTFS_BLOCK_SM_SHIFT;
-
-	ret = ioctl(fd, SCOUTFS_IOC_RESIZE_DEVICES, &rd);
-	if (ret < 0) {
-		ret = -errno;
-		fprintf(stderr, "resize_devices ioctl failed: %s (%d)\n", strerror(errno), errno);
-	}
-
-	close(fd);
-	return ret;
-};
-
-static int parse_opt(int key, char *arg, struct argp_state *state)
-{
-	struct resize_args *args = state->input;
-	int ret;
-
-	switch (key) {
-	case 'm': /* meta-size */
-	{
-		ret = parse_human(arg, &args->meta_size);
-		if (ret)
-			return ret;
-		break;
-	}
-	case 'd': /* data-size */
-	{
-		ret = parse_human(arg, &args->data_size);
-		if (ret)
-			return ret;
-		break;
-	}
-	case 'p':
-		args->path = strdup_or_error(state, arg);
-		break;
-	default:
-		break;
-	}
-
-	return 0;
-}
-
-static struct argp_option options[] = {
-	{ "path", 'p', "PATH", 0, "Path to ScoutFS filesystem"},
-	{ "meta-size", 'm', "SIZE", 0, "New metadata device size (bytes or KMGTP units)"},
-	{ "data-size", 'd', "SIZE", 0, "New data device size (bytes or KMGTP units)"},
-	{ NULL }
-};
-
-static struct argp argp = {
-	options,
-	parse_opt,
-	"",
-	"Online resize of metadata and/or data devices",
-};
-
-static int resize_devices_cmd(int argc, char **argv)
-{
-
-	struct resize_args resize_args = {NULL,};
-	int ret;
-
-	ret = argp_parse(&argp, argc, argv, 0, NULL, &resize_args);
-	if (ret)
-		return ret;
-
-	return do_resize_devices(&resize_args);
-}
-
-static void __attribute__((constructor)) read_xattr_totals_ctor(void)
-{
-	cmd_register_argp("resize-devices", &argp, GROUP_CORE, resize_devices_cmd);
-}
@@ -37,6 +37,7 @@ static struct stat_more_field inode_fields[] = {
 	INODE_FIELD(data_version),
 	INODE_FIELD(online_blocks),
 	INODE_FIELD(offline_blocks),
+	{ .name = "crtime", .offset = INODE_FIELD_OFF(crtime_sec) },
 	{ NULL, }
 };

@@ -60,6 +61,9 @@ static void print_inode_field(void *st, size_t off)
 		case INODE_FIELD_OFF(offline_blocks):
 			printf("%llu", stm->offline_blocks);
 			break;
+		case INODE_FIELD_OFF(crtime_sec):
+			printf("%llu.%09u", stm->crtime_sec, stm->crtime_nsec);
+			break;
 	};
 }