Compare commits

...

29 Commits

Author SHA1 Message Date
Zach Brown
c3290771a0 Block cache use rht _lookup_ insert for EEXIST
The sneaky rhashtable_insert_fast() can't return -EEXIST despite the
last line of the function *REALLY* making it look like it can.  It just
inserts new objects at the head of the bucket lists without comparing
the inserted object with existing objects.

The block cache was relying on insertion to resolve duplicate racing
allocated blocks.  Because it couldn't return -EEXIST we could get
duplicate cached blocks present in the hash table.

rhashtable_lookup_insert_fast() fixes this by actually comparing the
inserted object's key with the objects found in the insertion bucket.  A
racing allocator trying to insert a duplicate cached block will get an
error, drop its allocated block, and retry its lookup.

Signed-off-by: Zach Brown <zab@versity.com>
2021-04-13 09:24:23 -07:00
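As an illustration of the pattern described above (not the commit's
code; cache_block and insert_or_reuse are hypothetical names and
refcounting is omitted), a minimal sketch of resolving a racing
duplicate insert by freeing the loser's block and reusing the winner's:

#include <linux/rhashtable.h>
#include <linux/slab.h>
#include <linux/err.h>

struct cache_block {
	u64 blkno;
	struct rhash_head ht_head;
	/* ... cached block payload ... */
};

static const struct rhashtable_params cache_params = {
	.key_len	= sizeof(u64),
	.key_offset	= offsetof(struct cache_block, blkno),
	.head_offset	= offsetof(struct cache_block, ht_head),
};

/* insert a newly allocated block, resolving duplicate-key races */
static struct cache_block *insert_or_reuse(struct rhashtable *ht,
					   struct cache_block *new)
{
	struct cache_block *existing;
	int ret;

	/* unlike rhashtable_insert_fast(), this compares keys in the bucket */
	ret = rhashtable_lookup_insert_fast(ht, &new->ht_head, cache_params);
	if (ret == -EEXIST) {
		/* lost the race: drop our copy and look up the winner's */
		existing = rhashtable_lookup_fast(ht, &new->blkno, cache_params);
		kfree(new);
		return existing;
	}

	return ret < 0 ? ERR_PTR(ret) : new;
}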
Zach Brown
cf3cb3f197 Wait for rhashtable to rehash on insert EBUSY
The rhashtable can return EBUSY if you insert fast enough to need
another expansion while the next table size is still waiting to be
rehashed in an rcu callback.  If we get EBUSY from rhashtable_insert we
call synchronize_rcu to wait for the rehash to complete before trying
again.

This was hit while testing restores of a very large namespace and took a
few hours to reproduce.

Signed-off-by: Zach Brown <zab@versity.com>
2021-04-13 09:24:23 -07:00
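The retry described here boils down to a small loop; a rough sketch of
the shape with assumed names (the real version is in block_insert() in
the diff below):

#include <linux/rhashtable.h>
#include <linux/rcupdate.h>

/* insert, waiting out a pending table rehash if we race with one */
static int insert_wait_rehash(struct rhashtable *ht, struct rhash_head *head,
			      const struct rhashtable_params params)
{
	int ret;

	do {
		ret = rhashtable_lookup_insert_fast(ht, head, params);
		if (ret == -EBUSY) {
			/* next table is waiting on an rcu callback to rehash */
			synchronize_rcu();
		}
	} while (ret == -EBUSY);

	return ret;
}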
Andy Grover
cb4ed98b3c Merge pull request #31 from versity/zab/block_shrink_wait_for_rebalance
Block cache shrink restart waits for rcu callbacks
2021-04-08 09:03:12 -07:00
Zach Brown
9ee7f7b9dc Block cache shrink restart waits for rcu callbacks
We're seeing cpu livelocks in block shrinking where counters show that a
single block cache shrink call is only getting EAGAIN from repeated
rhashtable walk attempts.  It occurred to me that the running task might
be preventing an RCU grace period from ending by never blocking.

The hope of this commit is that by waiting for rcu callbacks to run
we'll ensure that any pending rebalance callback runs before we retry
the rhashtable walk.  I haven't been able to reproduce this easily, so
this is a stab in the dark.

Signed-off-by: Zach Brown <zab@versity.com>
2021-04-07 12:50:50 -07:00
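A sketch of the walk restart this is aiming for, assuming a kernel with
the rhashtable_walk_enter() API and hypothetical names; the key point is
tearing the walk down and calling synchronize_rcu() so the pending
rehash callback can run before we start over:

#include <linux/rhashtable.h>
#include <linux/rcupdate.h>
#include <linux/err.h>

/* visit every object, restarting the whole walk if a resize raced with us */
static void walk_all(struct rhashtable *ht, void (*visit)(void *obj))
{
	struct rhashtable_iter iter;
	void *obj;

restart:
	rhashtable_walk_enter(ht, &iter);
	rhashtable_walk_start(&iter);

	/* visit() runs under the walk's rcu read side and must not sleep */
	while ((obj = rhashtable_walk_next(&iter)) != NULL) {
		if (obj == ERR_PTR(-EAGAIN)) {
			/* stop holding up the grace period the rehash needs */
			rhashtable_walk_stop(&iter);
			rhashtable_walk_exit(&iter);
			synchronize_rcu();
			goto restart;
		}
		if (IS_ERR(obj))
			continue;
		visit(obj);
	}

	rhashtable_walk_stop(&iter);
	rhashtable_walk_exit(&iter);
}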
Zach Brown
300791ecfa Merge pull request #29 from agrover/cleanup
Cleanup
2021-04-07 12:27:00 -07:00
Andy Grover
4630b77b45 cleanup: Use flexible array members instead of 0-length arrays
See Documentation/process/deprecated.rst:217; items[] is now preferred
over items[0].

Signed-off-by: Andy Grover <agrover@versity.com>
2021-04-07 10:14:47 -07:00
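For reference, a tiny before/after sketch of the change with a
hypothetical struct; the kernel's struct_size() helper sizes the
trailing array without open-coded arithmetic:

#include <linux/types.h>
#include <linux/slab.h>
#include <linux/overflow.h>
#include <asm/byteorder.h>

struct blkno_list_old {
	__le32 nr;
	__le64 blknos[0];	/* deprecated zero-length array */
};

struct blkno_list {
	__le32 nr;
	__le64 blknos[];	/* preferred flexible array member */
};

/* allocate the struct plus nr trailing elements */
static struct blkno_list *alloc_blkno_list(u32 nr)
{
	struct blkno_list *bl;

	bl = kzalloc(struct_size(bl, blknos, nr), GFP_NOFS);
	if (bl)
		bl->nr = cpu_to_le32(nr);
	return bl;
}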
Andy Grover
bdc43ca634 cleanup: Fix ESTALE handling in forest_read_items
It's a little weird to goto back to the out label and then fall out the
bottom.  Just return -EIO, like forest_next_hint() does.

Don't call client_get_roots() right before retry, since that is the
first thing retry does.

Signed-off-by: Andy Grover <agrover@versity.com>
2021-04-07 10:14:04 -07:00
Andy Grover
6406f05350 cleanup: Remove struct net_lock_grant_response
We're not using the roots member of this struct, so we can just
use struct scoutfs_net_lock directly.

Signed-off-by: Andy Grover <agrover@versity.com>
2021-04-07 10:13:56 -07:00
Andy Grover
820b7295f0 cleanup: Unused LIST_HEADs
Signed-off-by: Andy Grover <agrover@versity.com>
2021-04-05 16:23:41 -07:00
Zach Brown
b3611103ee Merge pull request #26 from agrover/tmpfile
Support O_TMPFILE and allow MOVE_BLOCKS into released extents
2021-04-05 15:23:41 -07:00
Andy Grover
0deb232d3f Support O_TMPFILE and allow MOVE_BLOCKS into released extents
Support O_TMPFILE: Create an unlinked file and put it on the orphan list.
If it ever gains a link, take it off the orphan list.

Change MOVE_BLOCKS ioctl to allow moving blocks into offline extent ranges.
Ioctl callers must set a new flag to enable this operation mode.

RH-compat: tmpfile support is actually backported by RH into the 3.10
kernel.  We need to use some of their kabi-maintaining wrappers to use
it: use a struct inode_operations_wrapper instead of the base struct
inode_operations, and set the S_IOPS_WRAPPER flag in i_flags.  This lets
RH's modified vfs_tmpfile() find our tmpfile fn pointer.

Add a test that covers both creating tmpfiles and moving their contents
into a destination file via MOVE_BLOCKS.

xfstests common/004 now runs because tmpfile is supported.

Signed-off-by: Andy Grover <agrover@versity.com>
2021-04-05 14:23:44 -07:00
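A hedged userspace sketch of the new ioctl mode, using the
scoutfs_ioctl_move_blocks fields and SCOUTFS_IOC_MB_STAGE flag shown in
the ioctl.h diff below; the "scoutfs_ioctl.h" header name and the helper
are assumptions, and error handling is trimmed:

#include <stdint.h>
#include <sys/ioctl.h>
#include "scoutfs_ioctl.h"	/* assumed header exporting the ioctl ABI */

/*
 * Stage the first 1MB of a tmpfile into a released (offline) extent of
 * the destination at offset 0.  The ioctl is issued on the destination
 * and fails with ESTALE if its data_version no longer matches.
 */
static int stage_into_released(int tmpfile_fd, int dest_fd,
			       uint64_t data_version)
{
	struct scoutfs_ioctl_move_blocks mb = {
		.from_fd = (uint64_t)tmpfile_fd,
		.from_off = 0,
		.len = 1024 * 1024,	/* offsets and lengths are 4KB multiples */
		.to_off = 0,
		.data_version = data_version,
		.flags = SCOUTFS_IOC_MB_STAGE,
	};

	return ioctl(dest_fd, SCOUTFS_IOC_MOVE_BLOCKS, &mb);
}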
Andy Grover
1366e254f9 Merge pull request #30 from versity/zab/srch_block_ref_leak
Zab/srch block ref leak
2021-04-01 16:50:34 -07:00
Zach Brown
1259f899a3 srch compaction needs to prepare alloc for commit
The srch client compaction work initializes allocators, dirties blocks,
and writes them out as its transaction.  It forgot to call the
pre-commit allocator prepare function.

The prepare function drops block references used by the meta allocator
during the transaction.  This leaked block references which kept blocks
from being freed by the shrinker under memory pressure.  Eventually
memory was full of leaked blocks and the shrinker walked all of them
looking for blocks to free, resulting in an effective livelock that
ground the system to a crawl.

Signed-off-by: Zach Brown <zab@versity.com>
2021-04-01 13:04:40 -07:00
Zach Brown
2d393f435b Warn on leaked block refs on unmount
By the time we get to destroying the block cache we should have put all
our block references.  Warn as we tear down the blocks if we see any
blocks that still have references, implying a ref leak.  This caught a
leak caused by srch compaction forgetting to put allocator list block
refs.

Signed-off-by: Zach Brown <zab@versity.com>
2021-04-01 13:04:06 -07:00
Andy Grover
09c879bcf1 Merge pull request #25 from versity/zab/client_greeting_items_exist
Zab/client greeting items exist
2021-03-16 15:57:55 -07:00
Zach Brown
3de703757f Fix weird comment editing error
That comment looked very weird indeed until I recognized that I must
have forgotten to delete the first two attempts at starting the
sentence.

Signed-off-by: Zach Brown <zab@versity.com>
2021-03-16 12:02:05 -07:00
Zach Brown
7d67489b0c Handle resent initial client greetings
The very first greeting a client sends is unique because it doesn't yet
have a server_term field set and tells the server to create items to
track the client.

A server processing this request can create the items and then shut down
before the client is able to receive the reply.  The client will resend
the greeting without server_term, but then the next server will get
-EEXIST errors as it tries to create items for the client.  This causes
the connection to break, which the client tries to reestablish, and the
pattern repeats indefinitely.

The fix is to simply recognize that -EEXIST is acceptable during item
creation.  Server message handlers always have to address the case where
a resent message was already processed by a previous server but its
response didn't make it to the client.

Signed-off-by: Zach Brown <zab@versity.com>
2021-03-16 11:56:26 -07:00
Zach Brown
73084462e9 Remove unused client greeting_umb
Remove an old client info field left over from the unmount barrier
mechanism, which was removed a while ago.  It used to be compared to a
super field to decide to finish unmount without reconnecting, but now we
check for our mounted_client item in the server's btree.

Signed-off-by: Zach Brown <zab@versity.com>
2021-03-16 10:04:42 -07:00
Zach Brown
8c81af2b9b Merge pull request #22 from agrover/ipv6
Reserve space in superblock for IPv6 addresses
2021-03-15 16:04:26 -07:00
Andy Grover
efe5d92458 Reserve space in superblock for IPv6 addresses
Define a family field, and add a union for IPv4 and v6 variants, although
v6 is not supported yet.

The family field is now used to determine the presence of an address in
a quorum slot, instead of checking whether the addr is zero.

Signed-off-by: Andy Grover <agrover@versity.com>
2021-03-12 14:10:42 -08:00
Andy Grover
d39e56d953 Merge pull request #24 from versity/zab/fix-block-stale-reads
Zab/fix block stale reads
2021-03-11 09:33:03 -08:00
Zach Brown
5661a1fb02 Fix block-stale-reads test
The block-stale-reads test was built from the ashes of a test that
used counters and triggers to work with the btree when it was
only used on the server.

The initial quick translation, meant to trigger block cache retries
while the forest called the btree, got a lot wrong.  It was still trying
to use a 'cl' variable that no longer referred to the client, the
trigger helpers now call statfs to find paths and can end up firing the
triggers themselves, and many more stale reads can happen throughout the
system while we're working -- not just the one from our trigger.

This fixes it up to consistently use fs numbers instead of the silly
stale cl variable and to be less sensitive to triggers firing and
counter differences.

Signed-off-by: Zach Brown <zab@versity.com>
2021-03-10 12:36:41 -08:00
Zach Brown
12fa289399 Add t_trigger_arm_silent
t_trigger_arm always outputs the value of the trigger after arming, on
the premise that tests need to see that the trigger was armed.  In the
process of showing the trigger it calls a bunch of t_ helpers that build
the path to the trigger file using statfs_more to get the rid of mounts.

If the trigger being armed is in the server's mount and the specific
trigger is fired by the server's statfs_more request processing, then
the trigger can fire before we read its value.  Tests can inconsistently
fail as the golden output shows the trigger being armed or not,
depending on whether it was in the server's mount.

t_trigger_arm_silent doesn't output the value of the armed trigger.  It
can be used for low level triggers that don't rely on reading the
trigger's value to discover that their effect has happened.

Signed-off-by: Zach Brown <zab@versity.com>
2021-03-10 12:36:34 -08:00
Zach Brown
75e8fab57c Add t_counter_diff_changed
Tests can use t_counter_diff to put a message in their golden output
when a specific change in counters is expected.  This adds
t_counter_diff_changed to output a message that indicates whether or not
the counters changed, for tests that want to see counters change but
don't need to know the precise amount of change.

Signed-off-by: Zach Brown <zab@versity.com>
2021-03-10 12:32:04 -08:00
Zach Brown
513d6b2734 Merge pull request #20 from versity/zab/remove_trans_spinlock
Zab/remove trans spinlock
2021-03-04 13:59:07 -08:00
Zach Brown
f8d39610a2 Only get inode writeback_lock when adding inodes
Each transaction maintains a global list of inodes to sync.  It checks
the inode and adds it in each write_end call, once per OS page.  Locking
and unlocking the global spinlock was showing up in profiles.  At the
very least, we can take the lock only once per file that's written
during a transaction.  This will reduce spinlock traffic on the lock by
the number of pages written per file.  We'll want a better solution in
the long run, but this helps for now.

Signed-off-by: Zach Brown <zab@versity.com>
2021-03-04 11:39:30 -08:00
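The change amounts to double-checked insertion around the global lock;
a self-contained sketch with hypothetical names (the node is assumed to
be RB_CLEAR_NODE()ed when the inode is set up and again on removal):

#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/types.h>

struct wb_info {
	spinlock_t lock;
	struct rb_root root;
};

struct wb_inode {
	struct rb_node node;	/* RB_CLEAR_NODE()ed while not queued */
	u64 ino;
};

static void insert_wb(struct wb_info *inf, struct wb_inode *wi)
{
	struct rb_node **p = &inf->root.rb_node;
	struct rb_node *parent = NULL;
	struct wb_inode *cur;

	while (*p) {
		parent = *p;
		cur = rb_entry(parent, struct wb_inode, node);
		p = wi->ino < cur->ino ? &parent->rb_left : &parent->rb_right;
	}
	rb_link_node(&wi->node, parent, p);
	rb_insert_color(&wi->node, &inf->root);
}

/* called for every dirtied page; only take the global lock on first insert */
static void queue_writeback(struct wb_info *inf, struct wb_inode *wi)
{
	if (RB_EMPTY_NODE(&wi->node)) {
		spin_lock(&inf->lock);
		/* recheck: someone may have inserted while we took the lock */
		if (RB_EMPTY_NODE(&wi->node))
			insert_wb(inf, wi);
		spin_unlock(&inf->lock);
	}
}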
Zach Brown
c470c1c9f6 Allow read-mostly _alloc_meta_low
Each transaction hold makes multiple calls to _alloc_meta_low to see if
the transaction should be committed to refill allocators before the
caller's hold is acquired and they can dirty blocks in the transaction.

_alloc_meta_low was using a spinlock to sample the allocator list_head
blocks to determine if there was space available.  The lock and unlock
stores were creating significant cacheline contention.

The _alloc_meta_low calls are much more frequent than allocations.  We
can use a seqlock to give writers exclusion and allow concurrent
_alloc_meta_low readers, who retry if a writer intervenes.

Signed-off-by: Zach Brown <zab@versity.com>
2021-03-04 11:39:30 -08:00
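The read-mostly check boils down to a standard seqlock pattern; a
minimal sketch with simplified names (the real reader is
scoutfs_alloc_meta_low in the alloc diff below):

#include <linux/seqlock.h>
#include <linux/types.h>

struct alloc_counts {
	seqlock_t seqlock;
	u32 avail_nr;
	u32 freed_space;
};

static void counts_init(struct alloc_counts *ac)
{
	seqlock_init(&ac->seqlock);
	ac->avail_nr = 0;
	ac->freed_space = 0;
}

/* rare writers get exclusion from each other and invalidate readers */
static void counts_update(struct alloc_counts *ac, u32 avail, u32 freed)
{
	write_seqlock(&ac->seqlock);
	ac->avail_nr = avail;
	ac->freed_space = freed;
	write_sequnlock(&ac->seqlock);
}

/* frequent readers sample without bouncing an exclusive lock's cacheline */
static bool counts_low(struct alloc_counts *ac, u32 nr)
{
	unsigned int seq;
	bool lo;

	do {
		seq = read_seqbegin(&ac->seqlock);
		lo = ac->avail_nr < nr || ac->freed_space < nr;
	} while (read_seqretry(&ac->seqlock, seq));

	return lo;
}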
Andy Grover
cad902b9cd Merge pull request #19 from versity/zab/block_crash_and_consistency
Zab/block crash and consistency
2021-03-04 10:57:27 -08:00
Zach Brown
e163f3b099 Use atomic holders instead of trans info lock
We saw the transaction info lock showing up in profiles.  We were doing
quite a lot of work with that lock held.  We can remove it entirely and
use an atomic.

Instead of a locked holders count and a writer boolean we can use an
atomic holders count and have a high bit indicate that the write_func is
pending.  This turns the lock/unlock pairs in hold and release into
atomic inc/cmpxchg/dec operations.

Then we were checking allocators under the trans lock.  Now that we have
an atomic holders count we can increment it to prevent the writer from
committing, and release it after the checks if we need another commit
before the hold.

And finally, we were freeing our allocated reservation struct under the
lock.  We weren't actually doing anything with the reservation struct so
we can use journal_info as the nested hold counter instead of having it
point to an allocated and freed struct.

Signed-off-by: Zach Brown <zab@versity.com>
2021-03-01 14:18:04 -08:00
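A compact sketch of the holders scheme this describes, with simplified
names and the memory barriers of the real code omitted; the actual
implementation (including the journal_info nesting and allocator checks)
is in the trans.c diff below:

#include <linux/atomic.h>
#include <linux/wait.h>

#define WRITE_FUNC_BIT	(1 << 30)
#define COUNT_MASK	(WRITE_FUNC_BIT - 1)

static atomic_t holders = ATOMIC_INIT(0);
static DECLARE_WAIT_QUEUE_HEAD(hold_wq);

/* wait_event condition: count a holder unless the writer has set its bit */
static bool inc_holders_unless_writer(void)
{
	int cur;

	do {
		cur = atomic_read(&holders);
		if (cur & WRITE_FUNC_BIT)
			return false;
	} while (atomic_cmpxchg(&holders, cur, cur + 1) != cur);

	return true;
}

static void hold(void)
{
	wait_event(hold_wq, inc_holders_unless_writer());
}

static void release(void)
{
	atomic_dec(&holders);
	wake_up(&hold_wq);
}

/* the commit path sets the bit, then waits for the holder count to drain */
static void write_func(void)
{
	atomic_add(WRITE_FUNC_BIT, &holders);
	wait_event(hold_wq, (atomic_read(&holders) & COUNT_MASK) == 0);

	/* ... write out the dirty transaction ... */

	atomic_sub(WRITE_FUNC_BIT, &holders);
	wake_up(&hold_wq);
}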
40 changed files with 805 additions and 373 deletions

View File

@@ -252,7 +252,7 @@ void scoutfs_alloc_init(struct scoutfs_alloc *alloc,
{
memset(alloc, 0, sizeof(struct scoutfs_alloc));
spin_lock_init(&alloc->lock);
seqlock_init(&alloc->seqlock);
mutex_init(&alloc->mutex);
alloc->avail = *avail;
alloc->freed = *freed;
@@ -526,7 +526,8 @@ int scoutfs_alloc_meta(struct super_block *sb, struct scoutfs_alloc *alloc,
if (ret < 0)
goto out;
spin_lock(&alloc->lock);
write_seqlock(&alloc->seqlock);
lblk = alloc->dirty_avail_bl->data;
if (WARN_ON_ONCE(lblk->nr == 0)) {
/* shouldn't happen, transaction should commit first */
@@ -536,7 +537,8 @@ int scoutfs_alloc_meta(struct super_block *sb, struct scoutfs_alloc *alloc,
list_block_remove(&alloc->avail, lblk, 1);
ret = 0;
}
spin_unlock(&alloc->lock);
write_sequnlock(&alloc->seqlock);
out:
if (ret < 0)
@@ -559,7 +561,8 @@ int scoutfs_free_meta(struct super_block *sb, struct scoutfs_alloc *alloc,
if (ret < 0)
goto out;
spin_lock(&alloc->lock);
write_seqlock(&alloc->seqlock);
lblk = alloc->dirty_freed_bl->data;
if (WARN_ON_ONCE(list_block_space(lblk->nr) == 0)) {
/* shouldn't happen, transaction should commit first */
@@ -568,7 +571,8 @@ int scoutfs_free_meta(struct super_block *sb, struct scoutfs_alloc *alloc,
list_block_add(&alloc->freed, lblk, blkno);
ret = 0;
}
spin_unlock(&alloc->lock);
write_sequnlock(&alloc->seqlock);
out:
scoutfs_inc_counter(sb, alloc_free_meta);
@@ -1066,17 +1070,23 @@ out:
/*
* Returns true if meta avail and free don't have room for the given
* number of alloctions or frees.
* number of allocations or frees. This is called at a significantly
* higher frequency than allocations as writers try to enter
* transactions. This is the only reader of the seqlock which gives
* read-mostly sampling instead of bouncing a spinlock around all the
* cores.
*/
bool scoutfs_alloc_meta_low(struct super_block *sb,
struct scoutfs_alloc *alloc, u32 nr)
{
unsigned int seq;
bool lo;
spin_lock(&alloc->lock);
lo = le32_to_cpu(alloc->avail.first_nr) < nr ||
list_block_space(alloc->freed.first_nr) < nr;
spin_unlock(&alloc->lock);
do {
seq = read_seqbegin(&alloc->seqlock);
lo = le32_to_cpu(alloc->avail.first_nr) < nr ||
list_block_space(alloc->freed.first_nr) < nr;
} while (read_seqretry(&alloc->seqlock, seq));
return lo;
}

View File

@@ -72,7 +72,8 @@
* transaction.
*/
struct scoutfs_alloc {
spinlock_t lock;
/* writers rarely modify list_head avail/freed. readers often check for _meta_alloc_low */
seqlock_t seqlock;
struct mutex mutex;
struct scoutfs_block *dirty_avail_bl;
struct scoutfs_block *dirty_freed_bl;

View File

@@ -286,10 +286,16 @@ static int block_insert(struct super_block *sb, struct block_private *bp)
WARN_ON_ONCE(atomic_read(&bp->refcount) & BLOCK_REF_INSERTED);
retry:
atomic_add(BLOCK_REF_INSERTED, &bp->refcount);
ret = rhashtable_insert_fast(&binf->ht, &bp->ht_head, block_ht_params);
ret = rhashtable_lookup_insert_fast(&binf->ht, &bp->ht_head, block_ht_params);
if (ret < 0) {
atomic_sub(BLOCK_REF_INSERTED, &bp->refcount);
if (ret == -EBUSY) {
/* wait for pending rebalance to finish */
synchronize_rcu();
goto retry;
}
} else {
atomic_inc(&binf->total_inserted);
TRACE_BLOCK(insert, bp);
@@ -396,6 +402,7 @@ static void block_remove_all(struct super_block *sb)
if (block_get_if_inserted(bp)) {
block_remove(sb, bp);
WARN_ON_ONCE(atomic_read(&bp->refcount) != 1);
block_put(sb, bp);
}
}
@@ -1073,10 +1080,11 @@ restart:
if (bp == NULL)
break;
if (bp == ERR_PTR(-EAGAIN)) {
/* hard reset to not hold rcu grace period across retries */
/* hard exit to wait for rcu rebalance to finish */
rhashtable_walk_stop(&iter);
rhashtable_walk_exit(&iter);
scoutfs_inc_counter(sb, block_cache_shrink_restart);
synchronize_rcu();
goto restart;
}

View File

@@ -49,7 +49,6 @@ struct client_info {
struct delayed_work connect_dwork;
u64 server_term;
u64 greeting_umb;
bool sending_farewell;
int farewell_error;
@@ -151,7 +150,7 @@ static int client_lock_response(struct super_block *sb,
void *resp, unsigned int resp_len,
int error, void *data)
{
if (resp_len != sizeof(struct scoutfs_net_lock_grant_response))
if (resp_len != sizeof(struct scoutfs_net_lock))
return -EINVAL;
/* XXX error? */

View File

@@ -1135,7 +1135,8 @@ static void truncate_inode_pages_extent(struct inode *inode, u64 start, u64 len)
*/
#define MOVE_DATA_EXTENTS_PER_HOLD 16
int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
u64 byte_len, struct inode *to, u64 to_off)
u64 byte_len, struct inode *to, u64 to_off, bool is_stage,
u64 data_version)
{
struct scoutfs_inode_info *from_si = SCOUTFS_I(from);
struct scoutfs_inode_info *to_si = SCOUTFS_I(to);
@@ -1145,6 +1146,7 @@ int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
struct data_ext_args from_args;
struct data_ext_args to_args;
struct scoutfs_extent ext;
struct timespec cur_time;
LIST_HEAD(locks);
bool done = false;
loff_t from_size;
@@ -1180,6 +1182,11 @@ int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
goto out;
}
if (is_stage && (data_version != SCOUTFS_I(to)->data_version)) {
ret = -ESTALE;
goto out;
}
from_iblock = from_off >> SCOUTFS_BLOCK_SM_SHIFT;
count = (byte_len + SCOUTFS_BLOCK_SM_MASK) >> SCOUTFS_BLOCK_SM_SHIFT;
to_iblock = to_off >> SCOUTFS_BLOCK_SM_SHIFT;
@@ -1202,7 +1209,7 @@ int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
/* can't stage once data_version changes */
scoutfs_inode_get_onoff(from, &junk, &from_offline);
scoutfs_inode_get_onoff(to, &junk, &to_offline);
if (from_offline || to_offline) {
if (from_offline || (to_offline && !is_stage)) {
ret = -ENODATA;
goto out;
}
@@ -1246,6 +1253,8 @@ int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
/* arbitrarily limit the number of extents per trans hold */
for (i = 0; i < MOVE_DATA_EXTENTS_PER_HOLD; i++) {
struct scoutfs_extent off_ext;
/* find the next extent to move */
ret = scoutfs_ext_next(sb, &data_ext_ops, &from_args,
from_iblock, 1, &ext);
@@ -1274,10 +1283,27 @@ int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
to_start = to_iblock + (from_start - from_iblock);
/* insert the new, fails if it overlaps */
ret = scoutfs_ext_insert(sb, &data_ext_ops, &to_args,
to_start, len,
map, ext.flags);
if (is_stage) {
ret = scoutfs_ext_next(sb, &data_ext_ops, &to_args,
to_iblock, 1, &off_ext);
if (ret)
break;
if (!scoutfs_ext_inside(to_start, len, &off_ext) ||
!(off_ext.flags & SEF_OFFLINE)) {
ret = -EINVAL;
break;
}
ret = scoutfs_ext_set(sb, &data_ext_ops, &to_args,
to_start, len,
map, ext.flags);
} else {
/* insert the new, fails if it overlaps */
ret = scoutfs_ext_insert(sb, &data_ext_ops, &to_args,
to_start, len,
map, ext.flags);
}
if (ret < 0)
break;
@@ -1285,10 +1311,18 @@ int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
ret = scoutfs_ext_set(sb, &data_ext_ops, &from_args,
from_start, len, 0, 0);
if (ret < 0) {
/* remove inserted new on err */
err = scoutfs_ext_remove(sb, &data_ext_ops,
&to_args, to_start,
len);
if (is_stage) {
/* re-mark dest range as offline */
WARN_ON_ONCE(!(off_ext.flags & SEF_OFFLINE));
err = scoutfs_ext_set(sb, &data_ext_ops, &to_args,
to_start, len,
0, off_ext.flags);
} else {
/* remove inserted new on err */
err = scoutfs_ext_remove(sb, &data_ext_ops,
&to_args, to_start,
len);
}
BUG_ON(err); /* XXX inconsistent */
break;
}
@@ -1316,12 +1350,15 @@ int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
up_write(&from_si->extent_sem);
up_write(&to_si->extent_sem);
from->i_ctime = from->i_mtime =
to->i_ctime = to->i_mtime = CURRENT_TIME;
cur_time = CURRENT_TIME;
if (!is_stage) {
to->i_ctime = to->i_mtime = cur_time;
scoutfs_inode_inc_data_version(to);
scoutfs_inode_set_data_seq(to);
}
from->i_ctime = from->i_mtime = cur_time;
scoutfs_inode_inc_data_version(from);
scoutfs_inode_inc_data_version(to);
scoutfs_inode_set_data_seq(from);
scoutfs_inode_set_data_seq(to);
scoutfs_update_inode_item(from, from_lock, &locks);
scoutfs_update_inode_item(to, to_lock, &locks);

View File

@@ -59,7 +59,8 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len);
int scoutfs_data_init_offline_extent(struct inode *inode, u64 size,
struct scoutfs_lock *lock);
int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
u64 byte_len, struct inode *to, u64 to_off);
u64 byte_len, struct inode *to, u64 to_off, bool to_stage,
u64 data_version);
int scoutfs_data_wait_check(struct inode *inode, loff_t pos, loff_t len,
u8 sef, u8 op, struct scoutfs_data_wait *ow,

View File

@@ -813,6 +813,7 @@ static int scoutfs_link(struct dentry *old_dentry,
struct scoutfs_lock *dir_lock;
struct scoutfs_lock *inode_lock = NULL;
LIST_HEAD(ind_locks);
bool del_orphan;
u64 dir_size;
u64 ind_seq;
u64 hash;
@@ -841,6 +842,8 @@ static int scoutfs_link(struct dentry *old_dentry,
goto out_unlock;
dir_size = i_size_read(dir) + dentry->d_name.len;
del_orphan = (inode->i_nlink == 0);
retry:
ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
scoutfs_inode_index_prepare(sb, &ind_locks, dir, false) ?:
@@ -855,6 +858,12 @@ retry:
if (ret)
goto out;
if (del_orphan) {
ret = scoutfs_orphan_dirty(sb, scoutfs_ino(inode));
if (ret)
goto out;
}
pos = SCOUTFS_I(dir)->next_readdir_pos++;
ret = add_entry_items(sb, scoutfs_ino(dir), hash, pos,
@@ -870,6 +879,11 @@ retry:
inode->i_ctime = dir->i_mtime;
inc_nlink(inode);
if (del_orphan) {
ret = scoutfs_orphan_delete(sb, scoutfs_ino(inode));
WARN_ON_ONCE(ret);
}
scoutfs_update_inode_item(inode, inode_lock, &ind_locks);
scoutfs_update_inode_item(dir, dir_lock, &ind_locks);
@@ -1760,6 +1774,42 @@ static int scoutfs_dir_open(struct inode *inode, struct file *file)
}
#endif
static int scoutfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
{
struct super_block *sb = dir->i_sb;
struct inode *inode = NULL;
struct scoutfs_lock *dir_lock = NULL;
struct scoutfs_lock *inode_lock = NULL;
LIST_HEAD(ind_locks);
int ret;
if (dentry->d_name.len > SCOUTFS_NAME_LEN)
return -ENAMETOOLONG;
inode = lock_hold_create(dir, dentry, mode, 0,
&dir_lock, &inode_lock, &ind_locks);
if (IS_ERR(inode))
return PTR_ERR(inode);
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
insert_inode_hash(inode);
d_tmpfile(dentry, inode);
scoutfs_update_inode_item(inode, inode_lock, &ind_locks);
scoutfs_update_inode_item(dir, dir_lock, &ind_locks);
scoutfs_inode_index_unlock(sb, &ind_locks);
ret = scoutfs_orphan_inode(inode);
WARN_ON_ONCE(ret); /* XXX returning error but items deleted */
scoutfs_release_trans(sb);
scoutfs_inode_index_unlock(sb, &ind_locks);
scoutfs_unlock(sb, dir_lock, SCOUTFS_LOCK_WRITE);
scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE);
return ret;
}
const struct file_operations scoutfs_dir_fops = {
.KC_FOP_READDIR = scoutfs_readdir,
#ifdef KC_FMODE_KABI_ITERATE
@@ -1770,7 +1820,10 @@ const struct file_operations scoutfs_dir_fops = {
.llseek = generic_file_llseek,
};
const struct inode_operations scoutfs_dir_iops = {
const struct inode_operations_wrapper scoutfs_dir_iops = {
.ops = {
.lookup = scoutfs_lookup,
.mknod = scoutfs_mknod,
.create = scoutfs_create,
@@ -1787,6 +1840,8 @@ const struct inode_operations scoutfs_dir_iops = {
.removexattr = scoutfs_removexattr,
.symlink = scoutfs_symlink,
.permission = scoutfs_permission,
},
.tmpfile = scoutfs_tmpfile,
};
void scoutfs_dir_exit(void)

View File

@@ -5,7 +5,7 @@
#include "lock.h"
extern const struct file_operations scoutfs_dir_fops;
extern const struct inode_operations scoutfs_dir_iops;
extern const struct inode_operations_wrapper scoutfs_dir_iops;
extern const struct inode_operations scoutfs_symlink_iops;
struct scoutfs_link_backref_entry {
@@ -14,7 +14,7 @@ struct scoutfs_link_backref_entry {
u64 dir_pos;
u16 name_len;
struct scoutfs_dirent dent;
/* the full name is allocated and stored in dent.name[0] */
/* the full name is allocated and stored in dent.name[] */
};
int scoutfs_dir_get_backref_path(struct super_block *sb, u64 ino, u64 dir_ino,

View File

@@ -38,7 +38,7 @@ static bool ext_overlap(struct scoutfs_extent *ext, u64 start, u64 len)
return !(e_end < start || ext->start > end);
}
static bool ext_inside(u64 start, u64 len, struct scoutfs_extent *out)
bool scoutfs_ext_inside(u64 start, u64 len, struct scoutfs_extent *out)
{
u64 in_end = start + len - 1;
u64 out_end = out->start + out->len - 1;
@@ -241,7 +241,7 @@ int scoutfs_ext_remove(struct super_block *sb, struct scoutfs_ext_ops *ops,
goto out;
/* removed extent must be entirely within found */
if (!ext_inside(start, len, &found)) {
if (!scoutfs_ext_inside(start, len, &found)) {
ret = -EINVAL;
goto out;
}
@@ -341,7 +341,7 @@ int scoutfs_ext_set(struct super_block *sb, struct scoutfs_ext_ops *ops,
if (ret == 0 && ext_overlap(&found, start, len)) {
/* set extent must be entirely within found */
if (!ext_inside(start, len, &found)) {
if (!scoutfs_ext_inside(start, len, &found)) {
ret = -EINVAL;
goto out;
}

View File

@@ -31,5 +31,6 @@ int scoutfs_ext_alloc(struct super_block *sb, struct scoutfs_ext_ops *ops,
struct scoutfs_extent *ext);
int scoutfs_ext_set(struct super_block *sb, struct scoutfs_ext_ops *ops,
void *arg, u64 start, u64 len, u64 map, u8 flags);
bool scoutfs_ext_inside(u64 start, u64 len, struct scoutfs_extent *out);
#endif

View File

@@ -276,7 +276,6 @@ int scoutfs_forest_read_items(struct super_block *sb,
scoutfs_inc_counter(sb, forest_read_items);
calc_bloom_nrs(&bloom, &lock->start);
roots = lock->roots;
retry:
ret = scoutfs_client_get_roots(sb, &roots);
if (ret)
@@ -349,15 +348,9 @@ retry:
ret = 0;
out:
if (ret == -ESTALE) {
if (memcmp(&prev_refs, &refs, sizeof(refs)) == 0) {
ret = -EIO;
goto out;
}
if (memcmp(&prev_refs, &refs, sizeof(refs)) == 0)
return -EIO;
prev_refs = refs;
ret = scoutfs_client_get_roots(sb, &roots);
if (ret)
goto out;
goto retry;
}

View File

@@ -86,11 +86,33 @@ struct scoutfs_timespec {
__u8 __pad[4];
};
/* XXX ipv6 */
struct scoutfs_inet_addr {
__le32 addr;
enum scoutfs_inet_family {
SCOUTFS_AF_NONE = 0,
SCOUTFS_AF_IPV4 = 1,
SCOUTFS_AF_IPV6 = 2,
};
struct scoutfs_inet_addr4 {
__le16 family;
__le16 port;
__u8 __pad[2];
__le32 addr;
};
/*
* Not yet supported by code.
*/
struct scoutfs_inet_addr6 {
__le16 family;
__le16 port;
__u8 addr[16];
__le32 flow_info;
__le32 scope_id;
__u8 __pad[4];
};
union scoutfs_inet_addr {
struct scoutfs_inet_addr4 v4;
struct scoutfs_inet_addr6 v6;
};
/*
@@ -237,7 +259,7 @@ struct scoutfs_btree_block {
__le16 mid_free_len;
__u8 level;
__u8 __pad[7];
struct scoutfs_btree_item items[0];
struct scoutfs_btree_item items[];
/* leaf blocks have a fixed size item offset hash table at the end */
};
@@ -285,7 +307,7 @@ struct scoutfs_alloc_list_block {
struct scoutfs_block_ref next;
__le32 start;
__le32 nr;
__le64 blknos[0]; /* naturally aligned for sorting */
__le64 blknos[]; /* naturally aligned for sorting */
};
#define SCOUTFS_ALLOC_LIST_MAX_BLOCKS \
@@ -340,7 +362,7 @@ struct scoutfs_srch_file {
struct scoutfs_srch_parent {
struct scoutfs_block_header hdr;
struct scoutfs_block_ref refs[0];
struct scoutfs_block_ref refs[];
};
#define SCOUTFS_SRCH_PARENT_REFS \
@@ -355,7 +377,7 @@ struct scoutfs_srch_block {
struct scoutfs_srch_entry tail;
__le32 entry_nr;
__le32 entry_bytes;
__u8 entries[0];
__u8 entries[];
};
/*
@@ -430,7 +452,7 @@ struct scoutfs_log_item_value {
__le64 vers;
__u8 flags;
__u8 __pad[7];
__u8 data[0];
__u8 data[];
};
/*
@@ -445,7 +467,7 @@ struct scoutfs_log_item_value {
struct scoutfs_bloom_block {
struct scoutfs_block_header hdr;
__le64 total_set;
__le64 bits[0];
__le64 bits[];
};
/*
@@ -527,7 +549,7 @@ struct scoutfs_xattr {
__le16 val_len;
__u8 name_len;
__u8 __pad[5];
__u8 name[0];
__u8 name[];
};
@@ -591,7 +613,7 @@ struct scoutfs_quorum_message {
struct scoutfs_quorum_config {
__le64 version;
struct scoutfs_quorum_slot {
struct scoutfs_inet_addr addr;
union scoutfs_inet_addr addr;
} slots[SCOUTFS_QUORUM_MAX_SLOTS];
};
@@ -707,7 +729,7 @@ struct scoutfs_dirent {
__le64 pos;
__u8 type;
__u8 __pad[7];
__u8 name[0];
__u8 name[];
};
#define SCOUTFS_NAME_LEN 255
@@ -805,7 +827,7 @@ struct scoutfs_net_header {
__u8 flags;
__u8 error;
__u8 __pad[3];
__u8 data[0];
__u8 data[];
};
#define SCOUTFS_NET_FLAG_RESPONSE (1 << 0)
@@ -873,15 +895,10 @@ struct scoutfs_net_lock {
__u8 __pad[6];
};
struct scoutfs_net_lock_grant_response {
struct scoutfs_net_lock nl;
struct scoutfs_net_roots roots;
};
struct scoutfs_net_lock_recover {
__le16 nr;
__u8 __pad[6];
struct scoutfs_net_lock locks[0];
struct scoutfs_net_lock locks[];
};
#define SCOUTFS_NET_LOCK_MAX_RECOVER_NR \

View File

@@ -182,7 +182,8 @@ static void set_inode_ops(struct inode *inode)
inode->i_fop = &scoutfs_file_fops;
break;
case S_IFDIR:
inode->i_op = &scoutfs_dir_iops;
inode->i_op = &scoutfs_dir_iops.ops;
inode->i_flags |= S_IOPS_WRAPPER;
inode->i_fop = &scoutfs_dir_fops;
break;
case S_IFLNK:
@@ -1417,7 +1418,18 @@ static void init_orphan_key(struct scoutfs_key *key, u64 rid, u64 ino)
};
}
static int remove_orphan_item(struct super_block *sb, u64 ino)
int scoutfs_orphan_dirty(struct super_block *sb, u64 ino)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_lock *lock = sbi->rid_lock;
struct scoutfs_key key;
init_orphan_key(&key, sbi->rid, ino);
return scoutfs_item_dirty(sb, &key, lock);
}
int scoutfs_orphan_delete(struct super_block *sb, u64 ino)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_lock *lock = sbi->rid_lock;
@@ -1516,7 +1528,7 @@ retry:
if (ret)
goto out;
ret = remove_orphan_item(sb, ino);
ret = scoutfs_orphan_delete(sb, ino);
out:
if (release)
scoutfs_release_trans(sb);
@@ -1621,19 +1633,28 @@ int scoutfs_orphan_inode(struct inode *inode)
}
/*
* Track an inode that could have dirty pages. Used to kick off writeback
* on all dirty pages during transaction commit without tying ourselves in
* knots trying to call through the high level vfs sync methods.
* Track an inode that could have dirty pages. Used to kick off
* writeback on all dirty pages during transaction commit without tying
* ourselves in knots trying to call through the high level vfs sync
* methods.
*
* This is called by writers who hold the inode and transaction. The
* inode's presence in the rbtree is removed by destroy_inode, prevented
* by the inode hold, and by committing the transaction, which is
* prevented by holding the transaction. The inode can only go from
* empty to on the rbtree while we're here.
*/
void scoutfs_inode_queue_writeback(struct inode *inode)
{
DECLARE_INODE_SB_INFO(inode->i_sb, inf);
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
spin_lock(&inf->writeback_lock);
if (RB_EMPTY_NODE(&si->writeback_node))
insert_writeback_inode(inf, si);
spin_unlock(&inf->writeback_lock);
if (RB_EMPTY_NODE(&si->writeback_node)) {
spin_lock(&inf->writeback_lock);
if (RB_EMPTY_NODE(&si->writeback_node))
insert_writeback_inode(inf, si);
spin_unlock(&inf->writeback_lock);
}
}
/*

View File

@@ -114,6 +114,8 @@ int scoutfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
int scoutfs_setattr(struct dentry *dentry, struct iattr *attr);
int scoutfs_scan_orphans(struct super_block *sb);
int scoutfs_orphan_dirty(struct super_block *sb, u64 ino);
int scoutfs_orphan_delete(struct super_block *sb, u64 ino);
void scoutfs_inode_queue_writeback(struct inode *inode);
int scoutfs_inode_walk_writeback(struct super_block *sb, bool write);

View File

@@ -972,12 +972,18 @@ static long scoutfs_ioc_move_blocks(struct file *file, unsigned long arg)
goto out;
}
if (mb.flags & SCOUTFS_IOC_MB_UNKNOWN) {
ret = -EINVAL;
goto out;
}
ret = mnt_want_write_file(file);
if (ret < 0)
goto out;
ret = scoutfs_data_move_blocks(from, mb.from_off, mb.len,
to, mb.to_off);
to, mb.to_off, !!(mb.flags & SCOUTFS_IOC_MB_STAGE),
mb.data_version);
mnt_drop_write_file(file);
out:
fput(from_file);

View File

@@ -163,7 +163,7 @@ struct scoutfs_ioctl_ino_path_result {
__u64 dir_pos;
__u16 path_bytes;
__u8 _pad[6];
__u8 path[0];
__u8 path[];
};
/* Get a single path from the root to the given inode number */
@@ -259,7 +259,7 @@ struct scoutfs_ioctl_data_waiting {
__u8 _pad[6];
};
#define SCOUTFS_IOC_DATA_WAITING_FLAGS_UNKNOWN (U8_MAX << 0)
#define SCOUTFS_IOC_DATA_WAITING_FLAGS_UNKNOWN (U64_MAX << 0)
#define SCOUTFS_IOC_DATA_WAITING _IOR(SCOUTFS_IOCTL_MAGIC, 6, \
struct scoutfs_ioctl_data_waiting)
@@ -279,7 +279,7 @@ struct scoutfs_ioctl_setattr_more {
};
#define SCOUTFS_IOC_SETATTR_MORE_OFFLINE (1 << 0)
#define SCOUTFS_IOC_SETATTR_MORE_UNKNOWN (U8_MAX << 1)
#define SCOUTFS_IOC_SETATTR_MORE_UNKNOWN (U64_MAX << 1)
#define SCOUTFS_IOC_SETATTR_MORE _IOW(SCOUTFS_IOCTL_MAGIC, 7, \
struct scoutfs_ioctl_setattr_more)
@@ -418,12 +418,13 @@ struct scoutfs_ioctl_alloc_detail_entry {
* on the same file system.
*
* from_fd specifies the source file and the ioctl is called on the
* destination file. Both files must have write access. from_off
* specifies the byte offset in the source, to_off is the byte offset in
* the destination, and len is the number of bytes in the region to
* move. All of the offsets and lengths must be in multiples of 4KB,
* except in the case where the from_off + len ends at the i_size of the
* source file.
* destination file. Both files must have write access. from_off specifies
* the byte offset in the source, to_off is the byte offset in the
* destination, and len is the number of bytes in the region to move. All of
* the offsets and lengths must be in multiples of 4KB, except in the case
* where the from_off + len ends at the i_size of the source
* file. data_version is only used when STAGE flag is set (see below). flags
* field is currently only used to optionally specify STAGE behavior.
*
* This interface only moves extents which are block granular, it does
* not perform RMW of sub-block byte extents and it does not overwrite
@@ -435,30 +436,41 @@ struct scoutfs_ioctl_alloc_detail_entry {
* i_size. The i_size update will maintain final partial blocks in the
* source.
*
* It will return an error if either of the files have offline extents.
* It will return 0 when all of the extents in the source region have
* been moved to the destination. Moving extents updates the ctime,
* mtime, meta_seq, data_seq, and data_version fields of both the source
* and destination inodes. If an error is returned then partial
* If STAGE flag is not set, it will return an error if either of the files
* have offline extents. It will return 0 when all of the extents in the
* source region have been moved to the destination. Moving extents updates
* the ctime, mtime, meta_seq, data_seq, and data_version fields of both the
* source and destination inodes. If an error is returned then partial
* progress may have been made and inode fields may have been updated.
*
* If STAGE flag is set, as above except destination range must be in an
* offline extent. Fields are updated only for source inode.
*
* Errors specific to this interface include:
*
* EINVAL: from_off, len, or to_off aren't a multiple of 4KB; the source
* and destination files are the same inode; either the source or
* destination is not a regular file; the destination file has
* an existing overlapping extent.
* an existing overlapping extent (if STAGE flag not set); the
* destination range is not in an offline extent (if STAGE set).
* EOVERFLOW: either from_off + len or to_off + len exceeded 64bits.
* EBADF: from_fd isn't a valid open file descriptor.
* EXDEV: the source and destination files are in different filesystems.
* EISDIR: either the source or destination is a directory.
* ENODATA: either the source or destination file have offline extents.
* ENODATA: either the source or destination file have offline extents and
* STAGE flag is not set.
* ESTALE: data_version does not match destination data_version.
*/
#define SCOUTFS_IOC_MB_STAGE (1 << 0)
#define SCOUTFS_IOC_MB_UNKNOWN (U64_MAX << 1)
struct scoutfs_ioctl_move_blocks {
__u64 from_fd;
__u64 from_off;
__u64 len;
__u64 to_off;
__u64 data_version;
__u64 flags;
};
#define SCOUTFS_IOC_MOVE_BLOCKS _IOR(SCOUTFS_IOCTL_MAGIC, 13, \

View File

@@ -638,7 +638,6 @@ static void lock_grant_worker(struct work_struct *work)
struct lock_info *linfo = container_of(work, struct lock_info,
grant_work);
struct super_block *sb = linfo->sb;
struct scoutfs_net_lock_grant_response *gr;
struct scoutfs_net_lock *nl;
struct scoutfs_lock *lock;
struct scoutfs_lock *tmp;
@@ -648,8 +647,7 @@ static void lock_grant_worker(struct work_struct *work)
spin_lock(&linfo->lock);
list_for_each_entry_safe(lock, tmp, &linfo->grant_list, grant_head) {
gr = &lock->grant_resp;
nl = &lock->grant_resp.nl;
nl = &lock->grant_nl;
/* wait for reordered invalidation to finish */
if (lock->mode != nl->old_mode)
@@ -667,7 +665,6 @@ static void lock_grant_worker(struct work_struct *work)
lock->request_pending = 0;
lock->mode = nl->new_mode;
lock->write_version = le64_to_cpu(nl->write_version);
lock->roots = gr->roots;
if (lock_count_match_exists(nl->new_mode, lock->waiters))
extend_grace(sb, lock);
@@ -689,9 +686,8 @@ static void lock_grant_worker(struct work_struct *work)
* work to process.
*/
int scoutfs_lock_grant_response(struct super_block *sb,
struct scoutfs_net_lock_grant_response *gr)
struct scoutfs_net_lock *nl)
{
struct scoutfs_net_lock *nl = &gr->nl;
DECLARE_LOCK_INFO(sb, linfo);
struct scoutfs_lock *lock;
@@ -705,7 +701,7 @@ int scoutfs_lock_grant_response(struct super_block *sb,
trace_scoutfs_lock_grant_response(sb, lock);
BUG_ON(!lock->request_pending);
lock->grant_resp = *gr;
lock->grant_nl = *nl;
list_add_tail(&lock->grant_head, &linfo->grant_list);
queue_grant_work(linfo);

View File

@@ -23,7 +23,6 @@ struct scoutfs_lock {
u64 refresh_gen;
u64 write_version;
u64 dirty_trans_seq;
struct scoutfs_net_roots roots;
struct list_head lru_head;
wait_queue_head_t waitq;
ktime_t grace_deadline;
@@ -31,7 +30,7 @@ struct scoutfs_lock {
invalidate_pending:1;
struct list_head grant_head;
struct scoutfs_net_lock_grant_response grant_resp;
struct scoutfs_net_lock grant_nl;
struct list_head inv_head;
struct scoutfs_net_lock inv_nl;
u64 inv_net_id;
@@ -57,7 +56,7 @@ struct scoutfs_lock_coverage {
};
int scoutfs_lock_grant_response(struct super_block *sb,
struct scoutfs_net_lock_grant_response *gr);
struct scoutfs_net_lock *nl);
int scoutfs_lock_invalidate_request(struct super_block *sb, u64 net_id,
struct scoutfs_net_lock *nl);
int scoutfs_lock_recover_request(struct super_block *sb, u64 net_id,

View File

@@ -484,7 +484,6 @@ static int process_waiting_requests(struct super_block *sb,
struct server_lock_node *snode)
{
DECLARE_LOCK_SERVER_INFO(sb, inf);
struct scoutfs_net_lock_grant_response gres;
struct scoutfs_net_lock nl;
struct client_lock_entry *req;
struct client_lock_entry *req_tmp;
@@ -547,11 +546,8 @@ static int process_waiting_requests(struct super_block *sb,
nl.write_version = cpu_to_le64(wv);
}
gres.nl = nl;
scoutfs_server_get_roots(sb, &gres.roots);
ret = scoutfs_server_lock_response(sb, req->rid,
req->net_id, &gres);
req->net_id, &nl);
if (ret)
goto out;
@@ -586,7 +582,9 @@ static void init_lock_clients_key(struct scoutfs_key *key, u64 rid)
* the client had already talked to the server then we must find an
* existing record for it and should begin recovery. If it doesn't have
* a record then its timed out and we can't allow it to reconnect. If
* its connecting for the first time then we insert a new record. If
* we're creating a new record for a client we can see EEXIST if the
* greeting is resent to a new server after the record was committed but
* before the response was received by the client.
*
* This is running in concurrent client greeting processing contexts.
*/
@@ -611,6 +609,8 @@ int scoutfs_lock_server_greeting(struct super_block *sb, u64 rid,
ret = scoutfs_btree_insert(sb, inf->alloc, inf->wri,
&super->lock_clients,
&key, NULL, 0);
if (ret == -EEXIST)
ret = 0;
}
mutex_unlock(&inf->mutex);

View File

@@ -944,7 +944,6 @@ static void scoutfs_net_listen_worker(struct work_struct *work)
struct scoutfs_net_connection *acc_conn;
DECLARE_WAIT_QUEUE_HEAD(waitq);
struct socket *acc_sock;
LIST_HEAD(conn_list);
int ret;
trace_scoutfs_net_listen_work_enter(sb, 0, 0);
@@ -1546,9 +1545,8 @@ void scoutfs_net_client_greeting(struct super_block *sb,
* response and they can disconnect cleanly.
*
* At this point our connection is idle except for send submissions and
* shutdown being queued. Once we shut down a We completely own a We
* have exclusive access to a previous conn once its shutdown and we set
* _freeing.
* shutdown being queued. We have exclusive access to the previous conn
* once it's shutdown and we set _freeing.
*/
void scoutfs_net_server_greeting(struct super_block *sb,
struct scoutfs_net_connection *conn,

View File

@@ -90,19 +90,13 @@ enum conn_flags {
#define SIN_ARG(sin) sin, be16_to_cpu((sin)->sin_port)
static inline void scoutfs_addr_to_sin(struct sockaddr_in *sin,
struct scoutfs_inet_addr *addr)
union scoutfs_inet_addr *addr)
{
sin->sin_family = AF_INET;
sin->sin_addr.s_addr = cpu_to_be32(le32_to_cpu(addr->addr));
sin->sin_port = cpu_to_be16(le16_to_cpu(addr->port));
}
BUG_ON(addr->v4.family != cpu_to_le16(SCOUTFS_AF_IPV4));
static inline void scoutfs_addr_from_sin(struct scoutfs_inet_addr *addr,
struct sockaddr_in *sin)
{
addr->addr = be32_to_le32(sin->sin_addr.s_addr);
addr->port = be16_to_le16(sin->sin_port);
memset(addr->__pad, 0, sizeof(addr->__pad));
sin->sin_family = AF_INET;
sin->sin_addr.s_addr = cpu_to_be32(le32_to_cpu(addr->v4.addr));
sin->sin_port = cpu_to_be16(le16_to_cpu(addr->v4.port));
}
struct scoutfs_net_connection *

View File

@@ -138,7 +138,7 @@ static bool quorum_slot_present(struct scoutfs_super_block *super, int i)
{
BUG_ON(i < 0 || i > SCOUTFS_QUORUM_MAX_SLOTS);
return super->qconf.slots[i].addr.addr != 0;
return super->qconf.slots[i].addr.v4.family == cpu_to_le16(SCOUTFS_AF_IPV4);
}
static ktime_t election_timeout(void)
@@ -976,6 +976,9 @@ static int verify_quorum_slots(struct super_block *sb)
}
for (j = i + 1; j < SCOUTFS_QUORUM_MAX_SLOTS; j++) {
if (!quorum_slot_present(super, j))
continue;
scoutfs_quorum_slot_sin(super, j, &other);
if (sin.sin_addr.s_addr == other.sin_addr.s_addr &&

View File

@@ -423,61 +423,34 @@ TRACE_EVENT(scoutfs_trans_write_func,
TP_printk(SCSBF" dirty %lu", SCSB_TRACE_ARGS, __entry->dirty)
);
TRACE_EVENT(scoutfs_release_trans,
TP_PROTO(struct super_block *sb, void *rsv, unsigned int rsv_holders,
unsigned int tri_holders,
unsigned int tri_writing),
DECLARE_EVENT_CLASS(scoutfs_trans_hold_release_class,
TP_PROTO(struct super_block *sb, void *journal_info, int holders),
TP_ARGS(sb, rsv, rsv_holders, tri_holders, tri_writing),
TP_ARGS(sb, journal_info, holders),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(void *, rsv)
__field(unsigned int, rsv_holders)
__field(unsigned int, tri_holders)
__field(unsigned int, tri_writing)
__field(unsigned long, journal_info)
__field(int, holders)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->rsv = rsv;
__entry->rsv_holders = rsv_holders;
__entry->tri_holders = tri_holders;
__entry->tri_writing = tri_writing;
__entry->journal_info = (unsigned long)journal_info;
__entry->holders = holders;
),
TP_printk(SCSBF" rsv %p holders %u trans holders %u writing %u",
SCSB_TRACE_ARGS, __entry->rsv, __entry->rsv_holders,
__entry->tri_holders, __entry->tri_writing)
TP_printk(SCSBF" journal_info 0x%0lx holders %d",
SCSB_TRACE_ARGS, __entry->journal_info, __entry->holders)
);
TRACE_EVENT(scoutfs_trans_acquired_hold,
TP_PROTO(struct super_block *sb,
void *rsv, unsigned int rsv_holders,
unsigned int tri_holders,
unsigned int tri_writing),
TP_ARGS(sb, rsv, rsv_holders, tri_holders, tri_writing),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(void *, rsv)
__field(unsigned int, rsv_holders)
__field(unsigned int, tri_holders)
__field(unsigned int, tri_writing)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->rsv = rsv;
__entry->rsv_holders = rsv_holders;
__entry->tri_holders = tri_holders;
__entry->tri_writing = tri_writing;
),
TP_printk(SCSBF" rsv %p holders %u trans holders %u writing %u",
SCSB_TRACE_ARGS, __entry->rsv, __entry->rsv_holders,
__entry->tri_holders, __entry->tri_writing)
DEFINE_EVENT(scoutfs_trans_hold_release_class, scoutfs_trans_acquired_hold,
TP_PROTO(struct super_block *sb, void *journal_info, int holders),
TP_ARGS(sb, journal_info, holders)
);
DEFINE_EVENT(scoutfs_trans_hold_release_class, scoutfs_release_trans,
TP_PROTO(struct super_block *sb, void *journal_info, int holders),
TP_ARGS(sb, journal_info, holders)
);
TRACE_EVENT(scoutfs_ioc_release,

View File

@@ -182,7 +182,7 @@ int scoutfs_server_apply_commit(struct super_block *sb, int err)
return err;
}
void scoutfs_server_get_roots(struct super_block *sb,
static void get_roots(struct super_block *sb,
struct scoutfs_net_roots *roots)
{
DECLARE_SERVER_INFO(sb, server);
@@ -556,7 +556,7 @@ static int server_get_roots(struct super_block *sb,
memset(&roots, 0, sizeof(roots));
ret = -EINVAL;
} else {
scoutfs_server_get_roots(sb, &roots);
get_roots(sb, &roots);
ret = 0;
}
@@ -862,13 +862,13 @@ int scoutfs_server_lock_request(struct super_block *sb, u64 rid,
}
int scoutfs_server_lock_response(struct super_block *sb, u64 rid, u64 id,
struct scoutfs_net_lock_grant_response *gr)
struct scoutfs_net_lock *nl)
{
struct server_info *server = SCOUTFS_SB(sb)->server_info;
return scoutfs_net_response_node(sb, server->conn, rid,
SCOUTFS_NET_CMD_LOCK, id, 0,
gr, sizeof(*gr));
nl, sizeof(*nl));
}
static bool invalid_recover(struct scoutfs_net_lock_recover *nlr,
@@ -1024,6 +1024,12 @@ static void init_mounted_client_key(struct scoutfs_key *key, u64 rid)
};
}
/*
* Insert a new mounted client item for a client that is sending us a
* greeting that hasn't yet seen a response. The greeting can be
* retransmitted to a new server after the previous inserted the item so
* it's acceptable to see -EEXIST.
*/
static int insert_mounted_client(struct super_block *sb, u64 rid,
u64 gr_flags)
{
@@ -1042,6 +1048,8 @@ static int insert_mounted_client(struct super_block *sb, u64 rid,
ret = scoutfs_btree_insert(sb, &server->alloc, &server->wri,
&super->mounted_clients, &key, &mcv,
sizeof(mcv));
if (ret == -EEXIST)
ret = 0;
mutex_unlock(&server->mounted_clients_mutex);
return ret;
@@ -1543,7 +1551,6 @@ static void scoutfs_server_worker(struct work_struct *work)
struct scoutfs_net_connection *conn = NULL;
DECLARE_WAIT_QUEUE_HEAD(waitq);
struct sockaddr_in sin;
LIST_HEAD(conn_list);
u64 max_vers;
int ret;

View File

@@ -59,11 +59,9 @@ do { \
int scoutfs_server_lock_request(struct super_block *sb, u64 rid,
struct scoutfs_net_lock *nl);
int scoutfs_server_lock_response(struct super_block *sb, u64 rid, u64 id,
struct scoutfs_net_lock_grant_response *gr);
struct scoutfs_net_lock *nl);
int scoutfs_server_lock_recover_request(struct super_block *sb, u64 rid,
struct scoutfs_key *key);
void scoutfs_server_get_roots(struct super_block *sb,
struct scoutfs_net_roots *roots);
int scoutfs_server_hold_commit(struct super_block *sb);
int scoutfs_server_apply_commit(struct super_block *sb, int err);

View File

@@ -2156,7 +2156,8 @@ static void scoutfs_srch_compact_worker(struct work_struct *work)
if (ret < 0)
goto commit;
ret = scoutfs_block_writer_write(sb, &wri);
ret = scoutfs_alloc_prepare_commit(sb, &alloc, &wri) ?:
scoutfs_block_writer_write(sb, &wri);
commit:
/* the server won't use our partial compact if _ERROR is set */
sc->meta_avail = alloc.avail;

View File

@@ -39,17 +39,15 @@
* track the relationships between dirty blocks so there's only ever one
* transaction being built.
*
* The copy of the on-disk super block in the fs sb info has its header
* sequence advanced so that new dirty blocks inherit this dirty
* sequence number. It's only advanced once all those dirty blocks are
* reachable after having first written them all out and then the new
* super with that seq. It's first incremented at mount.
* Committing the current dirty transaction can be triggered by sync, a
* regular background commit interval, reaching a dirty block threshold,
* or the transaction running out of its private allocator resources.
* Once all the current holders release the writing func writes out the
* dirty blocks while excluding holders until it finishes.
*
* Unfortunately writers can nest. We don't bother trying to special
* case holding a transaction that you're already holding because that
* requires per-task storage. We just let anyone hold transactions
* regardless of waiters waiting to write, which risks waiters waiting a
* very long time.
* Unfortunately writing holders can nest. We track nested hold callers
* with the per-task journal_info pointer to avoid deadlocks between
* holders that might otherwise wait for a pending commit.
*/
/* sync dirty data at least this often */
@@ -59,9 +57,7 @@
* XXX move the rest of the super trans_ fields here.
*/
struct trans_info {
spinlock_t lock;
unsigned holders;
bool writing;
atomic_t holders;
struct scoutfs_log_trees lt;
struct scoutfs_alloc alloc;
@@ -71,17 +67,9 @@ struct trans_info {
#define DECLARE_TRANS_INFO(sb, name) \
struct trans_info *name = SCOUTFS_SB(sb)->trans_info
static bool drained_holders(struct trans_info *tri)
{
bool drained;
spin_lock(&tri->lock);
tri->writing = true;
drained = tri->holders == 0;
spin_unlock(&tri->lock);
return drained;
}
/* avoid the high sign bit out of an abundance of caution*/
#define TRANS_HOLDERS_WRITE_FUNC_BIT (1 << 30)
#define TRANS_HOLDERS_COUNT_MASK (TRANS_HOLDERS_WRITE_FUNC_BIT - 1)
static int commit_btrees(struct super_block *sb)
{
@@ -126,6 +114,36 @@ bool scoutfs_trans_has_dirty(struct super_block *sb)
return scoutfs_block_writer_has_dirty(sb, &tri->wri);
}
/*
* This is racing with wait_event conditions, make sure our atomic
* stores and waitqueue loads are ordered.
*/
static void sub_holders_and_wake(struct super_block *sb, int val)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
DECLARE_TRANS_INFO(sb, tri);
atomic_sub(val, &tri->holders);
smp_mb(); /* make sure sub is visible before we wake */
if (waitqueue_active(&sbi->trans_hold_wq))
wake_up(&sbi->trans_hold_wq);
}
/*
* called as a wait_event condition, needs to be careful to not change
* task state and is racing with waking paths that sub_return, test, and
* wake.
*/
static bool drained_holders(struct trans_info *tri)
{
int holders;
smp_mb(); /* make sure task in wait_event queue before atomic read */
holders = atomic_read(&tri->holders) & TRANS_HOLDERS_COUNT_MASK;
return holders == 0;
}
/*
* This work func is responsible for writing out all the dirty blocks
* that make up the current dirty transaction. It prevents writers from
@@ -162,6 +180,9 @@ void scoutfs_trans_write_func(struct work_struct *work)
sbi->trans_task = current;
/* mark that we're writing so holders wait for us to finish and clear our bit */
atomic_add(TRANS_HOLDERS_WRITE_FUNC_BIT, &tri->holders);
wait_event(sbi->trans_hold_wq, drained_holders(tri));
trace_scoutfs_trans_write_func(sb,
@@ -213,11 +234,8 @@ out:
spin_unlock(&sbi->trans_write_lock);
wake_up(&sbi->trans_write_wq);
spin_lock(&tri->lock);
tri->writing = false;
spin_unlock(&tri->lock);
wake_up(&sbi->trans_hold_wq);
/* we're done, wake waiting holders */
sub_holders_and_wake(sb, TRANS_HOLDERS_WRITE_FUNC_BIT);
sbi->trans_task = NULL;
@@ -309,53 +327,83 @@ void scoutfs_trans_restart_sync_deadline(struct super_block *sb)
}
/*
* Each thread reserves space in the segment for their dirty items while
* they hold the transaction. This is calculated before the first
* transaction hold is acquired. It includes all the potential nested
* item manipulation that could happen with the transaction held.
* Including nested holds avoids having to deal with writing out partial
* transactions while a caller still holds the transaction.
* We store nested holders in the lower bits of journal_info. We use
* some higher bits as a magic value to detect if something goes
* horribly wrong and it gets clobbered.
*/
#define TRANS_JI_MAGIC 0xd5700000
#define TRANS_JI_MAGIC_MASK 0xfff00000
#define TRANS_JI_COUNT_MASK 0x000fffff
#define SCOUTFS_RESERVATION_MAGIC 0xd57cd13b
struct scoutfs_reservation {
unsigned magic;
unsigned holders;
};
/* returns true if a caller already had a holder counted in journal_info */
static bool inc_journal_info_holders(void)
{
unsigned long holders = (unsigned long)current->journal_info;
WARN_ON_ONCE(holders != 0 && ((holders & TRANS_JI_MAGIC_MASK) != TRANS_JI_MAGIC));
if (holders == 0)
holders = TRANS_JI_MAGIC;
holders++;
current->journal_info = (void *)holders;
return (holders > (TRANS_JI_MAGIC | 1));
}
static void dec_journal_info_holders(void)
{
unsigned long holders = (unsigned long)current->journal_info;
WARN_ON_ONCE(holders != 0 && ((holders & TRANS_JI_MAGIC_MASK) != TRANS_JI_MAGIC));
WARN_ON_ONCE((holders & TRANS_JI_COUNT_MASK) == 0);
holders--;
if (holders == TRANS_JI_MAGIC)
holders = 0;
current->journal_info = (void *)holders;
}
/*
* Try to hold the transaction. If a caller already holds the trans then
* we piggy back on their hold. We wait if the writer is trying to
* write out the transation. And if our items won't fit then we kick off
* a write.
* This is called as the wait_event condition for holding a transaction.
* Increment the holder count unless the writer is present. We return
* false to wait until the writer finishes and wakes us.
*
* This is called as a condition for wait_event. It is very limited in
* the locking (blocking) it can do because the caller has set the task
* state before testing the condition safely race with waking after
* setting the condition. Our checking the amount of dirty metadata
* blocks and free data blocks is racy, but we don't mind the risk of
* delaying or prematurely forcing commits.
* This can be racing with itself while there's no waiters. We retry
* the cmpxchg instead of returning and waiting.
*/
static bool acquired_hold(struct super_block *sb,
struct scoutfs_reservation *rsv)
static bool inc_holders_unless_writer(struct trans_info *tri)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
DECLARE_TRANS_INFO(sb, tri);
bool acquired = false;
int holders;
spin_lock(&tri->lock);
do {
smp_mb(); /* make sure we read after wait puts task in queue */
holders = atomic_read(&tri->holders);
if (holders & TRANS_HOLDERS_WRITE_FUNC_BIT)
return false;
trace_scoutfs_trans_acquired_hold(sb, rsv, rsv->holders,
tri->holders, tri->writing);
} while (atomic_cmpxchg(&tri->holders, holders, holders + 1) != holders);
/* use a caller's existing reservation */
if (rsv->holders)
goto hold;
return true;
}
/* wait until the writing thread is finished */
if (tri->writing)
goto out;
/*
* As we drop the last trans holder we try to wake a writing thread that
* was waiting for us to finish.
*/
static void release_holders(struct super_block *sb)
{
dec_journal_info_holders();
sub_holders_and_wake(sb, 1);
}
/*
* The caller has incremented holders so it is blocking commits. We
* make some quick checks to see if we need to trigger and wait for
* another commit before proceeding.
*/
static bool commit_before_hold(struct super_block *sb, struct trans_info *tri)
{
/*
* In theory each dirty item page could be straddling two full
* blocks, requiring 4 allocations for each item cache page.
@@ -365,11 +413,9 @@ static bool acquired_hold(struct super_block *sb,
* that it accounts for having to dirty parent blocks and
* whatever dirtying is done during the transaction hold.
*/
if (scoutfs_alloc_meta_low(sb, &tri->alloc,
scoutfs_item_dirty_pages(sb) * 2)) {
if (scoutfs_alloc_meta_low(sb, &tri->alloc, scoutfs_item_dirty_pages(sb) * 2)) {
scoutfs_inc_counter(sb, trans_commit_dirty_meta_full);
queue_trans_work(sbi);
goto out;
return true;
}
/*
@@ -381,57 +427,74 @@ static bool acquired_hold(struct super_block *sb,
*/
if (scoutfs_alloc_meta_low(sb, &tri->alloc, 16)) {
scoutfs_inc_counter(sb, trans_commit_meta_alloc_low);
queue_trans_work(sbi);
goto out;
return true;
}
/* Try to refill data allocator before premature enospc */
if (scoutfs_data_alloc_free_bytes(sb) <= SCOUTFS_TRANS_DATA_ALLOC_LWM) {
scoutfs_inc_counter(sb, trans_commit_data_alloc_low);
queue_trans_work(sbi);
return true;
}
return false;
}
static bool acquired_hold(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
DECLARE_TRANS_INFO(sb, tri);
bool acquired;
/* if a caller already has a hold we acquire unconditionally */
if (inc_journal_info_holders()) {
atomic_inc(&tri->holders);
acquired = true;
goto out;
}
hold:
rsv->holders++;
tri->holders++;
/* wait if the writer is blocking holds */
if (!inc_holders_unless_writer(tri)) {
dec_journal_info_holders();
acquired = false;
goto out;
}
/* wait if we're triggering another commit */
if (commit_before_hold(sb, tri)) {
release_holders(sb);
queue_trans_work(sbi);
acquired = false;
goto out;
}
trace_scoutfs_trans_acquired_hold(sb, current->journal_info, atomic_read(&tri->holders));
acquired = true;
out:
spin_unlock(&tri->lock);
return acquired;
}
/*
* Try to hold the transaction. Holding the transaction prevents it
* from being committed. If a transaction is currently being written
* then we'll block until it's done and our hold can be granted.
*
* If a caller already holds the trans then we unconditionally acquire
* our hold and return to avoid deadlocks with our caller, the writing
* thread, and us. We record holds nested up the call stack with the
* journal_info pointer in the task_struct.
*
* The writing thread marks itself as a global trans_task which
* short-circuits all the hold machinery so it can call code that would
* otherwise try to hold transactions while it is writing.
*/
int scoutfs_hold_trans(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_reservation *rsv;
int ret;
if (current == sbi->trans_task)
return 0;
rsv = current->journal_info;
if (rsv == NULL) {
rsv = kzalloc(sizeof(struct scoutfs_reservation), GFP_NOFS);
if (!rsv)
return -ENOMEM;
rsv->magic = SCOUTFS_RESERVATION_MAGIC;
current->journal_info = rsv;
}
BUG_ON(rsv->magic != SCOUTFS_RESERVATION_MAGIC);
ret = wait_event_interruptible(sbi->trans_hold_wq,
acquired_hold(sb, rsv));
if (ret && rsv->holders == 0) {
current->journal_info = NULL;
kfree(rsv);
}
return ret;
return wait_event_interruptible(sbi->trans_hold_wq, acquired_hold(sb));
}
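The nested-hold bookkeeping lives in current->journal_info, but the increment helper and the mask definitions aren't part of these hunks. A minimal sketch of one encoding that is consistent with dec_journal_info_holders() and scoutfs_trans_held() above, with illustrative constant values that are assumptions rather than the source's:

	/* hypothetical packing: magic in the high bits, hold count in the low bits */
	#define TRANS_JI_COUNT_BITS	16
	#define TRANS_JI_COUNT_MASK	((1UL << TRANS_JI_COUNT_BITS) - 1)
	#define TRANS_JI_MAGIC		(0x7453UL << TRANS_JI_COUNT_BITS)
	#define TRANS_JI_MAGIC_MASK	(~TRANS_JI_COUNT_MASK)

	/* returns true if the task already held the trans before this call */
	static bool inc_journal_info_holders(void)
	{
		unsigned long holders = (unsigned long)current->journal_info;
		bool nested = (holders & TRANS_JI_MAGIC_MASK) == TRANS_JI_MAGIC;

		if (!nested)
			holders = TRANS_JI_MAGIC;
		holders++;
		current->journal_info = (void *)holders;

		return nested;
	}

With this packing the first hold stores TRANS_JI_MAGIC | 1, nested holds bump the low count, and dec_journal_info_holders() clears journal_info back to NULL when the count drops to zero.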
/*
@@ -441,50 +504,22 @@ int scoutfs_hold_trans(struct super_block *sb)
*/
bool scoutfs_trans_held(void)
{
struct scoutfs_reservation *rsv = current->journal_info;
unsigned long holders = (unsigned long)current->journal_info;
return rsv && rsv->magic == SCOUTFS_RESERVATION_MAGIC;
return (holders != 0 && ((holders & TRANS_JI_MAGIC_MASK) == TRANS_JI_MAGIC));
}
/*
* As we drop the last hold in the reservation we try and wake other
* hold attempts that were waiting for space. As we drop the last trans
* holder we try to wake a writing thread that was waiting for us to
* finish.
*/
void scoutfs_release_trans(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_reservation *rsv;
DECLARE_TRANS_INFO(sb, tri);
bool wake = false;
if (current == sbi->trans_task)
return;
rsv = current->journal_info;
BUG_ON(!rsv || rsv->magic != SCOUTFS_RESERVATION_MAGIC);
release_holders(sb);
spin_lock(&tri->lock);
trace_scoutfs_release_trans(sb, rsv, rsv->holders, tri->holders, tri->writing);
BUG_ON(rsv->holders <= 0);
BUG_ON(tri->holders <= 0);
if (--rsv->holders == 0) {
current->journal_info = NULL;
kfree(rsv);
wake = true;
}
if (--tri->holders == 0)
wake = true;
spin_unlock(&tri->lock);
if (wake)
wake_up(&sbi->trans_hold_wq);
trace_scoutfs_release_trans(sb, current->journal_info, atomic_read(&tri->holders));
}
/*
@@ -513,7 +548,7 @@ int scoutfs_setup_trans(struct super_block *sb)
if (!tri)
return -ENOMEM;
spin_lock_init(&tri->lock);
atomic_set(&tri->holders, 0);
scoutfs_block_writer_init(sb, &tri->wri);
sbi->trans_write_workq = alloc_workqueue("scoutfs_trans",


@@ -1,4 +1,4 @@
CFLAGS := -Wall -O2 -Werror -D_FILE_OFFSET_BITS=64 -fno-strict-aliasing
CFLAGS := -Wall -O2 -Werror -D_FILE_OFFSET_BITS=64 -fno-strict-aliasing -I ../kmod/src
SHELL := /usr/bin/bash
# each binary command is built from a single .c file
@@ -6,6 +6,7 @@ BIN := src/createmany \
src/dumb_setxattr \
src/handle_cat \
src/bulk_create_paths \
src/stage_tmpfile \
src/find_xattrs
DEPS := $(wildcard src/*.d)


@@ -209,12 +209,19 @@ t_trigger_show() {
echo "trigger $which $string: $(t_trigger_get $which $nr)"
}
t_trigger_arm() {
t_trigger_arm_silent() {
local which="$1"
local nr="$2"
local path=$(t_trigger_path "$nr")
echo 1 > "$path/$which"
}
t_trigger_arm() {
local which="$1"
local nr="$2"
t_trigger_arm_silent $which $nr
t_trigger_show $which armed $nr
}
@@ -229,16 +236,44 @@ t_counter() {
cat "$(t_sysfs_path $nr)/counters/$which"
}
#
# output the difference between the current value of a counter and the
# caller's provided previous value.
#
t_counter_diff_value() {
local which="$1"
local old="$2"
local nr="$3"
local new="$(t_counter $which $nr)"
echo "$((new - old))"
}
#
# output the value of the given counter for the given mount, defaulting
# to mount 0 if a mount isn't specified.
# to mount 0 if a mount isn't specified. For tests which expect a
# specific difference in counters.
#
t_counter_diff() {
local which="$1"
local old="$2"
local nr="$3"
local new
new="$(t_counter $which $nr)"
echo "counter $which diff $((new - old))"
echo "counter $which diff $(t_counter_diff_value $which $old $nr)"
}
#
# output a message indicating whether or not the counter value changed.
# For tests that expect the counter to change, or not, when the exact
# amount of the change isn't significant.
#
t_counter_diff_changed() {
local which="$1"
local old="$2"
local nr="$3"
local diff="$(t_counter_diff_value $which $old $nr)"
test "$diff" -eq 0 && \
echo "counter $which didn't change" ||
echo "counter $which changed"
}


@@ -1,29 +1,52 @@
== create file for xattr ping pong
# file: /mnt/test/test/block-stale-reads/file
user.xat="initial"
== retry btree forest reads between mounts
trigger block_remove_stale armed: 0
== create shared test file
== set and get xattrs between mount pairs while retrying
# file: /mnt/test/test/block-stale-reads/file
user.xat="1"
trigger block_remove_stale after: 0
counter block_cache_remove_stale diff 1
trigger block_remove_stale armed: 0
counter block_cache_remove_stale changed
counter block_cache_remove_stale changed
# file: /mnt/test/test/block-stale-reads/file
user.xat="2"
trigger block_remove_stale after: 0
counter block_cache_remove_stale diff 2
trigger block_remove_stale armed: 0
counter block_cache_remove_stale changed
counter block_cache_remove_stale changed
# file: /mnt/test/test/block-stale-reads/file
user.xat="3"
trigger block_remove_stale after: 0
counter block_cache_remove_stale diff 3
trigger block_remove_stale armed: 0
counter block_cache_remove_stale changed
counter block_cache_remove_stale changed
# file: /mnt/test/test/block-stale-reads/file
user.xat="4"
trigger block_remove_stale after: 0
counter block_cache_remove_stale diff 4
counter block_cache_remove_stale changed
counter block_cache_remove_stale changed
# file: /mnt/test/test/block-stale-reads/file
user.xat="5"
counter block_cache_remove_stale changed
counter block_cache_remove_stale changed
# file: /mnt/test/test/block-stale-reads/file
user.xat="6"
counter block_cache_remove_stale changed
counter block_cache_remove_stale changed
# file: /mnt/test/test/block-stale-reads/file
user.xat="7"
counter block_cache_remove_stale changed
counter block_cache_remove_stale changed
# file: /mnt/test/test/block-stale-reads/file
user.xat="8"
counter block_cache_remove_stale changed
counter block_cache_remove_stale changed
# file: /mnt/test/test/block-stale-reads/file
user.xat="9"
counter block_cache_remove_stale changed
counter block_cache_remove_stale changed
# file: /mnt/test/test/block-stale-reads/file
user.xat="10"
counter block_cache_remove_stale changed
counter block_cache_remove_stale changed


@@ -0,0 +1,18 @@
total file size 33669120
00000000 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 |AAAAAAAAAAAAAAAA|
*
00400000 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 |BBBBBBBBBBBBBBBB|
*
00801000 43 43 43 43 43 43 43 43 43 43 43 43 43 43 43 43 |CCCCCCCCCCCCCCCC|
*
00c03000 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 |DDDDDDDDDDDDDDDD|
*
01006000 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 |EEEEEEEEEEEEEEEE|
*
0140a000 46 46 46 46 46 46 46 46 46 46 46 46 46 46 46 46 |FFFFFFFFFFFFFFFF|
*
0180f000 47 47 47 47 47 47 47 47 47 47 47 47 47 47 47 47 |GGGGGGGGGGGGGGGG|
*
01c15000 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 |HHHHHHHHHHHHHHHH|
*
0201c000


@@ -1,6 +1,7 @@
Ran:
generic/001
generic/002
generic/004
generic/005
generic/006
generic/007
@@ -73,7 +74,6 @@ generic/376
generic/377
Not
run:
generic/004
generic/008
generic/009
generic/012
@@ -278,4 +278,4 @@ shared/004
shared/032
shared/051
shared/289
Passed all 72 tests
Passed all 73 tests


@@ -18,6 +18,7 @@ createmany-large-names.sh
createmany-rename-large-dir.sh
stage-release-race-alloc.sh
stage-multi-part.sh
stage-tmpfile.sh
basic-posix-consistency.sh
dirent-consistency.sh
lock-ex-race-processes.sh

tests/src/stage_tmpfile.c Normal file

@@ -0,0 +1,145 @@
/*
* Exercise O_TMPFILE creation as well as staging from tmpfiles into
* a released destination file.
*
* Copyright (C) 2021 Versity Software, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <errno.h>
#include <linux/types.h>
#include <assert.h>
#include "ioctl.h"
#define array_size(arr) (sizeof(arr) / sizeof(arr[0]))
/*
* Write known data into 8 tmpfiles.
* Make a new file X and release it
* Move contents of 8 tmpfiles into X.
*/
struct sub_tmp_info {
int fd;
unsigned int offset;
unsigned int length;
};
#define SZ 4096
char buf[SZ];
int main(int argc, char **argv)
{
struct scoutfs_ioctl_release ioctl_args = {0};
struct scoutfs_ioctl_move_blocks mb;
struct sub_tmp_info sub_tmps[8];
int tot_size = 0;
char *dest_file;
int dest_fd;
char *mnt;
int ret;
int i;
if (argc < 3) {
printf("%s <mountpoint> <dest_file>\n", argv[0]);
return 1;
}
mnt = argv[1];
dest_file = argv[2];
for (i = 0; i < array_size(sub_tmps); i++) {
struct sub_tmp_info *sub_tmp = &sub_tmps[i];
int remaining;
sub_tmp->fd = open(mnt, O_RDWR | O_TMPFILE, S_IRUSR | S_IWUSR);
if (sub_tmp->fd < 0) {
perror("error");
exit(1);
}
sub_tmp->offset = tot_size;
/* First tmp file is 4MB */
/* Each is 4k bigger than last */
sub_tmp->length = (i + 1024) * sizeof(buf);
remaining = sub_tmp->length;
/* Each sub tmpfile written with 'A', 'B', etc. */
memset(buf, 'A' + i, sizeof(buf));
while (remaining) {
int written;
written = write(sub_tmp->fd, buf, sizeof(buf));
assert(written == sizeof(buf));
tot_size += sizeof(buf);
remaining -= written;
}
}
printf("total file size %d\n", tot_size);
dest_fd = open(dest_file, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
if (dest_fd == -1) {
perror("error");
exit(1);
}
// make dest file big
ret = posix_fallocate(dest_fd, 0, tot_size);
if (ret) {
perror("error");
exit(1);
}
// release everything in dest file
ioctl_args.offset = 0;
ioctl_args.length = tot_size;
ioctl_args.data_version = 0;
ret = ioctl(dest_fd, SCOUTFS_IOC_RELEASE, &ioctl_args);
if (ret < 0) {
perror("error");
exit(1);
}
// move contents into dest in reverse order
for (i = array_size(sub_tmps) - 1; i >= 0 ; i--) {
struct sub_tmp_info *sub_tmp = &sub_tmps[i];
mb.from_fd = sub_tmp->fd;
mb.from_off = 0;
mb.len = sub_tmp->length;
mb.to_off = sub_tmp->offset;
mb.data_version = 0;
mb.flags = SCOUTFS_IOC_MB_STAGE;
ret = ioctl(dest_fd, SCOUTFS_IOC_MOVE_BLOCKS, &mb);
if (ret < 0) {
perror("error");
exit(1);
}
}
return 0;
}


@@ -1,5 +1,5 @@
#
# exercise stale block reading.
# Exercise stale block reading.
#
# It would be very difficult to manipulate the allocators, cache, and
# persistent blocks to create stale block reading scenarios. Instead
@@ -7,34 +7,55 @@
#
t_require_commands touch setfattr getfattr
t_require_mounts 2
inc_wrap_fs_nr()
{
local nr="$(($1 + 1))"
if [ "$nr" == "$T_NR_MOUNTS" ]; then
nr=0
fi
echo $nr
}
GETFATTR="getfattr --absolute-names"
SETFATTR="setfattr"
#
# force re-reading forest btree blocks as each mount reads the items
# written by the other.
#
set_file="$T_D0/file"
get_file="$T_D1/file"
echo "== create file for xattr ping pong"
touch "$set_file"
$SETFATTR -n user.xat -v initial "$set_file"
$GETFATTR -n user.xat "$get_file" 2>&1 | t_filter_fs
echo "== create shared test file"
touch "$T_D0/file"
$SETFATTR -n user.xat -v 0 "$T_D0/file"
echo "== retry btree forest reads between mounts"
for i in $(seq 1 4); do
tmp="$set_file"
set_file="$get_file"
get_file="$tmp"
#
# Trigger retries in the block cache as we bounce xattr values around
# between sequential pairs of mounts. This is a little silly because if
# either of the mounts is the server then they'll almost certainly have
# their trigger fired prematurely by message handling btree calls while
# working with the t_ helpers long before we work with the xattrs. But
# the block cache stale retry path is still being exercised.
#
echo "== set and get xattrs between mount pairs while retrying"
set_nr=0
get_nr=$(inc_wrap_fs_nr $set_nr)
for i in $(seq 1 10); do
eval set_file="\$T_D${set_nr}/file"
eval get_file="\$T_D${get_nr}/file"
old_set=$(t_counter block_cache_remove_stale $set_nr)
old_get=$(t_counter block_cache_remove_stale $get_nr)
t_trigger_arm_silent block_remove_stale $set_nr
t_trigger_arm_silent block_remove_stale $get_nr
$SETFATTR -n user.xat -v $i "$set_file"
t_trigger_arm block_remove_stale $cl
old=$(t_counter btree_stale_read $cl)
$GETFATTR -n user.xat "$get_file" 2>&1 | t_filter_fs
t_trigger_show block_remove_stale "after" $cl
t_counter_diff block_cache_remove_stale $old $cl
t_counter_diff_changed block_cache_remove_stale $old_set $set_nr
t_counter_diff_changed block_cache_remove_stale $old_get $get_nr
set_nr="$get_nr"
get_nr=$(inc_wrap_fs_nr $set_nr)
done
t_pass


@@ -0,0 +1,15 @@
#
# Run tmpfile_stage and check the output with hexdump.
#
t_require_commands stage_tmpfile hexdump
DEST_FILE="$T_D0/dest_file"
stage_tmpfile $T_D0 $DEST_FILE
hexdump -C "$DEST_FILE"
rm -fr "$DEST_FILE"
t_pass


@@ -361,12 +361,12 @@ static int do_mkfs(struct mkfs_args *args)
struct scoutfs_quorum_slot *sl = &super->qconf.slots[i];
struct in_addr in;
if (sl->addr.addr == 0)
if (sl->addr.v4.family != cpu_to_le16(SCOUTFS_AF_IPV4))
continue;
in.s_addr = htonl(le32_to_cpu(sl->addr.addr));
in.s_addr = htonl(le32_to_cpu(sl->addr.v4.addr));
printf("%s%u: %s:%u", indent,
i, inet_ntoa(in), le16_to_cpu(sl->addr.port));
i, inet_ntoa(in), le16_to_cpu(sl->addr.v4.port));
indent = "\n ";
}
printf("\n");
@@ -395,22 +395,28 @@ static bool valid_quorum_slots(struct scoutfs_quorum_slot *slots)
int j;
for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
if (slots[i].addr.addr == 0)
if (slots[i].addr.v4.family == cpu_to_le16(SCOUTFS_AF_NONE))
continue;
if (slots[i].addr.v4.family != cpu_to_le16(SCOUTFS_AF_IPV4)) {
fprintf(stderr, "quorum slot nr %u has invalid family %u\n",
i, le16_to_cpu(slots[i].addr.v4.family));
valid = false;
}
for (j = i + 1; j < SCOUTFS_QUORUM_MAX_SLOTS; j++) {
if (slots[j].addr.addr == 0)
if (slots[i].addr.v4.family != cpu_to_le16(SCOUTFS_AF_IPV4))
continue;
if (slots[i].addr.addr == slots[j].addr.addr &&
slots[i].addr.port == slots[j].addr.port) {
if (slots[i].addr.v4.addr == slots[j].addr.v4.addr &&
slots[i].addr.v4.port == slots[j].addr.v4.port) {
in.s_addr =
htonl(le32_to_cpu(slots[i].addr.addr));
htonl(le32_to_cpu(slots[i].addr.v4.addr));
addr = inet_ntoa(in);
fprintf(stderr, "quorum slot nr %u and %u have the same address %s:%u\n",
i, j, addr,
le16_to_cpu(slots[i].addr.port));
le16_to_cpu(slots[i].addr.v4.port));
valid = false;
}
}
@@ -430,7 +436,7 @@ static int parse_opt(int key, char *arg, struct argp_state *state)
ret = parse_quorum_slot(&slot, arg);
if (ret < 0)
return ret;
if (args->slots[ret].addr.addr != 0)
if (args->slots[ret].addr.v4.family != cpu_to_le16(SCOUTFS_AF_NONE))
argp_error(state, "Quorum slot %u already specified before slot '%s'\n",
ret, arg);
args->slots[ret] = slot;


@@ -32,7 +32,7 @@ struct move_blocks_args {
static int do_move_blocks(struct move_blocks_args *args)
{
struct scoutfs_ioctl_move_blocks mb;
struct scoutfs_ioctl_move_blocks mb = {0};
int from_fd = -1;
int to_fd = -1;
int ret;


@@ -213,7 +213,8 @@ int parse_quorum_slot(struct scoutfs_quorum_slot *slot, char *arg)
return -EINVAL;
}
slot->addr.addr = cpu_to_le32(htonl(in.s_addr));
slot->addr.port = cpu_to_le16(port);
slot->addr.v4.family = cpu_to_le16(SCOUTFS_AF_IPV4);
slot->addr.v4.addr = cpu_to_le32(htonl(in.s_addr));
slot->addr.v4.port = cpu_to_le16(port);
return nr;
}
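These mkfs and parsing hunks switch from a flat scoutfs_inet_addr with addr/port members to a union addressed through .v4 with an explicit family. The union's definition lives in the shared format header and isn't part of this diff; a hypothetical sketch consistent with the accesses above (field order, widths, and the family values are assumptions):

	/* hypothetical layout; the real union scoutfs_inet_addr is in the format header */
	#define SCOUTFS_AF_NONE		0
	#define SCOUTFS_AF_IPV4		1

	union scoutfs_inet_addr {
		struct {
			__le16 family;	/* SCOUTFS_AF_NONE or SCOUTFS_AF_IPV4 */
			__le16 port;
			__le32 addr;	/* IPv4 address, stored little-endian */
		} v4;
		__u8 pad[16];		/* room for larger future families */
	};

Zero-filled slots now read as SCOUTFS_AF_NONE, which is why the checks above test the family field rather than comparing addr and port against zero.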


@@ -769,7 +769,7 @@ static int print_btree_leaf_items(int fd, struct scoutfs_super_block *super,
return 0;
}
static char *alloc_addr_str(struct scoutfs_inet_addr *ia)
static char *alloc_addr_str(union scoutfs_inet_addr *ia)
{
struct in_addr addr;
char *quad;
@@ -777,12 +777,12 @@ static char *alloc_addr_str(struct scoutfs_inet_addr *ia)
int len;
memset(&addr, 0, sizeof(addr));
addr.s_addr = htonl(le32_to_cpu(ia->addr));
addr.s_addr = htonl(le32_to_cpu(ia->v4.addr));
quad = inet_ntoa(addr);
if (quad == NULL)
return NULL;
len = snprintf(NULL, 0, "%s:%u", quad, le16_to_cpu(ia->port));
len = snprintf(NULL, 0, "%s:%u", quad, le16_to_cpu(ia->v4.port));
if (len < 1 || len > 22)
return NULL;
@@ -791,7 +791,7 @@ static char *alloc_addr_str(struct scoutfs_inet_addr *ia)
if (!str)
return NULL;
snprintf(str, len, "%s:%u", quad, le16_to_cpu(ia->port));
snprintf(str, len, "%s:%u", quad, le16_to_cpu(ia->v4.port));
return str;
}
@@ -915,8 +915,7 @@ static void print_super_block(struct scoutfs_super_block *super, u64 blkno)
printf(" quorum config version %llu\n",
le64_to_cpu(super->qconf.version));
for (i = 0; i < array_size(super->qconf.slots); i++) {
if (!super->qconf.slots[i].addr.addr &&
!super->qconf.slots[i].addr.port)
if (super->qconf.slots[i].addr.v4.family != cpu_to_le16(SCOUTFS_AF_IPV4))
continue;
addr = alloc_addr_str(&super->qconf.slots[i].addr);