Merge pull request #46 from versity/zab/orphan_deletion_and_enospc

Zab/orphan deletion and enospc
Zach Brown
2021-07-08 10:52:53 -07:00
committed by GitHub
41 changed files with 1097 additions and 374 deletions

View File

@@ -676,6 +676,14 @@ int scoutfs_dalloc_return_cached(struct super_block *sb,
*
* Unlike meta allocations, the caller is expected to serialize
* allocations from the root.
*
* ENOBUFS is returned if the data allocator ran out of space and we can
* probably refill it from the server. The caller is expected to back
* out, commit the transaction, and try again.
*
* ENOSPC is returned if the data allocator ran out of space but we have
* a flag from the server telling us that there's no more space
* available. This is a hard error and should be returned.
*/
int scoutfs_alloc_data(struct super_block *sb, struct scoutfs_alloc *alloc,
struct scoutfs_block_writer *wri,
@@ -724,13 +732,13 @@ int scoutfs_alloc_data(struct super_block *sb, struct scoutfs_alloc *alloc,
ret = 0;
out:
if (ret < 0) {
/*
* Special retval meaning there wasn't space to alloc from
* this txn. Doesn't mean filesystem is completely full.
* Maybe upper layers want to try again.
*/
if (ret == -ENOENT)
ret = -ENOBUFS;
if (ret == -ENOENT) {
if (le32_to_cpu(dalloc->root.flags) & SCOUTFS_ALLOC_FLAG_LOW)
ret = -ENOSPC;
else
ret = -ENOBUFS;
}
*blkno_ret = 0;
*count_ret = 0;
} else {
@@ -1261,6 +1269,20 @@ bool scoutfs_alloc_meta_low(struct super_block *sb,
return lo;
}
bool scoutfs_alloc_test_flag(struct super_block *sb,
struct scoutfs_alloc *alloc, u32 flag)
{
unsigned int seq;
bool set;
do {
seq = read_seqbegin(&alloc->seqlock);
set = !!(le32_to_cpu(alloc->avail.flags) & flag);
} while (read_seqretry(&alloc->seqlock, seq));
return set;
}
/*
* Call the caller's callback for every persistent allocator structure
* we can find.
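A minimal caller-side sketch of the ENOBUFS/ENOSPC contract described above; this is not actual scoutfs code, the scoutfs_alloc_data() argument list is abbreviated from the hunks shown here, the surrounding declarations are assumed, and commit_and_retry_trans() is a hypothetical stand-in for backing out and committing the transaction:

	for (;;) {
		ret = scoutfs_alloc_data(sb, alloc, wri, dalloc, count,
					 &blkno, &count_ret);
		if (ret != -ENOBUFS)
			break;		/* 0 on success, -ENOSPC is a hard error */

		/* allocator empty but refillable: back out, commit, retry */
		ret = commit_and_retry_trans(sb);	/* hypothetical helper */
		if (ret < 0)
			break;
	}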

View File

@@ -38,6 +38,10 @@
#define SCOUTFS_ALLOC_DATA_LG_THRESH \
(8ULL * 1024 * 1024 >> SCOUTFS_BLOCK_SM_SHIFT)
/* the client will force commits if data allocators get too low */
#define SCOUTFS_ALLOC_DATA_REFILL_THRESH \
((256ULL * 1024 * 1024) >> SCOUTFS_BLOCK_SM_SHIFT)
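/* with 4 KiB small blocks (assumed here) this is 65,536 small-block units, i.e. 256 MiB of data extents */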
/*
* Fill client alloc roots to the target when they fall below the lo
* threshold.
@@ -55,6 +59,7 @@
#define SCOUTFS_SERVER_DATA_FILL_LO \
(1ULL * 1024 * 1024 * 1024 >> SCOUTFS_BLOCK_SM_SHIFT)
/*
* Log merge meta allocations are only used for one request and will
* never use more than the dirty limit.
@@ -65,16 +70,6 @@
((SCOUTFS_LOG_MERGE_DIRTY_BYTE_LIMIT >> SCOUTFS_BLOCK_LG_SHIFT) + 4)
#define SCOUTFS_SERVER_MERGE_FILL_LO SCOUTFS_SERVER_MERGE_FILL_TARGET
/*
* Each of the server meta_alloc roots will try to keep a minimum amount
* of free blocks. The server will swap roots when its current avail
* falls below the threshold while the freed root is still above it. It
* must have room for all the largest allocation attempted in a
* transaction on the server.
*/
#define SCOUTFS_SERVER_META_ALLOC_MIN \
(SCOUTFS_SERVER_META_FILL_TARGET * 2)
/*
* A run-time use of a pair of persistent avail/freed roots as a
* metadata allocator. It has the machinery needed to lock and avoid
@@ -157,6 +152,8 @@ int scoutfs_alloc_splice_list(struct super_block *sb,
bool scoutfs_alloc_meta_low(struct super_block *sb,
struct scoutfs_alloc *alloc, u32 nr);
bool scoutfs_alloc_test_flag(struct super_block *sb,
struct scoutfs_alloc *alloc, u32 flag);
typedef int (*scoutfs_alloc_foreach_cb_t)(struct super_block *sb, void *arg,
int owner, u64 id,

View File

@@ -88,6 +88,7 @@
EXPAND_COUNTER(forest_read_items) \
EXPAND_COUNTER(forest_roots_next_hint) \
EXPAND_COUNTER(forest_set_bloom_bits) \
EXPAND_COUNTER(inode_evict_intr) \
EXPAND_COUNTER(item_clear_dirty) \
EXPAND_COUNTER(item_create) \
EXPAND_COUNTER(item_delete) \
@@ -151,6 +152,12 @@
EXPAND_COUNTER(net_recv_invalid_message) \
EXPAND_COUNTER(net_recv_messages) \
EXPAND_COUNTER(net_unknown_request) \
EXPAND_COUNTER(orphan_scan) \
EXPAND_COUNTER(orphan_scan_cached) \
EXPAND_COUNTER(orphan_scan_error) \
EXPAND_COUNTER(orphan_scan_item) \
EXPAND_COUNTER(orphan_scan_omap_set) \
EXPAND_COUNTER(orphan_scan_read) \
EXPAND_COUNTER(quorum_elected) \
EXPAND_COUNTER(quorum_fence_error) \
EXPAND_COUNTER(quorum_fence_leader) \

View File

@@ -312,10 +312,9 @@ int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode,
while (iblock <= last) {
if (inode)
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks,
true);
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, true, false);
else
ret = scoutfs_hold_trans(sb);
ret = scoutfs_hold_trans(sb, false);
if (ret)
break;
@@ -756,8 +755,7 @@ retry:
ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
scoutfs_inode_index_prepare(sb, &wbd->ind_locks, inode,
true) ?:
scoutfs_inode_index_try_lock_hold(sb, &wbd->ind_locks,
ind_seq);
scoutfs_inode_index_try_lock_hold(sb, &wbd->ind_locks, ind_seq, true);
} while (ret > 0);
if (ret < 0)
goto out;
@@ -1010,7 +1008,7 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
while(iblock <= last) {
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false);
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false, true);
if (ret)
goto out;
@@ -1086,7 +1084,7 @@ int scoutfs_data_init_offline_extent(struct inode *inode, u64 size,
}
/* we're updating meta_seq with offline block count */
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false);
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false, true);
if (ret < 0)
goto out;
@@ -1238,7 +1236,7 @@ int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
ret = scoutfs_inode_index_start(sb, &seq) ?:
scoutfs_inode_index_prepare(sb, &locks, from, true) ?:
scoutfs_inode_index_prepare(sb, &locks, to, true) ?:
scoutfs_inode_index_try_lock_hold(sb, &locks, seq);
scoutfs_inode_index_try_lock_hold(sb, &locks, seq, false);
if (ret > 0)
continue;
if (ret < 0)
@@ -1844,13 +1842,17 @@ int scoutfs_data_prepare_commit(struct super_block *sb)
return ret;
}
u64 scoutfs_data_alloc_free_bytes(struct super_block *sb)
/*
* Return true if the data allocator is lower than the caller's
* requirement and we haven't been told by the server that we're out of
* free extents.
*/
bool scoutfs_data_alloc_should_refill(struct super_block *sb, u64 blocks)
{
DECLARE_DATA_INFO(sb, datinf);
return scoutfs_dalloc_total_len(&datinf->dalloc) <<
SCOUTFS_BLOCK_SM_SHIFT;
return (scoutfs_dalloc_total_len(&datinf->dalloc) < blocks) &&
!(le32_to_cpu(datinf->dalloc.root.flags) & SCOUTFS_ALLOC_FLAG_LOW);
}
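A hedged sketch of how the commit path might use this together with the SCOUTFS_ALLOC_DATA_REFILL_THRESH constant added above; the surrounding transaction code and the helper name are hypothetical:

	/* force a commit so the next get_log_trees request can refill the data allocator */
	if (scoutfs_data_alloc_should_refill(sb, SCOUTFS_ALLOC_DATA_REFILL_THRESH))
		queue_trans_commit_work(sb);	/* hypothetical helper */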
int scoutfs_data_setup(struct super_block *sb)

View File

@@ -86,7 +86,7 @@ void scoutfs_data_init_btrees(struct super_block *sb,
void scoutfs_data_get_btrees(struct super_block *sb,
struct scoutfs_log_trees *lt);
int scoutfs_data_prepare_commit(struct super_block *sb);
u64 scoutfs_data_alloc_free_bytes(struct super_block *sb);
bool scoutfs_data_alloc_should_refill(struct super_block *sb, u64 blocks);
int scoutfs_data_setup(struct super_block *sb);
void scoutfs_data_destroy(struct super_block *sb);

View File

@@ -669,6 +669,7 @@ static struct inode *lock_hold_create(struct inode *dir, struct dentry *dentry,
umode_t mode, dev_t rdev,
struct scoutfs_lock **dir_lock,
struct scoutfs_lock **inode_lock,
struct scoutfs_lock **orph_lock,
struct list_head *ind_locks)
{
struct super_block *sb = dir->i_sb;
@@ -701,11 +702,17 @@ static struct inode *lock_hold_create(struct inode *dir, struct dentry *dentry,
if (ret)
goto out_unlock;
if (orph_lock) {
ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, ino, orph_lock);
if (ret < 0)
goto out_unlock;
}
retry:
ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
scoutfs_inode_index_prepare(sb, ind_locks, dir, true) ?:
scoutfs_inode_index_prepare_ino(sb, ind_locks, ino, mode) ?:
scoutfs_inode_index_try_lock_hold(sb, ind_locks, ind_seq);
scoutfs_inode_index_try_lock_hold(sb, ind_locks, ind_seq, true);
if (ret > 0)
goto retry;
if (ret)
@@ -725,9 +732,13 @@ out_unlock:
if (ret) {
scoutfs_inode_index_unlock(sb, ind_locks);
scoutfs_unlock(sb, *dir_lock, SCOUTFS_LOCK_WRITE);
scoutfs_unlock(sb, *inode_lock, SCOUTFS_LOCK_WRITE);
*dir_lock = NULL;
scoutfs_unlock(sb, *inode_lock, SCOUTFS_LOCK_WRITE);
*inode_lock = NULL;
if (orph_lock) {
scoutfs_unlock(sb, *orph_lock, SCOUTFS_LOCK_WRITE_ONLY);
*orph_lock = NULL;
}
inode = ERR_PTR(ret);
}
@@ -752,7 +763,7 @@ static int scoutfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
hash = dirent_name_hash(dentry->d_name.name, dentry->d_name.len);
inode = lock_hold_create(dir, dentry, mode, rdev,
&dir_lock, &inode_lock, &ind_locks);
&dir_lock, &inode_lock, NULL, &ind_locks);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -813,13 +824,15 @@ static int scoutfs_link(struct dentry *old_dentry,
struct super_block *sb = dir->i_sb;
struct scoutfs_lock *dir_lock;
struct scoutfs_lock *inode_lock = NULL;
struct scoutfs_lock *orph_lock = NULL;
LIST_HEAD(ind_locks);
bool del_orphan;
bool del_orphan = false;
u64 dir_size;
u64 ind_seq;
u64 hash;
u64 pos;
int ret;
int err;
hash = dirent_name_hash(dentry->d_name.name, dentry->d_name.len);
@@ -843,13 +856,20 @@ static int scoutfs_link(struct dentry *old_dentry,
goto out_unlock;
dir_size = i_size_read(dir) + dentry->d_name.len;
del_orphan = (inode->i_nlink == 0);
if (inode->i_nlink == 0) {
del_orphan = true;
ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, scoutfs_ino(inode),
&orph_lock);
if (ret < 0)
goto out_unlock;
}
retry:
ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
scoutfs_inode_index_prepare(sb, &ind_locks, dir, false) ?:
scoutfs_inode_index_prepare(sb, &ind_locks, inode, false) ?:
scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq);
scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq, true);
if (ret > 0)
goto retry;
if (ret)
@@ -860,7 +880,7 @@ retry:
goto out;
if (del_orphan) {
ret = scoutfs_orphan_dirty(sb, scoutfs_ino(inode));
ret = scoutfs_inode_orphan_delete(sb, scoutfs_ino(inode), orph_lock);
if (ret)
goto out;
}
@@ -871,8 +891,11 @@ retry:
dentry->d_name.name, dentry->d_name.len,
scoutfs_ino(inode), inode->i_mode, dir_lock,
inode_lock);
if (ret)
if (ret) {
err = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock);
WARN_ON_ONCE(err); /* no orphan, might not scan and delete after crash */
goto out;
}
update_dentry_info(sb, dentry, hash, pos, dir_lock);
i_size_write(dir, dir_size);
@@ -880,11 +903,6 @@ retry:
inode->i_ctime = dir->i_mtime;
inc_nlink(inode);
if (del_orphan) {
ret = scoutfs_orphan_delete(sb, scoutfs_ino(inode));
WARN_ON_ONCE(ret);
}
scoutfs_update_inode_item(inode, inode_lock, &ind_locks);
scoutfs_update_inode_item(dir, dir_lock, &ind_locks);
@@ -896,6 +914,8 @@ out_unlock:
scoutfs_inode_index_unlock(sb, &ind_locks);
scoutfs_unlock(sb, dir_lock, SCOUTFS_LOCK_WRITE);
scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE);
scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);
return ret;
}
@@ -920,6 +940,7 @@ static int scoutfs_unlink(struct inode *dir, struct dentry *dentry)
struct inode *inode = dentry->d_inode;
struct timespec ts = current_kernel_time();
struct scoutfs_lock *inode_lock = NULL;
struct scoutfs_lock *orph_lock = NULL;
struct scoutfs_lock *dir_lock = NULL;
LIST_HEAD(ind_locks);
u64 ind_seq;
@@ -937,32 +958,36 @@ static int scoutfs_unlink(struct inode *dir, struct dentry *dentry)
goto unlock;
}
if (should_orphan(inode)) {
ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, scoutfs_ino(inode),
&orph_lock);
if (ret < 0)
goto unlock;
}
retry:
ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
scoutfs_inode_index_prepare(sb, &ind_locks, dir, false) ?:
scoutfs_inode_index_prepare(sb, &ind_locks, inode, false) ?:
scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq);
scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq, false);
if (ret > 0)
goto retry;
if (ret)
goto unlock;
if (should_orphan(inode)) {
ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock);
if (ret < 0)
goto out;
}
ret = del_entry_items(sb, scoutfs_ino(dir), dentry_info_hash(dentry),
dentry_info_pos(dentry), scoutfs_ino(inode),
dir_lock, inode_lock);
if (ret)
if (ret) {
ret = scoutfs_inode_orphan_delete(sb, scoutfs_ino(inode), orph_lock);
WARN_ON_ONCE(ret); /* should have been dirty */
goto out;
if (should_orphan(inode)) {
/*
* Insert the orphan item before we modify any inode
* metadata so we can gracefully exit should it
* fail.
*/
ret = scoutfs_orphan_inode(inode);
WARN_ON_ONCE(ret); /* XXX returning error but items deleted */
if (ret)
goto out;
}
dir->i_ctime = ts;
@@ -984,6 +1009,7 @@ unlock:
scoutfs_inode_index_unlock(sb, &ind_locks);
scoutfs_unlock(sb, dir_lock, SCOUTFS_LOCK_WRITE);
scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE);
scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);
return ret;
}
@@ -1176,7 +1202,7 @@ static int scoutfs_symlink(struct inode *dir, struct dentry *dentry,
return ret;
inode = lock_hold_create(dir, dentry, S_IFLNK|S_IRWXUGO, 0,
&dir_lock, &inode_lock, &ind_locks);
&dir_lock, &inode_lock, NULL, &ind_locks);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -1535,6 +1561,7 @@ static int scoutfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct scoutfs_lock *new_dir_lock = NULL;
struct scoutfs_lock *old_inode_lock = NULL;
struct scoutfs_lock *new_inode_lock = NULL;
struct scoutfs_lock *orph_lock = NULL;
struct timespec now;
bool ins_new = false;
bool del_new = false;
@@ -1599,6 +1626,13 @@ static int scoutfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (ret)
goto out_unlock;
if (should_orphan(new_inode)) {
ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, scoutfs_ino(new_inode),
&orph_lock);
if (ret < 0)
goto out_unlock;
}
retry:
ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
scoutfs_inode_index_prepare(sb, &ind_locks, old_dir, false) ?:
@@ -1607,7 +1641,7 @@ retry:
scoutfs_inode_index_prepare(sb, &ind_locks, new_dir, false)) ?:
(new_inode == NULL ? 0 :
scoutfs_inode_index_prepare(sb, &ind_locks, new_inode, false)) ?:
scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq);
scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq, true);
if (ret > 0)
goto retry;
if (ret)
@@ -1658,7 +1692,7 @@ retry:
ins_old = true;
if (should_orphan(new_inode)) {
ret = scoutfs_orphan_inode(new_inode);
ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(new_inode), orph_lock);
if (ret)
goto out;
}
@@ -1762,6 +1796,7 @@ out_unlock:
scoutfs_unlock(sb, old_dir_lock, SCOUTFS_LOCK_WRITE);
scoutfs_unlock(sb, new_dir_lock, SCOUTFS_LOCK_WRITE);
scoutfs_unlock(sb, rename_lock, SCOUTFS_LOCK_WRITE);
scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);
return ret;
}
@@ -1781,6 +1816,7 @@ static int scoutfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mod
struct inode *inode = NULL;
struct scoutfs_lock *dir_lock = NULL;
struct scoutfs_lock *inode_lock = NULL;
struct scoutfs_lock *orph_lock = NULL;
LIST_HEAD(ind_locks);
int ret;
@@ -1788,25 +1824,32 @@ static int scoutfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mod
return -ENAMETOOLONG;
inode = lock_hold_create(dir, dentry, mode, 0,
&dir_lock, &inode_lock, &ind_locks);
&dir_lock, &inode_lock, &orph_lock, &ind_locks);
if (IS_ERR(inode))
return PTR_ERR(inode);
ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock);
if (ret < 0) {
iput(inode);
goto out; /* XXX returning error but items created */
}
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
insert_inode_hash(inode);
ihold(inode); /* need to update inode modifications in d_tmpfile */
d_tmpfile(dentry, inode);
scoutfs_update_inode_item(inode, inode_lock, &ind_locks);
scoutfs_update_inode_item(dir, dir_lock, &ind_locks);
scoutfs_inode_index_unlock(sb, &ind_locks);
iput(inode);
ret = scoutfs_orphan_inode(inode);
WARN_ON_ONCE(ret); /* XXX returning error but items deleted */
out:
scoutfs_release_trans(sb);
scoutfs_inode_index_unlock(sb, &ind_locks);
scoutfs_unlock(sb, dir_lock, SCOUTFS_LOCK_WRITE);
scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE);
scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);
return ret;
}
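A condensed, hypothetical sketch of the ordering these link/unlink/rename/tmpfile paths share; error unwinding is elided, and enter_transaction(), exit_transaction(), and remove_entry_items() stand in for the real index lock/hold and dirent helpers:

	static int orphan_order_sketch(struct super_block *sb, struct inode *dir,
				       struct inode *inode,
				       struct scoutfs_lock *dir_lock,
				       struct scoutfs_lock *inode_lock)
	{
		struct scoutfs_lock *orph_lock = NULL;
		int ret;

		/* 1: the write-only orphan lock is taken outside the transaction */
		ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0,
					  scoutfs_ino(inode), &orph_lock);
		if (ret < 0)
			return ret;

		/* 2: enter the transaction (index_try_lock_hold in the real paths) */
		ret = enter_transaction(sb);				/* hypothetical */
		if (ret == 0) {
			/* 3: create the orphan item before the entry items go away */
			ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock) ?:
			      remove_entry_items(sb, dir, inode, dir_lock, inode_lock); /* hypothetical */
			exit_transaction(sb);				/* hypothetical */
		}

		/* 4: unlock; orphan locks always use WRITE_ONLY */
		scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);
		return ret;
	}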

View File

@@ -758,6 +758,16 @@ out:
return 0;
}
void scoutfs_forest_stop(struct super_block *sb)
{
DECLARE_FOREST_INFO(sb, finf);
if (finf && finf->workq) {
cancel_delayed_work_sync(&finf->log_merge_dwork);
destroy_workqueue(finf->workq);
}
}
void scoutfs_forest_destroy(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
@@ -766,11 +776,6 @@ void scoutfs_forest_destroy(struct super_block *sb)
if (finf) {
scoutfs_block_put(sb, finf->srch_bl);
if (finf->workq) {
cancel_delayed_work_sync(&finf->log_merge_dwork);
destroy_workqueue(finf->workq);
}
kfree(finf);
sbi->forest_info = NULL;
}

View File

@@ -39,6 +39,7 @@ void scoutfs_forest_get_btrees(struct super_block *sb,
struct scoutfs_log_trees *lt);
int scoutfs_forest_setup(struct super_block *sb);
void scoutfs_forest_stop(struct super_block *sb);
void scoutfs_forest_destroy(struct super_block *sb);
#endif

View File

@@ -286,9 +286,10 @@ struct scoutfs_alloc_list_head {
struct scoutfs_block_ref ref;
__le64 total_nr;
__le32 first_nr;
__u8 __pad[4];
__le32 flags;
};
/*
* While the main allocator uses extent items in btree blocks, metadata
* allocations for a single transaction are recorded in arrays in
@@ -317,9 +318,14 @@ struct scoutfs_alloc_list_block {
*/
struct scoutfs_alloc_root {
__le64 total_len;
__le32 flags;
__le32 _pad;
struct scoutfs_btree_root root;
};
/* Shared by _alloc_list_head and _alloc_root */
#define SCOUTFS_ALLOC_FLAG_LOW (1U << 0)
/* types of allocators, exposed to alloc_detail ioctl */
#define SCOUTFS_ALLOC_OWNER_NONE 0
#define SCOUTFS_ALLOC_OWNER_SERVER 1
@@ -570,7 +576,7 @@ struct scoutfs_log_merge_freeing {
* Keys are first sorted by major key zones.
*/
#define SCOUTFS_INODE_INDEX_ZONE 1
#define SCOUTFS_RID_ZONE 2
#define SCOUTFS_ORPHAN_ZONE 2
#define SCOUTFS_FS_ZONE 3
#define SCOUTFS_LOCK_ZONE 4
/* Items only stored in server btrees */
@@ -592,7 +598,7 @@ struct scoutfs_log_merge_freeing {
#define SCOUTFS_INODE_INDEX_DATA_SEQ_TYPE 2
#define SCOUTFS_INODE_INDEX_NR 3 /* don't forget to update */
/* rid zone (also used in server alloc btree) */
/* orphan zone, redundant type used for clarity */
#define SCOUTFS_ORPHAN_TYPE 1
/* fs zone */

View File

@@ -34,6 +34,7 @@
#include "client.h"
#include "cmp.h"
#include "omap.h"
#include "forest.h"
/*
* XXX
@@ -54,10 +55,19 @@ struct inode_allocator {
};
struct inode_sb_info {
struct super_block *sb;
bool stopped;
spinlock_t writeback_lock;
struct rb_root writeback_inodes;
struct inode_allocator dir_ino_alloc;
struct inode_allocator ino_alloc;
struct delayed_work orphan_scan_dwork;
/* serialize multiple inode ->evict trying to delete same ino's items */
spinlock_t deleting_items_lock;
struct list_head deleting_items_list;
};
#define DECLARE_INODE_SB_INFO(sb, name) \
@@ -352,7 +362,7 @@ static int set_inode_size(struct inode *inode, struct scoutfs_lock *lock,
if (!S_ISREG(inode->i_mode))
return 0;
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, true);
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, true, false);
if (ret)
return ret;
@@ -379,7 +389,7 @@ static int clear_truncate_flag(struct inode *inode, struct scoutfs_lock *lock)
LIST_HEAD(ind_locks);
int ret;
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false);
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false, false);
if (ret)
return ret;
@@ -494,7 +504,7 @@ retry:
}
}
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false);
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false, false);
if (ret)
goto out;
@@ -1207,7 +1217,7 @@ int scoutfs_inode_index_start(struct super_block *sb, u64 *seq)
* Returns > 0 if the seq changed and the locks should be retried.
*/
int scoutfs_inode_index_try_lock_hold(struct super_block *sb,
struct list_head *list, u64 seq)
struct list_head *list, u64 seq, bool allocing)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct index_lock *ind_lock;
@@ -1223,7 +1233,7 @@ int scoutfs_inode_index_try_lock_hold(struct super_block *sb,
goto out;
}
ret = scoutfs_hold_trans(sb);
ret = scoutfs_hold_trans(sb, allocing);
if (ret == 0 && seq != sbi->trans_seq) {
scoutfs_release_trans(sb);
ret = 1;
@@ -1237,7 +1247,7 @@ out:
}
int scoutfs_inode_index_lock_hold(struct inode *inode, struct list_head *list,
bool set_data_seq)
bool set_data_seq, bool allocing)
{
struct super_block *sb = inode->i_sb;
int ret;
@@ -1247,7 +1257,7 @@ int scoutfs_inode_index_lock_hold(struct inode *inode, struct list_head *list,
ret = scoutfs_inode_index_start(sb, &seq) ?:
scoutfs_inode_index_prepare(sb, list, inode,
set_data_seq) ?:
scoutfs_inode_index_try_lock_hold(sb, list, seq);
scoutfs_inode_index_try_lock_hold(sb, list, seq, allocing);
} while (ret > 0);
return ret;
@@ -1437,41 +1447,74 @@ out:
return inode;
}
static void init_orphan_key(struct scoutfs_key *key, u64 rid, u64 ino)
static void init_orphan_key(struct scoutfs_key *key, u64 ino)
{
*key = (struct scoutfs_key) {
.sk_zone = SCOUTFS_RID_ZONE,
.sko_rid = cpu_to_le64(rid),
.sk_type = SCOUTFS_ORPHAN_TYPE,
.sk_zone = SCOUTFS_ORPHAN_ZONE,
.sko_ino = cpu_to_le64(ino),
.sk_type = SCOUTFS_ORPHAN_TYPE,
};
}
int scoutfs_orphan_dirty(struct super_block *sb, u64 ino)
/*
* Create an orphan item. The orphan items are maintained in their own
* zone under a write only lock while the caller has the inode protected
* by a write lock.
*/
int scoutfs_inode_orphan_create(struct super_block *sb, u64 ino, struct scoutfs_lock *lock)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_lock *lock = sbi->rid_lock;
struct scoutfs_key key;
init_orphan_key(&key, sbi->rid, ino);
init_orphan_key(&key, ino);
return scoutfs_item_dirty(sb, &key, lock);
return scoutfs_item_create_force(sb, &key, NULL, 0, lock);
}
int scoutfs_orphan_delete(struct super_block *sb, u64 ino)
int scoutfs_inode_orphan_delete(struct super_block *sb, u64 ino, struct scoutfs_lock *lock)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_lock *lock = sbi->rid_lock;
struct scoutfs_key key;
int ret;
init_orphan_key(&key, sbi->rid, ino);
init_orphan_key(&key, ino);
ret = scoutfs_item_delete(sb, &key, lock);
if (ret == -ENOENT)
ret = 0;
return scoutfs_item_delete_force(sb, &key, lock);
}
return ret;
struct deleting_ino_entry {
struct list_head head;
u64 ino;
};
static bool added_deleting_ino(struct inode_sb_info *inf, struct deleting_ino_entry *del, u64 ino)
{
struct deleting_ino_entry *tmp;
bool added = true;
spin_lock(&inf->deleting_items_lock);
list_for_each_entry(tmp, &inf->deleting_items_list, head) {
if (tmp->ino == ino) {
added = false;
break;
}
}
if (added) {
del->ino = ino;
list_add_tail(&del->head, &inf->deleting_items_list);
}
spin_unlock(&inf->deleting_items_lock);
return added;
}
static void del_deleting_ino(struct inode_sb_info *inf, struct deleting_ino_entry *del)
{
if (del->ino) {
spin_lock(&inf->deleting_items_lock);
list_del_init(&del->head);
spin_unlock(&inf->deleting_items_lock);
}
}
/*
@@ -1482,9 +1525,21 @@ int scoutfs_orphan_delete(struct super_block *sb, u64 ino)
* orphan item will continue triggering attempts to finish previous
* partial deletion until all deletion is complete and the orphan item
* is removed.
*
* Currently this can be called multiple times for multiple cached
* inodes for a given ino number (ilookup ignores inodes being freed, to
* avoid cluster lock<->inode flag waiting inversions). Some items are not
* safe to delete concurrently, for example concurrent data truncation
* could free extents multiple times. We use a very silly list of inos
* being deleted. Duplicates just return success. If the first
* deletion ends up failing orphan deletion will come back around later
* and retry.
*/
static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_lock *lock)
static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_lock *lock,
struct scoutfs_lock *orph_lock)
{
DECLARE_INODE_SB_INFO(sb, inf);
struct deleting_ino_entry del = {{NULL, }};
struct scoutfs_inode sinode;
struct scoutfs_key key;
LIST_HEAD(ind_locks);
@@ -1494,6 +1549,11 @@ static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_lo
u64 size;
int ret;
if (!added_deleting_ino(inf, &del, ino)) {
ret = 0;
goto out;
}
init_inode_key(&key, ino);
ret = scoutfs_item_lookup_exact(sb, &key, &sinode, sizeof(sinode),
@@ -1531,7 +1591,7 @@ static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_lo
retry:
ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
prepare_index_deletion(sb, &ind_locks, ino, mode, &sinode) ?:
scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq);
scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq, false);
if (ret > 0)
goto retry;
if (ret)
@@ -1553,8 +1613,9 @@ retry:
if (ret)
goto out;
ret = scoutfs_orphan_delete(sb, ino);
ret = scoutfs_inode_orphan_delete(sb, ino, orph_lock);
out:
del_deleting_ino(inf, &del);
if (release)
scoutfs_release_trans(sb);
scoutfs_inode_index_unlock(sb, &ind_locks);
@@ -1568,11 +1629,17 @@ out:
* tear down. We use locking and open inode number bitmaps to decide if
* we should finally destroy an inode that is no longer open nor
* reachable through directory entries.
*
* Because lookup ignores freeing inodes we can get here from multiple
* instances of an inode that is being deleted. Orphan scanning in
* particular can race with deletion. delete_inode_items() resolves
* concurrent attempts.
*/
void scoutfs_evict_inode(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
const u64 ino = scoutfs_ino(inode);
struct scoutfs_lock *orph_lock;
struct scoutfs_lock *lock;
int ret;
@@ -1584,14 +1651,21 @@ void scoutfs_evict_inode(struct inode *inode)
truncate_inode_pages_final(&inode->i_data);
ret = scoutfs_omap_should_delete(sb, inode, &lock);
ret = scoutfs_omap_should_delete(sb, inode, &lock, &orph_lock);
if (ret > 0) {
ret = delete_inode_items(inode->i_sb, scoutfs_ino(inode), lock);
ret = delete_inode_items(inode->i_sb, scoutfs_ino(inode), lock, orph_lock);
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);
}
if (ret < 0)
if (ret == -ERESTARTSYS) {
/* can be in a task with a signal pending; the orphan scan can find the inode later */
scoutfs_inc_counter(sb, inode_evict_intr);
ret = 0;
}
if (ret < 0) {
scoutfs_err(sb, "error %d while checking to delete inode nr %llu, it might linger.",
ret, ino);
}
scoutfs_omap_dec(sb, ino);
@@ -1626,75 +1700,141 @@ int scoutfs_drop_inode(struct inode *inode)
}
/*
* Find orphan items and process each one.
*
* Runtime of this will be bounded by the number of orphans, which could
* theoretically be very large. If that becomes a problem we might want to push
* this work off to a thread.
*
* This only scans orphans for this node. This will need to be covered by
* the rest of node zone cleanup.
* All mounts are performing this work concurrently. We introduce
* significant jitter between them to try and keep them from all
* bunching up and working on the same inodes.
*/
int scoutfs_scan_orphans(struct super_block *sb)
static void schedule_orphan_dwork(struct inode_sb_info *inf)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_lock *lock = sbi->rid_lock;
struct scoutfs_lock *inode_lock = NULL;
struct scoutfs_key key;
#define ORPHAN_SCAN_MIN_MS (10 * MSEC_PER_SEC)
#define ORPHAN_SCAN_JITTER_MS (40 * MSEC_PER_SEC)
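/* net effect: each mount rescans at a random delay between 10 and 50 seconds */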
unsigned long delay = msecs_to_jiffies(ORPHAN_SCAN_MIN_MS +
prandom_u32_max(ORPHAN_SCAN_JITTER_MS));
if (!inf->stopped) {
delay = msecs_to_jiffies(ORPHAN_SCAN_MIN_MS +
prandom_u32_max(ORPHAN_SCAN_JITTER_MS));
schedule_delayed_work(&inf->orphan_scan_dwork, delay);
}
}
/*
* Find and delete inodes whose only remaining reference is the
* persistent orphan item that was created as they were unlinked.
*
* Orphan items are created as the final directory entry referring to an
* inode is deleted. They're deleted as the final cached inode is
* evicted and the inode items are destroyed. They can linger if all
* the cached inodes pinning the inode fail to delete as they are
* evicted from the cache -- either through crashing or errors.
*
* This work runs in all mounts in the background looking for orphaned
* inodes that should be deleted.
*
* We use the forest hint call to read the persistent forest trees
* looking for orphan items without creating lock contention. Orphan
* items exist for O_TMPFILE users and we don't want to force them to
* commit by trying to acquire a conflicting read lock on the orphan zone.
* There's no rush to reclaim deleted items; eventually they will be
* found in the persistent item btrees.
*
* Once we find candidate orphan items we can first check our local
* inode cache for inodes that are already on their way to eviction and
* can be skipped. Then we ask the server for the open map containing
* the inode. Only if we don't have it cached, and no one else does, do
* we try and read it into our cache and evict it to trigger the final
* inode deletion process.
*
* Orphaned items that make it that far should be very rare. They can
* only exist if all the mounts that were using an inode after it had
* been unlinked (or created with o_tmpfile) didn't unmount cleanly.
*/
static void inode_orphan_scan_worker(struct work_struct *work)
{
struct inode_sb_info *inf = container_of(work, struct inode_sb_info,
orphan_scan_dwork.work);
struct super_block *sb = inf->sb;
struct scoutfs_open_ino_map omap;
struct scoutfs_key last;
struct scoutfs_key next;
struct scoutfs_key key;
struct inode *inode;
u64 group_nr;
int bit_nr;
u64 ino;
int err = 0;
int ret;
trace_scoutfs_scan_orphans(sb);
scoutfs_inc_counter(sb, orphan_scan);
init_orphan_key(&key, sbi->rid, 0);
init_orphan_key(&last, sbi->rid, ~0ULL);
init_orphan_key(&last, U64_MAX);
omap.args.group_nr = cpu_to_le64(U64_MAX);
while (1) {
ret = scoutfs_item_next(sb, &key, &last, NULL, 0, lock);
if (ret == -ENOENT) /* No more orphan items */
break;
if (ret < 0)
for (ino = SCOUTFS_ROOT_INO + 1; ino != 0; ino++) {
if (inf->stopped) {
ret = 0;
goto out;
ino = le64_to_cpu(key.sko_ino);
ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_WRITE, 0, ino, &inode_lock);
if (ret == 0) {
ret = delete_inode_items(sb, le64_to_cpu(key.sko_ino), inode_lock);
scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE);
}
if (ret && ret != -ENOENT && !err)
err = ret;
if (le64_to_cpu(key.sko_ino) == U64_MAX) {
ret = -ENOENT;
/* find the next orphan item */
init_orphan_key(&key, ino);
ret = scoutfs_forest_next_hint(sb, &key, &next);
if (ret < 0) {
if (ret == -ENOENT)
break;
goto out;
}
if (scoutfs_key_compare(&next, &last) > 0)
break;
scoutfs_inc_counter(sb, orphan_scan_item);
ino = le64_to_cpu(next.sko_ino);
/* locally cached inodes will already be deleted */
inode = scoutfs_ilookup(sb, ino);
if (inode) {
scoutfs_inc_counter(sb, orphan_scan_cached);
iput(inode);
continue;
}
le64_add_cpu(&key.sko_ino, 1);
/* get an omap that covers the orphaned ino */
group_nr = ino >> SCOUTFS_OPEN_INO_MAP_SHIFT;
bit_nr = ino & SCOUTFS_OPEN_INO_MAP_MASK;
if (le64_to_cpu(omap.args.group_nr) != group_nr) {
ret = scoutfs_client_open_ino_map(sb, group_nr, &omap);
if (ret < 0)
goto out;
}
/* don't need to evict if someone else has it open (cached) */
if (test_bit_le(bit_nr, omap.bits)) {
scoutfs_inc_counter(sb, orphan_scan_omap_set);
continue;
}
/* try to cache and evict unused inode to delete, can be racing */
inode = scoutfs_iget(sb, ino);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
if (ret == -ENOENT)
continue;
else
goto out;
}
scoutfs_inc_counter(sb, orphan_scan_read);
SCOUTFS_I(inode)->drop_invalidated = true;
iput(inode);
}
ret = 0;
out:
return err ? err : ret;
}
if (ret < 0)
scoutfs_inc_counter(sb, orphan_scan_error);
int scoutfs_orphan_inode(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_lock *lock = sbi->rid_lock;
struct scoutfs_key key;
int ret;
trace_scoutfs_orphan_inode(sb, inode);
init_orphan_key(&key, sbi->rid, scoutfs_ino(inode));
ret = scoutfs_item_create(sb, &key, NULL, 0, lock);
return ret;
schedule_orphan_dwork(inf);
}
/*
@@ -1803,16 +1943,43 @@ int scoutfs_inode_setup(struct super_block *sb)
if (!inf)
return -ENOMEM;
inf->sb = sb;
spin_lock_init(&inf->writeback_lock);
inf->writeback_inodes = RB_ROOT;
spin_lock_init(&inf->dir_ino_alloc.lock);
spin_lock_init(&inf->ino_alloc.lock);
INIT_DELAYED_WORK(&inf->orphan_scan_dwork, inode_orphan_scan_worker);
spin_lock_init(&inf->deleting_items_lock);
INIT_LIST_HEAD(&inf->deleting_items_list);
sbi->inode_sb_info = inf;
return 0;
}
/*
* Our inode subsystem is set up pretty early but orphan scanning uses
* many other subsystems like networking and the server. We only kick
* it off once everything is ready.
*/
int scoutfs_inode_start(struct super_block *sb)
{
DECLARE_INODE_SB_INFO(sb, inf);
schedule_orphan_dwork(inf);
return 0;
}
void scoutfs_inode_stop(struct super_block *sb)
{
DECLARE_INODE_SB_INFO(sb, inf);
if (inf) {
inf->stopped = true;
cancel_delayed_work_sync(&inf->orphan_scan_dwork);
}
}
void scoutfs_inode_destroy(struct super_block *sb)
{
struct inode_sb_info *inf = SCOUTFS_SB(sb)->inode_sb_info;

View File

@@ -75,7 +75,6 @@ struct inode *scoutfs_alloc_inode(struct super_block *sb);
void scoutfs_destroy_inode(struct inode *inode);
int scoutfs_drop_inode(struct inode *inode);
void scoutfs_evict_inode(struct inode *inode);
int scoutfs_orphan_inode(struct inode *inode);
struct inode *scoutfs_iget(struct super_block *sb, u64 ino);
struct inode *scoutfs_ilookup(struct super_block *sb, u64 ino);
@@ -89,9 +88,9 @@ int scoutfs_inode_index_prepare_ino(struct super_block *sb,
struct list_head *list, u64 ino,
umode_t mode);
int scoutfs_inode_index_try_lock_hold(struct super_block *sb,
struct list_head *list, u64 seq);
struct list_head *list, u64 seq, bool allocing);
int scoutfs_inode_index_lock_hold(struct inode *inode, struct list_head *list,
bool set_data_seq);
bool set_data_seq, bool allocing);
void scoutfs_inode_index_unlock(struct super_block *sb, struct list_head *list);
int scoutfs_dirty_inode_item(struct inode *inode, struct scoutfs_lock *lock);
@@ -120,9 +119,8 @@ int scoutfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat);
int scoutfs_setattr(struct dentry *dentry, struct iattr *attr);
int scoutfs_scan_orphans(struct super_block *sb);
int scoutfs_orphan_dirty(struct super_block *sb, u64 ino);
int scoutfs_orphan_delete(struct super_block *sb, u64 ino);
int scoutfs_inode_orphan_create(struct super_block *sb, u64 ino, struct scoutfs_lock *lock);
int scoutfs_inode_orphan_delete(struct super_block *sb, u64 ino, struct scoutfs_lock *lock);
void scoutfs_inode_queue_writeback(struct inode *inode);
int scoutfs_inode_walk_writeback(struct super_block *sb, bool write);
@@ -133,6 +131,8 @@ void scoutfs_inode_exit(void);
int scoutfs_inode_init(void);
int scoutfs_inode_setup(struct super_block *sb);
int scoutfs_inode_start(struct super_block *sb);
void scoutfs_inode_stop(struct super_block *sb);
void scoutfs_inode_destroy(struct super_block *sb);
#endif

View File

@@ -38,6 +38,7 @@
#include "hash.h"
#include "srch.h"
#include "alloc.h"
#include "server.h"
#include "scoutfs_trace.h"
/*
@@ -674,7 +675,7 @@ static long scoutfs_ioc_setattr_more(struct file *file, unsigned long arg)
/* setting only so we don't see 0 data seq with nonzero data_version */
set_data_seq = sm.data_version != 0 ? true : false;
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, set_data_seq);
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, set_data_seq, false);
if (ret)
goto unlock;
@@ -879,6 +880,7 @@ static long scoutfs_ioc_statfs_more(struct file *file, unsigned long arg)
sfm.rid = sbi->rid;
sfm.total_meta_blocks = le64_to_cpu(super->total_meta_blocks);
sfm.total_data_blocks = le64_to_cpu(super->total_data_blocks);
sfm.reserved_meta_blocks = scoutfs_server_reserved_meta_blocks(sb);
ret = scoutfs_client_get_last_seq(sb, &sfm.committed_seq);
if (ret)

View File

@@ -371,6 +371,7 @@ struct scoutfs_ioctl_statfs_more {
__u64 committed_seq;
__u64 total_meta_blocks;
__u64 total_data_blocks;
__u64 reserved_meta_blocks;
};
#define SCOUTFS_IOC_STATFS_MORE _IOR(SCOUTFS_IOCTL_MAGIC, 10, \

View File

@@ -95,7 +95,7 @@ struct item_cache_info {
/* written by page readers, read by shrink */
spinlock_t active_lock;
struct rb_root active_root;
struct list_head active_list;
};
#define DECLARE_ITEM_CACHE_INFO(sb, name) \
@@ -127,6 +127,7 @@ struct cached_page {
unsigned long lru_time;
struct list_head dirty_list;
struct list_head dirty_head;
u64 max_liv_seq;
struct page *page;
unsigned int page_off;
unsigned int erased_bytes;
@@ -385,6 +386,14 @@ static void put_pg(struct super_block *sb, struct cached_page *pg)
}
}
static void update_pg_max_liv_seq(struct cached_page *pg, struct cached_item *item)
{
u64 liv_seq = le64_to_cpu(item->liv.seq);
if (liv_seq > pg->max_liv_seq)
pg->max_liv_seq = liv_seq;
}
/*
* Allocate space for a new item from the free offset at the end of a
* cached page. This isn't a blocking allocation, and it's likely that
@@ -416,6 +425,8 @@ static struct cached_item *alloc_item(struct cached_page *pg,
if (val_len)
memcpy(item->val, val, val_len);
update_pg_max_liv_seq(pg, item);
return item;
}
@@ -622,6 +633,8 @@ static void mark_item_dirty(struct super_block *sb,
list_add_tail(&item->dirty_head, &pg->dirty_list);
item->dirty = 1;
}
update_pg_max_liv_seq(pg, item);
}
static void clear_item_dirty(struct super_block *sb,
@@ -1260,46 +1273,76 @@ static int cache_empty_page(struct super_block *sb,
return 0;
}
/*
* Readers operate independently from dirty items and transactions.
* They read a set of persistent items and insert them into the cache
* when there aren't already pages whose key range contains the items.
* This naturally prefers cached dirty items over stale read items.
*
* We have to deal with the case where dirty items are written and
* invalidated while a read is in flight. The reader won't have seen
* the items that were dirty in their persistent roots as they started
* reading. By the time they insert their read pages the previously
* dirty items have been reclaimed and are not in the cache. The old
* stale items will be inserted in their place, effectively corrupting
* the cache by having the dirty items disappear.
*
* We fix this by tracking the max seq of items in pages. As readers
* start they record the current transaction seq. Invalidation skips
* pages with a max seq at or above the first reader seq because the
* items in the page have to stick around to prevent the reader's stale
* items from being inserted.
*
* This naturally only affects a small set of pages with items that were
* written relatively recently. If we're under memory pressure then we
* probably have a lot of pages and they'll naturally have items that
* were visible to any readers. We don't bother with the complicated and
* expensive further refinement of tracking the ranges that are being
* read and comparing those with pages to invalidate.
*/
struct active_reader {
struct rb_node node;
struct scoutfs_key start;
struct scoutfs_key end;
struct list_head head;
u64 seq;
};
static struct active_reader *active_rbtree_walk(struct rb_root *root,
struct scoutfs_key *start,
struct scoutfs_key *end,
struct rb_node **par,
struct rb_node ***pnode)
#define INIT_ACTIVE_READER(rdr) \
struct active_reader rdr = { .head = LIST_HEAD_INIT(rdr.head) }
static void add_active_reader(struct super_block *sb, struct active_reader *active)
{
DECLARE_ITEM_CACHE_INFO(sb, cinf);
BUG_ON(!list_empty(&active->head));
active->seq = scoutfs_trans_sample_seq(sb);
spin_lock(&cinf->active_lock);
list_add_tail(&active->head, &cinf->active_list);
spin_unlock(&cinf->active_lock);
}
static u64 first_active_reader_seq(struct item_cache_info *cinf)
{
struct rb_node **node = &root->rb_node;
struct rb_node *parent = NULL;
struct active_reader *ret = NULL;
struct active_reader *active;
int cmp;
u64 first;
while (*node) {
parent = *node;
active = container_of(*node, struct active_reader, node);
/* only the calling task adds or deletes this active */
spin_lock(&cinf->active_lock);
active = list_first_entry_or_null(&cinf->active_list, struct active_reader, head);
first = active ? active->seq : U64_MAX;
spin_unlock(&cinf->active_lock);
cmp = scoutfs_key_compare_ranges(start, end, &active->start,
&active->end);
if (cmp < 0) {
node = &(*node)->rb_left;
} else if (cmp > 0) {
node = &(*node)->rb_right;
} else {
ret = active;
node = &(*node)->rb_left;
}
return first;
}
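As a concrete illustration with hypothetical numbers: if the oldest active reader sampled transaction seq 100 before walking the persistent roots, a page whose max_liv_seq is 120 may hold items that reader couldn't have seen, so shrink must keep it; a page whose newest item has seq 99 was fully visible to that reader and can be reclaimed.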
static void del_active_reader(struct item_cache_info *cinf, struct active_reader *active)
{
/* only the calling task adds or deletes this active */
if (!list_empty(&active->head)) {
spin_lock(&cinf->active_lock);
list_del_init(&active->head);
spin_unlock(&cinf->active_lock);
}
if (par)
*par = parent;
if (pnode)
*pnode = node;
return ret;
}
/*
@@ -1399,22 +1442,15 @@ static int read_page_item(struct super_block *sb, struct scoutfs_key *key,
* locks held, but without locking the cache. The regions we read can
* be stale with respect to the current cache, which can be read and
* dirtied by other cluster lock holders on our node, but the cluster
* locks protect the stable items we read.
*
* There's also the exciting case where a reader can populate the cache
* with stale old persistent data which was read before another local
* cluster lock holder was able to read, dirty, write, and then shrink
* the cache. In this case the cache couldn't be cleared by lock
* invalidation because the caller is actively holding the lock. But
* shrinking could evict the cache within the held lock. So we record
* that we're an active reader in the range covered by the lock and
* shrink will refuse to reclaim any pages that intersect with our read.
* locks protect the stable items we read. Invalidation is careful not
* to drop pages that have items that we couldn't see because they were
* dirty when we started reading.
*/
static int read_pages(struct super_block *sb, struct item_cache_info *cinf,
struct scoutfs_key *key, struct scoutfs_lock *lock)
{
struct rb_root root = RB_ROOT;
struct active_reader active;
INIT_ACTIVE_READER(active);
struct cached_page *right = NULL;
struct cached_page *pg;
struct cached_page *rd;
@@ -1430,15 +1466,6 @@ static int read_pages(struct super_block *sb, struct item_cache_info *cinf,
int pgi;
int ret;
/* stop shrink from freeing new clean data, would let us cache stale */
active.start = lock->start;
active.end = lock->end;
spin_lock(&cinf->active_lock);
active_rbtree_walk(&cinf->active_root, &active.start, &active.end,
&par, &pnode);
rbtree_insert(&active.node, par, pnode, &cinf->active_root);
spin_unlock(&cinf->active_lock);
/* start with an empty page that covers the whole lock */
pg = alloc_pg(sb, 0);
if (!pg) {
@@ -1449,6 +1476,9 @@ static int read_pages(struct super_block *sb, struct item_cache_info *cinf,
pg->end = lock->end;
rbtree_insert(&pg->node, NULL, &root.rb_node, &root);
/* set active reader seq before reading persistent roots */
add_active_reader(sb, &active);
ret = scoutfs_forest_read_items(sb, lock, key, &start, &end,
read_page_item, &root);
if (ret < 0)
@@ -1526,9 +1556,7 @@ retry:
ret = 0;
out:
spin_lock(&cinf->active_lock);
rbtree_erase(&active.node, &cinf->active_root);
spin_unlock(&cinf->active_lock);
del_active_reader(cinf, &active);
/* free any pages we left dangling on error */
for_each_page_safe(&root, rd, pg_tmp) {
@@ -1830,8 +1858,8 @@ int scoutfs_item_dirty(struct super_block *sb, struct scoutfs_key *key,
if (!item || item->deletion) {
ret = -ENOENT;
} else {
mark_item_dirty(sb, cinf, pg, NULL, item);
item->liv.seq = item_seq(sb, lock);
mark_item_dirty(sb, cinf, pg, NULL, item);
ret = 0;
}
@@ -2406,9 +2434,9 @@ retry:
/*
* Shrink the size the item cache. We're operating against the fast
* path lock ordering and we skip pages if we can't acquire locks.
* Similarly, we can run into dirty pages or pages which intersect with
* active readers that we can't shrink and also choose to skip.
* path lock ordering and we skip pages if we can't acquire locks. We
* can run into dirty pages or pages with items that weren't visible to
* the earliest active reader; those must be skipped as well.
*/
static int item_lru_shrink(struct shrinker *shrink,
struct shrink_control *sc)
@@ -2417,26 +2445,24 @@ static int item_lru_shrink(struct shrinker *shrink,
struct item_cache_info,
shrinker);
struct super_block *sb = cinf->sb;
struct active_reader *active;
struct cached_page *tmp;
struct cached_page *pg;
u64 first_reader_seq;
int nr;
if (sc->nr_to_scan == 0)
goto out;
nr = sc->nr_to_scan;
/* can't invalidate pages with items that weren't visible to first reader */
first_reader_seq = first_active_reader_seq(cinf);
write_lock(&cinf->rwlock);
spin_lock(&cinf->lru_lock);
list_for_each_entry_safe(pg, tmp, &cinf->lru_list, lru_head) {
/* can't invalidate ranges being read, reader might be stale */
spin_lock(&cinf->active_lock);
active = active_rbtree_walk(&cinf->active_root, &pg->start,
&pg->end, NULL, NULL);
spin_unlock(&cinf->active_lock);
if (active) {
if (first_reader_seq <= pg->max_liv_seq) {
scoutfs_inc_counter(sb, item_shrink_page_reader);
continue;
}
@@ -2505,7 +2531,7 @@ int scoutfs_item_setup(struct super_block *sb)
spin_lock_init(&cinf->lru_lock);
INIT_LIST_HEAD(&cinf->lru_list);
spin_lock_init(&cinf->active_lock);
cinf->active_root = RB_ROOT;
INIT_LIST_HEAD(&cinf->active_list);
cinf->pcpu_pages = alloc_percpu(struct item_percpu_pages);
if (!cinf->pcpu_pages)
@@ -2536,7 +2562,7 @@ void scoutfs_item_destroy(struct super_block *sb)
int cpu;
if (cinf) {
BUG_ON(!RB_EMPTY_ROOT(&cinf->active_root));
BUG_ON(!list_empty(&cinf->active_list));
unregister_hotcpu_notifier(&cinf->notifier);
unregister_shrinker(&cinf->shrinker);

View File

@@ -1347,29 +1347,28 @@ int scoutfs_lock_inode_index(struct super_block *sb, enum scoutfs_lock_mode mode
}
/*
* The rid lock protects a mount's private persistent items in the rid
* zone. It's held for the duration of the mount. It lets the mount
* modify the rid items at will and signals to other mounts that we're
* still alive and our rid items shouldn't be reclaimed.
* Orphan items are stored in their own zone. They are modified with
* shared write_only locks and are read inconsistently without locks by
* background scanning work.
*
* Being held for the entire mount prevents other nodes from reclaiming
* our items, like free blocks, when it would make sense for them to be
* able to. Maybe we have a bunch free and they're trying to allocate
* and are getting ENOSPC.
* Since we only use write_only locks we just lock the entire zone, but
* the api provides the inode in case we ever change the locking scheme.
*/
int scoutfs_lock_rid(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
u64 rid, struct scoutfs_lock **lock)
int scoutfs_lock_orphan(struct super_block *sb, enum scoutfs_lock_mode mode, int flags, u64 ino,
struct scoutfs_lock **lock)
{
struct scoutfs_key start;
struct scoutfs_key end;
scoutfs_key_set_zeros(&start);
start.sk_zone = SCOUTFS_RID_ZONE;
start.sko_rid = cpu_to_le64(rid);
start.sk_zone = SCOUTFS_ORPHAN_ZONE;
start.sko_ino = 0;
start.sk_type = SCOUTFS_ORPHAN_TYPE;
scoutfs_key_set_ones(&end);
end.sk_zone = SCOUTFS_RID_ZONE;
end.sko_rid = cpu_to_le64(rid);
scoutfs_key_set_zeros(&end);
end.sk_zone = SCOUTFS_ORPHAN_ZONE;
end.sko_ino = cpu_to_le64(U64_MAX);
end.sk_type = SCOUTFS_ORPHAN_TYPE;
return lock_key_range(sb, mode, flags, &start, &end, lock);
}

View File

@@ -85,8 +85,8 @@ int scoutfs_lock_inodes(struct super_block *sb, enum scoutfs_lock_mode mode, int
struct inode *d, struct scoutfs_lock **D_lock);
int scoutfs_lock_rename(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
struct scoutfs_lock **lock);
int scoutfs_lock_rid(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
u64 rid, struct scoutfs_lock **lock);
int scoutfs_lock_orphan(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
u64 ino, struct scoutfs_lock **lock);
void scoutfs_unlock(struct super_block *sb, struct scoutfs_lock *lock,
enum scoutfs_lock_mode mode);

View File

@@ -595,10 +595,6 @@ out:
free_req(req);
}
/* it's fine if we couldn't send to a client that left */
if (ret == -ENOTCONN)
ret = 0;
return ret;
}
@@ -908,9 +904,9 @@ out:
}
/*
* Return 1 and give the caller a write inode lock if it is safe to be
* deleted. It's safe to be deleted when it is no longer reachable and
* nothing is referencing it.
* Return 1 and give the caller their locks when they should delete the
* inode items. It's safe to delete the inode items when the inode is
* no longer reachable and nothing is referencing it.
*
* The inode is unreachable when nlink hits zero. Cluster locks protect
* modification and testing of nlink. We use the ino_lock_cov coverage
@@ -925,15 +921,17 @@ out:
* increase nlink from zero and let people get a reference to the inode.
*/
int scoutfs_omap_should_delete(struct super_block *sb, struct inode *inode,
struct scoutfs_lock **lock_ret)
struct scoutfs_lock **lock_ret, struct scoutfs_lock **orph_lock_ret)
{
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
struct scoutfs_lock *orph_lock = NULL;
struct scoutfs_lock *lock = NULL;
const u64 ino = scoutfs_ino(inode);
struct scoutfs_omap_lock_data *ldata;
u64 group_nr;
int bit_nr;
int ret;
int err;
/* lock group and omap constants are defined independently */
BUILD_BUG_ON(SCOUTFS_OPEN_INO_MAP_BITS != SCOUTFS_LOCK_INODE_GROUP_NR);
@@ -964,12 +962,19 @@ int scoutfs_omap_should_delete(struct super_block *sb, struct inode *inode,
out:
trace_scoutfs_omap_should_delete(sb, ino, inode->i_nlink, ret);
if (ret > 0) {
err = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, ino, &orph_lock);
if (err < 0)
ret = err;
}
if (ret <= 0) {
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
lock = NULL;
}
*lock_ret = lock;
*orph_lock_ret = orph_lock;
return ret;
}

View File

@@ -4,7 +4,7 @@
int scoutfs_omap_inc(struct super_block *sb, u64 ino);
void scoutfs_omap_dec(struct super_block *sb, u64 ino);
int scoutfs_omap_should_delete(struct super_block *sb, struct inode *inode,
struct scoutfs_lock **lock_ret);
struct scoutfs_lock **lock_ret, struct scoutfs_lock **orph_lock_ret);
void scoutfs_omap_free_lock_data(struct scoutfs_omap_lock_data *ldata);
int scoutfs_omap_client_handle_request(struct super_block *sb, u64 id,
struct scoutfs_open_ino_map_args *args);

View File

@@ -424,14 +424,15 @@ TRACE_EVENT(scoutfs_trans_write_func,
);
DECLARE_EVENT_CLASS(scoutfs_trans_hold_release_class,
TP_PROTO(struct super_block *sb, void *journal_info, int holders),
TP_PROTO(struct super_block *sb, void *journal_info, int holders, int ret),
TP_ARGS(sb, journal_info, holders),
TP_ARGS(sb, journal_info, holders, ret),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(unsigned long, journal_info)
__field(int, holders)
__field(int, ret)
),
TP_fast_assign(
@@ -440,17 +441,17 @@ DECLARE_EVENT_CLASS(scoutfs_trans_hold_release_class,
__entry->holders = holders;
),
TP_printk(SCSBF" journal_info 0x%0lx holders %d",
SCSB_TRACE_ARGS, __entry->journal_info, __entry->holders)
TP_printk(SCSBF" journal_info 0x%0lx holders %d ret %d",
SCSB_TRACE_ARGS, __entry->journal_info, __entry->holders, __entry->ret)
);
DEFINE_EVENT(scoutfs_trans_hold_release_class, scoutfs_trans_acquired_hold,
TP_PROTO(struct super_block *sb, void *journal_info, int holders),
TP_ARGS(sb, journal_info, holders)
DEFINE_EVENT(scoutfs_trans_hold_release_class, scoutfs_hold_trans,
TP_PROTO(struct super_block *sb, void *journal_info, int holders, int ret),
TP_ARGS(sb, journal_info, holders, ret)
);
DEFINE_EVENT(scoutfs_trans_hold_release_class, scoutfs_release_trans,
TP_PROTO(struct super_block *sb, void *journal_info, int holders),
TP_ARGS(sb, journal_info, holders)
TP_PROTO(struct super_block *sb, void *journal_info, int holders, int ret),
TP_ARGS(sb, journal_info, holders, ret)
);
TRACE_EVENT(scoutfs_ioc_release,
@@ -985,22 +986,6 @@ TRACE_EVENT(scoutfs_delete_inode,
__entry->mode, __entry->size)
);
TRACE_EVENT(scoutfs_scan_orphans,
TP_PROTO(struct super_block *sb),
TP_ARGS(sb),
TP_STRUCT__entry(
__field(dev_t, dev)
),
TP_fast_assign(
__entry->dev = sb->s_dev;
),
TP_printk("dev %d,%d", MAJOR(__entry->dev), MINOR(__entry->dev))
);
DECLARE_EVENT_CLASS(scoutfs_key_class,
TP_PROTO(struct super_block *sb, struct scoutfs_key *key),
TP_ARGS(sb, key),

View File

@@ -323,6 +323,7 @@ static void scoutfs_server_commit_func(struct work_struct *work)
struct commit_waiter *cw;
struct commit_waiter *pos;
struct llist_node *node;
u64 reserved;
int ret;
trace_scoutfs_server_commit_work_enter(sb, 0, 0);
@@ -387,11 +388,17 @@ static void scoutfs_server_commit_func(struct work_struct *work)
server->other_avail = &super->server_meta_avail[server->other_ind];
server->other_freed = &super->server_meta_freed[server->other_ind];
/* swap avail/free if avail gets low and freed is high */
if (le64_to_cpu(server->meta_avail->total_len) <=
SCOUTFS_SERVER_META_ALLOC_MIN &&
le64_to_cpu(server->meta_freed->total_len) >
SCOUTFS_SERVER_META_ALLOC_MIN)
/*
* The reserved metadata block count includes the max size of
* outstanding allocators, and a server transaction could be
* asked to refill all those allocators from meta_avail. If our
* meta_avail falls below the reserved count, and freed is still
* above it, then swap so that we don't start returning enospc
* until we're truly low.
*/
reserved = scoutfs_server_reserved_meta_blocks(sb);
if (le64_to_cpu(server->meta_avail->total_len) <= reserved &&
le64_to_cpu(server->meta_freed->total_len) > reserved)
swap(server->meta_avail, server->meta_freed);
ret = 0;
@@ -479,6 +486,57 @@ static int alloc_move_empty(struct super_block *sb,
dst, src, le64_to_cpu(src->total_len), NULL, NULL, 0);
}
/*
* Copy on write transactions need to allocate new dirty blocks as they
* make modifications to delete items and eventually free more blocks.
* The reserved blocks are meant to keep enough available blocks in
* flight to allow servers and clients to perform transactions that
* don't consume additional space. We have quite a few allocators in
* flight across the server and various client mechanisms (posix items,
* srch compaction, and log merging). We also want to include
* sufficient blocks for client log btrees to grow tall enough to be
* finalized and merged.
*
* The reserved blocks calculation is a policy of the server but it's
* exposed to the statfs_more interface so that df isn't misleading.
* Requiring this synchronization without explicit protocol
* communication isn't great.
*/
u64 scoutfs_server_reserved_meta_blocks(struct super_block *sb)
{
DECLARE_SERVER_INFO(sb, server);
u64 server_blocks;
u64 client_blocks;
u64 log_blocks;
u64 nr_clients;
/* server has two meta_avail lists it swaps between */
server_blocks = SCOUTFS_SERVER_META_FILL_TARGET * 2;
/*
* Log trees will be compacted once they hit a height of 3.
* That'll be the grandparent, two parents resulting from a
* split, and all their child blocks (roughly calculated,
* overestimating).
*/
log_blocks = 3 + (SCOUTFS_BLOCK_LG_SIZE /
(sizeof(struct scoutfs_btree_item) + sizeof(struct scoutfs_block_ref)));
/*
* Each client can have a meta_avail list, srch compaction
* request, log merge request, and a log btree it's building.
*/
client_blocks = SCOUTFS_SERVER_META_FILL_TARGET + SCOUTFS_SERVER_META_FILL_TARGET +
SCOUTFS_SERVER_MERGE_FILL_TARGET + log_blocks;
/* we should reserve for voting majority, too */
spin_lock(&server->lock);
nr_clients = server->nr_clients;
spin_unlock(&server->lock);
return server_blocks + (max(1ULL, nr_clients) * client_blocks);
}
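As a rough worked illustration (the fill-target constants are not shown in this diff, so treat them as symbols): with S = SCOUTFS_SERVER_META_FILL_TARGET, M = SCOUTFS_SERVER_MERGE_FILL_TARGET, and L = log_blocks, the function returns 2*S + max(1, nr_clients) * (2*S + M + L); with four mounted clients that is 2*S + 4*(2*S + M + L) reserved metadata blocks.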
/*
* Set all the bits in the destination which overlap with the extent.
*/
@@ -662,6 +720,7 @@ static int server_get_log_trees(struct super_block *sb,
struct scoutfs_log_trees lt;
struct scoutfs_key key;
bool have_fin = false;
bool unlock_alloc = false;
u64 data_zone_blocks;
u64 nr;
int ret;
@@ -701,8 +760,15 @@ static int server_get_log_trees(struct super_block *sb,
lt.nr = cpu_to_le64(nr);
}
/* finalize an existing root when large enough and don't have one */
if (lt.item_root.height > 2 && !have_fin) {
/*
* Finalize the client log btree when it has enough leaf blocks
* to allow some degree of merging concurrency. Smaller btrees
* are also finalized when meta was low so that deleted items
* are merged promptly and freed blocks can bring the client out
* of enospc.
*/
if (!have_fin && ((lt.item_root.height > 2) ||
(le32_to_cpu(lt.meta_avail.flags) & SCOUTFS_ALLOC_FLAG_LOW))) {
fin = lt;
memset(&fin.meta_avail, 0, sizeof(fin.meta_avail));
memset(&fin.meta_freed, 0, sizeof(fin.meta_freed));
@@ -734,24 +800,45 @@ static int server_get_log_trees(struct super_block *sb,
data_zone_blocks = 0;
}
/* return freed to server for emptying, refill avail */
/*
* Reclaim the freed meta and data allocators and refill the
* avail allocators, setting low flags if they drop too low.
*/
mutex_lock(&server->alloc_mutex);
ret = scoutfs_alloc_splice_list(sb, &server->alloc, &server->wri,
server->other_freed,
unlock_alloc = true;
ret = scoutfs_alloc_splice_list(sb, &server->alloc, &server->wri, server->other_freed,
&lt.meta_freed) ?:
alloc_move_empty(sb, &super->data_alloc, &lt.data_freed) ?:
scoutfs_alloc_fill_list(sb, &server->alloc, &server->wri,
&lt.meta_avail, server->meta_avail,
SCOUTFS_SERVER_META_FILL_LO,
SCOUTFS_SERVER_META_FILL_TARGET) ?:
alloc_move_refill_zoned(sb, &lt.data_avail, &super->data_alloc,
SCOUTFS_SERVER_DATA_FILL_LO,
SCOUTFS_SERVER_DATA_FILL_TARGET,
exclusive, vacant, data_zone_blocks);
mutex_unlock(&server->alloc_mutex);
alloc_move_empty(sb, &super->data_alloc, &lt.data_freed);
if (ret < 0)
goto unlock;
ret = scoutfs_alloc_fill_list(sb, &server->alloc, &server->wri,
&lt.meta_avail, server->meta_avail,
SCOUTFS_SERVER_META_FILL_LO,
SCOUTFS_SERVER_META_FILL_TARGET);
if (ret < 0)
goto unlock;
if (le64_to_cpu(server->meta_avail->total_len) <= scoutfs_server_reserved_meta_blocks(sb))
lt.meta_avail.flags |= cpu_to_le32(SCOUTFS_ALLOC_FLAG_LOW);
else
lt.meta_avail.flags &= ~cpu_to_le32(SCOUTFS_ALLOC_FLAG_LOW);
ret = alloc_move_refill_zoned(sb, &lt.data_avail, &super->data_alloc,
SCOUTFS_SERVER_DATA_FILL_LO, SCOUTFS_SERVER_DATA_FILL_TARGET,
exclusive, vacant, data_zone_blocks);
if (ret < 0)
goto unlock;
if (le64_to_cpu(lt.data_avail.total_len) < SCOUTFS_SERVER_DATA_FILL_LO)
lt.data_avail.flags |= cpu_to_le32(SCOUTFS_ALLOC_FLAG_LOW);
else
lt.data_avail.flags &= ~cpu_to_le32(SCOUTFS_ALLOC_FLAG_LOW);
mutex_unlock(&server->alloc_mutex);
unlock_alloc = false;
/* record data alloc zone bits */
zero_data_alloc_zone_bits(&lt);
if (data_zone_blocks != 0) {
@@ -772,6 +859,8 @@ static int server_get_log_trees(struct super_block *sb,
ret = scoutfs_btree_force(sb, &server->alloc, &server->wri,
&super->logs_root, &key, &lt, sizeof(lt));
unlock:
if (unlock_alloc)
mutex_unlock(&server->alloc_mutex);
mutex_unlock(&server->logs_mutex);
ret = scoutfs_server_apply_commit(sb, ret);
@@ -2277,15 +2366,27 @@ int scoutfs_server_send_omap_request(struct super_block *sb, u64 rid,
open_ino_map_response, NULL, NULL);
}
/* The server is sending an omap response to the client */
/*
* The server is sending an omap response to the client that originated
* the request. These responses are sent long after the incoming
* request has pinned the client connection and guaranteed that we'll be
* able to queue a response. This can race with the client connection
* being torn down and it's OK if we drop the response. Either the
 * client is being evicted and we don't care about it anymore or we're
 * tearing down in unmount and the client will resend to the next
* server.
*/
int scoutfs_server_send_omap_response(struct super_block *sb, u64 rid, u64 id,
struct scoutfs_open_ino_map *map, int err)
{
struct server_info *server = SCOUTFS_SB(sb)->server_info;
int ret;
return scoutfs_net_response_node(sb, server->conn, rid,
SCOUTFS_NET_CMD_OPEN_INO_MAP, id, err,
map, sizeof(*map));
ret = scoutfs_net_response_node(sb, server->conn, rid, SCOUTFS_NET_CMD_OPEN_INO_MAP,
id, err, map, sizeof(*map));
if (ret == -ENOTCONN)
ret = 0;
return ret;
}
/* The server is receiving an omap request from the client */

View File

@@ -56,6 +56,8 @@ do { \
__entry->name##_data_len, __entry->name##_cmd, __entry->name##_flags, \
__entry->name##_error
u64 scoutfs_server_reserved_meta_blocks(struct super_block *sb);
int scoutfs_server_lock_request(struct super_block *sb, u64 rid,
struct scoutfs_net_lock *nl);
int scoutfs_server_lock_response(struct super_block *sb, u64 rid, u64 id,
@@ -75,8 +77,6 @@ u64 scoutfs_server_seq(struct super_block *sb);
u64 scoutfs_server_next_seq(struct super_block *sb);
void scoutfs_server_set_seq_if_greater(struct super_block *sb, u64 seq);
struct sockaddr_in;
struct scoutfs_quorum_elected_info;
int scoutfs_server_start(struct super_block *sb, u64 term);
void scoutfs_server_abort(struct super_block *sb);
void scoutfs_server_stop(struct super_block *sb);

View File

@@ -247,11 +247,10 @@ static void scoutfs_put_super(struct super_block *sb)
trace_scoutfs_put_super(sb);
scoutfs_inode_stop(sb);
scoutfs_forest_stop(sb);
scoutfs_srch_destroy(sb);
scoutfs_unlock(sb, sbi->rid_lock, SCOUTFS_LOCK_WRITE);
sbi->rid_lock = NULL;
scoutfs_lock_shutdown(sb);
scoutfs_shutdown_trans(sb);
@@ -623,10 +622,9 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
scoutfs_quorum_setup(sb) ?:
scoutfs_client_setup(sb) ?:
scoutfs_volopt_setup(sb) ?:
scoutfs_lock_rid(sb, SCOUTFS_LOCK_WRITE, 0, sbi->rid,
&sbi->rid_lock) ?:
scoutfs_trans_get_log_trees(sb) ?:
scoutfs_srch_setup(sb);
scoutfs_srch_setup(sb) ?:
scoutfs_inode_start(sb);
if (ret)
goto out;
@@ -647,7 +645,6 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
goto out;
scoutfs_trans_restart_sync_deadline(sb);
// scoutfs_scan_orphans(sb);
ret = 0;
out:
/* on error, generic_shutdown_super calls put_super if s_root */

View File

@@ -36,7 +36,6 @@ struct scoutfs_sb_info {
/* assigned once at the start of each mount, read-only */
u64 rid;
struct scoutfs_lock *rid_lock;
struct scoutfs_super_block super;

View File

@@ -436,8 +436,8 @@ static bool commit_before_hold(struct super_block *sb, struct trans_info *tri)
return true;
}
/* Try to refill data allocator before premature enospc */
if (scoutfs_data_alloc_free_bytes(sb) <= SCOUTFS_TRANS_DATA_ALLOC_LWM) {
/* if we're low and can't refill then alloc could empty and return enospc */
if (scoutfs_data_alloc_should_refill(sb, SCOUTFS_ALLOC_DATA_REFILL_THRESH)) {
scoutfs_inc_counter(sb, trans_commit_data_alloc_low);
return true;
}
@@ -445,38 +445,15 @@ static bool commit_before_hold(struct super_block *sb, struct trans_info *tri)
return false;
}
static bool acquired_hold(struct super_block *sb)
/*
 * Called as a wait_event condition, so it must be careful not to change
 * task state; it races with waking paths that sub_return, test, and
 * wake.
*/
static bool holders_no_writer(struct trans_info *tri)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
DECLARE_TRANS_INFO(sb, tri);
bool acquired;
/* if a caller already has a hold we acquire unconditionally */
if (inc_journal_info_holders()) {
atomic_inc(&tri->holders);
acquired = true;
goto out;
}
/* wait if the writer is blocking holds */
if (!inc_holders_unless_writer(tri)) {
dec_journal_info_holders();
acquired = false;
goto out;
}
/* wait if we're triggering another commit */
if (commit_before_hold(sb, tri)) {
release_holders(sb);
queue_trans_work(sbi);
acquired = false;
goto out;
}
trace_scoutfs_trans_acquired_hold(sb, current->journal_info, atomic_read(&tri->holders));
acquired = true;
out:
return acquired;
smp_mb(); /* make sure task in wait_event queue before atomic read */
return !(atomic_read(&tri->holders) & TRANS_HOLDERS_WRITE_FUNC_BIT);
}
/*
@@ -492,15 +469,64 @@ out:
* The writing thread marks itself as a global trans_task which
* short-circuits all the hold machinery so it can call code that would
* otherwise try to hold transactions while it is writing.
*
* If the caller is adding metadata items that will eventually consume
* free space -- not dirtying existing items or adding deletion items --
* then we can return enospc if our metadata allocator indicates that
* we're low on space.
*/
int scoutfs_hold_trans(struct super_block *sb)
int scoutfs_hold_trans(struct super_block *sb, bool allocing)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
DECLARE_TRANS_INFO(sb, tri);
u64 seq;
int ret;
if (current == sbi->trans_task)
return 0;
return wait_event_interruptible(sbi->trans_hold_wq, acquired_hold(sb));
for (;;) {
/* if a caller already has a hold we acquire unconditionally */
if (inc_journal_info_holders()) {
atomic_inc(&tri->holders);
ret = 0;
break;
}
/* wait until the writer work is finished */
if (!inc_holders_unless_writer(tri)) {
dec_journal_info_holders();
ret = wait_event_interruptible(sbi->trans_hold_wq, holders_no_writer(tri));
if (ret < 0)
break;
continue;
}
/* return enospc if server is into reserved blocks and we're allocating */
if (allocing && scoutfs_alloc_test_flag(sb, &tri->alloc, SCOUTFS_ALLOC_FLAG_LOW)) {
release_holders(sb);
ret = -ENOSPC;
break;
}
/* see if we need to trigger and wait for a commit before holding */
if (commit_before_hold(sb, tri)) {
seq = scoutfs_trans_sample_seq(sb);
release_holders(sb);
queue_trans_work(sbi);
ret = wait_event_interruptible(sbi->trans_hold_wq,
scoutfs_trans_sample_seq(sb) != seq);
if (ret < 0)
break;
continue;
}
ret = 0;
break;
}
trace_scoutfs_hold_trans(sb, current->journal_info, atomic_read(&tri->holders), ret);
return ret;
}
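/*
 * Illustrative caller pattern only (hypothetical helper, error paths
 * trimmed): paths that create brand new items pass allocing as true
 * and can see a hard -ENOSPC once the server has flagged the metadata
 * allocator low, while deletion paths pass false so they can always
 * make progress and free space.
 */
static int example_create_items(struct super_block *sb)
{
	int ret;

	ret = scoutfs_hold_trans(sb, true);
	if (ret < 0)
		return ret;	/* can be -ENOSPC when the allocator is low */

	/* ... create and dirty new items inside the held transaction ... */

	scoutfs_release_trans(sb);
	return 0;
}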
/*
@@ -525,7 +551,7 @@ void scoutfs_release_trans(struct super_block *sb)
release_holders(sb);
trace_scoutfs_release_trans(sb, current->journal_info, atomic_read(&tri->holders));
trace_scoutfs_release_trans(sb, current->journal_info, atomic_read(&tri->holders), 0);
}
/*

View File

@@ -1,18 +1,13 @@
#ifndef _SCOUTFS_TRANS_H_
#define _SCOUTFS_TRANS_H_
/* the server will attempt to fill data allocs for each trans */
#define SCOUTFS_TRANS_DATA_ALLOC_HWM (2ULL * 1024 * 1024 * 1024)
/* the client will force commits if data allocators get too low */
#define SCOUTFS_TRANS_DATA_ALLOC_LWM (256ULL * 1024 * 1024)
void scoutfs_trans_write_func(struct work_struct *work);
int scoutfs_trans_sync(struct super_block *sb, int wait);
int scoutfs_file_fsync(struct file *file, loff_t start, loff_t end,
int datasync);
void scoutfs_trans_restart_sync_deadline(struct super_block *sb);
int scoutfs_hold_trans(struct super_block *sb);
int scoutfs_hold_trans(struct super_block *sb, bool allocing);
bool scoutfs_trans_held(void);
void scoutfs_release_trans(struct super_block *sb);
u64 scoutfs_trans_sample_seq(struct super_block *sb);

View File

@@ -577,7 +577,7 @@ static int scoutfs_xattr_set(struct dentry *dentry, const char *name,
retry:
ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
scoutfs_inode_index_prepare(sb, &ind_locks, inode, false) ?:
scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq);
scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq, true);
if (ret > 0)
goto retry;
if (ret)
@@ -778,7 +778,7 @@ int scoutfs_xattr_drop(struct super_block *sb, u64 ino,
&tgs) != 0)
memset(&tgs, 0, sizeof(tgs));
ret = scoutfs_hold_trans(sb);
ret = scoutfs_hold_trans(sb, false);
if (ret < 0)
break;
release = true;

1
tests/.gitignore vendored
View File

@@ -5,3 +5,4 @@ src/handle_cat
src/bulk_create_paths
src/find_xattrs
src/stage_tmpfile
src/create_xattr_loop

View File

@@ -7,7 +7,8 @@ BIN := src/createmany \
src/handle_cat \
src/bulk_create_paths \
src/stage_tmpfile \
src/find_xattrs
src/find_xattrs \
src/create_xattr_loop
DEPS := $(wildcard src/*.d)

View File

@@ -71,6 +71,7 @@ t_filter_dmesg()
re="$re|scoutfs .* quorum .* error"
re="$re|scoutfs .* error reading quorum block"
re="$re|scoutfs .* error .* writing quorum block"
re="$re|scoutfs .* error .* while checking to delete inode"
egrep -v "($re)"
}

8
tests/golden/enospc Normal file
View File

@@ -0,0 +1,8 @@
== prepare directories and files
== fallocate until enospc
== remove all the files and verify free data blocks
== make small meta fs
== create large xattrs until we fill up metadata
== remove files with xattrs after enospc
== make sure we can create again
== cleanup small meta fs

View File

@@ -0,0 +1,4 @@
== test our inode existence function
== unlinked and opened inodes still exist
== orphan from failed evict deletion is picked up
== orphaned inos in all mounts all deleted

View File

@@ -7,6 +7,7 @@ simple-release-extents.sh
setattr_more.sh
offline-extent-waiting.sh
move-blocks.sh
enospc.sh
srch-basic-functionality.sh
simple-xattr-unit.sh
lock-refleak.sh
@@ -29,6 +30,7 @@ cross-mount-data-free.sh
persistent-item-vers.sh
setup-error-teardown.sh
fence-and-reclaim.sh
orphan-inodes.sh
mount-unmount-race.sh
createmany-parallel-mounts.sh
archive-light-cycle.sh

View File

@@ -0,0 +1,113 @@
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/xattr.h>
#include <ctype.h>
#include <string.h>
#include <errno.h>
#include <limits.h>
static void exit_usage(void)
{
printf(" -h/-? output this usage message and exit\n"
" -c <count> number of xattrs to create\n"
" -n <string> xattr name prefix, -NR is appended\n"
" -p <path> string with path to file with xattrs\n"
" -s <size> xattr value size\n");
exit(1);
}
int main(int argc, char **argv)
{
char *pref = NULL;
char *path = NULL;
char *val;
char *name;
unsigned long long count = 0;
unsigned long long size = 0;
unsigned long long i;
int ret;
int c;
while ((c = getopt(argc, argv, "+c:n:p:s:")) != -1) {
switch (c) {
case 'c':
count = strtoull(optarg, NULL, 0);
break;
case 'n':
pref = strdup(optarg);
break;
case 'p':
path = strdup(optarg);
break;
case 's':
size = strtoull(optarg, NULL, 0);
break;
case '?':
printf("unknown argument: %c\n", optind);
case 'h':
exit_usage();
}
}
if (count == 0) {
printf("specify count of xattrs to create with -c\n");
exit(1);
}
if (count == ULLONG_MAX) {
printf("invalid -c count\n");
exit(1);
}
if (size == 0) {
printf("specify xattrs value size with -s\n");
exit(1);
}
if (size == ULLONG_MAX || size < 2) {
printf("invalid -s size\n");
exit(1);
}
if (path == NULL) {
printf("specify path to file with -p\n");
exit(1);
}
if (pref == NULL) {
printf("specify xattr name prefix string with -n\n");
exit(1);
}
ret = snprintf(NULL, 0, "%s-%llu", pref, ULLONG_MAX) + 1;
name = malloc(ret);
if (!name) {
printf("couldn't allocate xattr name buffer\n");
exit(1);
}
val = malloc(size);
if (!val) {
printf("couldn't allocate xattr value buffer\n");
exit(1);
}
memset(val, 'a', size - 1);
val[size - 1] = '\0';
for (i = 0; i < count; i++) {
sprintf(name, "%s-%llu", pref, i);
ret = setxattr(path, name, val, size, 0);
if (ret) {
printf("returned %d errno %d (%s)\n",
ret, errno, strerror(errno));
return 1;
}
}
return 0;
}

100
tests/tests/enospc.sh Normal file
View File

@@ -0,0 +1,100 @@
#
# test hitting enospc by filling with data or metadata and
# then recovering by removing what we filled.
#
# Type Size Total Used Free Use%
#MetaData 64KB 1048576 32782 1015794 3
# Data 4KB 16777152 0 16777152 0
free_blocks() {
local md="$1"
local mnt="$2"
scoutfs df -p "$mnt" | awk '($1 == "'$md'") { print $5; exit }'
}
t_require_commands scoutfs stat fallocate create_xattr_loop setfattr
echo "== prepare directories and files"
for n in $(t_fs_nrs); do
eval path="\$T_D${n}/dir-$n/file-$n"
mkdir -p $(dirname $path)
touch $path
done
sync
echo "== fallocate until enospc"
before=$(free_blocks Data "$T_M0")
finished=0
while [ $finished != 1 ]; do
for n in $(t_fs_nrs); do
eval path="\$T_D${n}/dir-$n/file-$n"
off=$(stat -c "%s" "$path")
LC_ALL=C fallocate -o $off -l 128MiB "$path" > $T_TMP.fallocate 2>&1
err="$?"
if grep -qi "no space" $T_TMP.fallocate; then
finished=1
break
fi
if [ "$err" != "0" ]; then
t_fail "fallocate failed with $err"
fi
done
done
echo "== remove all the files and verify free data blocks"
for n in $(t_fs_nrs); do
eval dir="\$T_D${n}/dir-$n"
rm -rf "$dir"
done
sync
after=$(free_blocks Data "$T_M0")
# nothing else should be modifying data blocks
test "$before" == "$after" || \
t_fail "$after free data blocks after rm, expected $before"
# XXX this is all pretty manual, would be nice to have helpers
echo "== make small meta fs"
# meta device just big enough for reserves and the metadata we'll fill
scoutfs mkfs -A -f -Q 0,127.0.0.1,53000 -m 10G "$T_EX_META_DEV" "$T_EX_DATA_DEV" > $T_TMP.mkfs.out 2>&1 || \
t_fail "mkfs failed"
SCR="/mnt/scoutfs.enospc"
mkdir -p "$SCR"
mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 \
"$T_EX_DATA_DEV" "$SCR"
echo "== create large xattrs until we fill up metadata"
mkdir -p "$SCR/xattrs"
for f in $(seq 1 100000); do
file="$SCR/xattrs/file-$f"
touch "$file"
LC_ALL=C create_xattr_loop -c 1000 -n user.scoutfs-enospc -p "$file" -s 65535 > $T_TMP.cxl 2>&1
err="$?"
if grep -qi "no space" $T_TMP.cxl; then
echo "enospc at f $f" >> $T_TMP.cxl
break
fi
if [ "$err" != "0" ]; then
t_fail "create_xattr_loop failed with $err"
fi
done
echo "== remove files with xattrs after enospc"
rm -rf "$SCR/xattrs"
echo "== make sure we can create again"
file="$SCR/file-after"
touch $file
setfattr -n user.scoutfs-enospc -v 1 "$file"
sync
rm -f "$file"
echo "== cleanup small meta fs"
umount "$SCR"
rmdir "$SCR"
t_pass

View File

@@ -0,0 +1,77 @@
#
# make sure we clean up orphaned inodes
#
t_require_commands sleep touch sync stat handle_cat kill rm
t_require_mounts 2
#
# usually bash prints an annoying output message when jobs
# are killed. We can avoid that by redirecting stderr for
# the bash process when it reaps the jobs that are killed.
#
silent_kill() {
exec {ERR}>&2 2>/dev/null
kill "$@"
wait "$@"
exec 2>&$ERR {ERR}>&-
}
#
# We don't have a great way to test that inode items still exist. We
# don't prevent opening handles with nlink 0 today, so we'll use that.
# This would have to change to some other method.
#
inode_exists()
{
local ino="$1"
handle_cat "$T_M0" "$ino" > "$T_TMP.handle_cat.log" 2>&1
}
echo "== test our inode existance function"
path="$T_D0/file"
touch "$path"
ino=$(stat -c "%i" "$path")
inode_exists $ino || echo "$ino didn't exist"
echo "== unlinked and opened inodes still exist"
sleep 1000000 < "$path" &
pid="$!"
rm -f "$path"
inode_exists $ino || echo "$ino didn't exist"
echo "== orphan from failed evict deletion is picked up"
# pending kill signal stops evict from getting locks and deleting
silent_kill $pid
sleep 55
inode_exists $ino && echo "$ino still exists"
echo "== orphaned inos in all mounts all deleted"
pids=""
inos=""
for nr in $(t_fs_nrs); do
eval path="\$T_D${nr}/file-$nr"
touch "$path"
inos="$inos $(stat -c %i $path)"
sleep 1000000 < "$path" &
pids="$pids $!"
rm -f "$path"
done
sync
silent_kill $pids
for nr in $(t_fs_nrs); do
t_force_umount $nr
done
t_mount_all
# wait for all fence requests to complete
while test -d $(echo /sys/fs/scoutfs/*/fence/* | cut -d " " -f 1); do
sleep .5
done
# wait for orphan scans to run
sleep 55
for ino in $inos; do
inode_exists $ino && echo "$ino still exists"
done
t_pass

View File

@@ -32,10 +32,18 @@ A path within a ScoutFS filesystem.
.PD
.TP
.BI "mkfs META-DEVICE DATA-DEVICE {-Q|--quorum-slot} NR,ADDR,PORT [-m|--max-meta-size SIZE] [-d|--max-data-size SIZE] [-z|--data-alloc-zone-blocks BLOCKS] [-f|--force]"
.BI "mkfs META-DEVICE DATA-DEVICE {-Q|--quorum-slot} NR,ADDR,PORT [-m|--max-meta-size SIZE] [-d|--max-data-size SIZE] [-z|--data-alloc-zone-blocks BLOCKS] [-f|--force] [-A|--allow-small-size]"
.sp
Initialize a new ScoutFS filesystem on the target devices. Since ScoutFS uses
separate block devices for its metadata and data storage, two are required.
The internal structures and nature of metadata and data transactions
lead to minimum viable device sizes.
.B mkfs
will check both devices and fail with an error if either is under the
minimum size. If
.B --allow-small-size
is given then sizes under the minimum will be
allowed after printing an informational warning.
.sp
If
.B --force
@@ -81,6 +89,10 @@ kibibytes, mebibytes, etc.
.B "-d, --max-data-size SIZE"
Same as previous, but for limiting the size of the data device.
.TP
.B "-A, --allow-small-size"
Allows use of specified device sizes less than the minimum. This can
result in bad behaviour and is only intended for testing.
.TP
.B "-z, --data-alloc-zone-blocks BLOCKS"
Set the data_alloc_zone_blocks volume option, as described in
.BR scoutfs (5).

View File

@@ -6,12 +6,13 @@
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <errno.h>
#include <stdbool.h>
#include "sparse.h"
#include "dev.h"
int device_size(char *path, int fd,
u64 min_size, u64 max_size,
u64 min_size, u64 max_size, bool allow_small_size,
char *use_type, u64 *size_ret)
{
struct stat st;
@@ -63,10 +64,13 @@ int device_size(char *path, int fd,
if (size < min_size) {
fprintf(stderr,
BASE_SIZE_FMT" %s too small for min "
BASE_SIZE_FMT" %s device\n",
BASE_SIZE_FMT" %s device%s\n",
BASE_SIZE_ARGS(size), target_type,
BASE_SIZE_ARGS(min_size), use_type);
return -EINVAL;
BASE_SIZE_ARGS(min_size), use_type,
allow_small_size ? ", allowing with -A" : "");
if (!allow_small_size)
return -EINVAL;
}
*size_ret = size;

View File

@@ -1,6 +1,8 @@
#ifndef _DEV_H_
#define _DEV_H_
#include <stdbool.h>
#define BASE_SIZE_FMT "%.2f%s"
#define BASE_SIZE_ARGS(sz) size_flt(sz, 1), size_str(sz, 1)
@@ -8,7 +10,7 @@
#define SIZE_ARGS(nr, sz) (nr), size_flt(nr, sz), size_str(nr, sz)
int device_size(char *path, int fd,
u64 min_size, u64 max_size,
u64 min_size, u64 max_size, bool allow_small_size,
char *use_type, u64 *size_ret);
float size_flt(u64 nr, unsigned size);
char *size_str(u64 nr, unsigned size);

View File

@@ -86,6 +86,11 @@ static int do_df(struct df_args *args)
data_free += ade[i].blocks;
}
if (meta_free >= sfm.reserved_meta_blocks)
meta_free -= sfm.reserved_meta_blocks;
else
meta_free = 0;
snprintf(cells[0][0], CHARS, "Type");
snprintf(cells[0][1], CHARS, "Size");
snprintf(cells[0][2], CHARS, "Total");

View File

@@ -135,6 +135,7 @@ struct mkfs_args {
unsigned long long max_data_size;
u64 data_alloc_zone_blocks;
bool force;
bool allow_small_size;
int nr_slots;
struct scoutfs_quorum_slot slots[SCOUTFS_QUORUM_MAX_SLOTS];
};
@@ -215,13 +216,15 @@ static int do_mkfs(struct mkfs_args *args)
goto out;
}
ret = device_size(args->meta_device, meta_fd, 2ULL * (1024 * 1024 * 1024),
args->max_meta_size, "meta", &meta_size);
/* minimum meta device size to make reserved blocks reasonably large */
ret = device_size(args->meta_device, meta_fd, 64ULL * (1024 * 1024 * 1024),
args->max_meta_size, args->allow_small_size, "meta", &meta_size);
if (ret)
goto out;
ret = device_size(args->data_device, data_fd, 8ULL * (1024 * 1024 * 1024),
args->max_data_size, "data", &data_size);
/* .. then arbitrarily the same minimum data device size */
ret = device_size(args->data_device, data_fd, 64ULL * (1024 * 1024 * 1024),
args->max_data_size, args->allow_small_size, "data", &data_size);
if (ret)
goto out;
@@ -520,6 +523,9 @@ static int parse_opt(int key, char *arg, struct argp_state *state)
prev_val, args->max_data_size);
break;
}
case 'A':
args->allow_small_size = true;
break;
case 'z': /* data-alloc-zone-blocks */
{
ret = parse_u64(arg, &args->data_alloc_zone_blocks);
@@ -559,6 +565,7 @@ static int parse_opt(int key, char *arg, struct argp_state *state)
static struct argp_option options[] = {
{ "quorum-slot", 'Q', "NR,ADDR,PORT", 0, "Specify quorum slot addresses [Required]"},
{ "force", 'f', NULL, 0, "Overwrite existing data on block devices"},
{ "allow-small-size", 'A', NULL, 0, "Allow specified meta/data devices less than minimum, still warns"},
{ "max-meta-size", 'm', "SIZE", 0, "Use a size less than the base metadata device size (bytes or KMGTP units)"},
{ "max-data-size", 'd', "SIZE", 0, "Use a size less than the base data device size (bytes or KMGTP units)"},
{ "data-alloc-zone-blocks", 'z', "BLOCKS", 0, "Divide data device into block zones so each mounts writes to a zone (4KB blocks)"},

View File

@@ -158,7 +158,7 @@ static print_func_t find_printer(u8 zone, u8 type)
type <= SCOUTFS_INODE_INDEX_DATA_SEQ_TYPE)
return print_inode_index;
if (zone == SCOUTFS_RID_ZONE) {
if (zone == SCOUTFS_ORPHAN_ZONE) {
if (type == SCOUTFS_ORPHAN_TYPE)
return print_orphan;
}
@@ -245,15 +245,15 @@ static int print_logs_item(struct scoutfs_key *key, void *val,
le64_to_cpu((p)->blkno), le64_to_cpu((p)->seq)
#define AL_HEAD_F \
AL_REF_F" total_nr %llu first_nr %u"
AL_REF_F" total_nr %llu first_nr %u flags 0x%x"
#define AL_HEAD_A(p) \
AL_REF_A(&(p)->ref), le64_to_cpu((p)->total_nr),\
le32_to_cpu((p)->first_nr)
le32_to_cpu((p)->first_nr), le32_to_cpu((p)->flags)
#define ALCROOT_F \
BTROOT_F" total_len %llu"
BTROOT_F" total_len %llu flags 0x%x"
#define ALCROOT_A(ar) \
BTROOT_A(&(ar)->root), le64_to_cpu((ar)->total_len)
BTROOT_A(&(ar)->root), le64_to_cpu((ar)->total_len), le32_to_cpu((ar)->flags)
#define SRE_FMT "%016llx.%llu.%llu"
#define SRE_A(sre) \