Reliably delete orphaned inodes

Orphaned inodes haven't been deleted for quite a while -- the call to the
orphan inode scanner has been commented out for ages.  Orphan item
deletion also didn't take rid zone locking into account when we moved
deletion from being strictly local to being performed by whoever last
used the inode.

This reworks orphan item management and brings back orphan inode
scanning to correctly delete orphaned inodes.

We get rid of the rid zone that was always _WRITE locked by each mount.
That made it impossible for other mounts to get a _WRITE lock to delete
orphan items.  Instead we rename it to the orphan zone and have orphan
item callers get _WRITE_ONLY locks inside their inode locks.  Now all
nodes can create and delete orphan items as they have _WRITE locks on
the associated inodes.

Then we refresh the orphan inode scanning function.  It now runs
regularly in the background of all mounts.  It avoids creating cluster
lock contention by finding candidates with unlocked forest hint reads
and by testing inode caches locally and via the open map before properly
locking and trying to delete the inode's items.

Signed-off-by: Zach Brown <zab@versity.com>
Author: Zach Brown
Date: 2021-05-27 15:58:24 -07:00
parent 0374661a92
commit 07210b5734
13 changed files with 287 additions and 156 deletions
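A minimal sketch of the locking pattern this commit establishes, condensed from the dir.c hunks below.  The helper name is hypothetical and error handling is trimmed; the scoutfs_* calls and the WRITE_ONLY-inside-WRITE ordering are the patch's:

/* caller already holds the inode's WRITE cluster lock */
static int orphan_under_inode_lock(struct super_block *sb, struct inode *inode)
{
        struct scoutfs_lock *orph_lock = NULL;
        int ret;

        ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0,
                                  scoutfs_ino(inode), &orph_lock);
        if (ret < 0)
                return ret;

        /* the patch uses the _force item variants under this lock */
        ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock);

        scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);
        return ret;
}

Because every mount can take the WRITE_ONLY orphan lock, any mount can create or delete orphan items, not just the mount that first used the inode.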

View File

@@ -151,6 +151,12 @@
EXPAND_COUNTER(net_recv_invalid_message) \
EXPAND_COUNTER(net_recv_messages) \
EXPAND_COUNTER(net_unknown_request) \
+ EXPAND_COUNTER(orphan_scan) \
+ EXPAND_COUNTER(orphan_scan_cached) \
+ EXPAND_COUNTER(orphan_scan_error) \
+ EXPAND_COUNTER(orphan_scan_item) \
+ EXPAND_COUNTER(orphan_scan_omap_set) \
+ EXPAND_COUNTER(orphan_scan_read) \
EXPAND_COUNTER(quorum_elected) \
EXPAND_COUNTER(quorum_fence_error) \
EXPAND_COUNTER(quorum_fence_leader) \

View File

@@ -669,6 +669,7 @@ static struct inode *lock_hold_create(struct inode *dir, struct dentry *dentry,
umode_t mode, dev_t rdev,
struct scoutfs_lock **dir_lock,
struct scoutfs_lock **inode_lock,
+ struct scoutfs_lock **orph_lock,
struct list_head *ind_locks)
{
struct super_block *sb = dir->i_sb;
@@ -701,6 +702,12 @@ static struct inode *lock_hold_create(struct inode *dir, struct dentry *dentry,
if (ret)
goto out_unlock;
+ if (orph_lock) {
+ ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, ino, orph_lock);
+ if (ret < 0)
+ goto out_unlock;
+ }
retry:
ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
scoutfs_inode_index_prepare(sb, ind_locks, dir, true) ?:
@@ -725,9 +732,13 @@ out_unlock:
if (ret) {
scoutfs_inode_index_unlock(sb, ind_locks);
scoutfs_unlock(sb, *dir_lock, SCOUTFS_LOCK_WRITE);
- scoutfs_unlock(sb, *inode_lock, SCOUTFS_LOCK_WRITE);
*dir_lock = NULL;
+ scoutfs_unlock(sb, *inode_lock, SCOUTFS_LOCK_WRITE);
*inode_lock = NULL;
+ if (orph_lock) {
+ scoutfs_unlock(sb, *orph_lock, SCOUTFS_LOCK_WRITE_ONLY);
+ *orph_lock = NULL;
+ }
inode = ERR_PTR(ret);
}
@@ -752,7 +763,7 @@ static int scoutfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
hash = dirent_name_hash(dentry->d_name.name, dentry->d_name.len);
inode = lock_hold_create(dir, dentry, mode, rdev,
- &dir_lock, &inode_lock, &ind_locks);
+ &dir_lock, &inode_lock, NULL, &ind_locks);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -813,13 +824,15 @@ static int scoutfs_link(struct dentry *old_dentry,
struct super_block *sb = dir->i_sb;
struct scoutfs_lock *dir_lock;
struct scoutfs_lock *inode_lock = NULL;
+ struct scoutfs_lock *orph_lock = NULL;
LIST_HEAD(ind_locks);
- bool del_orphan;
+ bool del_orphan = false;
u64 dir_size;
u64 ind_seq;
u64 hash;
u64 pos;
int ret;
+ int err;
hash = dirent_name_hash(dentry->d_name.name, dentry->d_name.len);
@@ -843,7 +856,14 @@ static int scoutfs_link(struct dentry *old_dentry,
goto out_unlock;
dir_size = i_size_read(dir) + dentry->d_name.len;
- del_orphan = (inode->i_nlink == 0);
+ if (inode->i_nlink == 0) {
+ del_orphan = true;
+ ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, scoutfs_ino(inode),
+ &orph_lock);
+ if (ret < 0)
+ goto out_unlock;
+ }
retry:
ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
@@ -860,7 +880,7 @@ retry:
goto out;
if (del_orphan) {
- ret = scoutfs_orphan_dirty(sb, scoutfs_ino(inode));
+ ret = scoutfs_inode_orphan_delete(sb, scoutfs_ino(inode), orph_lock);
if (ret)
goto out;
}
@@ -871,8 +891,11 @@ retry:
dentry->d_name.name, dentry->d_name.len,
scoutfs_ino(inode), inode->i_mode, dir_lock,
inode_lock);
- if (ret)
+ if (ret) {
+ err = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock);
+ WARN_ON_ONCE(err); /* no orphan, might not scan and delete after crash */
goto out;
+ }
update_dentry_info(sb, dentry, hash, pos, dir_lock);
i_size_write(dir, dir_size);
@@ -880,11 +903,6 @@ retry:
inode->i_ctime = dir->i_mtime;
inc_nlink(inode);
- if (del_orphan) {
- ret = scoutfs_orphan_delete(sb, scoutfs_ino(inode));
- WARN_ON_ONCE(ret);
- }
scoutfs_update_inode_item(inode, inode_lock, &ind_locks);
scoutfs_update_inode_item(dir, dir_lock, &ind_locks);
@@ -896,6 +914,8 @@ out_unlock:
scoutfs_inode_index_unlock(sb, &ind_locks);
scoutfs_unlock(sb, dir_lock, SCOUTFS_LOCK_WRITE);
scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE);
+ scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);
return ret;
}
@@ -920,6 +940,7 @@ static int scoutfs_unlink(struct inode *dir, struct dentry *dentry)
struct inode *inode = dentry->d_inode;
struct timespec ts = current_kernel_time();
struct scoutfs_lock *inode_lock = NULL;
+ struct scoutfs_lock *orph_lock = NULL;
struct scoutfs_lock *dir_lock = NULL;
LIST_HEAD(ind_locks);
u64 ind_seq;
@@ -937,6 +958,13 @@ static int scoutfs_unlink(struct inode *dir, struct dentry *dentry)
goto unlock;
}
+ if (should_orphan(inode)) {
+ ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, scoutfs_ino(inode),
+ &orph_lock);
+ if (ret < 0)
+ goto unlock;
+ }
retry:
ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
scoutfs_inode_index_prepare(sb, &ind_locks, dir, false) ?:
@@ -947,22 +975,19 @@ retry:
if (ret)
goto unlock;
+ if (should_orphan(inode)) {
+ ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock);
+ if (ret < 0)
+ goto out;
+ }
ret = del_entry_items(sb, scoutfs_ino(dir), dentry_info_hash(dentry),
dentry_info_pos(dentry), scoutfs_ino(inode),
dir_lock, inode_lock);
- if (ret)
+ if (ret) {
+ ret = scoutfs_inode_orphan_delete(sb, scoutfs_ino(inode), orph_lock);
+ WARN_ON_ONCE(ret); /* should have been dirty */
goto out;
- if (should_orphan(inode)) {
- /*
- * Insert the orphan item before we modify any inode
- * metadata so we can gracefully exit should it
- * fail.
- */
- ret = scoutfs_orphan_inode(inode);
- WARN_ON_ONCE(ret); /* XXX returning error but items deleted */
- if (ret)
- goto out;
}
dir->i_ctime = ts;
@@ -984,6 +1009,7 @@ unlock:
scoutfs_inode_index_unlock(sb, &ind_locks);
scoutfs_unlock(sb, dir_lock, SCOUTFS_LOCK_WRITE);
scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE);
+ scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);
return ret;
}
@@ -1176,7 +1202,7 @@ static int scoutfs_symlink(struct inode *dir, struct dentry *dentry,
return ret;
inode = lock_hold_create(dir, dentry, S_IFLNK|S_IRWXUGO, 0,
- &dir_lock, &inode_lock, &ind_locks);
+ &dir_lock, &inode_lock, NULL, &ind_locks);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -1535,6 +1561,7 @@ static int scoutfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct scoutfs_lock *new_dir_lock = NULL;
struct scoutfs_lock *old_inode_lock = NULL;
struct scoutfs_lock *new_inode_lock = NULL;
+ struct scoutfs_lock *orph_lock = NULL;
struct timespec now;
bool ins_new = false;
bool del_new = false;
@@ -1599,6 +1626,13 @@ static int scoutfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (ret)
goto out_unlock;
+ if (should_orphan(new_inode)) {
+ ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, scoutfs_ino(new_inode),
+ &orph_lock);
+ if (ret < 0)
+ goto out_unlock;
+ }
retry:
ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
scoutfs_inode_index_prepare(sb, &ind_locks, old_dir, false) ?:
@@ -1658,7 +1692,7 @@ retry:
ins_old = true;
if (should_orphan(new_inode)) {
- ret = scoutfs_orphan_inode(new_inode);
+ ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(new_inode), orph_lock);
if (ret)
goto out;
}
@@ -1762,6 +1796,7 @@ out_unlock:
scoutfs_unlock(sb, old_dir_lock, SCOUTFS_LOCK_WRITE);
scoutfs_unlock(sb, new_dir_lock, SCOUTFS_LOCK_WRITE);
scoutfs_unlock(sb, rename_lock, SCOUTFS_LOCK_WRITE);
+ scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);
return ret;
}
@@ -1781,6 +1816,7 @@ static int scoutfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mod
struct inode *inode = NULL;
struct scoutfs_lock *dir_lock = NULL;
struct scoutfs_lock *inode_lock = NULL;
+ struct scoutfs_lock *orph_lock = NULL;
LIST_HEAD(ind_locks);
int ret;
@@ -1788,25 +1824,32 @@ static int scoutfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mod
return -ENAMETOOLONG;
inode = lock_hold_create(dir, dentry, mode, 0,
- &dir_lock, &inode_lock, &ind_locks);
+ &dir_lock, &inode_lock, &orph_lock, &ind_locks);
if (IS_ERR(inode))
return PTR_ERR(inode);
+ ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock);
+ if (ret < 0) {
+ iput(inode);
+ goto out; /* XXX returning error but items created */
+ }
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
insert_inode_hash(inode);
ihold(inode); /* need to update inode modifications in d_tmpfile */
d_tmpfile(dentry, inode);
scoutfs_update_inode_item(inode, inode_lock, &ind_locks);
scoutfs_update_inode_item(dir, dir_lock, &ind_locks);
- scoutfs_inode_index_unlock(sb, &ind_locks);
iput(inode);
- ret = scoutfs_orphan_inode(inode);
- WARN_ON_ONCE(ret); /* XXX returning error but items deleted */
+ out:
scoutfs_release_trans(sb);
+ scoutfs_inode_index_unlock(sb, &ind_locks);
scoutfs_unlock(sb, dir_lock, SCOUTFS_LOCK_WRITE);
scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE);
+ scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);
return ret;
}

View File

@@ -570,7 +570,7 @@ struct scoutfs_log_merge_freeing {
* Keys are first sorted by major key zones.
*/
#define SCOUTFS_INODE_INDEX_ZONE 1
- #define SCOUTFS_RID_ZONE 2
+ #define SCOUTFS_ORPHAN_ZONE 2
#define SCOUTFS_FS_ZONE 3
#define SCOUTFS_LOCK_ZONE 4
/* Items only stored in server btrees */
@@ -592,7 +592,7 @@ struct scoutfs_log_merge_freeing {
#define SCOUTFS_INODE_INDEX_DATA_SEQ_TYPE 2
#define SCOUTFS_INODE_INDEX_NR 3 /* don't forget to update */
- /* rid zone (also used in server alloc btree) */
+ /* orphan zone, redundant type used for clarity */
#define SCOUTFS_ORPHAN_TYPE 1
/* fs zone */

View File

@@ -34,6 +34,7 @@
#include "client.h"
#include "cmp.h"
#include "omap.h"
#include "forest.h"
/*
* XXX
@@ -54,10 +55,15 @@ struct inode_allocator {
};
struct inode_sb_info {
+ struct super_block *sb;
+ bool stopped;
spinlock_t writeback_lock;
struct rb_root writeback_inodes;
struct inode_allocator dir_ino_alloc;
struct inode_allocator ino_alloc;
+ struct delayed_work orphan_scan_dwork;
};
#define DECLARE_INODE_SB_INFO(sb, name) \
@@ -1437,41 +1443,36 @@ out:
return inode;
}
- static void init_orphan_key(struct scoutfs_key *key, u64 rid, u64 ino)
+ static void init_orphan_key(struct scoutfs_key *key, u64 ino)
{
*key = (struct scoutfs_key) {
- .sk_zone = SCOUTFS_RID_ZONE,
- .sko_rid = cpu_to_le64(rid),
- .sk_type = SCOUTFS_ORPHAN_TYPE,
+ .sk_zone = SCOUTFS_ORPHAN_ZONE,
+ .sko_ino = cpu_to_le64(ino),
+ .sk_type = SCOUTFS_ORPHAN_TYPE,
};
}
- int scoutfs_orphan_dirty(struct super_block *sb, u64 ino)
+ /*
+ * Create an orphan item.  The orphan items are maintained in their own
+ * zone under a write only lock while the caller has the inode protected
+ * by a write lock.
+ */
+ int scoutfs_inode_orphan_create(struct super_block *sb, u64 ino, struct scoutfs_lock *lock)
{
- struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
- struct scoutfs_lock *lock = sbi->rid_lock;
struct scoutfs_key key;
- init_orphan_key(&key, sbi->rid, ino);
+ init_orphan_key(&key, ino);
- return scoutfs_item_dirty(sb, &key, lock);
+ return scoutfs_item_create_force(sb, &key, NULL, 0, lock);
}
- int scoutfs_orphan_delete(struct super_block *sb, u64 ino)
+ int scoutfs_inode_orphan_delete(struct super_block *sb, u64 ino, struct scoutfs_lock *lock)
{
- struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
- struct scoutfs_lock *lock = sbi->rid_lock;
struct scoutfs_key key;
- int ret;
- init_orphan_key(&key, sbi->rid, ino);
+ init_orphan_key(&key, ino);
- ret = scoutfs_item_delete(sb, &key, lock);
- if (ret == -ENOENT)
- ret = 0;
- return ret;
+ return scoutfs_item_delete_force(sb, &key, lock);
}
/*
@@ -1483,7 +1484,8 @@ int scoutfs_orphan_delete(struct super_block *sb, u64 ino)
* partial deletion until all deletion is complete and the orphan item
* is removed.
*/
- static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_lock *lock)
+ static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_lock *lock,
+ struct scoutfs_lock *orph_lock)
{
struct scoutfs_inode sinode;
struct scoutfs_key key;
@@ -1553,7 +1555,7 @@ retry:
if (ret)
goto out;
- ret = scoutfs_orphan_delete(sb, ino);
+ ret = scoutfs_inode_orphan_delete(sb, ino, orph_lock);
out:
if (release)
scoutfs_release_trans(sb);
@@ -1573,6 +1575,7 @@ void scoutfs_evict_inode(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
const u64 ino = scoutfs_ino(inode);
+ struct scoutfs_lock *orph_lock;
struct scoutfs_lock *lock;
int ret;
@@ -1584,10 +1587,11 @@ void scoutfs_evict_inode(struct inode *inode)
truncate_inode_pages_final(&inode->i_data);
- ret = scoutfs_omap_should_delete(sb, inode, &lock);
+ ret = scoutfs_omap_should_delete(sb, inode, &lock, &orph_lock);
if (ret > 0) {
- ret = delete_inode_items(inode->i_sb, scoutfs_ino(inode), lock);
+ ret = delete_inode_items(inode->i_sb, scoutfs_ino(inode), lock, orph_lock);
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
+ scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);
}
if (ret < 0)
scoutfs_err(sb, "error %d while checking to delete inode nr %llu, it might linger.",
@@ -1626,75 +1630,141 @@ int scoutfs_drop_inode(struct inode *inode)
}
- /*
- * Find orphan items and process each one.
- *
- * Runtime of this will be bounded by the number of orphans, which could
- * theoretically be very large. If that becomes a problem we might want to push
- * this work off to a thread.
- *
- * This only scans orphans for this node. This will need to be covered by
- * the rest of node zone cleanup.
- */
- int scoutfs_scan_orphans(struct super_block *sb)
- {
- struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
- struct scoutfs_lock *lock = sbi->rid_lock;
- struct scoutfs_lock *inode_lock = NULL;
- struct scoutfs_key key;
- struct scoutfs_key last;
- u64 ino;
- int err = 0;
- int ret;
- trace_scoutfs_scan_orphans(sb);
- init_orphan_key(&key, sbi->rid, 0);
- init_orphan_key(&last, sbi->rid, ~0ULL);
- while (1) {
- ret = scoutfs_item_next(sb, &key, &last, NULL, 0, lock);
- if (ret == -ENOENT) /* No more orphan items */
- break;
- if (ret < 0)
- goto out;
- ino = le64_to_cpu(key.sko_ino);
- ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_WRITE, 0, ino, &inode_lock);
- if (ret == 0) {
- ret = delete_inode_items(sb, le64_to_cpu(key.sko_ino), inode_lock);
- scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE);
- }
- if (ret && ret != -ENOENT && !err)
- err = ret;
- if (le64_to_cpu(key.sko_ino) == U64_MAX) {
- ret = -ENOENT;
- break;
- }
- le64_add_cpu(&key.sko_ino, 1);
- }
- out:
- return err ? err : ret;
- }
- int scoutfs_orphan_inode(struct inode *inode)
- {
- struct super_block *sb = inode->i_sb;
- struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
- struct scoutfs_lock *lock = sbi->rid_lock;
- struct scoutfs_key key;
- int ret;
- trace_scoutfs_orphan_inode(sb, inode);
- init_orphan_key(&key, sbi->rid, scoutfs_ino(inode));
- ret = scoutfs_item_create(sb, &key, NULL, 0, lock);
- return ret;
- }
+ /*
+ * All mounts are performing this work concurrently.  We introduce
+ * significant jitter between them to try and keep them from all
+ * bunching up and working on the same inodes.
+ */
+ static void schedule_orphan_dwork(struct inode_sb_info *inf)
+ {
+ #define ORPHAN_SCAN_MIN_MS (10 * MSEC_PER_SEC)
+ #define ORPHAN_SCAN_JITTER_MS (40 * MSEC_PER_SEC)
+ unsigned long delay;
+
+ if (!inf->stopped) {
+ delay = msecs_to_jiffies(ORPHAN_SCAN_MIN_MS +
+ prandom_u32_max(ORPHAN_SCAN_JITTER_MS));
+ schedule_delayed_work(&inf->orphan_scan_dwork, delay);
+ }
+ }
+ /*
+ * Find and delete inodes whose only remaining reference is the
+ * persistent orphan item that was created as they were unlinked.
+ *
+ * Orphan items are created as the final directory entry referring to an
+ * inode is deleted.  They're deleted as the final cached inode is
+ * evicted and the inode items are destroyed.  They can linger if all
+ * the cached inodes pinning the inode fail to delete as they are
+ * evicted from the cache -- either through crashing or errors.
+ *
+ * This work runs in all mounts in the background looking for orphaned
+ * inodes that should be deleted.
+ *
+ * We use the forest hint call to read the persistent forest trees
+ * looking for orphan items without creating lock contention.  Orphan
+ * items exist for O_TMPFILE users and we don't want to force them to
+ * commit by trying to acquire a conflicting read lock on the orphan
+ * zone.  There's no rush to reclaim deleted items, eventually they will
+ * be found in the persistent item btrees.
+ *
+ * Once we find candidate orphan items we can first check our local
+ * inode cache for inodes that are already on their way to eviction and
+ * can be skipped.  Then we ask the server for the open map containing
+ * the inode.  Only if we don't have it cached, and no one else does, do
+ * we try and read it into our cache and evict it to trigger the final
+ * inode deletion process.
+ *
+ * Orphaned items that make it that far should be very rare.  They can
+ * only exist if all the mounts that were using an inode after it had
+ * been unlinked (or created with O_TMPFILE) didn't unmount cleanly.
+ */
+ static void inode_orphan_scan_worker(struct work_struct *work)
+ {
+ struct inode_sb_info *inf = container_of(work, struct inode_sb_info,
+ orphan_scan_dwork.work);
+ struct super_block *sb = inf->sb;
+ struct scoutfs_open_ino_map omap;
+ struct scoutfs_key last;
+ struct scoutfs_key next;
+ struct scoutfs_key key;
+ struct inode *inode;
+ u64 group_nr;
+ int bit_nr;
+ u64 ino;
+ int ret;
+
+ scoutfs_inc_counter(sb, orphan_scan);
+
+ init_orphan_key(&last, U64_MAX);
+ omap.args.group_nr = cpu_to_le64(U64_MAX);
+
+ for (ino = SCOUTFS_ROOT_INO + 1; ino != 0; ino++) {
+ if (inf->stopped) {
+ ret = 0;
+ goto out;
+ }
+
+ /* find the next orphan item */
+ init_orphan_key(&key, ino);
+ ret = scoutfs_forest_next_hint(sb, &key, &next);
+ if (ret < 0) {
+ if (ret == -ENOENT)
+ break;
+ goto out;
+ }
+ if (scoutfs_key_compare(&next, &last) > 0)
+ break;
+
+ scoutfs_inc_counter(sb, orphan_scan_item);
+ ino = le64_to_cpu(next.sko_ino);
+
+ /* locally cached inodes will already be deleted */
+ inode = scoutfs_ilookup(sb, ino);
+ if (inode) {
+ scoutfs_inc_counter(sb, orphan_scan_cached);
+ iput(inode);
+ continue;
+ }
+
+ /* get an omap that covers the orphaned ino */
+ group_nr = ino >> SCOUTFS_OPEN_INO_MAP_SHIFT;
+ bit_nr = ino & SCOUTFS_OPEN_INO_MAP_MASK;
+ if (le64_to_cpu(omap.args.group_nr) != group_nr) {
+ ret = scoutfs_client_open_ino_map(sb, group_nr, &omap);
+ if (ret < 0)
+ goto out;
+ }
+
+ /* don't need to evict if someone else has it open (cached) */
+ if (test_bit_le(bit_nr, omap.bits)) {
+ scoutfs_inc_counter(sb, orphan_scan_omap_set);
+ continue;
+ }
+
+ /* try to cache and evict unused inode to delete, can be racing */
+ inode = scoutfs_iget(sb, ino);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ if (ret == -ENOENT)
+ continue;
+ else
+ goto out;
+ }
+
+ scoutfs_inc_counter(sb, orphan_scan_read);
+ SCOUTFS_I(inode)->drop_invalidated = true;
+ iput(inode);
+ }
+
+ ret = 0;
+ out:
+ if (ret < 0)
+ scoutfs_inc_counter(sb, orphan_scan_error);
+
+ schedule_orphan_dwork(inf);
+ }
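With the constants above, each mount re-arms the scan after a uniformly random delay between 10 and 50 seconds: ORPHAN_SCAN_MIN_MS (10s) plus up to ORPHAN_SCAN_JITTER_MS (40s) of jitter.  Note that the worker never deletes items itself -- it marks the freshly read inode with drop_invalidated and drops its reference, so the normal eviction path (scoutfs_evict_inode() -> scoutfs_omap_should_delete() -> delete_inode_items(), patched above) performs the deletion under the proper inode and orphan locks.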
/*
@@ -1803,16 +1873,41 @@ int scoutfs_inode_setup(struct super_block *sb)
if (!inf)
return -ENOMEM;
+ inf->sb = sb;
spin_lock_init(&inf->writeback_lock);
inf->writeback_inodes = RB_ROOT;
spin_lock_init(&inf->dir_ino_alloc.lock);
spin_lock_init(&inf->ino_alloc.lock);
+ INIT_DELAYED_WORK(&inf->orphan_scan_dwork, inode_orphan_scan_worker);
sbi->inode_sb_info = inf;
return 0;
}
+ /*
+ * Our inode subsystem is set up pretty early but orphan scanning uses
+ * many other subsystems like networking and the server.  We only kick
+ * it off once everything is ready.
+ */
+ int scoutfs_inode_start(struct super_block *sb)
+ {
+ DECLARE_INODE_SB_INFO(sb, inf);
+
+ schedule_orphan_dwork(inf);
+
+ return 0;
+ }
+
+ void scoutfs_inode_stop(struct super_block *sb)
+ {
+ DECLARE_INODE_SB_INFO(sb, inf);
+
+ if (inf) {
+ inf->stopped = true;
+ cancel_delayed_work_sync(&inf->orphan_scan_dwork);
+ }
+ }
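scoutfs_inode_start() and scoutfs_inode_stop() bracket the scanner's lifetime: as the super.c hunks below show, start runs as the final step of fill_super once networking and the server are up, and put_super stops the scanner first, where the stopped flag plus cancel_delayed_work_sync() ensure no scan work is left running or re-armed during teardown.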
void scoutfs_inode_destroy(struct super_block *sb)
{
struct inode_sb_info *inf = SCOUTFS_SB(sb)->inode_sb_info;

View File

@@ -75,7 +75,6 @@ struct inode *scoutfs_alloc_inode(struct super_block *sb);
void scoutfs_destroy_inode(struct inode *inode);
int scoutfs_drop_inode(struct inode *inode);
void scoutfs_evict_inode(struct inode *inode);
- int scoutfs_orphan_inode(struct inode *inode);
struct inode *scoutfs_iget(struct super_block *sb, u64 ino);
struct inode *scoutfs_ilookup(struct super_block *sb, u64 ino);
@@ -120,9 +119,8 @@ int scoutfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat);
int scoutfs_setattr(struct dentry *dentry, struct iattr *attr);
- int scoutfs_scan_orphans(struct super_block *sb);
- int scoutfs_orphan_dirty(struct super_block *sb, u64 ino);
- int scoutfs_orphan_delete(struct super_block *sb, u64 ino);
+ int scoutfs_inode_orphan_create(struct super_block *sb, u64 ino, struct scoutfs_lock *lock);
+ int scoutfs_inode_orphan_delete(struct super_block *sb, u64 ino, struct scoutfs_lock *lock);
void scoutfs_inode_queue_writeback(struct inode *inode);
int scoutfs_inode_walk_writeback(struct super_block *sb, bool write);
@@ -133,6 +131,8 @@ void scoutfs_inode_exit(void);
int scoutfs_inode_init(void);
int scoutfs_inode_setup(struct super_block *sb);
+ int scoutfs_inode_start(struct super_block *sb);
+ void scoutfs_inode_stop(struct super_block *sb);
void scoutfs_inode_destroy(struct super_block *sb);
#endif

View File

@@ -1347,29 +1347,28 @@ int scoutfs_lock_inode_index(struct super_block *sb, enum scoutfs_lock_mode mode
}
/*
- * The rid lock protects a mount's private persistent items in the rid
- * zone. It's held for the duration of the mount. It lets the mount
- * modify the rid items at will and signals to other mounts that we're
- * still alive and our rid items shouldn't be reclaimed.
+ * Orphan items are stored in their own zone.  They're modified with
+ * shared write_only locks and are read inconsistently without locks by
+ * background scanning work.
*
- * Being held for the entire mount prevents other nodes from reclaiming
- * our items, like free blocks, when it would make sense for them to be
- * able to. Maybe we have a bunch free and they're trying to allocate
- * and are getting ENOSPC.
+ * Since we only use write_only locks we just lock the entire zone, but
+ * the api provides the inode in case we ever change the locking scheme.
*/
- int scoutfs_lock_rid(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
- u64 rid, struct scoutfs_lock **lock)
+ int scoutfs_lock_orphan(struct super_block *sb, enum scoutfs_lock_mode mode, int flags, u64 ino,
+ struct scoutfs_lock **lock)
{
struct scoutfs_key start;
struct scoutfs_key end;
scoutfs_key_set_zeros(&start);
- start.sk_zone = SCOUTFS_RID_ZONE;
- start.sko_rid = cpu_to_le64(rid);
+ start.sk_zone = SCOUTFS_ORPHAN_ZONE;
+ start.sko_ino = 0;
+ start.sk_type = SCOUTFS_ORPHAN_TYPE;
- scoutfs_key_set_ones(&end);
- end.sk_zone = SCOUTFS_RID_ZONE;
- end.sko_rid = cpu_to_le64(rid);
+ scoutfs_key_set_zeros(&end);
+ end.sk_zone = SCOUTFS_ORPHAN_ZONE;
+ end.sko_ino = cpu_to_le64(U64_MAX);
+ end.sk_type = SCOUTFS_ORPHAN_TYPE;
return lock_key_range(sb, mode, flags, &start, &end, lock);
}
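Since start is the zone's smallest orphan key and end its largest, every caller locks the same zone-wide range no matter which ino it passes; the ino argument only becomes meaningful if the locking scheme is ever narrowed, as the comment notes.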

View File

@@ -85,8 +85,8 @@ int scoutfs_lock_inodes(struct super_block *sb, enum scoutfs_lock_mode mode, int
struct inode *d, struct scoutfs_lock **D_lock);
int scoutfs_lock_rename(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
struct scoutfs_lock **lock);
- int scoutfs_lock_rid(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
- u64 rid, struct scoutfs_lock **lock);
+ int scoutfs_lock_orphan(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
+ u64 ino, struct scoutfs_lock **lock);
void scoutfs_unlock(struct super_block *sb, struct scoutfs_lock *lock,
enum scoutfs_lock_mode mode);

View File

@@ -908,9 +908,9 @@ out:
}
/*
- * Return 1 and give the caller a write inode lock if it is safe to be
- * deleted. It's safe to be deleted when it is no longer reachable and
- * nothing is referencing it.
+ * Return 1 and give the caller their locks when they should delete the
+ * inode items. It's safe to delete the inode items when it is no
+ * longer reachable and nothing is referencing it.
*
* The inode is unreachable when nlink hits zero. Cluster locks protect
* modification and testing of nlink.  We use the ino_lock_cov coverage
@@ -925,15 +925,17 @@ out:
* increase nlink from zero and let people get a reference to the inode.
*/
int scoutfs_omap_should_delete(struct super_block *sb, struct inode *inode,
- struct scoutfs_lock **lock_ret)
+ struct scoutfs_lock **lock_ret, struct scoutfs_lock **orph_lock_ret)
{
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
+ struct scoutfs_lock *orph_lock = NULL;
struct scoutfs_lock *lock = NULL;
const u64 ino = scoutfs_ino(inode);
struct scoutfs_omap_lock_data *ldata;
u64 group_nr;
int bit_nr;
int ret;
+ int err;
/* lock group and omap constants are defined independently */
BUILD_BUG_ON(SCOUTFS_OPEN_INO_MAP_BITS != SCOUTFS_LOCK_INODE_GROUP_NR);
@@ -964,12 +966,19 @@ int scoutfs_omap_should_delete(struct super_block *sb, struct inode *inode,
out:
trace_scoutfs_omap_should_delete(sb, ino, inode->i_nlink, ret);
+ if (ret > 0) {
+ err = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, ino, &orph_lock);
+ if (err < 0)
+ ret = err;
+ }
if (ret <= 0) {
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
lock = NULL;
}
*lock_ret = lock;
+ *orph_lock_ret = orph_lock;
return ret;
}
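The orphan lock is only acquired on the path that returns 1, after the inode has already been judged deletable; if scoutfs_lock_orphan() fails the error overwrites ret, so a caller never attempts deletion without holding the orphan lock, and on the 0 and error paths both returned locks come back NULL.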

View File

@@ -4,7 +4,7 @@
int scoutfs_omap_inc(struct super_block *sb, u64 ino);
void scoutfs_omap_dec(struct super_block *sb, u64 ino);
int scoutfs_omap_should_delete(struct super_block *sb, struct inode *inode,
- struct scoutfs_lock **lock_ret);
+ struct scoutfs_lock **lock_ret, struct scoutfs_lock **orph_lock_ret);
void scoutfs_omap_free_lock_data(struct scoutfs_omap_lock_data *ldata);
int scoutfs_omap_client_handle_request(struct super_block *sb, u64 id,
struct scoutfs_open_ino_map_args *args);

View File

@@ -985,22 +985,6 @@ TRACE_EVENT(scoutfs_delete_inode,
__entry->mode, __entry->size)
);
- TRACE_EVENT(scoutfs_scan_orphans,
- TP_PROTO(struct super_block *sb),
- TP_ARGS(sb),
- TP_STRUCT__entry(
- __field(dev_t, dev)
- ),
- TP_fast_assign(
- __entry->dev = sb->s_dev;
- ),
- TP_printk("dev %d,%d", MAJOR(__entry->dev), MINOR(__entry->dev))
- );
DECLARE_EVENT_CLASS(scoutfs_key_class,
TP_PROTO(struct super_block *sb, struct scoutfs_key *key),
TP_ARGS(sb, key),

View File

@@ -247,11 +247,9 @@ static void scoutfs_put_super(struct super_block *sb)
trace_scoutfs_put_super(sb);
+ scoutfs_inode_stop(sb);
scoutfs_srch_destroy(sb);
- scoutfs_unlock(sb, sbi->rid_lock, SCOUTFS_LOCK_WRITE);
- sbi->rid_lock = NULL;
scoutfs_lock_shutdown(sb);
scoutfs_shutdown_trans(sb);
@@ -623,10 +621,9 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
scoutfs_quorum_setup(sb) ?:
scoutfs_client_setup(sb) ?:
scoutfs_volopt_setup(sb) ?:
- scoutfs_lock_rid(sb, SCOUTFS_LOCK_WRITE, 0, sbi->rid,
- &sbi->rid_lock) ?:
scoutfs_trans_get_log_trees(sb) ?:
- scoutfs_srch_setup(sb);
+ scoutfs_srch_setup(sb) ?:
+ scoutfs_inode_start(sb);
if (ret)
goto out;
@@ -647,7 +644,6 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
goto out;
scoutfs_trans_restart_sync_deadline(sb);
- // scoutfs_scan_orphans(sb);
ret = 0;
out:
/* on error, generic_shutdown_super calls put_super if s_root */

View File

@@ -36,7 +36,6 @@ struct scoutfs_sb_info {
/* assigned once at the start of each mount, read-only */
u64 rid;
- struct scoutfs_lock *rid_lock;
struct scoutfs_super_block super;

View File

@@ -158,7 +158,7 @@ static print_func_t find_printer(u8 zone, u8 type)
type <= SCOUTFS_INODE_INDEX_DATA_SEQ_TYPE)
return print_inode_index;
- if (zone == SCOUTFS_RID_ZONE) {
+ if (zone == SCOUTFS_ORPHAN_ZONE) {
if (type == SCOUTFS_ORPHAN_TYPE)
return print_orphan;
}