Merge pull request #76 from versity/zab/inode_deletion_fixes

Zab/inode deletion fixes
2026-05-02 19:05:43 +00:00 · 2022-03-11 16:23:21 -08:00
parent c5068efef0 bddca171ee
commit f2679d9598
26 changed files with 986 additions and 628 deletions
--- a/kmod/src/client.c
+++ b/kmod/src/client.c
@@ -477,12 +477,15 @@ static void scoutfs_client_connect_worker(struct work_struct *work)
 	struct super_block *sb = client->sb;
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct scoutfs_super_block *super = &sbi->super;
-	struct mount_options *opts = &sbi->opts;
-	const bool am_quorum = opts->quorum_slot_nr >= 0;
+	struct scoutfs_mount_options opts;
 	struct scoutfs_net_greeting greet;
 	struct sockaddr_in sin;
+	bool am_quorum;
 	int ret;

+	scoutfs_options_read(sb, &opts);
+	am_quorum = opts.quorum_slot_nr >= 0;
+
 	/* can unmount once server farewell handling removes our item */
 	if (client->sending_farewell &&
 	    lookup_mounted_client_item(sb, sbi->rid) == 0) {
--- a/kmod/src/counters.h
+++ b/kmod/src/counters.h
@@ -152,11 +152,11 @@
 	EXPAND_COUNTER(net_recv_messages)			\
 	EXPAND_COUNTER(net_unknown_request)			\
 	EXPAND_COUNTER(orphan_scan)				\
+	EXPAND_COUNTER(orphan_scan_attempts)			\
 	EXPAND_COUNTER(orphan_scan_cached)			\
 	EXPAND_COUNTER(orphan_scan_error)			\
 	EXPAND_COUNTER(orphan_scan_item)			\
 	EXPAND_COUNTER(orphan_scan_omap_set)			\
-	EXPAND_COUNTER(orphan_scan_read)			\
 	EXPAND_COUNTER(quorum_elected)				\
 	EXPAND_COUNTER(quorum_fence_error)			\
 	EXPAND_COUNTER(quorum_fence_leader)			\
--- a/kmod/src/dir.c
+++ b/kmod/src/dir.c
@@ -720,7 +720,7 @@ static struct inode *lock_hold_create(struct inode *dir, struct dentry *dentry,
 				      struct list_head *ind_locks)
 {
 	struct super_block *sb = dir->i_sb;
-	struct inode *inode;
+	struct inode *inode = NULL;
 	u64 ind_seq;
 	int ret = 0;
 	u64 ino;
@@ -765,11 +765,9 @@ retry:
 	if (ret)
 		goto out_unlock;

-	inode = scoutfs_new_inode(sb, dir, mode, rdev, ino, *inode_lock);
-	if (IS_ERR(inode)) {
-		ret = PTR_ERR(inode);
+	ret = scoutfs_new_inode(sb, dir, mode, rdev, ino, *inode_lock, &inode);
+	if (ret < 0)
 		goto out;
-	}

 	ret = scoutfs_dirty_inode_item(dir, *dir_lock);
 out:
@@ -787,6 +785,8 @@ out_unlock:
 			*orph_lock = NULL;
 		}

+		if (!IS_ERR_OR_NULL(inode))
+			iput(inode);
 		inode = ERR_PTR(ret);
 	}

@@ -1319,11 +1319,11 @@ static int scoutfs_symlink(struct inode *dir, struct dentry *dentry,
 	insert_inode_hash(inode);
 	/* XXX need to set i_op/fop before here for sec callbacks */
 	d_instantiate(dentry, inode);
+	inode = NULL;
+	ret = 0;
 out:
 	if (ret < 0) {
 		/* XXX remove inode items */
-		if (!IS_ERR_OR_NULL(inode))
-			iput(inode);

 		symlink_item_ops(sb, SYM_DELETE, scoutfs_ino(inode), inode_lock,
 				 NULL, name_len);
@@ -1334,6 +1334,9 @@ out:
 	scoutfs_unlock(sb, dir_lock, SCOUTFS_LOCK_WRITE);
 	scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE);

+	if (!IS_ERR_OR_NULL(inode))
+		iput(inode);
+
 	return ret;
 }

@@ -1923,10 +1926,8 @@ static int scoutfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mod
 	si = SCOUTFS_I(inode);

 	ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock);
-	if (ret < 0) {
-		iput(inode);
+	if (ret < 0)
 		goto out; /* XXX returning error but items created */
-	}

 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 	si->crtime = inode->i_mtime;
@@ -1939,7 +1940,6 @@ static int scoutfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mod
 	scoutfs_update_inode_item(inode, inode_lock, &ind_locks);
 	scoutfs_update_inode_item(dir, dir_lock, &ind_locks);
 	scoutfs_inode_index_unlock(sb, &ind_locks);
-	iput(inode);

 out:
 	scoutfs_release_trans(sb);
@@ -1948,6 +1948,9 @@ out:
 	scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE);
 	scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);

+	if (!IS_ERR_OR_NULL(inode))
+		iput(inode);
+
 	return ret;
 }

--- a/kmod/src/fence.c
+++ b/kmod/src/fence.c
@@ -395,12 +395,13 @@ int scoutfs_fence_wait_fenced(struct super_block *sb, long timeout_jiffies)
 int scoutfs_fence_setup(struct super_block *sb)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
-	struct mount_options *opts = &sbi->opts;
+	struct scoutfs_mount_options opts;
 	struct fence_info *fi;
 	int ret;

 	/* can only fence if we can be elected by quorum */
-	if (opts->quorum_slot_nr == -1) {
+	scoutfs_options_read(sb, &opts);
+	if (opts.quorum_slot_nr == -1) {
 		ret = 0;
 		goto out;
 	}
--- a/kmod/src/inode.c
+++ b/kmod/src/inode.c
@@ -66,10 +66,6 @@ struct inode_sb_info {

 	struct delayed_work orphan_scan_dwork;

-	/* serialize multiple inode ->evict trying to delete same ino's items */
-	spinlock_t deleting_items_lock;
-	struct list_head deleting_items_list;
-
 	struct work_struct iput_work;
 	struct llist_head iput_llist;
 };
@@ -662,22 +658,12 @@ void scoutfs_inode_get_onoff(struct inode *inode, s64 *on, s64 *off)
 	} while (read_seqcount_retry(&si->seqcount, seq));
 }

-/*
- * We have inversions between getting cluster locks while performing
- * final deletion on a freeing inode and waiting on a freeing inode
- * while holding a cluster lock.
- *
- * We can avoid these deadlocks by hiding freeing inodes in our hash
- * lookup function.  We're fine with either returning null or populating
- * a new inode overlapping with eviction freeing a previous instance of
- * the inode.
- */
 static int scoutfs_iget_test(struct inode *inode, void *arg)
 {
 	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
 	u64 *ino = arg;

-	return (si->ino == *ino) && !(inode->i_state & I_FREEING);
+	return si->ino == *ino;
 }

 static int scoutfs_iget_set(struct inode *inode, void *arg)
@@ -691,11 +677,35 @@ static int scoutfs_iget_set(struct inode *inode, void *arg)
 	return 0;
 }

-struct inode *scoutfs_ilookup(struct super_block *sb, u64 ino)
+/*
+ * There's a risk of a deadlock between lock invalidation and eviction.
+ * Invalidation blocks locks while looking up inodes.  Eviction blocks
+ * inode lookups while trying to get a lock.
+ *
+ * We have an inode lookup variant which will never block waiting for an
+ * inode.   This is more aggressive than base ilookup5_nowait() which
+ * will, you know, wait for inodes that are being freed.   We have our
+ * test function hide those inodes from find_inode so that it won't wait
+ * on them.
+ *
+ * These semantics are sufficiently weird that we use a big giant scary
+ * looking function name to deter use.
+ */
+static int ilookup_test_nonewfree(struct inode *inode, void *arg)
 {
-	return ilookup5(sb, ino, scoutfs_iget_test, &ino);
+	return scoutfs_iget_test(inode, arg) &&
+	       !(inode->i_state & (I_NEW | I_WILL_FREE | I_FREEING));
+}
+struct inode *scoutfs_ilookup_nowait_nonewfree(struct super_block *sb, u64 ino)
+{
+	return ilookup5_nowait(sb, ino, ilookup_test_nonewfree, &ino);
 }

+/*
+ * Final iput can delete an unused inode's items which can take multiple
+ * locked transactions.  iget (which can call iput in error cases) and
+ * iput must not be called with locks or transactions held.
+ */
 struct inode *scoutfs_iget(struct super_block *sb, u64 ino, int lkf, int igf)
 {
 	struct scoutfs_lock *lock = NULL;
@@ -703,32 +713,36 @@ struct inode *scoutfs_iget(struct super_block *sb, u64 ino, int lkf, int igf)
 	struct inode *inode = NULL;
 	int ret;

-	ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_READ, lkf, ino, &lock);
-	if (ret < 0)
-		goto out;
-
+	/* wait for vfs inode (I_FREEING in particular) before acquiring cluster lock */
 	inode = iget5_locked(sb, ino, scoutfs_iget_test, scoutfs_iget_set, &ino);
 	if (!inode) {
 		ret = -ENOMEM;
 		goto out;
 	}

+	ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_READ, lkf, ino, &lock);
+	if (ret < 0)
+		goto out;
+
 	if (inode->i_state & I_NEW) {
 		/* XXX ensure refresh, instead clear in drop_inode? */
 		si = SCOUTFS_I(inode);
 		atomic64_set(&si->last_refreshed, 0);
 		inode->i_version = 0;
+	}

-		ret = scoutfs_inode_refresh(inode, lock);
-		if (ret < 0)
-			goto out;
+	ret = scoutfs_inode_refresh(inode, lock);
+	if (ret < 0)
+		goto out;

-		if ((igf & SCOUTFS_IGF_LINKED) && inode->i_nlink == 0) {
-			ret = -ENOENT;
-			goto out;
-		}
+	/* check nlink both for new and after refreshing */
+	if ((igf & SCOUTFS_IGF_LINKED) && inode->i_nlink == 0) {
+		ret = -ENOENT;
+		goto out;
+	}

-		ret = scoutfs_omap_inc(sb, ino);
+	if (inode->i_state & I_NEW) {
+		ret = scoutfs_omap_set(sb, ino);
 		if (ret < 0)
 			goto out;

@@ -741,8 +755,12 @@ out:
 	scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ);

 	if (ret < 0) {
-		if (inode)
-			iget_failed(inode);
+		if (inode) {
+			if (inode->i_state & I_NEW)
+				iget_failed(inode);
+			else
+				iput(inode);
+		}
 		inode = ERR_PTR(ret);
 	}

@@ -1393,10 +1411,14 @@ out:
 /*
 * Allocate and initialize a new inode.  The caller is responsible for
 * creating links to it and updating it.  @dir can be null.
+ *
+ * This is called with locks and a transaction because it creates the
+ * inode item.   We can't call iput on the new inode on error.   We
+ * return the inode to the caller *including on error* for them to put
+ * once they've released the transaction.
 */
-struct inode *scoutfs_new_inode(struct super_block *sb, struct inode *dir,
-				umode_t mode, dev_t rdev, u64 ino,
-				struct scoutfs_lock *lock)
+int scoutfs_new_inode(struct super_block *sb, struct inode *dir, umode_t mode, dev_t rdev,
+		      u64 ino, struct scoutfs_lock *lock, struct inode **inode_ret)
 {
 	struct scoutfs_inode_info *si;
 	struct scoutfs_key key;
@@ -1405,8 +1427,10 @@ struct inode *scoutfs_new_inode(struct super_block *sb, struct inode *dir,
 	int ret;

 	inode = new_inode(sb);
-	if (!inode)
-		return ERR_PTR(-ENOMEM);
+	if (!inode) {
+		ret = -ENOMEM;
+		goto out;
+	}

 	si = SCOUTFS_I(inode);
 	si->ino = ino;
@@ -1434,20 +1458,17 @@ struct inode *scoutfs_new_inode(struct super_block *sb, struct inode *dir,
 	store_inode(&sinode, inode);
 	scoutfs_inode_init_key(&key, scoutfs_ino(inode));

-	ret = scoutfs_omap_inc(sb, ino);
+	ret = scoutfs_omap_set(sb, ino);
 	if (ret < 0)
 		goto out;

 	ret = scoutfs_item_create(sb, &key, &sinode, sizeof(sinode), lock);
 	if (ret < 0)
-		scoutfs_omap_dec(sb, ino);
+		scoutfs_omap_clear(sb, ino);
 out:
-	if (ret) {
-		iput(inode);
-		inode = ERR_PTR(ret);
-	}
+	*inode_ret = inode;

-	return inode;
+	return ret;
 }

 static void init_orphan_key(struct scoutfs_key *key, u64 ino)
@@ -1482,44 +1503,6 @@ int scoutfs_inode_orphan_delete(struct super_block *sb, u64 ino, struct scoutfs_
 	return scoutfs_item_delete_force(sb, &key, lock);
 }

-struct deleting_ino_entry {
-	struct list_head head;
-	u64 ino;
-};
-
-static bool added_deleting_ino(struct inode_sb_info *inf, struct deleting_ino_entry *del, u64 ino)
-{
-	struct deleting_ino_entry *tmp;
-	bool added = true;
-
-	spin_lock(&inf->deleting_items_lock);
-
-	list_for_each_entry(tmp, &inf->deleting_items_list, head) {
-		if (tmp->ino == ino) {
-			added = false;
-			break;
-		}
-	}
-
-	if (added) {
-		del->ino = ino;
-		list_add_tail(&del->head, &inf->deleting_items_list);
-	}
-
-	spin_unlock(&inf->deleting_items_lock);
-
-	return added;
-}
-
-static void del_deleting_ino(struct inode_sb_info *inf, struct deleting_ino_entry *del)
-{
-	if (del->ino) {
-		spin_lock(&inf->deleting_items_lock);
-		list_del_init(&del->head);
-		spin_unlock(&inf->deleting_items_lock);
-	}
-}
-
 /*
 * Remove all the items associated with a given inode.  This is only
 * called once nlink has dropped to zero and nothing has the inode open
@@ -1528,22 +1511,10 @@ static void del_deleting_ino(struct inode_sb_info *inf, struct deleting_ino_entr
 * orphan item will continue triggering attempts to finish previous
 * partial deletion until all deletion is complete and the orphan item
 * is removed.
- *
- * Currently this can be called multiple times for multiple cached
- * inodes for a given ino number (ilookup avoids freeing inodes to avoid
- * cluster lock<->inode flag waiting inversions).  Some items are not
- * safe to delete concurrently, for example concurrent data truncation
- * could free extents multiple times.  We use a very silly list of inos
- * being deleted.  Duplicates just return success.  If the first
- * deletion ends up failing orphan deletion will come back around later
- * and retry.
 */
-static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_lock *lock,
-			      struct scoutfs_lock *orph_lock)
+static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_inode *sinode,
+			      struct scoutfs_lock *lock, struct scoutfs_lock *orph_lock)
 {
-	DECLARE_INODE_SB_INFO(sb, inf);
-	struct deleting_ino_entry del = {{NULL, }};
-	struct scoutfs_inode sinode;
 	struct scoutfs_key key;
 	LIST_HEAD(ind_locks);
 	bool release = false;
@@ -1552,30 +1523,10 @@ static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_lo
 	u64 size;
 	int ret;

-	if (!added_deleting_ino(inf, &del, ino)) {
-		ret = 0;
-		goto out;
-	}
-
 	scoutfs_inode_init_key(&key, ino);

-	ret = scoutfs_item_lookup_exact(sb, &key, &sinode, sizeof(sinode),
-					lock);
-	if (ret < 0) {
-		if (ret == -ENOENT)
-			ret = 0;
-		goto out;
-	}
-
-	/* XXX corruption, inode probably won't be freed without repair */
-	if (le32_to_cpu(sinode.nlink)) {
-		scoutfs_warn(sb, "Dangling orphan item for inode %llu.", ino);
-		ret = -EIO;
-		goto out;
-	}
-
-	mode = le32_to_cpu(sinode.mode);
-	size = le64_to_cpu(sinode.size);
+	mode = le32_to_cpu(sinode->mode);
+	size = le64_to_cpu(sinode->size);
 	trace_scoutfs_delete_inode(sb, ino, mode, size);

 	/* remove data items in their own transactions */
@@ -1593,7 +1544,7 @@ static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_lo
 	/* then delete the small known number of remaining inode items */
 retry:
 	ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
-	      prepare_index_deletion(sb, &ind_locks, ino, mode, &sinode) ?:
+	      prepare_index_deletion(sb, &ind_locks, ino, mode, sinode) ?:
 	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq, false);
 	if (ret > 0)
 		goto retry;
@@ -1602,7 +1553,7 @@ retry:

 	release = true;

-	ret = remove_index_items(sb, ino, &sinode, &ind_locks);
+	ret = remove_index_items(sb, ino, sinode, &ind_locks);
 	if (ret)
 		goto out;

@@ -1612,15 +1563,21 @@ retry:
 			goto out;
 	}

-	ret = scoutfs_item_delete(sb, &key, lock);
-	if (ret)
+	/* make sure inode item and orphan are deleted together */
+	ret = scoutfs_item_dirty(sb, &key, lock);
+	if (ret < 0)
 		goto out;

 	ret = scoutfs_inode_orphan_delete(sb, ino, orph_lock);
-	if (ret == 0)
-		scoutfs_forest_dec_inode_count(sb);
+	if (ret < 0)
+		goto out;
+
+	ret = scoutfs_item_delete(sb, &key, lock);
+	BUG_ON(ret != 0); /* dirtying should have guaranteed success */
+
+	scoutfs_forest_dec_inode_count(sb);
+
 out:
-	del_deleting_ino(inf, &del);
 	if (release)
 		scoutfs_release_trans(sb);
 	scoutfs_inode_index_unlock(sb, &ind_locks);
@@ -1628,48 +1585,192 @@ out:
 	return ret;
 }

+struct inode_deletion_lock_data {
+	wait_queue_head_t waitq;
+	atomic64_t seq;
+	struct scoutfs_open_ino_map map;
+	unsigned long trying[DIV_ROUND_UP(SCOUTFS_OPEN_INO_MAP_BITS, BITS_PER_LONG)];
+};
+
 /*
- * iput_final has already written out the dirty pages to the inode
- * before we get here.  We're left with a clean inode that we have to
- * tear down.  We use locking and open inode number bitmaps to decide if
- * we should finally destroy an inode that is no longer open nor
- * reachable through directory entries.
+ * Get a lock data struct that has the current omap from this hold of
+ * the lock.  The lock data is saved on the lock so it can be used
+ * multiple times until the lock is refreshed.  Only one task will send
+ * an omap request at a time, and errors are only returned by each task
+ * as it gets a response to its send.
+ */
+static int get_current_lock_data(struct super_block *sb, struct scoutfs_lock *lock,
+				 struct inode_deletion_lock_data **ldata_ret, u64 group_nr)
+{
+	struct inode_deletion_lock_data *ldata;
+	u64 seq;
+	int ret;
+
+	/* we're storing omap maps in locks, they need to cover the same number of inodes */
+	BUILD_BUG_ON(SCOUTFS_OPEN_INO_MAP_BITS != SCOUTFS_LOCK_INODE_GROUP_NR);
+
+	/* allocate a new lock data struct as needed */
+	while ((ldata = cmpxchg(&lock->inode_deletion_data, NULL, NULL)) == NULL) {
+		ldata = kzalloc(sizeof(struct inode_deletion_lock_data), GFP_NOFS);
+		if (!ldata) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		atomic64_set(&ldata->seq, lock->write_seq - 1); /* ensure refresh */
+		init_waitqueue_head(&ldata->waitq);
+
+		/* the lock kfrees the inode_deletion_data pointer along with the lock */
+		if (cmpxchg(&lock->inode_deletion_data, NULL, ldata) == NULL)
+			break;
+		else
+			kfree(ldata);
+	}
+
+	/* make sure that the lock's data is current */
+	while ((seq = atomic64_read(&ldata->seq)) != lock->write_seq) {
+		if (seq != U64_MAX && atomic64_cmpxchg(&ldata->seq, seq, U64_MAX) == seq) {
+			/* ask the server for current omap */
+			ret = scoutfs_client_open_ino_map(sb, group_nr, &ldata->map);
+			if (ret == 0)
+				atomic64_set(&ldata->seq, lock->write_seq);
+			else
+				atomic64_set(&ldata->seq, lock->write_seq - 1);
+			wake_up(&ldata->waitq);
+			if (ret < 0)
+				goto out;
+		} else {
+			/* wait for someone else who's sent a request */
+			wait_event(ldata->waitq, atomic64_read(&ldata->seq) != U64_MAX);
+		}
+	}
+
+	ret = 0;
+out:
+	if (ret < 0)
+		ldata = NULL;
+	*ldata_ret = ldata;
+	return ret;
+}
+
+/*
+ * Try to delete all the items for an unused inode number.  This is the
+ * relatively slow path that uses cluster locks, network requests, and
+ * IO to ensure correctness.  Callers should try hard to avoid calling
+ * when there's no work to do.
 *
- * Because lookup ignores freeing inodes we can get here from multiple
- * instances of an inode that is being deleted.  Orphan scanning in
- * particular can race with deletion.   delete_inode_items() resolves
- * concurrent attempts.
+ * Inode references are added under cluster locks.  In-memory vfs cache
+ * references are added under read cluster locks and are visible in omap
+ * bitmaps.  Directory entry references are added under write cluster
+ * locks and are visible in the inode's nlink.  Orphan items exist
+ * whenever nlink == 0 and are maintained under write cluster locks.
+ * Directory entries can be added to an inode with nlink == 0 to
+ * instantiate tmpfile inodes into the name space.  Cached inodes will
+ * not be created for inodes with an nlink of 0.
+ *
+ * Combining all this we know that it's safe to delete an inode's items
+ * when we hold an exclusive write cluster lock, the inode has nlink ==
+ * 0, and an omap request protected by the lock doesn't have the inode's
+ * bit set.
+ *
+ * This is called by orphan scanning and vfs inode cache eviction after
+ * they've checked that the inode could really be deleted.  We serialize
+ * on a bit in the lock data so that we only have one deletion attempt
+ * per inode under this mount's cluster lock.
+ */
+static int try_delete_inode_items(struct super_block *sb, u64 ino)
+{
+	struct inode_deletion_lock_data *ldata = NULL;
+	struct scoutfs_lock *orph_lock = NULL;
+	struct scoutfs_lock *lock = NULL;
+	struct scoutfs_inode sinode;
+	struct scoutfs_key key;
+	u64 group_nr;
+	int bit_nr;
+	int ret;
+
+	ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_WRITE, 0, ino, &lock);
+	if (ret < 0)
+		goto out;
+
+	scoutfs_omap_calc_group_nrs(ino, &group_nr, &bit_nr);
+
+	ret = get_current_lock_data(sb, lock, &ldata, group_nr);
+	if (ret < 0)
+		goto out;
+
+	/* only one local attempt per inode at a time */
+	if (test_and_set_bit(bit_nr, ldata->trying)) {
+		ret = 0;
+		goto out;
+	}
+
+	/* can't delete if it's cached in local or remote mounts */
+	if (scoutfs_omap_test(sb, ino) || test_bit_le(bit_nr, ldata->map.bits)) {
+		ret = 0;
+		goto out;
+	}
+
+	scoutfs_inode_init_key(&key, ino);
+	ret = scoutfs_item_lookup_exact(sb, &key, &sinode, sizeof(sinode), lock);
+	if (ret < 0) {
+		if (ret == -ENOENT)
+			ret = 0;
+		goto out;
+	}
+
+	if (le32_to_cpu(sinode.nlink) > 0) {
+		ret = 0;
+		goto out;
+	}
+
+	ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, ino, &orph_lock);
+	if (ret < 0)
+		goto out;
+
+	ret = delete_inode_items(sb, ino, &sinode, lock, orph_lock);
+out:
+	if (ldata)
+		clear_bit(bit_nr, ldata->trying);
+
+	scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
+	scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);
+
+	return ret;
+}
+
+/*
+ * As we drop an inode we need to decide to try and delete its items or
+ * not, which is expensive.  The two common cases we want to get right
+ * both have cluster lock coverage and don't want to delete.   Dropping
+ * unused inodes during read lock invalidation has the current lock and
+ * sees a nonzero nlink and knows not to delete.  Final iput after a
+ * local unlink also has a lock, sees a zero nlink, and tries to perform
+ * item deletion in the task that dropped the last link, as users
+ * expect. 
+ *
+ * Evicting an inode outside of cluster locking is the odd slow path
+ * that involves lock contention during the worst cross-mount
+ * open-unlink/delete case.
 */
 void scoutfs_evict_inode(struct inode *inode)
 {
 	struct super_block *sb = inode->i_sb;
+	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
 	const u64 ino = scoutfs_ino(inode);
-	struct scoutfs_lock *orph_lock;
-	struct scoutfs_lock *lock;
-	int ret;

-	trace_scoutfs_evict_inode(inode->i_sb, scoutfs_ino(inode),
-				  inode->i_nlink, is_bad_inode(inode));
+	trace_scoutfs_evict_inode(sb, ino, inode->i_nlink, is_bad_inode(inode));

-	if (is_bad_inode(inode))
-		goto clear;
+	if (!is_bad_inode(inode)) {
+		truncate_inode_pages_final(&inode->i_data);

-	truncate_inode_pages_final(&inode->i_data);
+		/* clear our omap bit before the deletion attempt tests it */
+		scoutfs_omap_clear(sb, ino);

-	ret = scoutfs_omap_should_delete(sb, inode, &lock, &orph_lock);
-	if (ret > 0) {
-		ret = delete_inode_items(inode->i_sb, scoutfs_ino(inode), lock, orph_lock);
-		scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
-		scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);
-	}
-	if (ret < 0) {
-		scoutfs_err(sb, "error %d while checking to delete inode nr %llu, it might linger.",
-			    ret, ino);
+		if (!scoutfs_lock_is_covered(sb, &si->ino_lock_cov) || inode->i_nlink == 0)
+			try_delete_inode_items(sb, scoutfs_ino(inode));
 	}

-	scoutfs_omap_dec(sb, ino);
-
-clear:
 	clear_inode(inode);
 }

@@ -1745,18 +1846,26 @@ void scoutfs_inode_queue_iput(struct inode *inode)
 /*
 * All mounts are performing this work concurrently.  We introduce
 * significant jitter between them to try and keep them from all
- * bunching up and working on the same inodes.
+ * bunching up and working on the same inodes.  We always try to delay
+ * for at least one jiffy if precision tricks us into calculating no
+ * delay.
 */
-static void schedule_orphan_dwork(struct inode_sb_info *inf)
+void scoutfs_inode_schedule_orphan_dwork(struct super_block *sb)
 {
-#define ORPHAN_SCAN_MIN_MS (10 * MSEC_PER_SEC)
-#define ORPHAN_SCAN_JITTER_MS (40 * MSEC_PER_SEC)
-	unsigned long delay = msecs_to_jiffies(ORPHAN_SCAN_MIN_MS +
-					       prandom_u32_max(ORPHAN_SCAN_JITTER_MS));
+	DECLARE_INODE_SB_INFO(sb, inf);
+	struct scoutfs_mount_options opts;
+	unsigned long low;
+	unsigned long high;
+	unsigned long delay;
+
 	if (!inf->stopped) {
-		delay = msecs_to_jiffies(ORPHAN_SCAN_MIN_MS +
-					 prandom_u32_max(ORPHAN_SCAN_JITTER_MS));
-		schedule_delayed_work(&inf->orphan_scan_dwork, delay);
+		scoutfs_options_read(sb, &opts);
+
+		low = (opts.orphan_scan_delay_ms * 80) / 100;
+		high = (opts.orphan_scan_delay_ms * 120) / 100;
+		delay = msecs_to_jiffies(low + prandom_u32_max(high - low)) ?: 1;
+
+		mod_delayed_work(system_wq, &inf->orphan_scan_dwork, delay);
 	}
 }

@@ -1764,11 +1873,10 @@ static void schedule_orphan_dwork(struct inode_sb_info *inf)
 * Find and delete inodes whose only remaining reference is the
 * persistent orphan item that was created as they were unlinked.
 *
- * Orphan items are created as the final directory entry referring to an
- * inode is deleted.  They're deleted as the final cached inode is
- * evicted and the inode items are destroyed.  They can linger if all
- * the cached inodes pinning the inode fail to delete as they are
- * evicted from the cache -- either through crashing or errors.
+ * Orphan items are maintained for inodes that have an nlink of 0.
+ * Typically this is from unlink, but tmpfiles are created with orphans.
+ * They're deleted as the final cached inode is evicted and the inode
+ * items are destroyed.
 *
 * This work runs in all mounts in the background looking for those
 * orphaned inodes that weren't fully deleted.
@@ -1777,20 +1885,16 @@ static void schedule_orphan_dwork(struct inode_sb_info *inf)
 * only find orphan items that made it to the fs root after being merged
 * from a mount's log btree.  This naturally avoids orphan items that
 * exist while inodes have been unlinked but are still cached, including
- * O_TMPFILE inodes that are actively used during normal operations.
+ * tmpfile inodes that are actively used during normal operations.
 * Scanning the read-only persistent fs root uses cached blocks and
 * avoids the lock contention we'd cause if we tried to use the
 * consistent item cache.  The downside is that it adds a bit of
- * latency.  If an orphan was created in error it'll take until the
- * mount's log btree is finalized and merged.  A crash will have the log
- * btree merged after it is fenced.
+ * latency.
 *
- * Once we find candidate orphan items we can first check our local
- * inode cache for inodes that are already on their way to eviction and
- * can be skipped.  Then we ask the server for the open map containing
- * the inode.  Only if we don't have it cached, and no one else does, do
- * we try and read it into our cache and evict it to trigger the final
- * inode deletion process.
+ * Once we find candidate orphan items we first check our local omap for
+ * a locally cached inode.  Then we ask the server for the open map
+ * containing the inode.  Only if we don't see any cached users do we do
+ * the expensive work of acquiring locks to try and delete the items.
 */
 static void inode_orphan_scan_worker(struct work_struct *work)
 {
@@ -1802,7 +1906,6 @@ static void inode_orphan_scan_worker(struct work_struct *work)
 	SCOUTFS_BTREE_ITEM_REF(iref);
 	struct scoutfs_key last;
 	struct scoutfs_key key;
-	struct inode *inode;
 	u64 group_nr;
 	int bit_nr;
 	u64 ino;
@@ -1841,17 +1944,14 @@ static void inode_orphan_scan_worker(struct work_struct *work)
 		scoutfs_inc_counter(sb, orphan_scan_item);
 		ino = le64_to_cpu(key.sko_ino);

-		/* locally cached inodes will already be deleted */
-		inode = scoutfs_ilookup(sb, ino);
-		if (inode) {
+		/* locally cached inodes will try to delete as they evict */
+		if (scoutfs_omap_test(sb, ino)) {
 			scoutfs_inc_counter(sb, orphan_scan_cached);
-			iput(inode);
 			continue;
 		}

 		/* get an omap that covers the orphaned ino */
-		group_nr = ino >> SCOUTFS_OPEN_INO_MAP_SHIFT;
-		bit_nr = ino & SCOUTFS_OPEN_INO_MAP_MASK;
+		scoutfs_omap_calc_group_nrs(ino, &group_nr, &bit_nr);

 		if (le64_to_cpu(omap.args.group_nr) != group_nr) {
 			ret = scoutfs_client_open_ino_map(sb, group_nr, &omap);
@@ -1859,25 +1959,15 @@ static void inode_orphan_scan_worker(struct work_struct *work)
 				goto out;
 		}

-		/* don't need to evict if someone else has it open (cached) */
+		/* remote cached inodes will also try to delete */
 		if (test_bit_le(bit_nr, omap.bits)) {
 			scoutfs_inc_counter(sb, orphan_scan_omap_set);
 			continue;
 		}

-		/* try to cached and evict unused inode to delete, can be racing */
-		inode = scoutfs_iget(sb, ino, 0, 0);
-		if (IS_ERR(inode)) {
-			ret = PTR_ERR(inode);
-			if (ret == -ENOENT)
-				continue;
-			else
-				goto out;
-		}
-
-		scoutfs_inc_counter(sb, orphan_scan_read);
-		SCOUTFS_I(inode)->drop_invalidated = true;
-		iput(inode);
+		/* seemingly orphaned and unused, get locks and check for sure */
+		scoutfs_inc_counter(sb, orphan_scan_attempts);
+		ret = try_delete_inode_items(sb, ino);
 	}

 	ret = 0;
@@ -1886,7 +1976,7 @@ out:
 	if (ret < 0)
 		scoutfs_inc_counter(sb, orphan_scan_error);

-	schedule_orphan_dwork(inf);
+	scoutfs_inode_schedule_orphan_dwork(sb);
 }

 /*
@@ -1994,8 +2084,6 @@ int scoutfs_inode_setup(struct super_block *sb)
 	spin_lock_init(&inf->dir_ino_alloc.lock);
 	spin_lock_init(&inf->ino_alloc.lock);
 	INIT_DELAYED_WORK(&inf->orphan_scan_dwork, inode_orphan_scan_worker);
-	spin_lock_init(&inf->deleting_items_lock);
-	INIT_LIST_HEAD(&inf->deleting_items_list);
 	INIT_WORK(&inf->iput_work, iput_worker);
 	init_llist_head(&inf->iput_llist);

@@ -2011,9 +2099,7 @@ int scoutfs_inode_setup(struct super_block *sb)
 */
 void scoutfs_inode_start(struct super_block *sb)
 {
-	DECLARE_INODE_SB_INFO(sb, inf);
-
-	schedule_orphan_dwork(inf);
+	scoutfs_inode_schedule_orphan_dwork(sb);
 }

 /*
--- a/kmod/src/inode.h
+++ b/kmod/src/inode.h
@@ -82,7 +82,9 @@ void scoutfs_inode_queue_iput(struct inode *inode);

 #define SCOUTFS_IGF_LINKED (1 << 0) /* enoent if nlink == 0 */
 struct inode *scoutfs_iget(struct super_block *sb, u64 ino, int lkf, int igf);
-struct inode *scoutfs_ilookup(struct super_block *sb, u64 ino);
+struct inode *scoutfs_ilookup_nowait(struct super_block *sb, u64 ino);
+struct inode *scoutfs_ilookup_nowait_nonewfree(struct super_block *sb, u64 ino);
+

 void scoutfs_inode_init_key(struct scoutfs_key *key, u64 ino);
 void scoutfs_inode_init_index_key(struct scoutfs_key *key, u8 type, u64 major,
@@ -104,9 +106,8 @@ void scoutfs_update_inode_item(struct inode *inode, struct scoutfs_lock *lock,
 			       struct list_head *ind_locks);

 int scoutfs_alloc_ino(struct super_block *sb, bool is_dir, u64 *ino_ret);
-struct inode *scoutfs_new_inode(struct super_block *sb, struct inode *dir,
-				umode_t mode, dev_t rdev, u64 ino,
-				struct scoutfs_lock *lock);
+int scoutfs_new_inode(struct super_block *sb, struct inode *dir, umode_t mode, dev_t rdev,
+		      u64 ino, struct scoutfs_lock *lock, struct inode **inode_ret);

 void scoutfs_inode_set_meta_seq(struct inode *inode);
 void scoutfs_inode_set_data_seq(struct inode *inode);
@@ -126,6 +127,7 @@ int scoutfs_setattr(struct dentry *dentry, struct iattr *attr);

 int scoutfs_inode_orphan_create(struct super_block *sb, u64 ino, struct scoutfs_lock *lock);
 int scoutfs_inode_orphan_delete(struct super_block *sb, u64 ino, struct scoutfs_lock *lock);
+void scoutfs_inode_schedule_orphan_dwork(struct super_block *sb);

 void scoutfs_inode_queue_writeback(struct inode *inode);
 int scoutfs_inode_walk_writeback(struct super_block *sb, bool write);
--- a/kmod/src/ioctl.c
+++ b/kmod/src/ioctl.c
@@ -387,7 +387,7 @@ static long scoutfs_ioc_data_wait_err(struct file *file, unsigned long arg)
 	if (sblock > eblock)
 		return -EINVAL;

-	inode = scoutfs_ilookup(sb, args.ino);
+	inode = scoutfs_ilookup_nowait_nonewfree(sb, args.ino);
 	if (!inode) {
 		ret = -ESTALE;
 		goto out;
--- a/kmod/src/lock.c
+++ b/kmod/src/lock.c
@@ -142,7 +142,7 @@ static void invalidate_inode(struct super_block *sb, u64 ino)
 	struct scoutfs_inode_info *si;
 	struct inode *inode;

-	inode = scoutfs_ilookup(sb, ino);
+	inode = scoutfs_ilookup_nowait_nonewfree(sb, ino);
 	if (inode) {
 		si = SCOUTFS_I(inode);

@@ -255,7 +255,7 @@ static void lock_free(struct lock_info *linfo, struct scoutfs_lock *lock)
 	BUG_ON(!list_empty(&lock->shrink_head));
 	BUG_ON(!list_empty(&lock->cov_list));

-	scoutfs_omap_free_lock_data(lock->omap_data);
+	kfree(lock->inode_deletion_data);
 	kfree(lock);
 }

@@ -291,7 +291,6 @@ static struct scoutfs_lock *lock_alloc(struct super_block *sb,
 	lock->mode = SCOUTFS_LOCK_NULL;

 	atomic64_set(&lock->forest_bloom_nr, 0);
-	spin_lock_init(&lock->omap_spinlock);

 	trace_scoutfs_lock_alloc(sb, lock);

--- a/kmod/src/lock.h
+++ b/kmod/src/lock.h
@@ -11,7 +11,7 @@

 #define SCOUTFS_LOCK_NR_MODES		SCOUTFS_LOCK_INVALID

-struct scoutfs_omap_lock;
+struct inode_deletion_lock_data;

 /*
 * A few fields (start, end, refresh_gen, write_seq, granted_mode)
@@ -47,9 +47,8 @@ struct scoutfs_lock {
 	/* the forest tracks which log tree last saw bloom bit updates */
 	atomic64_t forest_bloom_nr;

-	/* open ino mapping has a valid map for a held write lock */
-	spinlock_t omap_spinlock;
-	struct scoutfs_omap_lock_data *omap_data;
+	/* inode deletion tracks some state per lock */
+	struct inode_deletion_lock_data *inode_deletion_data;
 };

 struct scoutfs_lock_coverage {
--- a/kmod/src/omap.c
+++ b/kmod/src/omap.c
@@ -30,27 +30,22 @@
 /*
 * As a client removes an inode from its cache with an nlink of 0 it
 * needs to decide if it is the last client using the inode and should
- * fully delete all its items.  It needs to know if other mounts still
- * have the inode in use.
+ * fully delete all the inode's items.  It needs to know if other mounts
+ * still have the inode in use.
 *
- * We need a way to communicate between mounts that an inode is open.
+ * We need a way to communicate between mounts that an inode is in use.
 * We don't want to pay the synchronous per-file locking round trip
 * costs associated with per-inode open locks that you'd typically see
- * in systems to solve this problem.
+ * in systems to solve this problem.  The first prototypes of this
+ * tracked open file handles so this was coined the open map, though it
+ * now tracks cached inodes.
 *
- * Instead clients maintain open bitmaps that cover groups of inodes.
- * As inodes enter the cache their bit is set, and as the inode is
- * evicted the bit is cleared.  As an inode is evicted messages are sent
- * around the cluster to get the current bitmaps for that inode's group
- * from all active mounts.  If the inode's bit is clear then it can be
- * deleted.
- *
- * We associate the open bitmaps with our cluster locking of inode
- * groups to cache these open bitmaps.  As long as we have the lock then
- * nlink can't be changed on any remote mounts.  Specifically, it can't
- * increase from 0 so any clear bits can gain references on remote
- * mounts.  As long as we have the lock, all clear bits in the group for
- * inodes with 0 nlink can be deleted.
+ * Clients maintain bitmaps that cover groups of inodes.  As inodes
+ * enter the cache their bit is set and as the inode is evicted the bit
+ * is cleared.  As deletion is attempted, either by scanning orphans or
+ * evicting an inode with an nlink of 0, messages are sent around the
+ * cluster to get the current bitmaps for that inode's group from all
+ * active mounts.  If the inode's bit is clear then it can be deleted.
 *
 * This layer maintains a list of client rids to send messages to.  The
 * server calls us as clients enter and leave the cluster.    We can't
@@ -85,14 +80,12 @@ struct omap_info {
 	struct omap_info *name = SCOUTFS_SB(sb)->omap_info

 /*
- * The presence of an inode in the inode cache increases the count of
- * its inode number's position within its lock group.  These structs
- * track the counts for all the inodes in a lock group and maintain a
- * bitmap whose bits are set for each non-zero count.
+ * The presence of an inode in the inode cache sets its bit in the lock
+ * group's bitmap.
 *
 * We don't want to add additional global synchronization of inode cache
 * maintenance so these are tracked in an rcu hash table.  Once their
- * total count reaches zero they're removed from the hash and queued for
+ * total reaches zero they're removed from the hash and queued for
 * freeing and readers should ignore them.
 */
 struct omap_group {
@@ -102,7 +95,6 @@ struct omap_group {
 	u64 nr;
 	spinlock_t lock;
 	unsigned int total;
-	unsigned int *counts;
 	__le64 bits[SCOUTFS_OPEN_INO_MAP_LE64S];
 };

@@ -111,8 +103,7 @@ do {											\
 	__typeof__(group) _grp = (group);						\
 	__typeof__(bit_nr) _nr = (bit_nr);						\
 											\
-	trace_scoutfs_omap_group_##which(sb, _grp, _grp->nr, _grp->total, _nr,		\
-				        _nr < 0 ? -1 : _grp->counts[_nr]);		\
+	trace_scoutfs_omap_group_##which(sb, _grp, _grp->nr, _grp->total, _nr);		\
 } while (0)

 /*
@@ -134,18 +125,6 @@ struct omap_request {
 	struct scoutfs_open_ino_map map;
 };

-/*
- * In each inode group cluster lock we store data to track the open ino
- * map which tracks all the inodes that the cluster lock covers.  When
- * the seq shows that the map is stale we send a request to update it.
- */
-struct scoutfs_omap_lock_data {
-	u64 seq;
-	bool req_in_flight;
-	wait_queue_head_t waitq;
-	struct scoutfs_open_ino_map map;
-};
-
 static inline void init_rid_list(struct omap_rid_list *list)
 {
 	INIT_LIST_HEAD(&list->head);
@@ -232,7 +211,7 @@ static void free_rids(struct omap_rid_list *list)
 	}
 }

-static void calc_group_nrs(u64 ino, u64 *group_nr, int *bit_nr)
+void scoutfs_omap_calc_group_nrs(u64 ino, u64 *group_nr, int *bit_nr)
 {
 	*group_nr = ino >> SCOUTFS_OPEN_INO_MAP_SHIFT;
 	*bit_nr = ino & SCOUTFS_OPEN_INO_MAP_MASK;
@@ -242,21 +221,13 @@ static struct omap_group *alloc_group(struct super_block *sb, u64 group_nr)
 {
 	struct omap_group *group;

-	BUILD_BUG_ON((sizeof(group->counts[0]) * SCOUTFS_OPEN_INO_MAP_BITS) > PAGE_SIZE);
-
 	group = kzalloc(sizeof(struct omap_group), GFP_NOFS);
 	if (group) {
 		group->sb = sb;
 		group->nr = group_nr;
 		spin_lock_init(&group->lock);

-		group->counts = (void *)get_zeroed_page(GFP_NOFS);
-		if (!group->counts) {
-			kfree(group);
-			group = NULL;
-		} else {
-			trace_group(sb, alloc, group, -1);
-		}
+		trace_group(sb, alloc, group, -1);
 	}

 	return group;
@@ -265,7 +236,6 @@ static struct omap_group *alloc_group(struct super_block *sb, u64 group_nr)
 static void free_group(struct super_block *sb, struct omap_group *group)
 {
 	trace_group(sb, free, group, -1);
-	free_page((unsigned long)group->counts);
 	kfree(group);
 }

@@ -283,13 +253,16 @@ static const struct rhashtable_params group_ht_params = {
 };

 /*
- * Track an cached inode in its group.  Our increment can be racing with
- * a final decrement that removes the group from the hash, sets total to
+ * Track a cached inode in its group.  Our set can be racing with a
+ * final clear that removes the group from the hash, sets total to
 * UINT_MAX, and calls rcu free.  We can retry until the dead group is
 * no longer visible in the hash table and we can insert a new allocated
 * group.
+ *
+ * The caller must ensure that the bit is clear, -EEXIST will be
+ * returned otherwise.
 */
-int scoutfs_omap_inc(struct super_block *sb, u64 ino)
+int scoutfs_omap_set(struct super_block *sb, u64 ino)
 {
 	DECLARE_OMAP_INFO(sb, ominf);
 	struct omap_group *group;
@@ -298,7 +271,7 @@ int scoutfs_omap_inc(struct super_block *sb, u64 ino)
 	bool found;
 	int ret = 0;

-	calc_group_nrs(ino, &group_nr, &bit_nr);
+	scoutfs_omap_calc_group_nrs(ino, &group_nr, &bit_nr);

 retry:
 	found = false;
@@ -308,10 +281,10 @@ retry:
 		spin_lock(&group->lock);
 		if (group->total < UINT_MAX) {
 			found = true;
-			if (group->counts[bit_nr]++ == 0) {
-				set_bit_le(bit_nr, group->bits);
+			if (WARN_ON_ONCE(test_and_set_bit_le(bit_nr, group->bits)))
+				ret = -EEXIST;
+			else
 				group->total++;
-			}
 		}
 		trace_group(sb, inc, group, bit_nr);
 		spin_unlock(&group->lock);
@@ -342,29 +315,50 @@ retry:
 	return ret;
 }

+/* Return true if ino's bit is set in its group's bitmap, false if clear or no group. */
+bool scoutfs_omap_test(struct super_block *sb, u64 ino)
+{
+	DECLARE_OMAP_INFO(sb, ominf);
+	struct omap_group *grp;
+	bool is_set = false;
+	u64 group_nr;
+	int bit_nr;
+
+	scoutfs_omap_calc_group_nrs(ino, &group_nr, &bit_nr);
+	rcu_read_lock();
+	grp = rhashtable_lookup(&ominf->group_ht, &group_nr, group_ht_params);
+	if (grp != NULL) {
+		/* the group lock serializes us with bit sets and clears */
+		spin_lock(&grp->lock);
+		is_set = test_bit_le(bit_nr, grp->bits) != 0;
+		spin_unlock(&grp->lock);
+	}
+	rcu_read_unlock();
+	return is_set;
+}
+
 /*
- * Decrement a previously incremented ino count.  Not finding a count
- * implies imbalanced inc/dec or bugs freeing groups.  We only free
- * groups here as the last dec drops the group's total count to 0.
+ * Clear a previously set ino bit.  Trying to clear a bit that's already
+ * clear implies imbalanced set/clear or bugs freeing groups.  We only
+ * free groups here as the last clear drops the group's total to 0.
 */
-void scoutfs_omap_dec(struct super_block *sb, u64 ino)
+void scoutfs_omap_clear(struct super_block *sb, u64 ino)
 {
 	DECLARE_OMAP_INFO(sb, ominf);
 	struct omap_group *group;
 	u64 group_nr;
 	int bit_nr;

-	calc_group_nrs(ino, &group_nr, &bit_nr);
+	scoutfs_omap_calc_group_nrs(ino, &group_nr, &bit_nr);

 	rcu_read_lock();
 	group = rhashtable_lookup(&ominf->group_ht, &group_nr, group_ht_params);
 	if (group) {
 		spin_lock(&group->lock);
-		WARN_ON_ONCE(group->counts[bit_nr] == 0);
+		WARN_ON_ONCE(!test_bit_le(bit_nr, group->bits));
 		WARN_ON_ONCE(group->total == 0);
 		WARN_ON_ONCE(group->total == UINT_MAX);
-		if (--group->counts[bit_nr] == 0) {
-			clear_bit_le(bit_nr, group->bits);
+		if (test_and_clear_bit_le(bit_nr, group->bits)) {
 			if (--group->total == 0) {
 				group->total = UINT_MAX;
 				rhashtable_remove_fast(&ominf->group_ht, &group->ht_head,
@@ -664,8 +658,7 @@ int scoutfs_omap_server_handle_request(struct super_block *sb, u64 rid, u64 id,

 /*
 * The client is receiving a request from the server for its map for the
- * given group.  Look up the group and copy the bits to the map for
- * non-zero open counts.
+ * given group.  Look up the group and copy the bits to the map.
 *
 * The mount originating the request for this bitmap has the inode group
 * write locked.  We can't be adding links to any inodes in the group
@@ -814,179 +807,6 @@ void scoutfs_omap_server_shutdown(struct super_block *sb)
 	synchronize_rcu();
 }

-static bool omap_req_in_flight(struct scoutfs_lock *lock, struct scoutfs_omap_lock_data *ldata)
-{
-	bool in_flight;
-
-	spin_lock(&lock->omap_spinlock);
-	in_flight = ldata->req_in_flight;
-	spin_unlock(&lock->omap_spinlock);
-
-	return in_flight;
-}
-
-/*
- * Make sure the map covered by the cluster lock is current.  The caller
- * holds the cluster lock so once we store lock_data on the cluster lock
- * it won't be freed and the write_seq in the cluster lock won't change.
- *
- * The omap_spinlock protects the omap_data in the cluster lock.  We
- * have to drop it if we have to block to allocate lock_data, send a
- * request for a new map, or wait for a request in flight to finish.
- */
-static int get_current_lock_data(struct super_block *sb, struct scoutfs_lock *lock,
-				 struct scoutfs_omap_lock_data **ldata_ret, u64 group_nr)
-{
-	struct scoutfs_omap_lock_data *ldata;
-	bool send_req;
-	int ret = 0;
-
-	spin_lock(&lock->omap_spinlock);
-
-	ldata = lock->omap_data;
-	if (ldata == NULL) {
-		spin_unlock(&lock->omap_spinlock);
-		ldata = kzalloc(sizeof(struct scoutfs_omap_lock_data), GFP_NOFS);
-		spin_lock(&lock->omap_spinlock);
-
-		if (!ldata) {
-			ret = -ENOMEM;
-			goto out;
-		}
-
-		if (lock->omap_data == NULL) {
-			ldata->seq = lock->write_seq - 1; /* ensure refresh */
-			init_waitqueue_head(&ldata->waitq);
-
-			lock->omap_data = ldata;
-		} else {
-			kfree(ldata);
-			ldata = lock->omap_data;
-		}
-	}
-
-	while (ldata->seq != lock->write_seq) {
-		/* only one waiter sends a request at a time */
-		if (!ldata->req_in_flight) {
-			ldata->req_in_flight = true;
-			send_req = true;
-		} else {
-			send_req = false;
-		}
-
-		spin_unlock(&lock->omap_spinlock);
-		if (send_req)
-			ret = scoutfs_client_open_ino_map(sb, group_nr, &ldata->map);
-		else
-			wait_event(ldata->waitq, !omap_req_in_flight(lock, ldata));
-		spin_lock(&lock->omap_spinlock);
-
-		/* only sender can return error, other waiters retry */
-		if (send_req) {
-			ldata->req_in_flight = false;
-			if (ret == 0)
-				ldata->seq = lock->write_seq;
-			wake_up(&ldata->waitq);
-			if (ret < 0)
-				goto out;
-		}
-	}
-
-out:
-	spin_unlock(&lock->omap_spinlock);
-
-	if (ret == 0)
-		*ldata_ret = ldata;
-	else
-		*ldata_ret = NULL;
-
-	return ret;
-}
-
-/*
- * Return 1 and give the caller their locks when they should delete the
- * inode items.  It's safe to delete the inode items when it is no
- * longer reachable and nothing is referencing it.
- *
- * The inode is unreachable when nlink hits zero.  Cluster locks protect
- * modification and testing of nlink.  We use the ino_lock_cov covrage
- * to short circuit the common case of having a locked inode that hasn't
- * been deleted.  If it isn't locked, we have to acquire the lock to
- * refresh the inode to see its current nlink. 
- *
- * Then we use an open inode bitmap that covers all the inodes in the
- * lock group to determine if the inode is present in any other mount's
- * caches.  We refresh it by asking the server for all clients' maps and
- * then store it in the lock.  As long as we hold the lock nothing can
- * increase nlink from zero and let people get a reference to the inode.
- */
-int scoutfs_omap_should_delete(struct super_block *sb, struct inode *inode,
-			       struct scoutfs_lock **lock_ret, struct scoutfs_lock **orph_lock_ret)
-{
-	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
-	struct scoutfs_lock *orph_lock = NULL;
-	struct scoutfs_lock *lock = NULL;
-	const u64 ino = scoutfs_ino(inode);
-	struct scoutfs_omap_lock_data *ldata;
-	u64 group_nr;
-	int bit_nr;
-	int ret;
-	int err;
-
-	/* lock group and omap constants are defined independently */
-	BUILD_BUG_ON(SCOUTFS_OPEN_INO_MAP_BITS != SCOUTFS_LOCK_INODE_GROUP_NR);
-
-	if (scoutfs_lock_is_covered(sb, &si->ino_lock_cov) && inode->i_nlink > 0) {
-		ret = 0;
-		goto out;
-	}
-
-	ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_WRITE, SCOUTFS_LKF_REFRESH_INODE, inode, &lock);
-	if (ret < 0)
-		goto out;
-
-	if (inode->i_nlink > 0) {
-		ret = 0;
-		goto out;
-	}
-
-	calc_group_nrs(ino, &group_nr, &bit_nr);
-
-	/* only one request to refresh the map at a time */
-	ret = get_current_lock_data(sb, lock, &ldata, group_nr);
-	if (ret < 0)
-		goto out;
-
-	/* can delete caller's zero nlink inode if it's not cached in other mounts */
-	ret = !test_bit_le(bit_nr, ldata->map.bits);
-out:
-	trace_scoutfs_omap_should_delete(sb, ino, inode->i_nlink, ret);
-
-	if (ret > 0) {
-		err = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, ino, &orph_lock);
-		if (err < 0)
-			ret = err;
-	}
-
-	if (ret <= 0) {
-		scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
-		lock = NULL;
-	}
-
-	*lock_ret = lock;
-	*orph_lock_ret = orph_lock;
-	return ret;
-}
-
-void scoutfs_omap_free_lock_data(struct scoutfs_omap_lock_data *ldata)
-{
-	if (ldata) {
-		WARN_ON_ONCE(ldata->req_in_flight);
-		WARN_ON_ONCE(waitqueue_active(&ldata->waitq));
-		kfree(ldata);
-	}
-}
-
 int scoutfs_omap_setup(struct super_block *sb)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
--- a/kmod/src/omap.h
+++ b/kmod/src/omap.h
@@ -1,13 +1,12 @@
 #ifndef _SCOUTFS_OMAP_H_
 #define _SCOUTFS_OMAP_H_

-int scoutfs_omap_inc(struct super_block *sb, u64 ino);
-void scoutfs_omap_dec(struct super_block *sb, u64 ino);
-int scoutfs_omap_should_delete(struct super_block *sb, struct inode *inode,
-			       struct scoutfs_lock **lock_ret, struct scoutfs_lock **orph_lock_ret);
-void scoutfs_omap_free_lock_data(struct scoutfs_omap_lock_data *ldata);
+int scoutfs_omap_set(struct super_block *sb, u64 ino);
+bool scoutfs_omap_test(struct super_block *sb, u64 ino);
+void scoutfs_omap_clear(struct super_block *sb, u64 ino);
 int scoutfs_omap_client_handle_request(struct super_block *sb, u64 id,
 				       struct scoutfs_open_ino_map_args *args);
+void scoutfs_omap_calc_group_nrs(u64 ino, u64 *group_nr, int *bit_nr);

 int scoutfs_omap_add_rid(struct super_block *sb, u64 rid);
 int scoutfs_omap_remove_rid(struct super_block *sb, u64 rid);
--- a/kmod/src/options.c
+++ b/kmod/src/options.c
@@ -26,22 +26,30 @@
 #include "msg.h"
 #include "options.h"
 #include "super.h"
+#include "inode.h"
+
+enum {
+	Opt_metadev_path,
+	Opt_orphan_scan_delay_ms,
+	Opt_quorum_slot_nr,
+	Opt_err,
+};

 static const match_table_t tokens = {
-	{Opt_quorum_slot_nr, "quorum_slot_nr=%s"},
 	{Opt_metadev_path, "metadev_path=%s"},
+	{Opt_orphan_scan_delay_ms, "orphan_scan_delay_ms=%s"},
+	{Opt_quorum_slot_nr, "quorum_slot_nr=%s"},
 	{Opt_err, NULL}
 };

-struct options_sb_info {
-	struct dentry *debugfs_dir;
+struct options_info {
+	seqlock_t seqlock;
+	struct scoutfs_mount_options opts;
+	struct scoutfs_sysfs_attrs sysfs_attrs;
 };

-u32 scoutfs_option_u32(struct super_block *sb, int token)
-{
-	WARN_ON_ONCE(1);
-	return 0;
-}
+#define DECLARE_OPTIONS_INFO(sb, name) \
+	struct options_info *name = SCOUTFS_SB(sb)->options_info

 static int parse_bdev_path(struct super_block *sb, substring_t *substr,
 			      char **bdev_path_ret)
@@ -89,8 +97,29 @@ out:
 	return ret;
 }

-int scoutfs_parse_options(struct super_block *sb, char *options,
-			  struct mount_options *parsed)
+static void free_options(struct scoutfs_mount_options *opts)
+{
+	kfree(opts->metadev_path);
+}
+
+#define MIN_ORPHAN_SCAN_DELAY_MS	100UL
+#define DEFAULT_ORPHAN_SCAN_DELAY_MS	(10 * MSEC_PER_SEC)
+#define MAX_ORPHAN_SCAN_DELAY_MS	(60UL * MSEC_PER_SEC) /* unsigned: negative input compares > max */
+
+static void init_default_options(struct scoutfs_mount_options *opts)
+{
+	memset(opts, 0, sizeof(*opts));
+	opts->quorum_slot_nr = -1;
+	opts->orphan_scan_delay_ms = DEFAULT_ORPHAN_SCAN_DELAY_MS;
+}
+
+/*
+ * Parse the option string into our options struct.   This can allocate
+ * memory in the struct.  The caller is responsible for always calling
+ * free_options() when the struct is destroyed, including when we return
+ * an error.
+ */
+static int parse_options(struct super_block *sb, char *options, struct scoutfs_mount_options *opts)
 {
 	substring_t args[MAX_OPT_ARGS];
 	int nr;
@@ -98,49 +127,61 @@ int scoutfs_parse_options(struct super_block *sb, char *options,
 	char *p;
 	int ret;

-	/* Set defaults */
-	memset(parsed, 0, sizeof(*parsed));
-	parsed->quorum_slot_nr = -1;
-
 	while ((p = strsep(&options, ",")) != NULL) {
 		if (!*p)
 			continue;

 		token = match_token(p, tokens, args);
 		switch (token) {
-		case Opt_quorum_slot_nr:

-			if (parsed->quorum_slot_nr != -1) {
+		case Opt_metadev_path:
+			ret = parse_bdev_path(sb, &args[0], &opts->metadev_path);
+			if (ret < 0)
+				return ret;
+			break;
+
+		case Opt_orphan_scan_delay_ms:
+			if (opts->orphan_scan_delay_ms != DEFAULT_ORPHAN_SCAN_DELAY_MS) { /* default: not yet given */
+				scoutfs_err(sb, "multiple orphan_scan_delay_ms options provided, only provide one.");
+				return -EINVAL;
+			}
+
+			ret = match_int(args, &nr);
+			if (ret < 0 ||
+			    nr < MIN_ORPHAN_SCAN_DELAY_MS || nr > MAX_ORPHAN_SCAN_DELAY_MS) {
+				scoutfs_err(sb, "invalid orphan_scan_delay_ms option, must be between %lu and %lu",
+					    MIN_ORPHAN_SCAN_DELAY_MS, MAX_ORPHAN_SCAN_DELAY_MS);
+				if (ret == 0)
+					ret = -EINVAL;
+				return ret;
+			}
+			opts->orphan_scan_delay_ms = nr;
+			break;
+
+		case Opt_quorum_slot_nr:
+			if (opts->quorum_slot_nr != -1) {
 				scoutfs_err(sb, "multiple quorum_slot_nr options provided, only provide one.");
 				return -EINVAL;
 			}

 			ret = match_int(args, &nr);
-			if (ret < 0 || nr < 0 ||
-			    nr >= SCOUTFS_QUORUM_MAX_SLOTS) {
+			if (ret < 0 || nr < 0 || nr >= SCOUTFS_QUORUM_MAX_SLOTS) {
 				scoutfs_err(sb, "invalid quorum_slot_nr option, must be between 0 and %u",
 					    SCOUTFS_QUORUM_MAX_SLOTS - 1);
 				if (ret == 0)
 					ret = -EINVAL;
 				return ret;
 			}
-			parsed->quorum_slot_nr = nr;
+			opts->quorum_slot_nr = nr;
 			break;
-		case Opt_metadev_path:

-			ret = parse_bdev_path(sb, &args[0],
-						 &parsed->metadev_path);
-			if (ret < 0)
-				return ret;
-			break;
 		default:
 			scoutfs_err(sb, "Unknown or malformed option, \"%s\"", p);
 			return -EINVAL;
-			break;
 		}
 	}

-	if (!parsed->metadev_path) {
+	if (!opts->metadev_path) {
 		scoutfs_err(sb, "Required mount option \"metadev_path\" not found");
 		return -EINVAL;
 	}
@@ -148,40 +189,181 @@ int scoutfs_parse_options(struct super_block *sb, char *options,
 	return 0;
 }

-int scoutfs_options_setup(struct super_block *sb)
+void scoutfs_options_read(struct super_block *sb, struct scoutfs_mount_options *opts)
+{
+	DECLARE_OPTIONS_INFO(sb, optinf);
+	unsigned int seq;
+
+	if (WARN_ON_ONCE(optinf == NULL)) {
+		/* trying to use options before early setup or after destroy */
+		init_default_options(opts);
+		return;
+	}
+
+	do {
+		seq = read_seqbegin(&optinf->seqlock);
+		memcpy(opts, &optinf->opts, sizeof(struct scoutfs_mount_options));
+	} while (read_seqretry(&optinf->seqlock, seq));
+}
+
+/*
+ * Early setup that parses and stores the options so that the rest of
+ * setup can use them.   Full options setup that relies on other
+ * components will be done later.
+ */
+int scoutfs_options_early_setup(struct super_block *sb, char *options)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
-	struct options_sb_info *osi;
+	struct scoutfs_mount_options opts;
+	struct options_info *optinf;
 	int ret;

-	osi = kzalloc(sizeof(struct options_sb_info), GFP_KERNEL);
-	if (!osi)
-		return -ENOMEM;
+	init_default_options(&opts);

-	sbi->options = osi;
+	ret = parse_options(sb, options, &opts);
+	if (ret < 0)
+		goto out;

-	osi->debugfs_dir = debugfs_create_dir("options", sbi->debug_root);
-	if (!osi->debugfs_dir) {
+	optinf = kzalloc(sizeof(struct options_info), GFP_KERNEL);
+	if (!optinf) {
 		ret = -ENOMEM;
 		goto out;
 	}

+	seqlock_init(&optinf->seqlock);
+	scoutfs_sysfs_init_attrs(sb, &optinf->sysfs_attrs);
+
+	write_seqlock(&optinf->seqlock);
+	optinf->opts = opts;
+	write_sequnlock(&optinf->seqlock);
+
+	sbi->options_info = optinf;
 	ret = 0;
 out:
-	if (ret)
+	if (ret < 0)
+		free_options(&opts);
+
+	return ret;
+}
+
+int scoutfs_options_show(struct seq_file *seq, struct dentry *root)
+{
+	struct super_block *sb = root->d_sb;
+	struct scoutfs_mount_options opts;
+
+	scoutfs_options_read(sb, &opts);
+
+	seq_printf(seq, ",metadev_path=%s", opts.metadev_path);
+	seq_printf(seq, ",orphan_scan_delay_ms=%u", opts.orphan_scan_delay_ms);
+	if (opts.quorum_slot_nr >= 0)
+		seq_printf(seq, ",quorum_slot_nr=%d", opts.quorum_slot_nr);
+
+	return 0;
+}
+
+static ssize_t metadev_path_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+	struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
+	struct scoutfs_mount_options opts;
+
+	scoutfs_options_read(sb, &opts);
+
+	return snprintf(buf, PAGE_SIZE, "%s", opts.metadev_path);
+}
+SCOUTFS_ATTR_RO(metadev_path);
+
+static ssize_t orphan_scan_delay_ms_show(struct kobject *kobj, struct kobj_attribute *attr,
+					 char *buf)
+{
+	struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
+	struct scoutfs_mount_options opts;
+
+	scoutfs_options_read(sb, &opts);
+
+	return snprintf(buf, PAGE_SIZE, "%u", opts.orphan_scan_delay_ms);
+}
+static ssize_t orphan_scan_delay_ms_store(struct kobject *kobj, struct kobj_attribute *attr,
+					  const char *buf, size_t count)
+{
+	struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
+	DECLARE_OPTIONS_INFO(sb, optinf);
+	char nullterm[20]; /* more than enough for octal -U32_MAX */
+	long val;
+	int len;
+	int ret;
+
+	len = min(count, sizeof(nullterm) - 1);
+	memcpy(nullterm, buf, len);
+	nullterm[len] = '\0';
+
+	ret = kstrtol(nullterm, 0, &val);
+	if (ret < 0 || val < MIN_ORPHAN_SCAN_DELAY_MS || val > MAX_ORPHAN_SCAN_DELAY_MS) {
+		scoutfs_err(sb, "invalid orphan_scan_delay_ms value written to options sysfs file, must be between %lu and %lu",
+			    MIN_ORPHAN_SCAN_DELAY_MS, MAX_ORPHAN_SCAN_DELAY_MS);
+		return -EINVAL;
+	}
+
+	write_seqlock(&optinf->seqlock);
+	optinf->opts.orphan_scan_delay_ms = val;
+	write_sequnlock(&optinf->seqlock);
+
+	scoutfs_inode_schedule_orphan_dwork(sb);
+
+	return count;
+}
+SCOUTFS_ATTR_RW(orphan_scan_delay_ms);
+
+static ssize_t quorum_slot_nr_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+	struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
+	struct scoutfs_mount_options opts;
+
+	scoutfs_options_read(sb, &opts);
+
+	return snprintf(buf, PAGE_SIZE, "%d\n", opts.quorum_slot_nr);
+}
+SCOUTFS_ATTR_RO(quorum_slot_nr);
+
+static struct attribute *options_attrs[] = {
+	SCOUTFS_ATTR_PTR(metadev_path),
+	SCOUTFS_ATTR_PTR(orphan_scan_delay_ms),
+	SCOUTFS_ATTR_PTR(quorum_slot_nr),
+	NULL,
+};
+
+int scoutfs_options_setup(struct super_block *sb)
+{
+	DECLARE_OPTIONS_INFO(sb, optinf);
+	int ret;
+
+	ret = scoutfs_sysfs_create_attrs(sb, &optinf->sysfs_attrs, options_attrs, "mount_options");
+	if (ret < 0)
 		scoutfs_options_destroy(sb);
 	return ret;
 }

+/*
+ * We remove the sysfs files early in unmount so that they can't try to call other subsystems
+ * as they're being destroyed.
+ */
+void scoutfs_options_stop(struct super_block *sb)
+{
+	DECLARE_OPTIONS_INFO(sb, optinf);
+
+	if (optinf)
+		scoutfs_sysfs_destroy_attrs(sb, &optinf->sysfs_attrs);
+}
+
 void scoutfs_options_destroy(struct super_block *sb)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
-	struct options_sb_info *osi = sbi->options;
+	DECLARE_OPTIONS_INFO(sb, optinf);

-	if (osi) {
-		if (osi->debugfs_dir)
-			debugfs_remove_recursive(osi->debugfs_dir);
-		kfree(osi);
-		sbi->options = NULL;
+	scoutfs_options_stop(sb);
+
+	if (optinf) {
+		free_options(&optinf->opts);
+		kfree(optinf);
+		sbi->options_info = NULL;
 	}
 }
--- a/kmod/src/options.h
+++ b/kmod/src/options.h
@@ -5,23 +5,19 @@
 #include <linux/in.h>
 #include "format.h"

-enum scoutfs_mount_options {
-	Opt_quorum_slot_nr,
-	Opt_metadev_path,
-	Opt_err,
-};
-
-struct mount_options {
-	int quorum_slot_nr;
+struct scoutfs_mount_options {
 	char *metadev_path;
+	unsigned int orphan_scan_delay_ms;
+	int quorum_slot_nr;
+
 };

-int scoutfs_parse_options(struct super_block *sb, char *options,
-			  struct mount_options *parsed);
+void scoutfs_options_read(struct super_block *sb, struct scoutfs_mount_options *opts);
+int scoutfs_options_show(struct seq_file *seq, struct dentry *root);
+
+int scoutfs_options_early_setup(struct super_block *sb, char *options);
 int scoutfs_options_setup(struct super_block *sb);
+void scoutfs_options_stop(struct super_block *sb);
 void scoutfs_options_destroy(struct super_block *sb);

-u32 scoutfs_option_u32(struct super_block *sb, int token);
-#define scoutfs_option_bool scoutfs_option_u32
-
 #endif	/* _SCOUTFS_OPTIONS_H_ */
--- a/kmod/src/quorum.c
+++ b/kmod/src/quorum.c
@@ -116,6 +116,7 @@ struct quorum_info {
 	struct socket *sock;
 	bool shutdown;

+	int our_quorum_slot_nr;
 	unsigned long flags;
 	int votes_needed;

@@ -160,9 +161,7 @@ static ktime_t heartbeat_timeout(void)
 static int create_socket(struct super_block *sb)
 {
 	DECLARE_QUORUM_INFO(sb, qinf);
-	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
-	struct mount_options *opts = &sbi->opts;
-	struct scoutfs_super_block *super = &sbi->super;
+	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
 	struct socket *sock = NULL;
 	struct sockaddr_in sin;
 	int addrlen;
@@ -176,7 +175,7 @@ static int create_socket(struct super_block *sb)

 	sock->sk->sk_allocation = GFP_NOFS;

-	scoutfs_quorum_slot_sin(super, opts->quorum_slot_nr, &sin);
+	scoutfs_quorum_slot_sin(super, qinf->our_quorum_slot_nr, &sin);

 	addrlen = sizeof(sin);
 	ret = kernel_bind(sock, (struct sockaddr *)&sin, addrlen);
@@ -207,7 +206,6 @@ static void send_msg_members(struct super_block *sb, int type, u64 term,
 			     int only)
 {
 	DECLARE_QUORUM_INFO(sb, qinf);
-	struct mount_options *opts = &SCOUTFS_SB(sb)->opts;
 	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
 	ktime_t now;
 	int i;
@@ -216,7 +214,7 @@ static void send_msg_members(struct super_block *sb, int type, u64 term,
 		.fsid = super->hdr.fsid,
 		.term = cpu_to_le64(term),
 		.type = type,
-		.from = opts->quorum_slot_nr,
+		.from = qinf->our_quorum_slot_nr,
 	};
 	struct kvec kv =  {
 		.iov_base = &qmes,
@@ -238,7 +236,7 @@ static void send_msg_members(struct super_block *sb, int type, u64 term,

 	for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
 		if (!quorum_slot_present(super, i) ||
-		    (only >= 0 && i != only) || i == opts->quorum_slot_nr)
+		    (only >= 0 && i != only) || i == qinf->our_quorum_slot_nr)
 			continue;

 		scoutfs_quorum_slot_sin(super, i, &sin);
@@ -476,8 +474,8 @@ static int write_quorum_block(struct super_block *sb, u64 blkno, struct scoutfs_
 */
 static int update_quorum_block(struct super_block *sb, int event, u64 term, bool check_rid)
 {
-	struct mount_options *opts = &SCOUTFS_SB(sb)->opts;
-	u64 blkno = SCOUTFS_QUORUM_BLKNO + opts->quorum_slot_nr;
+	DECLARE_QUORUM_INFO(sb, qinf);
+	u64 blkno = SCOUTFS_QUORUM_BLKNO + qinf->our_quorum_slot_nr;
 	struct scoutfs_quorum_block blk;
 	int ret;

@@ -622,7 +620,6 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 {
 	struct quorum_info *qinf = container_of(work, struct quorum_info, work);
 	struct super_block *sb = qinf->sb;
-	struct mount_options *opts = &SCOUTFS_SB(sb)->opts;
 	struct sockaddr_in unused;
 	struct quorum_host_msg msg;
 	struct quorum_status qst;
@@ -724,7 +721,7 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 			qst.term++;
 			qst.vote_for = -1;
 			qst.vote_bits = 0;
-			set_bit(opts->quorum_slot_nr, &qst.vote_bits);
+			set_bit(qinf->our_quorum_slot_nr, &qst.vote_bits);
 			send_msg_others(sb, SCOUTFS_QUORUM_MSG_REQUEST_VOTE,
 					qst.term);
 			qst.timeout = election_timeout();
@@ -954,7 +951,6 @@ static ssize_t status_show(struct kobject *kobj, struct kobj_attribute *attr,
 			   char *buf)
 {
 	DECLARE_QUORUM_INFO_KOBJ(kobj, qinf);
-	struct mount_options *opts = &SCOUTFS_SB(qinf->sb)->opts;
 	struct quorum_status qst;
 	struct last_msg last;
 	struct timespec64 ts;
@@ -971,7 +967,7 @@ static ssize_t status_show(struct kobject *kobj, struct kobj_attribute *attr,
 	ret = 0;

 	snprintf_ret(buf, size, &ret, "quorum_slot_nr %u\n",
-		     opts->quorum_slot_nr);
+		     qinf->our_quorum_slot_nr);
 	snprintf_ret(buf, size, &ret, "term %llu\n",
 		     qst.term);
 	snprintf_ret(buf, size, &ret, "role %d (%s)\n",
@@ -1048,7 +1044,6 @@ static inline bool valid_ipv4_port(__be16 port)
 static int verify_quorum_slots(struct super_block *sb)
 {
 	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
-	struct mount_options *opts = &SCOUTFS_SB(sb)->opts;
 	char slots[(SCOUTFS_QUORUM_MAX_SLOTS * 3) + 1];
 	DECLARE_QUORUM_INFO(sb, qinf);
 	struct sockaddr_in other;
@@ -1099,7 +1094,7 @@ static int verify_quorum_slots(struct super_block *sb)
 		return -EINVAL;
 	}

-	if (!quorum_slot_present(super, opts->quorum_slot_nr)) {
+	if (!quorum_slot_present(super, qinf->our_quorum_slot_nr)) {
 		char *str = slots;
 		*str = '\0';
 		for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
@@ -1114,7 +1109,7 @@ static int verify_quorum_slots(struct super_block *sb)
 			}
 		}
 		scoutfs_err(sb, "quorum_slot_nr=%u option references unused slot, must be one of the following configured slots:%s",
-			    opts->quorum_slot_nr, slots);
+			    qinf->our_quorum_slot_nr, slots);
 		return -EINVAL;
 	}

@@ -1137,11 +1132,12 @@ static int verify_quorum_slots(struct super_block *sb)
 int scoutfs_quorum_setup(struct super_block *sb)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
-	struct mount_options *opts = &sbi->opts;
+	struct scoutfs_mount_options opts;
 	struct quorum_info *qinf;
 	int ret;

-	if (opts->quorum_slot_nr < 0)
+	scoutfs_options_read(sb, &opts);
+	if (opts.quorum_slot_nr < 0)
 		return 0;

 	qinf = kzalloc(sizeof(struct quorum_info), GFP_KERNEL);
@@ -1153,6 +1149,8 @@ int scoutfs_quorum_setup(struct super_block *sb)
 	spin_lock_init(&qinf->show_lock);
 	INIT_WORK(&qinf->work, scoutfs_quorum_worker);
 	scoutfs_sysfs_init_attrs(sb, &qinf->ssa);
+	/* static for the lifetime of the mount */
+	qinf->our_quorum_slot_nr = opts.quorum_slot_nr;

 	sbi->quorum_info = qinf;
 	qinf->sb = sb;
--- a/kmod/src/scoutfs_trace.h
+++ b/kmod/src/scoutfs_trace.h
@@ -2620,9 +2620,9 @@ TRACE_EVENT(scoutfs_item_invalidate_page,

 DECLARE_EVENT_CLASS(scoutfs_omap_group_class,
 	TP_PROTO(struct super_block *sb, void *grp, u64 group_nr, unsigned int group_total,
-		 int bit_nr, int bit_count),
+		 int bit_nr),

-	TP_ARGS(sb, grp, group_nr, group_total, bit_nr, bit_count),
+	TP_ARGS(sb, grp, group_nr, group_total, bit_nr),

 	TP_STRUCT__entry(
 		SCSB_TRACE_FIELDS
@@ -2630,7 +2630,6 @@ DECLARE_EVENT_CLASS(scoutfs_omap_group_class,
 		__field(__u64, group_nr)
 		__field(unsigned int, group_total)
 		__field(int, bit_nr)
-		__field(int, bit_count)
 	),

 	TP_fast_assign(
@@ -2639,43 +2638,42 @@ DECLARE_EVENT_CLASS(scoutfs_omap_group_class,
 		__entry->group_nr = group_nr;
 		__entry->group_total = group_total;
 		__entry->bit_nr = bit_nr;
-		__entry->bit_count = bit_count;
 	),

-	TP_printk(SCSBF" grp %p group_nr %llu group_total %u bit_nr %d bit_count %d",
+	TP_printk(SCSBF" grp %p group_nr %llu group_total %u bit_nr %d",
 		  SCSB_TRACE_ARGS, __entry->grp, __entry->group_nr, __entry->group_total,
-		  __entry->bit_nr, __entry->bit_count)
+		  __entry->bit_nr)
 );

 DEFINE_EVENT(scoutfs_omap_group_class, scoutfs_omap_group_alloc,
 	TP_PROTO(struct super_block *sb, void *grp, u64 group_nr, unsigned int group_total,
-		 int bit_nr, int bit_count),
-	TP_ARGS(sb, grp, group_nr, group_total, bit_nr, bit_count)
+		 int bit_nr),
+	TP_ARGS(sb, grp, group_nr, group_total, bit_nr)
 );
 DEFINE_EVENT(scoutfs_omap_group_class, scoutfs_omap_group_free,
 	TP_PROTO(struct super_block *sb, void *grp, u64 group_nr, unsigned int group_total,
-		 int bit_nr, int bit_count),
-	TP_ARGS(sb, grp, group_nr, group_total, bit_nr, bit_count)
+		 int bit_nr),
+	TP_ARGS(sb, grp, group_nr, group_total, bit_nr)
 );
 DEFINE_EVENT(scoutfs_omap_group_class, scoutfs_omap_group_inc,
 	TP_PROTO(struct super_block *sb, void *grp, u64 group_nr, unsigned int group_total,
-		 int bit_nr, int bit_count),
-	TP_ARGS(sb, grp, group_nr, group_total, bit_nr, bit_count)
+		 int bit_nr),
+	TP_ARGS(sb, grp, group_nr, group_total, bit_nr)
 );
 DEFINE_EVENT(scoutfs_omap_group_class, scoutfs_omap_group_dec,
 	TP_PROTO(struct super_block *sb, void *grp, u64 group_nr, unsigned int group_total,
-		 int bit_nr, int bit_count),
-	TP_ARGS(sb, grp, group_nr, group_total, bit_nr, bit_count)
+		 int bit_nr),
+	TP_ARGS(sb, grp, group_nr, group_total, bit_nr)
 );
 DEFINE_EVENT(scoutfs_omap_group_class, scoutfs_omap_group_request,
 	TP_PROTO(struct super_block *sb, void *grp, u64 group_nr, unsigned int group_total,
-		 int bit_nr, int bit_count),
-	TP_ARGS(sb, grp, group_nr, group_total, bit_nr, bit_count)
+		 int bit_nr),
+	TP_ARGS(sb, grp, group_nr, group_total, bit_nr)
 );
 DEFINE_EVENT(scoutfs_omap_group_class, scoutfs_omap_group_destroy,
 	TP_PROTO(struct super_block *sb, void *grp, u64 group_nr, unsigned int group_total,
-		 int bit_nr, int bit_count),
-	TP_ARGS(sb, grp, group_nr, group_total, bit_nr, bit_count)
+		 int bit_nr),
+	TP_ARGS(sb, grp, group_nr, group_total, bit_nr)
 );

 TRACE_EVENT(scoutfs_omap_should_delete,
--- a/kmod/src/server.c
+++ b/kmod/src/server.c
@@ -3842,8 +3842,8 @@ static void scoutfs_server_worker(struct work_struct *work)
 	struct super_block *sb = server->sb;
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct scoutfs_super_block *super = &sbi->super;
-	struct mount_options *opts = &sbi->opts;
 	struct scoutfs_net_connection *conn = NULL;
+	struct scoutfs_mount_options opts;
 	DECLARE_WAIT_QUEUE_HEAD(waitq);
 	struct sockaddr_in sin;
 	bool alloc_init = false;
@@ -3852,7 +3852,8 @@ static void scoutfs_server_worker(struct work_struct *work)

 	trace_scoutfs_server_work_enter(sb, 0, 0);

-	scoutfs_quorum_slot_sin(super, opts->quorum_slot_nr, &sin);
+	scoutfs_options_read(sb, &opts);
+	scoutfs_quorum_slot_sin(super, opts.quorum_slot_nr, &sin);
 	scoutfs_info(sb, "server starting at "SIN_FMT, SIN_ARG(&sin));

 	scoutfs_block_writer_init(sb, &server->wri);
--- a/kmod/src/super.c
+++ b/kmod/src/super.c
@@ -132,44 +132,6 @@ out:
 	return ret;
 }

-static int scoutfs_show_options(struct seq_file *seq, struct dentry *root)
-{
-	struct super_block *sb = root->d_sb;
-	struct mount_options *opts = &SCOUTFS_SB(sb)->opts;
-
-	if (opts->quorum_slot_nr >= 0)
-		seq_printf(seq, ",quorum_slot_nr=%d", opts->quorum_slot_nr);
-	seq_printf(seq, ",metadev_path=%s", opts->metadev_path);
-
-	return 0;
-}
-
-static ssize_t metadev_path_show(struct kobject *kobj,
-				 struct kobj_attribute *attr, char *buf)
-{
-	struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
-	struct mount_options *opts = &SCOUTFS_SB(sb)->opts;
-
-	return snprintf(buf, PAGE_SIZE, "%s", opts->metadev_path);
-}
-SCOUTFS_ATTR_RO(metadev_path);
-
-static ssize_t quorum_server_nr_show(struct kobject *kobj,
-			      struct kobj_attribute *attr, char *buf)
-{
-	struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
-	struct mount_options *opts = &SCOUTFS_SB(sb)->opts;
-
-	return snprintf(buf, PAGE_SIZE, "%d\n", opts->quorum_slot_nr);
-}
-SCOUTFS_ATTR_RO(quorum_server_nr);
-
-static struct attribute *mount_options_attrs[] = {
-	SCOUTFS_ATTR_PTR(metadev_path),
-	SCOUTFS_ATTR_PTR(quorum_server_nr),
-	NULL,
-};
-
 static int scoutfs_sync_fs(struct super_block *sb, int wait)
 {
 	trace_scoutfs_sync_fs(sb, wait);
@@ -246,13 +208,11 @@ static void scoutfs_put_super(struct super_block *sb)
 	scoutfs_destroy_triggers(sb);
 	scoutfs_fence_destroy(sb);
 	scoutfs_options_destroy(sb);
-	scoutfs_sysfs_destroy_attrs(sb, &sbi->mopts_ssa);
 	debugfs_remove(sbi->debug_root);
 	scoutfs_destroy_counters(sb);
 	scoutfs_destroy_sysfs(sb);
 	scoutfs_metadev_close(sb);

-	kfree(sbi->opts.metadev_path);
 	kfree(sbi);

 	sb->s_fs_info = NULL;
@@ -282,7 +242,7 @@ static const struct super_operations scoutfs_super_ops = {
 	.destroy_inode = scoutfs_destroy_inode,
 	.sync_fs = scoutfs_sync_fs,
 	.statfs = scoutfs_statfs,
-	.show_options = scoutfs_show_options,
+	.show_options = scoutfs_options_show,
 	.put_super = scoutfs_put_super,
 	.umount_begin = scoutfs_umount_begin,
 };
@@ -511,9 +471,9 @@ out:

 static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
 {
-	struct scoutfs_sb_info *sbi;
-	struct mount_options opts;
+	struct scoutfs_mount_options opts;
 	struct block_device *meta_bdev;
+	struct scoutfs_sb_info *sbi;
 	struct inode *inode;
 	int ret;

@@ -541,13 +501,12 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
 	spin_lock_init(&sbi->next_ino_lock);
 	spin_lock_init(&sbi->data_wait_root.lock);
 	sbi->data_wait_root.root = RB_ROOT;
-	scoutfs_sysfs_init_attrs(sb, &sbi->mopts_ssa);

-	ret = scoutfs_parse_options(sb, data, &opts);
-	if (ret)
-		goto out;
-
-	sbi->opts = opts;
+	/* parse options early for use during setup */
+	ret = scoutfs_options_early_setup(sb, data);
+	if (ret < 0)
+		return ret;
+	scoutfs_options_read(sb, &opts);

 	ret = sb_set_blocksize(sb, SCOUTFS_BLOCK_SM_SIZE);
 	if (ret != SCOUTFS_BLOCK_SM_SIZE) {
@@ -556,9 +515,7 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
 		goto out;
 	}

-	meta_bdev =
-		blkdev_get_by_path(sbi->opts.metadev_path,
-				   SCOUTFS_META_BDEV_MODE, sb);
+	meta_bdev = blkdev_get_by_path(opts.metadev_path, SCOUTFS_META_BDEV_MODE, sb);
 	if (IS_ERR(meta_bdev)) {
 		scoutfs_err(sb, "could not open metadev: error %ld",
 			    PTR_ERR(meta_bdev));
@@ -578,8 +535,6 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
 	      scoutfs_setup_sysfs(sb) ?:
 	      scoutfs_setup_counters(sb) ?:
 	      scoutfs_options_setup(sb) ?:
-	      scoutfs_sysfs_create_attrs(sb, &sbi->mopts_ssa,
-				mount_options_attrs, "mount_options") ?:
 	      scoutfs_setup_triggers(sb) ?:
 	      scoutfs_fence_setup(sb) ?:
 	      scoutfs_block_setup(sb) ?:
@@ -652,6 +607,7 @@ static void scoutfs_kill_sb(struct super_block *sb)
 	}

 	if (SCOUTFS_HAS_SBI(sb)) {
+		scoutfs_options_stop(sb);
 		scoutfs_inode_orphan_stop(sb);
 		scoutfs_lock_unmount_begin(sb);
 	}
--- a/kmod/src/super.h
+++ b/kmod/src/super.h
@@ -44,6 +44,7 @@ struct scoutfs_sb_info {

 	spinlock_t next_ino_lock;

+	struct options_info *options_info;
 	struct data_info *data_info;
 	struct inode_sb_info *inode_sb_info;
 	struct btree_info *btree_info;
@@ -74,10 +75,6 @@ struct scoutfs_sb_info {
 	struct scoutfs_counters *counters;
 	struct scoutfs_triggers *triggers;

-	struct mount_options opts;
-	struct options_sb_info *options;
-	struct scoutfs_sysfs_attrs mopts_ssa;
-
 	struct dentry *debug_root;

 	bool forced_unmount;
--- a/kmod/src/trans.c
+++ b/kmod/src/trans.c
@@ -640,6 +640,7 @@ void scoutfs_shutdown_trans(struct super_block *sb)
 			tri->write_workq = NULL;
 		}

+		scoutfs_alloc_prepare_commit(sb, &tri->alloc, &tri->wri);
 		scoutfs_block_writer_forget_all(sb, &tri->wri);

 		kfree(tri);
--- a/tests/.gitignore
+++ b/tests/.gitignore
@@ -3,6 +3,7 @@ src/createmany
 src/dumb_renameat2
 src/dumb_setxattr
 src/handle_cat
+src/handle_fsetxattr
 src/bulk_create_paths
 src/find_xattrs
 src/stage_tmpfile
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -6,6 +6,7 @@ BIN := src/createmany			\
 	src/dumb_renameat2		\
 	src/dumb_setxattr		\
 	src/handle_cat			\
+	src/handle_fsetxattr		\
 	src/bulk_create_paths		\
 	src/stage_tmpfile		\
 	src/find_xattrs			\
--- a/tests/funcs/fs.sh
+++ b/tests/funcs/fs.sh
@@ -362,3 +362,49 @@ t_wait_for_leader() {
 		done
 	done
 }
+
+# Write value $3 to the sysfs file of mount option $2 in the mount
+# numbered $1.
+t_set_sysfs_mount_option() {
+	local nr="$1" name="$2" val="$3"
+	local sysfs_file="$(t_sysfs_path $nr)/mount_options/$name"
+
+	echo "$val" > "$sysfs_file"
+}
+
+# Write value $2 to the sysfs mount option named $1 in every mount.
+t_set_all_sysfs_mount_options() {
+	local name="$1" val="$2"
+	local i
+
+	for i in $(t_fs_nrs); do
+		t_set_sysfs_mount_option "$i" "$name" "$val"
+	done
+}
+
+# saved option values, keyed by "<option name>_<mount nr>"
+declare -A _saved_opts
+t_save_all_sysfs_mount_options() {
+	local name="$1"
+	local ind opt i
+
+	for i in $(t_fs_nrs); do
+		opt="$(t_sysfs_path $i)/mount_options/$name"
+		# braces matter: "$name_$i" would expand the unset $name_
+		ind="${name}_$i"
+
+		_saved_opts[$ind]="$(cat "$opt")"
+	done
+}
+
+# write back the values recorded by t_save_all_sysfs_mount_options
+t_restore_all_sysfs_mount_options() {
+	local name="$1"
+	local ind i
+
+	for i in $(t_fs_nrs); do
+		ind="${name}_$i"
+
+		t_set_sysfs_mount_option "$i" "$name" "${_saved_opts[$ind]}"
+	done
+}
--- a/tests/golden/orphan-inodes
+++ b/tests/golden/orphan-inodes
@@ -2,3 +2,4 @@
 == unlinked and opened inodes still exist
 == orphan from failed evict deletion is picked up
 == orphaned inos in all mounts all deleted
+== 30s of racing evict deletion, orphan scanning, and open by handle
--- a/tests/src/handle_fsetxattr.c
+++ b/tests/src/handle_fsetxattr.c
@@ -0,0 +1,189 @@
+/*
+ * Copyright (C) 2022 Versity Software, Inc.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <errno.h>
+#include <string.h>
+#include <endian.h>
+#include <time.h>
+#include <linux/types.h>
+#include <sys/xattr.h>
+
+#define FILEID_SCOUTFS			0x81
+#define FILEID_SCOUTFS_WITH_PARENT	0x82
+
+struct our_handle {
+	struct file_handle handle;
+	/*
+	 * scoutfs file handle can be ino or ino/parent. The
+	 * handle_type field of struct file_handle denotes which
+	 * version is in use. We only use the ino variant here.
+	 */
+	__le64 scoutfs_ino;
+};
+
+#define DEFAULT_NAME "user.handle_fsetxattr"
+#define DEFAULT_VALUE "value"
+/* Print the option summary to stdout and exit with status 1. */
+static void exit_usage(void)
+{
+	printf(" -h/-?         output this usage message and exit\n"
+	       " -e            keep trying on enoent, consider success an error\n"
+	       " -i <num>      64bit inode number for handle open, can be multiple\n"
+	       " -m <string>   scoutfs mount path string for ioctl fd\n"
+	       " -n <string>   optional xattr name string, defaults to \""DEFAULT_NAME"\"\n"
+	       " -s <num>      loop for num seconds, defaults to 0 for one iteration\n"
+	       " -v <string>   optional xattr value string, defaults to \""DEFAULT_VALUE"\"\n");
+	exit(1);
+}
+
+/*
+ * Open every -i inode by handle through the -m mount and fsetxattr it,
+ * repeating until -s seconds have passed (default 0: a single pass).
+ * With -e, ENOENT opens are retried and any successful open is a failure.
+ */
+int main(int argc, char **argv)
+{
+	struct our_handle handle;
+	struct timespec ts;
+	bool enoent_success_err = false;
+	uint64_t seconds = 0;
+	const char *value = DEFAULT_VALUE;
+	const char *name = DEFAULT_NAME;
+	const char *mnt = NULL;
+	int nr_inos = 0;
+	uint64_t *inos;
+	uint64_t i;
+	int *fds;
+	int mntfd;
+	int fd;
+	int c;	/* int: getopt() returns -1, which an unsigned char can't hold */
+	int j;
+
+	/* can't have more inos than args */
+	inos = calloc(argc, sizeof(inos[0]));
+	fds = calloc(argc, sizeof(fds[0]));
+	if (!inos || !fds) {
+		perror("calloc");
+		exit(1);
+	}
+	for (j = 0; j < argc; j++)
+		fds[j] = -1;
+
+	while ((c = getopt(argc, argv, "+ei:m:n:s:v:")) != -1) {
+		switch (c) {
+			case 'e':
+				enoent_success_err = true;
+				break;
+			case 'i':
+				inos[nr_inos++] = strtoull(optarg, NULL, 0);
+				break;
+			case 'm':
+				mnt = optarg;
+				break;
+			case 'n':
+				name = optarg;
+				break;
+			case 's':
+				seconds = strtoull(optarg, NULL, 0);
+				break;
+			case 'v':
+				value = optarg;
+				break;
+			case '?':
+				printf("unknown argument: %c\n", optopt);
+				/* fallthrough */
+			case 'h':
+				exit_usage();
+		}
+	}
+
+	if (nr_inos == 0) {
+		printf("specify non-zero inode number with -i\n");
+		exit(1);
+	}
+
+	if (!mnt) {
+		printf("specify scoutfs mount path for ioctl with -m\n");
+		exit(1);
+	}
+
+	mntfd = open(mnt, O_RDONLY);
+	if (mntfd == -1) {
+		perror("opening mountpoint");
+		return 1;
+	}
+
+	/* turn the requested duration into an absolute deadline */
+	clock_gettime(CLOCK_REALTIME, &ts);
+	seconds += ts.tv_sec;
+
+	for (i = 0; ; i++) {
+		for (j = 0; j < nr_inos; j++) {
+			fd = fds[j];
+
+			if (fd < 0) {
+				handle.handle.handle_bytes = sizeof(struct our_handle);
+				handle.handle.handle_type = FILEID_SCOUTFS;
+				handle.scoutfs_ino = htole64(inos[j]);
+
+				fd = open_by_handle_at(mntfd, &handle.handle, O_RDWR);
+				if (fd == -1) {
+					if (!enoent_success_err || errno != ENOENT) {
+						perror("open_by_handle_at");
+						return 1;
+					}
+					/* -e: tolerate ENOENT and retry on the next pass */
+					continue;
+				}
+				fds[j] = fd;
+			}
+
+			if (fsetxattr(fd, name, value, strlen(value), 0) < 0) {
+				perror("fsetxattr");
+				return 1;
+			}
+		}
+
+		if ((i % 10) == 0) {
+			clock_gettime(CLOCK_REALTIME, &ts);
+			if (ts.tv_sec >= seconds)
+				break;
+		}
+	}
+
+	if (enoent_success_err) {
+		bool able = false;
+		for (j = 0; j < nr_inos; j++) {
+			if (fds[j] >= 0) {
+				printf("was able to open ino %"PRIu64"\n", inos[j]);
+				able = true;
+			}
+		}
+		if (able)
+			exit(1);
+	}
+
+	/* not bothering to close or free */
+	return 0;
+}
--- a/tests/tests/orphan-inodes.sh
+++ b/tests/tests/orphan-inodes.sh
@@ -30,6 +30,13 @@ inode_exists()
 	test "$?" == 0 -a "$(head -1 $T_TMP.inos.log)" == "$ino"
 }

+t_save_all_sysfs_mount_options orphan_scan_delay_ms
+restore_delays()
+{
+	t_restore_all_sysfs_mount_options orphan_scan_delay_ms
+}
+trap restore_delays EXIT
+
 echo "== test our inode existance function"
 path="$T_D0/file"
 touch "$path"
@@ -38,6 +45,7 @@ inode_exists $ino || echo "$ino didn't exist"

 echo "== unlinked and opened inodes still exist"
 sleep 1000000 < "$path" &
+sleep .1 # wait for background sleep to run and open stdin
 pid="$!"
 rm -f "$path"
 inode_exists $ino || echo "$ino didn't exist"
@@ -45,7 +53,8 @@ inode_exists $ino || echo "$ino didn't exist"
 echo "== orphan from failed evict deletion is picked up"
 # pending kill signal stops evict from getting locks and deleting
 silent_kill $pid
-sleep 55
+t_set_sysfs_mount_option 0 orphan_scan_delay_ms 1000
+sleep 5
 inode_exists $ino && echo "$ino still exists"

 echo "== orphaned inos in all mounts all deleted"
@@ -56,6 +65,7 @@ for nr in $(t_fs_nrs); do
 	touch "$path"
 	inos="$inos $(stat -c %i $path)"
 	sleep 1000000 < "$path" &
+	sleep .1 # wait for background sleep to run and open stdin
 	pids="$pids $!"
 	rm -f "$path"
 done
@@ -70,9 +80,63 @@ while test -d $(echo /sys/fs/scoutfs/*/fence/* | cut -d " " -f 1); do
 	sleep .5
 done
 # wait for orphan scans to run
-sleep 55
+t_set_all_sysfs_mount_options orphan_scan_delay_ms 1000
+# also have to wait for delayed log merge work from mount
+sleep 15
 for ino in $inos; do
 	inode_exists $ino && echo "$ino still exists"
 done

+RUNTIME=30
+echo "== ${RUNTIME}s of racing evict deletion, orphan scanning, and open by handle"
+
+# exclude last client mount
+last=""
+for nr in $(t_fs_nrs); do
+	last=$nr
+done
+
+END=$((SECONDS + RUNTIME))
+while [ $SECONDS -lt $END ]; do
+	# hold open per-mount unlinked files
+	pids=""
+	ino_args=""
+	for nr in $(t_fs_nrs); do
+		test $nr == $last && continue
+		eval path="\$T_D${nr}/racing-$nr"
+		touch "$path"
+		ino_args="$ino_args -i $(stat -c %i $path)"
+
+		sleep 1000000 < "$path" &
+		sleep .1 # wait for sleep to start and open input :/
+		pids="$pids $!"
+		rm -f "$path"
+	done
+
+	# remount excluded last client to force log merging and make orphan visible
+	sync
+	t_umount $last
+	t_mount $last
+
+	# get all mounts scanning orphans at high frequency
+	t_set_all_sysfs_mount_options orphan_scan_delay_ms 100
+
+	# spin having tasks in each mount trying to open/fsetxattr all inos
+	fsx_pids=""
+	for nr in $(t_fs_nrs); do
+		test $nr == $last && continue
+		eval path="\$T_M${nr}"
+		handle_fsetxattr -e $ino_args -m "$path" -s 2 &
+		fsx_pids="$fsx_pids $!"
+	done
+
+	# trigger eviction deletion of each file in each mount
+	silent_kill $pids
+	# a bare "wait" always returns 0, so check each task's own status
+	for p in $fsx_pids; do wait $p || t_fail "handle_fsetxattr failed"; done
+
+	# slow down orphan scanning for the next iteration
+	t_set_all_sysfs_mount_options orphan_scan_delay_ms $(((RUNTIME * 2) * 1000))
+done
+
 t_pass
--- a/utils/man/scoutfs.5
+++ b/utils/man/scoutfs.5
@@ -21,6 +21,21 @@ contains the filesystem's metadata.
 .sp
 This option is required.
 .TP
+.B orphan_scan_delay_ms=<number>
+This option sets the average expected delay, in milliseconds, between
+each mount's scan of the global orphaned inode list.  Jitter is added to
+avoid contention so each individual delay between scans is a random
+value up to 20% less than or greater than this average expected delay.
+.sp
+The minimum value for this option is 100ms which is very short and is
+only reasonable for testing or experiments.   The default is 10000ms (10
+seconds) and the maximum is 60000ms (1 minute).
+.sp
+This option can be changed in an active mount by writing to its file in
+the mount_options directory in the mount's sysfs directory.  Writing a
+new value will cause the next pending orphan scan to be rescheduled
+with the newly written delay time.
+.TP
 .B quorum_slot_nr=<number>
 The quorum_slot_nr option assigns a quorum member slot to the mount.
 The mount will use the slot assignment to claim exclusive ownership of