Mirror of https://github.com/versity/scoutfs.git (synced 2026-04-30 09:56:55 +00:00)

Comparing branches zab/invali... and zab/read_m..., 25 commits.
Commits (only the SHA1 column survived the mirror):

8efb30afbc
df90b3eb90
6eeaab3322
ac68d14b8d
ecfc8a0d0e
63148d426e
dfc2f7a4e8
94dd86f762
841d22e26e
ba8bf13ae1
2949b6063f
1e88aa6c0f
d9aea98220
04f4b8bcb3
fead263af3
4389c73c14
dba88705f7
715c29aad3
b244b2d59c
22371fe5bd
c6fd807638
592f472a1c
a65775588f
da1af9b841
accd680a7e
kmod/src/Makefile:

@@ -27,9 +27,11 @@ scoutfs-y += \
 	lock_server.o \
 	msg.o \
 	net.o \
+	omap.o \
 	options.o \
 	per_task.o \
 	quorum.o \
+	recov.o \
 	scoutfs_trace.o \
 	server.o \
 	sort_priv.o \
kmod/src/block.c:

@@ -128,6 +128,7 @@ static __le32 block_calc_crc(struct scoutfs_block_header *hdr, u32 size)
 static struct block_private *block_alloc(struct super_block *sb, u64 blkno)
 {
 	struct block_private *bp;
+	unsigned int noio_flags;
 
 	/*
 	 * If we had multiple blocks per page we'd need to be a little
@@ -147,8 +148,19 @@ static struct block_private *block_alloc(struct super_block *sb, u64 blkno)
 		set_bit(BLOCK_BIT_PAGE_ALLOC, &bp->bits);
 		bp->bl.data = page_address(bp->page);
 	} else {
-		bp->virt = __vmalloc(SCOUTFS_BLOCK_LG_SIZE,
-				     GFP_NOFS | __GFP_HIGHMEM, PAGE_KERNEL);
+		/*
+		 * __vmalloc doesn't pass the gfp flags down to pte
+		 * allocs, they're done with user alloc flags.
+		 * Unfortunately, some lockdep doesn't know that
+		 * PF_NOMEMALLOC prevents __GFP_FS reclaim and generates
+		 * spurious reclaim-on dependencies and warnings.
+		 */
+		lockdep_off();
+		noio_flags = memalloc_noio_save();
+		bp->virt = __vmalloc(SCOUTFS_BLOCK_LG_SIZE, GFP_NOFS | __GFP_HIGHMEM, PAGE_KERNEL);
+		memalloc_noio_restore(noio_flags);
+		lockdep_on();
+
 		if (!bp->virt) {
 			kfree(bp);
 			bp = NULL;
@@ -1245,7 +1257,7 @@ out:
 	if (ret)
 		scoutfs_block_destroy(sb);
 
-	return 0;
+	return ret;
 }
 
 void scoutfs_block_destroy(struct super_block *sb)
kmod/src/client.c:

@@ -31,6 +31,7 @@
 #include "net.h"
 #include "endian_swap.h"
 #include "quorum.h"
+#include "omap.h"
 
 /*
  * The client is responsible for maintaining a connection to the server.
@@ -215,6 +216,39 @@ int scoutfs_client_srch_commit_compact(struct super_block *sb,
 				       res, sizeof(*res), NULL, 0);
 }
 
+int scoutfs_client_send_omap_response(struct super_block *sb, u64 id,
+				      struct scoutfs_open_ino_map *map)
+{
+	struct client_info *client = SCOUTFS_SB(sb)->client_info;
+
+	return scoutfs_net_response(sb, client->conn, SCOUTFS_NET_CMD_OPEN_INO_MAP,
+				    id, 0, map, sizeof(*map));
+}
+
+/* The client is receiving an omap request from the server */
+static int client_open_ino_map(struct super_block *sb, struct scoutfs_net_connection *conn,
+			       u8 cmd, u64 id, void *arg, u16 arg_len)
+{
+	if (arg_len != sizeof(struct scoutfs_open_ino_map_args))
+		return -EINVAL;
+
+	return scoutfs_omap_client_handle_request(sb, id, arg);
+}
+
+/* The client is sending an omap request to the server */
+int scoutfs_client_open_ino_map(struct super_block *sb, u64 group_nr,
+				struct scoutfs_open_ino_map *map)
+{
+	struct client_info *client = SCOUTFS_SB(sb)->client_info;
+	struct scoutfs_open_ino_map_args args = {
+		.group_nr = cpu_to_le64(group_nr),
+		.req_id = 0,
+	};
+
+	return scoutfs_net_sync_request(sb, client->conn, SCOUTFS_NET_CMD_OPEN_INO_MAP,
+					&args, sizeof(args), map, sizeof(*map));
+}
+
 /* The client is receiving an invalidation request from the server */
 static int client_lock(struct super_block *sb,
 		       struct scoutfs_net_connection *conn, u8 cmd, u64 id,
@@ -413,6 +447,7 @@ out:
 static scoutfs_net_request_t client_req_funcs[] = {
 	[SCOUTFS_NET_CMD_LOCK]		= client_lock,
 	[SCOUTFS_NET_CMD_LOCK_RECOVER]	= client_lock_recover,
+	[SCOUTFS_NET_CMD_OPEN_INO_MAP]	= client_open_ino_map,
 };
 
 /*
kmod/src/client.h:

@@ -22,6 +22,10 @@ int scoutfs_client_srch_get_compact(struct super_block *sb,
 				    struct scoutfs_srch_compact *sc);
 int scoutfs_client_srch_commit_compact(struct super_block *sb,
 				       struct scoutfs_srch_compact *res);
+int scoutfs_client_send_omap_response(struct super_block *sb, u64 id,
+				      struct scoutfs_open_ino_map *map);
+int scoutfs_client_open_ino_map(struct super_block *sb, u64 group_nr,
+				struct scoutfs_open_ino_map *map);
 
 int scoutfs_client_setup(struct super_block *sb);
 void scoutfs_client_destroy(struct super_block *sb);
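A sketch of the round trip these declarations enable, not part of the diff; count_open_in_group() is a hypothetical caller, while hweight64() and le64_to_cpu() are standard kernel helpers:

/*
 * Illustrative sketch: the initiating client blocks in the sync
 * request while the server gathers open bits from every mount.
 */
static int count_open_in_group(struct super_block *sb, u64 group_nr, u64 *count)
{
	struct scoutfs_open_ino_map map;
	int ret;
	int i;

	ret = scoutfs_client_open_ino_map(sb, group_nr, &map);
	if (ret)
		return ret;

	*count = 0;
	for (i = 0; i < SCOUTFS_OPEN_INO_MAP_LE64S; i++)
		*count += hweight64(le64_to_cpu(map.bits[i]));

	return 0;
}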
kmod/src/data.c:

@@ -1285,7 +1285,7 @@ int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
 
 	if (is_stage) {
 		ret = scoutfs_ext_next(sb, &data_ext_ops, &to_args,
-				       to_iblock, 1, &off_ext);
+				       to_start, 1, &off_ext);
 		if (ret)
 			break;
 
@@ -30,6 +30,7 @@
 #include "item.h"
 #include "lock.h"
 #include "hash.h"
+#include "omap.h"
 #include "counters.h"
 #include "scoutfs_trace.h"
 
kmod/src/file.c:

@@ -27,8 +27,14 @@
 #include "file.h"
 #include "inode.h"
 #include "per_task.h"
+#include "omap.h"
 
 /* TODO: Direct I/O, AIO */
+/*
+ * Start a high level file read. We check for offline extents in the
+ * read region here so that we only check the extents once. We use the
+ * dio count to prevent releasing while we're reading after we've
+ * checked the extents.
+ */
 ssize_t scoutfs_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 			      unsigned long nr_segs, loff_t pos)
 {
@@ -42,30 +48,32 @@ ssize_t scoutfs_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 	int ret;
 
 retry:
+	/* protect checked extents from release */
+	mutex_lock(&inode->i_mutex);
+	atomic_inc(&inode->i_dio_count);
+	mutex_unlock(&inode->i_mutex);
+
 	ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ,
 				 SCOUTFS_LKF_REFRESH_INODE, inode, &inode_lock);
 	if (ret)
 		goto out;
 
 	if (scoutfs_per_task_add_excl(&si->pt_data_lock, &pt_ent, inode_lock)) {
-		/* protect checked extents from stage/release */
-		mutex_lock(&inode->i_mutex);
-		atomic_inc(&inode->i_dio_count);
-		mutex_unlock(&inode->i_mutex);
-
 		ret = scoutfs_data_wait_check_iov(inode, iov, nr_segs, pos,
 						  SEF_OFFLINE,
 						  SCOUTFS_IOC_DWO_READ,
 						  &dw, inode_lock);
		if (ret != 0)
 			goto out;
 	} else {
 		WARN_ON_ONCE(true);
 	}
 
 	ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
 
 out:
-	if (scoutfs_per_task_del(&si->pt_data_lock, &pt_ent))
-		inode_dio_done(inode);
+	inode_dio_done(inode);
+	scoutfs_per_task_del(&si->pt_data_lock, &pt_ent);
 	scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_READ);
 
 	if (scoutfs_data_wait_found(&dw)) {
kmod/src/format.h:

@@ -195,9 +195,6 @@ struct scoutfs_key {
 #define sklt_rid	_sk_first
 #define sklt_nr		_sk_second
 
-/* lock clients */
-#define sklc_rid	_sk_first
-
 /* seqs */
 #define skts_trans_seq	_sk_first
 #define skts_rid	_sk_second
@@ -493,11 +490,10 @@ struct scoutfs_bloom_block {
 #define SCOUTFS_LOCK_ZONE		4
 /* Items only stored in server btrees */
 #define SCOUTFS_LOG_TREES_ZONE		6
-#define SCOUTFS_LOCK_CLIENTS_ZONE	7
-#define SCOUTFS_TRANS_SEQ_ZONE		8
-#define SCOUTFS_MOUNTED_CLIENT_ZONE	9
-#define SCOUTFS_SRCH_ZONE		10
-#define SCOUTFS_FREE_EXTENT_ZONE	11
+#define SCOUTFS_TRANS_SEQ_ZONE		7
+#define SCOUTFS_MOUNTED_CLIENT_ZONE	8
+#define SCOUTFS_SRCH_ZONE		9
+#define SCOUTFS_FREE_EXTENT_ZONE	10
 
 /* inode index zone */
 #define SCOUTFS_INODE_INDEX_META_SEQ_TYPE 1
@@ -653,7 +649,6 @@ struct scoutfs_super_block {
 	struct scoutfs_alloc_list_head server_meta_freed[2];
 	struct scoutfs_btree_root fs_root;
 	struct scoutfs_btree_root logs_root;
-	struct scoutfs_btree_root lock_clients;
 	struct scoutfs_btree_root trans_seqs;
 	struct scoutfs_btree_root mounted_clients;
 	struct scoutfs_btree_root srch_root;
@@ -845,6 +840,7 @@ enum scoutfs_net_cmd {
 	SCOUTFS_NET_CMD_LOCK_RECOVER,
 	SCOUTFS_NET_CMD_SRCH_GET_COMPACT,
 	SCOUTFS_NET_CMD_SRCH_COMMIT_COMPACT,
+	SCOUTFS_NET_CMD_OPEN_INO_MAP,
 	SCOUTFS_NET_CMD_FAREWELL,
 	SCOUTFS_NET_CMD_UNKNOWN,
 };
@@ -965,4 +961,42 @@ enum scoutfs_corruption_sources {
 
 #define SC_NR_LONGS DIV_ROUND_UP(SC_NR_SOURCES, BITS_PER_LONG)
 
+#define SCOUTFS_OPEN_INO_MAP_SHIFT	10
+#define SCOUTFS_OPEN_INO_MAP_BITS	(1 << SCOUTFS_OPEN_INO_MAP_SHIFT)
+#define SCOUTFS_OPEN_INO_MAP_MASK	(SCOUTFS_OPEN_INO_MAP_BITS - 1)
+#define SCOUTFS_OPEN_INO_MAP_LE64S	(SCOUTFS_OPEN_INO_MAP_BITS / 64)
+
+/*
+ * The request and response conversation is as follows:
+ *
+ * client[init] -> server:
+ *	group_nr = G
+ *	req_id = 0 (I)
+ * server -> client[*]
+ *	group_nr = G
+ *	req_id = R
+ * client[*] -> server
+ *	group_nr = G (I)
+ *	req_id = R
+ *	bits
+ * server -> client[init]
+ *	group_nr = G (I)
+ *	req_id = R (I)
+ *	bits
+ *
+ * Many of the fields in individual messages are ignored ("I") because
+ * the net id or the omap req_id can be used to identify the
+ * conversation. We always include them on the wire to make inspected
+ * messages easier to follow.
+ */
+struct scoutfs_open_ino_map_args {
+	__le64 group_nr;
+	__le64 req_id;
+};
+
+struct scoutfs_open_ino_map {
+	struct scoutfs_open_ino_map_args args;
+	__le64 bits[SCOUTFS_OPEN_INO_MAP_LE64S];
+};
+
 #endif
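A sketch of the arithmetic these constants imply, not part of the diff; each map covers a group of 1024 inode numbers, and omap_group_nr()/omap_bit_nr()/omap_test() are hypothetical helper names:

static inline u64 omap_group_nr(u64 ino)
{
	/* which open_ino_map group an inode number falls in */
	return ino >> SCOUTFS_OPEN_INO_MAP_SHIFT;
}

static inline unsigned int omap_bit_nr(u64 ino)
{
	/* the inode's bit within its group's 1024-bit map */
	return ino & SCOUTFS_OPEN_INO_MAP_MASK;
}

static inline bool omap_test(struct scoutfs_open_ino_map *map, u64 ino)
{
	unsigned int nr = omap_bit_nr(ino);

	/* bits are packed little-endian, 64 per word */
	return (le64_to_cpu(map->bits[nr / 64]) >> (nr % 64)) & 1;
}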
kmod/src/inode.c (121 changed lines):
@@ -33,6 +33,7 @@
 #include "item.h"
 #include "client.h"
 #include "cmp.h"
+#include "omap.h"
 
 /*
  * XXX
@@ -82,6 +83,8 @@ static void scoutfs_inode_ctor(void *obj)
 	init_waitqueue_head(&si->data_waitq.waitq);
 	init_rwsem(&si->xattr_rwsem);
 	RB_CLEAR_NODE(&si->writeback_node);
+	scoutfs_lock_init_coverage(&si->ino_lock_cov);
+	atomic_set(&si->inv_iput_count, 0);
 
 	inode_init_once(&si->inode);
 }
@@ -141,12 +144,15 @@ static void remove_writeback_inode(struct inode_sb_info *inf,
 
 void scoutfs_destroy_inode(struct inode *inode)
 {
+	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
 	DECLARE_INODE_SB_INFO(inode->i_sb, inf);
 
 	spin_lock(&inf->writeback_lock);
 	remove_writeback_inode(inf, SCOUTFS_I(inode));
 	spin_unlock(&inf->writeback_lock);
 
+	scoutfs_lock_del_coverage(inode->i_sb, &si->ino_lock_cov);
+
 	call_rcu(&inode->i_rcu, scoutfs_i_callback);
 }
@@ -307,6 +313,8 @@ int scoutfs_inode_refresh(struct inode *inode, struct scoutfs_lock *lock,
 		if (ret == 0) {
 			load_inode(inode, &sinode);
 			atomic64_set(&si->last_refreshed, refresh_gen);
+			scoutfs_lock_add_coverage(sb, lock, &si->ino_lock_cov);
+			si->drop_invalidated = false;
 		}
 	} else {
 		ret = 0;
@@ -664,6 +672,28 @@ struct inode *scoutfs_ilookup(struct super_block *sb, u64 ino)
 	return ilookup5(sb, ino, scoutfs_iget_test, &ino);
 }
 
+static int iget_test_nofreeing(struct inode *inode, void *arg)
+{
+	return !(inode->i_state & I_FREEING) && scoutfs_iget_test(inode, arg);
+}
+
+/*
+ * There's a natural risk of a deadlock between lock invalidation and
+ * eviction. Invalidation blocks locks while looking up inodes and
+ * invalidating local caches. Inode eviction gets a lock to check final
+ * inode deletion while the inode is marked FREEING which blocks
+ * lookups.
+ *
+ * We have a lookup variant which doesn't return I_FREEING inodes
+ * instead of waiting on them. If an inode has made it to I_FREEING
+ * then it doesn't have any local caches that are reachable and the lock
+ * invalidation promise is kept.
+ */
+struct inode *scoutfs_ilookup_nofreeing(struct super_block *sb, u64 ino)
+{
+	return ilookup5(sb, ino, iget_test_nofreeing, &ino);
+}
+
 struct inode *scoutfs_iget(struct super_block *sb, u64 ino)
 {
 	struct scoutfs_lock *lock = NULL;
@@ -688,6 +718,8 @@ struct inode *scoutfs_iget(struct super_block *sb, u64 ino)
 		atomic64_set(&si->last_refreshed, 0);
 
 		ret = scoutfs_inode_refresh(inode, lock, 0);
+		if (ret == 0)
+			ret = scoutfs_omap_inc(sb, ino);
 		if (ret) {
 			iget_failed(inode);
 			inode = ERR_PTR(ret);
@@ -1384,6 +1416,8 @@ struct inode *scoutfs_new_inode(struct super_block *sb, struct inode *dir,
 	si->next_xattr_id = 0;
 	si->have_item = false;
 	atomic64_set(&si->last_refreshed, lock->refresh_gen);
+	scoutfs_lock_add_coverage(sb, lock, &si->ino_lock_cov);
+	si->drop_invalidated = false;
 	si->flags = 0;
 
 	scoutfs_inode_set_meta_seq(inode);
@@ -1399,10 +1433,17 @@ struct inode *scoutfs_new_inode(struct super_block *sb, struct inode *dir,
 	store_inode(&sinode, inode);
 	init_inode_key(&key, scoutfs_ino(inode));
 
+	ret = scoutfs_omap_inc(sb, ino);
+	if (ret < 0)
+		goto out;
+
 	ret = scoutfs_item_create(sb, &key, &sinode, sizeof(sinode), lock);
+	if (ret < 0)
+		scoutfs_omap_dec(sb, ino);
+out:
 	if (ret) {
 		iput(inode);
-		return ERR_PTR(ret);
+		inode = ERR_PTR(ret);
 	}
 
 	return inode;
@@ -1447,15 +1488,15 @@ int scoutfs_orphan_delete(struct super_block *sb, u64 ino)
 
 /*
  * Remove all the items associated with a given inode. This is only
- * called once nlink has dropped to zero so we don't have to worry about
- * dirents referencing the inode or link backrefs. Dropping nlink to 0
- * also created an orphan item. That orphan item will continue
- * triggering attempts to finish previous partial deletion until all
- * deletion is complete and the orphan item is removed.
+ * called once nlink has dropped to zero and nothing has the inode open
+ * so we don't have to worry about dirents referencing the inode or link
+ * backrefs. Dropping nlink to 0 also created an orphan item. That
+ * orphan item will continue triggering attempts to finish previous
+ * partial deletion until all deletion is complete and the orphan item
+ * is removed.
 */
-static int delete_inode_items(struct super_block *sb, u64 ino)
+static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_lock *lock)
 {
-	struct scoutfs_lock *lock = NULL;
 	struct scoutfs_inode sinode;
 	struct scoutfs_key key;
 	LIST_HEAD(ind_locks);
@@ -1465,10 +1506,6 @@ static int delete_inode_items(struct super_block *sb, u64 ino)
 	u64 size;
 	int ret;
 
-	ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_WRITE, 0, ino, &lock);
-	if (ret)
-		return ret;
-
 	init_inode_key(&key, ino);
 
 	ret = scoutfs_item_lookup_exact(sb, &key, &sinode, sizeof(sinode),
@@ -1533,18 +1570,24 @@ out:
 	if (release)
 		scoutfs_release_trans(sb);
 	scoutfs_inode_index_unlock(sb, &ind_locks);
-	scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
 
 	return ret;
 }
 
 /*
  * iput_final has already written out the dirty pages to the inode
 * before we get here. We're left with a clean inode that we have to
- * tear down. If there are no more links to the inode then we also
- * remove all its persistent structures.
+ * tear down. We use locking and open inode number bitmaps to decide if
+ * we should finally destroy an inode that is no longer open nor
+ * reachable through directory entries.
 */
 void scoutfs_evict_inode(struct inode *inode)
 {
+	struct super_block *sb = inode->i_sb;
+	const u64 ino = scoutfs_ino(inode);
+	struct scoutfs_lock *lock;
+	int ret;
+
 	trace_scoutfs_evict_inode(inode->i_sb, scoutfs_ino(inode),
 				  inode->i_nlink, is_bad_inode(inode));
@@ -1553,19 +1596,45 @@ void scoutfs_evict_inode(struct inode *inode)
 
 	truncate_inode_pages_final(&inode->i_data);
 
-	if (inode->i_nlink == 0)
-		delete_inode_items(inode->i_sb, scoutfs_ino(inode));
+	ret = scoutfs_omap_should_delete(sb, inode, &lock);
+	if (ret > 0) {
+		ret = delete_inode_items(inode->i_sb, scoutfs_ino(inode), lock);
+		scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
+	}
+	if (ret < 0)
+		scoutfs_err(sb, "error %d while checking to delete inode nr %llu, it might linger.",
+			    ret, ino);
+
+	scoutfs_omap_dec(sb, ino);
+
 clear:
 	clear_inode(inode);
 }
 
+/*
+ * We want to remove inodes from the cache as their count goes to 0 if
+ * they're no longer covered by a cluster lock or if while locked they
+ * were unlinked.
+ *
+ * We don't want unused cached inodes to linger outside of cluster
+ * locking so that they don't prevent final inode deletion on other
+ * nodes. We don't have specific per-inode or per-dentry locks which
+ * would otherwise remove the stale caches as they're invalidated.
+ * Stale cached inodes provide little value because they're going to be
+ * refreshed the next time they're locked. Populating the item cache
+ * and loading the inode item is a lot more expensive than initializing
+ * and inserting a newly allocated vfs inode.
+ */
 int scoutfs_drop_inode(struct inode *inode)
 {
-	int ret = generic_drop_inode(inode);
+	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
+	struct super_block *sb = inode->i_sb;
 
-	trace_scoutfs_drop_inode(inode->i_sb, scoutfs_ino(inode),
-				 inode->i_nlink, inode_unhashed(inode));
-	return ret;
+	trace_scoutfs_drop_inode(sb, scoutfs_ino(inode), inode->i_nlink, inode_unhashed(inode),
+				 si->drop_invalidated);
+
+	return si->drop_invalidated || !scoutfs_lock_is_covered(sb, &si->ino_lock_cov) ||
+	       generic_drop_inode(inode);
 }
 
 /*
@@ -1582,8 +1651,10 @@ int scoutfs_scan_orphans(struct super_block *sb)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct scoutfs_lock *lock = sbi->rid_lock;
+	struct scoutfs_lock *inode_lock = NULL;
 	struct scoutfs_key key;
 	struct scoutfs_key last;
+	u64 ino;
 	int err = 0;
 	int ret;
 
@@ -1599,7 +1670,13 @@ int scoutfs_scan_orphans(struct super_block *sb)
 		if (ret < 0)
 			goto out;
 
-		ret = delete_inode_items(sb, le64_to_cpu(key.sko_ino));
+		ino = le64_to_cpu(key.sko_ino);
+
+		ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_WRITE, 0, ino, &inode_lock);
+		if (ret == 0) {
+			ret = delete_inode_items(sb, le64_to_cpu(key.sko_ino), inode_lock);
+			scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE);
+		}
 		if (ret && ret != -ENOENT && !err)
 			err = ret;
 
kmod/src/inode.h:

@@ -51,6 +51,13 @@ struct scoutfs_inode_info {
 	struct rw_semaphore xattr_rwsem;
 	struct rb_node writeback_node;
 
+	struct scoutfs_lock_coverage ino_lock_cov;
+
+	/* drop if i_count hits 0, allows drop while invalidate holds coverage */
+	bool drop_invalidated;
+	struct llist_node inv_iput_llnode;
+	atomic_t inv_iput_count;
+
 	struct inode inode;
 };
@@ -72,6 +79,7 @@ int scoutfs_orphan_inode(struct inode *inode);
 
 struct inode *scoutfs_iget(struct super_block *sb, u64 ino);
 struct inode *scoutfs_ilookup(struct super_block *sb, u64 ino);
+struct inode *scoutfs_ilookup_nofreeing(struct super_block *sb, u64 ino);
 
 void scoutfs_inode_init_index_key(struct scoutfs_key *key, u8 type, u64 major,
 				  u32 minor, u64 ino);
kmod/src/lock.c (171 changed lines):
@@ -34,6 +34,7 @@
 #include "data.h"
 #include "xattr.h"
 #include "item.h"
+#include "omap.h"
 
 /*
  * scoutfs uses a lock service to manage item cache consistency between
@@ -74,6 +75,7 @@ struct lock_info {
 	struct super_block *sb;
 	spinlock_t lock;
 	bool shutdown;
+	bool unmounting;
 	struct rb_root lock_tree;
 	struct rb_root lock_range_tree;
 	struct shrinker shrinker;
@@ -87,6 +89,9 @@ struct lock_info {
 	struct work_struct shrink_work;
 	struct list_head shrink_list;
 	atomic64_t next_refresh_gen;
+	struct work_struct inv_iput_work;
+	struct llist_head inv_iput_llist;
+
 	struct dentry *tseq_dentry;
 	struct scoutfs_tseq_tree tseq_tree;
 };
@@ -122,21 +127,81 @@ static bool lock_modes_match(int granted, int requested)
 }
 
 /*
- * invalidate cached data associated with an inode whose lock is going
- * away.
+ * Final iput can get into evict and perform final inode deletion which
+ * can delete a lot of items under locks and transactions. We really
+ * don't want to be doing all that in an iput during invalidation. When
+ * invalidation sees that iput might perform final deletion it puts them
+ * on a list and queues this work.
+ *
+ * Nothing stops multiple puts for multiple invalidations of an inode
+ * before the work runs so we can track multiple puts in flight.
 */
+static void lock_inv_iput_worker(struct work_struct *work)
+{
+	struct lock_info *linfo = container_of(work, struct lock_info, inv_iput_work);
+	struct scoutfs_inode_info *si;
+	struct scoutfs_inode_info *tmp;
+	struct llist_node *inodes;
+	bool more;
+
+	inodes = llist_del_all(&linfo->inv_iput_llist);
+
+	llist_for_each_entry_safe(si, tmp, inodes, inv_iput_llnode) {
+		do {
+			more = atomic_dec_return(&si->inv_iput_count) > 0;
+			iput(&si->inode);
+		} while (more);
+	}
+}
+
+/*
+ * Invalidate cached data associated with an inode whose lock is going
+ * away. We ignore inodes with I_FREEING instead of waiting on them to
+ * avoid a deadlock, if they're freeing then they won't be visible to
+ * future lock users and we don't need to invalidate them.
+ *
+ * We try to drop cached dentries and inodes covered by the lock if they
+ * aren't referenced. This removes them from the mount's open map and
+ * allows deletions to be performed by unlink without having to wait for
+ * remote cached inodes to be dropped.
+ *
+ * If the cached inode was already deferring final inode deletion then
+ * we can't perform that inline in invalidation. The locking alone
+ * would deadlock, and it might also take multiple transactions to fully
+ * delete an inode with significant metadata. We only perform the iput
+ * inline if we know that possible eviction can't perform the final
+ * deletion, otherwise we kick it off to async work.
+ */
 static void invalidate_inode(struct super_block *sb, u64 ino)
 {
 	DECLARE_LOCK_INFO(sb, linfo);
+	struct scoutfs_inode_info *si;
 	struct inode *inode;
 
-	inode = scoutfs_ilookup(sb, ino);
+	inode = scoutfs_ilookup_nofreeing(sb, ino);
 	if (inode) {
+		si = SCOUTFS_I(inode);
+
 		scoutfs_inc_counter(sb, lock_invalidate_inode);
 		if (S_ISREG(inode->i_mode)) {
 			truncate_inode_pages(inode->i_mapping, 0);
 			scoutfs_data_wait_changed(inode);
 		}
-		iput(inode);
+
+		/* can't touch during unmount, dcache destroys w/o locks */
+		if (!linfo->unmounting)
+			d_prune_aliases(inode);
+
+		si->drop_invalidated = true;
+		if (scoutfs_lock_is_covered(sb, &si->ino_lock_cov) && inode->i_nlink > 0) {
+			iput(inode);
+		} else {
+			/* defer iput to work context so we don't evict inodes from invalidation */
+			if (atomic_inc_return(&si->inv_iput_count) == 1)
+				llist_add(&si->inv_iput_llnode, &linfo->inv_iput_llist);
+			smp_wmb(); /* count and list visible before work executes */
+			queue_work(linfo->workq, &linfo->inv_iput_work);
+		}
 	}
 }
@@ -172,6 +237,16 @@ static int lock_invalidate(struct super_block *sb, struct scoutfs_lock *lock,
 	/* have to invalidate if we're not in the only usable case */
 	if (!(prev == SCOUTFS_LOCK_WRITE && mode == SCOUTFS_LOCK_READ)) {
 retry:
+		/* invalidate inodes before removing coverage */
+		if (lock->start.sk_zone == SCOUTFS_FS_ZONE) {
+			ino = le64_to_cpu(lock->start.ski_ino);
+			last = le64_to_cpu(lock->end.ski_ino);
+			while (ino <= last) {
+				invalidate_inode(sb, ino);
+				ino++;
+			}
+		}
+
 		/* remove cov items to tell users that their cache is stale */
 		spin_lock(&lock->cov_list_lock);
 		list_for_each_entry_safe(cov, tmp, &lock->cov_list, head) {
@@ -187,15 +262,6 @@ retry:
 		}
 		spin_unlock(&lock->cov_list_lock);
 
-		if (lock->start.sk_zone == SCOUTFS_FS_ZONE) {
-			ino = le64_to_cpu(lock->start.ski_ino);
-			last = le64_to_cpu(lock->end.ski_ino);
-			while (ino <= last) {
-				invalidate_inode(sb, ino);
-				ino++;
-			}
-		}
-
 		scoutfs_item_invalidate(sb, &lock->start, &lock->end);
 	}
 
@@ -229,6 +295,7 @@ static void lock_free(struct lock_info *linfo, struct scoutfs_lock *lock)
 	BUG_ON(!list_empty(&lock->shrink_head));
 	BUG_ON(!list_empty(&lock->cov_list));
 
+	scoutfs_omap_free_lock_data(lock->omap_data);
 	kfree(lock);
 }
 
@@ -264,6 +331,7 @@ static struct scoutfs_lock *lock_alloc(struct super_block *sb,
 	lock->mode = SCOUTFS_LOCK_NULL;
 
 	atomic64_set(&lock->forest_bloom_nr, 0);
+	spin_lock_init(&lock->omap_spinlock);
 
 	trace_scoutfs_lock_alloc(sb, lock);
 
@@ -553,7 +621,7 @@ static void queue_grant_work(struct lock_info *linfo)
 {
 	assert_spin_locked(&linfo->lock);
 
-	if (!list_empty(&linfo->grant_list) && !linfo->shutdown)
+	if (!list_empty(&linfo->grant_list))
 		queue_work(linfo->workq, &linfo->grant_work);
 }
 
@@ -569,7 +637,7 @@ static void queue_inv_work(struct lock_info *linfo)
 {
 	assert_spin_locked(&linfo->lock);
 
-	if (!list_empty(&linfo->inv_list) && !linfo->shutdown)
+	if (!list_empty(&linfo->inv_list))
 		mod_delayed_work(linfo->workq, &linfo->inv_dwork, 0);
 }
 
@@ -802,8 +870,11 @@ static void lock_invalidate_worker(struct work_struct *work)
 	nl = &lock->inv_nl;
 	net_id = lock->inv_net_id;
 
-	ret = lock_invalidate(sb, lock, nl->old_mode, nl->new_mode);
-	BUG_ON(ret);
+	/* only lock protocol, inv can't call subsystems after shutdown */
+	if (!linfo->shutdown) {
+		ret = lock_invalidate(sb, lock, nl->old_mode, nl->new_mode);
+		BUG_ON(ret);
+	}
 
 	/* allow another request after we respond but before we finish */
 	lock->inv_net_id = 0;
@@ -1011,7 +1082,7 @@ static int lock_key_range(struct super_block *sb, enum scoutfs_lock_mode mode, i
 	lock_inc_count(lock->waiters, mode);
 
 	for (;;) {
-		if (linfo->shutdown) {
+		if (WARN_ON_ONCE(linfo->shutdown)) {
 			ret = -ESHUTDOWN;
 			break;
 		}
@@ -1493,7 +1564,7 @@ restart:
 		BUG_ON(lock->mode == SCOUTFS_LOCK_NULL);
 		BUG_ON(!list_empty(&lock->shrink_head));
 
-		if (linfo->shutdown || nr-- == 0)
+		if (nr-- == 0)
 			break;
 
 		__lock_del_lru(linfo, lock);
@@ -1520,7 +1591,7 @@ out:
 	return ret;
 }
 
-void scoutfs_free_unused_locks(struct super_block *sb, unsigned long nr)
+void scoutfs_free_unused_locks(struct super_block *sb)
 {
 	struct lock_info *linfo = SCOUTFS_SB(sb)->lock_info;
 	struct shrink_control sc = {
@@ -1548,15 +1619,40 @@ static void lock_tseq_show(struct seq_file *m, struct scoutfs_tseq_entry *ent)
 }
 
 /*
- * The caller is going to be calling _destroy soon and, critically, is
- * about to shutdown networking before calling us so that we don't get
- * any callbacks while we're destroying. We have to ensure that we
- * won't call networking after this returns.
+ * shrink_dcache_for_umount() tears down dentries with no locking. We
+ * need to make sure that our invalidation won't touch dentries before
+ * we return and the caller calls the generic vfs unmount path.
 */
+void scoutfs_lock_unmount_begin(struct super_block *sb)
+{
+	DECLARE_LOCK_INFO(sb, linfo);
+
+	if (linfo) {
+		linfo->unmounting = true;
+		flush_delayed_work(&linfo->inv_dwork);
+	}
+}
+
+/*
+ * The caller is going to be shutting down transactions and the client.
+ * We need to make sure that locking won't call either after we return.
+ *
+ * At this point all fs callers and internal services that use locks
+ * should have stopped. We won't have any callers initiating lock
+ * transitions and sending requests. We set the shutdown flag to catch
+ * anyone who breaks this rule.
+ *
+ * We unregister the shrinker so that we won't try and send null
+ * requests in response to memory pressure. The locks will all be
+ * unceremoniously dropped once we get a farewell response from the
+ * server which indicates that they destroyed our locking state.
+ *
+ * We will still respond to invalidation requests that have to be
+ * processed to let unmount in other mounts acquire locks and make
+ * progress. However, we don't fully process the invalidation because
+ * we're shutting down. We only update the lock state and send the
+ * response. We shouldn't have any users of locking that require
+ * invalidation correctness at this point.
+ */
 void scoutfs_lock_shutdown(struct super_block *sb)
 {
@@ -1569,19 +1665,18 @@ void scoutfs_lock_shutdown(struct super_block *sb)
 
 	trace_scoutfs_lock_shutdown(sb, linfo);
 
-	spin_lock(&linfo->lock);
+	/* stop the shrinker from queueing work */
+	unregister_shrinker(&linfo->shrinker);
+	flush_work(&linfo->shrink_work);
+
+	/* cause current and future lock calls to return errors */
+	spin_lock(&linfo->lock);
 	linfo->shutdown = true;
 	for (node = rb_first(&linfo->lock_tree); node; node = rb_next(node)) {
 		lock = rb_entry(node, struct scoutfs_lock, node);
 		wake_up(&lock->waitq);
 	}
-
 	spin_unlock(&linfo->lock);
 
 	flush_work(&linfo->grant_work);
 	flush_delayed_work(&linfo->inv_dwork);
-	flush_work(&linfo->shrink_work);
 }
 
 /*
@@ -1609,8 +1704,6 @@ void scoutfs_lock_destroy(struct super_block *sb)
 
 	trace_scoutfs_lock_destroy(sb, linfo);
 
-	/* stop the shrinker from queueing work */
-	unregister_shrinker(&linfo->shrinker);
-
 	/* make sure that no one's actively using locks */
 	spin_lock(&linfo->lock);
@@ -1656,8 +1749,10 @@ void scoutfs_lock_destroy(struct super_block *sb)
 		__lock_del_lru(linfo, lock);
 		if (!list_empty(&lock->grant_head))
 			list_del_init(&lock->grant_head);
-		if (!list_empty(&lock->inv_head))
+		if (!list_empty(&lock->inv_head)) {
 			list_del_init(&lock->inv_head);
+			lock->invalidate_pending = 0;
+		}
 		if (!list_empty(&lock->shrink_head))
 			list_del_init(&lock->shrink_head);
 		lock_remove(linfo, lock);
@@ -1694,6 +1789,8 @@ int scoutfs_lock_setup(struct super_block *sb)
 	INIT_WORK(&linfo->shrink_work, lock_shrink_worker);
 	INIT_LIST_HEAD(&linfo->shrink_list);
 	atomic64_set(&linfo->next_refresh_gen, 0);
+	INIT_WORK(&linfo->inv_iput_work, lock_inv_iput_worker);
+	init_llist_head(&linfo->inv_iput_llist);
 	scoutfs_tseq_tree_init(&linfo->tseq_tree, lock_tseq_show);
 
 	sbi->lock_info = linfo;
 
kmod/src/lock.h:

@@ -10,6 +10,8 @@
 
 #define SCOUTFS_LOCK_NR_MODES SCOUTFS_LOCK_INVALID
 
+struct scoutfs_omap_lock;
+
 /*
  * A few fields (start, end, refresh_gen, write_version, granted_mode)
 * are referenced by code outside lock.c.
@@ -47,6 +49,10 @@ struct scoutfs_lock {
 
 	/* the forest tracks which log tree last saw bloom bit updates */
 	atomic64_t forest_bloom_nr;
+
+	/* open ino mapping has a valid map for a held write lock */
+	spinlock_t omap_spinlock;
+	struct scoutfs_omap_lock_data *omap_data;
 };
 
 struct scoutfs_lock_coverage {
@@ -95,9 +101,10 @@ void scoutfs_lock_del_coverage(struct super_block *sb,
 bool scoutfs_lock_protected(struct scoutfs_lock *lock, struct scoutfs_key *key,
 			    enum scoutfs_lock_mode mode);
 
-void scoutfs_free_unused_locks(struct super_block *sb, unsigned long nr);
+void scoutfs_free_unused_locks(struct super_block *sb);
 
 int scoutfs_lock_setup(struct super_block *sb);
+void scoutfs_lock_unmount_begin(struct super_block *sb);
 void scoutfs_lock_shutdown(struct super_block *sb);
 void scoutfs_lock_destroy(struct super_block *sb);
 
kmod/src/lock_server.c:

@@ -20,10 +20,10 @@
 #include "tseq.h"
-#include "spbm.h"
 #include "block.h"
 #include "btree.h"
 #include "msg.h"
 #include "scoutfs_trace.h"
 #include "lock_server.h"
+#include "recov.h"
 
 /*
 * The scoutfs server implements a simple lock service. Client mounts
@@ -56,14 +56,11 @@
 * Message requests and responses are reliably delivered in order across
 * reconnection.
 *
- * The server maintains a persistent record of connected clients. A new
- * server instance discovers these and waits for previously connected
- * clients to reconnect and recover their state before proceeding. If
- * clients don't reconnect they are forcefully prevented from unsafely
- * accessing the shared persistent storage. (fenced, according to the
- * rules of the platform.. could range from being powered off to having
- * their switch port disabled to having their local block device set
- * read-only.)
+ * As a new server comes up it recovers lock state from existing clients
+ * which were connected to a previous lock server. Recover requests are
+ * sent to clients as they connect and they respond with all their
+ * locks. Once all clients and locks are accounted for normal
+ * processing can resume.
 *
 * The lock server doesn't respond to memory pressure. The only way
 * locks are freed is if they are invalidated to null on behalf of a
@@ -77,12 +74,8 @@ struct lock_server_info {
 	struct super_block *sb;
 
 	spinlock_t lock;
-	struct mutex mutex;
 	struct rb_root locks_root;
 
-	struct scoutfs_spbm recovery_pending;
-	struct delayed_work recovery_dwork;
-
 	struct scoutfs_tseq_tree tseq_tree;
 	struct dentry *tseq_dentry;
 
@@ -430,7 +423,7 @@ int scoutfs_lock_server_response(struct super_block *sb, u64 rid,
 		goto out;
 	}
 
-	/* XXX should always have a server lock here? recovery? */
+	/* XXX should always have a server lock here? */
 	snode = get_server_lock(inf, &nl->key, NULL, false);
 	if (!snode) {
 		ret = -EINVAL;
@@ -473,12 +466,9 @@ out:
 * so we unlock the snode mutex.
 *
 * All progress must wait for all clients to finish with recovery
- * because we don't know which locks they'll hold. The unlocked
- * recovery_pending test here is OK. It's filled by setup before
- * anything runs. It's emptied by recovery completion. We can get a
- * false nonempty result if we race with recovery completion, but that's
- * OK because recovery completion processes all the locks that have
- * requests after emptying, including the unlikely loser of that race.
+ * because we don't know which locks they'll hold. Once recover
+ * finishes the server calls us to kick all the locks that were waiting
+ * during recovery.
 */
 static int process_waiting_requests(struct super_block *sb,
 				    struct server_lock_node *snode)
@@ -496,7 +486,7 @@ static int process_waiting_requests(struct super_block *sb,
 
 	/* processing waits for all invalidation responses or recovery */
 	if (!list_empty(&snode->invalidated) ||
-	    !scoutfs_spbm_empty(&inf->recovery_pending)) {
+	    scoutfs_recov_next_pending(sb, SCOUTFS_RECOV_LOCKS) != 0) {
 		ret = 0;
 		goto out;
 	}
@@ -569,89 +559,39 @@ out:
 	return ret;
 }
 
-static void init_lock_clients_key(struct scoutfs_key *key, u64 rid)
-{
-	*key = (struct scoutfs_key) {
-		.sk_zone = SCOUTFS_LOCK_CLIENTS_ZONE,
-		.sklc_rid = cpu_to_le64(rid),
-	};
-}
-
 /*
  * The server received a greeting from a client for the first time. If
- * the client had already talked to the server then we must find an
- * existing record for it and should begin recovery. If it doesn't have
- * a record then its timed out and we can't allow it to reconnect. If
- * we're creating a new record for a client we can see EEXIST if the
- * greeting is resent to a new server after the record was committed but
- * before the response was received by the client.
+ * the client is in lock recovery then we send the initial lock request.
 *
 * This is running in concurrent client greeting processing contexts.
 */
-int scoutfs_lock_server_greeting(struct super_block *sb, u64 rid,
-				 bool should_exist)
+int scoutfs_lock_server_greeting(struct super_block *sb, u64 rid)
 {
 	DECLARE_LOCK_SERVER_INFO(sb, inf);
-	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
-	SCOUTFS_BTREE_ITEM_REF(iref);
 	struct scoutfs_key key;
 	int ret;
 
-	init_lock_clients_key(&key, rid);
-
-	mutex_lock(&inf->mutex);
-	if (should_exist) {
-		ret = scoutfs_btree_lookup(sb, &super->lock_clients, &key,
-					   &iref);
-		if (ret == 0)
-			scoutfs_btree_put_iref(&iref);
-	} else {
-		ret = scoutfs_btree_insert(sb, inf->alloc, inf->wri,
-					   &super->lock_clients,
-					   &key, NULL, 0);
-		if (ret == -EEXIST)
-			ret = 0;
-	}
-	mutex_unlock(&inf->mutex);
-
-	if (should_exist && ret == 0) {
+	if (scoutfs_recov_is_pending(sb, rid, SCOUTFS_RECOV_LOCKS)) {
 		scoutfs_key_set_zeros(&key);
 		ret = scoutfs_server_lock_recover_request(sb, rid, &key);
 		if (ret)
 			goto out;
+	} else {
+		ret = 0;
 	}
 
 out:
 	return ret;
 }
 
 /*
- * A client sent their last recovery response and can exit recovery. If
- * they were the last client in recovery then we can process all the
- * server locks that had requests.
+ * All clients have finished lock recovery, we can make forward progress
+ * on all the queued requests that were waiting on recovery.
 */
-static int finished_recovery(struct super_block *sb, u64 rid, bool cancel)
+int scoutfs_lock_server_finished_recovery(struct super_block *sb)
 {
 	DECLARE_LOCK_SERVER_INFO(sb, inf);
 	struct server_lock_node *snode;
 	struct scoutfs_key key;
-	bool still_pending;
 	int ret = 0;
 
-	spin_lock(&inf->lock);
-	scoutfs_spbm_clear(&inf->recovery_pending, rid);
-	still_pending = !scoutfs_spbm_empty(&inf->recovery_pending);
-	spin_unlock(&inf->lock);
-	if (still_pending)
-		return 0;
-
-	if (cancel)
-		cancel_delayed_work_sync(&inf->recovery_dwork);
-
 	scoutfs_key_set_zeros(&key);
 
 	scoutfs_info(sb, "all lock clients recovered");
 
 	while ((snode = get_server_lock(inf, &key, NULL, true))) {
 
 		key = snode->key;
@@ -695,16 +635,15 @@ int scoutfs_lock_server_recover_response(struct super_block *sb, u64 rid,
 	int i;
 
 	/* client must be in recovery */
-	spin_lock(&inf->lock);
-	if (!scoutfs_spbm_test(&inf->recovery_pending, rid))
+	if (!scoutfs_recov_is_pending(sb, rid, SCOUTFS_RECOV_LOCKS)) {
 		ret = -EINVAL;
-	spin_unlock(&inf->lock);
-	if (ret)
 		goto out;
+	}
 
 	/* client has sent us all their locks */
 	if (nlr->nr == 0) {
-		ret = finished_recovery(sb, rid, true);
+		scoutfs_server_recov_finish(sb, rid, SCOUTFS_RECOV_LOCKS);
+		ret = 0;
 		goto out;
 	}
 
@@ -755,101 +694,15 @@ out:
 	return ret;
 }
 
-static int get_rid_and_put_ref(struct scoutfs_btree_item_ref *iref, u64 *rid)
-{
-	int ret;
-
-	if (iref->val_len == 0) {
-		*rid = le64_to_cpu(iref->key->sklc_rid);
-		ret = 0;
-	} else {
-		ret = -EIO;
-	}
-	scoutfs_btree_put_iref(iref);
-	return ret;
-}
-
-/*
- * This work executes if enough time passes without all of the clients
- * finishing with recovery and canceling the work. We walk through the
- * client records and find any that still have their recovery pending.
- */
-static void scoutfs_lock_server_recovery_timeout(struct work_struct *work)
-{
-	struct lock_server_info *inf = container_of(work,
-						    struct lock_server_info,
-						    recovery_dwork.work);
-	struct super_block *sb = inf->sb;
-	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
-	SCOUTFS_BTREE_ITEM_REF(iref);
-	struct scoutfs_key key;
-	bool timed_out;
-	u64 rid;
-	int ret;
-
-	ret = scoutfs_server_hold_commit(sb);
-	if (ret)
-		goto out;
-
-	/* we enter recovery if there are any client records */
-	for (rid = 0; ; rid++) {
-		init_lock_clients_key(&key, rid);
-		ret = scoutfs_btree_next(sb, &super->lock_clients, &key, &iref);
-		if (ret == -ENOENT) {
-			ret = 0;
-			break;
-		}
-		if (ret == 0)
-			ret = get_rid_and_put_ref(&iref, &rid);
-		if (ret < 0)
-			break;
-
-		spin_lock(&inf->lock);
-		if (scoutfs_spbm_test(&inf->recovery_pending, rid)) {
-			scoutfs_spbm_clear(&inf->recovery_pending, rid);
-			timed_out = true;
-		} else {
-			timed_out = false;
-		}
-		spin_unlock(&inf->lock);
-
-		if (!timed_out)
-			continue;
-
-		scoutfs_err(sb, "client rid %016llx lock recovery timed out",
-			    rid);
-
-		init_lock_clients_key(&key, rid);
-		ret = scoutfs_btree_delete(sb, inf->alloc, inf->wri,
-					   &super->lock_clients, &key);
-		if (ret)
-			break;
-	}
-
-	ret = scoutfs_server_apply_commit(sb, ret);
-out:
-	/* force processing all pending lock requests */
-	if (ret == 0)
-		ret = finished_recovery(sb, 0, false);
-
-	if (ret < 0) {
-		scoutfs_err(sb, "lock server saw err %d while timing out clients, shutting down", ret);
-		scoutfs_server_abort(sb);
-	}
-}
-
 /*
  * A client is leaving the lock service. They aren't using locks and
  * won't send any more requests. We tear down all the state we had for
  * them. This can be called multiple times for a given client as their
  * farewell is resent to new servers. It's OK to not find any state.
- * If we fail to delete a persistent entry then we have to shut down and
- * hope that the next server has more luck.
 */
 int scoutfs_lock_server_farewell(struct super_block *sb, u64 rid)
 {
 	DECLARE_LOCK_SERVER_INFO(sb, inf);
-	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
 	struct client_lock_entry *clent;
 	struct client_lock_entry *tmp;
 	struct server_lock_node *snode;
@@ -858,20 +711,7 @@ int scoutfs_lock_server_farewell(struct super_block *sb, u64 rid)
 	bool freed;
 	int ret = 0;
 
-	mutex_lock(&inf->mutex);
-	init_lock_clients_key(&key, rid);
-	ret = scoutfs_btree_delete(sb, inf->alloc, inf->wri,
-				   &super->lock_clients, &key);
-	mutex_unlock(&inf->mutex);
-	if (ret == -ENOENT) {
-		ret = 0;
-		goto out;
-	}
-	if (ret < 0)
-		goto out;
-
 	scoutfs_key_set_zeros(&key);
 
 	while ((snode = get_server_lock(inf, &key, NULL, true))) {
 
 		freed = false;
@@ -956,23 +796,14 @@ static void lock_server_tseq_show(struct seq_file *m,
 
 /*
  * Setup the lock server. This is called before networking can deliver
- * requests. If we find existing client records then we enter recovery.
- * Lock request processing is deferred until recovery is resolved for
- * all the existing clients, either they reconnect and replay locks or
- * we time them out.
+ * requests.
 */
 int scoutfs_lock_server_setup(struct super_block *sb,
 			      struct scoutfs_alloc *alloc,
 			      struct scoutfs_block_writer *wri, u64 max_vers)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
-	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
 	struct lock_server_info *inf;
-	SCOUTFS_BTREE_ITEM_REF(iref);
-	struct scoutfs_key key;
-	unsigned int nr;
-	u64 rid;
 	int ret;
 
 	inf = kzalloc(sizeof(struct lock_server_info), GFP_KERNEL);
 	if (!inf)
@@ -980,11 +811,7 @@ int scoutfs_lock_server_setup(struct super_block *sb,
 
 	inf->sb = sb;
 	spin_lock_init(&inf->lock);
-	mutex_init(&inf->mutex);
 	inf->locks_root = RB_ROOT;
-	scoutfs_spbm_init(&inf->recovery_pending);
-	INIT_DELAYED_WORK(&inf->recovery_dwork,
-			  scoutfs_lock_server_recovery_timeout);
 	scoutfs_tseq_tree_init(&inf->tseq_tree, lock_server_tseq_show);
 	inf->alloc = alloc;
 	inf->wri = wri;
@@ -999,36 +826,7 @@ int scoutfs_lock_server_setup(struct super_block *sb,
 
 	sbi->lock_server_info = inf;
 
-	/* we enter recovery if there are any client records */
-	nr = 0;
-	for (rid = 0; ; rid++) {
-		init_lock_clients_key(&key, rid);
-		ret = scoutfs_btree_next(sb, &super->lock_clients, &key, &iref);
-		if (ret == -ENOENT)
-			break;
-		if (ret == 0)
-			ret = get_rid_and_put_ref(&iref, &rid);
-		if (ret < 0)
-			goto out;
-
-		ret = scoutfs_spbm_set(&inf->recovery_pending, rid);
-		if (ret)
-			goto out;
-		nr++;
-
-		if (rid == U64_MAX)
-			break;
-	}
-	ret = 0;
-
-	if (nr) {
-		schedule_delayed_work(&inf->recovery_dwork,
-				      msecs_to_jiffies(LOCK_SERVER_RECOVERY_MS));
-		scoutfs_info(sb, "waiting for %u lock clients to recover", nr);
-	}
-
-out:
-	return ret;
+	return 0;
 }
 
 /*
@@ -1046,8 +844,6 @@ void scoutfs_lock_server_destroy(struct super_block *sb)
 	LIST_HEAD(list);
 
 	if (inf) {
-		cancel_delayed_work_sync(&inf->recovery_dwork);
-
 		debugfs_remove(inf->tseq_dentry);
 
 		rbtree_postorder_for_each_entry_safe(snode, stmp,
@@ -1066,8 +862,6 @@ void scoutfs_lock_server_destroy(struct super_block *sb)
 			kfree(snode);
 		}
 
-		scoutfs_spbm_destroy(&inf->recovery_pending);
-
 		kfree(inf);
 		sbi->lock_server_info = NULL;
 	}
kmod/src/lock_server.h:

@@ -3,10 +3,10 @@
 
 int scoutfs_lock_server_recover_response(struct super_block *sb, u64 rid,
 					 struct scoutfs_net_lock_recover *nlr);
+int scoutfs_lock_server_finished_recovery(struct super_block *sb);
 int scoutfs_lock_server_request(struct super_block *sb, u64 rid,
 				u64 net_id, struct scoutfs_net_lock *nl);
-int scoutfs_lock_server_greeting(struct super_block *sb, u64 rid,
-				 bool should_exist);
+int scoutfs_lock_server_greeting(struct super_block *sb, u64 rid);
 int scoutfs_lock_server_response(struct super_block *sb, u64 rid,
 				 struct scoutfs_net_lock *nl);
 int scoutfs_lock_server_farewell(struct super_block *sb, u64 rid);
 
kmod/src/omap.c (new file, 1046 lines): diff suppressed because it is too large.
kmod/src/omap.h (new file, 24 lines):

@@ -0,0 +1,24 @@
+#ifndef _SCOUTFS_OMAP_H_
+#define _SCOUTFS_OMAP_H_
+
+int scoutfs_omap_inc(struct super_block *sb, u64 ino);
+void scoutfs_omap_dec(struct super_block *sb, u64 ino);
+int scoutfs_omap_should_delete(struct super_block *sb, struct inode *inode,
+			       struct scoutfs_lock **lock_ret);
+void scoutfs_omap_free_lock_data(struct scoutfs_omap_lock_data *ldata);
+int scoutfs_omap_client_handle_request(struct super_block *sb, u64 id,
+				       struct scoutfs_open_ino_map_args *args);
+
+int scoutfs_omap_add_rid(struct super_block *sb, u64 rid);
+int scoutfs_omap_remove_rid(struct super_block *sb, u64 rid);
+int scoutfs_omap_finished_recovery(struct super_block *sb);
+int scoutfs_omap_server_handle_request(struct super_block *sb, u64 rid, u64 id,
+				       struct scoutfs_open_ino_map_args *args);
+int scoutfs_omap_server_handle_response(struct super_block *sb, u64 rid,
+					struct scoutfs_open_ino_map *resp_map);
+void scoutfs_omap_server_shutdown(struct super_block *sb);
+
+int scoutfs_omap_setup(struct super_block *sb);
+void scoutfs_omap_destroy(struct super_block *sb);
+
+#endif
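A sketch of how the inode.c hunks above drive this interface over an inode's life; it is not part of the diff, omap_lifecycle_sketch() is a hypothetical consolidation, and error handling is elided:

/* sketch: every cached inode holds a count in its group's open map */
static void omap_lifecycle_sketch(struct super_block *sb, struct inode *inode, u64 ino)
{
	struct scoutfs_lock *lock;

	/* iget/new_inode mark the inode open on this mount */
	scoutfs_omap_inc(sb, ino);

	/* ... the inode is used, then evicted ... */

	/* >0: no links and no mount has it open; we're handed the write lock */
	if (scoutfs_omap_should_delete(sb, inode, &lock) > 0) {
		/* delete_inode_items(sb, ino, lock), per the inode.c hunk */
		scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
	}

	/* this mount no longer holds the inode open */
	scoutfs_omap_dec(sb, ino);
}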
280
kmod/src/recov.c
Normal file
280
kmod/src/recov.c
Normal file
@@ -0,0 +1,280 @@
|
||||
/*
|
||||
* Copyright (C) 2021 Versity Software, Inc. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License v2 as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*/
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/rhashtable.h>
|
||||
#include <linux/rcupdate.h>
|
||||
|
||||
#include "super.h"
|
||||
#include "recov.h"
|
||||
|
||||
/*
|
||||
* There are a few server messages which can't be processed until they
|
||||
* know that they have state for all possibly active clients. These
|
||||
* little helpers track which clients have recovered what state and give
|
||||
* those message handlers a call to check if recovery has completed. We
|
||||
* track the timeout here, but all we do is call back into the server to
|
||||
* take steps to evict timed out clients and then let us know that their
|
||||
* recovery has finished.
|
||||
*/
|
||||
|
||||
struct recov_info {
|
||||
struct super_block *sb;
|
||||
spinlock_t lock;
|
||||
struct list_head pending;
|
||||
struct timer_list timer;
|
||||
void (*timeout_fn)(struct super_block *);
|
||||
};
|
||||
|
||||
#define DECLARE_RECOV_INFO(sb, name) \
|
||||
struct recov_info *name = SCOUTFS_SB(sb)->recov_info
|
||||
|
||||
struct recov_pending {
|
||||
struct list_head head;
|
	u64 rid;
	int which;
};

static struct recov_pending *find_pending(struct recov_info *recinf, u64 rid, int which)
{
	struct recov_pending *pend;

	list_for_each_entry(pend, &recinf->pending, head) {
		if ((rid == 0 || pend->rid == rid) && (pend->which & which))
			return pend;
	}

	return NULL;
}

/*
 * Record that we'll be waiting for a client to recover something.
 * _finish will eventually be called for every _prepare, either because
 * recovery naturally finished or because it timed out and the server
 * evicted the client.
 */
int scoutfs_recov_prepare(struct super_block *sb, u64 rid, int which)
{
	DECLARE_RECOV_INFO(sb, recinf);
	struct recov_pending *alloc;
	struct recov_pending *pend;

	if (WARN_ON_ONCE(which & SCOUTFS_RECOV_INVALID))
		return -EINVAL;

	alloc = kmalloc(sizeof(*pend), GFP_NOFS);
	if (!alloc)
		return -ENOMEM;

	spin_lock(&recinf->lock);

	pend = find_pending(recinf, rid, SCOUTFS_RECOV_ALL);
	if (pend) {
		pend->which |= which;
	} else {
		swap(pend, alloc);
		pend->rid = rid;
		pend->which = which;
		list_add(&pend->head, &recinf->pending);
	}

	spin_unlock(&recinf->lock);

	kfree(alloc);
	return 0;
}

/*
 * Recovery is only finished once we've begun (which sets the timer) and
 * all clients have finished.  If we didn't test the timer we could
 * claim that recovery finished prematurely while clients are still
 * being prepared.
 */
static int recov_finished(struct recov_info *recinf)
{
	return !!(recinf->timeout_fn != NULL && list_empty(&recinf->pending));
}

static void timer_callback(struct timer_list *timer)
{
	struct recov_info *recinf = from_timer(recinf, timer, timer);

	recinf->timeout_fn(recinf->sb);
}

/*
 * Begin waiting for recovery once we've prepared all the clients.  If
 * the timeout period elapses before _finish is called on all prepared
 * clients then the timer will call the callback.
 *
 * Returns > 0 if all the prepared clients finished recovery before
 * begin was called.
 */
int scoutfs_recov_begin(struct super_block *sb, void (*timeout_fn)(struct super_block *),
			unsigned int timeout_ms)
{
	DECLARE_RECOV_INFO(sb, recinf);
	int ret;

	spin_lock(&recinf->lock);

	recinf->timeout_fn = timeout_fn;
	recinf->timer.expires = jiffies + msecs_to_jiffies(timeout_ms);
	add_timer(&recinf->timer);

	ret = recov_finished(recinf);

	spin_unlock(&recinf->lock);

	if (ret > 0)
		del_timer_sync(&recinf->timer);

	return ret;
}

/*
 * A given client has recovered the given state.  If it's finished all
 * recovery then we free it, and if all clients have finished recovery
 * then we cancel the timeout timer.
 *
 * Returns > 0 if _begin has been called and all clients have finished.
 * The caller will only see > 0 returned once.
 */
int scoutfs_recov_finish(struct super_block *sb, u64 rid, int which)
{
	DECLARE_RECOV_INFO(sb, recinf);
	struct recov_pending *pend;
	int ret = 0;

	spin_lock(&recinf->lock);

	pend = find_pending(recinf, rid, which);
	if (pend) {
		pend->which &= ~which;
		if (pend->which) {
			pend = NULL;
		} else {
			list_del(&pend->head);
			ret = recov_finished(recinf);
		}
	}

	spin_unlock(&recinf->lock);

	if (ret > 0)
		del_timer_sync(&recinf->timer);

	kfree(pend);

	return ret;
}

/*
 * Returns true if the given client is still trying to recover the
 * given state.
 */
bool scoutfs_recov_is_pending(struct super_block *sb, u64 rid, int which)
{
	DECLARE_RECOV_INFO(sb, recinf);
	bool is_pending;

	spin_lock(&recinf->lock);
	is_pending = find_pending(recinf, rid, which) != NULL;
	spin_unlock(&recinf->lock);

	return is_pending;
}

/*
 * Returns 0 if there are no rids waiting for the given state to be
 * recovered.  Returns the rid of a client still waiting if there are
 * any, in no specified order.
 *
 * This is inherently racy.  Callers are responsible for reconciling
 * any actions they take on a pending rid with that recovery finishing,
 * perhaps before we even return.
 */
u64 scoutfs_recov_next_pending(struct super_block *sb, int which)
{
	DECLARE_RECOV_INFO(sb, recinf);
	struct recov_pending *pend;
	u64 rid;

	spin_lock(&recinf->lock);
	pend = find_pending(recinf, 0, which);
	rid = pend ? pend->rid : 0;
	spin_unlock(&recinf->lock);

	return rid;
}

/*
 * The server is shutting down and doesn't need to worry about recovery
 * anymore.  It'll be built up again by the next server, if needed.
 */
void scoutfs_recov_shutdown(struct super_block *sb)
{
	DECLARE_RECOV_INFO(sb, recinf);
	struct recov_pending *pend;
	struct recov_pending *tmp;
	LIST_HEAD(list);

	del_timer_sync(&recinf->timer);

	spin_lock(&recinf->lock);
	list_splice_init(&recinf->pending, &list);
	recinf->timeout_fn = NULL;
	spin_unlock(&recinf->lock);

	/* free the entries we spliced off the pending list */
	list_for_each_entry_safe(pend, tmp, &list, head) {
		list_del(&pend->head);
		kfree(pend);
	}
}

int scoutfs_recov_setup(struct super_block *sb)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct recov_info *recinf;
	int ret;

	recinf = kzalloc(sizeof(struct recov_info), GFP_KERNEL);
	if (!recinf) {
		ret = -ENOMEM;
		goto out;
	}

	recinf->sb = sb;
	spin_lock_init(&recinf->lock);
	INIT_LIST_HEAD(&recinf->pending);
	timer_setup(&recinf->timer, timer_callback, 0);

	sbi->recov_info = recinf;
	ret = 0;
out:
	return ret;
}

void scoutfs_recov_destroy(struct super_block *sb)
{
	DECLARE_RECOV_INFO(sb, recinf);
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);

	if (recinf) {
		scoutfs_recov_shutdown(sb);

		kfree(recinf);
		sbi->recov_info = NULL;
	}
}
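
For orientation, the intended calling pattern of this API is roughly the sketch below, assuming a server-side caller. The rid value is made up, error handling is omitted, and recovery_timeout()/finished_recovery() stand in for the server callbacks added later in this series:

	/* hypothetical caller; a sketch, not part of the patch */
	static void example_recovery_lifecycle(struct super_block *sb)
	{
		u64 rid = 0xabc;	/* made-up client rid */

		/* record every client we expect to hear from, before networking starts */
		scoutfs_recov_prepare(sb, rid, SCOUTFS_RECOV_ALL);

		/* arm the timeout; > 0 means everything has already finished */
		if (scoutfs_recov_begin(sb, recovery_timeout, 30 * MSEC_PER_SEC) > 0)
			finished_recovery(sb);

		/* as each piece of state recovers; > 0 exactly once, when the
		 * last piece for the last client finishes */
		if (scoutfs_recov_finish(sb, rid, SCOUTFS_RECOV_GREETING) > 0)
			finished_recovery(sb);
	}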
23	kmod/src/recov.h	Normal file
@@ -0,0 +1,23 @@
#ifndef _SCOUTFS_RECOV_H_
#define _SCOUTFS_RECOV_H_

enum {
	SCOUTFS_RECOV_GREETING	= (1 << 0),
	SCOUTFS_RECOV_LOCKS	= (1 << 1),

	SCOUTFS_RECOV_INVALID	= (~0 << 2),
	SCOUTFS_RECOV_ALL	= (~SCOUTFS_RECOV_INVALID),
};

int scoutfs_recov_prepare(struct super_block *sb, u64 rid, int which);
int scoutfs_recov_begin(struct super_block *sb, void (*timeout_fn)(struct super_block *),
			unsigned int timeout_ms);
int scoutfs_recov_finish(struct super_block *sb, u64 rid, int which);
bool scoutfs_recov_is_pending(struct super_block *sb, u64 rid, int which);
u64 scoutfs_recov_next_pending(struct super_block *sb, int which);
void scoutfs_recov_shutdown(struct super_block *sb);

int scoutfs_recov_setup(struct super_block *sb);
void scoutfs_recov_destroy(struct super_block *sb);

#endif
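
With just the two states defined above, the derived masks work out as follows; a worked example, assuming 32-bit ints:

	/*
	 * SCOUTFS_RECOV_GREETING = 0x00000001
	 * SCOUTFS_RECOV_LOCKS    = 0x00000002
	 * SCOUTFS_RECOV_INVALID  = 0xfffffffc  (every bit above the defined states)
	 * SCOUTFS_RECOV_ALL      = 0x00000003
	 */

Because _INVALID and _ALL are derived from the shift count, adding a new state before _INVALID automatically widens _ALL and narrows _INVALID, which is what lets scoutfs_recov_prepare() WARN on undefined bits.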
@@ -690,15 +690,16 @@ TRACE_EVENT(scoutfs_evict_inode,

TRACE_EVENT(scoutfs_drop_inode,
	TP_PROTO(struct super_block *sb, __u64 ino, unsigned int nlink,
		 unsigned int unhashed),
		 unsigned int unhashed, bool drop_invalidated),

	TP_ARGS(sb, ino, nlink, unhashed),
	TP_ARGS(sb, ino, nlink, unhashed, drop_invalidated),

	TP_STRUCT__entry(
		SCSB_TRACE_FIELDS
		__field(__u64, ino)
		__field(unsigned int, nlink)
		__field(unsigned int, unhashed)
		__field(unsigned int, drop_invalidated)
	),

	TP_fast_assign(
@@ -706,10 +707,12 @@ TRACE_EVENT(scoutfs_drop_inode,
		__entry->ino = ino;
		__entry->nlink = nlink;
		__entry->unhashed = unhashed;
		__entry->drop_invalidated = !!drop_invalidated;
	),

	TP_printk(SCSBF" ino %llu nlink %u unhashed %d", SCSB_TRACE_ARGS,
		  __entry->ino, __entry->nlink, __entry->unhashed)
	TP_printk(SCSBF" ino %llu nlink %u unhashed %d drop_invalidated %u", SCSB_TRACE_ARGS,
		  __entry->ino, __entry->nlink, __entry->unhashed,
		  __entry->drop_invalidated)
);

TRACE_EVENT(scoutfs_inode_walk_writeback,
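
Callers of the widened tracepoint now pass the extra flag; a representative call site might look like the following (hypothetical, pieced together from the event arguments rather than quoted from this series):

	trace_scoutfs_drop_inode(sb, scoutfs_ino(inode), inode->i_nlink,
				 inode_unhashed(inode), drop_invalidated);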
@@ -2402,6 +2405,89 @@ TRACE_EVENT(scoutfs_item_invalidate_page,
		  sk_trace_args(pg_start), sk_trace_args(pg_end), __entry->pgi)
);

DECLARE_EVENT_CLASS(scoutfs_omap_group_class,
	TP_PROTO(struct super_block *sb, void *grp, u64 group_nr, unsigned int group_total,
		 int bit_nr, int bit_count),

	TP_ARGS(sb, grp, group_nr, group_total, bit_nr, bit_count),

	TP_STRUCT__entry(
		SCSB_TRACE_FIELDS
		__field(void *, grp)
		__field(__u64, group_nr)
		__field(unsigned int, group_total)
		__field(int, bit_nr)
		__field(int, bit_count)
	),

	TP_fast_assign(
		SCSB_TRACE_ASSIGN(sb);
		__entry->grp = grp;
		__entry->group_nr = group_nr;
		__entry->group_total = group_total;
		__entry->bit_nr = bit_nr;
		__entry->bit_count = bit_count;
	),

	TP_printk(SCSBF" grp %p group_nr %llu group_total %u bit_nr %d bit_count %d",
		  SCSB_TRACE_ARGS, __entry->grp, __entry->group_nr, __entry->group_total,
		  __entry->bit_nr, __entry->bit_count)
);

DEFINE_EVENT(scoutfs_omap_group_class, scoutfs_omap_group_alloc,
	TP_PROTO(struct super_block *sb, void *grp, u64 group_nr, unsigned int group_total,
		 int bit_nr, int bit_count),
	TP_ARGS(sb, grp, group_nr, group_total, bit_nr, bit_count)
);
DEFINE_EVENT(scoutfs_omap_group_class, scoutfs_omap_group_free,
	TP_PROTO(struct super_block *sb, void *grp, u64 group_nr, unsigned int group_total,
		 int bit_nr, int bit_count),
	TP_ARGS(sb, grp, group_nr, group_total, bit_nr, bit_count)
);
DEFINE_EVENT(scoutfs_omap_group_class, scoutfs_omap_group_inc,
	TP_PROTO(struct super_block *sb, void *grp, u64 group_nr, unsigned int group_total,
		 int bit_nr, int bit_count),
	TP_ARGS(sb, grp, group_nr, group_total, bit_nr, bit_count)
);
DEFINE_EVENT(scoutfs_omap_group_class, scoutfs_omap_group_dec,
	TP_PROTO(struct super_block *sb, void *grp, u64 group_nr, unsigned int group_total,
		 int bit_nr, int bit_count),
	TP_ARGS(sb, grp, group_nr, group_total, bit_nr, bit_count)
);
DEFINE_EVENT(scoutfs_omap_group_class, scoutfs_omap_group_request,
	TP_PROTO(struct super_block *sb, void *grp, u64 group_nr, unsigned int group_total,
		 int bit_nr, int bit_count),
	TP_ARGS(sb, grp, group_nr, group_total, bit_nr, bit_count)
);
DEFINE_EVENT(scoutfs_omap_group_class, scoutfs_omap_group_destroy,
	TP_PROTO(struct super_block *sb, void *grp, u64 group_nr, unsigned int group_total,
		 int bit_nr, int bit_count),
	TP_ARGS(sb, grp, group_nr, group_total, bit_nr, bit_count)
);

TRACE_EVENT(scoutfs_omap_should_delete,
	TP_PROTO(struct super_block *sb, u64 ino, unsigned int nlink, int ret),

	TP_ARGS(sb, ino, nlink, ret),

	TP_STRUCT__entry(
		SCSB_TRACE_FIELDS
		__field(__u64, ino)
		__field(unsigned int, nlink)
		__field(int, ret)
	),

	TP_fast_assign(
		SCSB_TRACE_ASSIGN(sb);
		__entry->ino = ino;
		__entry->nlink = nlink;
		__entry->ret = ret;
	),

	TP_printk(SCSBF" ino %llu nlink %u ret %d",
		  SCSB_TRACE_ARGS, __entry->ino, __entry->nlink, __entry->ret)
);

#endif /* _TRACE_SCOUTFS_H */

/* This part must be outside protection */

@@ -38,6 +38,8 @@
#include "srch.h"
#include "alloc.h"
#include "forest.h"
#include "recov.h"
#include "omap.h"

/*
 * Every active mount can act as the server that listens on a net
@@ -96,6 +98,9 @@ struct server_info {
	/* stable versions stored from commits, given in locks and rpcs */
	seqcount_t roots_seqcount;
	struct scoutfs_net_roots roots;

	/* recovery timeout fencing is done from work */
	struct work_struct fence_pending_recov_work;
};

#define DECLARE_SERVER_INFO(sb, name) \
@@ -1016,6 +1021,60 @@ out:
	return scoutfs_net_response(sb, conn, cmd, id, ret, NULL, 0);
}

/* The server is receiving an omap response from the client */
static int open_ino_map_response(struct super_block *sb, struct scoutfs_net_connection *conn,
				 void *resp, unsigned int resp_len, int error, void *data)
{
	u64 rid = scoutfs_net_client_rid(conn);

	if (resp_len != sizeof(struct scoutfs_open_ino_map))
		return -EINVAL;

	return scoutfs_omap_server_handle_response(sb, rid, resp);
}

/* The server is sending an omap request to the client */
int scoutfs_server_send_omap_request(struct super_block *sb, u64 rid,
				     struct scoutfs_open_ino_map_args *args)
{
	struct server_info *server = SCOUTFS_SB(sb)->server_info;

	return scoutfs_net_submit_request_node(sb, server->conn, rid, SCOUTFS_NET_CMD_OPEN_INO_MAP,
					       args, sizeof(*args),
					       open_ino_map_response, NULL, NULL);
}

/* The server is sending an omap response to the client */
int scoutfs_server_send_omap_response(struct super_block *sb, u64 rid, u64 id,
				      struct scoutfs_open_ino_map *map, int err)
{
	struct server_info *server = SCOUTFS_SB(sb)->server_info;

	return scoutfs_net_response_node(sb, server->conn, rid,
					 SCOUTFS_NET_CMD_OPEN_INO_MAP, id, err,
					 map, sizeof(*map));
}

/* The server is receiving an omap request from the client */
static int server_open_ino_map(struct super_block *sb, struct scoutfs_net_connection *conn,
			       u8 cmd, u64 id, void *arg, u16 arg_len)
{
	u64 rid = scoutfs_net_client_rid(conn);
	int ret;

	if (arg_len != sizeof(struct scoutfs_open_ino_map_args)) {
		ret = -EINVAL;
		goto out;
	}

	ret = scoutfs_omap_server_handle_request(sb, rid, id, arg);
out:
	if (ret < 0)
		return scoutfs_net_response(sb, conn, cmd, id, ret, NULL, 0);

	return 0;
}

static void init_mounted_client_key(struct scoutfs_key *key, u64 rid)
{
	*key = (struct scoutfs_key) {
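
Paired with the client-side handlers added earlier in this series, the open inode map round trip looks roughly like the schematic below; the client side is summarized rather than quoted:

	/*
	 * server                                    client
	 * scoutfs_server_send_omap_request()  -->   builds the open ino map
	 * open_ino_map_response()             <--   sends the filled map back
	 *   scoutfs_omap_server_handle_response()
	 */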
@@ -1198,8 +1257,13 @@ static int server_greeting(struct super_block *sb,

		ret = scoutfs_server_apply_commit(sb, ret);
		queue_work(server->wq, &server->farewell_work);
		if (ret < 0)
			goto send_err;
	}

	scoutfs_server_recov_finish(sb, le64_to_cpu(gr->rid), SCOUTFS_RECOV_GREETING);
	ret = 0;

send_err:
	err = ret;

@@ -1228,17 +1292,10 @@ send_err:
	scoutfs_net_server_greeting(sb, conn, le64_to_cpu(gr->rid), id,
				    reconnecting, first_contact, farewell);

	/* lock server might send recovery request */
	/* let layers know we have a client connecting for the first time */
	if (le64_to_cpu(gr->server_term) != server->term) {

		/* we're now doing two commits per greeting, not great */
		ret = scoutfs_server_hold_commit(sb);
		if (ret)
			goto out;

		ret = scoutfs_lock_server_greeting(sb, le64_to_cpu(gr->rid),
						   gr->server_term != 0);
		ret = scoutfs_server_apply_commit(sb, ret);
		ret = scoutfs_lock_server_greeting(sb, le64_to_cpu(gr->rid)) ?:
		      scoutfs_omap_add_rid(sb, le64_to_cpu(gr->rid));
		if (ret)
			goto out;
	}
@@ -1259,6 +1316,25 @@ static bool invalid_mounted_client_item(struct scoutfs_btree_item_ref *iref)
		sizeof(struct scoutfs_mounted_client_btree_val));
}

static int reclaim_rid(struct super_block *sb, u64 rid)
{
	int ret;

	ret = scoutfs_server_hold_commit(sb);
	if (ret < 0)
		return ret;

	/* delete mounted client last, client reconnect looks for it */
	ret = scoutfs_lock_server_farewell(sb, rid) ?:
	      remove_trans_seq(sb, rid) ?:
	      reclaim_log_trees(sb, rid) ?:
	      cancel_srch_compact(sb, rid) ?:
	      delete_mounted_client(sb, rid) ?:
	      scoutfs_omap_remove_rid(sb, rid);

	return scoutfs_server_apply_commit(sb, ret);
}

/*
 * This work processes farewell requests asynchronously.  Requests from
 * quorum members can be held until only the final majority remains and
@@ -1386,18 +1462,7 @@ static void farewell_worker(struct work_struct *work)

	/* process and send farewell responses */
	list_for_each_entry_safe(fw, tmp, &send, entry) {
		ret = scoutfs_server_hold_commit(sb);
		if (ret)
			goto out;

		/* delete mounted client last, client reconnect looks for it */
		ret = scoutfs_lock_server_farewell(sb, fw->rid) ?:
		      remove_trans_seq(sb, fw->rid) ?:
		      reclaim_log_trees(sb, fw->rid) ?:
		      cancel_srch_compact(sb, fw->rid) ?:
		      delete_mounted_client(sb, fw->rid);

		ret = scoutfs_server_apply_commit(sb, ret);
		ret = reclaim_rid(sb, fw->rid);
		if (ret)
			goto out;
	}
@@ -1499,6 +1564,7 @@ static scoutfs_net_request_t server_req_funcs[] = {
	[SCOUTFS_NET_CMD_LOCK]			= server_lock,
	[SCOUTFS_NET_CMD_SRCH_GET_COMPACT]	= server_srch_get_compact,
	[SCOUTFS_NET_CMD_SRCH_COMMIT_COMPACT]	= server_srch_commit_compact,
	[SCOUTFS_NET_CMD_OPEN_INO_MAP]		= server_open_ino_map,
	[SCOUTFS_NET_CMD_FAREWELL]		= server_farewell,
};

@@ -1540,6 +1606,143 @@ static void server_notify_down(struct super_block *sb,
	}
}

/*
 * All clients have recovered all state.  Now we can kick all the work
 * that was waiting on recovery.
 *
 * It's a bit of a false dependency to have all work wait for every
 * client to finish before any work can make progress, but recovery is
 * naturally concerned with in-memory state.  It should all be quick to
 * recover once a client arrives.
 */
static void finished_recovery(struct super_block *sb)
{
	DECLARE_SERVER_INFO(sb, server);
	int ret = 0;

	scoutfs_info(sb, "all clients recovered");

	ret = scoutfs_omap_finished_recovery(sb) ?:
	      scoutfs_lock_server_finished_recovery(sb);
	if (ret < 0) {
		scoutfs_err(sb, "error %d resuming after recovery finished, shutting down", ret);
		stop_server(server);
	}
}

void scoutfs_server_recov_finish(struct super_block *sb, u64 rid, int which)
{
	if (scoutfs_recov_finish(sb, rid, which) > 0)
		finished_recovery(sb);
}

/*
 * If the recovery timeout is too short we'll prematurely evict mounts
 * that would have recovered.  They need time to have their sockets
 * time out, reconnect to the current server, and fully recover their
 * state.
 *
 * If it's too long we'll needlessly delay resuming operations when
 * clients have crashed and will never recover.
 */
#define SERVER_RECOV_TIMEOUT_MS (30 * MSEC_PER_SEC)

/*
 * Not all clients recovered in time.  We fence them and reclaim
 * whatever resources they were using.  If we see a rid here then we're
 * going to fence it, regardless of whether it manages to finish
 * recovery while we're fencing it.
 */
static void fence_pending_recov_worker(struct work_struct *work)
{
	struct server_info *server = container_of(work, struct server_info,
						  fence_pending_recov_work);
	struct super_block *sb = server->sb;
	u64 rid;
	int ret;

	while ((rid = scoutfs_recov_next_pending(sb, SCOUTFS_RECOV_ALL)) > 0) {
		scoutfs_err(sb, "%lu ms recovery timeout expired for client rid %016llx, fencing",
			    SERVER_RECOV_TIMEOUT_MS, rid);

		ret = reclaim_rid(sb, rid);
		if (ret < 0) {
			scoutfs_err(sb, "error %d reclaiming rid %016llx, shutting down", ret, rid);
			stop_server(server);
			break;
		}

		scoutfs_server_recov_finish(sb, rid, SCOUTFS_RECOV_ALL);
	}
}

static void recovery_timeout(struct super_block *sb)
{
	DECLARE_SERVER_INFO(sb, server);

	if (!server->shutting_down)
		queue_work(server->wq, &server->fence_pending_recov_work);
}

/*
 * As the server starts up it needs to start waiting for recovery from
 * any clients which were still mounted in the last running server.
 * This is done before networking is started so we won't receive any
 * messages from clients until we've prepared them all.  If the clients
 * don't recover in time then they'll be fenced.
 */
static int start_recovery(struct super_block *sb)
{
	DECLARE_SERVER_INFO(sb, server);
	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
	SCOUTFS_BTREE_ITEM_REF(iref);
	struct scoutfs_key key;
	unsigned int nr = 0;
	u64 rid;
	int ret;

	for (rid = 0; ; rid++) {
		init_mounted_client_key(&key, rid);
		ret = scoutfs_btree_next(sb, &super->mounted_clients, &key, &iref);
		if (ret == -ENOENT) {
			ret = 0;
			break;
		}
		if (ret == 0) {
			rid = le64_to_cpu(iref.key->skmc_rid);
			scoutfs_btree_put_iref(&iref);
		}
		if (ret < 0)
			goto out;

		ret = scoutfs_recov_prepare(sb, rid, SCOUTFS_RECOV_ALL);
		if (ret < 0) {
			scoutfs_err(sb, "error %d preparing recovery for client rid %016llx, shutting down",
				    ret, rid);
			goto out;
		}

		nr++;
	}

	if (nr > 0) {
		scoutfs_info(sb, "waiting for %u clients to recover", nr);

		ret = scoutfs_recov_begin(sb, recovery_timeout, SERVER_RECOV_TIMEOUT_MS);
		if (ret > 0) {
			finished_recovery(sb);
			ret = 0;
		}
	}

out:
	if (ret < 0) {
		scoutfs_err(sb, "error %d starting recovery, shutting down", ret);
		stop_server(server);
	}
	return ret;
}

static void scoutfs_server_worker(struct work_struct *work)
{
	struct server_info *server = container_of(work, struct server_info,
@@ -1610,8 +1813,8 @@ static void scoutfs_server_worker(struct work_struct *work)
		goto shutdown;
	}

	ret = scoutfs_lock_server_setup(sb, &server->alloc, &server->wri,
					max_vers);
	ret = scoutfs_lock_server_setup(sb, &server->alloc, &server->wri, max_vers) ?:
	      start_recovery(sb);
	if (ret)
		goto shutdown;

@@ -1635,10 +1838,15 @@ shutdown:
	scoutfs_net_shutdown(sb, conn);
	server->conn = NULL;

	/* stop tracking recovery, cancel timer, flush any fencing */
	scoutfs_recov_shutdown(sb);
	flush_work(&server->fence_pending_recov_work);

	/* wait for extra queues by requests, won't find waiters */
	flush_work(&server->commit_work);

	scoutfs_lock_server_destroy(sb);
	scoutfs_omap_server_shutdown(sb);

out:
	scoutfs_net_free_conn(sb, conn);
@@ -1724,6 +1932,7 @@ int scoutfs_server_setup(struct super_block *sb)
	mutex_init(&server->srch_mutex);
	mutex_init(&server->mounted_clients_mutex);
	seqcount_init(&server->roots_seqcount);
	INIT_WORK(&server->fence_pending_recov_work, fence_pending_recov_worker);

	server->wq = alloc_workqueue("scoutfs_server",
				     WQ_UNBOUND | WQ_NON_REENTRANT, 0);

@@ -64,6 +64,12 @@ int scoutfs_server_lock_recover_request(struct super_block *sb, u64 rid,
					struct scoutfs_key *key);
int scoutfs_server_hold_commit(struct super_block *sb);
int scoutfs_server_apply_commit(struct super_block *sb, int err);
void scoutfs_server_recov_finish(struct super_block *sb, u64 rid, int which);

int scoutfs_server_send_omap_request(struct super_block *sb, u64 rid,
				     struct scoutfs_open_ino_map_args *args);
int scoutfs_server_send_omap_response(struct super_block *sb, u64 rid, u64 id,
				      struct scoutfs_open_ino_map *map, int err);

struct sockaddr_in;
struct scoutfs_quorum_elected_info;

@@ -44,6 +44,8 @@
#include "srch.h"
#include "item.h"
#include "alloc.h"
#include "recov.h"
#include "omap.h"
#include "scoutfs_trace.h"

static struct dentry *scoutfs_debugfs_root;
@@ -166,7 +168,7 @@ out:
	 * try to free as many locks as possible.
	 */
	if (scoutfs_trigger(sb, STATFS_LOCK_PURGE))
		scoutfs_free_unused_locks(sb, -1UL);
		scoutfs_free_unused_locks(sb);

	return ret;
}
@@ -243,25 +245,26 @@ static void scoutfs_put_super(struct super_block *sb)

	trace_scoutfs_put_super(sb);

	sbi->shutdown = true;

	scoutfs_data_destroy(sb);
	scoutfs_srch_destroy(sb);

	scoutfs_unlock(sb, sbi->rid_lock, SCOUTFS_LOCK_WRITE);
	sbi->rid_lock = NULL;

	scoutfs_lock_shutdown(sb);

	scoutfs_shutdown_trans(sb);
	scoutfs_client_destroy(sb);
	scoutfs_inode_destroy(sb);
	scoutfs_item_destroy(sb);
	scoutfs_forest_destroy(sb);
	scoutfs_data_destroy(sb);

	scoutfs_quorum_destroy(sb);
	scoutfs_lock_shutdown(sb);
	scoutfs_server_destroy(sb);
	scoutfs_recov_destroy(sb);
	scoutfs_net_destroy(sb);
	scoutfs_lock_destroy(sb);
	scoutfs_omap_destroy(sb);

	scoutfs_block_destroy(sb);
	scoutfs_destroy_triggers(sb);
@@ -591,8 +594,10 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
	      scoutfs_inode_setup(sb) ?:
	      scoutfs_data_setup(sb) ?:
	      scoutfs_setup_trans(sb) ?:
	      scoutfs_omap_setup(sb) ?:
	      scoutfs_lock_setup(sb) ?:
	      scoutfs_net_setup(sb) ?:
	      scoutfs_recov_setup(sb) ?:
	      scoutfs_server_setup(sb) ?:
	      scoutfs_quorum_setup(sb) ?:
	      scoutfs_client_setup(sb) ?:
@@ -643,6 +648,9 @@ static void scoutfs_kill_sb(struct super_block *sb)
{
	trace_scoutfs_kill_sb(sb);

	if (SCOUTFS_HAS_SBI(sb))
		scoutfs_lock_unmount_begin(sb);

	kill_block_super(sb);
}

@@ -26,6 +26,8 @@ struct net_info;
struct block_info;
struct forest_info;
struct srch_info;
struct recov_info;
struct omap_info;

struct scoutfs_sb_info {
	struct super_block *sb;
@@ -48,6 +50,7 @@ struct scoutfs_sb_info {
	struct block_info *block_info;
	struct forest_info *forest_info;
	struct srch_info *srch_info;
	struct omap_info *omap_info;
	struct item_cache_info *item_cache_info;

	wait_queue_head_t trans_hold_wq;
@@ -70,6 +73,7 @@ struct scoutfs_sb_info {
	struct lock_server_info *lock_server_info;
	struct client_info *client_info;
	struct server_info *server_info;
	struct recov_info *recov_info;
	struct sysfs_info *sfsinfo;

	struct scoutfs_counters *counters;
@@ -81,8 +85,6 @@ struct scoutfs_sb_info {

	struct dentry *debug_root;

	bool shutdown;

	unsigned long corruption_messages_once[SC_NR_LONGS];
};

@@ -564,8 +564,15 @@ int scoutfs_setup_trans(struct super_block *sb)
}

/*
 * kill_sb calls sync before getting here so we know that dirty data
 * should be in flight.  We just have to wait for it to quiesce.
 * While the vfs will have done an fs level sync before calling
 * put_super, we may have done work down in our level after all the fs
 * ops were done.  An example is final inode deletion in iput, which
 * happens in generic_shutdown_super after the sync and before our
 * put_super is called.
 *
 * So we always try to write any remaining dirty transactions before
 * shutting down.  Typically there won't be any dirty data and the
 * worker will just return.
 */
void scoutfs_shutdown_trans(struct super_block *sb)
{
@@ -573,13 +580,18 @@ void scoutfs_shutdown_trans(struct super_block *sb)
	DECLARE_TRANS_INFO(sb, tri);

	if (tri) {
		scoutfs_block_writer_forget_all(sb, &tri->wri);
		if (sbi->trans_write_workq) {
			/* immediately queues pending timer */
			flush_delayed_work(&sbi->trans_write_work);
			/* prevents re-arming if it has to wait */
			cancel_delayed_work_sync(&sbi->trans_write_work);
			destroy_workqueue(sbi->trans_write_workq);
			/* trans work scheduled after shutdown sees null */
			sbi->trans_write_workq = NULL;
		}

		scoutfs_block_writer_forget_all(sb, &tri->wri);

		kfree(tri);
		sbi->trans_info = NULL;
	}
@@ -52,8 +52,8 @@ t_filter_dmesg()

	# tests that drop unmount io triggers fencing
	re="$re|scoutfs .* error: fencing "
	re="$re|scoutfs .*: waiting for .* lock clients"
	re="$re|scoutfs .*: all lock clients recovered"
	re="$re|scoutfs .*: waiting for .* clients"
	re="$re|scoutfs .*: all clients recovered"
	re="$re|scoutfs .* error: client rid.*lock recovery timed out"

	# some tests mount w/o options

@@ -129,7 +129,7 @@ t_umount()
	test "$nr" -lt "$T_NR_MOUNTS" || \
		t_fail "fs nr $nr invalid"

	eval t_quiet umount \$T_M$i
	eval t_quiet umount \$T_M$nr
}

#
0	tests/golden/export-lookup-evict-race	Normal file
27	tests/golden/inode-deletion	Normal file
@@ -0,0 +1,27 @@
== basic unlink deletes
ino found in dseq index
ino not found in dseq index
== local open-unlink waits for close to delete
contents after rm: contents
ino found in dseq index
ino not found in dseq index
== multiple local opens are protected
contents after rm 1: contents
contents after rm 2: contents
ino found in dseq index
ino not found in dseq index
== remote unopened unlink deletes
ino not found in dseq index
ino not found in dseq index
== unlink wait for open on other mount
mount 0 contents after mount 1 rm: contents
ino found in dseq index
ino found in dseq index
stat: cannot stat ‘/mnt/test/test/inode-deletion/file’: No such file or directory
ino not found in dseq index
ino not found in dseq index
== lots of deletions use one open map
== open files survive remote scanning orphans
mount 0 contents after mount 1 remounted: contents
ino not found in dseq index
ino not found in dseq index
@@ -13,6 +13,7 @@ lock-refleak.sh
lock-shrink-consistency.sh
lock-pr-cw-conflict.sh
lock-revoke-getcwd.sh
export-lookup-evict-race.sh
createmany-parallel.sh
createmany-large-names.sh
createmany-rename-large-dir.sh
@@ -30,4 +31,5 @@ mount-unmount-race.sh
createmany-parallel-mounts.sh
archive-light-cycle.sh
block-stale-reads.sh
inode-deletion.sh
xfstests.sh

32	tests/tests/export-lookup-evict-race.sh	Normal file
@@ -0,0 +1,32 @@
#
# test racing fh_to_dentry with evict from lock invalidation.  We've
# had deadlocks between the ordering of iget and evict when they acquire
# cluster locks.
#

t_require_commands touch stat handle_cat
t_require_mounts 2

CPUS=$(getconf _NPROCESSORS_ONLN)
NR=$((CPUS * 4))
END=$((SECONDS + 30))

touch "$T_D0/file"
ino=$(stat -c "%i" "$T_D0/file")

while test $SECONDS -lt $END; do
	for i in $(seq 1 $NR); do
		fs=$((RANDOM % T_NR_MOUNTS))
		eval dir="\$T_D${fs}"
		write=$((RANDOM & 1))

		if [ "$write" == 1 ]; then
			touch "$dir/file" &
		else
			handle_cat "$dir" "$ino" &
		fi
	done
	wait
done

t_pass
98	tests/tests/inode-deletion.sh	Normal file
@@ -0,0 +1,98 @@
#
# test deleting an inode once all its links and references are gone.
#

t_require_commands cat scoutfs
t_require_mounts 2

FILE="$T_D0/file"

check_ino_index() {
	local ino="$1"
	local dseq="$2"
	local mnt="$3"

	t_sync_seq_index

	scoutfs walk-inodes -p "$mnt" -- data_seq $dseq $(($dseq + 1)) |
		awk 'BEGIN { not = "not " }
		     ($4 == '$ino') { not = ""; exit; }
		     END { print "ino " not "found in dseq index" }'
}

echo "== basic unlink deletes"
echo "contents" > "$FILE"
ino=$(stat -c "%i" "$FILE")
dseq=$(scoutfs stat -s data_seq "$FILE")
check_ino_index "$ino" "$dseq" "$T_M0"
rm -f "$FILE"
check_ino_index "$ino" "$dseq" "$T_M0"

echo "== local open-unlink waits for close to delete"
echo "contents" > "$FILE"
ino=$(stat -c "%i" "$FILE")
dseq=$(scoutfs stat -s data_seq "$FILE")
exec {FD}<"$FILE"	# open unused fd, assign to FD
rm -f "$FILE"
echo "contents after rm: $(cat <&$FD)"
check_ino_index "$ino" "$dseq" "$T_M0"
exec {FD}>&-	# close
check_ino_index "$ino" "$dseq" "$T_M0"

echo "== multiple local opens are protected"
echo "contents" > "$FILE"
ino=$(stat -c "%i" "$FILE")
dseq=$(scoutfs stat -s data_seq "$FILE")
exec {FD1}<"$FILE"
exec {FD2}<"$FILE"
rm -f "$FILE"
echo "contents after rm 1: $(cat <&$FD1)"
echo "contents after rm 2: $(cat <&$FD2)"
check_ino_index "$ino" "$dseq" "$T_M0"
exec {FD1}>&-	# close
exec {FD2}>&-	# close
check_ino_index "$ino" "$dseq" "$T_M0"

echo "== remote unopened unlink deletes"
echo "contents" > "$T_D0/file"
ino=$(stat -c "%i" "$T_D0/file")
dseq=$(scoutfs stat -s data_seq "$T_D0/file")
rm -f "$T_D1/file"
check_ino_index "$ino" "$dseq" "$T_M0"
check_ino_index "$ino" "$dseq" "$T_M1"

echo "== unlink wait for open on other mount"
echo "contents" > "$T_D0/file"
ino=$(stat -c "%i" "$T_D0/file")
dseq=$(scoutfs stat -s data_seq "$T_D0/file")
exec {FD}<"$T_D0/file"
rm -f "$T_D1/file"
echo "mount 0 contents after mount 1 rm: $(cat <&$FD)"
check_ino_index "$ino" "$dseq" "$T_M0"
check_ino_index "$ino" "$dseq" "$T_M1"
exec {FD}>&-	# close
# we know that revalidating will unhash the remote dentry
stat "$T_D0/file" 2>&1 | t_filter_fs
check_ino_index "$ino" "$dseq" "$T_M0"
check_ino_index "$ino" "$dseq" "$T_M1"

echo "== lots of deletions use one open map"
mkdir "$T_D0/dir"
touch "$T_D0/dir"/files-{1..5}
rm -f "$T_D0/dir"/files-*
rmdir "$T_D0/dir"

echo "== open files survive remote scanning orphans"
echo "contents" > "$T_D0/file"
ino=$(stat -c "%i" "$T_D0/file")
dseq=$(scoutfs stat -s data_seq "$T_D0/file")
exec {FD}<"$T_D0/file"
rm -f "$T_D0/file"
t_umount 1
t_mount 1
echo "mount 0 contents after mount 1 remounted: $(cat <&$FD)"
exec {FD}>&-	# close
check_ino_index "$ino" "$dseq" "$T_M0"
check_ino_index "$ino" "$dseq" "$T_M1"

t_pass
@@ -339,14 +339,6 @@ static int print_srch_root_item(struct scoutfs_key *key, void *val,
	return 0;
}

static int print_lock_clients_entry(struct scoutfs_key *key, void *val,
				    unsigned val_len, void *arg)
{
	printf(" rid %016llx\n", le64_to_cpu(key->sklc_rid));

	return 0;
}

static int print_trans_seqs_entry(struct scoutfs_key *key, void *val,
				  unsigned val_len, void *arg)
{
@@ -876,7 +868,6 @@ static void print_super_block(struct scoutfs_super_block *super, u64 blkno)
	       " server_meta_avail[1]: "AL_HEAD_F"\n"
	       " server_meta_freed[0]: "AL_HEAD_F"\n"
	       " server_meta_freed[1]: "AL_HEAD_F"\n"
	       " lock_clients root: height %u blkno %llu seq %llu\n"
	       " mounted_clients root: height %u blkno %llu seq %llu\n"
	       " srch_root root: height %u blkno %llu seq %llu\n"
	       " trans_seqs root: height %u blkno %llu seq %llu\n"
@@ -896,9 +887,6 @@ static void print_super_block(struct scoutfs_super_block *super, u64 blkno)
	       AL_HEAD_A(&super->server_meta_avail[1]),
	       AL_HEAD_A(&super->server_meta_freed[0]),
	       AL_HEAD_A(&super->server_meta_freed[1]),
	       super->lock_clients.height,
	       le64_to_cpu(super->lock_clients.ref.blkno),
	       le64_to_cpu(super->lock_clients.ref.seq),
	       super->mounted_clients.height,
	       le64_to_cpu(super->mounted_clients.ref.blkno),
	       le64_to_cpu(super->mounted_clients.ref.seq),
@@ -947,11 +935,6 @@ static int print_volume(int fd)

	ret = print_quorum_blocks(fd, super);

	err = print_btree(fd, super, "lock_clients", &super->lock_clients,
			  print_lock_clients_entry, NULL);
	if (err && !ret)
		ret = err;

	err = print_btree(fd, super, "mounted_clients", &super->mounted_clients,
			  print_mounted_client_entry, NULL);
	if (err && !ret)