scoutfs: implement lock recovery

When a server crashes all the connected clients still have operational
locks and can be using them to protect IO.  As a new server starts up
its lock service needs to account for those outstanding locks before
granting new locks to clients.

This implements lock recovery by having the lock service recover locks
from clients as it starts up.

First the lock service stores records of connected clients in a btree
off the super block.  Records are added as the server receives their
greeting and are removed as the server receives their farewell.

Then the server checks for existing persistent records as it starts up.
If it finds any it enters recovery and waits for all the old clients to
reconnect before resuming normal processing.

We add lock recover request and response messages that are used to
communicate locks from the clients to the server.

Signed-off-by: Zach Brown <zab@versity.com>
This commit is contained in:
Zach Brown
2019-02-08 13:19:30 -08:00
committed by Zach Brown
parent 801f6ad9be
commit ec0fb5380a
11 changed files with 659 additions and 51 deletions

View File

@@ -348,6 +348,7 @@ static void advance_to_next_half(struct scoutfs_btree_ring *bring)
static size_t super_root_offsets[] = {
offsetof(struct scoutfs_super_block, alloc_root),
offsetof(struct scoutfs_super_block, manifest.root),
offsetof(struct scoutfs_super_block, lock_clients),
};
#define for_each_super_root(super, i, root) \

View File

@@ -260,6 +260,19 @@ int scoutfs_client_lock_response(struct super_block *sb, u64 net_id,
net_id, 0, nl, sizeof(*nl));
}
/* Send a lock recover response to the server. */
int scoutfs_client_lock_recover_response(struct super_block *sb, u64 net_id,
struct scoutfs_net_lock_recover *nlr)
{
struct client_info *client = SCOUTFS_SB(sb)->client_info;
/* payload is the header plus nlr->nr trailing lock entries */
u16 bytes = offsetof(struct scoutfs_net_lock_recover,
locks[le16_to_cpu(nlr->nr)]);
return scoutfs_net_response(sb, client->conn,
SCOUTFS_NET_CMD_LOCK_RECOVER,
net_id, 0, nlr, bytes);
}
/* The client is receiving an invalidation request from the server */
static int client_lock(struct super_block *sb,
struct scoutfs_net_connection *conn, u8 cmd, u64 id,
@@ -273,6 +286,19 @@ static int client_lock(struct super_block *sb,
return scoutfs_lock_invalidate_request(sb, id, arg);
}
/* The server is asking us for the client's locks starting with the given key */
static int client_lock_recover(struct super_block *sb,
struct scoutfs_net_connection *conn,
u8 cmd, u64 id, void *arg, u16 arg_len)
{
/* the request payload must be exactly one starting key */
if (arg_len != sizeof(struct scoutfs_key))
return -EINVAL;
/* XXX error? */
return scoutfs_lock_recover_request(sb, id, arg);
}
/*
* Process a greeting response in the client from the server. This is
* called for every connected socket on the connection. The first
@@ -508,6 +534,7 @@ out:
static scoutfs_net_request_t client_req_funcs[] = {
[SCOUTFS_NET_CMD_COMPACT] = client_compact,
[SCOUTFS_NET_CMD_LOCK] = client_lock,
[SCOUTFS_NET_CMD_LOCK_RECOVER] = client_lock_recover,
};
/*

View File

@@ -21,6 +21,8 @@ int scoutfs_client_lock_request(struct super_block *sb,
struct scoutfs_net_lock *nl);
int scoutfs_client_lock_response(struct super_block *sb, u64 net_id,
struct scoutfs_net_lock *nl);
int scoutfs_client_lock_recover_response(struct super_block *sb, u64 net_id,
struct scoutfs_net_lock_recover *nlr);
int scoutfs_client_wait_node_id(struct super_block *sb);
int scoutfs_client_setup(struct super_block *sb);

View File

@@ -98,6 +98,7 @@
EXPAND_COUNTER(lock_lock) \
EXPAND_COUNTER(lock_lock_error) \
EXPAND_COUNTER(lock_nonblock_eagain) \
EXPAND_COUNTER(lock_recover_request) \
EXPAND_COUNTER(lock_shrink_queued) \
EXPAND_COUNTER(lock_shrink_request_aborted) \
EXPAND_COUNTER(lock_unlock) \

View File

@@ -272,6 +272,14 @@ struct scoutfs_extent_btree_key {
__be64 minor;
} __packed;
/*
* The lock server keeps a persistent record of connected clients so that
* server failover knows who to wait for before resuming operations.
*/
struct scoutfs_lock_client_btree_key {
__be64 node_id;
} __packed;
/*
* The max number of links defines the max number of entries that we can
* index in o(log n) and the static list head storage size in the
@@ -456,6 +464,7 @@ struct scoutfs_super_block {
struct scoutfs_btree_root alloc_root;
struct scoutfs_manifest manifest;
struct scoutfs_quorum_config quorum_config;
struct scoutfs_btree_root lock_clients;
} __packed;
#define SCOUTFS_ROOT_INO 1
@@ -642,6 +651,7 @@ enum {
SCOUTFS_NET_CMD_STATFS,
SCOUTFS_NET_CMD_COMPACT,
SCOUTFS_NET_CMD_LOCK,
SCOUTFS_NET_CMD_LOCK_RECOVER,
SCOUTFS_NET_CMD_FAREWELL,
SCOUTFS_NET_CMD_UNKNOWN,
};
@@ -768,6 +778,15 @@ struct scoutfs_net_lock {
__u8 new_mode;
} __packed;
struct scoutfs_net_lock_recover {
/* number of valid entries in locks[] below */
__le16 nr;
struct scoutfs_net_lock locks[0];
} __packed;
/* the most lock entries that fit in one message's data payload */
#define SCOUTFS_NET_LOCK_MAX_RECOVER_NR \
((SCOUTFS_NET_MAX_DATA_LEN - sizeof(struct scoutfs_net_lock_recover)) /\
sizeof(struct scoutfs_net_lock))
/* some enums for tracing */
enum {
SLT_CLIENT,

View File

@@ -54,6 +54,13 @@
* lock attempt can't immediately match an existing granted lock. This
* is fine for the only rare user which can back out of its lock
* inversion and retry with a full blocking lock.
*
* Lock recovery is initiated by the server when it recognizes that
* we're reconnecting to it while a previous server left a persistent
* record of us. We resend all our pending requests which are deferred
* until recovery finishes. The server sends us a recovery request and
* we respond with all our locks. Our resent requests are processed
* relative to that lock state we resend.
*/
#define GRACE_PERIOD_KT ms_to_ktime(2)
@@ -407,7 +414,8 @@ static void lock_remove(struct lock_info *linfo, struct scoutfs_lock *lock)
}
static struct scoutfs_lock *lock_lookup(struct super_block *sb,
struct scoutfs_key *start)
struct scoutfs_key *start,
struct scoutfs_lock **next)
{
DECLARE_LOCK_INFO(sb, linfo);
struct rb_node *node = linfo->lock_tree.rb_node;
@@ -416,16 +424,22 @@ static struct scoutfs_lock *lock_lookup(struct super_block *sb,
assert_spin_locked(&linfo->lock);
if (next)
*next = NULL;
while (node) {
lock = container_of(node, struct scoutfs_lock, node);
cmp = scoutfs_key_compare(start, &lock->start);
if (cmp < 0)
if (cmp < 0) {
if (next)
*next = lock;
node = node->rb_left;
else if (cmp > 0)
} else if (cmp > 0) {
node = node->rb_right;
else
} else {
return lock;
}
}
return NULL;
@@ -454,7 +468,7 @@ static struct scoutfs_lock *get_lock(struct super_block *sb,
assert_spin_locked(&linfo->lock);
lock = lock_lookup(sb, start);
lock = lock_lookup(sb, start, NULL);
if (lock)
__lock_del_lru(linfo, lock);
@@ -599,7 +613,7 @@ int scoutfs_lock_grant_response(struct super_block *sb,
spin_lock(&linfo->lock);
/* lock must already be busy with request_pending */
lock = lock_lookup(sb, &nl->key);
lock = lock_lookup(sb, &nl->key, NULL);
BUG_ON(!lock);
BUG_ON(!lock->request_pending);
@@ -750,6 +764,58 @@ int scoutfs_lock_invalidate_request(struct super_block *sb, u64 net_id,
return 0;
}
/*
* The server is asking us to send them as many locks as we can starting
* with the given key. We'll send a response with 0 locks to indicate
* that we've sent all our locks. This is called in client processing
* so the client won't try to reconnect to another server until we
* return.
*/
int scoutfs_lock_recover_request(struct super_block *sb, u64 net_id,
struct scoutfs_key *key)
{
DECLARE_LOCK_INFO(sb, linfo);
struct scoutfs_net_lock_recover *nlr;
struct scoutfs_lock *lock;
struct scoutfs_lock *next;
struct rb_node *node;
int ret;
int i;
scoutfs_inc_counter(sb, lock_recover_request);
/* allocate the largest response we might fill */
nlr = kmalloc(offsetof(struct scoutfs_net_lock_recover,
locks[SCOUTFS_NET_LOCK_MAX_RECOVER_NR]),
GFP_NOFS);
if (!nlr)
return -ENOMEM;
spin_lock(&linfo->lock);
/* start from the lock at the key, or the next greater lock */
lock = lock_lookup(sb, key, &next) ?: next;
for (i = 0; lock && i < SCOUTFS_NET_LOCK_MAX_RECOVER_NR; i++) {
nlr->locks[i].key = lock->start;
/* report the currently held mode as both old and new */
nlr->locks[i].old_mode = lock->mode;
nlr->locks[i].new_mode = lock->mode;
node = rb_next(&lock->node);
if (node)
lock = rb_entry(node, struct scoutfs_lock, node);
else
lock = NULL;
}
/* nr == 0 tells the server we've sent all our locks */
nlr->nr = cpu_to_le16(i);
spin_unlock(&linfo->lock);
ret = scoutfs_client_lock_recover_response(sb, net_id, nlr);
kfree(nlr);
return ret;
}
static bool lock_wait_cond(struct super_block *sb, struct scoutfs_lock *lock,
int mode)
{

View File

@@ -48,6 +48,8 @@ int scoutfs_lock_grant_response(struct super_block *sb,
struct scoutfs_net_lock *nl);
int scoutfs_lock_invalidate_request(struct super_block *sb, u64 net_id,
struct scoutfs_net_lock *nl);
int scoutfs_lock_recover_request(struct super_block *sb, u64 net_id,
struct scoutfs_key *key);
int scoutfs_lock_inode(struct super_block *sb, int mode, int flags,
struct inode *inode, struct scoutfs_lock **ret_lock);

View File

@@ -18,6 +18,9 @@
#include "counters.h"
#include "net.h"
#include "tseq.h"
#include "spbm.h"
#include "btree.h"
#include "msg.h"
#include "scoutfs_trace.h"
#include "lock_server.h"
@@ -67,10 +70,18 @@
* server shuts down.
*/
#define LOCK_SERVER_RECOVERY_MS (10 * MSEC_PER_SEC)
struct lock_server_info {
struct super_block *sb;
spinlock_t lock;
struct mutex mutex;
struct rb_root locks_root;
struct scoutfs_spbm recovery_pending;
struct delayed_work recovery_dwork;
struct scoutfs_tseq_tree tseq_tree;
struct dentry *tseq_dentry;
};
@@ -222,10 +233,12 @@ static bool client_entries_compatible(struct client_lock_entry *granted,
*/
static struct server_lock_node *get_server_lock(struct lock_server_info *inf,
struct scoutfs_key *key,
struct server_lock_node *ins)
struct server_lock_node *ins,
bool or_next)
{
struct rb_root *root = &inf->locks_root;
struct server_lock_node *ret = NULL;
struct server_lock_node *next = NULL;
struct server_lock_node *snode;
struct rb_node *parent = NULL;
struct rb_node **node;
@@ -240,6 +253,8 @@ static struct server_lock_node *get_server_lock(struct lock_server_info *inf,
cmp = scoutfs_key_compare(key, &snode->key);
if (cmp < 0) {
if (or_next)
next = snode;
node = &(*node)->rb_left;
} else if (cmp > 0) {
node = &(*node)->rb_right;
@@ -255,6 +270,9 @@ static struct server_lock_node *get_server_lock(struct lock_server_info *inf,
ret = ins;
}
if (ret == NULL && or_next && next)
ret = next;
if (ret)
atomic_inc(&ret->refcount);
@@ -266,6 +284,33 @@ static struct server_lock_node *get_server_lock(struct lock_server_info *inf,
return ret;
}
/* Get a server lock node, allocating if one doesn't exist. Caller must put. */
static struct server_lock_node *alloc_server_lock(struct lock_server_info *inf,
struct scoutfs_key *key)
{
struct server_lock_node *snode;
struct server_lock_node *ins;
snode = get_server_lock(inf, key, NULL, false);
if (snode == NULL) {
ins = kzalloc(sizeof(struct server_lock_node), GFP_NOFS);
if (ins) {
atomic_set(&ins->refcount, 0);
mutex_init(&ins->mutex);
ins->key = *key;
INIT_LIST_HEAD(&ins->granted);
INIT_LIST_HEAD(&ins->requested);
INIT_LIST_HEAD(&ins->invalidated);
/* a racing inserter may win; free our copy if so */
snode = get_server_lock(inf, key, ins, false);
if (snode != ins)
kfree(ins);
}
}
return snode;
}
/*
* Finish with a server lock which has the mutex held, freeing it if
* it's empty and unused.
@@ -324,7 +369,6 @@ int scoutfs_lock_server_request(struct super_block *sb, u64 node_id,
DECLARE_LOCK_SERVER_INFO(sb, inf);
struct client_lock_entry *clent;
struct server_lock_node *snode;
struct server_lock_node *ins;
int ret;
trace_scoutfs_lock_message(sb, SLT_SERVER, SLT_GRANT, SLT_REQUEST,
@@ -346,25 +390,11 @@ int scoutfs_lock_server_request(struct super_block *sb, u64 node_id,
clent->net_id = net_id;
clent->mode = nl->new_mode;
snode = get_server_lock(inf, &nl->key, NULL);
snode = alloc_server_lock(inf, &nl->key);
if (snode == NULL) {
ins = kzalloc(sizeof(struct server_lock_node), GFP_NOFS);
if (ins == NULL) {
kfree(clent);
ret = -ENOMEM;
goto out;
}
atomic_set(&ins->refcount, 0);
mutex_init(&ins->mutex);
ins->key = nl->key;
INIT_LIST_HEAD(&ins->granted);
INIT_LIST_HEAD(&ins->requested);
INIT_LIST_HEAD(&ins->invalidated);
snode = get_server_lock(inf, &nl->key, ins);
if (snode != ins)
kfree(ins);
kfree(clent);
ret = -ENOMEM;
goto out;
}
clent->snode = snode;
@@ -401,7 +431,7 @@ int scoutfs_lock_server_response(struct super_block *sb, u64 node_id,
}
/* XXX should always have a server lock here? recovery? */
snode = get_server_lock(inf, &nl->key, NULL);
snode = get_server_lock(inf, &nl->key, NULL, false);
if (!snode) {
ret = -EINVAL;
goto out;
@@ -441,6 +471,14 @@ out:
* This is called with the snode mutex held. This can free the snode if
* it's empty. The caller can't reference the snode once this returns
* so we unlock the snode mutex.
*
* All progress must wait for all clients to finish with recovery
* because we don't know which locks they'll hold. The unlocked
* recovery_pending test here is OK. It's filled by setup before
* anything runs. It's emptied by recovery completion. We can get a
* false nonempty result if we race with recovery completion, but that's
* OK because recovery completion processes all the locks that have
* requests after emptying, including the unlikely loser of that race.
*/
static int process_waiting_requests(struct super_block *sb,
struct server_lock_node *snode)
@@ -455,8 +493,9 @@ static int process_waiting_requests(struct super_block *sb,
BUG_ON(!mutex_is_locked(&snode->mutex));
/* request processing waits for all invalidation responses */
if (!list_empty(&snode->invalidated)) {
/* processing waits for all invalidation responses or recovery */
if (!list_empty(&snode->invalidated) ||
!scoutfs_spbm_empty(&inf->recovery_pending)) {
ret = 0;
goto out;
}
@@ -523,6 +562,320 @@ out:
return ret;
}
/*
* The server received a greeting from a client for the first time. If
* the client had already talked to the server then we must find an
existing record for it and should begin recovery. If it doesn't have
a record then it's timed out and we can't allow it to reconnect. If
it's connecting for the first time then we insert a new record.
*
* This is running in concurrent client greeting processing contexts.
*/
int scoutfs_lock_server_greeting(struct super_block *sb, u64 node_id,
bool should_exist)
{
DECLARE_LOCK_SERVER_INFO(sb, inf);
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
struct scoutfs_lock_client_btree_key cbk;
SCOUTFS_BTREE_ITEM_REF(iref);
struct scoutfs_key key;
int ret;
cbk.node_id = cpu_to_be64(node_id);
mutex_lock(&inf->mutex);
if (should_exist) {
/* reconnecting client must still have its persistent record */
ret = scoutfs_btree_lookup(sb, &super->lock_clients,
&cbk, sizeof(cbk), &iref);
if (ret == 0)
scoutfs_btree_put_iref(&iref);
} else {
/* first contact: record the client so failover can find it */
ret = scoutfs_btree_insert(sb, &super->lock_clients,
&cbk, sizeof(cbk), NULL, 0);
}
mutex_unlock(&inf->mutex);
if (should_exist && ret == 0) {
/* begin recovery by asking for locks from the zero key */
scoutfs_key_set_zeros(&key);
ret = scoutfs_server_lock_recover_request(sb, node_id, &key);
if (ret)
goto out;
}
out:
return ret;
}
/*
* A client sent their last recovery response and can exit recovery. If
* they were the last client in recovery then we can process all the
* server locks that had requests.
*/
static int finished_recovery(struct super_block *sb, u64 node_id, bool cancel)
{
DECLARE_LOCK_SERVER_INFO(sb, inf);
struct server_lock_node *snode;
struct scoutfs_key key;
bool still_pending;
int ret = 0;
spin_lock(&inf->lock);
scoutfs_spbm_clear(&inf->recovery_pending, node_id);
still_pending = !scoutfs_spbm_empty(&inf->recovery_pending);
spin_unlock(&inf->lock);
/* nothing more to do while other clients are still recovering */
if (still_pending)
return 0;
if (cancel)
cancel_delayed_work_sync(&inf->recovery_dwork);
/* walk all server locks, processing any with queued requests */
scoutfs_key_set_zeros(&key);
while ((snode = get_server_lock(inf, &key, NULL, true))) {
key = snode->key;
scoutfs_key_inc(&key);
if (!list_empty(&snode->requested)) {
/* process_waiting_requests unlocks and can free snode */
ret = process_waiting_requests(sb, snode);
if (ret)
break;
} else {
put_server_lock(inf, snode);
}
}
return ret;
}
/*
* We sent a lock recover request to the client when we received its
* greeting while in recovery. Here we instantiate all the locks it
* gave us in response and send another request from the next key.
* We're done once we receive an empty response.
*/
int scoutfs_lock_server_recover_response(struct super_block *sb, u64 node_id,
struct scoutfs_net_lock_recover *nlr)
{
DECLARE_LOCK_SERVER_INFO(sb, inf);
struct client_lock_entry *existing;
struct client_lock_entry *clent;
struct server_lock_node *snode;
struct scoutfs_key key;
int ret = 0;
int i;
/* client must be in recovery */
spin_lock(&inf->lock);
if (!scoutfs_spbm_test(&inf->recovery_pending, node_id))
ret = -EINVAL;
spin_unlock(&inf->lock);
if (ret)
goto out;
/* client has sent us all their locks */
if (nlr->nr == 0) {
ret = finished_recovery(sb, node_id, true);
goto out;
}
/* instantiate a granted entry for each recovered lock */
for (i = 0; i < le16_to_cpu(nlr->nr); i++) {
clent = kzalloc(sizeof(struct client_lock_entry), GFP_NOFS);
if (!clent) {
ret = -ENOMEM;
goto out;
}
INIT_LIST_HEAD(&clent->head);
clent->node_id = node_id;
/* recovered grants aren't responses to a pending request */
clent->net_id = 0;
clent->mode = nlr->locks[i].new_mode;
snode = alloc_server_lock(inf, &nlr->locks[i].key);
if (snode == NULL) {
kfree(clent);
ret = -ENOMEM;
goto out;
}
/* a client may only recover each lock once */
existing = find_entry(snode, &snode->granted, node_id);
if (existing) {
kfree(clent);
put_server_lock(inf, snode);
ret = -EEXIST;
goto out;
}
clent->snode = snode;
add_client_entry(snode, &snode->granted, clent);
scoutfs_tseq_add(&inf->tseq_tree, &clent->tseq_entry);
put_server_lock(inf, snode);
}
/* send request for next batch of keys */
key = nlr->locks[le16_to_cpu(nlr->nr) - 1].key;
scoutfs_key_inc(&key);
ret = scoutfs_server_lock_recover_request(sb, node_id, &key);
out:
return ret;
}
/*
 * Extract the node_id from a lock client btree item reference and drop
 * the reference.  Returns 0 on success or -EIO if the item's key or
 * value size doesn't match a client record.
 */
static int node_id_and_put_iref(struct scoutfs_btree_item_ref *iref,
				u64 *node_id)
{
	struct scoutfs_lock_client_btree_key *cbk = iref->key;
	int ret = -EIO;

	if (iref->key_len == sizeof(*cbk) && iref->val_len == 0) {
		*node_id = be64_to_cpu(cbk->node_id);
		ret = 0;
	}

	scoutfs_btree_put_iref(iref);
	return ret;
}
/*
* This work executes if enough time passes without all of the clients
* finishing with recovery and canceling the work. We walk through the
* client records and find any that still have their recovery pending.
*/
static void scoutfs_lock_server_recovery_timeout(struct work_struct *work)
{
struct lock_server_info *inf = container_of(work,
struct lock_server_info,
recovery_dwork.work);
struct super_block *sb = inf->sb;
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
struct scoutfs_lock_client_btree_key cbk;
SCOUTFS_BTREE_ITEM_REF(iref);
bool timed_out;
u64 node_id;
int ret;
/* we enter recovery if there are any client records */
for (node_id = 0; ; node_id++) {
cbk.node_id = cpu_to_be64(node_id);
ret = scoutfs_btree_next(sb, &super->lock_clients,
&cbk, sizeof(cbk), &iref);
if (ret == -ENOENT) {
ret = 0;
break;
}
if (ret == 0)
ret = node_id_and_put_iref(&iref, &node_id);
if (ret < 0)
break;
spin_lock(&inf->lock);
if (scoutfs_spbm_test(&inf->recovery_pending, node_id)) {
scoutfs_spbm_clear(&inf->recovery_pending, node_id);
timed_out = true;
} else {
timed_out = false;
}
spin_unlock(&inf->lock);
if (!timed_out)
continue;
scoutfs_err(sb, "client node_id %llu lock recovery timed out",
node_id);
/* XXX these aren't immediately committed */
cbk.node_id = cpu_to_be64(node_id);
ret = scoutfs_btree_delete(sb, &super->lock_clients,
&cbk, sizeof(cbk));
if (ret)
break;
}
/* force processing all pending lock requests */
if (ret == 0)
ret = finished_recovery(sb, 0, false);
if (ret < 0) {
scoutfs_err(sb, "lock server saw err %d while timing out clients, shutting down", ret);
scoutfs_server_stop(sb);
}
}
/*
* A client is leaving the lock service. They aren't using locks and
* won't send any more requests. We tear down all the state we had for
* them. This can be called multiple times for a given client as their
* farewell is resent to new servers. It's OK to not find any state.
* If we fail to delete a persistent entry then we have to shut down and
* hope that the next server has more luck.
*/
int scoutfs_lock_server_farewell(struct super_block *sb, u64 node_id)
{
DECLARE_LOCK_SERVER_INFO(sb, inf);
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
struct scoutfs_lock_client_btree_key cli;
struct client_lock_entry *clent;
struct client_lock_entry *tmp;
struct server_lock_node *snode;
struct scoutfs_key key;
struct list_head *list;
bool freed;
int ret = 0;
/* remove the client's persistent record */
cli.node_id = cpu_to_be64(node_id);
mutex_lock(&inf->mutex);
ret = scoutfs_btree_delete(sb, &super->lock_clients, &cli, sizeof(cli));
mutex_unlock(&inf->mutex);
/* -ENOENT means a resent farewell already cleaned up */
if (ret == -ENOENT) {
ret = 0;
goto out;
}
if (ret < 0)
goto out;
/* walk all server locks, freeing the client's entries */
scoutfs_key_set_zeros(&key);
while ((snode = get_server_lock(inf, &key, NULL, true))) {
freed = false;
/* visit the granted, requested, and invalidated lists in turn */
for (list = &snode->granted; list != NULL;
list = (list == &snode->granted) ? &snode->requested :
(list == &snode->requested) ? &snode->invalidated :
NULL) {
list_for_each_entry_safe(clent, tmp, list, head) {
if (clent->node_id == node_id) {
free_client_entry(inf, snode, clent);
freed = true;
}
}
}
key = snode->key;
scoutfs_key_inc(&key);
if (freed) {
/* freed entries may unblock waiting requests */
ret = process_waiting_requests(sb, snode);
if (ret)
goto out;
} else {
put_server_lock(inf, snode);
}
}
ret = 0;
out:
if (ret < 0) {
scoutfs_err(sb, "lock server err %d during node %llu farewell, shutting down", ret, node_id);
scoutfs_server_stop(sb);
}
return ret;
}
static char *lock_mode_string(u8 mode)
{
static char *mode_strings[] = {
@@ -566,17 +919,35 @@ static void lock_server_tseq_show(struct seq_file *m,
clent->net_id);
}
/*
* Setup the lock server. This is called before networking can deliver
* requests. If we find existing client records then we enter recovery.
* Lock request processing is deferred until recovery is resolved for
* all the existing clients, either they reconnect and replay locks or
* we time them out.
*/
int scoutfs_lock_server_setup(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
struct lock_server_info *inf;
SCOUTFS_BTREE_ITEM_REF(iref);
struct scoutfs_lock_client_btree_key cbk;
unsigned int nr;
u64 node_id;
int ret;
inf = kzalloc(sizeof(struct lock_server_info), GFP_KERNEL);
if (!inf)
return -ENOMEM;
inf->sb = sb;
spin_lock_init(&inf->lock);
mutex_init(&inf->mutex);
inf->locks_root = RB_ROOT;
scoutfs_spbm_init(&inf->recovery_pending);
INIT_DELAYED_WORK(&inf->recovery_dwork,
scoutfs_lock_server_recovery_timeout);
scoutfs_tseq_tree_init(&inf->tseq_tree, lock_server_tseq_show);
inf->tseq_dentry = scoutfs_tseq_create("server_locks", sbi->debug_root,
@@ -588,7 +959,37 @@ int scoutfs_lock_server_setup(struct super_block *sb)
sbi->lock_server_info = inf;
return 0;
/* we enter recovery if there are any client records */
nr = 0;
for (node_id = 0; ; node_id++) {
cbk.node_id = cpu_to_be64(node_id);
ret = scoutfs_btree_next(sb, &super->lock_clients,
&cbk, sizeof(cbk), &iref);
if (ret == -ENOENT)
break;
if (ret == 0)
ret = node_id_and_put_iref(&iref, &node_id);
if (ret < 0)
goto out;
ret = scoutfs_spbm_set(&inf->recovery_pending, node_id);
if (ret)
goto out;
nr++;
if (node_id == U64_MAX)
break;
}
ret = 0;
if (nr) {
schedule_delayed_work(&inf->recovery_dwork,
msecs_to_jiffies(LOCK_SERVER_RECOVERY_MS));
scoutfs_warn(sb, "waiting for %u lock clients to connect", nr);
}
out:
return ret;
}
/*
@@ -606,6 +1007,8 @@ void scoutfs_lock_server_destroy(struct super_block *sb)
LIST_HEAD(list);
if (inf) {
cancel_delayed_work_sync(&inf->recovery_dwork);
debugfs_remove(inf->tseq_dentry);
rbtree_postorder_for_each_entry_safe(snode, stmp,
@@ -624,6 +1027,8 @@ void scoutfs_lock_server_destroy(struct super_block *sb)
kfree(snode);
}
scoutfs_spbm_destroy(&inf->recovery_pending);
kfree(inf);
sbi->lock_server_info = NULL;
}

View File

@@ -1,10 +1,15 @@
#ifndef _SCOUTFS_LOCK_SERVER_H_
#define _SCOUTFS_LOCK_SERVER_H_
int scoutfs_lock_server_recover_response(struct super_block *sb, u64 node_id,
struct scoutfs_net_lock_recover *nlr);
int scoutfs_lock_server_request(struct super_block *sb, u64 node_id,
u64 net_id, struct scoutfs_net_lock *nl);
int scoutfs_lock_server_greeting(struct super_block *sb, u64 node_id,
bool should_exist);
int scoutfs_lock_server_response(struct super_block *sb, u64 node_id,
struct scoutfs_net_lock *nl);
int scoutfs_lock_server_farewell(struct super_block *sb, u64 node_id);
int scoutfs_lock_server_setup(struct super_block *sb);
void scoutfs_lock_server_destroy(struct super_block *sb);

View File

@@ -1066,6 +1066,39 @@ int scoutfs_server_lock_response(struct super_block *sb, u64 node_id,
nl, sizeof(*nl));
}
/*
 * Return true if a lock recover payload is malformed: too short to
 * contain the header, or whose length doesn't match the header plus
 * the nr lock entries it claims to carry.
 */
static bool invalid_recover(struct scoutfs_net_lock_recover *nlr,
			    unsigned long bytes)
{
	unsigned long want;

	if (bytes < sizeof(*nlr))
		return true;

	want = offsetof(struct scoutfs_net_lock_recover,
			locks[le16_to_cpu(nlr->nr)]);
	return bytes != want;
}
/* Validate and pass a client's lock recover response to the lock server. */
static int lock_recover_response(struct super_block *sb,
struct scoutfs_net_connection *conn,
void *resp, unsigned int resp_len,
int error, void *data)
{
u64 node_id = scoutfs_net_client_node_id(conn);
/* don't trust the client; check the payload before using it */
if (invalid_recover(resp, resp_len))
return -EINVAL;
return scoutfs_lock_server_recover_response(sb, node_id, resp);
}
/* Ask a recovering client for its locks starting with the given key. */
int scoutfs_server_lock_recover_request(struct super_block *sb, u64 node_id,
struct scoutfs_key *key)
{
struct server_info *server = SCOUTFS_SB(sb)->server_info;
/* the response is handled asynchronously by lock_recover_response */
return scoutfs_net_submit_request_node(sb, server->conn, node_id,
SCOUTFS_NET_CMD_LOCK_RECOVER,
key, sizeof(*key),
lock_recover_response,
NULL, NULL);
}
/*
* Process an incoming greeting request in the server from the client.
* We try to send responses to failed greetings so that the sender can
@@ -1083,6 +1116,14 @@ int scoutfs_server_lock_response(struct super_block *sb, u64 node_id,
* disconnect before they receive the response and resent and initial
* blank greeting. We could use a client uuid to associate with
* allocated node_ids.
*
* XXX The logic of this has gotten convoluted. The lock server can
* send a recovery request so it needs to be called after the core net
* greeting call enables messages. But we want the greeting reply to be
* sent first, so we currently queue it on the send queue before
* enabling messages. That means that a lot of errors that happen after
* the reply can't be sent to the client. They'll just see a disconnect
* and won't know what's happened. This all needs to be refactored.
*/
static int server_greeting(struct super_block *sb,
struct scoutfs_net_connection *conn,
@@ -1098,10 +1139,11 @@ static int server_greeting(struct super_block *sb,
bool first_contact;
bool farewell;
int ret = 0;
int err;
if (arg_len != sizeof(struct scoutfs_net_greeting)) {
ret = -EINVAL;
goto out;
goto send_err;
}
if (gr->fsid != super->hdr.fsid) {
@@ -1109,7 +1151,7 @@ static int server_greeting(struct super_block *sb,
le64_to_cpu(gr->fsid),
le64_to_cpu(super->hdr.fsid));
ret = -EINVAL;
goto out;
goto send_err;
}
if (gr->format_hash != super->format_hash) {
@@ -1117,7 +1159,7 @@ static int server_greeting(struct super_block *sb,
le64_to_cpu(gr->format_hash),
le64_to_cpu(super->format_hash));
ret = -EINVAL;
goto out;
goto send_err;
}
if (gr->node_id == 0) {
@@ -1131,35 +1173,58 @@ static int server_greeting(struct super_block *sb,
queue_commit_work(server, &cw);
up_read(&server->commit_rwsem);
ret = wait_for_commit(&cw);
if (ret) {
node_id = 0;
goto out;
}
} else {
node_id = gr->node_id;
}
send_err:
err = ret;
if (err)
node_id = 0;
greet.fsid = super->hdr.fsid;
greet.format_hash = super->format_hash;
greet.server_term = cpu_to_le64(server->term);
greet.node_id = node_id;
greet.flags = 0;
out:
ret = scoutfs_net_response(sb, conn, cmd, id, ret,
&greet, sizeof(greet));
if (node_id != 0 && ret == 0) {
sent_node_id = gr->node_id != 0;
first_contact = le64_to_cpu(gr->server_term) != server->term;
if (gr->flags & cpu_to_le64(SCOUTFS_NET_GREETING_FLAG_FAREWELL))
farewell = true;
else
farewell = false;
scoutfs_net_server_greeting(sb, conn, le64_to_cpu(node_id), id,
sent_node_id, first_contact,
farewell);
/* queue greeting response to be sent first once messaging enabled */
ret = scoutfs_net_response(sb, conn, cmd, id, err,
&greet, sizeof(greet));
if (ret == 0 && err)
ret = err;
if (ret)
goto out;
/* have the net core enable messaging and resend */
sent_node_id = gr->node_id != 0;
first_contact = le64_to_cpu(gr->server_term) != server->term;
if (gr->flags & cpu_to_le64(SCOUTFS_NET_GREETING_FLAG_FAREWELL))
farewell = true;
else
farewell = false;
scoutfs_net_server_greeting(sb, conn, le64_to_cpu(node_id), id,
sent_node_id, first_contact, farewell);
/* lock server might send recovery request */
if (le64_to_cpu(gr->server_term) != server->term) {
/* we're now doing two commits per greeting, not great */
down_read(&server->commit_rwsem);
ret = scoutfs_lock_server_greeting(sb, le64_to_cpu(node_id),
gr->server_term != 0);
if (ret == 0)
queue_commit_work(server, &cw);
up_read(&server->commit_rwsem);
if (ret == 0)
ret = wait_for_commit(&cw);
if (ret)
goto out;
}
out:
return ret;
}
@@ -1178,12 +1243,25 @@ static int server_farewell(struct super_block *sb,
struct scoutfs_net_connection *conn,
u8 cmd, u64 id, void *arg, u16 arg_len)
{
struct server_info *server = SCOUTFS_SB(sb)->server_info;
u64 node_id = scoutfs_net_client_node_id(conn);
struct commit_waiter cw;
int ret;
if (arg_len != 0)
return -EINVAL;
scoutfs_net_server_farewell(sb, conn);
return scoutfs_net_response(sb, conn, cmd, id, 0, NULL, 0);
down_read(&server->commit_rwsem);
ret = scoutfs_lock_server_farewell(sb, node_id);
if (ret == 0)
queue_commit_work(server, &cw);
up_read(&server->commit_rwsem);
if (ret == 0)
ret = wait_for_commit(&cw);
return scoutfs_net_response(sb, conn, cmd, id, ret, NULL, 0);
}
/* requests sent to clients are tracked so we can free resources */

View File

@@ -68,6 +68,8 @@ int scoutfs_server_lock_request(struct super_block *sb, u64 node_id,
struct scoutfs_net_lock *nl);
int scoutfs_server_lock_response(struct super_block *sb, u64 node_id,
u64 id, struct scoutfs_net_lock *nl);
int scoutfs_server_lock_recover_request(struct super_block *sb, u64 node_id,
struct scoutfs_key *key);
struct sockaddr_in;
int scoutfs_server_start(struct super_block *sb, struct sockaddr_in *sin,