/*
 * Copyright (C) 2019 Versity Software, Inc.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 */
/* kernel headers; the exact set named here is assumed */
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/slab.h>

#include "format.h"
#include "counters.h"
#include "net.h"
#include "tseq.h"
#include "spbm.h"
#include "btree.h"
#include "msg.h"
#include "scoutfs_trace.h"
#include "lock_server.h"

/*
 * The scoutfs server implements a simple lock service.  Client mounts
 * request access to locks identified by a key.  The server ensures that
 * access mode exclusion is properly enforced.
 *
 * The server processing paths are implemented in network message
 * receive processing callbacks.  We're receiving either a grant request
 * or an invalidation response.  These processing callbacks are fully
 * concurrent.  Our grant responses and invalidation requests are sent
 * from these contexts.
 *
 * We separate the locking of the global index of tracked locks from the
 * locking of a lock's state.  This allows concurrent work on unrelated
 * locks and lets processing block sending responses to unresponsive
 * clients without affecting other locks.
 *
 * Correctness of the protocol relies on the client and server each only
 * sending one request at a time for a given lock.  The server won't
 * process a request from a client until its outstanding invalidation
 * requests for the lock to other clients have been completed.  The
 * server specifies both the old mode and new mode when sending messages
 * to the client.  This lets the client resolve possible reordering when
 * processing incoming grant responses and invalidation requests.  The
 * server doesn't use the modes specified by the clients but they're
 * provided to add context.
 *
 * The server relies on the node_id allocation and reliable messaging
 * layers of the system.  Each client has a node_id that is unique for
 * its lifetime.  Message requests and responses are reliably delivered
 * in order across reconnection.
 *
 * The server maintains a persistent record of connected clients.  A new
 * server instance discovers these and waits for previously connected
 * clients to reconnect and recover their state before proceeding.  If
 * clients don't reconnect they are forcefully prevented from unsafely
 * accessing the shared persistent storage (fenced, according to the
 * rules of the platform; this could range from being powered off to
 * having their switch port disabled to having their local block device
 * set read-only).
 *
 * The lock server doesn't respond to memory pressure.  The only way
 * locks are freed is if they are invalidated to null on behalf of a
 * conflicting request, clients specifically request a null mode, or the
 * server shuts down.
 */

#define LOCK_SERVER_RECOVERY_MS (10 * MSEC_PER_SEC)

struct lock_server_info {
	struct super_block *sb;
	spinlock_t lock;
	struct mutex mutex;
	struct rb_root locks_root;
	struct scoutfs_spbm recovery_pending;
	struct delayed_work recovery_dwork;
	struct scoutfs_tseq_tree tseq_tree;
	struct dentry *tseq_dentry;
};

#define DECLARE_LOCK_SERVER_INFO(sb, name) \
	struct lock_server_info *name = SCOUTFS_SB(sb)->lock_server_info
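
/*
 * Locking, briefly: inf->lock (a spinlock) protects the rbtree of
 * tracked locks and the recovery_pending bitmap, inf->mutex serializes
 * access to the persistent client records, and each server lock's
 * mutex protects its lists and is held while sending messages for it.
 */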
/*
 * The state of a lock on the server is a function of the state of the
 * locks on all clients, that is, of the messages that have been sent to
 * and received from clients on behalf of a given lock.  Client entries
 * sit on the granted list while their mode is held, wait on the
 * requested list until they can be granted, and move to the invalidated
 * list when a conflicting request triggers invalidation of a previous
 * grant.
 *
 * While the invalidated list has entries, which means invalidation
 * messages are still in flight, no more requests will be processed.
 */
struct server_lock_node {
	atomic_t refcount;
	struct mutex mutex;
	struct rb_node node;
	struct scoutfs_key key;
	struct list_head granted;
	struct list_head requested;
	struct list_head invalidated;
};

enum {
	CLE_GRANTED,
	CLE_REQUESTED,
	CLE_INVALIDATED,
};

/*
 * Interactions with the client are tracked with these little mode
 * wrappers.
 *
 * @head: The client mode's entry on one of the server lock lists
 * indicating that the mode is actively granted, a pending request from
 * the client, or a pending invalidation sent to the client.
 *
 * @node_id: The client's node_id used to send messages and tear down
 * state as clients exit.
 *
 * @net_id: The id of the client's request, used to send grant
 * responses.  For invalidation requests sent to clients it's the id
 * that could be used to cancel the message.
 *
 * @mode: the mode that is granted to the client, that the client
 * requested, or that the server is asserting with a pending
 * invalidation request message.
 */
struct client_lock_entry {
	struct list_head head;
	u64 node_id;
	u64 net_id;
	u8 mode;
	struct server_lock_node *snode;
	struct scoutfs_tseq_entry tseq_entry;
	u8 on_list;
};

enum {
	OL_GRANTED = 0,
	OL_REQUESTED,
	OL_INVALIDATED,
};

/*
 * Put an entry on a server lock's list while being careful to move or
 * add the list head and while maintaining debugging info.
 */
static void add_client_entry(struct server_lock_node *snode,
			     struct list_head *list,
			     struct client_lock_entry *clent)
{
	WARN_ON_ONCE(!mutex_is_locked(&snode->mutex));

	if (list_empty(&clent->head))
		list_add_tail(&clent->head, list);
	else
		list_move_tail(&clent->head, list);

	clent->on_list = list == &snode->granted ? OL_GRANTED :
			 list == &snode->requested ? OL_REQUESTED :
						     OL_INVALIDATED;
}

static void free_client_entry(struct lock_server_info *inf,
			      struct server_lock_node *snode,
			      struct client_lock_entry *clent)
{
	WARN_ON_ONCE(!mutex_is_locked(&snode->mutex));

	if (!list_empty(&clent->head))
		list_del_init(&clent->head);
	scoutfs_tseq_del(&inf->tseq_tree, &clent->tseq_entry);
	kfree(clent);
}

static bool invalid_mode(u8 mode)
{
	return mode >= SCOUTFS_LOCK_INVALID;
}

/*
 * Return the mode that we should invalidate a granted lock down to
 * given an incompatible requested mode.  Usually we completely
 * invalidate the items because incompatible requests have to be writers
 * and our cache will then be stale, but the single exception is
 * invalidating down to a read lock having held a write lock because the
 * cache is still valid for reads after being written out.
 */
static u8 invalidation_mode(u8 granted, u8 requested)
{
	if (granted == SCOUTFS_LOCK_WRITE && requested == SCOUTFS_LOCK_READ)
		return SCOUTFS_LOCK_READ;

	return SCOUTFS_LOCK_NULL;
}

/*
 * Return true if the client lock instances described by the entries can
 * be granted at the same time.  Typically this only means they're both
 * modes that are compatible between nodes.  In addition there's the
 * special case where a read lock on a client is compatible with a write
 * lock on the same client because the client's cache covered by the
 * read lock is still valid if they get a write lock.
 */
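
/*
 * For reference, the compatible combinations are: read with read,
 * write_only with write_only, and a read grant with a write request
 * from the same node.  Everything else conflicts.
 */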
static bool client_entries_compatible(struct client_lock_entry *granted,
				      struct client_lock_entry *requested)
{
	return (granted->mode == requested->mode &&
		(granted->mode == SCOUTFS_LOCK_READ ||
		 granted->mode == SCOUTFS_LOCK_WRITE_ONLY)) ||
	       (granted->node_id == requested->node_id &&
		granted->mode == SCOUTFS_LOCK_READ &&
		requested->mode == SCOUTFS_LOCK_WRITE);
}

/*
 * Get a locked server lock, possibly inserting the caller's allocated
 * lock if we don't find one for the given key.  The server lock's mutex
 * is held on return and the caller must put the lock when they're done.
 */
static struct server_lock_node *get_server_lock(struct lock_server_info *inf,
						struct scoutfs_key *key,
						struct server_lock_node *ins,
						bool or_next)
{
	struct rb_root *root = &inf->locks_root;
	struct server_lock_node *ret = NULL;
	struct server_lock_node *next = NULL;
	struct server_lock_node *snode;
	struct rb_node *parent = NULL;
	struct rb_node **node;
	int cmp;

	spin_lock(&inf->lock);

	node = &root->rb_node;
	while (*node) {
		parent = *node;
		snode = container_of(*node, struct server_lock_node, node);

		cmp = scoutfs_key_compare(key, &snode->key);
		if (cmp < 0) {
			if (or_next)
				next = snode;
			node = &(*node)->rb_left;
		} else if (cmp > 0) {
			node = &(*node)->rb_right;
		} else {
			ret = snode;
			break;
		}
	}

	if (ret == NULL && ins) {
		rb_link_node(&ins->node, parent, node);
		rb_insert_color(&ins->node, root);
		ret = ins;
	}

	if (ret == NULL && or_next && next)
		ret = next;

	if (ret)
		atomic_inc(&ret->refcount);

	spin_unlock(&inf->lock);

	if (ret)
		mutex_lock(&ret->mutex);

	return ret;
}

/* Get a server lock node, allocating if one doesn't exist.  Caller must put. */
static struct server_lock_node *alloc_server_lock(struct lock_server_info *inf,
						  struct scoutfs_key *key)
{
	struct server_lock_node *snode;
	struct server_lock_node *ins;

	snode = get_server_lock(inf, key, NULL, false);
	if (snode == NULL) {
		ins = kzalloc(sizeof(struct server_lock_node), GFP_NOFS);
		if (ins) {
			atomic_set(&ins->refcount, 0);
			mutex_init(&ins->mutex);
			ins->key = *key;
			INIT_LIST_HEAD(&ins->granted);
			INIT_LIST_HEAD(&ins->requested);
			INIT_LIST_HEAD(&ins->invalidated);

			snode = get_server_lock(inf, key, ins, false);
			if (snode != ins)
				kfree(ins);
		}
	}

	return snode;
}

/*
 * Finish with a server lock which has the mutex held, freeing it if
 * it's empty and unused.
 */
static void put_server_lock(struct lock_server_info *inf,
			    struct server_lock_node *snode)
{
	bool should_free = false;

	BUG_ON(!mutex_is_locked(&snode->mutex));

	if (atomic_dec_and_test(&snode->refcount) &&
	    list_empty(&snode->granted) && list_empty(&snode->requested) &&
	    list_empty(&snode->invalidated)) {
		spin_lock(&inf->lock);
		rb_erase(&snode->node, &inf->locks_root);
		spin_unlock(&inf->lock);
		should_free = true;
	}

	mutex_unlock(&snode->mutex);

	if (should_free)
		kfree(snode);
}

static struct client_lock_entry *find_entry(struct server_lock_node *snode,
					    struct list_head *list,
					    u64 node_id)
{
	struct client_lock_entry *clent;

	WARN_ON_ONCE(!mutex_is_locked(&snode->mutex));

	list_for_each_entry(clent, list, head) {
		if (clent->node_id == node_id)
			return clent;
	}

	return NULL;
}

static int process_waiting_requests(struct super_block *sb,
				    struct server_lock_node *snode);

/*
 * The server is receiving an incoming request from a client.  We queue
 * it on the lock and process it.
 *
 * XXX shut down if we get enomem?
 */
int scoutfs_lock_server_request(struct super_block *sb, u64 node_id,
				u64 net_id, struct scoutfs_net_lock *nl)
{
	DECLARE_LOCK_SERVER_INFO(sb, inf);
	struct client_lock_entry *clent;
	struct server_lock_node *snode;
	int ret;

	trace_scoutfs_lock_message(sb, SLT_SERVER, SLT_GRANT, SLT_REQUEST,
				   node_id, net_id, nl);

	if (invalid_mode(nl->old_mode) || invalid_mode(nl->new_mode)) {
		ret = -EINVAL;
		goto out;
	}

	clent = kzalloc(sizeof(struct client_lock_entry), GFP_NOFS);
	if (!clent) {
		ret = -ENOMEM;
		goto out;
	}
	INIT_LIST_HEAD(&clent->head);
	clent->node_id = node_id;
	clent->net_id = net_id;
	clent->mode = nl->new_mode;

	snode = alloc_server_lock(inf, &nl->key);
	if (snode == NULL) {
		kfree(clent);
		ret = -ENOMEM;
		goto out;
	}

	clent->snode = snode;
	add_client_entry(snode, &snode->requested, clent);
	scoutfs_tseq_add(&inf->tseq_tree, &clent->tseq_entry);

	ret = process_waiting_requests(sb, snode);
out:
	return ret;
}

/*
 * The server is receiving an invalidation response from the client.
 * Find the client's entry on the server lock's invalidation list and
 * free it so that request processing might be able to make forward
 * progress.
 *
 * XXX what to do with errors?  kick the client?
 */
int scoutfs_lock_server_response(struct super_block *sb, u64 node_id,
				 struct scoutfs_net_lock *nl)
{
	DECLARE_LOCK_SERVER_INFO(sb, inf);
	struct client_lock_entry *clent;
	struct server_lock_node *snode;
	int ret;

	trace_scoutfs_lock_message(sb, SLT_SERVER, SLT_INVALIDATE,
				   SLT_RESPONSE, node_id, 0, nl);

	if (invalid_mode(nl->old_mode) || invalid_mode(nl->new_mode)) {
		ret = -EINVAL;
		goto out;
	}

	/* XXX should always have a server lock here?  recovery? */
	snode = get_server_lock(inf, &nl->key, NULL, false);
	if (!snode) {
		ret = -EINVAL;
		goto out;
	}

	clent = find_entry(snode, &snode->invalidated, node_id);
	if (!clent) {
		put_server_lock(inf, snode);
		ret = -EINVAL;
		goto out;
	}

	if (nl->new_mode == SCOUTFS_LOCK_NULL) {
		free_client_entry(inf, snode, clent);
	} else {
		clent->mode = nl->new_mode;
		add_client_entry(snode, &snode->granted, clent);
	}

	ret = process_waiting_requests(sb, snode);
out:
	return ret;
}

/*
 * Make forward progress on a lock by checking each waiting request in
 * the order that they were received.  If the next request is compatible
 * with all the clients' grants then the request is granted and a
 * response is sent.
 *
 * Invalidation requests are sent for every client grant that is
 * incompatible with the next request.  We won't process the next
 * request again until we receive all the invalidation responses.  Once
 * they're all received then the request can be processed and will be
 * compatible with the remaining grants.
 *
 * This is called with the snode mutex held.  This can free the snode if
 * it's empty.  The caller can't reference the snode once this returns
 * so we unlock the snode mutex.
 *
 * All progress must wait for all clients to finish with recovery
 * because we don't know which locks they'll hold.  The unlocked
 * recovery_pending test here is OK.  It's filled by setup before
 * anything runs.  It's emptied by recovery completion.  We can get a
 * false nonempty result if we race with recovery completion, but that's
 * OK because recovery completion processes all the locks that have
 * requests after emptying, including the unlikely loser of that race.
 */
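
/*
 * In short, each client entry moves through the lists like this:
 *
 *   requested --(compatible with all grants)--> granted, grant sent
 *   granted ----(conflicts with a request)----> invalidated, inval sent
 *   invalidated --(response with null mode)---> freed
 *   invalidated --(response with a mode)------> granted at that mode
 */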
static int process_waiting_requests(struct super_block *sb,
				    struct server_lock_node *snode)
{
	DECLARE_LOCK_SERVER_INFO(sb, inf);
	struct scoutfs_net_lock nl;
	struct client_lock_entry *req;
	struct client_lock_entry *req_tmp;
	struct client_lock_entry *gr;
	struct client_lock_entry *gr_tmp;
	int ret;

	BUG_ON(!mutex_is_locked(&snode->mutex));

	/* processing waits for all invalidation responses or recovery */
	if (!list_empty(&snode->invalidated) ||
	    !scoutfs_spbm_empty(&inf->recovery_pending)) {
		ret = 0;
		goto out;
	}

	/* walk through pending requests in order received */
	list_for_each_entry_safe(req, req_tmp, &snode->requested, head) {

		/* send invalidation to any incompatible grants */
		list_for_each_entry_safe(gr, gr_tmp, &snode->granted, head) {
			if (client_entries_compatible(gr, req))
				continue;

			nl.key = snode->key;
			nl.old_mode = gr->mode;
			nl.new_mode = invalidation_mode(gr->mode, req->mode);

			ret = scoutfs_server_lock_request(sb, gr->node_id,
							  &nl);
			if (ret)
				goto out;

			trace_scoutfs_lock_message(sb, SLT_SERVER,
						   SLT_INVALIDATE, SLT_REQUEST,
						   gr->node_id, 0, &nl);

			add_client_entry(snode, &snode->invalidated, gr);
		}

		/* wait for any newly sent invalidations */
		if (!list_empty(&snode->invalidated))
			break;

		nl.key = snode->key;
		nl.new_mode = req->mode;

		/* see if there's an existing compatible grant to replace */
		gr = find_entry(snode, &snode->granted, req->node_id);
		if (gr) {
			nl.old_mode = gr->mode;
			free_client_entry(inf, snode, gr);
		} else {
			nl.old_mode = SCOUTFS_LOCK_NULL;
		}

		ret = scoutfs_server_lock_response(sb, req->node_id,
						   req->net_id, &nl);
		if (ret)
			goto out;

		trace_scoutfs_lock_message(sb, SLT_SERVER, SLT_GRANT,
					   SLT_RESPONSE, req->node_id,
					   req->net_id, &nl);

		/* don't track null client locks, track all else */
		if (req->mode == SCOUTFS_LOCK_NULL)
			free_client_entry(inf, snode, req);
		else
			add_client_entry(snode, &snode->granted, req);
	}

	ret = 0;
out:
	put_server_lock(inf, snode);
	return ret;
}

/*
 * The server received a greeting from a client for the first time.  If
 * the client had already talked to the server then we must find an
 * existing record for it and should begin recovery.  If it doesn't have
 * a record then it's timed out and we can't allow it to reconnect.  If
 * it's connecting for the first time then we insert a new record.
 *
 * This is running in concurrent client greeting processing contexts.
 */
int scoutfs_lock_server_greeting(struct super_block *sb, u64 node_id,
				 bool should_exist)
{
	DECLARE_LOCK_SERVER_INFO(sb, inf);
	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
	struct scoutfs_lock_client_btree_key cbk;
	SCOUTFS_BTREE_ITEM_REF(iref);
	struct scoutfs_key key;
	int ret;

	cbk.node_id = cpu_to_be64(node_id);

	mutex_lock(&inf->mutex);
	if (should_exist) {
		ret = scoutfs_btree_lookup(sb, &super->lock_clients,
					   &cbk, sizeof(cbk), &iref);
		if (ret == 0)
			scoutfs_btree_put_iref(&iref);
	} else {
		ret = scoutfs_btree_insert(sb, &super->lock_clients,
					   &cbk, sizeof(cbk), NULL, 0);
	}
	mutex_unlock(&inf->mutex);

	if (should_exist && ret == 0) {
		scoutfs_key_set_zeros(&key);
		ret = scoutfs_server_lock_recover_request(sb, node_id, &key);
		if (ret)
			goto out;
	}
out:
	return ret;
}

/*
 * A client sent their last recovery response and can exit recovery.  If
 * they were the last client in recovery then we can process all the
 * server locks that had requests.
 */
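
/*
 * The cancel argument is true when we're called from a client's final
 * recovery response so that the recovery timeout work is canceled; the
 * timeout work itself calls with cancel false because it can't cancel
 * and wait for itself.
 */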
static int finished_recovery(struct super_block *sb, u64 node_id, bool cancel)
{
	DECLARE_LOCK_SERVER_INFO(sb, inf);
	struct server_lock_node *snode;
	struct scoutfs_key key;
	bool still_pending;
	int ret = 0;

	spin_lock(&inf->lock);
	scoutfs_spbm_clear(&inf->recovery_pending, node_id);
	still_pending = !scoutfs_spbm_empty(&inf->recovery_pending);
	spin_unlock(&inf->lock);

	if (still_pending)
		return 0;

	if (cancel)
		cancel_delayed_work_sync(&inf->recovery_dwork);

	scoutfs_key_set_zeros(&key);
	while ((snode = get_server_lock(inf, &key, NULL, true))) {
		key = snode->key;
		scoutfs_key_inc(&key);

		if (!list_empty(&snode->requested)) {
			ret = process_waiting_requests(sb, snode);
			if (ret)
				break;
		} else {
			put_server_lock(inf, snode);
		}
	}

	return ret;
}

/*
 * We sent a lock recover request to the client when we received its
 * greeting while in recovery.  Here we instantiate all the locks it
 * gave us in response and send another request from the next key.
 * We're done once we receive an empty response.
 */
int scoutfs_lock_server_recover_response(struct super_block *sb, u64 node_id,
					 struct scoutfs_net_lock_recover *nlr)
{
	DECLARE_LOCK_SERVER_INFO(sb, inf);
	struct client_lock_entry *existing;
	struct client_lock_entry *clent;
	struct server_lock_node *snode;
	struct scoutfs_key key;
	int ret = 0;
	int i;

	/* client must be in recovery */
	spin_lock(&inf->lock);
	if (!scoutfs_spbm_test(&inf->recovery_pending, node_id))
		ret = -EINVAL;
	spin_unlock(&inf->lock);
	if (ret)
		goto out;

	/* client has sent us all their locks */
	if (nlr->nr == 0) {
		ret = finished_recovery(sb, node_id, true);
		goto out;
	}

	for (i = 0; i < le16_to_cpu(nlr->nr); i++) {
		clent = kzalloc(sizeof(struct client_lock_entry), GFP_NOFS);
		if (!clent) {
			ret = -ENOMEM;
			goto out;
		}
		INIT_LIST_HEAD(&clent->head);
		clent->node_id = node_id;
		clent->net_id = 0;
		clent->mode = nlr->locks[i].new_mode;

		snode = alloc_server_lock(inf, &nlr->locks[i].key);
		if (snode == NULL) {
			kfree(clent);
			ret = -ENOMEM;
			goto out;
		}

		existing = find_entry(snode, &snode->granted, node_id);
		if (existing) {
			kfree(clent);
			put_server_lock(inf, snode);
			ret = -EEXIST;
			goto out;
		}

		clent->snode = snode;
		add_client_entry(snode, &snode->granted, clent);
		scoutfs_tseq_add(&inf->tseq_tree, &clent->tseq_entry);

		put_server_lock(inf, snode);
	}

	/* send request for next batch of keys */
	key = nlr->locks[le16_to_cpu(nlr->nr) - 1].key;
	scoutfs_key_inc(&key);
	ret = scoutfs_server_lock_recover_request(sb, node_id, &key);
out:
	return ret;
}

static int node_id_and_put_iref(struct scoutfs_btree_item_ref *iref,
				u64 *node_id)
{
	struct scoutfs_lock_client_btree_key *cbk;
	int ret;

	if (iref->key_len == sizeof(*cbk) && iref->val_len == 0) {
		cbk = iref->key;
		*node_id = be64_to_cpu(cbk->node_id);
		ret = 0;
	} else {
		ret = -EIO;
	}

	scoutfs_btree_put_iref(iref);
	return ret;
}

/*
 * This work executes if enough time passes without all of the clients
 * finishing with recovery and canceling the work.  We walk through the
 * client records and find any that still have their recovery pending.
 */
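
/*
 * Removing a timed out client's record also means a later greeting from
 * it with should_exist set will fail its lookup, so a client that
 * missed recovery can't quietly reconnect.
 */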
static void scoutfs_lock_server_recovery_timeout(struct work_struct *work)
{
	struct lock_server_info *inf = container_of(work,
						    struct lock_server_info,
						    recovery_dwork.work);
	struct super_block *sb = inf->sb;
	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
	struct scoutfs_lock_client_btree_key cbk;
	SCOUTFS_BTREE_ITEM_REF(iref);
	bool timed_out;
	u64 node_id;
	int ret;

	/* we enter recovery if there are any client records */
	for (node_id = 0; ; node_id++) {
		cbk.node_id = cpu_to_be64(node_id);

		ret = scoutfs_btree_next(sb, &super->lock_clients,
					 &cbk, sizeof(cbk), &iref);
		if (ret == -ENOENT) {
			ret = 0;
			break;
		}
		if (ret == 0)
			ret = node_id_and_put_iref(&iref, &node_id);
		if (ret < 0)
			break;

		spin_lock(&inf->lock);
		if (scoutfs_spbm_test(&inf->recovery_pending, node_id)) {
			scoutfs_spbm_clear(&inf->recovery_pending, node_id);
			timed_out = true;
		} else {
			timed_out = false;
		}
		spin_unlock(&inf->lock);

		if (!timed_out)
			continue;

		scoutfs_err(sb, "client node_id %llu lock recovery timed out",
			    node_id);

		/* XXX these aren't immediately committed */
		cbk.node_id = cpu_to_be64(node_id);
		ret = scoutfs_btree_delete(sb, &super->lock_clients,
					   &cbk, sizeof(cbk));
		if (ret)
			break;
	}

	/* force processing all pending lock requests */
	if (ret == 0)
		ret = finished_recovery(sb, 0, false);

	if (ret < 0) {
		scoutfs_err(sb, "lock server saw err %d while timing out clients, shutting down",
			    ret);
		scoutfs_server_abort(sb);
	}
}

/*
 * A client is leaving the lock service.  They aren't using locks and
 * won't send any more requests.  We tear down all the state we had for
 * them.  This can be called multiple times for a given client as their
 * farewell is resent to new servers.  It's OK to not find any state.
 * If we fail to delete a persistent entry then we have to shut down and
 * hope that the next server has more luck.
 */
int scoutfs_lock_server_farewell(struct super_block *sb, u64 node_id)
{
	DECLARE_LOCK_SERVER_INFO(sb, inf);
	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
	struct scoutfs_lock_client_btree_key cli;
	struct client_lock_entry *clent;
	struct client_lock_entry *tmp;
	struct server_lock_node *snode;
	struct scoutfs_key key;
	struct list_head *list;
	bool freed;
	int ret = 0;

	cli.node_id = cpu_to_be64(node_id);

	mutex_lock(&inf->mutex);
	ret = scoutfs_btree_delete(sb, &super->lock_clients,
				   &cli, sizeof(cli));
	mutex_unlock(&inf->mutex);
	if (ret == -ENOENT) {
		ret = 0;
		goto out;
	}
	if (ret < 0)
		goto out;

	scoutfs_key_set_zeros(&key);
	while ((snode = get_server_lock(inf, &key, NULL, true))) {
		freed = false;
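
		/*
		 * Step through all three lists in turn: the ternary
		 * chain advances the list pointer from granted to
		 * requested to invalidated and then to NULL to stop.
		 */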
		for (list = &snode->granted;
		     list != NULL;
		     list = (list == &snode->granted) ? &snode->requested :
			    (list == &snode->requested) ? &snode->invalidated :
							  NULL) {
			list_for_each_entry_safe(clent, tmp, list, head) {
				if (clent->node_id == node_id) {
					free_client_entry(inf, snode, clent);
					freed = true;
				}
			}
		}

		key = snode->key;
		scoutfs_key_inc(&key);

		if (freed) {
			ret = process_waiting_requests(sb, snode);
			if (ret)
				goto out;
		} else {
			put_server_lock(inf, snode);
		}
	}

	ret = 0;
out:
	if (ret < 0) {
		scoutfs_err(sb, "lock server err %d during node %llu farewell, shutting down",
			    ret, node_id);
		scoutfs_server_abort(sb);
	}
	return ret;
}

static char *lock_mode_string(u8 mode)
{
	static char *mode_strings[] = {
		[SCOUTFS_LOCK_NULL]		= "null",
		[SCOUTFS_LOCK_READ]		= "read",
		[SCOUTFS_LOCK_WRITE]		= "write",
		[SCOUTFS_LOCK_WRITE_ONLY]	= "write_only",
	};

	if (mode < ARRAY_SIZE(mode_strings) && mode_strings[mode])
		return mode_strings[mode];

	return "unknown";
}

static char *lock_on_list_string(u8 on_list)
{
	static char *on_list_strings[] = {
		[OL_GRANTED]		= "granted",
		[OL_REQUESTED]		= "requested",
		[OL_INVALIDATED]	= "invalidated",
	};

	if (on_list < ARRAY_SIZE(on_list_strings) && on_list_strings[on_list])
		return on_list_strings[on_list];

	return "unknown";
}

static void lock_server_tseq_show(struct seq_file *m,
				  struct scoutfs_tseq_entry *ent)
{
	struct client_lock_entry *clent = container_of(ent,
						struct client_lock_entry,
						tseq_entry);
	struct server_lock_node *snode = clent->snode;

	seq_printf(m, SK_FMT" %s %s node_id %llu net_id %llu\n",
		   SK_ARG(&snode->key),
		   lock_mode_string(clent->mode),
		   lock_on_list_string(clent->on_list),
		   clent->node_id, clent->net_id);
}

/*
 * Setup the lock server.  This is called before networking can deliver
 * requests.  If we find existing client records then we enter recovery.
 * Lock request processing is deferred until recovery is resolved for
 * all the existing clients, either they reconnect and replay locks or
 * we time them out.
 */
int scoutfs_lock_server_setup(struct super_block *sb)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
	struct lock_server_info *inf;
	SCOUTFS_BTREE_ITEM_REF(iref);
	struct scoutfs_lock_client_btree_key cbk;
	unsigned int nr;
	u64 node_id;
	int ret;

	inf = kzalloc(sizeof(struct lock_server_info), GFP_KERNEL);
	if (!inf)
		return -ENOMEM;

	inf->sb = sb;
	spin_lock_init(&inf->lock);
	mutex_init(&inf->mutex);
	inf->locks_root = RB_ROOT;
	scoutfs_spbm_init(&inf->recovery_pending);
	INIT_DELAYED_WORK(&inf->recovery_dwork,
			  scoutfs_lock_server_recovery_timeout);
	scoutfs_tseq_tree_init(&inf->tseq_tree, lock_server_tseq_show);

	inf->tseq_dentry = scoutfs_tseq_create("server_locks", sbi->debug_root,
					       &inf->tseq_tree);
	if (!inf->tseq_dentry) {
		kfree(inf);
		return -ENOMEM;
	}

	sbi->lock_server_info = inf;

	/* we enter recovery if there are any client records */
	nr = 0;
	for (node_id = 0; ; node_id++) {
		cbk.node_id = cpu_to_be64(node_id);

		ret = scoutfs_btree_next(sb, &super->lock_clients,
					 &cbk, sizeof(cbk), &iref);
		if (ret == -ENOENT)
			break;
		if (ret == 0)
			ret = node_id_and_put_iref(&iref, &node_id);
		if (ret < 0)
			goto out;

		ret = scoutfs_spbm_set(&inf->recovery_pending, node_id);
		if (ret)
			goto out;
		nr++;

		if (node_id == U64_MAX)
			break;
	}

	ret = 0;
	if (nr) {
		schedule_delayed_work(&inf->recovery_dwork,
				msecs_to_jiffies(LOCK_SERVER_RECOVERY_MS));
		scoutfs_warn(sb, "waiting for %u lock clients to connect", nr);
	}
out:
	return ret;
}

/*
 * The server will have shut down networking before stopping us so we
 * don't have to worry about message processing calls while we free.
 */
void scoutfs_lock_server_destroy(struct super_block *sb)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	DECLARE_LOCK_SERVER_INFO(sb, inf);
	struct server_lock_node *snode;
	struct server_lock_node *stmp;
	struct client_lock_entry *clent;
	struct client_lock_entry *ctmp;
	LIST_HEAD(list);

	if (inf) {
		cancel_delayed_work_sync(&inf->recovery_dwork);
		debugfs_remove(inf->tseq_dentry);

		rbtree_postorder_for_each_entry_safe(snode, stmp,
						     &inf->locks_root, node) {
			list_splice_init(&snode->granted, &list);
			list_splice_init(&snode->requested, &list);
			list_splice_init(&snode->invalidated, &list);

			mutex_lock(&snode->mutex);
			list_for_each_entry_safe(clent, ctmp, &list, head) {
				free_client_entry(inf, snode, clent);
			}
			mutex_unlock(&snode->mutex);

			kfree(snode);
		}

		scoutfs_spbm_destroy(&inf->recovery_pending);
		kfree(inf);
		sbi->lock_server_info = NULL;
	}
}