scoutfs: add unmount barrier

Now that a mount's client is responsible for electing and starting a
server we need to be careful about coordinating unmount.  We can't
let unmounting clients leave the remaining mounted clients without
quorum.

The server carefully tracks who is mounted and who is unmounting while
it is processing farewell requests.  It only sends responses to voting
mounts while quorum remains or once all of the voting clients are
trying to unmount.

We use a field in the quorum blocks to communicate to the final set of
unmounting voters that their farewells have been processed and that they
can finish unmounting without trying to reestablish quorum.

The commit introduces and maintains the unmount_barrier field in the
quorum blocks.  It is passed to the server from the election, the
server sends it to the client and writes new versions, and the client
compares what it received with what it sees in quorum blocks.

The commit then has the clients send their unique name to the server
who stores it in persistent mounted client records and compares the
names to the quorum config when deciding which farewell requests
can be responded to.

Now that farewell response processing can block for a very long time it
is moved off into async work so that it doesn't prevent net connections
from being shutdown and re-established.  This also makes it easier to
make global decisions based on the count of pending farewell requests.

Signed-off-by: Zach Brown <zab@versity.com>
This commit is contained in:
Zach Brown
2019-04-08 12:44:03 -07:00
committed by Zach Brown
parent fe63b566c9
commit 36b0df336b
11 changed files with 488 additions and 60 deletions

View File

@@ -350,6 +350,7 @@ static size_t super_root_offsets[] = {
offsetof(struct scoutfs_super_block, manifest.root),
offsetof(struct scoutfs_super_block, lock_clients),
offsetof(struct scoutfs_super_block, trans_seqs),
offsetof(struct scoutfs_super_block, mounted_clients),
};
#define for_each_super_root(super, i, root) \

View File

@@ -59,6 +59,7 @@ struct client_info {
u64 old_elected_nr;
u64 server_term;
u64 greeting_umb;
bool sending_farewell;
int farewell_error;
@@ -365,22 +366,27 @@ static int client_greeting(struct super_block *sb,
scoutfs_net_client_greeting(sb, conn, new_server);
client->server_term = le64_to_cpu(gr->server_term);
client->greeting_umb = le64_to_cpu(gr->unmount_barrier);
ret = 0;
out:
return ret;
}
/*
* If the previous election told us to start the server then stop it
* and wipe the old election info. If we're not fast enough to clear
* the election block then the next server might fence us. Should
* be very unlikely as election requires multiple RMW cycles.
* If the previous election told us to start the server then stop it and
* clear the indication that we were elected. We get the current
* version of the election info from the server because they might have
modified it while they were running.
*
* If we're not fast enough to clear the election from the quorum block
* then the next server might fence us. Should be very unlikely as
* election requires multiple RMW cycles.
*/
static void stop_our_server(struct super_block *sb,
struct scoutfs_quorum_elected_info *qei)
{
if (qei->run_server) {
scoutfs_server_stop(sb);
scoutfs_server_stop(sb, qei);
scoutfs_quorum_clear_elected(sb, qei);
memset(qei, 0, sizeof(*qei));
}
@@ -431,12 +437,22 @@ static void scoutfs_client_connect_worker(struct work_struct *work)
ret = scoutfs_quorum_election(sb, opts->uniq_name,
client->old_elected_nr,
timeout_abs, qei);
timeout_abs, client->sending_farewell,
client->greeting_umb, qei);
if (ret)
goto out;
/* we saw that the server wrote a new unmount barrier */
if (client->sending_farewell && qei->elected_nr == 0 &&
qei->unmount_barrier > client->greeting_umb) {
client->farewell_error = 0;
complete(&client->farewell_comp);
ret = 0;
goto out;
}
if (qei->run_server) {
ret = scoutfs_server_start(sb, &qei->sin, qei->elected_nr);
ret = scoutfs_server_start(sb, &qei->sin, qei->elected_nr, qei);
if (ret) {
/* forget that we tried to start the server */
memset(qei, 0, sizeof(*qei));
@@ -459,9 +475,11 @@ static void scoutfs_client_connect_worker(struct work_struct *work)
client->old_elected_nr = 0;
/* send a greeting to verify endpoints of each connection */
memcpy(greet.name, opts->uniq_name, sizeof(greet.name));
greet.fsid = super->hdr.fsid;
greet.format_hash = super->format_hash;
greet.server_term = cpu_to_le64(client->server_term);
greet.unmount_barrier = 0;
greet.node_id = cpu_to_le64(sbi->node_id);
greet.flags = 0;
if (client->sending_farewell)

View File

@@ -48,6 +48,8 @@
#define SCOUTFS_QUORUM_BLOCKS ((128ULL * 1024) >> SCOUTFS_BLOCK_SHIFT)
#define SCOUTFS_QUORUM_MAX_SLOTS SCOUTFS_QUORUM_BLOCKS
#define SCOUTFS_UNIQUE_NAME_MAX_BYTES 64 /* includes null */
/*
* Base types used by other structures.
*/
@@ -289,6 +291,18 @@ struct scoutfs_trans_seq_btree_key {
__be64 node_id;
} __packed;
/*
* The server keeps a persistent record of mounted clients.
*/
struct scoutfs_mounted_client_btree_key {
__be64 node_id;
} __packed;
struct scoutfs_mounted_client_btree_val {
__u8 name[SCOUTFS_UNIQUE_NAME_MAX_BYTES];
} __packed;
/*
* The max number of links defines the max number of entries that we can
* index in o(log n) and the static list head storage size in the
@@ -395,7 +409,6 @@ struct scoutfs_xattr {
#define member_sizeof(TYPE, MEMBER) (sizeof(((TYPE *)0)->MEMBER))
#define SCOUTFS_UUID_BYTES 16
#define SCOUTFS_UNIQUE_NAME_MAX_BYTES 64 /* includes null */
/*
* During each quorum voting interval the fabric has to process 2 reads
@@ -418,6 +431,7 @@ struct scoutfs_xattr {
* @config_gen: references the config gen in the super block
* @write_nr: incremented for every write, only 0 when never written
* @elected_nr: incremented when elected, 0 otherwise
* @unmount_barrier: incremented by servers when all members have unmounted
* @vote_slot: the active config slot that the writer is voting for
*/
struct scoutfs_quorum_block {
@@ -426,6 +440,7 @@ struct scoutfs_quorum_block {
__le64 config_gen;
__le64 write_nr;
__le64 elected_nr;
__le64 unmount_barrier;
__le32 crc;
__u8 vote_slot;
} __packed;
@@ -475,6 +490,7 @@ struct scoutfs_super_block {
struct scoutfs_quorum_config quorum_config;
struct scoutfs_btree_root lock_clients;
struct scoutfs_btree_root trans_seqs;
struct scoutfs_btree_root mounted_clients;
} __packed;
#define SCOUTFS_ROOT_INO 1
@@ -593,20 +609,30 @@ enum {
* Greetings verify identity of communicating nodes. The sender sends
* their credentials and the receiver verifies them.
*
* @name: The client sends its unique name to the server.
*
* @server_term: The raft term that elected the server. Initially 0
* from the client, sent by the server, then sent by the client as it
* tries to reconnect. Used to identify a client reconnecting to a
* server that has timed out its connection.
*
* @unmount_barrier: Incremented every time the remaining majority of
* quorum members all agree to leave. The server tells a quorum member
* the value that it's connecting under so that if the client sees the
* value increase in a quorum block it knows that the server has
* processed its farewell and can safely unmount.
*
* @node_id: The id of the client. Initially 0 from the client,
* assigned by the server, and sent by the client as it reconnects.
* Used by the server to identify reconnecting clients whose existing
* state must be dealt with.
*/
struct scoutfs_net_greeting {
__u8 name[SCOUTFS_UNIQUE_NAME_MAX_BYTES];
__le64 fsid;
__le64 format_hash;
__le64 server_term;
__le64 unmount_barrier;
__le64 node_id;
__le64 flags;
} __packed;

View File

@@ -800,7 +800,7 @@ static void scoutfs_lock_server_recovery_timeout(struct work_struct *work)
if (ret < 0) {
scoutfs_err(sb, "lock server saw err %d while timing out clients, shutting down", ret);
scoutfs_server_stop(sb);
scoutfs_server_abort(sb);
}
}
@@ -870,7 +870,7 @@ int scoutfs_lock_server_farewell(struct super_block *sb, u64 node_id)
out:
if (ret < 0) {
scoutfs_err(sb, "lock server err %d during node %llu farewell, shutting down", ret, node_id);
scoutfs_server_stop(sb);
scoutfs_server_abort(sb);
}
return ret;

View File

@@ -100,7 +100,7 @@ struct scoutfs_net_connection {
established:1, /* added sends queue send work */
shutting_down:1, /* shutdown work has been queued */
saw_greeting:1, /* saw greeting on this sock */
saw_farewell:1, /* saw farewell request from client */
saw_farewell:1, /* saw farewell response to client */
reconn_wait:1, /* shutdown, waiting for reconnect */
reconn_freeing:1; /* waiting done, setter frees */
unsigned long reconn_deadline;
@@ -788,6 +788,11 @@ static void scoutfs_net_send_worker(struct work_struct *work)
continue;
}
if ((msend->nh.cmd == SCOUTFS_NET_CMD_FAREWELL) &&
nh_is_response(&msend->nh)) {
conn->saw_farewell = 1;
}
msend->nh.recv_seq =
cpu_to_le64(atomic64_read(&conn->recv_seq));
@@ -1629,22 +1634,6 @@ restart:
conn->notify_up(sb, conn, conn->info, node_id);
}
/*
* The server has received a farewell message and is sending a response.
* All we do is mark the connection so that it is freed the next time it
* is shutdown, presumably as the client disconnects after receiving the
* response. The server caller has cleaned up all the state it had
* associated with the client.
*/
void scoutfs_net_server_farewell(struct super_block *sb,
struct scoutfs_net_connection *conn)
{
spin_lock(&conn->lock);
conn->saw_farewell = 1;
spin_unlock(&conn->lock);
}
/*
* Submit a request down the connection. It's up to the caller to
* ensure that the conn is allocated. Sends submitted when the

View File

@@ -75,8 +75,6 @@ void scoutfs_net_server_greeting(struct super_block *sb,
u64 node_id, u64 greeting_id,
bool sent_node_id, bool first_contact,
bool farewell);
void scoutfs_net_server_farewell(struct super_block *sb,
struct scoutfs_net_connection *conn);
void scoutfs_net_farewell(struct super_block *sb,
struct scoutfs_net_connection *conn);

View File

@@ -392,7 +392,8 @@ static inline int first_slot_flags(struct scoutfs_quorum_config *conf,
*/
static int write_quorum_block(struct super_block *sb, __le64 fsid,
__le64 config_gen, u8 our_slot, __le64 write_nr,
u64 elected_nr, u8 vote_slot)
u64 elected_nr, u64 unmount_barrier,
u8 vote_slot)
{
struct scoutfs_quorum_block *blk;
struct buffer_head *bh;
@@ -416,6 +417,7 @@ static int write_quorum_block(struct super_block *sb, __le64 fsid,
blk->config_gen = config_gen;
blk->write_nr = write_nr;
blk->elected_nr = cpu_to_le64(elected_nr);
blk->unmount_barrier = cpu_to_le64(unmount_barrier);
blk->vote_slot = vote_slot;
blk->crc = quorum_block_crc(blk);
@@ -473,8 +475,8 @@ static int fence_other_elected(struct super_block *sb,
scoutfs_inc_counter(sb, quorum_fenced);
ret = write_quorum_block(sb, super->hdr.fsid,
conf->gen, i, blk.write_nr,
0, i);
conf->gen, i, blk.write_nr, 0,
le64_to_cpu(blk.unmount_barrier), i);
if (ret)
break;
}
@@ -509,9 +511,13 @@ struct quorum_block_history {
* When we return success we update the caller's elected info with the
* most recent elected leader we found, which may well be long gone. We
* return -ENOENT if we didn't find any elected leaders.
*
* If we return success because we saw a larger unmount barrier we set
* elected_nr to 0 and fill the unmount_barrier.
*/
int scoutfs_quorum_election(struct super_block *sb, char *our_name,
u64 old_elected_nr, ktime_t timeout_abs,
bool unmounting, u64 our_umb,
struct scoutfs_quorum_elected_info *qei)
{
struct scoutfs_super_block *super = NULL;
@@ -524,6 +530,7 @@ int scoutfs_quorum_election(struct super_block *sb, char *our_name,
ktime_t now;
__le64 write_nr = 0;
u64 elected_nr = 0;
u64 unmount_barrier = 0;
int vote_streak = 0;
int vote_slot;
int our_slot;
@@ -551,13 +558,7 @@ int scoutfs_quorum_election(struct super_block *sb, char *our_name,
goto out;
conf = &super->quorum_config;
/* allow a single vote majority when 1 or 2 active */
if (nr_active <= 2)
majority = 1;
else if (nr_active & 1)
majority = (nr_active + 1) / 2;
else
majority = (nr_active / 2) + 1;
majority = scoutfs_quorum_majority(sb, conf);
readahead_quorum_blocks(sb);
@@ -590,6 +591,8 @@ int scoutfs_quorum_election(struct super_block *sb, char *our_name,
qei->config_gen = blk.config_gen;
qei->write_nr = blk.write_nr;
qei->elected_nr = le64_to_cpu(blk.elected_nr);
qei->unmount_barrier =
le64_to_cpu(blk.unmount_barrier);
qei->config_slot = i;
}
}
@@ -639,12 +642,23 @@ int scoutfs_quorum_election(struct super_block *sb, char *our_name,
nr_votes = 0;
write_nr = cpu_to_le64(1);
elected_nr = 0;
unmount_barrier = 0;
for_each_active_block(sb, super, conf, hist, hi, &blk, slot, i){
/* count our votes (maybe including from us) */
if (hi->writing >= 2 && blk.vote_slot == our_slot)
nr_votes++;
/* can finish unmounting if members all left */
if (unmounting &&
le64_to_cpu(blk.unmount_barrier) > our_umb) {
qei->elected_nr = 0;
qei->unmount_barrier =
le64_to_cpu(blk.unmount_barrier);
ret = 0;
goto out;
}
/* sample existing fields for our write */
if (i == our_slot) {
write_nr = blk.write_nr;
@@ -652,6 +666,8 @@ int scoutfs_quorum_election(struct super_block *sb, char *our_name,
}
elected_nr = max(elected_nr,
le64_to_cpu(blk.elected_nr));
unmount_barrier = max(unmount_barrier,
le64_to_cpu(blk.unmount_barrier));
}
@@ -667,7 +683,8 @@ int scoutfs_quorum_election(struct super_block *sb, char *our_name,
elected_nr = 0;
write_quorum_block(sb, super->hdr.fsid, conf->gen, our_slot,
write_nr, elected_nr, vote_slot);
write_nr, elected_nr, unmount_barrier,
vote_slot);
set_current_state(TASK_UNINTERRUPTIBLE);
schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
@@ -714,5 +731,65 @@ int scoutfs_quorum_clear_elected(struct super_block *sb,
return write_quorum_block(sb, super->hdr.fsid, qei->config_gen,
qei->config_slot, qei->write_nr, 0,
qei->unmount_barrier, qei->config_slot);
}
/*
 * Record a new unmount_barrier in the caller's elected info and write
 * it out to our quorum block, leaving the rest of the block's fields
 * unchanged.  Unmounting voters that see the barrier increase know
 * that their farewell has been processed.
 */
int scoutfs_quorum_update_barrier(struct super_block *sb,
struct scoutfs_quorum_elected_info *qei,
u64 unmount_barrier)
{
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
/* stash the barrier so later block writes carry it forward */
qei->unmount_barrier = unmount_barrier;
return write_quorum_block(sb, super->hdr.fsid, qei->config_gen,
qei->config_slot, qei->write_nr,
qei->elected_nr, qei->unmount_barrier,
qei->config_slot);
}
/*
 * Return the number of votes that constitutes a majority of the
 * active quorum slots.  A single vote suffices when only one or two
 * slots are active.
 */
int scoutfs_quorum_majority(struct super_block *sb,
struct scoutfs_quorum_config *conf)
{
int nr_active = 0;
int i;
for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
if (conf->slots[i].flags & SCOUTFS_QUORUM_SLOT_ACTIVE)
nr_active++;
}
if (nr_active <= 2)
return 1;
/* (n + 1) / 2 for odd n equals n / 2 + 1, so one expression covers both */
return (nr_active / 2) + 1;
}
/*
 * Returns true if the given unique name matches the name stored in
 * any slot of the quorum config.
 */
bool scoutfs_quorum_voting_member(struct super_block *sb,
struct scoutfs_quorum_config *conf,
char *name)
{
int i;
for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
if (strcmp(conf->slots[i].name, name) == 0)
return true;
}
return false;
}

View File

@@ -6,14 +6,24 @@ struct scoutfs_quorum_elected_info {
__le64 config_gen;
__le64 write_nr;
u64 elected_nr;
u64 unmount_barrier;
unsigned int config_slot;
bool run_server;
};
int scoutfs_quorum_election(struct super_block *sb, char *our_name,
u64 old_elected_nr, ktime_t timeout_abs,
bool unmounting, u64 our_umb,
struct scoutfs_quorum_elected_info *qei);
int scoutfs_quorum_clear_elected(struct super_block *sb,
struct scoutfs_quorum_elected_info *qei);
int scoutfs_quorum_update_barrier(struct super_block *sb,
struct scoutfs_quorum_elected_info *qei,
u64 unmount_barrier);
int scoutfs_quorum_majority(struct super_block *sb,
struct scoutfs_quorum_config *conf);
bool scoutfs_quorum_voting_member(struct super_block *sb,
struct scoutfs_quorum_config *conf,
char *name);
#endif

View File

@@ -2482,6 +2482,7 @@ DECLARE_EVENT_CLASS(scoutfs_quorum_block_class,
__field(__u64, config_gen)
__field(__u64, write_nr)
__field(__u64, elected_nr)
__field(__u64, unmount_barrier)
__field(__u32, crc)
__field(__u8, vote_slot)
),
@@ -2493,14 +2494,15 @@ DECLARE_EVENT_CLASS(scoutfs_quorum_block_class,
__entry->config_gen = le64_to_cpu(blk->config_gen);
__entry->write_nr = le64_to_cpu(blk->write_nr);
__entry->elected_nr = le64_to_cpu(blk->elected_nr);
__entry->unmount_barrier = le64_to_cpu(blk->unmount_barrier);
__entry->crc = le32_to_cpu(blk->crc);
__entry->vote_slot = blk->vote_slot;
),
TP_printk("fsid "FSID_FMT" io_blkno %llu hdr_blkno %llu config_gen %llu write_nr %llu elected_nr %llu crc 0x%08x vote_slot %u",
TP_printk("fsid "FSID_FMT" io_blkno %llu hdr_blkno %llu config_gen %llu write_nr %llu elected_nr %llu umb %llu crc 0x%08x vote_slot %u",
__entry->fsid, __entry->io_blkno, __entry->hdr_blkno,
__entry->config_gen, __entry->write_nr, __entry->elected_nr,
__entry->crc, __entry->vote_slot)
__entry->unmount_barrier, __entry->crc, __entry->vote_slot)
);
DEFINE_EVENT(scoutfs_quorum_block_class, scoutfs_quorum_read_block,
TP_PROTO(struct super_block *sb, u64 io_blkno,

View File

@@ -35,6 +35,7 @@
#include "net.h"
#include "lock_server.h"
#include "endian_swap.h"
#include "quorum.h"
/*
* Every active mount can act as the server that listens on a net
@@ -60,6 +61,8 @@ struct server_info {
u64 term;
struct scoutfs_net_connection *conn;
struct scoutfs_quorum_elected_info qei;
/* request processing coordinates committing manifest and alloc */
struct rw_semaphore commit_rwsem;
struct llist_head commit_waiters;
@@ -84,6 +87,11 @@ struct server_info {
unsigned long nr_compacts;
struct list_head compacts;
struct work_struct compact_work;
/* track clients waiting in unmount for farewell response */
struct mutex farewell_mutex;
struct list_head farewell_requests;
struct work_struct farewell_work;
};
#define DECLARE_SERVER_INFO(sb, name) \
@@ -1176,6 +1184,46 @@ int scoutfs_server_lock_recover_request(struct super_block *sb, u64 node_id,
NULL, NULL);
}
/*
 * Record a newly greeted client in the persistent mounted client
 * btree, keyed by node_id.  The stored name is later compared against
 * the quorum config when deciding which farewell requests can be
 * answered.
 */
static int insert_mounted_client(struct super_block *sb, u64 node_id,
char *name)
{
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
struct scoutfs_mounted_client_btree_key mck;
struct scoutfs_mounted_client_btree_val mcv;
mck.node_id = cpu_to_be64(node_id);
/* strncpy zero-pads the record; assumes the caller's name is null
 * terminated within SCOUTFS_UNIQUE_NAME_MAX_BYTES -- TODO confirm
 * the greeting path guarantees termination for untrusted clients */
strncpy(mcv.name, name, sizeof(mcv.name));
return scoutfs_btree_insert(sb, &super->mounted_clients,
&mck, sizeof(mck), &mcv, sizeof(mcv));
}
/*
 * Drop the persistent record of a mounted client.  A previous server
 * can have already processed this client's farewell and removed the
 * record, so a missing item is not an error.
 *
 * The caller serializes this with farewell processing; removing the
 * final voting client's record is what lets the caller advance the
 * unmount_barrier in the quorum block.
 */
static int delete_mounted_client(struct super_block *sb, u64 node_id)
{
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
struct scoutfs_mounted_client_btree_key mck = {
.node_id = cpu_to_be64(node_id),
};
int ret;
ret = scoutfs_btree_delete(sb, &super->mounted_clients,
&mck, sizeof(mck));
return ret == -ENOENT ? 0 : ret;
}
/*
* Process an incoming greeting request in the server from the client.
* We try to send responses to failed greetings so that the sender can
@@ -1247,9 +1295,17 @@ static int server_greeting(struct super_block *sb,
le64_add_cpu(&super->next_node_id, 1);
spin_unlock(&server->lock);
queue_commit_work(server, &cw);
mutex_lock(&server->farewell_mutex);
ret = insert_mounted_client(sb, le64_to_cpu(node_id), gr->name);
mutex_unlock(&server->farewell_mutex);
if (ret == 0)
queue_commit_work(server, &cw);
up_read(&server->commit_rwsem);
ret = wait_for_commit(&cw);
if (ret == 0) {
ret = wait_for_commit(&cw);
queue_work(server->wq, &server->farewell_work);
}
} else {
node_id = gr->node_id;
}
@@ -1259,9 +1315,11 @@ send_err:
if (err)
node_id = 0;
memset(greet.name, 0, sizeof(greet.name));
greet.fsid = super->hdr.fsid;
greet.format_hash = super->format_hash;
greet.server_term = cpu_to_le64(server->term);
greet.unmount_barrier = cpu_to_le64(server->qei.unmount_barrier);
greet.node_id = node_id;
greet.flags = 0;
@@ -1305,6 +1363,223 @@ out:
return ret;
}
/*
 * A farewell request from an unmounting client that is queued for the
 * async farewell worker to process.
 */
struct farewell_request {
struct list_head entry; /* on server->farewell_requests, under farewell_mutex */
u64 net_id; /* message id used to send the deferred response */
u64 node_id; /* node id of the unmounting client */
};
static bool invalid_mounted_client_item(struct scoutfs_btree_item_ref *iref)
{
return (iref->key_len !=
sizeof(struct scoutfs_mounted_client_btree_key)) ||
(iref->val_len !=
sizeof(struct scoutfs_mounted_client_btree_val));
}
/*
* This work processes farewell requests asynchronously. Requests from
* voting quorum members can be held until they're no longer needed to
* vote for quorum and elect a server to process farewell requests.
*
* This will hold farewell requests from voting clients until either it
* isn't needed for quorum because a majority remains without it, or it
* won't be needed for quorum because all the remaining mounted clients
* are voting and waiting for farewell.
*
* When we remove the last mounted client record for the last voting
* client then we increase the unmount_barrier and write it to the
* server's quorum block. If voting clients don't get their farewell
* response they'll attempt to form quorum again to start the server for
* their farewell response but will find the increased unmount_barrier.
* They'll know that their farewell has been processed and they can exit
* without forming quorum.
*
* Responses that are waiting for clients who aren't voting are
* immediately sent. Clients that don't have a mounted client record
* have already had their farewell processed by another server and can
* proceed.
*
* This can trust the quorum config found in the super that was read
* when the server started. Only the current server can rewrite the
* working config.
*
* Farewell responses are unique in that sending them causes the server
* to shutdown the connection to the client next time the socket
* disconnects. If the socket is destroyed before the client gets the
* response they'll reconnect and we'll see them as a brand new client
* who immediately sends a farewell. It'll be processed and it all
* works out.
*
* If this worker sees an error it assumes that this server is done for
* and that another had better take its place.
*/
static void farewell_worker(struct work_struct *work)
{
struct server_info *server = container_of(work, struct server_info,
farewell_work);
struct super_block *sb = server->sb;
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
struct scoutfs_quorum_config *conf = &super->quorum_config;
struct scoutfs_mounted_client_btree_key mck;
struct scoutfs_mounted_client_btree_val *mcv;
struct farewell_request *tmp;
struct farewell_request *fw;
SCOUTFS_BTREE_ITEM_REF(iref);
struct commit_waiter cw;
unsigned int nr_unmounting = 0;
unsigned int nr_mounted = 0;
unsigned int majority;
LIST_HEAD(reqs);
LIST_HEAD(send);
bool deleted = false;
bool voting;
bool more_reqs;
int ret;
majority = scoutfs_quorum_majority(sb, conf);
/* grab all the requests that are waiting */
mutex_lock(&server->farewell_mutex);
list_splice_init(&server->farewell_requests, &reqs);
mutex_unlock(&server->farewell_mutex);
/* count how many requests are from voting clients */
nr_unmounting = 0;
list_for_each_entry_safe(fw, tmp, &reqs, entry) {
mck.node_id = cpu_to_be64(fw->node_id);
ret = scoutfs_btree_lookup(sb, &super->mounted_clients,
&mck, sizeof(mck), &iref);
if (ret == 0 && invalid_mounted_client_item(&iref)) {
scoutfs_btree_put_iref(&iref);
ret = -EIO;
}
if (ret < 0) {
/* no record: a previous server already processed it, just respond */
if (ret == -ENOENT) {
list_move_tail(&fw->entry, &send);
continue;
}
goto out;
}
mcv = iref.val;
voting = scoutfs_quorum_voting_member(sb, conf, mcv->name);
scoutfs_btree_put_iref(&iref);
/* non-voters never affect quorum, answer them immediately */
if (!voting) {
list_move_tail(&fw->entry, &send);
continue;
}
nr_unmounting++;
}
/* see how many mounted clients could vote for quorum */
memset(&mck, 0, sizeof(mck));
for (;;) {
ret = scoutfs_btree_next(sb, &super->mounted_clients,
&mck, sizeof(mck), &iref);
if (ret == 0 && invalid_mounted_client_item(&iref)) {
scoutfs_btree_put_iref(&iref);
ret = -EIO;
}
if (ret != 0) {
if (ret == -ENOENT)
break;
goto out;
}
memcpy(&mck, iref.key, sizeof(mck));
mcv = iref.val;
if (scoutfs_quorum_voting_member(sb, conf, mcv->name))
nr_mounted++;
scoutfs_btree_put_iref(&iref);
be64_add_cpu(&mck.node_id, 1);
}
/* send as many responses as we can to maintain quorum */
while ((fw = list_first_entry_or_null(&reqs, struct farewell_request,
entry)) &&
(nr_mounted > majority || nr_unmounting >= nr_mounted)) {
/* each sent response removes a mounted voter's record below */
list_move_tail(&fw->entry, &send);
nr_mounted--;
nr_unmounting--;
deleted = true;
}
/* process and send farewell responses */
list_for_each_entry_safe(fw, tmp, &send, entry) {
down_read(&server->commit_rwsem);
ret = scoutfs_lock_server_farewell(sb, fw->node_id) ?:
remove_trans_seq(sb, fw->node_id) ?:
delete_mounted_client(sb, fw->node_id);
if (ret == 0)
queue_commit_work(server, &cw);
up_read(&server->commit_rwsem);
if (ret == 0)
ret = wait_for_commit(&cw);
if (ret)
goto out;
}
/* update the unmount barrier the first time we delete all mounted */
if (deleted && nr_mounted == 0) {
ret = scoutfs_quorum_update_barrier(sb, &server->qei,
server->qei.unmount_barrier + 1);
if (ret)
goto out;
}
/* and finally send all the responses */
list_for_each_entry_safe(fw, tmp, &send, entry) {
ret = scoutfs_net_response_node(sb, server->conn, fw->node_id,
SCOUTFS_NET_CMD_FAREWELL,
fw->net_id, 0, NULL, 0);
if (ret)
break;
list_del_init(&fw->entry);
kfree(fw);
}
ret = 0;
out:
/* put unprocessed or failed requests back for a later pass */
mutex_lock(&server->farewell_mutex);
more_reqs = !list_empty(&server->farewell_requests);
list_splice_init(&reqs, &server->farewell_requests);
list_splice_init(&send, &server->farewell_requests);
mutex_unlock(&server->farewell_mutex);
if (ret < 0)
stop_server(server);
else if (more_reqs && !server->shutting_down)
queue_work(server->wq, &server->farewell_work);
}
/*
 * Free queued farewell requests for a given client node_id, or all
 * pending requests when node_id is 0.
 */
static void free_farewell_requests(struct super_block *sb, u64 node_id)
{
struct server_info *server = SCOUTFS_SB(sb)->server_info;
struct farewell_request *tmp;
struct farewell_request *fw;
mutex_lock(&server->farewell_mutex);
list_for_each_entry_safe(fw, tmp, &server->farewell_requests, entry) {
if (node_id == 0 || fw->node_id == node_id) {
list_del_init(&fw->entry);
kfree(fw);
}
}
mutex_unlock(&server->farewell_mutex);
}
/*
* The server is receiving a farewell message from a client that is
* unmounting. It won't send any more requests and once it receives our
@@ -1322,26 +1597,28 @@ static int server_farewell(struct super_block *sb,
{
struct server_info *server = SCOUTFS_SB(sb)->server_info;
u64 node_id = scoutfs_net_client_node_id(conn);
struct commit_waiter cw;
int ret;
struct farewell_request *fw;
if (arg_len != 0)
return -EINVAL;
scoutfs_net_server_farewell(sb, conn);
/* XXX tear down if we fence, or if we shut down */
down_read(&server->commit_rwsem);
fw = kmalloc(sizeof(struct farewell_request), GFP_NOFS);
if (fw == NULL)
return -ENOMEM;
ret = scoutfs_lock_server_farewell(sb, node_id) ?:
remove_trans_seq(sb, node_id);
if (ret == 0)
queue_commit_work(server, &cw);
fw->node_id = node_id;
fw->net_id = id;
up_read(&server->commit_rwsem);
if (ret == 0)
ret = wait_for_commit(&cw);
mutex_lock(&server->farewell_mutex);
list_add_tail(&fw->entry, &server->farewell_requests);
mutex_unlock(&server->farewell_mutex);
return scoutfs_net_response(sb, conn, cmd, id, ret, NULL, 0);
queue_work(server->wq, &server->farewell_work);
/* response will be sent later */
return 0;
}
/* requests sent to clients are tracked so we can free resources */
@@ -1992,6 +2269,8 @@ static void server_notify_down(struct super_block *sb,
server->nr_clients);
spin_unlock(&server->lock);
free_farewell_requests(sb, node_id);
forget_client_compacts(sb, sci);
try_queue_compact(server);
} else {
@@ -2085,7 +2364,7 @@ out:
/* XXX can we call start multiple times? */
int scoutfs_server_start(struct super_block *sb, struct sockaddr_in *sin,
u64 term)
u64 term, struct scoutfs_quorum_elected_info *qei)
{
DECLARE_SERVER_INFO(sb, server);
@@ -2093,6 +2372,7 @@ int scoutfs_server_start(struct super_block *sb, struct sockaddr_in *sin,
server->shutting_down = false;
server->listen_sin = *sin;
server->term = term;
server->qei = *qei;
init_completion(&server->start_comp);
queue_work(server->wq, &server->work);
@@ -2101,7 +2381,22 @@ int scoutfs_server_start(struct super_block *sb, struct sockaddr_in *sin,
return server->err;
}
void scoutfs_server_stop(struct super_block *sb)
/*
* Start shutdown on the server but don't wait for it to finish.
*/
void scoutfs_server_abort(struct super_block *sb)
{
DECLARE_SERVER_INFO(sb, server);
stop_server(server);
}
/*
* Once the server is stopped we give the caller our election info
* which might have been modified while we were running.
*/
void scoutfs_server_stop(struct super_block *sb,
struct scoutfs_quorum_elected_info *qei)
{
DECLARE_SERVER_INFO(sb, server);
@@ -2109,6 +2404,8 @@ void scoutfs_server_stop(struct super_block *sb)
/* XXX not sure both are needed */
cancel_work_sync(&server->work);
cancel_work_sync(&server->commit_work);
*qei = server->qei;
}
int scoutfs_server_setup(struct super_block *sb)
@@ -2135,6 +2432,9 @@ int scoutfs_server_setup(struct super_block *sb)
server->compacts_per_client = 2;
INIT_LIST_HEAD(&server->compacts);
INIT_WORK(&server->compact_work, scoutfs_server_compact_worker);
mutex_init(&server->farewell_mutex);
INIT_LIST_HEAD(&server->farewell_requests);
INIT_WORK(&server->farewell_work, farewell_worker);
server->wq = alloc_workqueue("scoutfs_server",
WQ_UNBOUND | WQ_NON_REENTRANT, 0);
@@ -2164,6 +2464,10 @@ void scoutfs_server_destroy(struct super_block *sb)
/* recv work/compaction could have left commit_work queued */
cancel_work_sync(&server->commit_work);
/* pending farewell requests are another server's problem */
cancel_work_sync(&server->farewell_work);
free_farewell_requests(sb, 0);
trace_scoutfs_server_workqueue_destroy(sb, 0, 0);
destroy_workqueue(server->wq);

View File

@@ -72,9 +72,12 @@ int scoutfs_server_lock_recover_request(struct super_block *sb, u64 node_id,
struct scoutfs_key *key);
struct sockaddr_in;
struct scoutfs_quorum_elected_info;
int scoutfs_server_start(struct super_block *sb, struct sockaddr_in *sin,
u64 term);
void scoutfs_server_stop(struct super_block *sb);
u64 term, struct scoutfs_quorum_elected_info *qei);
void scoutfs_server_abort(struct super_block *sb);
void scoutfs_server_stop(struct super_block *sb,
struct scoutfs_quorum_elected_info *qei);
int scoutfs_server_setup(struct super_block *sb);
void scoutfs_server_destroy(struct super_block *sb);