Stop writing to other quorum slot blocks

The core quorum work loop assumes that it has exclusive access to its slot's quorum block. It uniquely marks the blocks it writes and verifies the marks on read to discover whether another mount has written to its slot, on the assumption that any such write must come from a configuration error that put two mounts in the same slot. But the design of the leader bit in the block violates the invariant that only a slot's own mount writes to its block. As the server comes up and fences previous leaders it writes to their blocks to clear their leader bits. The final hole in the design is that because we're fencing mounts, not slots, each slot can have two mounts in play. An active mount can be using the slot while there is still a persistent record of a previous, crashed mount in that slot that needs to be fenced. All this comes together to have the server fence an old mount in a slot while a new mount is coming up in it. The new mount sees the mark change, treats it as a conflicting writer, and stops participating in quorum. The fix is to rework the quorum blocks so that each slot only writes to its own block. Instead of the server writing to each fenced mount's slot, it writes a fence event to its own block once all previous mounts have been fenced. We add a bit of bookkeeping so that the server can discover when all block leader fence operations have completed. Each event gets its own term so we can compare events to discover live servers. We get rid of the write marks and instead write a begin event as a quorum agent starts up, which is then checked on every read to make sure it still matches. Signed-off-by: Zach Brown <zab@versity.com>
2026-01-05 11:45:09 +00:00 · 2021-05-31 10:12:45 -07:00
parent 76076011a2
commit 38a4a56741
8 changed files with 225 additions and 211 deletions
--- a/kmod/src/fence.c
+++ b/kmod/src/fence.c
@@ -300,6 +300,24 @@ int scoutfs_fence_next(struct super_block *sb, u64 *rid, int *reason, bool *erro
 	return ret;
 }

+int scoutfs_fence_reason_pending(struct super_block *sb, int reason)
+{
+	DECLARE_FENCE_INFO(sb, fi);
+	struct pending_fence *fence;
+	bool pending = false;
+
+	spin_lock(&fi->lock);
+	list_for_each_entry(fence, &fi->list, entry) {
+		if (fence->reason == reason) {
+			pending = true;
+			break;
+		}
+	}
+	spin_unlock(&fi->lock);
+
+	return pending;
+}
+
 int scoutfs_fence_free(struct super_block *sb, u64 rid)
 {
 	DECLARE_FENCE_INFO(sb, fi);
--- a/kmod/src/fence.h
+++ b/kmod/src/fence.h
@@ -9,6 +9,7 @@ enum {

 int scoutfs_fence_start(struct super_block *sb, u64 rid, __be32 ipv4_addr, int reason);
 int scoutfs_fence_next(struct super_block *sb, u64 *rid, int *reason, bool *error);
+int scoutfs_fence_reason_pending(struct super_block *sb, int reason);
 int scoutfs_fence_free(struct super_block *sb, u64 rid);
 int scoutfs_fence_wait_fenced(struct super_block *sb, long timeout_jiffies);

--- a/kmod/src/format.h
+++ b/kmod/src/format.h
@@ -625,18 +625,24 @@ struct scoutfs_quorum_config {
 	} slots[SCOUTFS_QUORUM_MAX_SLOTS];
 };

-struct scoutfs_quorum_block {
-	struct scoutfs_block_header hdr;
-	__le64 term;
-	__le64 random_write_mark;
-	__le64 flags;
-	struct scoutfs_quorum_block_event {
-		__le64 rid;
-		struct scoutfs_timespec ts;
-	} write, update_term, set_leader, clear_leader, fenced;
+enum {
+	SCOUTFS_QUORUM_EVENT_BEGIN,		/* quorum service starting up */
+	SCOUTFS_QUORUM_EVENT_TERM,		/* updated persistent term */
+	SCOUTFS_QUORUM_EVENT_ELECT,		/* won election */
+	SCOUTFS_QUORUM_EVENT_FENCE,		/* server fenced others */
+	SCOUTFS_QUORUM_EVENT_STOP,		/* server stopped */
+	SCOUTFS_QUORUM_EVENT_END,		/* quorum service shutting down */
+	SCOUTFS_QUORUM_EVENT_NR,
 };

-#define SCOUTFS_QUORUM_BLOCK_LEADER (1 << 0)
+struct scoutfs_quorum_block {
+	struct scoutfs_block_header hdr;
+	struct scoutfs_quorum_block_event {
+		__le64 rid;
+		__le64 term;
+		struct scoutfs_timespec ts;
+	} events[SCOUTFS_QUORUM_EVENT_NR];
+};

 /*
 * Tunable options that apply to the entire system.  They can be set in
--- a/kmod/src/quorum.c
+++ b/kmod/src/quorum.c
@@ -61,10 +61,9 @@
 * running (maybe they've deadlocked, or lost network communications).
 * In addition to a configuration slot in the super block, each quorum
 * member also has a known block location that represents their slot.
- * They set a flag in their block indicating that they've been elected
- * leader, then read slots for all the other blocks looking for
- * previously active leaders to fence.  After that it can start the
- * server.
+ * The block contains an array of events which are updated during the
+ * lifetime of the quorum agent.  The elected leader sets its elect event
+ * and can then start the server.
 *
 * It's critical to raft elections that a participant's term not go
 * backwards in time so each mount also uses its quorum block to store
@@ -335,17 +334,17 @@ static int recv_msg(struct super_block *sb, struct quorum_host_msg *msg,
 }

 /*
- * The caller can provide a mark that they're using to track their
- * written blocks.  It's updated as they write the block and we can
- * compare it with what we read to see if there have been unexpected
- * intervening writes to the block -- the caller is supposed to have
- * exclusive access to the block (or was fenced).
+ * Read and verify block fields before giving it to the caller.  We
+ * should have exclusive write access to the block.  We know that
+ * something has gone horribly wrong if we don't see our rid in the
+ * begin event after we've written it as we started up.
 */
-static int read_quorum_block(struct super_block *sb, u64 blkno,
-			     struct scoutfs_quorum_block *blk, __le64 *mark)
+static int read_quorum_block(struct super_block *sb, u64 blkno, struct scoutfs_quorum_block *blk,
+			     bool check_rid)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct scoutfs_super_block *super = &sbi->super;
+	const u64 rid = sbi->rid;
 	char msg[150];
 	__le32 crc;
 	int ret;
@@ -375,9 +374,9 @@ static int read_quorum_block(struct super_block *sb, u64 blkno,
 	else if (le64_to_cpu(blk->hdr.blkno) != blkno)
 		snprintf(msg, sizeof(msg), "blk blkno %llu != %llu",
 			 le64_to_cpu(blk->hdr.blkno), blkno);
-	else if (mark && *mark != 0 && blk->random_write_mark != *mark)
-		snprintf(msg, sizeof(msg), "blk mark %016llx != %016llx, are multiple mounts configured with the same slot?",
-			 le64_to_cpu(blk->random_write_mark), le64_to_cpu(*mark));
+	else if (check_rid && le64_to_cpu(blk->events[SCOUTFS_QUORUM_EVENT_BEGIN].rid) != rid)
+		snprintf(msg, sizeof(msg), "quorum block begin rid %016llx != our rid %016llx, are multiple mounts configured with this slot?",
+		le64_to_cpu(blk->events[SCOUTFS_QUORUM_EVENT_BEGIN].rid), rid);
 	else
 		msg[0] = '\0';

@@ -391,184 +390,159 @@ out:
 	return ret;
 }

-static void set_quorum_block_event(struct super_block *sb,
-				   struct scoutfs_quorum_block *blk,
-				   struct scoutfs_quorum_block_event *ev)
+static void set_quorum_block_event(struct super_block *sb, struct scoutfs_quorum_block *blk,
+				   int event, u64 term)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
+	struct scoutfs_quorum_block_event *ev;
 	struct timespec64 ts;

+	if (WARN_ON_ONCE(event < 0 || event >= SCOUTFS_QUORUM_EVENT_NR))
+		return;
+
 	getnstimeofday64(&ts);

+	ev = &blk->events[event];
 	ev->rid = cpu_to_le64(sbi->rid);
+	ev->term = cpu_to_le64(term);
 	ev->ts.sec = cpu_to_le64(ts.tv_sec);
 	ev->ts.nsec = cpu_to_le32(ts.tv_nsec);
 }

-/*
- * Every time we write a block we update the write stamp and random
- * write mark so readers can see our write.
- */
-static int write_quorum_block(struct super_block *sb, u64 blkno,
-			      struct scoutfs_quorum_block *blk, __le64 *mark)
+static int write_quorum_block(struct super_block *sb, u64 blkno, struct scoutfs_quorum_block *blk)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
-	int ret;

 	if (WARN_ON_ONCE(blkno < SCOUTFS_QUORUM_BLKNO) ||
 	    WARN_ON_ONCE(blkno >= (SCOUTFS_QUORUM_BLKNO +
 				   SCOUTFS_QUORUM_BLOCKS)))
 		return -EINVAL;

-	do {
-		get_random_bytes(&blk->random_write_mark,
-				 sizeof(blk->random_write_mark));
-	} while (blk->random_write_mark == 0);
-
-	if (mark)
-		*mark = blk->random_write_mark;
-
-	set_quorum_block_event(sb, blk, &blk->write);
-
-	ret = scoutfs_block_write_sm(sb, sbi->meta_bdev, blkno,
-				      &blk->hdr, sizeof(*blk));
-	if (ret < 0)
-		scoutfs_err(sb, "quorum block write error %d", ret);
-
-	return ret;
+	return scoutfs_block_write_sm(sb, sbi->meta_bdev, blkno, &blk->hdr, sizeof(*blk));
 }

 /*
- * Read the caller's slot's current quorum block, make a change, and
- * write it back out.  If the caller provides a mark it can cause read
- * errors if we read a mark that doesn't match the last mark that the
- * caller wrote.
+ * Read the caller's slot's quorum block, make a change, and write it
+ * back out.
 */
-static int update_quorum_block(struct super_block *sb, u64 blkno,
-			       __le64 *mark, int role, u64 term)
+static int update_quorum_block(struct super_block *sb, int event, u64 term, bool check_rid)
 {
+	struct mount_options *opts = &SCOUTFS_SB(sb)->opts;
+	u64 blkno = SCOUTFS_QUORUM_BLKNO + opts->quorum_slot_nr;
 	struct scoutfs_quorum_block blk;
-	u64 flags;
-	u64 bits;
-	u64 set;
 	int ret;

-	ret = read_quorum_block(sb, blkno, &blk, mark);
+	ret = read_quorum_block(sb, blkno, &blk, check_rid);
 	if (ret == 0) {
-		if (blk.term != cpu_to_le64(term)) {
-			blk.term = cpu_to_le64(term);
-			set_quorum_block_event(sb, &blk, &blk.update_term);
-		}
-
-		flags = le64_to_cpu(blk.flags);
-		bits = SCOUTFS_QUORUM_BLOCK_LEADER;
-		set = role == LEADER ? SCOUTFS_QUORUM_BLOCK_LEADER : 0;
-		if ((flags & bits) != set)
-			set_quorum_block_event(sb, &blk,
-					       set ? &blk.set_leader :
-					             &blk.clear_leader);
-		blk.flags = cpu_to_le64((flags & ~bits) | set);
-
-		ret = write_quorum_block(sb, blkno, &blk, mark);
-	}
-
-	return ret;
-}
-
-/*
- * The calling server had fenced previous leaders before starting up,
- * now that it's up it has reclaimed their resources and can clear their
- * leader flags.
- */
-int scoutfs_quorum_clear_rid_leader(struct super_block *sb, u64 rid)
-{
-	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
-	struct scoutfs_super_block *super = &sbi->super;
-	struct mount_options *opts = &sbi->opts;
-	struct scoutfs_quorum_block blk;
-	int ret = 0;
-	u64 blkno;
-	int i;
-
-	for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
-		if (i == opts->quorum_slot_nr || !quorum_slot_present(super, i))
-			continue;
-
-		blkno = SCOUTFS_QUORUM_BLKNO + i;
-		ret = read_quorum_block(sb, blkno, &blk, NULL);
+		set_quorum_block_event(sb, &blk, event, term);
+		ret = write_quorum_block(sb, blkno, &blk);
 		if (ret < 0)
-			break;
-
-		if (le64_to_cpu(blk.set_leader.rid) == rid) {
-			blk.flags &= ~cpu_to_le64(SCOUTFS_QUORUM_BLOCK_LEADER);
-			set_quorum_block_event(sb, &blk, &blk.fenced);
-
-			ret = write_quorum_block(sb, blkno, &blk, NULL);
-			break;
-		}
+			scoutfs_err(sb, "error %d reading quorum block %llu to update event %d term %llu",
+				    ret, blkno, event, term);
+	} else {
+		scoutfs_err(sb, "error %d writing quorum block %llu after updating event %d term %llu",
+			    ret, blkno, event, term);
 	}

-	if (ret < 0)
-		scoutfs_err(sb, "error %d clearing leader block for rid %016llx", ret, rid);
-
 	return ret;
 }

 /*
- * The calling server has been elected, had its block updated, and has
- * started running but can't yet assume that it has exclusive access to
- * the metadata device.  We read all the quorum blocks looking for
- * previously elected leaders to fence so that we're the only leader
- * running.
+ * The calling server has fenced previous leaders and reclaimed their
+ * resources.  We can now update our fence event with a greater term to
+ * stop future leaders from doing the same.
+ */
+int scoutfs_quorum_fence_complete(struct super_block *sb, u64 term)
+{
+	return update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_FENCE, term, true);
+}
+
+/*
+ * The calling server has been elected and has started running but can't
+ * yet assume that it has exclusive access to the metadata device.  We
+ * read all the quorum blocks looking for previously elected leaders to
+ * fence so that we're the only leader running.
 *
- * We only wait for the previous leaders to be fenced.  We don't clear
- * the leader bits because the server is going to reclaim their
- * resources once its up and running.  Only then will the leader bits be
- * cleared.
+ * We're relying on the invariant that there can't be two mounts running
+ * with the same slot nr at the same time.  With this constraint there
+ * can be at most two previous leaders per slot that need to be fenced:
+ * a persistent record of an old mount on the slot, and an active mount.
+ *
+ * If we start fence requests then we only wait for them to complete
+ * before returning.  The server will reclaim their resources once it is
+ * up and running and will call us to update the fence event.  If we
+ * don't start fence requests then we update the fence event
+ * immediately, the server has nothing more to do.
 *
 * Quorum will be sending heartbeats while we wait for fencing.  That
 * keeps us from being fenced while we allow userspace fencing to take a
 * reasonably long time.  We still want to timeout eventually.
 */
-int scoutfs_quorum_fence_leader_blocks(struct super_block *sb, u64 term)
+int scoutfs_quorum_fence_leaders(struct super_block *sb, u64 term)
 {
+#define NR_OLD 2
+	struct scoutfs_quorum_block_event old[SCOUTFS_QUORUM_MAX_SLOTS][NR_OLD] = {{{0,}}};
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct scoutfs_super_block *super = &sbi->super;
-	struct mount_options *opts = &sbi->opts;
 	struct scoutfs_quorum_block blk;
 	struct sockaddr_in sin;
+	const u64 rid = sbi->rid;
 	bool fence_started = false;
-	u64 blkno;
+	u64 fenced = 0;
+	__le64 fence_rid;
 	int ret = 0;
 	int err;
 	int i;
+	int j;

 	BUILD_BUG_ON(SCOUTFS_QUORUM_BLOCKS < SCOUTFS_QUORUM_MAX_SLOTS);

 	for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
-		if (i == opts->quorum_slot_nr || !quorum_slot_present(super, i))
+		if (!quorum_slot_present(super, i))
 			continue;

-		blkno = SCOUTFS_QUORUM_BLKNO + i;
-		ret = read_quorum_block(sb, blkno, &blk, NULL);
+		ret = read_quorum_block(sb, SCOUTFS_QUORUM_BLKNO + i, &blk, false);
 		if (ret < 0)
 			goto out;

-		if (!(le64_to_cpu(blk.flags) & SCOUTFS_QUORUM_BLOCK_LEADER) ||
-		    le64_to_cpu(blk.term) > term)
-			continue;
+		/* elected leader still running */
+		if (le64_to_cpu(blk.events[SCOUTFS_QUORUM_EVENT_ELECT].term) >
+		    le64_to_cpu(blk.events[SCOUTFS_QUORUM_EVENT_STOP].term))
+			old[i][0] = blk.events[SCOUTFS_QUORUM_EVENT_ELECT];

-		scoutfs_inc_counter(sb, quorum_fence_leader);
-		scoutfs_quorum_slot_sin(super, i, &sin);
+		/* record of an earlier server in this slot: fenced after its last stop, before the latest elect */
+		if ((le64_to_cpu(blk.events[SCOUTFS_QUORUM_EVENT_FENCE].term) >
+		     le64_to_cpu(blk.events[SCOUTFS_QUORUM_EVENT_STOP].term)) &&
+		    (le64_to_cpu(blk.events[SCOUTFS_QUORUM_EVENT_FENCE].term) <
+		     le64_to_cpu(blk.events[SCOUTFS_QUORUM_EVENT_ELECT].term)))
+			old[i][1] = blk.events[SCOUTFS_QUORUM_EVENT_FENCE];

-		scoutfs_info(sb, "fencing previous leader "SCSBF" in slot %u with address "SIN_FMT,
-			     SCSB_LEFR_ARGS(super->hdr.fsid, blk.set_leader.rid), i, SIN_ARG(&sin));
-		ret = scoutfs_fence_start(sb, le64_to_cpu(blk.set_leader.rid), sin.sin_addr.s_addr,
-					  SCOUTFS_FENCE_QUORUM_BLOCK_LEADER);
-		if (ret < 0)
-			goto out;
-		fence_started = true;
+		/* find greatest term that has fenced everything before it */
+		fenced = max(fenced, le64_to_cpu(blk.events[SCOUTFS_QUORUM_EVENT_FENCE].term));
+	}

+	/* now actually fence any old leaders which haven't been fenced yet */
+	for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
+		for (j = 0; j < NR_OLD; j++) {
+			if (le64_to_cpu(old[i][j].term) == 0 ||		/* uninitialized */
+			    le64_to_cpu(old[i][j].term) < fenced ||	/* already fenced */
+			    le64_to_cpu(old[i][j].term) > term ||	/* newer than us */
+			    le64_to_cpu(old[i][j].rid) == rid)		/* us */
+				continue;
+
+			scoutfs_inc_counter(sb, quorum_fence_leader);
+			scoutfs_quorum_slot_sin(super, i, &sin);
+			fence_rid = old[i][j].rid;
+
+			scoutfs_info(sb, "fencing previous leader "SCSBF" at term %llu in slot %u with address "SIN_FMT,
+				     SCSB_LEFR_ARGS(super->hdr.fsid, fence_rid),
+				     le64_to_cpu(old[i][j].term), i, SIN_ARG(&sin));
+			ret = scoutfs_fence_start(sb, le64_to_cpu(fence_rid), sin.sin_addr.s_addr,
+						  SCOUTFS_FENCE_QUORUM_BLOCK_LEADER);
+			if (ret < 0)
+				goto out;
+			fence_started = true;
+		}
 	}

 out:
@@ -576,9 +550,14 @@ out:
 		err = scoutfs_fence_wait_fenced(sb, msecs_to_jiffies(SCOUTFS_QUORUM_FENCE_TO_MS));
 		if (ret == 0)
 			ret = err;
+	} else {
+		err = scoutfs_quorum_fence_complete(sb, term);
+		if (ret == 0)
+			ret = err;
 	}
+
 	if (ret < 0) {
-		scoutfs_err(sb, "error %d fencing leader blocks", ret);
+		scoutfs_err(sb, "error %d attempting to find and fence previous leaders", ret);
 		scoutfs_inc_counter(sb, quorum_fence_error);
 	}

@@ -601,23 +580,22 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 	struct sockaddr_in unused;
 	struct quorum_host_msg msg;
 	struct quorum_status qst;
-	__le64 mark;
 	u64 blkno;
 	int ret;
+	int err;

 	/* recording votes from slots as native single word bitmap */
 	BUILD_BUG_ON(SCOUTFS_QUORUM_MAX_SLOTS > BITS_PER_LONG);

 	/* get our starting term from our persistent block */
-	mark = 0;
 	blkno = SCOUTFS_QUORUM_BLKNO + opts->quorum_slot_nr;
-	ret = read_quorum_block(sb, blkno, &blk, &mark);
+	ret = read_quorum_block(sb, blkno, &blk, false);
 	if (ret < 0)
 		goto out;

 	/* start out as a follower */
 	qst.role = FOLLOWER;
-	qst.term = le64_to_cpu(blk.term);
+	qst.term = le64_to_cpu(blk.events[SCOUTFS_QUORUM_EVENT_TERM].term);
 	qst.vote_for = -1;
 	qst.vote_bits = 0;

@@ -627,6 +605,11 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 	else
 		qst.timeout = election_timeout();

+	/* record that we're up and running, readers check that it isn't updated */
+	ret = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_BEGIN, qst.term, false);
+	if (ret < 0)
+		goto out;
+
 	while (!qinf->shutdown) {

 		ret = recv_msg(sb, &msg, qst.timeout);
@@ -657,11 +640,6 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 			send_msg_others(sb, SCOUTFS_QUORUM_MSG_RESIGNATION,
 					qst.term);
 			scoutfs_inc_counter(sb, quorum_send_resignation);
-
-			ret = update_quorum_block(sb, blkno, &mark,
-						  qst.role, qst.term);
-			if (ret < 0)
-				goto out;
 		}

 		spin_lock(&qinf->show_lock);
@@ -692,8 +670,7 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 				qst.timeout = election_timeout();

 			/* store our increased term */
-			ret = update_quorum_block(sb, blkno, &mark,
-						  qst.role, qst.term);
+			ret = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_TERM, qst.term, true);
 			if (ret < 0)
 				goto out;
 		}
@@ -710,6 +687,11 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 					qst.term);
 			qst.timeout = election_timeout();
 			scoutfs_inc_counter(sb, quorum_send_request);
+
+			/* store our increased term */
+			ret = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_TERM, qst.term, true);
+			if (ret < 0)
+				goto out;
 		}

 		/* candidates count votes in their term */
@@ -738,8 +720,8 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 					qst.term);
 			qst.timeout = heartbeat_interval();

-			/* set our leader flag before starting server */
-			ret = update_quorum_block(sb, blkno, &mark, qst.role, qst.term);
+			/* record that we've been elected before starting up server */
+			ret = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_ELECT, qst.term, true);
 			if (ret < 0)
 				goto out;

@@ -750,8 +732,13 @@ static void scoutfs_quorum_worker(struct work_struct *work)

 			ret = scoutfs_server_start(sb, qst.term);
 			if (ret < 0) {
-				scoutfs_err(sb, "server startup failed with %d",
-					    ret);
+				clear_bit(QINF_FLAG_SERVER, &qinf->flags);
+				scoutfs_err(sb, "server startup failed with %d", ret);
+				/* record the stop event so the failed server isn't treated as running */
+				err = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_STOP, qst.term,
+							  true);
+				if (err < 0 && ret == 0)
+					ret = err;
 				goto out;
 			}
 		}
@@ -798,13 +785,8 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 				qst.term);
 	}

-	/* always try to clear leader block as we stop to avoid fencing */
-	if (qst.role == LEADER) {
-		ret = update_quorum_block(sb, blkno, &mark,
-					  FOLLOWER, qst.term);
-		if (ret < 0)
-			goto out;
-	}
+	/* informational event that we're shutting down, nothing relies on it */
+	update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_END, qst.term, true);
 out:
 	if (ret < 0) {
 		scoutfs_err(sb, "quorum service saw error %d, shutting down.  Cluster will be degraded until this slot is remounted to restart the quorum service",
@@ -813,58 +795,60 @@ out:
 }

 /*
- * Clear the server flag for the quorum work's next iteration to
- * indicate that the server has shutdown and that it should step down as
- * leader, update quorum blocks, and stop sending heartbeats.
+ * The calling server has shutdown and is no longer using shared
+ * resources.  Clear the bit so that we stop sending heartbeats and
+ * allow the next server to be elected.  Update the stop event so that
+ * it won't be considered available by clients or fenced by the next
+ * leader.
 */
-void scoutfs_quorum_server_shutdown(struct super_block *sb)
+void scoutfs_quorum_server_shutdown(struct super_block *sb, u64 term)
 {
 	DECLARE_QUORUM_INFO(sb, qinf);

 	clear_bit(QINF_FLAG_SERVER, &qinf->flags);
+	update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_STOP, term, true);
 }

 /*
 * Clients read quorum blocks looking for the leader with a server whose
 * address it can try and connect to.
 *
- * There can be multiple running servers if a client checks before a
- * server has had a chance to fence any old servers.  We try to use the
- * block with the most recent timestamp.  If we get it wrong the
- * connection will timeout and the client will try again, presumably
- * finding a single server block.
+ * There can be records of multiple previous elected leaders if the
+ * current server hasn't yet fenced any old servers.  We use the elected
+ * leader with the greatest elected term.  If we get it wrong the
+ * connection will timeout and the client will try again.
 */
 int scoutfs_quorum_server_sin(struct super_block *sb, struct sockaddr_in *sin)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct scoutfs_super_block *super = &sbi->super;
 	struct scoutfs_quorum_block blk;
-	struct timespec64 recent = {0,};
-	struct timespec64 ts;
-	int ret;
+	u64 elect_term;
+	u64 term = 0;
+	int ret = 0;
 	int i;

 	for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
-		ret = read_quorum_block(sb, SCOUTFS_QUORUM_BLKNO + i, &blk,
-					NULL);
+		if (!quorum_slot_present(super, i))
+			continue;
+
+		ret = read_quorum_block(sb, SCOUTFS_QUORUM_BLKNO + i, &blk, false);
 		if (ret < 0) {
 			scoutfs_err(sb, "error reading quorum block nr %u: %d",
 				    i, ret);
 			goto out;
 		}

-		ts.tv_sec = le64_to_cpu(blk.set_leader.ts.sec);
-		ts.tv_nsec = le32_to_cpu(blk.set_leader.ts.nsec);
-
-		if ((le64_to_cpu(blk.flags) & SCOUTFS_QUORUM_BLOCK_LEADER) &&
-		    (timespec64_to_ns(&ts) > timespec64_to_ns(&recent))) {
-			recent = ts;
+		elect_term = le64_to_cpu(blk.events[SCOUTFS_QUORUM_EVENT_ELECT].term);
+		if (elect_term > term &&
+		    elect_term > le64_to_cpu(blk.events[SCOUTFS_QUORUM_EVENT_STOP].term)) {
+			term = elect_term;
 			scoutfs_quorum_slot_sin(super, i, sin);
 			continue;
 		}
 	}

-	if (timespec64_to_ns(&recent) == 0)
+	if (term == 0)
 		ret = -ENOENT;

 out:
--- a/kmod/src/quorum.h
+++ b/kmod/src/quorum.h
@@ -2,14 +2,14 @@
 #define _SCOUTFS_QUORUM_H_

 int scoutfs_quorum_server_sin(struct super_block *sb, struct sockaddr_in *sin);
-void scoutfs_quorum_server_shutdown(struct super_block *sb);
+void scoutfs_quorum_server_shutdown(struct super_block *sb, u64 term);

 u8 scoutfs_quorum_votes_needed(struct super_block *sb);
 void scoutfs_quorum_slot_sin(struct scoutfs_super_block *super, int i,
 			     struct sockaddr_in *sin);

-int scoutfs_quorum_fence_leader_blocks(struct super_block *sb, u64 term);
-int scoutfs_quorum_clear_rid_leader(struct super_block *sb, u64 rid);
+int scoutfs_quorum_fence_leaders(struct super_block *sb, u64 term);
+int scoutfs_quorum_fence_complete(struct super_block *sb, u64 term);

 int scoutfs_quorum_setup(struct super_block *sb);
 void scoutfs_quorum_shutdown(struct super_block *sb);
--- a/kmod/src/server.c
+++ b/kmod/src/server.c
@@ -1723,7 +1723,7 @@ struct farewell_request {
 * individual action knows to recognize that it's already been performed
 * and return success.
 */
-static int reclaim_rid(struct super_block *sb, u64 rid, bool clear_leader)
+static int reclaim_rid(struct super_block *sb, u64 rid)
 {
 	int ret;

@@ -1737,7 +1737,6 @@ static int reclaim_rid(struct super_block *sb, u64 rid, bool clear_leader)
 	      reclaim_log_trees(sb, rid) ?:
 	      cancel_srch_compact(sb, rid) ?:
 	      scoutfs_omap_remove_rid(sb, rid) ?:
-	      (clear_leader ? scoutfs_quorum_clear_rid_leader(sb, rid) : 0) ?:
 	      delete_mounted_client(sb, rid);

 	return scoutfs_server_apply_commit(sb, ret);
@@ -1870,7 +1869,7 @@ static void farewell_worker(struct work_struct *work)

 	/* clean up resources for mounts before sending responses */
 	list_for_each_entry_safe(fw, tmp, &send, entry) {
-		ret = reclaim_rid(sb, fw->rid, false);
+		ret = reclaim_rid(sb, fw->rid);
 		if (ret)
 			goto out;
 	}
@@ -2204,7 +2203,7 @@ static void reclaim_worker(struct work_struct *work)
 		goto out;
 	}

-	ret = reclaim_rid(sb, rid, reason == SCOUTFS_FENCE_QUORUM_BLOCK_LEADER);
+	ret = reclaim_rid(sb, rid);
 	if (ret < 0) {
 		scoutfs_err(sb, "failure to reclaim fenced rid %016llx: err %d, shutting down server",
 			    rid, ret);
@@ -2215,6 +2214,15 @@ static void reclaim_worker(struct work_struct *work)
 	scoutfs_info(sb, "successfully reclaimed resources for fenced rid %016llx", rid);
 	scoutfs_fence_free(sb, rid);
 	scoutfs_server_recov_finish(sb, rid, SCOUTFS_RECOV_ALL);
+
+	/* tell quorum we've finished fencing all previous leaders */
+	if (reason == SCOUTFS_FENCE_QUORUM_BLOCK_LEADER &&
+	    !scoutfs_fence_reason_pending(sb, reason)) {
+		ret = scoutfs_quorum_fence_complete(sb, server->term);
+		if (ret < 0)
+			goto out;
+	}
+
 	ret = 0;

 out:
@@ -2242,7 +2250,7 @@ static void scoutfs_server_worker(struct work_struct *work)
 	trace_scoutfs_server_work_enter(sb, 0, 0);

 	/* first make sure no other servers are still running */
-	ret = scoutfs_quorum_fence_leader_blocks(sb, server->term);
+	ret = scoutfs_quorum_fence_leaders(sb, server->term);
 	if (ret < 0)
 		goto out;

@@ -2348,7 +2356,7 @@ out:
 	scoutfs_net_free_conn(sb, conn);

 	/* let quorum know that we've shutdown */
-	scoutfs_quorum_server_shutdown(sb);
+	scoutfs_quorum_server_shutdown(sb, server->term);

 	scoutfs_info(sb, "server stopped at "SIN_FMT, SIN_ARG(&sin));
 	trace_scoutfs_server_work_exit(sb, 0, ret);
--- a/tests/funcs/filter.sh
+++ b/tests/funcs/filter.sh
@@ -70,6 +70,7 @@ t_filter_dmesg()
 	re="$re|scoutfs .* reclaimed resources"
 	re="$re|scoutfs .* quorum .* error"
 	re="$re|scoutfs .* error reading quorum block"
+	re="$re|scoutfs .* error .* writing quorum block"

 	egrep -v "($re)" 
 }
--- a/utils/src/print.c
+++ b/utils/src/print.c
@@ -816,16 +816,16 @@ static char *alloc_addr_str(union scoutfs_inet_addr *ia)

 static int print_quorum_blocks(int fd, struct scoutfs_super_block *super)
 {
-	struct print_events {
-		size_t offset;
-		char *name;
-	} events[] = {
-		OFF_NAME(write), OFF_NAME(update_term), OFF_NAME(set_leader),
-		OFF_NAME(clear_leader), OFF_NAME(fenced),
+	const static char *event_names[] = {
+		[SCOUTFS_QUORUM_EVENT_BEGIN] = "begin",
+		[SCOUTFS_QUORUM_EVENT_TERM] = "term",
+		[SCOUTFS_QUORUM_EVENT_ELECT] = "elect",
+		[SCOUTFS_QUORUM_EVENT_FENCE] = "fence",
+		[SCOUTFS_QUORUM_EVENT_STOP] = "stop",
+		[SCOUTFS_QUORUM_EVENT_END] = "end",
 	};
 	struct scoutfs_quorum_block *blk = NULL;
 	struct scoutfs_quorum_block_event *ev;
-	char *log_addr = NULL;
 	u64 blkno;
 	int ret;
 	int i;
@@ -834,6 +834,7 @@ static int print_quorum_blocks(int fd, struct scoutfs_super_block *super)
 	for (i = 0; i < SCOUTFS_QUORUM_BLOCKS; i++) {
 		blkno = SCOUTFS_QUORUM_BLKNO + i;
 		free(blk);
+		blk = NULL;
 		ret = read_block(fd, blkno, SCOUTFS_BLOCK_SM_SHIFT, (void **)&blk);
 		if (ret)
 			goto out;
@@ -841,24 +842,19 @@ static int print_quorum_blocks(int fd, struct scoutfs_super_block *super)
 		printf("quorum blkno %llu (slot %llu)\n",
 		       blkno, blkno - SCOUTFS_QUORUM_BLKNO);
 		print_block_header(&blk->hdr, SCOUTFS_BLOCK_SM_SIZE);
-		printf("  term %llu random_write_mark 0x%llx flags 0x%llx\n",
-		       le64_to_cpu(blk->term),
-		       le64_to_cpu(blk->random_write_mark),
-		       le64_to_cpu(blk->flags));

-		for (e = 0; e < array_size(events); e++) {
-			ev = (void *)blk + events[e].offset;
+		for (e = 0; e < array_size(event_names); e++) {
+			ev = &blk->events[e];

-			printf("  %12s: rid %016llx ts %llu.%08u\n",
-			       events[e].name, le64_to_cpu(ev->rid),
-			       le64_to_cpu(ev->ts.sec),
-			       le32_to_cpu(ev->ts.nsec));
+			printf("  %12s: rid %016llx term %llu ts %llu.%08u\n",
+			       event_names[e], le64_to_cpu(ev->rid), le64_to_cpu(ev->term),
+			       le64_to_cpu(ev->ts.sec), le32_to_cpu(ev->ts.nsec));
 		}
 	}

 	ret = 0;
 out:
-	free(log_addr);
+	free(blk);

 	return ret;
 }