Compare commits


25 Commits
v1.10 ... v1.11

Author SHA1 Message Date
Zach Brown
5512d5c03e v1.11 Release
Finish the release notes for the 1.11 release.

Signed-off-by: Zach Brown <zab@versity.com>
2023-02-02 11:00:38 -08:00
Zach Brown
8cf7be4651 Merge pull request #115 from versity/zab/utils_flush
Zab/utils flush
2023-02-02 10:25:12 -08:00
Zach Brown
3363b4fb79 Flush device caches in buffered util cmds
Add calls to our new device cache flushing helper in commands that use
buffered reads.

Signed-off-by: Zach Brown <zab@versity.com>
2023-01-18 10:52:02 -08:00
Zach Brown
ddb5cce2a5 Add quick utils flush_device helper
Add a quick helper that just calls cache flushing ioctls on different
kinds of files.

Signed-off-by: Zach Brown <zab@versity.com>
2023-01-18 10:27:47 -08:00
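
A minimal userspace sketch of what such a helper can look like, assuming
it issues the standard BLKFLSBUF ioctl on block devices; the
flush_device name and the exact set of ioctls are illustrative
assumptions, not necessarily the actual scoutfs-utils code:

    #include <sys/ioctl.h>
    #include <sys/stat.h>
    #include <linux/fs.h>
    #include <errno.h>

    /* sketch: invalidate a block device's kernel buffer cache so that
     * later buffered reads see blocks written by remote nodes */
    static int flush_device(int fd)
    {
            struct stat st;

            if (fstat(fd, &st) < 0)
                    return -errno;

            /* only block devices have a buffer cache to flush */
            if (S_ISBLK(st.st_mode) && ioctl(fd, BLKFLSBUF) < 0)
                    return -errno;

            return 0;
    }
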
Zach Brown
1b0e9c45f4 Merge pull request #114 from versity/zab/commit_lt_dirty
Allow replaying srch file rotation
2023-01-17 16:07:13 -08:00
Zach Brown
2e2ccb6f61 Allow replaying srch file rotation
When a client no longer needs to append to a srch file, for whatever
reason, we move the reference from the log_trees item into a specific
srch file btree item in the server's srch file tracking btree.

Zeroing the log_trees item and inserting the server's btree item are
done in a server commit and should be written atomically.

But commit_log_trees had an error handling case that could leave the
newly inserted item dirty in memory without zeroing the srch file
reference in the existing log_trees item.  Future attempts to rotate the
file reference, perhaps by retrying the commit or by reclaiming the
client's rid, would get EEXIST and fail.

This fixes the error handling path to ensure that we'll keep the dirty
srch file btree and log_trees item in sync.  The desynced items can
still exist in the world so we'll tolerate getting EEXIST on insertion.
After enough time has passed, or if repair zeroed the duplicate
reference, we could remove this special case from insertion.

Signed-off-by: Zach Brown <zab@versity.com>
2023-01-17 14:33:27 -08:00
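
Condensed from the scoutfs_srch_rotate_log hunk later in this compare,
the insertion path now tolerates a duplicate left behind by the old
error handling:

    ret = scoutfs_btree_insert(sb, alloc, wri, root, &key,
                               sfl, sizeof(*sfl));
    /* a desynced item from the old error path may already exist */
    if (WARN_ON_ONCE(ret == -EEXIST))
            ret = 0;
    if (ret == 0) {
            /* zeroing the log_trees reference keeps the two in sync */
            memset(sfl, 0, sizeof(*sfl));
            scoutfs_inc_counter(sb, srch_rotate_log);
    }
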
Zach Brown
01c8bba56d Merge pull request #109 from versity/zab/server_statfs_stable_blocks
Zab/server statfs stable blocks
2023-01-12 09:58:48 -08:00
Zach Brown
17cb1fe84b Merge pull request #110 from versity/zab/partial_alloc_move
Allow partial extent motion
2023-01-12 09:58:12 -08:00
Zach Brown
78ae87031b Merge pull request #112 from versity/zab/tmpfile_umask
Zab/tmpfile umask
2023-01-12 09:57:56 -08:00
Zach Brown
bf93ea73c4 Merge pull request #113 from versity/zab/move_blocks_loop_fixes
Fix move_blocks loop exit conditions
2023-01-12 09:56:25 -08:00
Zach Brown
a23e7478a0 Fix move_blocks loop exit conditions
The move_blocks ioctl intends to only move extents whose bytes fall
inside i_size.  This is easy except for a final extent that straddles an
i_size that isn't aligned to 4K data blocks.

The code that checked for an extent being entirely past i_size, or that
limited the number of blocks to move by i_size, clumsily compared i_size
offsets in bytes with extent counts in 4KB blocks. In just the right
circumstances, probably with the help of a byte length to move that is
much larger than i_size, the length calculation could result in trying
to move 0 blocks. Once this hit, the loop would keep finding that
extent, calculating 0 blocks to move, and would be stuck.

We fix this by clamping the count of extent blocks to move, in terms
of byte offsets, before the loop starts. This gets rid of the extra
size checks and byte offset use in the loop.  We also add a sanity check
to make sure that we can't get stuck if, say, corruption resulted in an
otherwise impossible zero length extent.

Signed-off-by: Zach Brown <zab@versity.com>
2023-01-10 09:34:52 -08:00
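
Condensed from the scoutfs_data_move_blocks hunk later in this compare,
the clamp now happens once, in blocks, before the loop; the values in
the comment are just an illustration:

    /*
     * e.g. with 4KB blocks, i_size = 10000, from_off = 8192, and a huge
     * byte_len: count clamps to ((10000 - 8192) + 4095) >> 12 = 1 block.
     */
    count = (byte_len + SCOUTFS_BLOCK_SM_MASK) >> SCOUTFS_BLOCK_SM_SHIFT;
    from_size = i_size_read(from);
    if (from_off >= from_size) {
            ret = 0;
            goto out;
    }
    if (from_off + byte_len > from_size)
            count = ((from_size - from_off) + SCOUTFS_BLOCK_SM_MASK) >>
                    SCOUTFS_BLOCK_SM_SHIFT;
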
Zach Brown
9ba2ee5c88 Add testing of O_TMPFILE umask
There were kernels that didn't apply the current umask to the modes of
inodes created with O_TMPFILE without ACLs. Let's have a test running
to make sure that we're not surprised if we come across one.

Signed-off-by: Zach Brown <zab@versity.com>
2023-01-09 14:49:23 -08:00
Zach Brown
fe33a492c2 Make o_tmpfile test more generic
The o_tmpfile test only did one thing. Clean it up a bit so we can add
more tests to the file.

Signed-off-by: Zach Brown <zab@versity.com>
2023-01-09 10:14:40 -08:00
Zach Brown
77c0ff89fb Rename stage-tmpfile to o_tmpfile
We had a one-off test that was overly specific to staging from tmpfile.
This renames it to a more generic test where we can add more tests of
o_tmpfile in general.

Signed-off-by: Zach Brown <zab@versity.com>
2023-01-09 10:07:15 -08:00
Zach Brown
7c2d83e2f8 Remove saved super block in scoutfs_sb_info
Now that we've removed its users we can remove the global saved copy of
the super block from scoutfs_sb_info.

Signed-off-by: Zach Brown <zab@versity.com>
2023-01-06 11:15:45 -08:00
Zach Brown
40aa47c888 Have the server keep a private dirty super block
As the server does its work its transactions modify a dirty super block
in memory. This used the global super block in scoutfs_sb_info, which
was visible to everything, including the client. Move the dirty super
block over to the private server info so that only the server can see
it.

This is mostly boring storage motion, but we do make one change: the
quorum code now hands the server a static copy of the quorum config to
use as it starts up, before it reads the most recent super block.

Signed-off-by: Zach Brown <zab@versity.com>
2023-01-06 11:15:45 -08:00
Zach Brown
c1bd7bcce5 Allow partial extent motion
Refilling a client's data_avail is the only alloc_move call that doesn't
try to limit the number of blocks that it dirties.  If it doesn't find
sufficiently large extents it can exhaust the server's alloc budget
without hitting the target.  It'll try to dirty blocks and return a hard
error.

This changes that behaviour to allow returning 0 if it moved any
extents.  Other callers can deal with partial progress as they already
limit the blocks they dirty. This will also return ENOSPC if it hasn't
moved anything, just as the current code would.

The result is that a data fill might not hit the target. It might take
multiple commits to fill the data_avail btree.

Signed-off-by: Zach Brown <zab@versity.com>
2022-12-15 20:47:41 -08:00
Zach Brown
7720222588 Have statfs use unlocked stable roots
The server's statfs request handler was intending to lock dirty
structures as they were walked to get sums used for statfs fields.
Other callers walk stable structures, though, so the summation calls
had grown to iterate over other structures that the server didn't know
it had to lock.

This meant that the server was walking unlocked dirty structures as they
were being modified. The races are very tight, but they can result in
request handling errors that shut down connections and IO errors from
trying to read inconsistent refs as they were modified by the locked
writer.

We've built up infrastructure so the server can now walk stable
structures just like the other callers. It will no longer wander into
dirty blocks, so it doesn't need to lock them, and it will retry if its
walk of stale data crosses a broken reference.

Signed-off-by: Zach Brown <zab@versity.com>
2022-12-12 14:59:22 -08:00
Zach Brown
fff07ce19c Use stale block read retrying helper
Transition from manual checking for persistent ESTALE to the shared
helper that we just added.  This should not change behavior.

Signed-off-by: Zach Brown <zab@versity.com>
2022-12-12 14:59:22 -08:00
Zach Brown
464de56d28 Add stale block read retrying helper
Many readers had their own little implementations of the logic that
decides whether to retry stale reads with different refs or to treat
them as persistent and return hard errors. Let's move that into a
small helper.

Signed-off-by: Zach Brown <zab@versity.com>
2022-12-12 14:59:22 -08:00
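
The callers converted in this compare all end up with the same loop;
condensed from the scoutfs_alloc_foreach hunk later in this compare:

    DECLARE_SAVED_REFS(saved);

    do {
            ret = scoutfs_read_super(sb, super);
            if (ret < 0)
                    goto out;

            ret = scoutfs_alloc_foreach_super(sb, super, cb, arg);
            /* retry with new refs on ESTALE, EIO once the refs repeat */
            ret = scoutfs_block_check_stale(sb, ret, &saved,
                                            &super->logs_root.ref,
                                            &super->srch_root.ref);
    } while (ret == -ESTALE);
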
Zach Brown
342c206550 Have scoutfs_forest_inode_count return stale reads
scoutfs_forest_inode_count() assumed it was called with stable refs and
would always translate ESTALE to EIO.  Change it so that it passes
ESTALE to the caller, who is responsible for handling it.

The server will use this to retry reading from stable supers that it's
storing in memory.

Signed-off-by: Zach Brown <zab@versity.com>
2022-12-12 14:59:22 -08:00
Zach Brown
fe4734d019 Save a full stable super in the server
The server has a mechanism for tracking the last stable roots used by
network rpcs.  We expand it a bit to include the entire super so
that we can add users in the server that want the last full stable
super.  We can still use the stable super to give out the stable
roots.

Signed-off-by: Zach Brown <zab@versity.com>
2022-12-12 14:59:22 -08:00
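
Readers copy the stable super out under a seqcount so they never see a
torn copy; condensed from the get_stable() hunk later in this compare:

    do {
            seq = read_seqcount_begin(&server->stable_seqcount);
            if (super)
                    *super = server->stable_super;
            if (roots) {
                    roots->fs_root = server->stable_super.fs_root;
                    roots->logs_root = server->stable_super.logs_root;
                    roots->srch_root = server->stable_super.srch_root;
            }
    } while (read_seqcount_retry(&server->stable_seqcount, seq));
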
Zach Brown
b1a43bb312 Make quorum config use more precise
The quorum code was using the copy of the super block in the sb info for
its config. With that going away, we make the different users
reference the config more carefully. The quorum agent has a copy that
it reads on
setup, the client rarely reads a copy when trying to connect, and the
server uses its super.

This is about data access isolation and should have no functional effect
other than to cause more super reads.

Signed-off-by: Zach Brown <zab@versity.com>
2022-12-12 14:59:22 -08:00
Zach Brown
929703213f Add fsid sbi field
A few paths throughout the code get the fsid for the current mount by
using the copy of the super block that we store in the scoutfs_sb_info
for the mount.  We'd like to remove the super block from the sbi and
it's cleaner to have a specific constant field for the fsid of the
mount, which will not change.

Signed-off-by: Zach Brown <zab@versity.com>
2022-12-12 14:59:22 -08:00
Zach Brown
78279ffb4a Merge pull request #108 from versity/zab/v1.10
v1.10 Release
2022-12-07 13:33:45 -08:00
28 changed files with 510 additions and 253 deletions


@@ -1,6 +1,29 @@
Versity ScoutFS Release Notes
=============================
---
v1.11
\
*Feb 2, 2023*
Fixed a free extent processing error that could prevent mount from
proceeding when free data extents were sufficiently fragmented. It now
properly handles very fragmented free extent maps.
Fixed a statfs server processing race that could return spurious errors
and shut down the server. With the race closed, statfs processing is
reliable.
Fixed a rare livelock in the move\_blocks ioctl. With the right
relationship between ioctl arguments and eventual file extent items the
core loop in the move\_blocks ioctl could get stuck looping on an extent
item and never return. The loop exit conditions were fixed and the loop
will always advance through all extents.
Changed the 'print' scoutfs commands to flush the block cache for the
devices. It was inconvenient to expect cache flushing to be a separate
step to ensure consistency with remote node writes.
---
v1.10
\


@@ -976,6 +976,16 @@ int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
break;
}
/* return partial if the server alloc can't dirty any more */
if (scoutfs_alloc_meta_low(sb, alloc, 50 + extent_mod_blocks(src->root.height) +
extent_mod_blocks(dst->root.height))) {
if (WARN_ON_ONCE(!moved))
ret = -ENOSPC;
else
ret = 0;
break;
}
/* searching set start/len, finish initializing alloced extent */
ext.map = found.map ? ext.start - found.start + found.map : 0;
ext.flags = found.flags;
@@ -1572,12 +1582,10 @@ out:
* call the caller's callback. This assumes that the super it's reading
* could be stale and will retry if it encounters stale blocks.
*/
int scoutfs_alloc_foreach(struct super_block *sb,
scoutfs_alloc_foreach_cb_t cb, void *arg)
int scoutfs_alloc_foreach(struct super_block *sb, scoutfs_alloc_foreach_cb_t cb, void *arg)
{
struct scoutfs_super_block *super = NULL;
struct scoutfs_block_ref stale_refs[2] = {{0,}};
struct scoutfs_block_ref refs[2] = {{0,}};
DECLARE_SAVED_REFS(saved);
int ret;
super = kmalloc(sizeof(struct scoutfs_super_block), GFP_NOFS);
@@ -1586,26 +1594,18 @@ int scoutfs_alloc_foreach(struct super_block *sb,
goto out;
}
retry:
ret = scoutfs_read_super(sb, super);
if (ret < 0)
goto out;
do {
ret = scoutfs_read_super(sb, super);
if (ret < 0)
goto out;
refs[0] = super->logs_root.ref;
refs[1] = super->srch_root.ref;
ret = scoutfs_alloc_foreach_super(sb, super, cb, arg);
ret = scoutfs_block_check_stale(sb, ret, &saved, &super->logs_root.ref,
&super->srch_root.ref);
} while (ret == -ESTALE);
ret = scoutfs_alloc_foreach_super(sb, super, cb, arg);
out:
if (ret == -ESTALE) {
if (memcmp(&stale_refs, &refs, sizeof(refs)) == 0) {
ret = -EIO;
} else {
BUILD_BUG_ON(sizeof(stale_refs) != sizeof(refs));
memcpy(stale_refs, refs, sizeof(stale_refs));
goto retry;
}
}
kfree(super);
return ret;
}


@@ -677,7 +677,7 @@ out:
int scoutfs_block_read_ref(struct super_block *sb, struct scoutfs_block_ref *ref, u32 magic,
struct scoutfs_block **bl_ret)
{
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_block_header *hdr;
struct block_private *bp = NULL;
bool retried = false;
@@ -701,7 +701,7 @@ retry:
set_bit(BLOCK_BIT_CRC_VALID, &bp->bits);
}
if (hdr->magic != cpu_to_le32(magic) || hdr->fsid != super->hdr.fsid ||
if (hdr->magic != cpu_to_le32(magic) || hdr->fsid != cpu_to_le64(sbi->fsid) ||
hdr->seq != ref->seq || hdr->blkno != ref->blkno) {
ret = -ESTALE;
goto out;
@@ -728,6 +728,36 @@ out:
return ret;
}
static bool stale_refs_match(struct scoutfs_block_ref *caller, struct scoutfs_block_ref *saved)
{
return !caller || (caller->blkno == saved->blkno && caller->seq == saved->seq);
}
/*
* Check if a read of a reference that gave ESTALE should be retried or
* should generate a hard error. If this is the second time we got
* ESTALE from the same refs then we return EIO and the caller should
* stop. As long as we keep seeing different refs we'll return ESTALE
* and the caller can keep trying.
*/
int scoutfs_block_check_stale(struct super_block *sb, int ret,
struct scoutfs_block_saved_refs *saved,
struct scoutfs_block_ref *a, struct scoutfs_block_ref *b)
{
if (ret == -ESTALE) {
if (stale_refs_match(a, &saved->refs[0]) && stale_refs_match(b, &saved->refs[1])){
ret = -EIO;
} else {
if (a)
saved->refs[0] = *a;
if (b)
saved->refs[1] = *b;
}
}
return ret;
}
void scoutfs_block_put(struct super_block *sb, struct scoutfs_block *bl)
{
if (!IS_ERR_OR_NULL(bl))
@@ -797,7 +827,7 @@ int scoutfs_block_dirty_ref(struct super_block *sb, struct scoutfs_alloc *alloc,
u32 magic, struct scoutfs_block **bl_ret,
u64 dirty_blkno, u64 *ref_blkno)
{
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_block *cow_bl = NULL;
struct scoutfs_block *bl = NULL;
struct block_private *exist_bp = NULL;
@@ -865,7 +895,7 @@ int scoutfs_block_dirty_ref(struct super_block *sb, struct scoutfs_alloc *alloc,
hdr = bl->data;
hdr->magic = cpu_to_le32(magic);
hdr->fsid = super->hdr.fsid;
hdr->fsid = cpu_to_le64(sbi->fsid);
hdr->blkno = cpu_to_le64(bl->blkno);
prandom_bytes(&hdr->seq, sizeof(hdr->seq));


@@ -13,6 +13,17 @@ struct scoutfs_block {
void *priv;
};
struct scoutfs_block_saved_refs {
struct scoutfs_block_ref refs[2];
};
#define DECLARE_SAVED_REFS(name) \
struct scoutfs_block_saved_refs name = {{{0,}}}
int scoutfs_block_check_stale(struct super_block *sb, int ret,
struct scoutfs_block_saved_refs *saved,
struct scoutfs_block_ref *a, struct scoutfs_block_ref *b);
int scoutfs_block_read_ref(struct super_block *sb, struct scoutfs_block_ref *ref, u32 magic,
struct scoutfs_block **bl_ret);
void scoutfs_block_put(struct super_block *sb, struct scoutfs_block *bl);


@@ -356,7 +356,6 @@ static int client_greeting(struct super_block *sb,
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct client_info *client = sbi->client_info;
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
struct scoutfs_net_greeting *gr = resp;
bool new_server;
int ret;
@@ -371,9 +370,9 @@ static int client_greeting(struct super_block *sb,
goto out;
}
if (gr->fsid != super->hdr.fsid) {
if (gr->fsid != cpu_to_le64(sbi->fsid)) {
scoutfs_warn(sb, "server greeting response fsid 0x%llx did not match client fsid 0x%llx",
le64_to_cpu(gr->fsid), le64_to_cpu(super->hdr.fsid));
le64_to_cpu(gr->fsid), sbi->fsid);
ret = -EINVAL;
goto out;
}
@@ -476,7 +475,6 @@ static void scoutfs_client_connect_worker(struct work_struct *work)
connect_dwork.work);
struct super_block *sb = client->sb;
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_super_block *super = &sbi->super;
struct scoutfs_mount_options opts;
struct scoutfs_net_greeting greet;
struct sockaddr_in sin;
@@ -508,7 +506,7 @@ static void scoutfs_client_connect_worker(struct work_struct *work)
goto out;
/* send a greeting to verify endpoints of each connection */
greet.fsid = super->hdr.fsid;
greet.fsid = cpu_to_le64(sbi->fsid);
greet.fmt_vers = cpu_to_le64(sbi->fmt_vers);
greet.server_term = cpu_to_le64(client->server_term);
greet.rid = cpu_to_le64(sbi->rid);


@@ -187,8 +187,6 @@
EXPAND_COUNTER(srch_search_retry_empty) \
EXPAND_COUNTER(srch_search_sorted) \
EXPAND_COUNTER(srch_search_sorted_block) \
EXPAND_COUNTER(srch_search_stale_eio) \
EXPAND_COUNTER(srch_search_stale_retry) \
EXPAND_COUNTER(srch_search_xattrs) \
EXPAND_COUNTER(srch_read_stale) \
EXPAND_COUNTER(statfs) \


@@ -1192,9 +1192,9 @@ static void truncate_inode_pages_extent(struct inode *inode, u64 start, u64 len)
* explained above the move_blocks ioctl argument structure definition.
*
* The caller has processed the ioctl args and performed the most basic
* inode checks, but we perform more detailed inode checks once we have
* the inode lock and refreshed inodes. Our job is to safely lock the
* two files and move the extents.
* argument sanity and inode checks, but we perform more detailed inode
* checks once we have the inode lock and refreshed inodes. Our job is
* to safely lock the two files and move the extents.
*/
#define MOVE_DATA_EXTENTS_PER_HOLD 16
int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
@@ -1254,6 +1254,15 @@ int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
count = (byte_len + SCOUTFS_BLOCK_SM_MASK) >> SCOUTFS_BLOCK_SM_SHIFT;
to_iblock = to_off >> SCOUTFS_BLOCK_SM_SHIFT;
/* only move extent blocks inside i_size, careful not to wrap */
from_size = i_size_read(from);
if (from_off >= from_size) {
ret = 0;
goto out;
}
if (from_off + byte_len > from_size)
count = ((from_size - from_off) + SCOUTFS_BLOCK_SM_MASK) >> SCOUTFS_BLOCK_SM_SHIFT;
if (S_ISDIR(from->i_mode) || S_ISDIR(to->i_mode)) {
ret = -EISDIR;
goto out;
@@ -1329,9 +1338,8 @@ int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
break;
}
/* only move extents within count and i_size */
if (ext.start >= from_iblock + count ||
ext.start >= i_size_read(from)) {
/* done if next extent starts after moving region */
if (ext.start >= from_iblock + count) {
done = true;
ret = 0;
break;
@@ -1339,13 +1347,15 @@ int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
from_start = max(ext.start, from_iblock);
map = ext.map + (from_start - ext.start);
len = min3(from_iblock + count,
round_up((u64)i_size_read(from),
SCOUTFS_BLOCK_SM_SIZE),
ext.start + ext.len) - from_start;
len = min(from_iblock + count, ext.start + ext.len) - from_start;
to_start = to_iblock + (from_start - from_iblock);
/* we'd get stuck, shouldn't happen */
if (WARN_ON_ONCE(len == 0)) {
ret = -EIO;
goto out;
}
if (is_stage) {
ret = scoutfs_ext_next(sb, &data_ext_ops, &to_args,
to_start, 1, &off_ext);


@@ -78,11 +78,6 @@ struct forest_refs {
struct scoutfs_block_ref logs_ref;
};
/* initialize some refs that initially aren't equal */
#define DECLARE_STALE_TRACKING_SUPER_REFS(a, b) \
struct forest_refs a = {{cpu_to_le64(0),}}; \
struct forest_refs b = {{cpu_to_le64(1),}}
struct forest_bloom_nrs {
unsigned int nrs[SCOUTFS_FOREST_BLOOM_NRS];
};
@@ -136,11 +131,11 @@ static struct scoutfs_block *read_bloom_ref(struct super_block *sb, struct scout
int scoutfs_forest_next_hint(struct super_block *sb, struct scoutfs_key *key,
struct scoutfs_key *next)
{
DECLARE_STALE_TRACKING_SUPER_REFS(prev_refs, refs);
struct scoutfs_net_roots roots;
struct scoutfs_btree_root item_root;
struct scoutfs_log_trees *lt;
SCOUTFS_BTREE_ITEM_REF(iref);
DECLARE_SAVED_REFS(saved);
struct scoutfs_key found;
struct scoutfs_key ltk;
bool checked_fs;
@@ -155,8 +150,6 @@ retry:
goto out;
trace_scoutfs_forest_using_roots(sb, &roots.fs_root, &roots.logs_root);
refs.fs_ref = roots.fs_root.ref;
refs.logs_ref = roots.logs_root.ref;
scoutfs_key_init_log_trees(&ltk, 0, 0);
checked_fs = false;
@@ -212,14 +205,10 @@ retry:
}
}
if (ret == -ESTALE) {
if (memcmp(&prev_refs, &refs, sizeof(refs)) == 0)
return -EIO;
prev_refs = refs;
ret = scoutfs_block_check_stale(sb, ret, &saved, &roots.fs_root.ref, &roots.logs_root.ref);
if (ret == -ESTALE)
goto retry;
}
out:
return ret;
}
@@ -541,9 +530,8 @@ void scoutfs_forest_dec_inode_count(struct super_block *sb)
/*
* Return the total inode count from the super block and all the
* log_btrees it references. This assumes it's working with a block
* reference hierarchy that should be fully consistent. If we see
* ESTALE we've hit persistent corruption.
* log_btrees it references. ESTALE from read blocks is returned to the
* caller who is expected to retry or return hard errors.
*/
int scoutfs_forest_inode_count(struct super_block *sb, struct scoutfs_super_block *super,
u64 *inode_count)
@@ -572,8 +560,6 @@ int scoutfs_forest_inode_count(struct super_block *sb, struct scoutfs_super_bloc
if (ret < 0) {
if (ret == -ENOENT)
ret = 0;
else if (ret == -ESTALE)
ret = -EIO;
break;
}
}


@@ -114,6 +114,7 @@ struct quorum_status {
struct quorum_info {
struct super_block *sb;
struct scoutfs_quorum_config qconf;
struct work_struct work;
struct socket *sock;
bool shutdown;
@@ -134,11 +135,18 @@ struct quorum_info {
#define DECLARE_QUORUM_INFO_KOBJ(kobj, name) \
DECLARE_QUORUM_INFO(SCOUTFS_SYSFS_ATTRS_SB(kobj), name)
static bool quorum_slot_present(struct scoutfs_super_block *super, int i)
static bool quorum_slot_present(struct scoutfs_quorum_config *qconf, int i)
{
BUG_ON(i < 0 || i > SCOUTFS_QUORUM_MAX_SLOTS);
return super->qconf.slots[i].addr.v4.family == cpu_to_le16(SCOUTFS_AF_IPV4);
return qconf->slots[i].addr.v4.family == cpu_to_le16(SCOUTFS_AF_IPV4);
}
static void quorum_slot_sin(struct scoutfs_quorum_config *qconf, int i, struct sockaddr_in *sin)
{
BUG_ON(i < 0 || i >= SCOUTFS_QUORUM_MAX_SLOTS);
scoutfs_addr_to_sin(sin, &qconf->slots[i].addr);
}
static ktime_t election_timeout(void)
@@ -160,7 +168,6 @@ static ktime_t heartbeat_timeout(void)
static int create_socket(struct super_block *sb)
{
DECLARE_QUORUM_INFO(sb, qinf);
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
struct socket *sock = NULL;
struct sockaddr_in sin;
int addrlen;
@@ -174,7 +181,7 @@ static int create_socket(struct super_block *sb)
sock->sk->sk_allocation = GFP_NOFS;
scoutfs_quorum_slot_sin(super, qinf->our_quorum_slot_nr, &sin);
quorum_slot_sin(&qinf->qconf, qinf->our_quorum_slot_nr, &sin);
addrlen = sizeof(sin);
ret = kernel_bind(sock, (struct sockaddr *)&sin, addrlen);
@@ -204,13 +211,13 @@ static __le32 quorum_message_crc(struct scoutfs_quorum_message *qmes)
static void send_msg_members(struct super_block *sb, int type, u64 term,
int only)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
DECLARE_QUORUM_INFO(sb, qinf);
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
ktime_t now;
int i;
struct scoutfs_quorum_message qmes = {
.fsid = super->hdr.fsid,
.fsid = cpu_to_le64(sbi->fsid),
.term = cpu_to_le64(term),
.type = type,
.from = qinf->our_quorum_slot_nr,
@@ -234,11 +241,11 @@ static void send_msg_members(struct super_block *sb, int type, u64 term,
for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
if (!quorum_slot_present(super, i) ||
if (!quorum_slot_present(&qinf->qconf, i) ||
(only >= 0 && i != only) || i == qinf->our_quorum_slot_nr)
continue;
scoutfs_quorum_slot_sin(super, i, &sin);
scoutfs_quorum_slot_sin(&qinf->qconf, i, &sin);
now = ktime_get();
kernel_sendmsg(qinf->sock, &mh, &kv, 1, kv.iov_len);
@@ -266,7 +273,7 @@ static int recv_msg(struct super_block *sb, struct quorum_host_msg *msg,
ktime_t abs_to)
{
DECLARE_QUORUM_INFO(sb, qinf);
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_quorum_message qmes;
struct timeval tv;
ktime_t rel_to;
@@ -309,10 +316,10 @@ static int recv_msg(struct super_block *sb, struct quorum_host_msg *msg,
if (ret != sizeof(qmes) ||
qmes.crc != quorum_message_crc(&qmes) ||
qmes.fsid != super->hdr.fsid ||
qmes.fsid != cpu_to_le64(sbi->fsid) ||
qmes.type >= SCOUTFS_QUORUM_MSG_INVALID ||
qmes.from >= SCOUTFS_QUORUM_MAX_SLOTS ||
!quorum_slot_present(super, qmes.from)) {
!quorum_slot_present(&qinf->qconf, qmes.from)) {
/* should we be trying to open a new socket? */
scoutfs_inc_counter(sb, quorum_recv_invalid);
return -EAGAIN;
@@ -342,7 +349,7 @@ static int read_quorum_block(struct super_block *sb, u64 blkno, struct scoutfs_q
bool check_rid)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_super_block *super = &sbi->super;
const u64 fsid = sbi->fsid;
const u64 rid = sbi->rid;
char msg[150];
__le32 crc;
@@ -367,9 +374,9 @@ static int read_quorum_block(struct super_block *sb, u64 blkno, struct scoutfs_q
else if (le32_to_cpu(blk->hdr.magic) != SCOUTFS_BLOCK_MAGIC_QUORUM)
snprintf(msg, sizeof(msg), "blk magic %08x != %08x",
le32_to_cpu(blk->hdr.magic), SCOUTFS_BLOCK_MAGIC_QUORUM);
else if (blk->hdr.fsid != super->hdr.fsid)
else if (blk->hdr.fsid != cpu_to_le64(fsid))
snprintf(msg, sizeof(msg), "blk fsid %016llx != %016llx",
le64_to_cpu(blk->hdr.fsid), le64_to_cpu(super->hdr.fsid));
le64_to_cpu(blk->hdr.fsid), fsid);
else if (le64_to_cpu(blk->hdr.blkno) != blkno)
snprintf(msg, sizeof(msg), "blk blkno %llu != %llu",
le64_to_cpu(blk->hdr.blkno), blkno);
@@ -410,8 +417,7 @@ out:
*/
static void read_greatest_term(struct super_block *sb, u64 *term)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_super_block *super = &sbi->super;
DECLARE_QUORUM_INFO(sb, qinf);
struct scoutfs_quorum_block blk;
int ret;
int e;
@@ -420,7 +426,7 @@ static void read_greatest_term(struct super_block *sb, u64 *term)
*term = 0;
for (s = 0; s < SCOUTFS_QUORUM_MAX_SLOTS; s++) {
if (!quorum_slot_present(super, s))
if (!quorum_slot_present(&qinf->qconf, s))
continue;
ret = read_quorum_block(sb, SCOUTFS_QUORUM_BLKNO + s, &blk, false);
@@ -514,14 +520,15 @@ static int update_quorum_block(struct super_block *sb, int event, u64 term, bool
* keeps us from being fenced while we allow userspace fencing to take a
* reasonably long time. We still want to timeout eventually.
*/
int scoutfs_quorum_fence_leaders(struct super_block *sb, u64 term)
int scoutfs_quorum_fence_leaders(struct super_block *sb, struct scoutfs_quorum_config *qconf,
u64 term)
{
#define NR_OLD 2
struct scoutfs_quorum_block_event old[SCOUTFS_QUORUM_MAX_SLOTS][NR_OLD] = {{{0,}}};
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_super_block *super = &sbi->super;
struct scoutfs_quorum_block blk;
struct sockaddr_in sin;
const __le64 lefsid = cpu_to_le64(sbi->fsid);
const u64 rid = sbi->rid;
bool fence_started = false;
u64 fenced = 0;
@@ -534,7 +541,7 @@ int scoutfs_quorum_fence_leaders(struct super_block *sb, u64 term)
BUILD_BUG_ON(SCOUTFS_QUORUM_BLOCKS < SCOUTFS_QUORUM_MAX_SLOTS);
for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
if (!quorum_slot_present(super, i))
if (!quorum_slot_present(qconf, i))
continue;
ret = read_quorum_block(sb, SCOUTFS_QUORUM_BLKNO + i, &blk, false);
@@ -567,11 +574,11 @@ int scoutfs_quorum_fence_leaders(struct super_block *sb, u64 term)
continue;
scoutfs_inc_counter(sb, quorum_fence_leader);
scoutfs_quorum_slot_sin(super, i, &sin);
quorum_slot_sin(qconf, i, &sin);
fence_rid = old[i][j].rid;
scoutfs_info(sb, "fencing previous leader "SCSBF" at term %llu in slot %u with address "SIN_FMT,
SCSB_LEFR_ARGS(super->hdr.fsid, fence_rid),
SCSB_LEFR_ARGS(lefsid, fence_rid),
le64_to_cpu(old[i][j].term), i, SIN_ARG(&sin));
ret = scoutfs_fence_start(sb, le64_to_cpu(fence_rid), sin.sin_addr.s_addr,
SCOUTFS_FENCE_QUORUM_BLOCK_LEADER);
@@ -752,7 +759,7 @@ static void scoutfs_quorum_worker(struct work_struct *work)
qst.server_start_term = qst.term;
qst.server_event = SCOUTFS_QUORUM_EVENT_ELECT;
scoutfs_server_start(sb, qst.term);
scoutfs_server_start(sb, &qinf->qconf, qst.term);
}
/*
@@ -877,16 +884,25 @@ out:
*/
int scoutfs_quorum_server_sin(struct super_block *sb, struct sockaddr_in *sin)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_super_block *super = &sbi->super;
struct scoutfs_super_block *super = NULL;
struct scoutfs_quorum_block blk;
u64 elect_term;
u64 term = 0;
int ret = 0;
int i;
super = kmalloc(sizeof(struct scoutfs_super_block), GFP_NOFS);
if (!super) {
ret = -ENOMEM;
goto out;
}
ret = scoutfs_read_super(sb, super);
if (ret)
goto out;
for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
if (!quorum_slot_present(super, i))
if (!quorum_slot_present(&super->qconf, i))
continue;
ret = read_quorum_block(sb, SCOUTFS_QUORUM_BLKNO + i, &blk, false);
@@ -900,7 +916,7 @@ int scoutfs_quorum_server_sin(struct super_block *sb, struct sockaddr_in *sin)
if (elect_term > term &&
elect_term > le64_to_cpu(blk.events[SCOUTFS_QUORUM_EVENT_STOP].term)) {
term = elect_term;
scoutfs_quorum_slot_sin(super, i, sin);
scoutfs_quorum_slot_sin(&super->qconf, i, sin);
continue;
}
}
@@ -909,6 +925,7 @@ int scoutfs_quorum_server_sin(struct super_block *sb, struct sockaddr_in *sin)
ret = -ENOENT;
out:
kfree(super);
return ret;
}
@@ -924,12 +941,9 @@ u8 scoutfs_quorum_votes_needed(struct super_block *sb)
return qinf->votes_needed;
}
void scoutfs_quorum_slot_sin(struct scoutfs_super_block *super, int i,
struct sockaddr_in *sin)
void scoutfs_quorum_slot_sin(struct scoutfs_quorum_config *qconf, int i, struct sockaddr_in *sin)
{
BUG_ON(i < 0 || i >= SCOUTFS_QUORUM_MAX_SLOTS);
scoutfs_addr_to_sin(sin, &super->qconf.slots[i].addr);
return quorum_slot_sin(qconf, i, sin);
}
static char *role_str(int role)
@@ -1060,11 +1074,10 @@ static inline bool valid_ipv4_port(__be16 port)
return port != 0 && be16_to_cpu(port) != U16_MAX;
}
static int verify_quorum_slots(struct super_block *sb)
static int verify_quorum_slots(struct super_block *sb, struct quorum_info *qinf,
struct scoutfs_quorum_config *qconf)
{
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
char slots[(SCOUTFS_QUORUM_MAX_SLOTS * 3) + 1];
DECLARE_QUORUM_INFO(sb, qinf);
struct sockaddr_in other;
struct sockaddr_in sin;
int found = 0;
@@ -1074,10 +1087,10 @@ static int verify_quorum_slots(struct super_block *sb)
for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
if (!quorum_slot_present(super, i))
if (!quorum_slot_present(qconf, i))
continue;
scoutfs_quorum_slot_sin(super, i, &sin);
scoutfs_quorum_slot_sin(qconf, i, &sin);
if (!valid_ipv4_unicast(sin.sin_addr.s_addr)) {
scoutfs_err(sb, "quorum slot #%d has invalid ipv4 unicast address: "SIN_FMT,
@@ -1092,10 +1105,10 @@ static int verify_quorum_slots(struct super_block *sb)
}
for (j = i + 1; j < SCOUTFS_QUORUM_MAX_SLOTS; j++) {
if (!quorum_slot_present(super, j))
if (!quorum_slot_present(qconf, j))
continue;
scoutfs_quorum_slot_sin(super, j, &other);
scoutfs_quorum_slot_sin(qconf, j, &other);
if (sin.sin_addr.s_addr == other.sin_addr.s_addr &&
sin.sin_port == other.sin_port) {
@@ -1113,11 +1126,11 @@ static int verify_quorum_slots(struct super_block *sb)
return -EINVAL;
}
if (!quorum_slot_present(super, qinf->our_quorum_slot_nr)) {
if (!quorum_slot_present(qconf, qinf->our_quorum_slot_nr)) {
char *str = slots;
*str = '\0';
for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
if (quorum_slot_present(super, i)) {
if (quorum_slot_present(qconf, i)) {
ret = snprintf(str, &slots[ARRAY_SIZE(slots)] - str, "%c%u",
str == slots ? ' ' : ',', i);
if (ret < 2 || ret > 3) {
@@ -1141,16 +1154,22 @@ static int verify_quorum_slots(struct super_block *sb)
else
qinf->votes_needed = (found / 2) + 1;
qinf->qconf = *qconf;
return 0;
}
/*
* Once this schedules the quorum worker it can be elected leader and
* start the server, possibly before this returns.
* start the server, possibly before this returns. The quorum agent
* would be responsible for tracking the quorum config in the super
block if it changes. Until then it uses a static config that it reads
* during setup.
*/
int scoutfs_quorum_setup(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_super_block *super = NULL;
struct scoutfs_mount_options opts;
struct quorum_info *qinf;
int ret;
@@ -1160,7 +1179,9 @@ int scoutfs_quorum_setup(struct super_block *sb)
return 0;
qinf = kzalloc(sizeof(struct quorum_info), GFP_KERNEL);
if (!qinf) {
super = kmalloc(sizeof(struct scoutfs_super_block), GFP_KERNEL);
if (!qinf || !super) {
kfree(qinf);
ret = -ENOMEM;
goto out;
}
@@ -1174,7 +1195,11 @@ int scoutfs_quorum_setup(struct super_block *sb)
sbi->quorum_info = qinf;
qinf->sb = sb;
ret = verify_quorum_slots(sb);
ret = scoutfs_read_super(sb, super);
if (ret < 0)
goto out;
ret = verify_quorum_slots(sb, qinf, &super->qconf);
if (ret < 0)
goto out;
@@ -1194,6 +1219,7 @@ out:
if (ret)
scoutfs_quorum_destroy(sb);
kfree(super);
return ret;
}


@@ -4,10 +4,11 @@
int scoutfs_quorum_server_sin(struct super_block *sb, struct sockaddr_in *sin);
u8 scoutfs_quorum_votes_needed(struct super_block *sb);
void scoutfs_quorum_slot_sin(struct scoutfs_super_block *super, int i,
void scoutfs_quorum_slot_sin(struct scoutfs_quorum_config *qconf, int i,
struct sockaddr_in *sin);
int scoutfs_quorum_fence_leaders(struct super_block *sb, u64 term);
int scoutfs_quorum_fence_leaders(struct super_block *sb, struct scoutfs_quorum_config *qconf,
u64 term);
int scoutfs_quorum_setup(struct super_block *sb);
void scoutfs_quorum_shutdown(struct super_block *sb);


@@ -130,9 +130,9 @@ struct server_info {
struct mutex srch_mutex;
struct mutex mounted_clients_mutex;
/* stable versions stored from commits, given in locks and rpcs */
seqcount_t roots_seqcount;
struct scoutfs_net_roots roots;
/* stable super stored from commits, given in locks and rpcs */
seqcount_t stable_seqcount;
struct scoutfs_super_block stable_super;
/* serializing and get and set volume options */
seqcount_t volopt_seqcount;
@@ -143,11 +143,18 @@ struct server_info {
struct work_struct fence_pending_recov_work;
/* while running we check for fenced mounts to reclaim */
struct delayed_work reclaim_dwork;
/* a running server gets a static quorum config from quorum as it starts */
struct scoutfs_quorum_config qconf;
/* a running server maintains a private dirty super */
struct scoutfs_super_block dirty_super;
};
#define DECLARE_SERVER_INFO(sb, name) \
struct server_info *name = SCOUTFS_SB(sb)->server_info
#define DIRTY_SUPER_SB(sb) (&SCOUTFS_SB(sb)->server_info->dirty_super)
/*
* The server tracks each connected client.
*/
@@ -469,16 +476,22 @@ static void commit_end(struct super_block *sb, struct commit_users *cusers, int
wake_up(&cusers->waitq);
}
static void get_roots(struct super_block *sb,
struct scoutfs_net_roots *roots)
static void get_stable(struct super_block *sb, struct scoutfs_super_block *super,
struct scoutfs_net_roots *roots)
{
DECLARE_SERVER_INFO(sb, server);
unsigned int seq;
do {
seq = read_seqcount_begin(&server->roots_seqcount);
*roots = server->roots;
} while (read_seqcount_retry(&server->roots_seqcount, seq));
seq = read_seqcount_begin(&server->stable_seqcount);
if (super)
*super = server->stable_super;
if (roots) {
roots->fs_root = server->stable_super.fs_root;
roots->logs_root = server->stable_super.logs_root;
roots->srch_root = server->stable_super.srch_root;
}
} while (read_seqcount_retry(&server->stable_seqcount, seq));
}
u64 scoutfs_server_seq(struct super_block *sb)
@@ -510,17 +523,12 @@ void scoutfs_server_set_seq_if_greater(struct super_block *sb, u64 seq)
}
}
static void set_roots(struct server_info *server,
struct scoutfs_btree_root *fs_root,
struct scoutfs_btree_root *logs_root,
struct scoutfs_btree_root *srch_root)
static void set_stable_super(struct server_info *server, struct scoutfs_super_block *super)
{
preempt_disable();
write_seqcount_begin(&server->roots_seqcount);
server->roots.fs_root = *fs_root;
server->roots.logs_root = *logs_root;
server->roots.srch_root = *srch_root;
write_seqcount_end(&server->roots_seqcount);
write_seqcount_begin(&server->stable_seqcount);
server->stable_super = *super;
write_seqcount_end(&server->stable_seqcount);
preempt_enable();
}
@@ -545,7 +553,7 @@ static void scoutfs_server_commit_func(struct work_struct *work)
struct server_info *server = container_of(work, struct server_info,
commit_work);
struct super_block *sb = server->sb;
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
struct commit_users *cusers = &server->cusers;
int ret;
@@ -603,8 +611,7 @@ static void scoutfs_server_commit_func(struct work_struct *work)
goto out;
}
set_roots(server, &super->fs_root, &super->logs_root,
&super->srch_root);
set_stable_super(server, super);
/* swizzle the active and idle server alloc/freed heads */
server->other_ind ^= 1;
@@ -641,7 +648,7 @@ static int server_alloc_inodes(struct super_block *sb,
u8 cmd, u64 id, void *arg, u16 arg_len)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_super_block *super = &sbi->super;
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
struct scoutfs_net_inode_alloc ial = { 0, };
COMMIT_HOLD(hold);
__le64 lecount;
@@ -809,7 +816,7 @@ static void mod_bitmap_bits(__le64 *dst, u64 dst_zone_blocks,
static int get_data_alloc_zone_bits(struct super_block *sb, u64 rid, __le64 *exclusive,
__le64 *vacant, u64 zone_blocks)
{
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
SCOUTFS_BTREE_ITEM_REF(iref);
struct scoutfs_log_trees *lt;
struct scoutfs_key key;
@@ -1040,7 +1047,7 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
u64 rid, struct commit_hold *hold)
{
struct server_info *server = SCOUTFS_SB(sb)->server_info;
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
struct scoutfs_log_merge_status stat;
struct scoutfs_log_merge_range rng;
struct scoutfs_log_trees each_lt;
@@ -1242,7 +1249,7 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
static void try_drain_data_freed(struct super_block *sb, struct scoutfs_log_trees *lt)
{
DECLARE_SERVER_INFO(sb, server);
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
const u64 rid = le64_to_cpu(lt->rid);
const u64 nr = le64_to_cpu(lt->nr);
struct scoutfs_log_trees drain;
@@ -1329,7 +1336,7 @@ static int server_get_log_trees(struct super_block *sb,
struct scoutfs_net_connection *conn,
u8 cmd, u64 id, void *arg, u16 arg_len)
{
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
u64 rid = scoutfs_net_client_rid(conn);
DECLARE_SERVER_INFO(sb, server);
__le64 exclusive[SCOUTFS_DATA_ALLOC_ZONE_LE64S];
@@ -1524,7 +1531,7 @@ static int server_commit_log_trees(struct super_block *sb,
struct scoutfs_net_connection *conn,
u8 cmd, u64 id, void *arg, u16 arg_len)
{
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
const u64 rid = scoutfs_net_client_rid(conn);
DECLARE_SERVER_INFO(sb, server);
SCOUTFS_BTREE_ITEM_REF(iref);
@@ -1579,6 +1586,13 @@ static int server_commit_log_trees(struct super_block *sb,
if (ret < 0 || committed)
goto unlock;
/* make sure _update succeeds before we modify srch items */
ret = scoutfs_btree_dirty(sb, &server->alloc, &server->wri, &super->logs_root, &key);
if (ret < 0) {
err_str = "dirtying lt item";
goto unlock;
}
/* try to rotate the srch log when big enough */
mutex_lock(&server->srch_mutex);
ret = scoutfs_srch_rotate_log(sb, &server->alloc, &server->wri,
@@ -1593,6 +1607,7 @@ static int server_commit_log_trees(struct super_block *sb,
ret = scoutfs_btree_update(sb, &server->alloc, &server->wri,
&super->logs_root, &key, &lt, sizeof(lt));
BUG_ON(ret < 0); /* dirtying should have guaranteed success */
if (ret < 0)
err_str = "updating log trees item";
@@ -1624,7 +1639,7 @@ static int server_get_roots(struct super_block *sb,
memset(&roots, 0, sizeof(roots));
ret = -EINVAL;
} else {
get_roots(sb, &roots);
get_stable(sb, NULL, &roots);
ret = 0;
}
@@ -1654,7 +1669,7 @@ static int server_get_roots(struct super_block *sb,
*/
static int reclaim_open_log_tree(struct super_block *sb, u64 rid)
{
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
DECLARE_SERVER_INFO(sb, server);
SCOUTFS_BTREE_ITEM_REF(iref);
struct scoutfs_log_trees lt;
@@ -1751,9 +1766,8 @@ out:
*/
static int get_stable_trans_seq(struct super_block *sb, u64 *last_seq_ret)
{
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
DECLARE_SERVER_INFO(sb, server);
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_super_block *super = &sbi->super;
SCOUTFS_BTREE_ITEM_REF(iref);
struct scoutfs_log_trees *lt;
struct scoutfs_key key;
@@ -1909,9 +1923,8 @@ static int server_srch_get_compact(struct super_block *sb,
u8 cmd, u64 id, void *arg, u16 arg_len)
{
DECLARE_SERVER_INFO(sb, server);
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
u64 rid = scoutfs_net_client_rid(conn);
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_super_block *super = &sbi->super;
struct scoutfs_srch_compact *sc = NULL;
COMMIT_HOLD(hold);
int ret;
@@ -1976,8 +1989,7 @@ static int server_srch_commit_compact(struct super_block *sb,
{
DECLARE_SERVER_INFO(sb, server);
u64 rid = scoutfs_net_client_rid(conn);
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_super_block *super = &sbi->super;
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
struct scoutfs_srch_compact *sc;
struct scoutfs_alloc_list_head av;
struct scoutfs_alloc_list_head fr;
@@ -2052,8 +2064,7 @@ static int splice_log_merge_completions(struct super_block *sb,
bool no_ranges)
{
struct server_info *server = SCOUTFS_SB(sb)->server_info;
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_super_block *super = &sbi->super;
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
struct scoutfs_log_merge_complete comp;
struct scoutfs_log_merge_freeing fr;
struct scoutfs_log_merge_range rng;
@@ -2370,7 +2381,7 @@ static void server_log_merge_free_work(struct work_struct *work)
struct server_info *server = container_of(work, struct server_info,
log_merge_free_work);
struct super_block *sb = server->sb;
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
struct scoutfs_log_merge_freeing fr;
struct scoutfs_key key;
COMMIT_HOLD(hold);
@@ -2462,8 +2473,7 @@ static int server_get_log_merge(struct super_block *sb,
{
DECLARE_SERVER_INFO(sb, server);
u64 rid = scoutfs_net_client_rid(conn);
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_super_block *super = &sbi->super;
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
struct scoutfs_log_merge_status stat;
struct scoutfs_log_merge_range rng;
struct scoutfs_log_merge_range remain;
@@ -2746,8 +2756,7 @@ static int server_commit_log_merge(struct super_block *sb,
{
DECLARE_SERVER_INFO(sb, server);
u64 rid = scoutfs_net_client_rid(conn);
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_super_block *super = &sbi->super;
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
struct scoutfs_log_merge_request orig_req;
struct scoutfs_log_merge_complete *comp;
struct scoutfs_log_merge_status stat;
@@ -2982,7 +2991,7 @@ static int server_set_volopt(struct super_block *sb, struct scoutfs_net_connecti
u8 cmd, u64 id, void *arg, u16 arg_len)
{
DECLARE_SERVER_INFO(sb, server);
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
struct scoutfs_volume_options *volopt;
COMMIT_HOLD(hold);
u64 opt;
@@ -3051,7 +3060,7 @@ static int server_clear_volopt(struct super_block *sb, struct scoutfs_net_connec
u8 cmd, u64 id, void *arg, u16 arg_len)
{
DECLARE_SERVER_INFO(sb, server);
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
struct scoutfs_volume_options *volopt;
COMMIT_HOLD(hold);
__le64 *opt;
@@ -3105,7 +3114,7 @@ static int server_resize_devices(struct super_block *sb, struct scoutfs_net_conn
{
DECLARE_SERVER_INFO(sb, server);
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
struct scoutfs_net_resize_devices *nrd;
COMMIT_HOLD(hold);
u64 meta_tot;
@@ -3212,16 +3221,19 @@ static int count_free_blocks(struct super_block *sb, void *arg, int owner,
}
/*
* We calculate the total inode count and free blocks from the current in-memory dirty
* versions of the super block and log_trees structs, so we have to lock them.
* We calculate the total inode count and free blocks from the last
* stable super that was written. Other users also walk stable blocks
* so by joining them we don't have to worry about ensuring that we've
* locked all the dirty structures that the summations could reference.
* We handle stale reads by retrying with the most recent stable super.
*/
static int server_statfs(struct super_block *sb, struct scoutfs_net_connection *conn,
u8 cmd, u64 id, void *arg, u16 arg_len)
{
DECLARE_SERVER_INFO(sb, server);
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
struct scoutfs_super_block super;
struct scoutfs_net_statfs nst = {{0,}};
struct statfs_free_blocks sfb = {0,};
DECLARE_SAVED_REFS(saved);
u64 inode_count;
int ret;
@@ -3230,24 +3242,24 @@ static int server_statfs(struct super_block *sb, struct scoutfs_net_connection *
goto out;
}
mutex_lock(&server->alloc_mutex);
ret = scoutfs_alloc_foreach_super(sb, super, count_free_blocks, &sfb);
mutex_unlock(&server->alloc_mutex);
if (ret < 0)
goto out;
do {
get_stable(sb, &super, NULL);
mutex_lock(&server->logs_mutex);
ret = scoutfs_forest_inode_count(sb, super, &inode_count);
mutex_unlock(&server->logs_mutex);
if (ret < 0)
goto out;
ret = scoutfs_alloc_foreach_super(sb, &super, count_free_blocks, &sfb) ?:
scoutfs_forest_inode_count(sb, &super, &inode_count);
if (ret < 0 && ret != -ESTALE)
goto out;
BUILD_BUG_ON(sizeof(nst.uuid) != sizeof(super->uuid));
memcpy(nst.uuid, super->uuid, sizeof(nst.uuid));
ret = scoutfs_block_check_stale(sb, ret, &saved, &super.logs_root.ref,
&super.srch_root.ref);
} while (ret == -ESTALE);
BUILD_BUG_ON(sizeof(nst.uuid) != sizeof(super.uuid));
memcpy(nst.uuid, super.uuid, sizeof(nst.uuid));
nst.free_meta_blocks = cpu_to_le64(sfb.meta);
nst.total_meta_blocks = super->total_meta_blocks;
nst.total_meta_blocks = super.total_meta_blocks;
nst.free_data_blocks = cpu_to_le64(sfb.data);
nst.total_data_blocks = super->total_data_blocks;
nst.total_data_blocks = super.total_data_blocks;
nst.inode_count = cpu_to_le64(inode_count);
ret = 0;
@@ -3278,7 +3290,7 @@ static int insert_mounted_client(struct super_block *sb, u64 rid, u64 gr_flags,
struct sockaddr_in *sin)
{
DECLARE_SERVER_INFO(sb, server);
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
struct scoutfs_mounted_client_btree_val mcv;
struct scoutfs_key key;
int ret;
@@ -3304,7 +3316,7 @@ static int lookup_mounted_client_addr(struct super_block *sb, u64 rid,
union scoutfs_inet_addr *addr)
{
DECLARE_SERVER_INFO(sb, server);
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
struct scoutfs_mounted_client_btree_val *mcv;
SCOUTFS_BTREE_ITEM_REF(iref);
struct scoutfs_key key;
@@ -3338,7 +3350,7 @@ static int lookup_mounted_client_addr(struct super_block *sb, u64 rid,
static int delete_mounted_client(struct super_block *sb, u64 rid)
{
DECLARE_SERVER_INFO(sb, server);
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
struct scoutfs_key key;
int ret;
@@ -3362,7 +3374,7 @@ static int delete_mounted_client(struct super_block *sb, u64 rid)
static int cancel_srch_compact(struct super_block *sb, u64 rid)
{
DECLARE_SERVER_INFO(sb, server);
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
struct scoutfs_alloc_list_head av;
struct scoutfs_alloc_list_head fr;
int ret;
@@ -3414,7 +3426,7 @@ static int cancel_srch_compact(struct super_block *sb, u64 rid)
static int cancel_log_merge(struct super_block *sb, u64 rid)
{
DECLARE_SERVER_INFO(sb, server);
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
struct scoutfs_log_merge_status stat;
struct scoutfs_log_merge_request req;
struct scoutfs_log_merge_range rng;
@@ -3538,7 +3550,7 @@ static int server_greeting(struct super_block *sb,
u8 cmd, u64 id, void *arg, u16 arg_len)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_super_block *super = &sbi->super;
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
struct scoutfs_net_greeting *gr = arg;
struct scoutfs_net_greeting greet;
DECLARE_SERVER_INFO(sb, server);
@@ -3554,10 +3566,9 @@ static int server_greeting(struct super_block *sb,
goto send_err;
}
if (gr->fsid != super->hdr.fsid) {
if (gr->fsid != cpu_to_le64(sbi->fsid)) {
scoutfs_warn(sb, "client rid %016llx greeting fsid 0x%llx did not match server fsid 0x%llx",
le64_to_cpu(gr->rid), le64_to_cpu(gr->fsid),
le64_to_cpu(super->hdr.fsid));
le64_to_cpu(gr->rid), le64_to_cpu(gr->fsid), sbi->fsid);
ret = -EINVAL;
goto send_err;
}
@@ -3697,7 +3708,7 @@ static void farewell_worker(struct work_struct *work)
struct server_info *server = container_of(work, struct server_info,
farewell_work);
struct super_block *sb = server->sb;
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
struct scoutfs_mounted_client_btree_val *mcv;
struct farewell_request *tmp;
struct farewell_request *fw;
@@ -4059,7 +4070,7 @@ static void recovery_timeout(struct super_block *sb)
static int start_recovery(struct super_block *sb)
{
DECLARE_SERVER_INFO(sb, server);
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
SCOUTFS_BTREE_ITEM_REF(iref);
struct scoutfs_key key;
unsigned int nr = 0;
@@ -4176,8 +4187,7 @@ static void scoutfs_server_worker(struct work_struct *work)
struct server_info *server = container_of(work, struct server_info,
work);
struct super_block *sb = server->sb;
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_super_block *super = &sbi->super;
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
struct scoutfs_net_connection *conn = NULL;
struct scoutfs_mount_options opts;
DECLARE_WAIT_QUEUE_HEAD(waitq);
@@ -4189,13 +4199,13 @@ static void scoutfs_server_worker(struct work_struct *work)
trace_scoutfs_server_work_enter(sb, 0, 0);
scoutfs_options_read(sb, &opts);
scoutfs_quorum_slot_sin(super, opts.quorum_slot_nr, &sin);
scoutfs_quorum_slot_sin(&server->qconf, opts.quorum_slot_nr, &sin);
scoutfs_info(sb, "server starting at "SIN_FMT, SIN_ARG(&sin));
scoutfs_block_writer_init(sb, &server->wri);
/* first make sure no other servers are still running */
ret = scoutfs_quorum_fence_leaders(sb, server->term);
ret = scoutfs_quorum_fence_leaders(sb, &server->qconf, server->term);
if (ret < 0) {
scoutfs_err(sb, "server error %d attempting to fence previous leaders", ret);
goto out;
@@ -4231,8 +4241,7 @@ static void scoutfs_server_worker(struct work_struct *work)
write_seqcount_end(&server->volopt_seqcount);
atomic64_set(&server->seq_atomic, le64_to_cpu(super->seq));
set_roots(server, &super->fs_root, &super->logs_root,
&super->srch_root);
set_stable_super(server, super);
/* prepare server alloc for this transaction, larger first */
if (le64_to_cpu(super->server_meta_avail[0].total_nr) <
@@ -4326,11 +4335,12 @@ out:
/*
* Start the server but don't wait for it to complete.
*/
void scoutfs_server_start(struct super_block *sb, u64 term)
void scoutfs_server_start(struct super_block *sb, struct scoutfs_quorum_config *qconf, u64 term)
{
DECLARE_SERVER_INFO(sb, server);
if (cmpxchg(&server->status, SERVER_DOWN, SERVER_STARTING) == SERVER_DOWN) {
server->qconf = *qconf;
server->term = term;
queue_work(server->wq, &server->work);
}
@@ -4382,7 +4392,7 @@ int scoutfs_server_setup(struct super_block *sb)
INIT_WORK(&server->log_merge_free_work, server_log_merge_free_work);
mutex_init(&server->srch_mutex);
mutex_init(&server->mounted_clients_mutex);
seqcount_init(&server->roots_seqcount);
seqcount_init(&server->stable_seqcount);
seqcount_init(&server->volopt_seqcount);
mutex_init(&server->volopt_mutex);
INIT_WORK(&server->fence_pending_recov_work, fence_pending_recov_worker);


@@ -75,7 +75,7 @@ u64 scoutfs_server_seq(struct super_block *sb);
u64 scoutfs_server_next_seq(struct super_block *sb);
void scoutfs_server_set_seq_if_greater(struct super_block *sb, u64 seq);
void scoutfs_server_start(struct super_block *sb, u64 term);
void scoutfs_server_start(struct super_block *sb, struct scoutfs_quorum_config *qconf, u64 term);
void scoutfs_server_stop(struct super_block *sb);
void scoutfs_server_stop_wait(struct super_block *sb);
bool scoutfs_server_is_running(struct super_block *sb);


@@ -861,7 +861,6 @@ int scoutfs_srch_search_xattrs(struct super_block *sb,
struct scoutfs_srch_rb_root *sroot,
u64 hash, u64 ino, u64 last_ino, bool *done)
{
struct scoutfs_net_roots prev_roots;
struct scoutfs_net_roots roots;
struct scoutfs_srch_entry start;
struct scoutfs_srch_entry end;
@@ -869,6 +868,7 @@ int scoutfs_srch_search_xattrs(struct super_block *sb,
struct scoutfs_log_trees lt;
struct scoutfs_srch_file sfl;
SCOUTFS_BTREE_ITEM_REF(iref);
DECLARE_SAVED_REFS(saved);
struct scoutfs_key key;
unsigned long limit = SRCH_LIMIT;
int ret;
@@ -877,7 +877,6 @@ int scoutfs_srch_search_xattrs(struct super_block *sb,
*done = false;
srch_init_rb_root(sroot);
memset(&prev_roots, 0, sizeof(prev_roots));
start.hash = cpu_to_le64(hash);
start.ino = cpu_to_le64(ino);
@@ -892,7 +891,6 @@ retry:
ret = scoutfs_client_get_roots(sb, &roots);
if (ret)
goto out;
memset(&roots.fs_root, 0, sizeof(roots.fs_root));
end = final;
@@ -968,16 +966,10 @@ retry:
*done = sre_cmp(&end, &final) == 0;
ret = 0;
out:
if (ret == -ESTALE) {
if (memcmp(&prev_roots, &roots, sizeof(roots)) == 0) {
scoutfs_inc_counter(sb, srch_search_stale_eio);
ret = -EIO;
} else {
scoutfs_inc_counter(sb, srch_search_stale_retry);
prev_roots = roots;
goto retry;
}
}
ret = scoutfs_block_check_stale(sb, ret, &saved, &roots.srch_root.ref,
&roots.logs_root.ref);
if (ret == -ESTALE)
goto retry;
return ret;
}
@@ -1003,6 +995,14 @@ int scoutfs_srch_rotate_log(struct super_block *sb,
le64_to_cpu(sfl->ref.blkno), 0);
ret = scoutfs_btree_insert(sb, alloc, wri, root, &key,
sfl, sizeof(*sfl));
/*
* While it's fine to replay moving the client's logging srch
* file to the core btree item, server commits should keep it
* from happening. So we'll warn if we see it happen. This can
* be removed eventually.
*/
if (WARN_ON_ONCE(ret == -EEXIST))
ret = 0;
if (ret == 0) {
memset(sfl, 0, sizeof(*sfl));
scoutfs_inc_counter(sb, srch_rotate_log);


@@ -461,9 +461,8 @@ static int scoutfs_read_supers(struct super_block *sb)
goto out;
}
sbi->fsid = le64_to_cpu(meta_super->hdr.fsid);
sbi->fmt_vers = le64_to_cpu(meta_super->fmt_vers);
sbi->super = *meta_super;
out:
kfree(meta_super);
kfree(data_super);


@@ -35,11 +35,10 @@ struct scoutfs_sb_info {
struct super_block *sb;
/* assigned once at the start of each mount, read-only */
u64 fsid;
u64 rid;
u64 fmt_vers;
struct scoutfs_super_block super;
struct block_device *meta_bdev;
spinlock_t next_ino_lock;
@@ -135,14 +134,14 @@ static inline bool scoutfs_unmounting(struct super_block *sb)
(int)(le64_to_cpu(fsid) >> SCSB_SHIFT), \
(int)(le64_to_cpu(rid) >> SCSB_SHIFT)
#define SCSB_ARGS(sb) \
(int)(le64_to_cpu(SCOUTFS_SB(sb)->super.hdr.fsid) >> SCSB_SHIFT), \
(int)(SCOUTFS_SB(sb)->fsid >> SCSB_SHIFT), \
(int)(SCOUTFS_SB(sb)->rid >> SCSB_SHIFT)
#define SCSB_TRACE_FIELDS \
__field(__u64, fsid) \
__field(__u64, rid)
#define SCSB_TRACE_ASSIGN(sb) \
__entry->fsid = SCOUTFS_HAS_SBI(sb) ? \
le64_to_cpu(SCOUTFS_SB(sb)->super.hdr.fsid) : 0;\
SCOUTFS_SB(sb)->fsid : 0; \
__entry->rid = SCOUTFS_HAS_SBI(sb) ? \
SCOUTFS_SB(sb)->rid : 0;
#define SCSB_TRACE_ARGS \


@@ -60,10 +60,9 @@ static ssize_t fsid_show(struct kobject *kobj, struct attribute *attr,
char *buf)
{
struct super_block *sb = KOBJ_TO_SB(kobj, sb_id_kobj);
- struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
+ struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
- return snprintf(buf, PAGE_SIZE, "%016llx\n",
- le64_to_cpu(super->hdr.fsid));
+ return snprintf(buf, PAGE_SIZE, "%016llx\n", sbi->fsid);
}
ATTR_FUNCS_RO(fsid);

tests/.gitignore vendored
View File

@@ -8,3 +8,4 @@ src/bulk_create_paths
src/find_xattrs
src/stage_tmpfile
src/create_xattr_loop
src/o_tmpfile_umask

View File

@@ -11,7 +11,8 @@ BIN := src/createmany \
src/stage_tmpfile \
src/find_xattrs \
src/create_xattr_loop \
- src/fragmented_data_extents
+ src/fragmented_data_extents \
+ src/o_tmpfile_umask
DEPS := $(wildcard src/*.d)

View File

@@ -1,3 +1,11 @@
== non-acl O_TMPFILE creation honors umask
umask 022
fstat after open(0777): 0100755
stat after linkat: 0100755
umask 077
fstat after open(0777): 0100700
stat after linkat: 0100700
== stage from tmpfile
total file size 33669120
00000000 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 |AAAAAAAAAAAAAAAA|
*

View File

@@ -27,7 +27,7 @@ createmany-large-names.sh
createmany-rename-large-dir.sh
stage-release-race-alloc.sh
stage-multi-part.sh
- stage-tmpfile.sh
+ o_tmpfile.sh
basic-posix-consistency.sh
dirent-consistency.sh
mkdir-rename-rmdir.sh

View File

@@ -0,0 +1,97 @@
/*
* Show the modes of files as we create them with O_TMPFILE and link
* them into the namespace.
*
* Copyright (C) 2022 Versity Software, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <errno.h>
#include <sys/stat.h>
#include <assert.h>
#include <limits.h>
static void linkat_tmpfile_modes(char *dir, char *lpath, mode_t mode)
{
char proc_self[PATH_MAX];
struct stat st;
int ret;
int fd;
umask(mode);
printf("umask 0%o\n", mode);
fd = open(dir, O_RDWR | O_TMPFILE, 0777);
if (fd < 0) {
perror("open(O_TMPFILE)");
exit(1);
}
ret = fstat(fd, &st);
if (ret < 0) {
perror("fstat");
exit(1);
}
printf("fstat after open(0777): 0%o\n", st.st_mode);
snprintf(proc_self, sizeof(proc_self), "/proc/self/fd/%d", fd);
ret = linkat(AT_FDCWD, proc_self, AT_FDCWD, lpath, AT_SYMLINK_FOLLOW);
if (ret < 0) {
perror("linkat");
exit(1);
}
close(fd);
ret = stat(lpath, &st);
if (ret < 0) {
perror("fstat");
exit(1);
}
printf("stat after linkat: 0%o\n", st.st_mode);
ret = unlink(lpath);
if (ret < 0) {
perror("unlink");
exit(1);
}
}
int main(int argc, char **argv)
{
char *lpath;
char *dir;
if (argc < 3) {
printf("%s <open_dir> <linkat_path>\n", argv[0]);
return 1;
}
dir = argv[1];
lpath = argv[2];
linkat_tmpfile_modes(dir, lpath, 022);
linkat_tmpfile_modes(dir, lpath, 077);
return 0;
}
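For reference, the modes in the test's golden output fall out of standard POSIX mode creation: the mode requested by open() has the process umask bits cleared, and the kernel adds the regular-file type bits. A minimal standalone check (the expected_mode() helper is hypothetical, not part of the test suite):

#include <stdio.h>
#include <sys/stat.h>

/* the kernel clears umask bits from the requested creation mode */
static mode_t expected_mode(mode_t requested, mode_t umask_bits)
{
	return S_IFREG | (requested & ~umask_bits);
}

int main(void)
{
	/* mirrors the two cases in the golden output above */
	printf("0%o\n", expected_mode(0777, 022));	/* 0100755 */
	printf("0%o\n", expected_mode(0777, 077));	/* 0100700 */
	return 0;
}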

tests/tests/o_tmpfile.sh Normal file
View File

@@ -0,0 +1,16 @@
#
# basic tests of O_TMPFILE
#
t_require_commands stage_tmpfile hexdump o_tmpfile_umask
echo "== non-acl O_TMPFILE creation honors umask"
o_tmpfile_umask "$T_D0" "$T_D0/umask-file"
echo "== stage from tmpfile"
DEST_FILE="$T_D0/dest_file"
stage_tmpfile $T_D0 $DEST_FILE
hexdump -C "$DEST_FILE"
rm -f "$DEST_FILE"
t_pass

View File

@@ -1,15 +0,0 @@
#
# Run stage_tmpfile and check the output with hexdump.
#
t_require_commands stage_tmpfile hexdump
DEST_FILE="$T_D0/dest_file"
stage_tmpfile $T_D0 $DEST_FILE
hexdump -C "$DEST_FILE"
rm -fr "$DEST_FILE"
t_pass

View File

@@ -623,11 +623,9 @@ space of the volume making the output much more useful for inspection.
.TP
.B "META-DEVICE"
The path to the metadata device for the filesystem whose metadata will be
- printed. Since this command reads via the host's buffer cache, it may not
- reflect the current blocks in the filesystem possibly written to the shared
- block devices from another host, unless
- .B blockdev \--flushbufs
- command is used first.
+ printed. An attempt will be made to flush the host's buffer cache for
+ this device with the BLKFLSBUF ioctl, or with posix_fadvise() if
+ the path refers to a regular file.
.RE
.PD

View File

@@ -3,6 +3,7 @@
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <errno.h>
@@ -103,3 +104,44 @@ char *size_str(u64 nr, unsigned size)
return suffixes[i];
}
/*
* Try to flush the local read cache for a device. This is only a best
* effort as these interfaces don't block waiting to fully purge the
* cache. This is OK because it's used by cached readers that are known
* to be racy anyway.
*/
int flush_device(int fd)
{
struct stat st;
int ret;
ret = fstat(fd, &st);
if (ret < 0) {
ret = -errno;
fprintf(stderr, "fstat failed: %s (%d)\n", strerror(errno), errno);
goto out;
}
if (S_ISREG(st.st_mode)) {
ret = posix_fadvise(fd, 0, st.st_size, POSIX_FADV_DONTNEED);
if (ret < 0) {
ret = -errno;
fprintf(stderr, "POSIX_FADV_DONTNEED failed: %s (%d)\n",
strerror(errno), errno);
goto out;
}
} else if (S_ISBLK(st.st_mode)) {
ret = ioctl(fd, BLKFLSBUF, 0);
if (ret < 0) {
ret = -errno;
fprintf(stderr, "BLKFLSBUF, failed: %s (%d)\n", strerror(errno), errno);
goto out;
}
}
ret = 0;
out:
return ret;
}
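The callers added later in this compare (mkfs and print) follow the same shape: open, flush, then read through the buffer cache. A minimal caller-side sketch, assuming only the flush_device() declared in dev.h above; the open_and_flush() wrapper name is hypothetical:

#include <errno.h>
#include <fcntl.h>
#include <unistd.h>
#include "dev.h"	/* flush_device() */

static int open_and_flush(const char *path)
{
	int ret;
	int fd;

	fd = open(path, O_RDONLY);
	if (fd < 0)
		return -errno;

	/* best effort: drop cached pages so buffered reads see blocks
	 * recently written by other hosts on the shared device */
	ret = flush_device(fd);
	if (ret < 0) {
		close(fd);
		return ret;
	}

	return fd;	/* caller reads via the buffer cache */
}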

View File

@@ -14,5 +14,6 @@ int device_size(char *path, int fd,
char *use_type, u64 *size_ret);
float size_flt(u64 nr, unsigned size);
char *size_str(u64 nr, unsigned size);
int flush_device(int fd);
#endif

View File

@@ -118,6 +118,33 @@ struct mkfs_args {
struct scoutfs_quorum_slot slots[SCOUTFS_QUORUM_MAX_SLOTS];
};
static int open_mkfs_dev(struct mkfs_args *args, char *path, mode_t mode, char *which)
{
int ret;
int fd = -1;
fd = open(path, mode);
if (fd < 0) {
ret = -errno;
fprintf(stderr, "failed to open %s dev '%s': %s (%d)\n",
which, path, strerror(errno), errno);
goto out;
}
ret = flush_device(fd);
if (ret < 0)
goto out;
if (!args->force)
ret = check_bdev(fd, path, which);
out:
if (ret < 0 && fd >= 0)
close(fd);
return ret ?: fd;
}
/*
* Make a new file system by writing:
* - super blocks
@@ -156,32 +183,17 @@ static int do_mkfs(struct mkfs_args *args)
gettimeofday(&tv, NULL);
pseudo_random_bytes(&fsid, sizeof(fsid));
- meta_fd = open(args->meta_device, O_RDWR | O_EXCL);
+ meta_fd = open_mkfs_dev(args, args->meta_device, O_RDWR | O_EXCL, "meta");
if (meta_fd < 0) {
- ret = -errno;
- fprintf(stderr, "failed to open '%s': %s (%d)\n",
- args->meta_device, strerror(errno), errno);
+ ret = meta_fd;
goto out;
}
- if (!args->force) {
- ret = check_bdev(meta_fd, args->meta_device, "meta");
- if (ret)
- return ret;
- }
- data_fd = open(args->data_device, O_RDWR | O_EXCL);
+ data_fd = open_mkfs_dev(args, args->data_device, O_RDWR | O_EXCL, "data");
if (data_fd < 0) {
- ret = -errno;
- fprintf(stderr, "failed to open '%s': %s (%d)\n",
- args->data_device, strerror(errno), errno);
+ ret = data_fd;
goto out;
}
- if (!args->force) {
- ret = check_bdev(data_fd, args->data_device, "data");
- if (ret)
- return ret;
- }
super = calloc(1, SCOUTFS_BLOCK_SM_SIZE);
bt = calloc(1, SCOUTFS_BLOCK_LG_SIZE);

View File

@@ -27,6 +27,7 @@
#include "avl.h"
#include "srch.h"
#include "leaf_item_hash.h"
#include "dev.h"
static void print_block_header(struct scoutfs_block_header *hdr, int size)
{
@@ -1107,7 +1108,12 @@ static int do_print(struct print_args *args)
return ret;
}
ret = flush_device(fd);
if (ret < 0)
goto out;
ret = print_volume(fd, args);
out:
close(fd);
return ret;
};