Compare commits

..

1 Commit

Author SHA1 Message Date
Auke Kok
72dc5695a6 Introduce meta_reserve_blocks mount option with a default value.
This adds a mount option, with a default value of 16384, that reserves
an additional amount of blocks on the meta device.

The default value is 16384 blocks, which corresponds to 1GB of space and
roughly doubles the internal reserve that is calculated dynamically from
the number of clients/mounts in typical configurations. It also amounts
to less than 2% of the smallest supported meta device size.

A suggested value for larger deployments is around 256 blocks per GB of
meta device size, i.e. 1/64 of the meta device space, or about 1.6%.
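
As a worked example (assuming the 64KB metadata block size implied by
16384 blocks corresponding to roughly 1GB): a hypothetical 1TB meta
device sized at 256 blocks per GB would reserve 256 * 1024 = 262144
blocks, or 262144 * 64KB = 16GB, which is 1TB / 64 and about 1.6% of
the device.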

Customers who are running into issues can increase the value in their
mount options to get a larger safety buffer, or temporarily decrease it
as a way to get out of low space conditions. In the latter case the
value should be raised again as soon as the low space condition has
been resolved.
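
For illustration, the option is passed like any other scoutfs mount
option; the device paths below are placeholders and any other options a
deployment normally uses are elided:

    mount -t scoutfs -o metadev_path=/dev/meta_dev,meta_reserve_blocks=32768 \
        /dev/data_dev /mnt/scoutfs

Mounting again with a smaller value would be the temporary escape hatch
described above, with the larger value restored on a later mount.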

Our test suite will run with meta_reserve_blocks=0, so that the behavior
of our tests is functionally unaffected by this change and the extra
reserve can't interfere with reproducing and resolving underlying ENOSPC
issues. The option does, however, let us artificially create ENOSPC
conditions at will, and we may want to add tests that specifically do
so.
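
A minimal sketch of such a test, assuming helpers and variables along
the lines of the existing shell harness (the device paths, the 500000
block reserve, and the file count are placeholders, not part of this
change):

    # reserve nearly the whole meta device so metadata allocations run out quickly
    mount -t scoutfs -o metadev_path="$META_DEV",meta_reserve_blocks=500000 \
        "$DATA_DEV" "$MNT"
    # metadata-heavy load should now fail with ENOSPC instead of hanging
    for i in $(seq 1 100000); do
        touch "$MNT/f.$i" || break
    done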

Signed-off-by: Auke Kok <auke.kok@versity.com>
2025-04-17 16:06:33 -04:00
11 changed files with 121 additions and 137 deletions

View File

@@ -1,27 +1,6 @@
Versity ScoutFS Release Notes
=============================
---
v1.25
\
*Jun 3, 2025*
Fix a bug that could cause indefinite retries of failed client commits.
Under specific error conditions the client and server's understanding of
the current client commit could get out of sync. The client would retry
commits indefinitely that could never succeed. This manifested as
infinite "critical transaction commit failure" messages in the kernel
log on the client and matching "error <nr> committing client logs" on
the server.
Fix a bug in a specific case of server error handling that could result
in sending references to unwritten blocks to the client. The client
would try to read blocks that hadn't been written and return spurious
errors. This was seen under low free space conditions on the server and
resulted in error messages with error code 116 (The errno enum for
ESTALE, the client's indication that it couldn't read the blocks that it
expected.)
---
v1.24
\

View File

@@ -39,6 +39,7 @@ enum {
Opt_orphan_scan_delay_ms,
Opt_quorum_heartbeat_timeout_ms,
Opt_quorum_slot_nr,
Opt_meta_reserve_blocks,
Opt_err,
};
@@ -52,6 +53,7 @@ static const match_table_t tokens = {
{Opt_orphan_scan_delay_ms, "orphan_scan_delay_ms=%s"},
{Opt_quorum_heartbeat_timeout_ms, "quorum_heartbeat_timeout_ms=%s"},
{Opt_quorum_slot_nr, "quorum_slot_nr=%s"},
{Opt_meta_reserve_blocks, "meta_reserve_blocks=%s"},
{Opt_err, NULL}
};
@@ -126,6 +128,9 @@ static void free_options(struct scoutfs_mount_options *opts)
#define MIN_DATA_PREALLOC_BLOCKS 1ULL
#define MAX_DATA_PREALLOC_BLOCKS ((unsigned long long)SCOUTFS_BLOCK_SM_MAX)
#define SCOUTFS_META_RESERVE_DEFAULT_BLOCKS 16384
static void init_default_options(struct scoutfs_mount_options *opts)
{
memset(opts, 0, sizeof(*opts));
@@ -136,6 +141,7 @@ static void init_default_options(struct scoutfs_mount_options *opts)
opts->orphan_scan_delay_ms = -1;
opts->quorum_heartbeat_timeout_ms = SCOUTFS_QUORUM_DEF_HB_TIMEO_MS;
opts->quorum_slot_nr = -1;
opts->meta_reserve_blocks = SCOUTFS_META_RESERVE_DEFAULT_BLOCKS;
}
static int verify_log_merge_wait_timeout_ms(struct super_block *sb, int ret, int val)
@@ -167,6 +173,24 @@ static int verify_quorum_heartbeat_timeout_ms(struct super_block *sb, int ret, u
return 0;
}
static int verify_meta_reserve_blocks(struct super_block *sb, int ret, int val)
{
/*
* Ideally we set a limit to something reasonable like 1/2 the actual
* total_meta_blocks, but we can't yet get this info when mount is called
*/
if (ret < 0) {
scoutfs_err(sb, "failed to parse meta_reserve_blocks value");
return -EINVAL;
}
if (val < 0 || val > INT_MAX) {
scoutfs_err(sb, "invalid meta_reserve_blocks value %d, must be between 0 and %d",
val, INT_MAX);
return -EINVAL;
}
return 0;
}
/*
* Parse the option string into our options struct. This can allocate
@@ -279,6 +303,14 @@ static int parse_options(struct super_block *sb, char *options, struct scoutfs_m
opts->quorum_slot_nr = nr;
break;
case Opt_meta_reserve_blocks:
ret = match_int(args, &nr);
ret = verify_meta_reserve_blocks(sb, ret, nr);
if (ret < 0)
return ret;
opts->meta_reserve_blocks = nr;
break;
default:
scoutfs_err(sb, "Unknown or malformed option, \"%s\"", p);
return -EINVAL;
@@ -371,6 +403,7 @@ int scoutfs_options_show(struct seq_file *seq, struct dentry *root)
seq_printf(seq, ",orphan_scan_delay_ms=%u", opts.orphan_scan_delay_ms);
if (opts.quorum_slot_nr >= 0)
seq_printf(seq, ",quorum_slot_nr=%d", opts.quorum_slot_nr);
seq_printf(seq, ".meta_reserve_blocks=%llu", opts.meta_reserve_blocks);
return 0;
}
@@ -589,6 +622,17 @@ static ssize_t quorum_slot_nr_show(struct kobject *kobj, struct kobj_attribute *
}
SCOUTFS_ATTR_RO(quorum_slot_nr);
static ssize_t meta_reserve_blocks_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
struct scoutfs_mount_options opts;
scoutfs_options_read(sb, &opts);
return snprintf(buf, PAGE_SIZE, "%lld\n", opts.meta_reserve_blocks);
}
SCOUTFS_ATTR_RO(meta_reserve_blocks);
static struct attribute *options_attrs[] = {
SCOUTFS_ATTR_PTR(data_prealloc_blocks),
SCOUTFS_ATTR_PTR(data_prealloc_contig_only),
@@ -597,6 +641,7 @@ static struct attribute *options_attrs[] = {
SCOUTFS_ATTR_PTR(orphan_scan_delay_ms),
SCOUTFS_ATTR_PTR(quorum_heartbeat_timeout_ms),
SCOUTFS_ATTR_PTR(quorum_slot_nr),
SCOUTFS_ATTR_PTR(meta_reserve_blocks),
NULL,
};
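
For reference, once this change is in place the effective value can be
checked at runtime: scoutfs_options_show above makes it visible in the
mount options (e.g. in /proc/mounts), and the read-only sysfs attribute
registered here exposes it per mount (the exact sysfs path is not shown
because it depends on the existing scoutfs sysfs layout):

    grep -o 'meta_reserve_blocks=[0-9]*' /proc/mounts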

View File

@@ -13,6 +13,7 @@ struct scoutfs_mount_options {
unsigned int orphan_scan_delay_ms;
int quorum_slot_nr;
u64 quorum_heartbeat_timeout_ms;
u64 meta_reserve_blocks;
};
void scoutfs_options_read(struct super_block *sb, struct scoutfs_mount_options *opts);

View File

@@ -772,11 +772,14 @@ static int alloc_move_empty(struct super_block *sb,
u64 scoutfs_server_reserved_meta_blocks(struct super_block *sb)
{
DECLARE_SERVER_INFO(sb, server);
struct scoutfs_mount_options opts;
u64 server_blocks;
u64 client_blocks;
u64 log_blocks;
u64 nr_clients;
scoutfs_options_read(sb, &opts);
/* server has two meta_avail lists it swaps between */
server_blocks = SCOUTFS_SERVER_META_FILL_TARGET * 2;
@@ -801,7 +804,7 @@ u64 scoutfs_server_reserved_meta_blocks(struct super_block *sb)
nr_clients = server->nr_clients;
spin_unlock(&server->lock);
return server_blocks + (max(1ULL, nr_clients) * client_blocks);
return server_blocks + (max(1ULL, nr_clients) * client_blocks) + opts.meta_reserve_blocks;
}
/*
@@ -1299,10 +1302,12 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
* is nested inside holding commits so we recheck the persistent item
* each time we commit to make sure it's still what we think. The
* caller is still going to send the item to the client so we update the
* caller's each time we make progress. If we hit an error applying the
* changes we make then we can't send the log_trees to the client.
caller's each time we make progress. This is a best-effort attempt
to clean up and it's valid to leave extents in data_freed; we don't
return errors to the caller. The client will continue the work later
in get_log_trees or as the rid is reclaimed.
*/
static int try_drain_data_freed(struct super_block *sb, struct scoutfs_log_trees *lt)
static void try_drain_data_freed(struct super_block *sb, struct scoutfs_log_trees *lt)
{
DECLARE_SERVER_INFO(sb, server);
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
@@ -1311,7 +1316,6 @@ static int try_drain_data_freed(struct super_block *sb, struct scoutfs_log_trees
struct scoutfs_log_trees drain;
struct scoutfs_key key;
COMMIT_HOLD(hold);
bool apply = false;
int ret = 0;
int err;
@@ -1320,27 +1324,22 @@ static int try_drain_data_freed(struct super_block *sb, struct scoutfs_log_trees
while (lt->data_freed.total_len != 0) {
server_hold_commit(sb, &hold);
mutex_lock(&server->logs_mutex);
apply = true;
ret = find_log_trees_item(sb, &super->logs_root, false, rid, U64_MAX, &drain);
if (ret < 0) {
ret = 0;
if (ret < 0)
break;
}
/* careful to only keep draining the caller's specific open trans */
if (drain.nr != lt->nr || drain.get_trans_seq != lt->get_trans_seq ||
drain.commit_trans_seq != lt->commit_trans_seq || drain.flags != lt->flags) {
ret = 0;
ret = -ENOENT;
break;
}
ret = scoutfs_btree_dirty(sb, &server->alloc, &server->wri,
&super->logs_root, &key);
if (ret < 0) {
ret = 0;
if (ret < 0)
break;
}
/* moving can modify and return errors, always update caller and item */
mutex_lock(&server->alloc_mutex);
@@ -1356,19 +1355,19 @@ static int try_drain_data_freed(struct super_block *sb, struct scoutfs_log_trees
BUG_ON(err < 0); /* dirtying must guarantee success */
mutex_unlock(&server->logs_mutex);
ret = server_apply_commit(sb, &hold, ret);
apply = false;
if (ret < 0)
if (ret < 0) {
ret = 0; /* don't try to abort, ignoring ret */
break;
}
}
if (apply) {
/* try to cleanly abort and write any partial dirty btree blocks, but ignore result */
if (ret < 0) {
mutex_unlock(&server->logs_mutex);
server_apply_commit(sb, &hold, ret);
server_apply_commit(sb, &hold, 0);
}
return ret;
}
/*
@@ -1576,9 +1575,9 @@ out:
scoutfs_err(sb, "error %d getting log trees for rid %016llx: %s",
ret, rid, err_str);
/* try to drain excessive data_freed with additional commits, if needed */
/* try to drain excessive data_freed with additional commits, if needed, ignoring err */
if (ret == 0)
ret = try_drain_data_freed(sb, &lt);
try_drain_data_freed(sb, &lt);
return scoutfs_net_response(sb, conn, cmd, id, ret, &lt, sizeof(lt));
}
@@ -4153,7 +4152,7 @@ static void fence_pending_recov_worker(struct work_struct *work)
struct server_info *server = container_of(work, struct server_info,
fence_pending_recov_work);
struct super_block *sb = server->sb;
union scoutfs_inet_addr addr = {{0,}};
union scoutfs_inet_addr addr;
u64 rid = 0;
int ret = 0;

View File

@@ -159,58 +159,6 @@ static bool drained_holders(struct trans_info *tri)
return holders == 0;
}
static int commit_current_log_trees(struct super_block *sb, char **str)
{
DECLARE_TRANS_INFO(sb, tri);
return (*str = "data submit", scoutfs_inode_walk_writeback(sb, true)) ?:
(*str = "item dirty", scoutfs_item_write_dirty(sb)) ?:
(*str = "data prepare", scoutfs_data_prepare_commit(sb)) ?:
(*str = "alloc prepare", scoutfs_alloc_prepare_commit(sb, &tri->alloc, &tri->wri)) ?:
(*str = "meta write", scoutfs_block_writer_write(sb, &tri->wri)) ?:
(*str = "data wait", scoutfs_inode_walk_writeback(sb, false)) ?:
(*str = "commit log trees", commit_btrees(sb)) ?:
scoutfs_item_write_done(sb);
}
static int get_next_log_trees(struct super_block *sb, char **str)
{
return (*str = "get log trees", scoutfs_trans_get_log_trees(sb));
}
static int retry_forever(struct super_block *sb, int (*func)(struct super_block *sb, char **str))
{
bool retrying = false;
char *str;
int ret;
do {
str = NULL;
ret = func(sb, &str);
if (ret < 0) {
if (!retrying) {
scoutfs_warn(sb, "critical transaction commit failure: %s = %d, retrying",
str, ret);
retrying = true;
}
if (scoutfs_forcing_unmount(sb)) {
ret = -EIO;
break;
}
msleep(2 * MSEC_PER_SEC);
} else if (retrying) {
scoutfs_info(sb, "retried transaction commit succeeded");
}
} while (ret < 0);
return ret;
}
/*
* This work func is responsible for writing out all the dirty blocks
* that make up the current dirty transaction. It prevents writers from
@@ -236,6 +184,8 @@ void scoutfs_trans_write_func(struct work_struct *work)
struct trans_info *tri = container_of(work, struct trans_info, write_work.work);
struct super_block *sb = tri->sb;
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
bool retrying = false;
char *s = NULL;
int ret = 0;
tri->task = current;
@@ -264,9 +214,37 @@ void scoutfs_trans_write_func(struct work_struct *work)
scoutfs_inc_counter(sb, trans_commit_written);
/* retry {commit,get}_log_trees until they succeed, can only fail when forcing unmount */
ret = retry_forever(sb, commit_current_log_trees) ?:
retry_forever(sb, get_next_log_trees);
do {
ret = (s = "data submit", scoutfs_inode_walk_writeback(sb, true)) ?:
(s = "item dirty", scoutfs_item_write_dirty(sb)) ?:
(s = "data prepare", scoutfs_data_prepare_commit(sb)) ?:
(s = "alloc prepare", scoutfs_alloc_prepare_commit(sb, &tri->alloc,
&tri->wri)) ?:
(s = "meta write", scoutfs_block_writer_write(sb, &tri->wri)) ?:
(s = "data wait", scoutfs_inode_walk_writeback(sb, false)) ?:
(s = "commit log trees", commit_btrees(sb)) ?:
scoutfs_item_write_done(sb) ?:
(s = "get log trees", scoutfs_trans_get_log_trees(sb));
if (ret < 0) {
if (!retrying) {
scoutfs_warn(sb, "critical transaction commit failure: %s = %d, retrying",
s, ret);
retrying = true;
}
if (scoutfs_forcing_unmount(sb)) {
ret = -EIO;
break;
}
msleep(2 * MSEC_PER_SEC);
} else if (retrying) {
scoutfs_info(sb, "retried transaction commit succeeded");
}
} while (ret < 0);
out:
spin_lock(&tri->write_lock);
tri->write_count++;

View File

@@ -80,15 +80,3 @@ t_compare_output()
{
"$@" >&7 2>&1
}
#
# usually bash prints an annoying output message when jobs
# are killed. We can avoid that by redirecting stderr for
# the bash process when it reaps the jobs that are killed.
#
t_silent_kill() {
exec {ERR}>&2 2>/dev/null
kill "$@"
wait "$@"
exec 2>&$ERR {ERR}>&-
}

View File

@@ -160,9 +160,6 @@ t_filter_dmesg()
re="$re|Pipe handler or fully qualified core dump path required.*"
re="$re|Set kernel.core_pattern before fs.suid_dumpable.*"
# perf warning that it adjusted sample rate
re="$re|perf: interrupt took too long.*lowering kernel.perf_event_max_sample_rate.*"
egrep -v "($re)" | \
ignore_harmless_unwind_kasan_stack_oob
}

View File

@@ -464,6 +464,7 @@ for i in $(seq 0 $((T_NR_MOUNTS - 1))); do
if [ "$i" -lt "$T_QUORUM" ]; then
opts="$opts,quorum_slot_nr=$i"
fi
opts="$opts,meta_reserve_blocks=0"
opts="${opts}${T_MNT_OPTIONS}"
msg "mounting $meta_dev|$data_dev on $dir"
@@ -532,15 +533,12 @@ for t in $tests; do
cmd rm -rf "$T_TMPDIR"
cmd mkdir -p "$T_TMPDIR"
# create a test name dir in the fs, clean up old data as needed
# create a test name dir in the fs
T_DS=""
for i in $(seq 0 $((T_NR_MOUNTS - 1))); do
dir="${T_M[$i]}/test/$test_name"
test $i == 0 && (
test -d "$dir" && cmd rm -rf "$dir"
cmd mkdir -p "$dir"
)
test $i == 0 && cmd mkdir -p "$dir"
eval T_D$i=$dir
T_D[$i]=$dir

View File

@@ -88,11 +88,6 @@ rm -rf "$SCR/xattrs"
echo "== make sure we can create again"
file="$SCR/file-after"
C=120
while (( C-- )); do
touch $file 2> /dev/null && break
sleep 1
done
touch $file
setfattr -n user.scoutfs-enospc -v 1 "$file"
sync

View File

@@ -38,6 +38,6 @@ while [ "$SECONDS" -lt "$END" ]; do
done
echo "== stopping background load"
t_silent_kill $load_pids
kill $load_pids
t_pass

View File

@@ -5,6 +5,18 @@
t_require_commands sleep touch sync stat handle_cat kill rm
t_require_mounts 2
#
# usually bash prints an annoying output message when jobs
# are killed. We can avoid that by redirecting stderr for
# the bash process when it reaps the jobs that are killed.
#
silent_kill() {
exec {ERR}>&2 2>/dev/null
kill "$@"
wait "$@"
exec 2>&$ERR {ERR}>&-
}
#
# We don't have a great way to test that inode items still exist. We
# don't prevent opening handles with nlink 0 today, so we'll use that.
@@ -40,7 +52,7 @@ inode_exists $ino || echo "$ino didn't exist"
echo "== orphan from failed evict deletion is picked up"
# pending kill signal stops evict from getting locks and deleting
t_silent_kill $pid
silent_kill $pid
t_set_sysfs_mount_option 0 orphan_scan_delay_ms 1000
sleep 5
inode_exists $ino && echo "$ino still exists"
@@ -58,7 +70,7 @@ for nr in $(t_fs_nrs); do
rm -f "$path"
done
sync
t_silent_kill $pids
silent_kill $pids
for nr in $(t_fs_nrs); do
t_force_umount $nr
done
@@ -70,15 +82,7 @@ done
# wait for orphan scans to run
t_set_all_sysfs_mount_options orphan_scan_delay_ms 1000
# also have to wait for delayed log merge work from mount
C=120
while (( C-- )); do
brk=1
for ino in $inos; do
inode_exists $ino && brk=0
done
test $brk -eq 1 && break
sleep 1
done
sleep 15
for ino in $inos; do
inode_exists $ino && echo "$ino still exists"
done
@@ -127,7 +131,7 @@ while [ $SECONDS -lt $END ]; do
done
# trigger eviction deletion of each file in each mount
t_silent_kill $pids
silent_kill $pids
wait || t_fail "handle_fsetxattr failed"