Compare commits

..

1 Commit

Author SHA1 Message Date
Auke Kok
72dc5695a6 Introduce meta_reserve_blocks mount option with a default value.
This adds a mount option, with a default value of 16384, that reserves
an additional amount of blocks on the meta device.

The default value is 16384 blocks, which corresponds to 1GB of space and
roughly doubles the internal reserve that is calculated dynamically from
the number of clients/mounts in typical configurations. It also amounts
to less than 2% of the smallest supported meta device size.

A suggested value for larger deployments is around 256 blocks per GB of
meta device size, i.e. 1/64 of the meta device space, or about 1.6%.
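
As a worked example (assuming the 64KB metadata block size implied by
16384 blocks corresponding to roughly 1GB): a hypothetical 1TB meta
device sized at 256 blocks per GB would reserve 256 * 1024 = 262144
blocks, or 262144 * 64KB = 16GB, which is 1TB / 64 and about 1.6% of
the device.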

Customers who are running into issues can increase the value in their
mount options to get a larger safety buffer, or temporarily decrease it
as a way to get out of low space conditions. In the latter case the
value should be raised again as soon as the low space condition has
been resolved.
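
For illustration, the option is passed like any other scoutfs mount
option; the device paths below are placeholders and any other options a
deployment normally uses are elided:

    mount -t scoutfs -o metadev_path=/dev/meta_dev,meta_reserve_blocks=32768 \
        /dev/data_dev /mnt/scoutfs

Mounting again with a smaller value would be the temporary escape hatch
described above, with the larger value restored on a later mount.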

Our test suite will run with meta_reserve_blocks=0, so that the behavior
of our tests is functionally unaffected by this change and the extra
reserve can't interfere with reproducing and resolving underlying ENOSPC
issues. The option does, however, let us artificially create ENOSPC
conditions at will, and we may want to add tests that specifically do
so.
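
A minimal sketch of such a test, assuming helpers and variables along
the lines of the existing shell harness (the device paths, the 500000
block reserve, and the file count are placeholders, not part of this
change):

    # reserve nearly the whole meta device so metadata allocations run out quickly
    mount -t scoutfs -o metadev_path="$META_DEV",meta_reserve_blocks=500000 \
        "$DATA_DEV" "$MNT"
    # metadata-heavy load should now fail with ENOSPC instead of hanging
    for i in $(seq 1 100000); do
        touch "$MNT/f.$i" || break
    done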

Signed-off-by: Auke Kok <auke.kok@versity.com>
2025-04-17 16:06:33 -04:00
11 changed files with 121 additions and 137 deletions

View File

@@ -1,27 +1,6 @@
Versity ScoutFS Release Notes
=============================
---
v1.25
\
*Jun 3, 2025*
Fix a bug that could cause indefinite retries of failed client commits.
Under specific error conditions the client and server's understanding of
the current client commit could get out of sync. The client would retry
commits indefinitely that could never succeed. This manifested as
infinite "critical transaction commit failure" messages in the kernel
log on the client and matching "error <nr> committing client logs" on
the server.
Fix a bug in a specific case of server error handling that could result
in sending references to unwritten blocks to the client. The client
would try to read blocks that hadn't been written and return spurious
errors. This was seen under low free space conditions on the server and
resulted in error messages with error code 116 (The errno enum for
ESTALE, the client's indication that it couldn't read the blocks that it
expected.)
---
v1.24
\

View File

@@ -39,6 +39,7 @@ enum {
Opt_orphan_scan_delay_ms,
Opt_quorum_heartbeat_timeout_ms,
Opt_quorum_slot_nr,
Opt_meta_reserve_blocks,
Opt_err,
};
@@ -52,6 +53,7 @@ static const match_table_t tokens = {
{Opt_orphan_scan_delay_ms, "orphan_scan_delay_ms=%s"},
{Opt_quorum_heartbeat_timeout_ms, "quorum_heartbeat_timeout_ms=%s"},
{Opt_quorum_slot_nr, "quorum_slot_nr=%s"},
{Opt_meta_reserve_blocks, "meta_reserve_blocks=%s"},
{Opt_err, NULL}
};
@@ -126,6 +128,9 @@ static void free_options(struct scoutfs_mount_options *opts)
#define MIN_DATA_PREALLOC_BLOCKS 1ULL
#define MAX_DATA_PREALLOC_BLOCKS ((unsigned long long)SCOUTFS_BLOCK_SM_MAX)
#define SCOUTFS_META_RESERVE_DEFAULT_BLOCKS 16384
static void init_default_options(struct scoutfs_mount_options *opts)
{
memset(opts, 0, sizeof(*opts));
@@ -136,6 +141,7 @@ static void init_default_options(struct scoutfs_mount_options *opts)
opts->orphan_scan_delay_ms = -1;
opts->quorum_heartbeat_timeout_ms = SCOUTFS_QUORUM_DEF_HB_TIMEO_MS;
opts->quorum_slot_nr = -1;
opts->meta_reserve_blocks = SCOUTFS_META_RESERVE_DEFAULT_BLOCKS;
}
static int verify_log_merge_wait_timeout_ms(struct super_block *sb, int ret, int val)
@@ -167,6 +173,24 @@ static int verify_quorum_heartbeat_timeout_ms(struct super_block *sb, int ret, u
return 0;
}
static int verify_meta_reserve_blocks(struct super_block *sb, int ret, int val)
{
/*
* Ideally we set a limit to something reasonable like 1/2 the actual
* total_meta_blocks, but we can't yet get this info when mount is called
*/
if (ret < 0) {
scoutfs_err(sb, "failed to parse meta_reserve_blocks value");
return -EINVAL;
}
if (val < 0 || val > INT_MAX) {
scoutfs_err(sb, "invalid meta_reserve_blocks value %d, must be between 0 and %d",
val, INT_MAX);
return -EINVAL;
}
return 0;
}
/*
* Parse the option string into our options struct. This can allocate
@@ -279,6 +303,14 @@ static int parse_options(struct super_block *sb, char *options, struct scoutfs_m
opts->quorum_slot_nr = nr;
break;
case Opt_meta_reserve_blocks:
ret = match_int(args, &nr);
ret = verify_meta_reserve_blocks(sb, ret, nr);
if (ret < 0)
return ret;
opts->meta_reserve_blocks = nr;
break;
default:
scoutfs_err(sb, "Unknown or malformed option, \"%s\"", p);
return -EINVAL;
@@ -371,6 +403,7 @@ int scoutfs_options_show(struct seq_file *seq, struct dentry *root)
seq_printf(seq, ",orphan_scan_delay_ms=%u", opts.orphan_scan_delay_ms);
if (opts.quorum_slot_nr >= 0)
seq_printf(seq, ",quorum_slot_nr=%d", opts.quorum_slot_nr);
seq_printf(seq, ".meta_reserve_blocks=%llu", opts.meta_reserve_blocks);
return 0;
}
@@ -589,6 +622,17 @@ static ssize_t quorum_slot_nr_show(struct kobject *kobj, struct kobj_attribute *
}
SCOUTFS_ATTR_RO(quorum_slot_nr);
static ssize_t meta_reserve_blocks_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
struct scoutfs_mount_options opts;
scoutfs_options_read(sb, &opts);
return snprintf(buf, PAGE_SIZE, "%lld\n", opts.meta_reserve_blocks);
}
SCOUTFS_ATTR_RO(meta_reserve_blocks);
static struct attribute *options_attrs[] = {
SCOUTFS_ATTR_PTR(data_prealloc_blocks),
SCOUTFS_ATTR_PTR(data_prealloc_contig_only),
@@ -597,6 +641,7 @@ static struct attribute *options_attrs[] = {
SCOUTFS_ATTR_PTR(orphan_scan_delay_ms),
SCOUTFS_ATTR_PTR(quorum_heartbeat_timeout_ms),
SCOUTFS_ATTR_PTR(quorum_slot_nr),
SCOUTFS_ATTR_PTR(meta_reserve_blocks),
NULL,
};
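
For reference, once this change is in place the effective value can be
checked at runtime: scoutfs_options_show above makes it visible in the
mount options (e.g. in /proc/mounts), and the read-only sysfs attribute
registered here exposes it per mount (the exact sysfs path is not shown
because it depends on the existing scoutfs sysfs layout):

    grep -o 'meta_reserve_blocks=[0-9]*' /proc/mounts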

View File

@@ -13,6 +13,7 @@ struct scoutfs_mount_options {
unsigned int orphan_scan_delay_ms;
int quorum_slot_nr;
u64 quorum_heartbeat_timeout_ms;
u64 meta_reserve_blocks;
};
void scoutfs_options_read(struct super_block *sb, struct scoutfs_mount_options *opts);

View File

@@ -772,11 +772,14 @@ static int alloc_move_empty(struct super_block *sb,
u64 scoutfs_server_reserved_meta_blocks(struct super_block *sb)
{
DECLARE_SERVER_INFO(sb, server);
struct scoutfs_mount_options opts;
u64 server_blocks;
u64 client_blocks;
u64 log_blocks;
u64 nr_clients;
scoutfs_options_read(sb, &opts);
/* server has two meta_avail lists it swaps between */
server_blocks = SCOUTFS_SERVER_META_FILL_TARGET * 2;
@@ -801,7 +804,7 @@ u64 scoutfs_server_reserved_meta_blocks(struct super_block *sb)
nr_clients = server->nr_clients;
spin_unlock(&server->lock);
return server_blocks + (max(1ULL, nr_clients) * client_blocks);
return server_blocks + (max(1ULL, nr_clients) * client_blocks) + opts.meta_reserve_blocks;
}
/*
@@ -1299,10 +1302,12 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
* is nested inside holding commits so we recheck the persistent item
* each time we commit to make sure it's still what we think. The
* caller is still going to send the item to the client so we update the
* caller's each time we make progress. If we hit an error applying the
* changes we make then we can't send the log_trees to the client.
caller's each time we make progress. This is a best-effort attempt
to clean up and it's valid to leave extents in data_freed; we don't
return errors to the caller. The client will continue the work later
in get_log_trees or as the rid is reclaimed.
*/
static int try_drain_data_freed(struct super_block *sb, struct scoutfs_log_trees *lt)
static void try_drain_data_freed(struct super_block *sb, struct scoutfs_log_trees *lt)
{
DECLARE_SERVER_INFO(sb, server);
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
@@ -1311,7 +1316,6 @@ static int try_drain_data_freed(struct super_block *sb, struct scoutfs_log_trees
struct scoutfs_log_trees drain;
struct scoutfs_key key;
COMMIT_HOLD(hold);
bool apply = false;
int ret = 0;
int err;
@@ -1320,27 +1324,22 @@ static int try_drain_data_freed(struct super_block *sb, struct scoutfs_log_trees
while (lt->data_freed.total_len != 0) {
server_hold_commit(sb, &hold);
mutex_lock(&server->logs_mutex);
apply = true;
ret = find_log_trees_item(sb, &super->logs_root, false, rid, U64_MAX, &drain);
if (ret < 0) {
ret = 0;
if (ret < 0)
break;
}
/* careful to only keep draining the caller's specific open trans */
if (drain.nr != lt->nr || drain.get_trans_seq != lt->get_trans_seq ||
drain.commit_trans_seq != lt->commit_trans_seq || drain.flags != lt->flags) {
ret = 0;
ret = -ENOENT;
break;
}
ret = scoutfs_btree_dirty(sb, &server->alloc, &server->wri,
&super->logs_root, &key);
if (ret < 0) {
ret = 0;
if (ret < 0)
break;
}
/* moving can modify and return errors, always update caller and item */
mutex_lock(&server->alloc_mutex);
@@ -1356,19 +1355,19 @@ static int try_drain_data_freed(struct super_block *sb, struct scoutfs_log_trees
BUG_ON(err < 0); /* dirtying must guarantee success */
mutex_unlock(&server->logs_mutex);
ret = server_apply_commit(sb, &hold, ret);
apply = false;
if (ret < 0)
if (ret < 0) {
ret = 0; /* don't try to abort, ignoring ret */
break;
}
}
if (apply) {
/* try to cleanly abort and write any partial dirty btree blocks, but ignore result */
if (ret < 0) {
mutex_unlock(&server->logs_mutex);
server_apply_commit(sb, &hold, ret);
server_apply_commit(sb, &hold, 0);
}
return ret;
}
/*
@@ -1576,9 +1575,9 @@ out:
scoutfs_err(sb, "error %d getting log trees for rid %016llx: %s",
ret, rid, err_str);
/* try to drain excessive data_freed with additional commits, if needed */
/* try to drain excessive data_freed with additional commits, if needed, ignoring err */
if (ret == 0)
ret = try_drain_data_freed(sb, &lt);
try_drain_data_freed(sb, &lt);
return scoutfs_net_response(sb, conn, cmd, id, ret, &lt, sizeof(lt));
}
@@ -4153,7 +4152,7 @@ static void fence_pending_recov_worker(struct work_struct *work)
struct server_info *server = container_of(work, struct server_info,
fence_pending_recov_work);
struct super_block *sb = server->sb;
union scoutfs_inet_addr addr = {{0,}};
union scoutfs_inet_addr addr;
u64 rid = 0;
int ret = 0;

View File

@@ -159,58 +159,6 @@ static bool drained_holders(struct trans_info *tri)
return holders == 0;
}
static int commit_current_log_trees(struct super_block *sb, char **str)
{
DECLARE_TRANS_INFO(sb, tri);
return (*str = "data submit", scoutfs_inode_walk_writeback(sb, true)) ?:
(*str = "item dirty", scoutfs_item_write_dirty(sb)) ?:
(*str = "data prepare", scoutfs_data_prepare_commit(sb)) ?:
(*str = "alloc prepare", scoutfs_alloc_prepare_commit(sb, &tri->alloc, &tri->wri)) ?:
(*str = "meta write", scoutfs_block_writer_write(sb, &tri->wri)) ?:
(*str = "data wait", scoutfs_inode_walk_writeback(sb, false)) ?:
(*str = "commit log trees", commit_btrees(sb)) ?:
scoutfs_item_write_done(sb);
}
static int get_next_log_trees(struct super_block *sb, char **str)
{
return (*str = "get log trees", scoutfs_trans_get_log_trees(sb));
}
static int retry_forever(struct super_block *sb, int (*func)(struct super_block *sb, char **str))
{
bool retrying = false;
char *str;
int ret;
do {
str = NULL;
ret = func(sb, &str);
if (ret < 0) {
if (!retrying) {
scoutfs_warn(sb, "critical transaction commit failure: %s = %d, retrying",
str, ret);
retrying = true;
}
if (scoutfs_forcing_unmount(sb)) {
ret = -EIO;
break;
}
msleep(2 * MSEC_PER_SEC);
} else if (retrying) {
scoutfs_info(sb, "retried transaction commit succeeded");
}
} while (ret < 0);
return ret;
}
/*
* This work func is responsible for writing out all the dirty blocks
* that make up the current dirty transaction. It prevents writers from
@@ -236,6 +184,8 @@ void scoutfs_trans_write_func(struct work_struct *work)
struct trans_info *tri = container_of(work, struct trans_info, write_work.work);
struct super_block *sb = tri->sb;
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
bool retrying = false;
char *s = NULL;
int ret = 0;
tri->task = current;
@@ -264,9 +214,37 @@ void scoutfs_trans_write_func(struct work_struct *work)
scoutfs_inc_counter(sb, trans_commit_written);
/* retry {commit,get}_log_trees until they succeed, can only fail when forcing unmount */
ret = retry_forever(sb, commit_current_log_trees) ?:
retry_forever(sb, get_next_log_trees);
do {
ret = (s = "data submit", scoutfs_inode_walk_writeback(sb, true)) ?:
(s = "item dirty", scoutfs_item_write_dirty(sb)) ?:
(s = "data prepare", scoutfs_data_prepare_commit(sb)) ?:
(s = "alloc prepare", scoutfs_alloc_prepare_commit(sb, &tri->alloc,
&tri->wri)) ?:
(s = "meta write", scoutfs_block_writer_write(sb, &tri->wri)) ?:
(s = "data wait", scoutfs_inode_walk_writeback(sb, false)) ?:
(s = "commit log trees", commit_btrees(sb)) ?:
scoutfs_item_write_done(sb) ?:
(s = "get log trees", scoutfs_trans_get_log_trees(sb));
if (ret < 0) {
if (!retrying) {
scoutfs_warn(sb, "critical transaction commit failure: %s = %d, retrying",
s, ret);
retrying = true;
}
if (scoutfs_forcing_unmount(sb)) {
ret = -EIO;
break;
}
msleep(2 * MSEC_PER_SEC);
} else if (retrying) {
scoutfs_info(sb, "retried transaction commit succeeded");
}
} while (ret < 0);
out:
spin_lock(&tri->write_lock);
tri->write_count++;

View File

@@ -80,15 +80,3 @@ t_compare_output()
{
"$@" >&7 2>&1
}
#
# usually bash prints an annoying output message when jobs
# are killed. We can avoid that by redirecting stderr for
# the bash process when it reaps the jobs that are killed.
#
t_silent_kill() {
exec {ERR}>&2 2>/dev/null
kill "$@"
wait "$@"
exec 2>&$ERR {ERR}>&-
}

View File

@@ -160,9 +160,6 @@ t_filter_dmesg()
re="$re|Pipe handler or fully qualified core dump path required.*"
re="$re|Set kernel.core_pattern before fs.suid_dumpable.*"
# perf warning that it adjusted sample rate
re="$re|perf: interrupt took too long.*lowering kernel.perf_event_max_sample_rate.*"
egrep -v "($re)" | \
ignore_harmless_unwind_kasan_stack_oob
}

View File

@@ -464,6 +464,7 @@ for i in $(seq 0 $((T_NR_MOUNTS - 1))); do
if [ "$i" -lt "$T_QUORUM" ]; then
opts="$opts,quorum_slot_nr=$i"
fi
opts="$opts,meta_reserve_blocks=0"
opts="${opts}${T_MNT_OPTIONS}"
msg "mounting $meta_dev|$data_dev on $dir"
@@ -532,15 +533,12 @@ for t in $tests; do
cmd rm -rf "$T_TMPDIR"
cmd mkdir -p "$T_TMPDIR"
# create a test name dir in the fs, clean up old data as needed
# create a test name dir in the fs
T_DS=""
for i in $(seq 0 $((T_NR_MOUNTS - 1))); do
dir="${T_M[$i]}/test/$test_name"
test $i == 0 && (
test -d "$dir" && cmd rm -rf "$dir"
cmd mkdir -p "$dir"
)
test $i == 0 && cmd mkdir -p "$dir"
eval T_D$i=$dir
T_D[$i]=$dir

View File

@@ -88,11 +88,6 @@ rm -rf "$SCR/xattrs"
echo "== make sure we can create again"
file="$SCR/file-after"
C=120
while (( C-- )); do
touch $file 2> /dev/null && break
sleep 1
done
touch $file
setfattr -n user.scoutfs-enospc -v 1 "$file"
sync

View File

@@ -38,6 +38,6 @@ while [ "$SECONDS" -lt "$END" ]; do
done
echo "== stopping background load"
t_silent_kill $load_pids
kill $load_pids
t_pass

View File

@@ -5,6 +5,18 @@
t_require_commands sleep touch sync stat handle_cat kill rm
t_require_mounts 2
#
# usually bash prints an annoying output message when jobs
# are killed. We can avoid that by redirecting stderr for
# the bash process when it reaps the jobs that are killed.
#
silent_kill() {
exec {ERR}>&2 2>/dev/null
kill "$@"
wait "$@"
exec 2>&$ERR {ERR}>&-
}
#
# We don't have a great way to test that inode items still exist. We
# don't prevent opening handles with nlink 0 today, so we'll use that.
@@ -40,7 +52,7 @@ inode_exists $ino || echo "$ino didn't exist"
echo "== orphan from failed evict deletion is picked up"
# pending kill signal stops evict from getting locks and deleting
t_silent_kill $pid
silent_kill $pid
t_set_sysfs_mount_option 0 orphan_scan_delay_ms 1000
sleep 5
inode_exists $ino && echo "$ino still exists"
@@ -58,7 +70,7 @@ for nr in $(t_fs_nrs); do
rm -f "$path"
done
sync
t_silent_kill $pids
silent_kill $pids
for nr in $(t_fs_nrs); do
t_force_umount $nr
done
@@ -70,15 +82,7 @@ done
# wait for orphan scans to run
t_set_all_sysfs_mount_options orphan_scan_delay_ms 1000
# also have to wait for delayed log merge work from mount
C=120
while (( C-- )); do
brk=1
for ino in $inos; do
inode_exists $ino && brk=0
done
test $brk -eq 1 && break
sleep 1
done
sleep 15
for ino in $inos; do
inode_exists $ino && echo "$ino still exists"
done
@@ -127,7 +131,7 @@ while [ $SECONDS -lt $END ]; do
done
# trigger eviction deletion of each file in each mount
t_silent_kill $pids
silent_kill $pids
wait || t_fail "handle_fsetxattr failed"