Compare commits

..

1 Commits

Author SHA1 Message Date
Auke Kok
72dc5695a6 Introduce meta_reserve_blocks mount option, default value.
This option adds a mount option, with default value of 16384, that adds
an additional reserve amount of blocks for the meta device.

The default value is 16384, which corresponds to 1GB of space, and
roughly doubles the internal reserve value that is calculated
dynamically from the number of clients/mounts under typical conditions.
It also comprises less than about 2% of the smallest supported meta
device size.

A suggested value for larger deployments is around 256 blocks per GB of
meta device size, i.e. 1/64 of the meta device space, or about 1.6% in
effect.

Customers who are running into issues can adjust their mount options to
increase the value to have a larger safety buffer, or decrease it to
potentially have a way to get out of low space conditions temporarily.
Obviously one would want to increase the value of this option after
resolving the low space condition issues as soon as possible.

Our test suite will run with meta_reserve_blocks=0, so that the behavior
of our tests is functionally unaffected by this change and so it won't
interfere with surfacing and resolving underlying ENOSPC issues.
The addition of this option, however, allows us to artificially create
ENOSPC conditions at will, and we may want to add tests specifically
that do so.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2025-04-17 16:06:33 -04:00
12 changed files with 122 additions and 582 deletions

View File

@@ -39,6 +39,7 @@ enum {
Opt_orphan_scan_delay_ms,
Opt_quorum_heartbeat_timeout_ms,
Opt_quorum_slot_nr,
Opt_meta_reserve_blocks,
Opt_err,
};
@@ -52,6 +53,7 @@ static const match_table_t tokens = {
{Opt_orphan_scan_delay_ms, "orphan_scan_delay_ms=%s"},
{Opt_quorum_heartbeat_timeout_ms, "quorum_heartbeat_timeout_ms=%s"},
{Opt_quorum_slot_nr, "quorum_slot_nr=%s"},
{Opt_meta_reserve_blocks, "meta_reserve_blocks=%s"},
{Opt_err, NULL}
};
@@ -126,6 +128,9 @@ static void free_options(struct scoutfs_mount_options *opts)
#define MIN_DATA_PREALLOC_BLOCKS 1ULL
#define MAX_DATA_PREALLOC_BLOCKS ((unsigned long long)SCOUTFS_BLOCK_SM_MAX)
#define SCOUTFS_META_RESERVE_DEFAULT_BLOCKS 16384
static void init_default_options(struct scoutfs_mount_options *opts)
{
memset(opts, 0, sizeof(*opts));
@@ -136,6 +141,7 @@ static void init_default_options(struct scoutfs_mount_options *opts)
opts->orphan_scan_delay_ms = -1;
opts->quorum_heartbeat_timeout_ms = SCOUTFS_QUORUM_DEF_HB_TIMEO_MS;
opts->quorum_slot_nr = -1;
opts->meta_reserve_blocks = SCOUTFS_META_RESERVE_DEFAULT_BLOCKS;
}
static int verify_log_merge_wait_timeout_ms(struct super_block *sb, int ret, int val)
@@ -167,6 +173,24 @@ static int verify_quorum_heartbeat_timeout_ms(struct super_block *sb, int ret, u
return 0;
}
/*
 * Validate a parsed meta_reserve_blocks mount option value.
 *
 * @ret: result of the preceding match_int() parse, < 0 on parse failure
 * @val: the parsed option value
 *
 * Returns 0 when the value is acceptable, -EINVAL otherwise.
 */
static int verify_meta_reserve_blocks(struct super_block *sb, int ret, int val)
{
	/*
	 * Ideally we'd cap this at something reasonable like 1/2 the
	 * actual total_meta_blocks, but we can't yet get that info when
	 * mount is called.
	 */
	if (ret < 0) {
		scoutfs_err(sb, "failed to parse meta_reserve_blocks value");
		return -EINVAL;
	}

	/*
	 * val is an int so it can never exceed INT_MAX; the old
	 * "val > INT_MAX" comparison was dead code.  Only negative
	 * values need rejecting.
	 */
	if (val < 0) {
		scoutfs_err(sb, "invalid meta_reserve_blocks value %d, must be between 0 and %d",
			    val, INT_MAX);
		return -EINVAL;
	}

	return 0;
}
/*
* Parse the option string into our options struct. This can allocate
@@ -279,6 +303,14 @@ static int parse_options(struct super_block *sb, char *options, struct scoutfs_m
opts->quorum_slot_nr = nr;
break;
case Opt_meta_reserve_blocks:
ret = match_int(args, &nr);
ret = verify_meta_reserve_blocks(sb, ret, nr);
if (ret < 0)
return ret;
opts->meta_reserve_blocks = nr;
break;
default:
scoutfs_err(sb, "Unknown or malformed option, \"%s\"", p);
return -EINVAL;
@@ -371,6 +403,7 @@ int scoutfs_options_show(struct seq_file *seq, struct dentry *root)
seq_printf(seq, ",orphan_scan_delay_ms=%u", opts.orphan_scan_delay_ms);
if (opts.quorum_slot_nr >= 0)
seq_printf(seq, ",quorum_slot_nr=%d", opts.quorum_slot_nr);
seq_printf(seq, ".meta_reserve_blocks=%llu", opts.meta_reserve_blocks);
return 0;
}
@@ -589,6 +622,17 @@ static ssize_t quorum_slot_nr_show(struct kobject *kobj, struct kobj_attribute *
}
SCOUTFS_ATTR_RO(quorum_slot_nr);
/*
 * Sysfs read handler exposing the current meta_reserve_blocks mount
 * option value.  Returns the number of bytes written into buf.
 */
static ssize_t meta_reserve_blocks_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
	struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
	struct scoutfs_mount_options opts;

	scoutfs_options_read(sb, &opts);

	/* meta_reserve_blocks is a u64; use the unsigned conversion (was %lld) */
	return snprintf(buf, PAGE_SIZE, "%llu\n", opts.meta_reserve_blocks);
}
SCOUTFS_ATTR_RO(meta_reserve_blocks);
static struct attribute *options_attrs[] = {
SCOUTFS_ATTR_PTR(data_prealloc_blocks),
SCOUTFS_ATTR_PTR(data_prealloc_contig_only),
@@ -597,6 +641,7 @@ static struct attribute *options_attrs[] = {
SCOUTFS_ATTR_PTR(orphan_scan_delay_ms),
SCOUTFS_ATTR_PTR(quorum_heartbeat_timeout_ms),
SCOUTFS_ATTR_PTR(quorum_slot_nr),
SCOUTFS_ATTR_PTR(meta_reserve_blocks),
NULL,
};

View File

@@ -13,6 +13,7 @@ struct scoutfs_mount_options {
unsigned int orphan_scan_delay_ms;
int quorum_slot_nr;
u64 quorum_heartbeat_timeout_ms;
u64 meta_reserve_blocks;
};
void scoutfs_options_read(struct super_block *sb, struct scoutfs_mount_options *opts);

View File

@@ -772,11 +772,14 @@ static int alloc_move_empty(struct super_block *sb,
u64 scoutfs_server_reserved_meta_blocks(struct super_block *sb)
{
DECLARE_SERVER_INFO(sb, server);
struct scoutfs_mount_options opts;
u64 server_blocks;
u64 client_blocks;
u64 log_blocks;
u64 nr_clients;
scoutfs_options_read(sb, &opts);
/* server has two meta_avail lists it swaps between */
server_blocks = SCOUTFS_SERVER_META_FILL_TARGET * 2;
@@ -801,7 +804,7 @@ u64 scoutfs_server_reserved_meta_blocks(struct super_block *sb)
nr_clients = server->nr_clients;
spin_unlock(&server->lock);
return server_blocks + (max(1ULL, nr_clients) * client_blocks);
return server_blocks + (max(1ULL, nr_clients) * client_blocks) + opts.meta_reserve_blocks;
}
/*
@@ -1299,10 +1302,12 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
* is nested inside holding commits so we recheck the persistent item
* each time we commit to make sure it's still what we think. The
* caller is still going to send the item to the client so we update the
* caller's each time we make progress. If we hit an error applying the
* changes we make then we can't send the log_trees to the client.
* caller's each time we make progress. This is a best-effort attempt
* to clean up and it's valid to leave extents in data_freed we don't
* return errors to the caller. The client will continue the work later
* in get_log_trees or as the rid is reclaimed.
*/
static int try_drain_data_freed(struct super_block *sb, struct scoutfs_log_trees *lt)
static void try_drain_data_freed(struct super_block *sb, struct scoutfs_log_trees *lt)
{
DECLARE_SERVER_INFO(sb, server);
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
@@ -1311,7 +1316,6 @@ static int try_drain_data_freed(struct super_block *sb, struct scoutfs_log_trees
struct scoutfs_log_trees drain;
struct scoutfs_key key;
COMMIT_HOLD(hold);
bool apply = false;
int ret = 0;
int err;
@@ -1320,27 +1324,22 @@ static int try_drain_data_freed(struct super_block *sb, struct scoutfs_log_trees
while (lt->data_freed.total_len != 0) {
server_hold_commit(sb, &hold);
mutex_lock(&server->logs_mutex);
apply = true;
ret = find_log_trees_item(sb, &super->logs_root, false, rid, U64_MAX, &drain);
if (ret < 0) {
ret = 0;
if (ret < 0)
break;
}
/* careful to only keep draining the caller's specific open trans */
if (drain.nr != lt->nr || drain.get_trans_seq != lt->get_trans_seq ||
drain.commit_trans_seq != lt->commit_trans_seq || drain.flags != lt->flags) {
ret = 0;
ret = -ENOENT;
break;
}
ret = scoutfs_btree_dirty(sb, &server->alloc, &server->wri,
&super->logs_root, &key);
if (ret < 0) {
ret = 0;
if (ret < 0)
break;
}
/* moving can modify and return errors, always update caller and item */
mutex_lock(&server->alloc_mutex);
@@ -1356,19 +1355,19 @@ static int try_drain_data_freed(struct super_block *sb, struct scoutfs_log_trees
BUG_ON(err < 0); /* dirtying must guarantee success */
mutex_unlock(&server->logs_mutex);
ret = server_apply_commit(sb, &hold, ret);
apply = false;
if (ret < 0)
if (ret < 0) {
ret = 0; /* don't try to abort, ignoring ret */
break;
}
}
if (apply) {
/* try to cleanly abort and write any partial dirty btree blocks, but ignore result */
if (ret < 0) {
mutex_unlock(&server->logs_mutex);
server_apply_commit(sb, &hold, ret);
server_apply_commit(sb, &hold, 0);
}
return ret;
}
/*
@@ -1576,9 +1575,9 @@ out:
scoutfs_err(sb, "error %d getting log trees for rid %016llx: %s",
ret, rid, err_str);
/* try to drain excessive data_freed with additional commits, if needed */
/* try to drain excessive data_freed with additional commits, if needed, ignoring err */
if (ret == 0)
ret = try_drain_data_freed(sb, &lt);
try_drain_data_freed(sb, &lt);
return scoutfs_net_response(sb, conn, cmd, id, ret, &lt, sizeof(lt));
}
@@ -4153,7 +4152,7 @@ static void fence_pending_recov_worker(struct work_struct *work)
struct server_info *server = container_of(work, struct server_info,
fence_pending_recov_work);
struct super_block *sb = server->sb;
union scoutfs_inet_addr addr = {{0,}};
union scoutfs_inet_addr addr;
u64 rid = 0;
int ret = 0;

View File

@@ -159,58 +159,6 @@ static bool drained_holders(struct trans_info *tri)
return holders == 0;
}
static int commit_current_log_trees(struct super_block *sb, char **str)
{
DECLARE_TRANS_INFO(sb, tri);
return (*str = "data submit", scoutfs_inode_walk_writeback(sb, true)) ?:
(*str = "item dirty", scoutfs_item_write_dirty(sb)) ?:
(*str = "data prepare", scoutfs_data_prepare_commit(sb)) ?:
(*str = "alloc prepare", scoutfs_alloc_prepare_commit(sb, &tri->alloc, &tri->wri)) ?:
(*str = "meta write", scoutfs_block_writer_write(sb, &tri->wri)) ?:
(*str = "data wait", scoutfs_inode_walk_writeback(sb, false)) ?:
(*str = "commit log trees", commit_btrees(sb)) ?:
scoutfs_item_write_done(sb);
}
static int get_next_log_trees(struct super_block *sb, char **str)
{
return (*str = "get log trees", scoutfs_trans_get_log_trees(sb));
}
static int retry_forever(struct super_block *sb, int (*func)(struct super_block *sb, char **str))
{
bool retrying = false;
char *str;
int ret;
do {
str = NULL;
ret = func(sb, &str);
if (ret < 0) {
if (!retrying) {
scoutfs_warn(sb, "critical transaction commit failure: %s = %d, retrying",
str, ret);
retrying = true;
}
if (scoutfs_forcing_unmount(sb)) {
ret = -EIO;
break;
}
msleep(2 * MSEC_PER_SEC);
} else if (retrying) {
scoutfs_info(sb, "retried transaction commit succeeded");
}
} while (ret < 0);
return ret;
}
/*
* This work func is responsible for writing out all the dirty blocks
* that make up the current dirty transaction. It prevents writers from
@@ -236,6 +184,8 @@ void scoutfs_trans_write_func(struct work_struct *work)
struct trans_info *tri = container_of(work, struct trans_info, write_work.work);
struct super_block *sb = tri->sb;
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
bool retrying = false;
char *s = NULL;
int ret = 0;
tri->task = current;
@@ -264,9 +214,37 @@ void scoutfs_trans_write_func(struct work_struct *work)
scoutfs_inc_counter(sb, trans_commit_written);
/* retry {commit,get}_log_trees until they succeeed, can only fail when forcing unmount */
ret = retry_forever(sb, commit_current_log_trees) ?:
retry_forever(sb, get_next_log_trees);
do {
ret = (s = "data submit", scoutfs_inode_walk_writeback(sb, true)) ?:
(s = "item dirty", scoutfs_item_write_dirty(sb)) ?:
(s = "data prepare", scoutfs_data_prepare_commit(sb)) ?:
(s = "alloc prepare", scoutfs_alloc_prepare_commit(sb, &tri->alloc,
&tri->wri)) ?:
(s = "meta write", scoutfs_block_writer_write(sb, &tri->wri)) ?:
(s = "data wait", scoutfs_inode_walk_writeback(sb, false)) ?:
(s = "commit log trees", commit_btrees(sb)) ?:
scoutfs_item_write_done(sb) ?:
(s = "get log trees", scoutfs_trans_get_log_trees(sb));
if (ret < 0) {
if (!retrying) {
scoutfs_warn(sb, "critical transaction commit failure: %s = %d, retrying",
s, ret);
retrying = true;
}
if (scoutfs_forcing_unmount(sb)) {
ret = -EIO;
break;
}
msleep(2 * MSEC_PER_SEC);
} else if (retrying) {
scoutfs_info(sb, "retried transaction commit succeeded");
}
} while (ret < 0);
out:
spin_lock(&tri->write_lock);
tri->write_count++;

View File

@@ -15,8 +15,7 @@ BIN := src/createmany \
src/o_tmpfile_umask \
src/o_tmpfile_linkat \
src/mmap_stress \
src/mmap_validate \
src/walk_inodes_for_estale
src/mmap_validate
DEPS := $(wildcard src/*.d)

View File

@@ -80,15 +80,3 @@ t_compare_output()
{
"$@" >&7 2>&1
}
#
# usually bash prints an annoying output message when jobs
# are killed. We can avoid that by redirecting stderr for
# the bash process when it reaps the jobs that are killed.
#
t_silent_kill() {
exec {ERR}>&2 2>/dev/null
kill "$@"
wait "$@"
exec 2>&$ERR {ERR}>&-
}

View File

@@ -160,9 +160,6 @@ t_filter_dmesg()
re="$re|Pipe handler or fully qualified core dump path required.*"
re="$re|Set kernel.core_pattern before fs.suid_dumpable.*"
# perf warning that it adjusted sample rate
re="$re|perf: interrupt took too long.*lowering kernel.perf_event_max_sample_rate.*"
egrep -v "($re)" | \
ignore_harmless_unwind_kasan_stack_oob
}

View File

@@ -464,6 +464,7 @@ for i in $(seq 0 $((T_NR_MOUNTS - 1))); do
if [ "$i" -lt "$T_QUORUM" ]; then
opts="$opts,quorum_slot_nr=$i"
fi
opts="$opts,meta_reserve_blocks=0"
opts="${opts}${T_MNT_OPTIONS}"
msg "mounting $meta_dev|$data_dev on $dir"
@@ -532,15 +533,12 @@ for t in $tests; do
cmd rm -rf "$T_TMPDIR"
cmd mkdir -p "$T_TMPDIR"
# create a test name dir in the fs, clean up old data as needed
# create a test name dir in the fs
T_DS=""
for i in $(seq 0 $((T_NR_MOUNTS - 1))); do
dir="${T_M[$i]}/test/$test_name"
test $i == 0 && (
test -d "$dir" && cmd rm -rf "$dir"
cmd mkdir -p "$dir"
)
test $i == 0 && cmd mkdir -p "$dir"
eval T_D$i=$dir
T_D[$i]=$dir

View File

@@ -1,464 +0,0 @@
/*
* Copyright (C) 2025 Versity Software, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <limits.h>
#include <signal.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <linux/types.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include "ioctl.h"
#define array_size(arr) (sizeof(arr) / sizeof(arr[0]))
#define FILEID_SCOUTFS 0x81
#define FILEID_SCOUTFS_WITH_PARENT 0x82
static uint64_t meta_seq = 0;
static bool sig_received = false;
static bool tracing_on = false;
static bool exit_on_current = false;
static bool exiting = false;
static uint64_t count = 0;
struct our_handle {
struct file_handle handle;
/*
* scoutfs file handle can be ino or ino/parent. The
* handle_type field of struct file_handle denotes which
* version is in use. We only use the ino variant here.
*/
__le64 scoutfs_ino;
};
/* Print the option summary to stdout and terminate unsuccessfully. */
static void exit_usage(void)
{
	static const char * const usage_lines[] = {
		" -e exit once stable meta_seq has been reached\n",
		" -m <string> scoutfs mount path string for seq walk\n",
		" -s <number> start from meta_seq number, instead of 0\n",
	};
	size_t i;

	for (i = 0; i < sizeof(usage_lines) / sizeof(usage_lines[0]); i++)
		fputs(usage_lines[i], stdout);

	exit(1);
}
/*
 * Write the NUL-terminated string val into the file at path relative to
 * the dirfd tracefd, truncating it first.
 *
 * Returns 0 on success or a positive errno-style value on failure.
 * The old version computed the write() error into ret but then
 * unconditionally returned 0, which silently defeated the ?:-chained
 * error checks in its callers; it also ignored short writes.
 */
static int write_at(int tracefd, char *path, char *val)
{
	size_t len = strlen(val);
	ssize_t bytes;
	int ret = 0;
	int fd;

	fd = openat(tracefd, path, O_TRUNC | O_RDWR);
	if (fd < 0)
		return errno;

	bytes = write(fd, val, len);
	if (bytes < 0)
		ret = errno;
	else if ((size_t)bytes != len)
		ret = EIO;	/* short write: report rather than ignore */

	close(fd);

	return ret;
}
/*
 * Retry the failing open_by_handle_at() for ino with ftrace
 * function_graph tracing enabled; if it fails with ESTALE again, copy
 * the captured trace into a per-inode file in the current directory.
 *
 * fd is a dirfd within the mount.  Returns 0 on success (including the
 * case where the open unexpectedly succeeds) or a positive errno-style
 * value on failure.
 *
 * NOTE(review): pidstr, name, buf and tracefd appear to be leaked on
 * several early error returns; tolerable for a short-lived diagnostic
 * tool, but worth confirming.
 */
static int do_trace(int fd, uint64_t ino)
{
struct our_handle handle;
int tracefd = -1;
int targetfd = -1;
int outfd = -1;
int infd = -1;
char *pidstr;
char *name;
char *buf;
ssize_t bytes;
ssize_t written;
ssize_t off = 0;
unsigned long e = 0;
int ret;
if (asprintf(&pidstr, "%u", getpid()) < 0)
return ENOMEM;
if (asprintf(&name, "trace.scoutfs.open_by_handle_at.ino-%lu", ino) < 0)
return ENOMEM;
buf = malloc(4096);
if (!buf)
return ENOMEM;
/* build the ino-only scoutfs file handle variant */
handle.handle.handle_bytes = sizeof(struct our_handle);
handle.handle.handle_type = FILEID_SCOUTFS;
handle.scoutfs_ino = htole64(ino);
/* keep a quick dirfd around for easy writing sysfs files */
tracefd = open("/sys/kernel/debug/tracing", 0);
if (tracefd < 0)
return errno;
/* start tracing; restrict it to this pid so the graph stays readable */
ret = write_at(tracefd, "current_tracer", "nop") ?:
write_at(tracefd, "current_tracer", "function_graph") ?:
write_at(tracefd, "set_ftrace_pid", pidstr) ?:
write_at(tracefd, "tracing_on", "1");
tracing_on = true;
if (ret)
goto out;
/* the probe itself, traced; errno saved for the ESTALE test below */
targetfd = open_by_handle_at(fd, &handle.handle, O_RDWR);
e = errno;
out:
/* turn off tracing first */
ret = write_at(tracefd, "tracing_on", "0");
if (ret)
return ret;
tracing_on = false;
if (targetfd != -1) {
/* the open succeeded this time; nothing to capture */
close(targetfd);
return 0;
}
if (e == ESTALE) {
/* capture trace */
outfd = open(name, O_CREAT | O_TRUNC | O_RDWR, 0644);
if (outfd < 0) {
fprintf(stderr, "Error opening trace\n");
return errno;
}
infd = openat(tracefd, "trace", O_RDONLY);
if (infd < 0) {
fprintf(stderr, "Error opening trace output\n");
return errno;
}
/* copy the trace buffer out in 4k chunks */
for (;;) {
bytes = pread(infd, buf, 4096, off);
if (bytes < 0)
return errno;
if (bytes == 0)
break;
written = pwrite(outfd, buf, bytes, off);
if (written < 0)
return errno;
if (written != bytes)
return EIO;
off += bytes;
}
close(outfd);
close(infd);
fprintf(stderr, "Wrote \"%s\"\n", name);
}
/* cleanup */
ret = write_at(tracefd, "current_tracer", "nop");
free(pidstr);
free(name);
free(buf);
close(tracefd);
/* collect trace output */
return ret;
}
/*
* lookup path for ino using ino_path
*/
struct ino_args {
char *path;
__u64 ino;
};
/*
 * Resolve a path for ino relative to the mount fd using the scoutfs
 * INO_PATH ioctl.
 *
 * On success *path is either a newly allocated string owned by the
 * caller, or NULL when the inode has no path (ioctl ENOENT).  Returns 0
 * on success or a positive errno-style value on failure.
 *
 * The previous version leaked the ioctl result buffer on the ENOENT,
 * errno and asprintf-failure paths, and treated a legitimate zero-length
 * asprintf() result as ENOMEM.
 */
static int do_resolve(int fd, uint64_t ino, char **path)
{
	struct scoutfs_ioctl_ino_path ioctl_args = {0};
	struct scoutfs_ioctl_ino_path_result *res;
	unsigned int result_bytes;
	int ret = 0;

	result_bytes = offsetof(struct scoutfs_ioctl_ino_path_result,
				path[PATH_MAX]);
	res = malloc(result_bytes);
	if (!res)
		return ENOMEM;

	ioctl_args.ino = ino;
	ioctl_args.dir_ino = 0;
	ioctl_args.dir_pos = 0;
	ioctl_args.result_ptr = (intptr_t)res;
	ioctl_args.result_bytes = result_bytes;

	if (ioctl(fd, SCOUTFS_IOC_INO_PATH, &ioctl_args) < 0) {
		if (errno == ENOENT)
			*path = NULL;	/* unlinked inode; not an error */
		else
			ret = errno;
		goto out;
	}

	/* asprintf() returns the length, so only < 0 is a failure */
	if (asprintf(path, "%.*s", res->path_bytes, res->path) < 0)
		ret = ENOMEM;
out:
	free(res);
	return ret;
}
/*
 * Probe a single inode.  Only inodes whose open_by_handle_at() fails
 * with ESTALE while their path still resolves and stats successfully
 * are interesting; those get re-probed under tracing by do_trace().
 *
 * Returns 0 when the inode is filtered out or handled, or a positive
 * errno-style value on failure.
 */
static int do_test_ino(int fd, uint64_t ino)
{
struct our_handle handle = {{0}};
struct stat sb = {0};
char *path = NULL;
int targetfd = -1;
int ret;
/* filter: open_by_handle_at() must fail */
handle.handle.handle_bytes = sizeof(struct our_handle);
handle.handle.handle_type = FILEID_SCOUTFS;
handle.scoutfs_ino = htole64(ino);
targetfd = open_by_handle_at(fd, &handle.handle, O_RDWR);
if (targetfd != -1) {
close(targetfd);
return 0;
}
/* filter: errno must be ESTALE */
if (errno != ESTALE)
return 0;
/* filter: path resolution succeeds to an actual file entry */
ret = do_resolve(fd, ino, &path);
/*
 * NOTE(review): if do_resolve() fails, path stays NULL and the error
 * return below is never reached, so the error is silently swallowed
 * here — confirm that's intended.
 */
if (path == NULL)
return 0;
if (ret)
return ret;
/* filter: stat() must succeed on resolved path */
ret = fstatat(fd, path, &sb, AT_SYMLINK_NOFOLLOW);
free(path);
if (ret != 0) {
if (errno == ENOENT)
/* doesn't exist */
return 0;
return errno;
}
/* all filters passed: capture a trace of the ESTALE failure */
return do_trace(fd, ino);
}
static uint64_t do_get_meta_seq_stable(int fd)
{
struct scoutfs_ioctl_stat_more stm;
if (ioctl(fd, SCOUTFS_IOC_STAT_MORE, &stm) < 0)
return errno;
return stm.meta_seq;
}
/*
 * Walk inodes by meta_seq index from the global meta_seq cursor up to
 * (but not including) the current stable meta_seq, probing each inode
 * with do_test_ino().  Entries are pulled in batches through the
 * WALK_INODES ioctl, and the walk yields back to the caller after
 * roughly a second of work.
 *
 * Returns 0 when done or interrupted, or a nonzero value on error.
 */
static int do_walk_seq(int fd)
{
struct scoutfs_ioctl_walk_inodes_entry ents[128];
struct scoutfs_ioctl_walk_inodes walk = {{0}};
struct timespec ts;
time_t seconds;
int ret;
uint64_t total = 0;
uint64_t stable;
int i;
int j;
walk.index = SCOUTFS_IOC_WALK_INODES_META_SEQ;
/* make sure not to advance to stable meta_seq, we can just trail behind */
stable = do_get_meta_seq_stable(fd);
if (stable == 0)
return 0;
if (meta_seq >= stable - 1) {
if (exit_on_current)
exiting = true;
return 0;
}
/* resume one past the last seq we processed; a fresh run starts at 0 */
meta_seq = meta_seq ? meta_seq + 1 : 0;
walk.first.major = meta_seq;
walk.first.minor = 0;
walk.first.ino = 0;
walk.last.major = stable - 1;
walk.last.minor = ~0;
walk.last.ino = ~0ULL;
walk.entries_ptr = (unsigned long)ents;
walk.nr_entries = array_size(ents);
clock_gettime(CLOCK_REALTIME, &ts);
seconds = ts.tv_sec;
for (j = 0;; j++) {
if (sig_received)
return 0;
/* ioctl returns the number of entries filled, 0 at end of range */
ret = ioctl(fd, SCOUTFS_IOC_WALK_INODES, &walk);
if (ret < 0)
return ret;
if (ret == 0)
break;
for (i = 0; i < ret; i++) {
meta_seq = ents[i].major;
/* ino 1 is skipped — presumably the root inode, confirm */
if (ents[i].ino == 1)
continue;
/* poke at it */
ret = do_test_ino(fd, ents[i].ino);
count++;
/*
 * NOTE(review): do_test_ino() returns positive errno-style
 * values, so this ret < 0 check never fires for them — confirm
 * whether positive errors should also stop the walk.
 */
if (ret < 0)
return ret;
}
total += i;
/* continue the walk just after the last entry returned */
walk.first = ents[i - 1];
if (++walk.first.ino == 0 && ++walk.first.minor == 0)
walk.first.major++;
/* yield once in a while */
if (j % 32 == 0) {
clock_gettime(CLOCK_REALTIME, &ts);
if (ts.tv_sec > seconds + 1)
break;
}
}
return 0;
}
/*
 * SIGINT/SIGTERM handler: flag the walk loops to stop and, if we were
 * interrupted mid-trace, make a best-effort attempt to turn tracing
 * back off so the ftrace buffer isn't left running.
 *
 * open()/write()/close() are async-signal-safe; the bool flags are
 * plain globals, which is tolerable for this tool's single-flag use.
 */
void handle_signal(int sig)
{
	int tracefd;

	sig_received = true;

	if (!tracing_on)
		return;

	tracefd = open("/sys/kernel/debug/tracing", 0);
	if (tracefd < 0)	/* previously wrote to and closed fd -1 */
		return;
	write_at(tracefd, "tracing_on", "0");
	close(tracefd);
}
/*
 * Repeatedly walk inodes by meta_seq under the mount given with -m,
 * probing each for ESTALE open_by_handle_at() failures, until
 * interrupted or (with -e) until the stable meta_seq is reached.
 */
int main(int argc, char **argv)
{
	char *mnt = NULL;
	int mntfd = -1;
	int ret = 0;	/* was uninitialized if the loop exited immediately */
	int c;

	meta_seq = 0;

	/*
	 * getopt() returns an int; the old "char c" broke the != -1
	 * comparison on platforms where char is unsigned.
	 */
	while ((c = getopt(argc, argv, "+em:s:")) != -1) {
		switch (c) {
		case 'e':
			exit_on_current = true;
			break;
		case 'm':
			mnt = strdup(optarg);
			break;
		case 's':
			meta_seq = strtoull(optarg, NULL, 0);
			break;
		case '?':
			/* optopt holds the offending option char (was optind) */
			printf("unknown argument: %c\n", optopt);
			/* fallthrough */
		case 'h':
			exit_usage();
		}
	}

	if (!mnt) {
		fprintf(stderr, "Must provide a mount point with -m\n");
		exit(EXIT_FAILURE);
	}

	if (meta_seq > 0)
		fprintf(stdout, "Starting from meta_seq = %" PRIu64 "\n", meta_seq);

	/* lower cpu and io priority; failures here are not fatal */
	ret = nice(10);
	if (ret == -1)
		fprintf(stderr, "Error setting nice value\n");
	ret = syscall(SYS_ioprio_set, 1, 0, 0); /* IOPRIO_WHO_PROCESS = 1, IOPRIO_PRIO_CLASS(IOPRIO_CLASS_IDLE) = 0 */
	if (ret == -1)
		fprintf(stderr, "Error setting ioprio value\n");
	ret = 0;

	signal(SIGINT, handle_signal);
	signal(SIGTERM, handle_signal);

	for (;;) {
		if (sig_received)
			break;

		/* reopen each pass so an unmount/remount doesn't strand us */
		mntfd = open(mnt, O_RDONLY);
		if (mntfd == -1) {
			perror("open(mntfd)");
			exit(EXIT_FAILURE);
		}

		ret = do_walk_seq(mntfd);
		close(mntfd);	/* was leaked when breaking out on error */
		mntfd = -1;

		/* handle unmounts? EAGAIN? */
		if (ret)
			break;

		if (exiting)
			break;

		/* yield */
		if (!sig_received)
			sleep(5);
	}

	free(mnt);
	fprintf(stdout, "Last meta_seq = %" PRIu64 "\n", meta_seq);
	if (ret)
		fprintf(stderr, "Error walking inodes: %s(%d)\n", strerror(errno), ret);
	exit(ret);
}

View File

@@ -88,11 +88,6 @@ rm -rf "$SCR/xattrs"
echo "== make sure we can create again"
file="$SCR/file-after"
C=120
while (( C-- )); do
touch $file 2> /dev/null && break
sleep 1
done
touch $file
setfattr -n user.scoutfs-enospc -v 1 "$file"
sync

View File

@@ -38,6 +38,6 @@ while [ "$SECONDS" -lt "$END" ]; do
done
echo "== stopping background load"
t_silent_kill $load_pids
kill $load_pids
t_pass

View File

@@ -5,6 +5,18 @@
t_require_commands sleep touch sync stat handle_cat kill rm
t_require_mounts 2
#
# usually bash prints an annoying output message when jobs
# are killed. We can avoid that by redirecting stderr for
# the bash process when it reaps the jobs that are killed.
#
silent_kill() {
# stash stderr in $ERR and silence it while the jobs are reaped
exec {ERR}>&2 2>/dev/null
kill "$@"
wait "$@"
# restore stderr and drop the stashed fd
exec 2>&$ERR {ERR}>&-
}
#
# We don't have a great way to test that inode items still exist. We
# don't prevent opening handles with nlink 0 today, so we'll use that.
@@ -40,7 +52,7 @@ inode_exists $ino || echo "$ino didn't exist"
echo "== orphan from failed evict deletion is picked up"
# pending kill signal stops evict from getting locks and deleting
t_silent_kill $pid
silent_kill $pid
t_set_sysfs_mount_option 0 orphan_scan_delay_ms 1000
sleep 5
inode_exists $ino && echo "$ino still exists"
@@ -58,7 +70,7 @@ for nr in $(t_fs_nrs); do
rm -f "$path"
done
sync
t_silent_kill $pids
silent_kill $pids
for nr in $(t_fs_nrs); do
t_force_umount $nr
done
@@ -70,15 +82,7 @@ done
# wait for orphan scans to run
t_set_all_sysfs_mount_options orphan_scan_delay_ms 1000
# also have to wait for delayed log merge work from mount
C=120
while (( C-- )); do
brk=1
for ino in $inos; do
inode_exists $ino && brk=0
done
test $brk -eq 1 && break
sleep 1
done
sleep 15
for ino in $inos; do
inode_exists $ino && echo "$ino still exists"
done
@@ -127,7 +131,7 @@ while [ $SECONDS -lt $END ]; do
done
# trigger eviction deletion of each file in each mount
t_silent_kill $pids
silent_kill $pids
wait || t_fail "handle_fsetxattr failed"