mirror of
https://github.com/versity/scoutfs.git
synced 2026-01-10 13:47:27 +00:00
Compare commits
1 Commits
auke/estal
...
auke/meta_
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
72dc5695a6 |
@@ -39,6 +39,7 @@ enum {
|
||||
Opt_orphan_scan_delay_ms,
|
||||
Opt_quorum_heartbeat_timeout_ms,
|
||||
Opt_quorum_slot_nr,
|
||||
Opt_meta_reserve_blocks,
|
||||
Opt_err,
|
||||
};
|
||||
|
||||
@@ -52,6 +53,7 @@ static const match_table_t tokens = {
|
||||
{Opt_orphan_scan_delay_ms, "orphan_scan_delay_ms=%s"},
|
||||
{Opt_quorum_heartbeat_timeout_ms, "quorum_heartbeat_timeout_ms=%s"},
|
||||
{Opt_quorum_slot_nr, "quorum_slot_nr=%s"},
|
||||
{Opt_meta_reserve_blocks, "meta_reserve_blocks=%s"},
|
||||
{Opt_err, NULL}
|
||||
};
|
||||
|
||||
@@ -126,6 +128,9 @@ static void free_options(struct scoutfs_mount_options *opts)
|
||||
#define MIN_DATA_PREALLOC_BLOCKS 1ULL
|
||||
#define MAX_DATA_PREALLOC_BLOCKS ((unsigned long long)SCOUTFS_BLOCK_SM_MAX)
|
||||
|
||||
#define SCOUTFS_META_RESERVE_DEFAULT_BLOCKS 16384
|
||||
|
||||
|
||||
static void init_default_options(struct scoutfs_mount_options *opts)
|
||||
{
|
||||
memset(opts, 0, sizeof(*opts));
|
||||
@@ -136,6 +141,7 @@ static void init_default_options(struct scoutfs_mount_options *opts)
|
||||
opts->orphan_scan_delay_ms = -1;
|
||||
opts->quorum_heartbeat_timeout_ms = SCOUTFS_QUORUM_DEF_HB_TIMEO_MS;
|
||||
opts->quorum_slot_nr = -1;
|
||||
opts->meta_reserve_blocks = SCOUTFS_META_RESERVE_DEFAULT_BLOCKS;
|
||||
}
|
||||
|
||||
static int verify_log_merge_wait_timeout_ms(struct super_block *sb, int ret, int val)
|
||||
@@ -167,6 +173,24 @@ static int verify_quorum_heartbeat_timeout_ms(struct super_block *sb, int ret, u
|
||||
|
||||
return 0;
|
||||
}
|
||||
/*
 * Validate the parsed meta_reserve_blocks mount option.
 *
 * @ret is the result of parsing the option string and @val is the parsed
 * value.  Returns 0 if the value is acceptable, otherwise logs the reason
 * and returns -EINVAL.
 *
 * Ideally we set a limit to something reasonable like 1/2 the actual
 * total_meta_blocks, but we can't yet get this info when mount is called.
 */
static int verify_meta_reserve_blocks(struct super_block *sb, int ret, int val)
{
	if (ret < 0) {
		scoutfs_err(sb, "failed to parse meta_reserve_blocks value");
		return -EINVAL;
	}

	/* an int can never exceed INT_MAX, only the lower bound can be violated */
	if (val < 0) {
		scoutfs_err(sb, "invalid meta_reserve_blocks value %d, must be between 0 and %d",
			    val, INT_MAX);
		return -EINVAL;
	}

	return 0;
}
|
||||
|
||||
/*
|
||||
* Parse the option string into our options struct. This can allocate
|
||||
@@ -279,6 +303,14 @@ static int parse_options(struct super_block *sb, char *options, struct scoutfs_m
|
||||
opts->quorum_slot_nr = nr;
|
||||
break;
|
||||
|
||||
case Opt_meta_reserve_blocks:
|
||||
ret = match_int(args, &nr);
|
||||
ret = verify_meta_reserve_blocks(sb, ret, nr);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
opts->meta_reserve_blocks = nr;
|
||||
break;
|
||||
|
||||
default:
|
||||
scoutfs_err(sb, "Unknown or malformed option, \"%s\"", p);
|
||||
return -EINVAL;
|
||||
@@ -371,6 +403,7 @@ int scoutfs_options_show(struct seq_file *seq, struct dentry *root)
|
||||
seq_printf(seq, ",orphan_scan_delay_ms=%u", opts.orphan_scan_delay_ms);
|
||||
if (opts.quorum_slot_nr >= 0)
|
||||
seq_printf(seq, ",quorum_slot_nr=%d", opts.quorum_slot_nr);
|
||||
/* options are comma separated, like the entries printed before this one */
seq_printf(seq, ",meta_reserve_blocks=%llu", opts.meta_reserve_blocks);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -589,6 +622,17 @@ static ssize_t quorum_slot_nr_show(struct kobject *kobj, struct kobj_attribute *
|
||||
}
|
||||
SCOUTFS_ATTR_RO(quorum_slot_nr);
|
||||
|
||||
static ssize_t meta_reserve_blocks_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
|
||||
{
|
||||
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
|
||||
struct scoutfs_mount_options opts;
|
||||
|
||||
scoutfs_options_read(sb, &opts);
|
||||
|
||||
return snprintf(buf, PAGE_SIZE, "%lld\n", opts.meta_reserve_blocks);
|
||||
}
|
||||
SCOUTFS_ATTR_RO(meta_reserve_blocks);
|
||||
|
||||
static struct attribute *options_attrs[] = {
|
||||
SCOUTFS_ATTR_PTR(data_prealloc_blocks),
|
||||
SCOUTFS_ATTR_PTR(data_prealloc_contig_only),
|
||||
@@ -597,6 +641,7 @@ static struct attribute *options_attrs[] = {
|
||||
SCOUTFS_ATTR_PTR(orphan_scan_delay_ms),
|
||||
SCOUTFS_ATTR_PTR(quorum_heartbeat_timeout_ms),
|
||||
SCOUTFS_ATTR_PTR(quorum_slot_nr),
|
||||
SCOUTFS_ATTR_PTR(meta_reserve_blocks),
|
||||
NULL,
|
||||
};
|
||||
|
||||
|
||||
@@ -13,6 +13,7 @@ struct scoutfs_mount_options {
|
||||
unsigned int orphan_scan_delay_ms;
|
||||
int quorum_slot_nr;
|
||||
u64 quorum_heartbeat_timeout_ms;
|
||||
u64 meta_reserve_blocks;
|
||||
};
|
||||
|
||||
void scoutfs_options_read(struct super_block *sb, struct scoutfs_mount_options *opts);
|
||||
|
||||
@@ -772,11 +772,14 @@ static int alloc_move_empty(struct super_block *sb,
|
||||
u64 scoutfs_server_reserved_meta_blocks(struct super_block *sb)
|
||||
{
|
||||
DECLARE_SERVER_INFO(sb, server);
|
||||
struct scoutfs_mount_options opts;
|
||||
u64 server_blocks;
|
||||
u64 client_blocks;
|
||||
u64 log_blocks;
|
||||
u64 nr_clients;
|
||||
|
||||
scoutfs_options_read(sb, &opts);
|
||||
|
||||
/* server has two meta_avail lists it swaps between */
|
||||
server_blocks = SCOUTFS_SERVER_META_FILL_TARGET * 2;
|
||||
|
||||
@@ -801,7 +804,7 @@ u64 scoutfs_server_reserved_meta_blocks(struct super_block *sb)
|
||||
nr_clients = server->nr_clients;
|
||||
spin_unlock(&server->lock);
|
||||
|
||||
return server_blocks + (max(1ULL, nr_clients) * client_blocks);
|
||||
return server_blocks + (max(1ULL, nr_clients) * client_blocks) + opts.meta_reserve_blocks;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1299,10 +1302,12 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
|
||||
* is nested inside holding commits so we recheck the persistent item
|
||||
* each time we commit to make sure it's still what we think. The
|
||||
* caller is still going to send the item to the client so we update the
|
||||
* caller's each time we make progress. If we hit an error applying the
|
||||
* changes we make then we can't send the log_trees to the client.
|
||||
* caller's each time we make progress. This is a best-effort attempt
|
||||
* to clean up and it's valid to leave extents in data_freed, so we don't
|
||||
* return errors to the caller. The client will continue the work later
|
||||
* in get_log_trees or as the rid is reclaimed.
|
||||
*/
|
||||
static int try_drain_data_freed(struct super_block *sb, struct scoutfs_log_trees *lt)
|
||||
static void try_drain_data_freed(struct super_block *sb, struct scoutfs_log_trees *lt)
|
||||
{
|
||||
DECLARE_SERVER_INFO(sb, server);
|
||||
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
|
||||
@@ -1311,7 +1316,6 @@ static int try_drain_data_freed(struct super_block *sb, struct scoutfs_log_trees
|
||||
struct scoutfs_log_trees drain;
|
||||
struct scoutfs_key key;
|
||||
COMMIT_HOLD(hold);
|
||||
bool apply = false;
|
||||
int ret = 0;
|
||||
int err;
|
||||
|
||||
@@ -1320,27 +1324,22 @@ static int try_drain_data_freed(struct super_block *sb, struct scoutfs_log_trees
|
||||
while (lt->data_freed.total_len != 0) {
|
||||
server_hold_commit(sb, &hold);
|
||||
mutex_lock(&server->logs_mutex);
|
||||
apply = true;
|
||||
|
||||
ret = find_log_trees_item(sb, &super->logs_root, false, rid, U64_MAX, &drain);
|
||||
if (ret < 0) {
|
||||
ret = 0;
|
||||
if (ret < 0)
|
||||
break;
|
||||
}
|
||||
|
||||
/* careful to only keep draining the caller's specific open trans */
|
||||
if (drain.nr != lt->nr || drain.get_trans_seq != lt->get_trans_seq ||
|
||||
drain.commit_trans_seq != lt->commit_trans_seq || drain.flags != lt->flags) {
|
||||
ret = 0;
|
||||
ret = -ENOENT;
|
||||
break;
|
||||
}
|
||||
|
||||
ret = scoutfs_btree_dirty(sb, &server->alloc, &server->wri,
|
||||
&super->logs_root, &key);
|
||||
if (ret < 0) {
|
||||
ret = 0;
|
||||
if (ret < 0)
|
||||
break;
|
||||
}
|
||||
|
||||
/* moving can modify and return errors, always update caller and item */
|
||||
mutex_lock(&server->alloc_mutex);
|
||||
@@ -1356,19 +1355,19 @@ static int try_drain_data_freed(struct super_block *sb, struct scoutfs_log_trees
|
||||
BUG_ON(err < 0); /* dirtying must guarantee success */
|
||||
|
||||
mutex_unlock(&server->logs_mutex);
|
||||
|
||||
ret = server_apply_commit(sb, &hold, ret);
|
||||
apply = false;
|
||||
|
||||
if (ret < 0)
|
||||
if (ret < 0) {
|
||||
ret = 0; /* don't try to abort, ignoring ret */
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (apply) {
|
||||
/* try to cleanly abort and write any partial dirty btree blocks, but ignore result */
|
||||
if (ret < 0) {
|
||||
mutex_unlock(&server->logs_mutex);
|
||||
server_apply_commit(sb, &hold, ret);
|
||||
server_apply_commit(sb, &hold, 0);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1576,9 +1575,9 @@ out:
|
||||
scoutfs_err(sb, "error %d getting log trees for rid %016llx: %s",
|
||||
ret, rid, err_str);
|
||||
|
||||
/* try to drain excessive data_freed with additional commits, if needed */
|
||||
/* try to drain excessive data_freed with additional commits, if needed, ignoring err */
|
||||
if (ret == 0)
|
||||
ret = try_drain_data_freed(sb, <);
|
||||
try_drain_data_freed(sb, <);
|
||||
|
||||
return scoutfs_net_response(sb, conn, cmd, id, ret, <, sizeof(lt));
|
||||
}
|
||||
@@ -4153,7 +4152,7 @@ static void fence_pending_recov_worker(struct work_struct *work)
|
||||
struct server_info *server = container_of(work, struct server_info,
|
||||
fence_pending_recov_work);
|
||||
struct super_block *sb = server->sb;
|
||||
union scoutfs_inet_addr addr = {{0,}};
|
||||
union scoutfs_inet_addr addr;
|
||||
u64 rid = 0;
|
||||
int ret = 0;
|
||||
|
||||
|
||||
@@ -159,58 +159,6 @@ static bool drained_holders(struct trans_info *tri)
|
||||
return holders == 0;
|
||||
}
|
||||
|
||||
static int commit_current_log_trees(struct super_block *sb, char **str)
|
||||
{
|
||||
DECLARE_TRANS_INFO(sb, tri);
|
||||
|
||||
return (*str = "data submit", scoutfs_inode_walk_writeback(sb, true)) ?:
|
||||
(*str = "item dirty", scoutfs_item_write_dirty(sb)) ?:
|
||||
(*str = "data prepare", scoutfs_data_prepare_commit(sb)) ?:
|
||||
(*str = "alloc prepare", scoutfs_alloc_prepare_commit(sb, &tri->alloc, &tri->wri)) ?:
|
||||
(*str = "meta write", scoutfs_block_writer_write(sb, &tri->wri)) ?:
|
||||
(*str = "data wait", scoutfs_inode_walk_writeback(sb, false)) ?:
|
||||
(*str = "commit log trees", commit_btrees(sb)) ?:
|
||||
scoutfs_item_write_done(sb);
|
||||
}
|
||||
|
||||
/*
 * Fetch the next log trees, naming the step in *str for the caller's
 * error reporting.  Returns scoutfs_trans_get_log_trees()'s result.
 */
static int get_next_log_trees(struct super_block *sb, char **str)
{
	*str = "get log trees";

	return scoutfs_trans_get_log_trees(sb);
}
|
||||
|
||||
static int retry_forever(struct super_block *sb, int (*func)(struct super_block *sb, char **str))
|
||||
{
|
||||
bool retrying = false;
|
||||
char *str;
|
||||
int ret;
|
||||
|
||||
do {
|
||||
str = NULL;
|
||||
|
||||
ret = func(sb, &str);
|
||||
if (ret < 0) {
|
||||
if (!retrying) {
|
||||
scoutfs_warn(sb, "critical transaction commit failure: %s = %d, retrying",
|
||||
str, ret);
|
||||
retrying = true;
|
||||
}
|
||||
|
||||
if (scoutfs_forcing_unmount(sb)) {
|
||||
ret = -EIO;
|
||||
break;
|
||||
}
|
||||
|
||||
msleep(2 * MSEC_PER_SEC);
|
||||
|
||||
} else if (retrying) {
|
||||
scoutfs_info(sb, "retried transaction commit succeeded");
|
||||
}
|
||||
|
||||
} while (ret < 0);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* This work func is responsible for writing out all the dirty blocks
|
||||
* that make up the current dirty transaction. It prevents writers from
|
||||
@@ -236,6 +184,8 @@ void scoutfs_trans_write_func(struct work_struct *work)
|
||||
struct trans_info *tri = container_of(work, struct trans_info, write_work.work);
|
||||
struct super_block *sb = tri->sb;
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
bool retrying = false;
|
||||
char *s = NULL;
|
||||
int ret = 0;
|
||||
|
||||
tri->task = current;
|
||||
@@ -264,9 +214,37 @@ void scoutfs_trans_write_func(struct work_struct *work)
|
||||
|
||||
scoutfs_inc_counter(sb, trans_commit_written);
|
||||
|
||||
/* retry {commit,get}_log_trees until they succeed, can only fail when forcing unmount */
|
||||
ret = retry_forever(sb, commit_current_log_trees) ?:
|
||||
retry_forever(sb, get_next_log_trees);
|
||||
do {
|
||||
ret = (s = "data submit", scoutfs_inode_walk_writeback(sb, true)) ?:
|
||||
(s = "item dirty", scoutfs_item_write_dirty(sb)) ?:
|
||||
(s = "data prepare", scoutfs_data_prepare_commit(sb)) ?:
|
||||
(s = "alloc prepare", scoutfs_alloc_prepare_commit(sb, &tri->alloc,
|
||||
&tri->wri)) ?:
|
||||
(s = "meta write", scoutfs_block_writer_write(sb, &tri->wri)) ?:
|
||||
(s = "data wait", scoutfs_inode_walk_writeback(sb, false)) ?:
|
||||
(s = "commit log trees", commit_btrees(sb)) ?:
|
||||
scoutfs_item_write_done(sb) ?:
|
||||
(s = "get log trees", scoutfs_trans_get_log_trees(sb));
|
||||
if (ret < 0) {
|
||||
if (!retrying) {
|
||||
scoutfs_warn(sb, "critical transaction commit failure: %s = %d, retrying",
|
||||
s, ret);
|
||||
retrying = true;
|
||||
}
|
||||
|
||||
if (scoutfs_forcing_unmount(sb)) {
|
||||
ret = -EIO;
|
||||
break;
|
||||
}
|
||||
|
||||
msleep(2 * MSEC_PER_SEC);
|
||||
|
||||
} else if (retrying) {
|
||||
scoutfs_info(sb, "retried transaction commit succeeded");
|
||||
}
|
||||
|
||||
} while (ret < 0);
|
||||
|
||||
out:
|
||||
spin_lock(&tri->write_lock);
|
||||
tri->write_count++;
|
||||
|
||||
@@ -15,8 +15,7 @@ BIN := src/createmany \
|
||||
src/o_tmpfile_umask \
|
||||
src/o_tmpfile_linkat \
|
||||
src/mmap_stress \
|
||||
src/mmap_validate \
|
||||
src/walk_inodes_for_estale
|
||||
src/mmap_validate
|
||||
|
||||
DEPS := $(wildcard src/*.d)
|
||||
|
||||
|
||||
@@ -80,15 +80,3 @@ t_compare_output()
|
||||
{
|
||||
"$@" >&7 2>&1
|
||||
}
|
||||
|
||||
#
|
||||
# usually bash prints an annoying output message when jobs
|
||||
# are killed. We can avoid that by redirecting stderr for
|
||||
# the bash process when it reaps the jobs that are killed.
|
||||
#
|
||||
# Kill the given jobs/pids and reap them without bash's job-termination
# message reaching the terminal.  All arguments are passed to both kill
# and wait.
t_silent_kill() {
|
||||
# duplicate stderr onto a new fd whose number is stored in ERR, then silence stderr
exec {ERR}>&2 2>/dev/null
|
||||
kill "$@"
|
||||
# reap the killed jobs while stderr is still redirected
wait "$@"
|
||||
# restore stderr from the saved fd and close the saved fd
exec 2>&$ERR {ERR}>&-
|
||||
}
|
||||
|
||||
@@ -160,9 +160,6 @@ t_filter_dmesg()
|
||||
re="$re|Pipe handler or fully qualified core dump path required.*"
|
||||
re="$re|Set kernel.core_pattern before fs.suid_dumpable.*"
|
||||
|
||||
# perf warning that it adjusted sample rate
|
||||
re="$re|perf: interrupt took too long.*lowering kernel.perf_event_max_sample_rate.*"
|
||||
|
||||
egrep -v "($re)" | \
|
||||
ignore_harmless_unwind_kasan_stack_oob
|
||||
}
|
||||
|
||||
@@ -464,6 +464,7 @@ for i in $(seq 0 $((T_NR_MOUNTS - 1))); do
|
||||
if [ "$i" -lt "$T_QUORUM" ]; then
|
||||
opts="$opts,quorum_slot_nr=$i"
|
||||
fi
|
||||
opts="$opts,meta_reserve_blocks=0"
|
||||
opts="${opts}${T_MNT_OPTIONS}"
|
||||
|
||||
msg "mounting $meta_dev|$data_dev on $dir"
|
||||
@@ -532,15 +533,12 @@ for t in $tests; do
|
||||
cmd rm -rf "$T_TMPDIR"
|
||||
cmd mkdir -p "$T_TMPDIR"
|
||||
|
||||
# create a test name dir in the fs, clean up old data as needed
|
||||
# create a test name dir in the fs
|
||||
T_DS=""
|
||||
for i in $(seq 0 $((T_NR_MOUNTS - 1))); do
|
||||
dir="${T_M[$i]}/test/$test_name"
|
||||
|
||||
test $i == 0 && (
|
||||
test -d "$dir" && cmd rm -rf "$dir"
|
||||
cmd mkdir -p "$dir"
|
||||
)
|
||||
test $i == 0 && cmd mkdir -p "$dir"
|
||||
|
||||
eval T_D$i=$dir
|
||||
T_D[$i]=$dir
|
||||
|
||||
@@ -1,464 +0,0 @@
|
||||
|
||||
/*
|
||||
* Copyright (C) 2025 Versity Software, Inc. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License v2 as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*/
|
||||
|
||||
#ifndef _GNU_SOURCE
|
||||
#define _GNU_SOURCE
|
||||
#endif
|
||||
#include <errno.h>
|
||||
#include <fcntl.h>
|
||||
#include <inttypes.h>
|
||||
#include <limits.h>
|
||||
#include <signal.h>
|
||||
#include <stdbool.h>
|
||||
#include <stddef.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
#include <unistd.h>
|
||||
#include <linux/types.h>
|
||||
#include <sys/ioctl.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/syscall.h>
|
||||
|
||||
#include "ioctl.h"
|
||||
|
||||
#define array_size(arr) (sizeof(arr) / sizeof(arr[0]))
|
||||
|
||||
#define FILEID_SCOUTFS 0x81
|
||||
#define FILEID_SCOUTFS_WITH_PARENT 0x82
|
||||
|
||||
static uint64_t meta_seq = 0;
|
||||
static bool sig_received = false;
|
||||
static bool tracing_on = false;
|
||||
static bool exit_on_current = false;
|
||||
static bool exiting = false;
|
||||
static uint64_t count = 0;
|
||||
|
||||
/*
 * A struct file_handle followed by the scoutfs-specific payload.
 *
 * A scoutfs file handle carries either an ino or an ino/parent pair, and
 * the handle_type field of struct file_handle says which variant is in
 * use.  Only the ino variant is used here.
 */
struct our_handle {
	struct file_handle handle;
	__le64 scoutfs_ino;
};
|
||||
|
||||
/* Print the option summary to stdout and exit with status 1. */
static void exit_usage(void)
{
	static const char usage[] =
		" -e exit once stable meta_seq has been reached\n"
		" -m <string> scoutfs mount path string for seq walk\n"
		" -s <number> start from meta_seq number, instead of 0\n";

	fputs(usage, stdout);
	exit(1);
}
|
||||
|
||||
/*
 * Truncate the existing file @path, resolved relative to the directory fd
 * @tracefd, and write the string @val to it.
 *
 * Returns 0 on success or a positive errno value: the openat()/write()
 * errno on failure, or EIO if the write was short.
 */
static int write_at(int tracefd, char *path, char *val)
{
	size_t len = strlen(val);
	ssize_t written;
	int ret = 0;
	int fd;

	fd = openat(tracefd, path, O_TRUNC | O_RDWR);
	if (fd < 0)
		return errno;

	written = write(fd, val, len);
	if (written < 0)
		ret = errno;	/* saved before close() can overwrite errno */
	else if ((size_t)written != len)
		ret = EIO;

	close(fd);

	return ret;
}
|
||||
|
||||
static int do_trace(int fd, uint64_t ino)
|
||||
{
|
||||
struct our_handle handle;
|
||||
int tracefd = -1;
|
||||
int targetfd = -1;
|
||||
int outfd = -1;
|
||||
int infd = -1;
|
||||
char *pidstr;
|
||||
char *name;
|
||||
char *buf;
|
||||
ssize_t bytes;
|
||||
ssize_t written;
|
||||
ssize_t off = 0;
|
||||
unsigned long e = 0;
|
||||
int ret;
|
||||
|
||||
if (asprintf(&pidstr, "%u", getpid()) < 0)
|
||||
return ENOMEM;
|
||||
|
||||
if (asprintf(&name, "trace.scoutfs.open_by_handle_at.ino-%lu", ino) < 0)
|
||||
return ENOMEM;
|
||||
|
||||
buf = malloc(4096);
|
||||
if (!buf)
|
||||
return ENOMEM;
|
||||
|
||||
handle.handle.handle_bytes = sizeof(struct our_handle);
|
||||
handle.handle.handle_type = FILEID_SCOUTFS;
|
||||
handle.scoutfs_ino = htole64(ino);
|
||||
|
||||
/* keep a quick dirfd around for easy writing sysfs files */
|
||||
tracefd = open("/sys/kernel/debug/tracing", 0);
|
||||
if (tracefd < 0)
|
||||
return errno;
|
||||
|
||||
/* start tracing */
|
||||
ret = write_at(tracefd, "current_tracer", "nop") ?:
|
||||
write_at(tracefd, "current_tracer", "function_graph") ?:
|
||||
write_at(tracefd, "set_ftrace_pid", pidstr) ?:
|
||||
write_at(tracefd, "tracing_on", "1");
|
||||
|
||||
tracing_on = true;
|
||||
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
targetfd = open_by_handle_at(fd, &handle.handle, O_RDWR);
|
||||
e = errno;
|
||||
|
||||
out:
|
||||
/* turn off tracing first */
|
||||
ret = write_at(tracefd, "tracing_on", "0");
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
tracing_on = false;
|
||||
|
||||
if (targetfd != -1) {
|
||||
close(targetfd);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (e == ESTALE) {
|
||||
/* capture trace */
|
||||
outfd = open(name, O_CREAT | O_TRUNC | O_RDWR, 0644);
|
||||
if (outfd < 0) {
|
||||
fprintf(stderr, "Error opening trace\n");
|
||||
return errno;
|
||||
}
|
||||
infd = openat(tracefd, "trace", O_RDONLY);
|
||||
if (infd < 0) {
|
||||
fprintf(stderr, "Error opening trace output\n");
|
||||
return errno;
|
||||
}
|
||||
for (;;) {
|
||||
bytes = pread(infd, buf, 4096, off);
|
||||
if (bytes < 0)
|
||||
return errno;
|
||||
if (bytes == 0)
|
||||
break;
|
||||
written = pwrite(outfd, buf, bytes, off);
|
||||
if (written < 0)
|
||||
return errno;
|
||||
if (written != bytes)
|
||||
return EIO;
|
||||
off += bytes;
|
||||
}
|
||||
close(outfd);
|
||||
close(infd);
|
||||
|
||||
fprintf(stderr, "Wrote \"%s\"\n", name);
|
||||
}
|
||||
|
||||
/* cleanup */
|
||||
ret = write_at(tracefd, "current_tracer", "nop");
|
||||
|
||||
free(pidstr);
|
||||
free(name);
|
||||
free(buf);
|
||||
close(tracefd);
|
||||
/* collect trace output */
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* lookup path for ino using ino_path
|
||||
*/
|
||||
/*
 * Pairs an inode number with a path string.  Nothing in the rest of this
 * program references this struct.
 */
struct ino_args {
	char *path;	/* path string */
	__u64 ino;	/* inode number */
};
|
||||
|
||||
static int do_resolve(int fd, uint64_t ino, char **path)
|
||||
{
|
||||
struct scoutfs_ioctl_ino_path ioctl_args = {0};
|
||||
struct scoutfs_ioctl_ino_path_result *res;
|
||||
unsigned int result_bytes;
|
||||
int ret;
|
||||
|
||||
result_bytes = offsetof(struct scoutfs_ioctl_ino_path_result,
|
||||
path[PATH_MAX]);
|
||||
|
||||
res = malloc(result_bytes);
|
||||
if (!res)
|
||||
return ENOMEM;
|
||||
|
||||
ioctl_args.ino = ino;
|
||||
ioctl_args.dir_ino = 0;
|
||||
ioctl_args.dir_pos = 0;
|
||||
ioctl_args.result_ptr = (intptr_t)res;
|
||||
ioctl_args.result_bytes = result_bytes;
|
||||
|
||||
ret = ioctl(fd, SCOUTFS_IOC_INO_PATH, &ioctl_args);
|
||||
if (ret < 0) {
|
||||
if (errno == ENOENT) {
|
||||
*path = NULL;
|
||||
return 0;
|
||||
}
|
||||
return errno;
|
||||
}
|
||||
|
||||
ret = asprintf(path, "%.*s", res->path_bytes, res->path);
|
||||
if (ret <= 0)
|
||||
return ENOMEM;
|
||||
|
||||
free(res);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int do_test_ino(int fd, uint64_t ino)
|
||||
{
|
||||
struct our_handle handle = {{0}};
|
||||
struct stat sb = {0};
|
||||
char *path = NULL;
|
||||
int targetfd = -1;
|
||||
int ret;
|
||||
|
||||
/* filter: open_by_handle_at() must fail */
|
||||
handle.handle.handle_bytes = sizeof(struct our_handle);
|
||||
handle.handle.handle_type = FILEID_SCOUTFS;
|
||||
handle.scoutfs_ino = htole64(ino);
|
||||
|
||||
targetfd = open_by_handle_at(fd, &handle.handle, O_RDWR);
|
||||
if (targetfd != -1) {
|
||||
close(targetfd);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* filter: errno must be ESTALE */
|
||||
if (errno != ESTALE)
|
||||
return 0;
|
||||
|
||||
/* filter: path resolution succeeds to an actual file entry */
|
||||
ret = do_resolve(fd, ino, &path);
|
||||
if (path == NULL)
|
||||
return 0;
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
/* filter: stat() must succeed on resolved path */
|
||||
ret = fstatat(fd, path, &sb, AT_SYMLINK_NOFOLLOW);
|
||||
free(path);
|
||||
if (ret != 0) {
|
||||
if (errno == ENOENT)
|
||||
/* doesn't exist */
|
||||
return 0;
|
||||
return errno;
|
||||
}
|
||||
|
||||
return do_trace(fd, ino);
|
||||
}
|
||||
|
||||
static uint64_t do_get_meta_seq_stable(int fd)
|
||||
{
|
||||
struct scoutfs_ioctl_stat_more stm;
|
||||
|
||||
if (ioctl(fd, SCOUTFS_IOC_STAT_MORE, &stm) < 0)
|
||||
return errno;
|
||||
|
||||
return stm.meta_seq;
|
||||
}
|
||||
|
||||
static int do_walk_seq(int fd)
|
||||
{
|
||||
struct scoutfs_ioctl_walk_inodes_entry ents[128];
|
||||
struct scoutfs_ioctl_walk_inodes walk = {{0}};
|
||||
struct timespec ts;
|
||||
time_t seconds;
|
||||
int ret;
|
||||
uint64_t total = 0;
|
||||
uint64_t stable;
|
||||
int i;
|
||||
int j;
|
||||
|
||||
walk.index = SCOUTFS_IOC_WALK_INODES_META_SEQ;
|
||||
|
||||
/* make sure not to advance to stable meta_seq, we can just trail behind */
|
||||
stable = do_get_meta_seq_stable(fd);
|
||||
if (stable == 0)
|
||||
return 0;
|
||||
if (meta_seq >= stable - 1) {
|
||||
if (exit_on_current)
|
||||
exiting = true;
|
||||
return 0;
|
||||
}
|
||||
|
||||
meta_seq = meta_seq ? meta_seq + 1 : 0;
|
||||
|
||||
walk.first.major = meta_seq;
|
||||
walk.first.minor = 0;
|
||||
walk.first.ino = 0;
|
||||
|
||||
walk.last.major = stable - 1;
|
||||
walk.last.minor = ~0;
|
||||
walk.last.ino = ~0ULL;
|
||||
|
||||
walk.entries_ptr = (unsigned long)ents;
|
||||
walk.nr_entries = array_size(ents);
|
||||
|
||||
clock_gettime(CLOCK_REALTIME, &ts);
|
||||
seconds = ts.tv_sec;
|
||||
|
||||
for (j = 0;; j++) {
|
||||
if (sig_received)
|
||||
return 0;
|
||||
|
||||
ret = ioctl(fd, SCOUTFS_IOC_WALK_INODES, &walk);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
if (ret == 0)
|
||||
break;
|
||||
|
||||
for (i = 0; i < ret; i++) {
|
||||
meta_seq = ents[i].major;
|
||||
if (ents[i].ino == 1)
|
||||
continue;
|
||||
|
||||
/* poke at it */
|
||||
ret = do_test_ino(fd, ents[i].ino);
|
||||
|
||||
count++;
|
||||
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
}
|
||||
|
||||
total += i;
|
||||
|
||||
walk.first = ents[i - 1];
|
||||
if (++walk.first.ino == 0 && ++walk.first.minor == 0)
|
||||
walk.first.major++;
|
||||
|
||||
/* yield once in a while */
|
||||
if (j % 32 == 0) {
|
||||
clock_gettime(CLOCK_REALTIME, &ts);
|
||||
if (ts.tv_sec > seconds + 1)
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void handle_signal(int sig)
|
||||
{
|
||||
int tracefd = -1;
|
||||
|
||||
sig_received = true;
|
||||
|
||||
if (!tracing_on)
|
||||
return;
|
||||
|
||||
tracefd = open("/sys/kernel/debug/tracing", 0);
|
||||
write_at(tracefd, "tracing_on", "0");
|
||||
close(tracefd);
|
||||
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
char *mnt = NULL;
|
||||
char c;
|
||||
int mntfd;
|
||||
int ret;
|
||||
|
||||
meta_seq = 0;
|
||||
|
||||
/* All we need is the mount point arg */
|
||||
while ((c = getopt(argc, argv, "+em:s:")) != -1) {
|
||||
switch (c) {
|
||||
case 'e':
|
||||
exit_on_current = true;
|
||||
break;
|
||||
case 'm':
|
||||
mnt = strdup(optarg);
|
||||
break;
|
||||
case 's':
|
||||
meta_seq = strtoull(optarg, NULL, 0);
|
||||
break;
|
||||
case '?':
|
||||
printf("unknown argument: %c\n", optind);
|
||||
case 'h':
|
||||
exit_usage();
|
||||
}
|
||||
}
|
||||
|
||||
if (!mnt) {
|
||||
fprintf(stderr, "Must provide a mount point with -m\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
if (meta_seq > 0)
|
||||
fprintf(stdout, "Starting from meta_seq = %lu\n", meta_seq);
|
||||
|
||||
/* lower prio */
|
||||
ret = nice(10);
|
||||
if (ret == -1)
|
||||
fprintf(stderr, "Error setting nice value\n");
|
||||
ret = syscall(SYS_ioprio_set, 1, 0, 0); /* IOPRIO_WHO_PROCESS = 1, IOPRIO_PRIO_CLASS(IOPRIO_CLASS_IDLE) = 0 */
|
||||
if (ret == -1)
|
||||
fprintf(stderr, "Error setting ioprio value\n");
|
||||
|
||||
signal(SIGINT, handle_signal);
|
||||
signal(SIGTERM, handle_signal);
|
||||
|
||||
for (;;) {
|
||||
if (sig_received)
|
||||
break;
|
||||
|
||||
mntfd = open(mnt, O_RDONLY);
|
||||
if (mntfd == -1) {
|
||||
perror("open(mntfd)");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
ret = do_walk_seq(mntfd);
|
||||
/* handle unmounts? EAGAIN? */
|
||||
if (ret)
|
||||
break;
|
||||
|
||||
close(mntfd);
|
||||
|
||||
if (exiting)
|
||||
break;
|
||||
|
||||
/* yield */
|
||||
if (!sig_received)
|
||||
sleep(5);
|
||||
}
|
||||
|
||||
free(mnt);
|
||||
|
||||
fprintf(stdout, "Last meta_seq = %lu\n", meta_seq);
|
||||
|
||||
if (ret)
|
||||
fprintf(stderr, "Error walking inodes: %s(%d)\n", strerror(errno), ret);
|
||||
|
||||
exit(ret);
|
||||
}
|
||||
@@ -88,11 +88,6 @@ rm -rf "$SCR/xattrs"
|
||||
|
||||
echo "== make sure we can create again"
|
||||
file="$SCR/file-after"
|
||||
C=120
|
||||
while (( C-- )); do
|
||||
touch $file 2> /dev/null && break
|
||||
sleep 1
|
||||
done
|
||||
touch $file
|
||||
setfattr -n user.scoutfs-enospc -v 1 "$file"
|
||||
sync
|
||||
|
||||
@@ -38,6 +38,6 @@ while [ "$SECONDS" -lt "$END" ]; do
|
||||
done
|
||||
|
||||
echo "== stopping background load"
|
||||
t_silent_kill $load_pids
|
||||
kill $load_pids
|
||||
|
||||
t_pass
|
||||
|
||||
@@ -5,6 +5,18 @@
|
||||
t_require_commands sleep touch sync stat handle_cat kill rm
|
||||
t_require_mounts 2
|
||||
|
||||
#
|
||||
# usually bash prints an annoying output message when jobs
|
||||
# are killed. We can avoid that by redirecting stderr for
|
||||
# the bash process when it reaps the jobs that are killed.
|
||||
#
|
||||
# Kill the given jobs/pids and reap them without bash's job-termination
# message reaching the terminal.  All arguments are passed to both kill
# and wait.
silent_kill() {
|
||||
# duplicate stderr onto a new fd whose number is stored in ERR, then silence stderr
exec {ERR}>&2 2>/dev/null
|
||||
kill "$@"
|
||||
# reap the killed jobs while stderr is still redirected
wait "$@"
|
||||
# restore stderr from the saved fd and close the saved fd
exec 2>&$ERR {ERR}>&-
|
||||
}
|
||||
|
||||
#
|
||||
# We don't have a great way to test that inode items still exist. We
|
||||
# don't prevent opening handles with nlink 0 today, so we'll use that.
|
||||
@@ -40,7 +52,7 @@ inode_exists $ino || echo "$ino didn't exist"
|
||||
|
||||
echo "== orphan from failed evict deletion is picked up"
|
||||
# pending kill signal stops evict from getting locks and deleting
|
||||
t_silent_kill $pid
|
||||
silent_kill $pid
|
||||
t_set_sysfs_mount_option 0 orphan_scan_delay_ms 1000
|
||||
sleep 5
|
||||
inode_exists $ino && echo "$ino still exists"
|
||||
@@ -58,7 +70,7 @@ for nr in $(t_fs_nrs); do
|
||||
rm -f "$path"
|
||||
done
|
||||
sync
|
||||
t_silent_kill $pids
|
||||
silent_kill $pids
|
||||
for nr in $(t_fs_nrs); do
|
||||
t_force_umount $nr
|
||||
done
|
||||
@@ -70,15 +82,7 @@ done
|
||||
# wait for orphan scans to run
|
||||
t_set_all_sysfs_mount_options orphan_scan_delay_ms 1000
|
||||
# also have to wait for delayed log merge work from mount
|
||||
C=120
|
||||
while (( C-- )); do
|
||||
brk=1
|
||||
for ino in $inos; do
|
||||
inode_exists $ino && brk=0
|
||||
done
|
||||
test $brk -eq 1 && break
|
||||
sleep 1
|
||||
done
|
||||
sleep 15
|
||||
for ino in $inos; do
|
||||
inode_exists $ino && echo "$ino still exists"
|
||||
done
|
||||
@@ -127,7 +131,7 @@ while [ $SECONDS -lt $END ]; do
|
||||
done
|
||||
|
||||
# trigger eviction deletion of each file in each mount
|
||||
t_silent_kill $pids
|
||||
silent_kill $pids
|
||||
|
||||
wait || t_fail "handle_fsetxattr failed"
|
||||
|
||||
|
||||
Reference in New Issue
Block a user