Compare commits

..

1 Commits

Author SHA1 Message Date
Auke Kok
68f7d1f2d0 Walk meta_seq for inodes to open_by_handle_at() = -ESTALE, and trace.
This walks the meta_seq from 0, or, a passed in number to the current
meta_seq, and polls further advances, until it approaches (but doesn't
reach) the current stable meta_seq. Inodes found in the index will
be tested to see if they return -ESTALE when open_by_handle_at().

If they do, further tests ensure the file exists and can
be resolved. If that happens to be the case, we ftrace the
open_by_handle_at() on the inode, and write out the tracefile.

The program can be ^C'd and will print out how far in the meta_seq
it got, so it can be resumed where it left off. If tracing happens
to be on while a TERM or INT signal is received, we immediately turn
off tracing again.

The program runs without producing needless terminal output. If traces
are generated, the file name is printed out. If the program terminates,
for whatever reason, it prints out how far it has advanced through
the meta_seq, and that information can be used to resume testing from
there on.

The program can be run in tailing (normal) mode, the default, and
continue to wait for new work to appear. Alternatively, the command line
option `-e` tells the program to stop execution once the current stable
meta_seq has been reached.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2025-05-21 10:00:35 -07:00
10 changed files with 518 additions and 96 deletions

View File

@@ -1,27 +1,6 @@
Versity ScoutFS Release Notes
=============================
---
v1.25
\
*Jun 3, 2025*
Fix a bug that could cause indefinite retries of failed client commits.
Under specific error conditions the client and server's understanding of
the current client commit could get out of sync. The client would retry
commits indefinitely that could never succeed. This manifested as
infinite "critical transaction commit failure" messages in the kernel
log on the client and matching "error <nr> committing client logs" on
the server.
Fix a bug in a specific case of server error handling that could result
in sending references to unwritten blocks to the client. The client
would try to read blocks that hadn't been written and return spurious
errors. This was seen under low free space conditions on the server and
resulted in error messages with error code 116 (The errno enum for
ESTALE, the client's indication that it couldn't read the blocks that it
expected.)
---
v1.24
\

View File

@@ -296,39 +296,28 @@ static s64 truncate_extents(struct super_block *sb, struct inode *inode,
* and offline blocks. If it's not provided then the inode is being
* destroyed and isn't reachable, we don't need to update it.
*
* If 'pause' is set, then we are destroying the inode and we should take
* breaks occasionally to allow other nodes access to this inode lock shard.
*
* The caller is in charge of locking the inode and data, but we may
* have to modify far more items than fit in a transaction so we're in
* charge of batching updates into transactions. If the inode is
* provided then we're responsible for updating its item as we go.
*/
int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode,
u64 ino, u64 *iblock, u64 last, bool offline,
struct scoutfs_lock *lock, bool pause)
u64 ino, u64 iblock, u64 last, bool offline,
struct scoutfs_lock *lock)
{
struct scoutfs_inode_info *si = NULL;
LIST_HEAD(ind_locks);
u64 cur_seq;
s64 ret = 0;
WARN_ON_ONCE(inode && !inode_is_locked(inode));
/*
* If the inode is provided, then we aren't destroying it. So it's not
* safe to pause while removing items- it needs to be done in one chunk.
*/
if (WARN_ON_ONCE(pause && inode))
return -EINVAL;
/* clamp last to the last possible block? */
if (last > SCOUTFS_BLOCK_SM_MAX)
last = SCOUTFS_BLOCK_SM_MAX;
trace_scoutfs_data_truncate_items(sb, *iblock, last, offline, pause);
trace_scoutfs_data_truncate_items(sb, iblock, last, offline);
if (WARN_ON_ONCE(last < *iblock))
if (WARN_ON_ONCE(last < iblock))
return -EINVAL;
if (inode) {
@@ -336,9 +325,7 @@ int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode,
down_write(&si->extent_sem);
}
cur_seq = scoutfs_trans_sample_seq(sb);
while (*iblock <= last) {
while (iblock <= last) {
if (inode)
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, true, false);
else
@@ -352,7 +339,7 @@ int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode,
ret = 0;
if (ret == 0)
ret = truncate_extents(sb, inode, ino, *iblock, last,
ret = truncate_extents(sb, inode, ino, iblock, last,
offline, lock);
if (inode)
@@ -364,19 +351,8 @@ int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode,
if (ret <= 0)
break;
*iblock = ret;
iblock = ret;
ret = 0;
/*
* We know there's more to do because truncate_extents()
* pauses every EXTENTS_PER_HOLD extents and it returned the
* next starting block. Our caller might also want us to pause,
* which we will do whenever we cross a transaction boundary.
*/
if (pause && (cur_seq != scoutfs_trans_sample_seq(sb))) {
ret = -EINPROGRESS;
break;
}
}
if (si)

View File

@@ -47,8 +47,8 @@ int scoutfs_get_block_write(struct inode *inode, sector_t iblock, struct buffer_
int create);
int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode,
u64 ino, u64 *iblock, u64 last, bool offline,
struct scoutfs_lock *lock, bool pause);
u64 ino, u64 iblock, u64 last, bool offline,
struct scoutfs_lock *lock);
int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len);
long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len);

View File

@@ -468,8 +468,8 @@ int scoutfs_complete_truncate(struct inode *inode, struct scoutfs_lock *lock)
start = (i_size_read(inode) + SCOUTFS_BLOCK_SM_SIZE - 1) >>
SCOUTFS_BLOCK_SM_SHIFT;
ret = scoutfs_data_truncate_items(inode->i_sb, inode,
scoutfs_ino(inode), &start, ~0ULL,
false, lock, false);
scoutfs_ino(inode), start, ~0ULL,
false, lock);
err = clear_truncate_flag(inode, lock);
return ret ? ret : err;
@@ -1635,8 +1635,7 @@ int scoutfs_inode_orphan_delete(struct super_block *sb, u64 ino, struct scoutfs_
* partial deletion until all deletion is complete and the orphan item
* is removed.
*/
static int delete_inode_items(struct super_block *sb, u64 ino,
struct scoutfs_inode *sinode, u64 *start,
static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_inode *sinode,
struct scoutfs_lock *lock, struct scoutfs_lock *orph_lock)
{
struct scoutfs_key key;
@@ -1655,8 +1654,8 @@ static int delete_inode_items(struct super_block *sb, u64 ino,
/* remove data items in their own transactions */
if (S_ISREG(mode)) {
ret = scoutfs_data_truncate_items(sb, NULL, ino, start, ~0ULL,
false, lock, true);
ret = scoutfs_data_truncate_items(sb, NULL, ino, 0, ~0ULL,
false, lock);
if (ret)
goto out;
}
@@ -1804,23 +1803,16 @@ out:
*/
static int try_delete_inode_items(struct super_block *sb, u64 ino)
{
struct inode_deletion_lock_data *ldata;
struct scoutfs_lock *orph_lock;
struct scoutfs_lock *lock;
struct inode_deletion_lock_data *ldata = NULL;
struct scoutfs_lock *orph_lock = NULL;
struct scoutfs_lock *lock = NULL;
struct scoutfs_inode sinode;
struct scoutfs_key key;
bool clear_trying = false;
bool more = false;
u64 group_nr;
u64 start = 0;
int bit_nr;
int ret;
again:
ldata = NULL;
orph_lock = NULL;
lock = NULL;
ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_WRITE, 0, ino, &lock);
if (ret < 0)
goto out;
@@ -1832,12 +1824,11 @@ again:
goto out;
/* only one local attempt per inode at a time */
if (!more && test_and_set_bit(bit_nr, ldata->trying)) {
if (test_and_set_bit(bit_nr, ldata->trying)) {
ret = 0;
goto out;
}
clear_trying = true;
more = false;
/* can't delete if it's cached in local or remote mounts */
if (scoutfs_omap_test(sb, ino) || test_bit_le(bit_nr, ldata->map.bits)) {
@@ -1862,15 +1853,7 @@ again:
if (ret < 0)
goto out;
ret = delete_inode_items(sb, ino, &sinode, &start, lock, orph_lock);
if (ret == -EINPROGRESS) {
more = true;
clear_trying = false;
} else {
more = false;
}
ret = delete_inode_items(sb, ino, &sinode, lock, orph_lock);
out:
if (clear_trying)
clear_bit(bit_nr, ldata->trying);
@@ -1878,9 +1861,6 @@ out:
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);
if (more)
goto again;
return ret;
}

View File

@@ -372,9 +372,9 @@ static long scoutfs_ioc_release(struct file *file, unsigned long arg)
sblock = args.offset >> SCOUTFS_BLOCK_SM_SHIFT;
eblock = (args.offset + args.length - 1) >> SCOUTFS_BLOCK_SM_SHIFT;
ret = scoutfs_data_truncate_items(sb, inode, scoutfs_ino(inode),
&sblock,
sblock,
eblock, true,
lock, false);
lock);
if (ret == 0) {
scoutfs_inode_get_onoff(inode, &online, &offline);
isize = i_size_read(inode);
@@ -383,8 +383,8 @@ static long scoutfs_ioc_release(struct file *file, unsigned long arg)
>> SCOUTFS_BLOCK_SM_SHIFT;
ret = scoutfs_data_truncate_items(sb, inode,
scoutfs_ino(inode),
&sblock, U64_MAX,
false, lock, false);
sblock, U64_MAX,
false, lock);
}
}

View File

@@ -378,16 +378,15 @@ DEFINE_EVENT(scoutfs_data_file_extent_class, scoutfs_data_fiemap_extent,
);
TRACE_EVENT(scoutfs_data_truncate_items,
TP_PROTO(struct super_block *sb, __u64 iblock, __u64 last, int offline, bool pause),
TP_PROTO(struct super_block *sb, __u64 iblock, __u64 last, int offline),
TP_ARGS(sb, iblock, last, offline, pause),
TP_ARGS(sb, iblock, last, offline),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, iblock)
__field(__u64, last)
__field(int, offline)
__field(bool, pause)
),
TP_fast_assign(
@@ -395,12 +394,10 @@ TRACE_EVENT(scoutfs_data_truncate_items,
__entry->iblock = iblock;
__entry->last = last;
__entry->offline = offline;
__entry->pause = pause;
),
TP_printk(SCSBF" iblock %llu last %llu offline %u pause %d",
SCSB_TRACE_ARGS, __entry->iblock, __entry->last,
__entry->offline, __entry->pause)
TP_printk(SCSBF" iblock %llu last %llu offline %u", SCSB_TRACE_ARGS,
__entry->iblock, __entry->last, __entry->offline)
);
TRACE_EVENT(scoutfs_data_wait_check,

View File

@@ -15,7 +15,8 @@ BIN := src/createmany \
src/o_tmpfile_umask \
src/o_tmpfile_linkat \
src/mmap_stress \
src/mmap_validate
src/mmap_validate \
src/walk_inodes_for_estale
DEPS := $(wildcard src/*.d)

View File

@@ -1,3 +1,4 @@
== setting longer hung task timeout
== creating fragmented extents
== unlink file with moved extents to free extents per block
== cleanup

View File

@@ -0,0 +1,464 @@
/*
* Copyright (C) 2025 Versity Software, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <limits.h>
#include <signal.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <linux/types.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include "ioctl.h"
#define array_size(arr) (sizeof(arr) / sizeof(arr[0]))
#define FILEID_SCOUTFS 0x81
#define FILEID_SCOUTFS_WITH_PARENT 0x82
/* last meta_seq entry visited; the walk resumes from here (settable via -s) */
static uint64_t meta_seq = 0;
/* set by handle_signal() on SIGINT/SIGTERM so the loops can stop */
static bool sig_received = false;
/* true while ftrace is recording, so the signal handler can turn it off */
static bool tracing_on = false;
/* -e: stop once the walk has reached the current stable meta_seq */
static bool exit_on_current = false;
/* set when the walk has caught up and exit_on_current was requested */
static bool exiting = false;
/* number of inodes poked via do_test_ino() — not currently printed */
static uint64_t count = 0;
/*
 * File handle layout passed to open_by_handle_at(): the generic
 * struct file_handle header followed by the scoutfs payload.
 */
struct our_handle {
	struct file_handle handle;
	/*
	 * scoutfs file handle can be ino or ino/parent. The
	 * handle_type field of struct file_handle denotes which
	 * version is in use. We only use the ino variant here.
	 */
	__le64 scoutfs_ino;
};
/* print the option summary and terminate with a failure status */
static void exit_usage(void)
{
	fputs(" -e exit once stable meta_seq has been reached\n"
	      " -m <string> scoutfs mount path string for seq walk\n"
	      " -s <number> start from meta_seq number, instead of 0\n",
	      stdout);
	exit(1);
}
/*
 * Write a short string into an existing file relative to a dirfd
 * (used for poking tracefs control files).
 *
 * Returns 0 on success or a positive errno value on failure.
 *
 * Fix: the original computed the write() error into ret but then
 * unconditionally returned 0, silently discarding write failures.
 */
static int write_at(int tracefd, char *path, char *val)
{
	size_t len = strlen(val);
	ssize_t ret;
	int err;
	int fd;

	fd = openat(tracefd, path, O_TRUNC | O_RDWR);
	if (fd < 0)
		return errno;

	ret = write(fd, val, len);
	if (ret < 0) {
		err = errno;
		close(fd);
		return err;
	}
	close(fd);

	/* control files take the whole value or nothing; flag short writes */
	if ((size_t)ret != len)
		return EIO;

	return 0;
}
/*
 * Re-run the failing open_by_handle_at() for 'ino' under the
 * function_graph ftrace tracer and, when it fails with ESTALE again,
 * copy the trace buffer into a local "trace.scoutfs..." file.
 *
 * Returns 0 on success (including "nothing to capture"), or a
 * positive errno value on failure.
 *
 * Fixes over the original: pidstr/name/buf and all fds were leaked on
 * the many early returns, the tracer was left set to function_graph
 * when the open unexpectedly succeeded, and a failure while starting
 * tracing was silently dropped. All exits now funnel through goto
 * cleanup labels.
 */
static int do_trace(int fd, uint64_t ino)
{
	struct our_handle handle;
	int tracefd = -1;
	int targetfd = -1;
	int outfd = -1;
	int infd = -1;
	char *pidstr = NULL;
	char *name = NULL;
	char *buf = NULL;
	ssize_t bytes;
	ssize_t written;
	ssize_t off = 0;
	unsigned long e = 0;
	int ret;

	if (asprintf(&pidstr, "%u", getpid()) < 0)
		return ENOMEM;

	if (asprintf(&name, "trace.scoutfs.open_by_handle_at.ino-%" PRIu64, ino) < 0) {
		ret = ENOMEM;
		goto out_free;
	}

	buf = malloc(4096);
	if (!buf) {
		ret = ENOMEM;
		goto out_free;
	}

	handle.handle.handle_bytes = sizeof(struct our_handle);
	handle.handle.handle_type = FILEID_SCOUTFS;
	handle.scoutfs_ino = htole64(ino);

	/* keep a quick dirfd around for easy writing sysfs files */
	tracefd = open("/sys/kernel/debug/tracing", 0);
	if (tracefd < 0) {
		ret = errno;
		goto out_free;
	}

	/* start tracing */
	ret = write_at(tracefd, "current_tracer", "nop") ?:
	      write_at(tracefd, "current_tracer", "function_graph") ?:
	      write_at(tracefd, "set_ftrace_pid", pidstr) ?:
	      write_at(tracefd, "tracing_on", "1");
	tracing_on = true;
	if (ret)
		goto out_off;

	targetfd = open_by_handle_at(fd, &handle.handle, O_RDWR);
	e = errno;

out_off:
	/* turn off tracing first, keeping any earlier error */
	ret = write_at(tracefd, "tracing_on", "0") ?: ret;
	tracing_on = false;
	if (ret)
		goto out_nop;

	if (targetfd != -1) {
		/* the open succeeded after all; nothing to capture */
		close(targetfd);
		goto out_nop;
	}

	if (e == ESTALE) {
		/* capture trace */
		outfd = open(name, O_CREAT | O_TRUNC | O_RDWR, 0644);
		if (outfd < 0) {
			ret = errno;
			fprintf(stderr, "Error opening trace\n");
			goto out_nop;
		}

		infd = openat(tracefd, "trace", O_RDONLY);
		if (infd < 0) {
			ret = errno;
			fprintf(stderr, "Error opening trace output\n");
			goto out_close;
		}

		for (;;) {
			bytes = pread(infd, buf, 4096, off);
			if (bytes < 0) {
				ret = errno;
				goto out_close;
			}
			if (bytes == 0)
				break;
			written = pwrite(outfd, buf, bytes, off);
			if (written < 0) {
				ret = errno;
				goto out_close;
			}
			if (written != bytes) {
				ret = EIO;
				goto out_close;
			}
			off += bytes;
		}

		fprintf(stderr, "Wrote \"%s\"\n", name);
	}

out_close:
	if (outfd >= 0)
		close(outfd);
	if (infd >= 0)
		close(infd);
out_nop:
	/* restore the default tracer, keeping the first error seen */
	ret = write_at(tracefd, "current_tracer", "nop") ?: ret;
out_free:
	if (tracefd >= 0)
		close(tracefd);
	free(pidstr);
	free(name);
	free(buf);
	return ret;
}
/*
 * lookup path for ino using ino_path
 */
/*
 * NOTE(review): this struct appears unused in this file — confirm no
 * other consumer exists before removing it.
 */
struct ino_args {
	char *path;
	__u64 ino;
};
/*
 * Resolve an inode number to a path (relative to the mount) with the
 * SCOUTFS_IOC_INO_PATH ioctl.
 *
 * On success *path is an allocated string the caller must free().
 * If the inode no longer resolves (ENOENT) *path is set to NULL and 0
 * is returned. Returns a positive errno value on failure.
 *
 * Fix: the result buffer was leaked on every ioctl-error path and on
 * asprintf() failure; errno is now saved before free() can clobber it.
 */
static int do_resolve(int fd, uint64_t ino, char **path)
{
	struct scoutfs_ioctl_ino_path ioctl_args = {0};
	struct scoutfs_ioctl_ino_path_result *res;
	unsigned int result_bytes;
	int err;
	int ret;

	result_bytes = offsetof(struct scoutfs_ioctl_ino_path_result,
				path[PATH_MAX]);
	res = malloc(result_bytes);
	if (!res)
		return ENOMEM;

	ioctl_args.ino = ino;
	ioctl_args.dir_ino = 0;
	ioctl_args.dir_pos = 0;
	ioctl_args.result_ptr = (intptr_t)res;
	ioctl_args.result_bytes = result_bytes;

	ret = ioctl(fd, SCOUTFS_IOC_INO_PATH, &ioctl_args);
	if (ret < 0) {
		err = errno;	/* save before free() can change it */
		free(res);
		if (err == ENOENT) {
			*path = NULL;
			return 0;
		}
		return err;
	}

	ret = asprintf(path, "%.*s", res->path_bytes, res->path);
	free(res);
	if (ret <= 0) {
		/* asprintf leaves *path undefined on failure */
		*path = NULL;
		return ENOMEM;
	}

	return 0;
}
/*
 * Apply the filter chain to one inode: it is only worth tracing when
 * open_by_handle_at() fails with ESTALE while the inode still resolves
 * to a path that stats successfully. Matching inodes are handed to
 * do_trace().
 *
 * Returns 0 when the inode is filtered out or traced, or a positive
 * errno value on failure.
 *
 * Fix: a do_resolve() error was silently swallowed — the original's
 * 'if (ret)' check was dead because the 'path == NULL' check preceded
 * it. Errors are now propagated (and any path freed first).
 */
static int do_test_ino(int fd, uint64_t ino)
{
	struct our_handle handle = {{0}};
	struct stat sb = {0};
	char *path = NULL;
	int targetfd = -1;
	int ret;

	/* filter: open_by_handle_at() must fail */
	handle.handle.handle_bytes = sizeof(struct our_handle);
	handle.handle.handle_type = FILEID_SCOUTFS;
	handle.scoutfs_ino = htole64(ino);

	targetfd = open_by_handle_at(fd, &handle.handle, O_RDWR);
	if (targetfd != -1) {
		close(targetfd);
		return 0;
	}

	/* filter: errno must be ESTALE */
	if (errno != ESTALE)
		return 0;

	/* filter: path resolution succeeds to an actual file entry */
	ret = do_resolve(fd, ino, &path);
	if (ret) {
		free(path);
		return ret;
	}
	if (path == NULL)
		return 0;

	/* filter: stat() must succeed on resolved path */
	ret = fstatat(fd, path, &sb, AT_SYMLINK_NOFOLLOW);
	free(path);
	if (ret != 0) {
		if (errno == ENOENT)
			/* doesn't exist */
			return 0;
		return errno;
	}

	return do_trace(fd, ino);
}
/*
 * Return the mount's current stable meta_seq, or 0 on failure.
 *
 * Fix: the original returned errno on ioctl failure, which the caller
 * would then treat as a (tiny) valid sequence number. Returning 0
 * makes the caller back off and retry instead.
 */
static uint64_t do_get_meta_seq_stable(int fd)
{
	struct scoutfs_ioctl_stat_more stm;

	if (ioctl(fd, SCOUTFS_IOC_STAT_MORE, &stm) < 0)
		return 0;

	return stm.meta_seq;
}
/*
 * Walk the meta_seq inode index from the current resume point up to
 * (but not including) the stable meta_seq, poking each inode through
 * do_test_ino(). Advances the global meta_seq as it goes so the walk
 * can be resumed. Yields back to the caller after roughly two seconds
 * so signals and unmounts are noticed.
 *
 * Returns 0 on success/yield, or a positive errno value on failure.
 *
 * Fixes: the per-inode error check was 'ret < 0', which never fired
 * because the helpers return positive errno values — all test errors
 * were silently ignored. The ioctl failure path also returned -1
 * instead of an errno value.
 */
static int do_walk_seq(int fd)
{
	struct scoutfs_ioctl_walk_inodes_entry ents[128];
	struct scoutfs_ioctl_walk_inodes walk = {{0}};
	struct timespec ts;
	time_t seconds;
	int ret;
	uint64_t total = 0;
	uint64_t stable;
	int i;
	int j;

	walk.index = SCOUTFS_IOC_WALK_INODES_META_SEQ;

	/* make sure not to advance to stable meta_seq, we can just trail behind */
	stable = do_get_meta_seq_stable(fd);
	if (stable == 0)
		return 0;
	if (meta_seq >= stable - 1) {
		if (exit_on_current)
			exiting = true;
		return 0;
	}

	/* resume one past the last finished seq, or from 0 on a fresh start */
	meta_seq = meta_seq ? meta_seq + 1 : 0;

	walk.first.major = meta_seq;
	walk.first.minor = 0;
	walk.first.ino = 0;
	walk.last.major = stable - 1;
	walk.last.minor = ~0;
	walk.last.ino = ~0ULL;
	walk.entries_ptr = (unsigned long)ents;
	walk.nr_entries = array_size(ents);

	clock_gettime(CLOCK_REALTIME, &ts);
	seconds = ts.tv_sec;

	for (j = 0;; j++) {
		if (sig_received)
			return 0;

		ret = ioctl(fd, SCOUTFS_IOC_WALK_INODES, &walk);
		if (ret < 0)
			return errno;	/* callers expect positive errno values */
		if (ret == 0)
			break;

		for (i = 0; i < ret; i++) {
			meta_seq = ents[i].major;
			/* skip the root inode */
			if (ents[i].ino == 1)
				continue;

			/* poke at it */
			ret = do_test_ino(fd, ents[i].ino);
			count++;
			if (ret)	/* helpers return positive errno values */
				return ret;
		}
		total += i;

		/* continue the walk just past the last entry we saw */
		walk.first = ents[i - 1];
		if (++walk.first.ino == 0 && ++walk.first.minor == 0)
			walk.first.major++;

		/* yield once in a while */
		if (j % 32 == 0) {
			clock_gettime(CLOCK_REALTIME, &ts);
			if (ts.tv_sec > seconds + 1)
				break;
		}
	}

	return 0;
}
/*
 * SIGINT/SIGTERM handler: flag the walk loops to stop and, if a trace
 * is currently recording, turn tracing off immediately so the buffer
 * isn't polluted. open/write/close are async-signal-safe.
 *
 * Fix: the original passed tracefd to write_at()/close() without
 * checking that the open() succeeded.
 */
void handle_signal(int sig)
{
	int tracefd;

	(void)sig;

	sig_received = true;

	if (!tracing_on)
		return;

	tracefd = open("/sys/kernel/debug/tracing", 0);
	if (tracefd < 0)
		return;
	write_at(tracefd, "tracing_on", "0");
	close(tracefd);
}
int main(int argc, char **argv)
{
char *mnt = NULL;
char c;
int mntfd;
int ret;
meta_seq = 0;
/* All we need is the mount point arg */
while ((c = getopt(argc, argv, "+em:s:")) != -1) {
switch (c) {
case 'e':
exit_on_current = true;
break;
case 'm':
mnt = strdup(optarg);
break;
case 's':
meta_seq = strtoull(optarg, NULL, 0);
break;
case '?':
printf("unknown argument: %c\n", optind);
case 'h':
exit_usage();
}
}
if (!mnt) {
fprintf(stderr, "Must provide a mount point with -m\n");
exit(EXIT_FAILURE);
}
if (meta_seq > 0)
fprintf(stdout, "Starting from meta_seq = %lu\n", meta_seq);
/* lower prio */
ret = nice(10);
if (ret == -1)
fprintf(stderr, "Error setting nice value\n");
ret = syscall(SYS_ioprio_set, 1, 0, 0); /* IOPRIO_WHO_PROCESS = 1, IOPRIO_PRIO_CLASS(IOPRIO_CLASS_IDLE) = 0 */
if (ret == -1)
fprintf(stderr, "Error setting ioprio value\n");
signal(SIGINT, handle_signal);
signal(SIGTERM, handle_signal);
for (;;) {
if (sig_received)
break;
mntfd = open(mnt, O_RDONLY);
if (mntfd == -1) {
perror("open(mntfd)");
exit(EXIT_FAILURE);
}
ret = do_walk_seq(mntfd);
/* handle unmounts? EAGAIN? */
if (ret)
break;
close(mntfd);
if (exiting)
break;
/* yield */
if (!sig_received)
sleep(5);
}
free(mnt);
fprintf(stdout, "Last meta_seq = %lu\n", meta_seq);
if (ret)
fprintf(stderr, "Error walking inodes: %s(%d)\n", strerror(errno), ret);
exit(ret);
}

View File

@@ -10,6 +10,30 @@ EXTENTS_PER_BTREE_BLOCK=600
EXTENTS_PER_LIST_BLOCK=8192
FREED_EXTENTS=$((EXTENTS_PER_BTREE_BLOCK * EXTENTS_PER_LIST_BLOCK))
#
# This test specifically creates a pathologically sparse file that will
# be as expensive as possible to free. This is usually fine on
# dedicated or reasonable hardware, but trying to run this in
# virtualized debug kernels can take a very long time. This test is
# about making sure that the server doesn't fail, not that the platform
# can handle the scale of work that our btree formats happen to require
# while execution is bogged down with use-after-free memory reference
# tracking. So we give the test a lot more breathing room before
deciding that it's hung.
#
echo "== setting longer hung task timeout"
if [ -w /proc/sys/kernel/hung_task_timeout_secs ]; then
secs=$(cat /proc/sys/kernel/hung_task_timeout_secs)
test "$secs" -gt 0 || \
t_fail "confusing value '$secs' from /proc/sys/kernel/hung_task_timeout_secs"
restore_hung_task_timeout()
{
echo "$secs" > /proc/sys/kernel/hung_task_timeout_secs
}
trap restore_hung_task_timeout EXIT
echo "$((secs * 5))" > /proc/sys/kernel/hung_task_timeout_secs
fi
echo "== creating fragmented extents"
fragmented_data_extents $FREED_EXTENTS $EXTENTS_PER_BTREE_BLOCK "$T_D0/alloc" "$T_D0/move"