From a077104531f12f33fd13fcd877f68a722db889f8 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Thu, 6 Nov 2025 11:54:40 -0800 Subject: [PATCH] Add crash monitor to run-tests Add a little background function that runs during the test which triggers a crash if it finds catastrophic failure conditions. This is the second bg task we want to kill and we can only have one function run on the EXIT trap, so we create a generic process killing trap function. We feed it the fenced pid as well. run-tests didn't log much of value into the fenced log, and we're not logging the kills into anymore, so we just remove run-tests fenced logging. Signed-off-by: Zach Brown --- tests/run-tests.sh | 77 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 59 insertions(+), 18 deletions(-) diff --git a/tests/run-tests.sh b/tests/run-tests.sh index cbfad39e..f7e6d354 100755 --- a/tests/run-tests.sh +++ b/tests/run-tests.sh @@ -438,6 +438,30 @@ cmd grep . /sys/kernel/debug/tracing/options/trace_printk \ /sys/kernel/debug/tracing/buffer_size_kb \ /proc/sys/kernel/ftrace_dump_on_oops +# we can record pids to kill as we exit, we kill in reverse added order +atexit_kill_pids="" +add_atexit_kill_pid() +{ + atexit_kill_pids="$1 $atexit_kill_pids" +} +atexit_kill() +{ + local pid + + # suppress bg function exited messages + exec {ERR}>&2 2>/dev/null + + for pid in $atexit_kill_pids; do + if test -e "/proc/$pid/status" ; then + kill "$pid" + wait "$pid" + fi + done + + exec 2>&$ERR {ERR}>&- +} +trap atexit_kill EXIT + # # Build a fenced config that runs scripts out of the repository rather # than the default system directory @@ -451,26 +475,43 @@ EOF export SCOUTFS_FENCED_CONFIG_FILE="$conf" T_FENCED_LOG="$T_RESULTS/fenced.log" -# -# Run the agent in the background, log its output, an kill it if we -# exit -# -fenced_log() -{ - echo "[$(timestamp)] $*" >> "$T_FENCED_LOG" -} -fenced_pid="" -kill_fenced() -{ - if test -n "$fenced_pid" -a -d "/proc/$fenced_pid" ; then - fenced_log "killing fenced pid $fenced_pid" - kill "$fenced_pid" - fi -} -trap kill_fenced EXIT $T_UTILS/fenced/scoutfs-fenced > "$T_FENCED_LOG" 2>&1 & fenced_pid=$! -fenced_log "started fenced pid $fenced_pid in the background" +add_atexit_kill_pid $fenced_pid + +# +# some critical failures will cause fs operations to hang. We can watch +# for evidence of them and cause the system to crash, at least. +# +crash_monitor() +{ + local bad=0 + + while sleep 1; do + if dmesg | grep -q "inserting extent.*overlaps existing"; then + echo "run-tests monitor saw overlapping extent message" + bad=1 + fi + + if dmesg | grep -q "error indicated by fence action" ; then + echo "run-tests monitor saw fence agent error message" + bad=1 + fi + + if [ ! -e "/proc/${fenced_pid}/status" ]; then + echo "run-tests monitor didn't see fenced pid $fenced_pid /proc dir" + bad=1 + fi + + if [ "$bad" != 0 ]; then + echo "run-tests monitor triggering crash" + echo c > /proc/sysrq-trigger + exit 1 + fi + done +} +crash_monitor & +add_atexit_kill_pid $! # setup dm tables echo "0 $(blockdev --getsz $T_META_DEVICE) linear $T_META_DEVICE 0" > \