Add crash monitor to run-tests

Add a little background function that runs during the test which
triggers a crash if it finds catastrophic failure conditions.

This is the second bg task we want to kill and we can only have one
function run on the EXIT trap, so we create a generic process killing
trap function.

We feed it the fenced pid as well.  run-tests didn't log much of value
into the fenced log, and we're not logging the kills into anymore, so we
just remove run-tests fenced logging.

Signed-off-by: Zach Brown <zab@versity.com>
This commit is contained in:
Zach Brown
2025-11-06 11:54:40 -08:00
parent 23aaa994df
commit ae84271b37

View File

@@ -39,6 +39,20 @@ cmd() {
die "cmd failed (check the run.log)"
}
# we can record pids to kill as we exit, we kill in reverse added order
declare -a atexit_kill_pids
atexit_kill()
{
local pid
for pid in $(echo ${atexit_kill_pids[*]} | rev); do
if test -e "/proc/$pid/status" ; then
kill "$pid"
fi
done
}
trap atexit_kill EXIT
show_help()
{
cat << EOF
@@ -451,26 +465,44 @@ EOF
export SCOUTFS_FENCED_CONFIG_FILE="$conf"
T_FENCED_LOG="$T_RESULTS/fenced.log"
#
# Run the agent in the background, log its output, an kill it if we
# exit
#
fenced_log()
{
echo "[$(timestamp)] $*" >> "$T_FENCED_LOG"
}
fenced_pid=""
kill_fenced()
{
if test -n "$fenced_pid" -a -d "/proc/$fenced_pid" ; then
fenced_log "killing fenced pid $fenced_pid"
kill "$fenced_pid"
fi
}
trap kill_fenced EXIT
$T_UTILS/fenced/scoutfs-fenced > "$T_FENCED_LOG" 2>&1 &
fenced_pid=$!
fenced_log "started fenced pid $fenced_pid in the background"
atexit_kill_pids+=($fenced_pid)
#
# some critical failures will cause fs operations to hang. We can watch
# for evidence of them and cause the system to crash, at least.
#
crash_monitor()
{
local bad=0
while sleep 1; do
if dmesg | grep -q "inserting extent.*overlaps existing"; then
echo "run-tests monitor saw overlapping extent message"
bad=1
fi
if dmesg | grep -q "error indicated by fence action" ; then
echo "run-tests monitor saw fence agent error message"
bad=1
fi
if [ ! -e "/proc/${fenced_pid}/status" ]; then
echo "run-tests monitor didn't see fenced pid $fenced_pid /proc dir"
bad=1
fi
if [ "$bad" != 0 ]; then
echo "run-tests monitor triggering crash"
echo c > /proc/sysrq-trigger
# bg function doesn't reload bash, $$ is parent run-tests.sh
kill -9 $$
fi
done
}
crash_monitor &
atexit_kill_pids+=($!)
# setup dm tables
echo "0 $(blockdev --getsz $T_META_DEVICE) linear $T_META_DEVICE 0" > \