Add crash monitor to run-tests

Add a little background function that runs during the test which triggers a crash if it finds catastrophic failure conditions. This is the second bg task we want to kill and we can only have one function run on the EXIT trap, so we create a generic process killing trap function. We feed it the fenced pid as well. run-tests didn't log much of value into the fenced log, and we're not logging the kills into it anymore, so we just remove run-tests fenced logging. Signed-off-by: Zach Brown <zab@versity.com>
2026-01-10 13:47:27 +00:00 · 2025-11-06 11:54:40 -08:00
parent 23aaa994df
commit ae84271b37
1 changed files with 50 additions and 18 deletions
--- a/tests/run-tests.sh
+++ b/tests/run-tests.sh
@@ -39,6 +39,20 @@ cmd() {
 		die "cmd failed (check the run.log)"
 }

+# we can record pids to kill as we exit, we kill in reverse added order.
+# Walk the indices backwards; rev(1) would reverse each pid's digits too.
+declare -a atexit_kill_pids
+atexit_kill()
+{
+	local i
+	for ((i = ${#atexit_kill_pids[@]} - 1; i >= 0; i--)); do
+		# skip pids that have already exited
+		[ -e "/proc/${atexit_kill_pids[i]}/status" ] || continue
+		kill "${atexit_kill_pids[i]}"
+	done
+}
+trap atexit_kill EXIT
+
 show_help()
 {
 cat << EOF
@@ -451,26 +465,44 @@ EOF
 export SCOUTFS_FENCED_CONFIG_FILE="$conf"
 T_FENCED_LOG="$T_RESULTS/fenced.log"

-#
-# Run the agent in the background, log its output, an kill it if we
-# exit
-#
-fenced_log()
-{
-	echo "[$(timestamp)] $*" >> "$T_FENCED_LOG"
-}
-fenced_pid=""
-kill_fenced()
-{
-	if test -n "$fenced_pid" -a -d "/proc/$fenced_pid" ; then
-		fenced_log "killing fenced pid $fenced_pid"
-		kill "$fenced_pid"
-	fi
-}
-trap kill_fenced EXIT
 $T_UTILS/fenced/scoutfs-fenced > "$T_FENCED_LOG" 2>&1 &
 fenced_pid=$!
-fenced_log "started fenced pid $fenced_pid in the background"
+atexit_kill_pids+=($fenced_pid)
+
+#
+# some critical failures will cause fs operations to hang.  We can watch
+# for evidence of them and cause the system to crash, at least.  SIGKILL
+# skips run-tests' EXIT trap, so we return after killing it instead of
+# looping forever as an orphan that keeps signalling a stale pid.
+#
+crash_monitor()
+{
+	local bad=0
+
+	while sleep 1; do
+		if dmesg | grep -q "inserting extent.*overlaps existing"; then
+			echo "run-tests monitor saw overlapping extent message"
+			bad=1
+		fi
+		if dmesg | grep -q "error indicated by fence action" ; then
+			echo "run-tests monitor saw fence agent error message"
+			bad=1
+		fi
+		if [ ! -e "/proc/${fenced_pid}/status" ]; then
+			echo "run-tests monitor didn't see fenced pid $fenced_pid /proc dir"
+			bad=1
+		fi
+		if [ "$bad" != 0 ]; then
+			echo "run-tests monitor triggering crash"
+			echo c > /proc/sysrq-trigger
+			# bg function doesn't reload bash, $$ is parent run-tests.sh
+			kill -9 $$
+			return 1
+		fi
+	done
+}
+crash_monitor &
+atexit_kill_pids+=($!)

 # setup dm tables
 echo "0 $(blockdev --getsz $T_META_DEVICE) linear $T_META_DEVICE 0" > \