From 26ae9c6e049957d12cf09479a9eb7bca0598c28f Mon Sep 17 00:00:00 2001
From: Zach Brown <zab@versity.com>
Date: Fri, 25 Mar 2022 10:04:28 -0700
Subject: [PATCH] Verify local unmount testing fence script

The fence script we use for our single node multi-mount tests only knows
how to fence by using forced unmount to destroy a mount.  As of now, the
tests only generate failing nodes that need to be fenced by using forced
unmount as well.  This results in the awkward situation where the
testing fence script doesn't have anything to do because the mount is
already gone.

When the test fence script has nothing to do we might not notice if it
isn't run.  This adds explicit verification to the fencing tests that
the script was really run.  It adds per-invocation logging to the fence
script and the test makes sure that it was run.

While we're at it, we take the opportunity to tidy up some of the
scripting around this.  We use a sysfs file with the data device
major:minor numbers so that the fencing script can find and unmount
mounts without having to ask the mounts themselves for their rid, which
could fail because a mount that needs fencing may not be operational.

Signed-off-by: Zach Brown <zab@versity.com>
---
 tests/fenced-local-force-unmount.sh | 50 +++++++++++++++++------------
 tests/run-tests.sh                  |  9 +++---
 tests/tests/fence-and-reclaim.sh    | 23 +++++++++++++
 utils/fenced/scoutfs-fenced         | 21 +++++++++---
 4 files changed, 74 insertions(+), 29 deletions(-)

diff --git a/tests/fenced-local-force-unmount.sh b/tests/fenced-local-force-unmount.sh
index 9d97a79b..f5553be1 100755
--- a/tests/fenced-local-force-unmount.sh
+++ b/tests/fenced-local-force-unmount.sh
@@ -1,5 +1,18 @@
 #!/usr/bin/bash
 
+#
+# This fencing script is used for testing clusters of multiple mounts on
+# a single host.  It finds mounts to fence by looking for their rids and
+# only knows how to "fence" by using forced unmount.
+#
+
+echo "$0 running rid '$SCOUTFS_FENCED_REQ_RID' ip '$SCOUTFS_FENCED_REQ_IP' args '$*'"
+
+# log to stderr without exiting; >&2 avoids reopening (truncating) a log file
+log() {
+	echo "$@" >&2
+}
+
 echo_fail() {
 	echo "$@" > /dev/stderr
 	exit 1
@@ -7,29 +20,24 @@ echo_fail() {
 
 rid="$SCOUTFS_FENCED_REQ_RID"
 
-#
-# Look for a local mount with the rid to fence.  Typically we'll at
-# least find the mount with the server that requested the fence that
-# we're processing.   But it's possible that mounts are unmounted
-# before, or while, we're running.
-#
-mnts=$(findmnt -l -n -t scoutfs -o TARGET) || \
-	echo_fail "findmnt -t scoutfs failed" > /dev/stderr
+for fs in /sys/fs/scoutfs/*; do
+	[ ! -d "$fs" ] && continue  # unmatched glob or not a directory
 
-for mnt in $mnts; do
-	mnt_rid=$(scoutfs statfs -p "$mnt" -s rid) || \
-		echo_fail "scoutfs statfs $mnt failed"
-
-	if [ "$mnt_rid" == "$rid" ]; then
-		umount -f "$mnt" || \
-			echo_fail "umout -f $mnt"
-
-		exit 0
+	fs_rid="$(cat "$fs/rid")" || \
+		echo_fail "failed to get rid in $fs"
+	if [ "$fs_rid" != "$rid" ]; then
+		continue
 	fi
+	# rid matches: find mounts by data device maj:min and force unmount them
+	nr="$(cat "$fs/data_device_maj_min")" || \
+		echo_fail "failed to get data device major:minor in $fs"
+
+	mnts=$(findmnt -l -n -t scoutfs -o TARGET -S "$nr") || \
+		echo_fail "findmnt -t scoutfs -S $nr failed"
+	for mnt in $mnts; do
+		umount -f "$mnt" || \
+			echo_fail "umount -f $mnt failed"
+	done
 done
 
-#
-# If the mount doesn't exist on this host then it can't access the
-# devices by definition and can be considered fenced.
-#
 exit 0
diff --git a/tests/run-tests.sh b/tests/run-tests.sh
index 5f826474..a8aa02af 100755
--- a/tests/run-tests.sh
+++ b/tests/run-tests.sh
@@ -380,13 +380,14 @@ cmd grep .  /sys/kernel/debug/tracing/options/trace_printk \
 # Build a fenced config that runs scripts out of the repository rather
 # than the default system directory
 #
-conf="$T_RESULTS/scoutfs-fencd.conf"
+conf="$T_RESULTS/scoutfs-fenced.conf"
 cat > $conf << EOF
 SCOUTFS_FENCED_DELAY=1
 SCOUTFS_FENCED_RUN=$T_TESTS/fenced-local-force-unmount.sh
-SCOUTFS_FENCED_RUN_ARGS=""
+SCOUTFS_FENCED_RUN_ARGS="ignored run args"
 EOF
 export SCOUTFS_FENCED_CONFIG_FILE="$conf"
+T_FENCED_LOG="$T_RESULTS/fenced.log"  # scoutfs-fenced output and fenced_log messages go here
 
 #
 # Run the agent in the background, log its output, an kill it if we
@@ -394,7 +395,7 @@ export SCOUTFS_FENCED_CONFIG_FILE="$conf"
 #
 fenced_log()
 {
-	echo "[$(timestamp)] $*" >> "$T_RESULTS/fenced.stdout.log"
+	echo "[$(timestamp)] $*" >> "$T_FENCED_LOG"  # append a timestamp-prefixed line to the fenced log
 }
 fenced_pid=""
 kill_fenced()
@@ -405,7 +406,7 @@ kill_fenced()
 	fi
 }
 trap kill_fenced EXIT
-$T_UTILS/fenced/scoutfs-fenced > "$T_RESULTS/fenced.stdout.log" 2> "$T_RESULTS/fenced.stderr.log" &
+: > "$T_FENCED_LOG"; $T_UTILS/fenced/scoutfs-fenced >> "$T_FENCED_LOG" 2>&1 &  # truncate once, then append so fenced_log lines aren't overwritten
 fenced_pid=$!
 fenced_log "started fenced pid $fenced_pid in the background"
 
diff --git a/tests/tests/fence-and-reclaim.sh b/tests/tests/fence-and-reclaim.sh
index 1ce52048..1fe1ad2e 100644
--- a/tests/tests/fence-and-reclaim.sh
+++ b/tests/tests/fence-and-reclaim.sh
@@ -45,6 +45,18 @@ check_read_write()
 	fi
 }
 
+# Verify that fenced ran our testing fence script for every rid argument.
+# Each run logs a "running rid" line to T_FENCED_LOG; t_fail if one is missing.
+verify_fenced_run()
+{
+	local rid
+
+	for rid in "$@"; do
+		grep -q ".* running rid '$rid'.* args 'ignored run args'" "$T_FENCED_LOG" || \
+			t_fail "fenced didn't execute RUN script for rid $rid"
+	done
+}
+
 echo "== make sure all mounts can see each other"
 check_read_write
 
@@ -62,12 +74,14 @@ done
 while t_rid_is_fencing $rid; do
 	sleep .5
 done
+verify_fenced_run $rid
 t_mount $cl
 check_read_write
 
 echo "== force unmount all non-server, connection timeout, fence nop, mount"
 sv=$(t_server_nr)
 pattern="nonsense"
+rids=""
 sync
 for cl in $(t_fs_nrs); do
 	if [ $cl == $sv ]; then
@@ -75,6 +89,7 @@ for cl in $(t_fs_nrs); do
 	fi
 
 	rid=$(t_mount_rid $cl)
+	rids="$rids $rid"
 	pattern="$pattern|$rid"
 	echo "cl $cl sv $sv rid $rid" >> "$T_TMP.log"
 
@@ -89,6 +104,7 @@ done
 while test -d $(echo /sys/fs/scoutfs/*/fence/* | cut -d " " -f 1); do
 	sleep .5
 done
+verify_fenced_run $rids
 # remount all the clients
 for cl in $(t_fs_nrs); do
 	if [ $cl == $sv ]; then
@@ -109,11 +125,17 @@ t_wait_for_leader
 while t_rid_is_fencing $rid; do
 	sleep .5
 done
+verify_fenced_run $rid
 t_mount $sv
 check_read_write
 
 echo "== force unmount everything, new server fences all previous"
 sync
+rids=""
+# get rids before forced unmount breaks scoutfs statfs
+for nr in $(t_fs_nrs); do
+	rids="$rids $(t_mount_rid $nr)"
+done
 for nr in $(t_fs_nrs); do
 	t_force_umount $nr
 done
@@ -122,6 +144,7 @@ t_mount_all
 while test -d $(echo /sys/fs/scoutfs/*/fence/* | cut -d " " -f 1); do
 	sleep .5
 done
+verify_fenced_run $rids
 check_read_write
 
 t_pass
diff --git a/utils/fenced/scoutfs-fenced b/utils/fenced/scoutfs-fenced
index 6e53099a..fa866e25 100755
--- a/utils/fenced/scoutfs-fenced
+++ b/utils/fenced/scoutfs-fenced
@@ -55,9 +55,21 @@ test -x "$SCOUTFS_FENCED_RUN" || \
 	error_exit "SCOUTFS_FENCED_RUN '$SCOUTFS_FENCED_RUN' isn't executable"
 
 #
-# main loop watching for fence request across all filesystems 
+# Main loop watching for fence requests across all filesystems.  The
+# server can shut down without waiting for pending fence requests to
+# finish, so any interaction with the fence directory and files can
+# fail at any moment.  We will generate log messages when the dir or
+# files disappear.
 #
 
+# Cat the given file(s) for the caller.  If cat fails its error message
+# goes to stderr and we echo 0 so the caller still has a value to test.
+careful_cat()
+{
+	# pass the arguments straight through, each as its own word
+	cat "$@" || echo 0
+}
+
 while sleep $SCOUTFS_FENCED_DELAY; do
 	for fence in /sys/fs/scoutfs/*/fence/*; do
 		# catches unmatched regex when no dirs
@@ -66,7 +78,8 @@ while sleep $SCOUTFS_FENCED_DELAY; do
 		fi
 
 		# skip requests that have been handled
-		if [ $(cat "$fence/fenced") == 1 -o $(cat "$fence/error") == 1 ]; then
+		if [ "$(careful_cat "$fence/fenced")" == 1 ] || \
+		   [ "$(careful_cat "$fence/error")" == 1 ]; then
 			continue
 		fi
 
@@ -81,10 +94,10 @@ while sleep $SCOUTFS_FENCED_DELAY; do
 		export SCOUTFS_FENCED_REQ_RID="$rid"
 		export SCOUTFS_FENCED_REQ_IP="$ip"
 
-		$run $SCOUTFS_FENCED_RUN_ARGS
+		"$SCOUTFS_FENCED_RUN" $SCOUTFS_FENCED_RUN_ARGS  # args stay unquoted so they split into words
 		rc=$?
 		if [ "$rc" != 0 ]; then
-			log_message "server $srv fencing rid $rid saw error status $rc from $run"
+			log_message "server $srv fencing rid $rid saw error status $rc"
 			echo 1 > "$fence/error"
 			continue
 		fi