From 26ae9c6e049957d12cf09479a9eb7bca0598c28f Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Fri, 25 Mar 2022 10:04:28 -0700 Subject: [PATCH] Verify local unmount testing fence script The fence script we use for our single node multi-mount tests only knows how to fence by using forced unmount to destroy a mount. As of now, the tests only generate failing nodes that need to be fenced by using forced unmount as well. This results in the awkward situation where the testing fence script doesn't have anything to do because the mount is already gone. When the test fence script has nothing to do we might not notice if it isn't run. This adds explicit verification to the fencing tests that the script was really run. It adds per-invocation logging to the fence script and the test makes sure that it was run. While we're at it, we take the opportunity to tidy up some of the scripting around this. We use a sysfs file with the data device major:minor numbers so that the fencing script can find and unmount mounts without having to ask them for their rid. They may not be operational. Signed-off-by: Zach Brown --- tests/fenced-local-force-unmount.sh | 50 +++++++++++++++++------------ tests/run-tests.sh | 9 +++--- tests/tests/fence-and-reclaim.sh | 23 +++++++++++++ utils/fenced/scoutfs-fenced | 21 +++++++++--- 4 files changed, 74 insertions(+), 29 deletions(-) diff --git a/tests/fenced-local-force-unmount.sh b/tests/fenced-local-force-unmount.sh index 9d97a79b..f5553be1 100755 --- a/tests/fenced-local-force-unmount.sh +++ b/tests/fenced-local-force-unmount.sh @@ -1,5 +1,18 @@ #!/usr/bin/bash +# +# This fencing script is used for testing clusters of multiple mounts on +# a single host. It finds mounts to fence by looking for their rids and +# only knows how to "fence" by using forced unmount. 
+# + +echo "$0 running rid '$SCOUTFS_FENCED_REQ_RID' ip '$SCOUTFS_FENCED_REQ_IP' args '$@'" + +log() { + echo "$@" > /dev/stderr + exit 1 +} + echo_fail() { echo "$@" > /dev/stderr exit 1 @@ -7,29 +20,24 @@ echo_fail() { rid="$SCOUTFS_FENCED_REQ_RID" -# -# Look for a local mount with the rid to fence. Typically we'll at -# least find the mount with the server that requested the fence that -# we're processing. But it's possible that mounts are unmounted -# before, or while, we're running. -# -mnts=$(findmnt -l -n -t scoutfs -o TARGET) || \ - echo_fail "findmnt -t scoutfs failed" > /dev/stderr +for fs in /sys/fs/scoutfs/*; do + [ ! -d "$fs" ] && continue -for mnt in $mnts; do - mnt_rid=$(scoutfs statfs -p "$mnt" -s rid) || \ - echo_fail "scoutfs statfs $mnt failed" - - if [ "$mnt_rid" == "$rid" ]; then - umount -f "$mnt" || \ - echo_fail "umout -f $mnt" - - exit 0 + fs_rid="$(cat $fs/rid)" || \ + echo_fail "failed to get rid in $fs" + if [ "$fs_rid" != "$rid" ]; then + continue fi + + nr="$(cat $fs/data_device_maj_min)" || \ + echo_fail "failed to get data device major:minor in $fs" + + mnts=$(findmnt -l -n -t scoutfs -o TARGET -S $nr) || \ + echo_fail "findmnt -t scoutfs -S $nr failed" + for mnt in $mnts; do + umount -f "$mnt" || \ + echo_fail "umount -f $mnt failed" + done done -# -# If the mount doesn't exist on this host then it can't access the -# devices by definition and can be considered fenced. -# exit 0 diff --git a/tests/run-tests.sh b/tests/run-tests.sh index 5f826474..a8aa02af 100755 --- a/tests/run-tests.sh +++ b/tests/run-tests.sh @@ -380,13 +380,14 @@ cmd grep . 
/sys/kernel/debug/tracing/options/trace_printk \ # Build a fenced config that runs scripts out of the repository rather # than the default system directory # -conf="$T_RESULTS/scoutfs-fencd.conf" +conf="$T_RESULTS/scoutfs-fenced.conf" cat > $conf << EOF SCOUTFS_FENCED_DELAY=1 SCOUTFS_FENCED_RUN=$T_TESTS/fenced-local-force-unmount.sh -SCOUTFS_FENCED_RUN_ARGS="" +SCOUTFS_FENCED_RUN_ARGS="ignored run args" EOF export SCOUTFS_FENCED_CONFIG_FILE="$conf" +T_FENCED_LOG="$T_RESULTS/fenced.log" # # Run the agent in the background, log its output, an kill it if we @@ -394,7 +395,7 @@ export SCOUTFS_FENCED_CONFIG_FILE="$conf" # fenced_log() { - echo "[$(timestamp)] $*" >> "$T_RESULTS/fenced.stdout.log" + echo "[$(timestamp)] $*" >> "$T_FENCED_LOG" } fenced_pid="" kill_fenced() @@ -405,7 +406,7 @@ kill_fenced() fi } trap kill_fenced EXIT -$T_UTILS/fenced/scoutfs-fenced > "$T_RESULTS/fenced.stdout.log" 2> "$T_RESULTS/fenced.stderr.log" & +$T_UTILS/fenced/scoutfs-fenced > "$T_FENCED_LOG" 2>&1 & fenced_pid=$! 
fenced_log "started fenced pid $fenced_pid in the background" diff --git a/tests/tests/fence-and-reclaim.sh b/tests/tests/fence-and-reclaim.sh index 1ce52048..1fe1ad2e 100644 --- a/tests/tests/fence-and-reclaim.sh +++ b/tests/tests/fence-and-reclaim.sh @@ -45,6 +45,18 @@ check_read_write() fi } +# verify that fenced ran our testing fence script +verify_fenced_run() +{ + local rids="$@" + local rid + + for rid in $rids; do + grep -q ".* running rid '$rid'.* args 'ignored run args'" "$T_FENCED_LOG" || \ + t_fail "fenced didn't execute RUN script for rid $rid" + done +} + echo "== make sure all mounts can see each other" check_read_write @@ -62,12 +74,14 @@ done while t_rid_is_fencing $rid; do sleep .5 done +verify_fenced_run $rid t_mount $cl check_read_write echo "== force unmount all non-server, connection timeout, fence nop, mount" sv=$(t_server_nr) pattern="nonsense" +rids="" sync for cl in $(t_fs_nrs); do if [ $cl == $sv ]; then @@ -75,6 +89,7 @@ for cl in $(t_fs_nrs); do fi rid=$(t_mount_rid $cl) + rids="$rids $rid" pattern="$pattern|$rid" echo "cl $cl sv $sv rid $rid" >> "$T_TMP.log" @@ -89,6 +104,7 @@ done while test -d $(echo /sys/fs/scoutfs/*/fence/* | cut -d " " -f 1); do sleep .5 done +verify_fenced_run $rids # remount all the clients for cl in $(t_fs_nrs); do if [ $cl == $sv ]; then @@ -109,11 +125,17 @@ t_wait_for_leader while t_rid_is_fencing $rid; do sleep .5 done +verify_fenced_run $rid t_mount $sv check_read_write echo "== force unmount everything, new server fences all previous" sync +rids="" +# get rids before forced unmount breaks scoutfs statfs +for nr in $(t_fs_nrs); do + rids="$rids $(t_mount_rid $nr)" +done for nr in $(t_fs_nrs); do t_force_umount $nr done @@ -122,6 +144,7 @@ t_mount_all while test -d $(echo /sys/fs/scoutfs/*/fence/* | cut -d " " -f 1); do sleep .5 done +verify_fenced_run $rids check_read_write t_pass diff --git a/utils/fenced/scoutfs-fenced b/utils/fenced/scoutfs-fenced index 6e53099a..fa866e25 100755 --- 
a/utils/fenced/scoutfs-fenced +++ b/utils/fenced/scoutfs-fenced @@ -55,9 +55,21 @@ test -x "$SCOUTFS_FENCED_RUN" || \ error_exit "SCOUTFS_FENCED_RUN '$SCOUTFS_FENCED_RUN' isn't executable" # -# main loop watching for fence request across all filesystems +# Main loop watching for fence requests across all filesystems. The +# server can shut down without waiting for pending fence requests to +# finish. All of the interaction with the fence directory and files can +# fail at any moment. We will generate log messages when the dir or +# files disappear. # +# generate failure messages to stderr while still echoing 0 for the caller +careful_cat() +{ + local path="$@" + + cat "$@" || echo 0 +} + while sleep $SCOUTFS_FENCED_DELAY; do for fence in /sys/fs/scoutfs/*/fence/*; do # catches unmatched regex when no dirs @@ -66,7 +78,8 @@ while sleep $SCOUTFS_FENCED_DELAY; do fi # skip requests that have been handled - if [ $(cat "$fence/fenced") == 1 -o $(cat "$fence/error") == 1 ]; then + if [ "$(careful_cat $fence/fenced)" == 1 -o \ + "$(careful_cat $fence/error)" == 1 ]; then continue fi @@ -81,10 +94,10 @@ while sleep $SCOUTFS_FENCED_DELAY; do export SCOUTFS_FENCED_REQ_RID="$rid" export SCOUTFS_FENCED_REQ_IP="$ip" - $run $SCOUTFS_FENCED_RUN_ARGS + $SCOUTFS_FENCED_RUN $SCOUTFS_FENCED_RUN_ARGS rc=$? if [ "$rc" != 0 ]; then - log_message "server $srv fencing rid $rid saw error status $rc from $run" + log_message "server $srv fencing rid $rid saw error status $rc" echo 1 > "$fence/error" continue fi