From 8ddf9b8c8c118b9443ff32b2e15ffff2cd084f1f Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Thu, 13 Nov 2025 12:15:43 -0800 Subject: [PATCH] Handle disappearing fencing requests and targets The userspace fencing process wasn't careful about handling underlying directories that disappear while it was working. On the server/fenced side, fencing requests can linger after they've been resolved by writing 1 to fenced or error. The script could come back around to see the directory before the server finally removes it, causing all later uses of the request dir to fail. We saw this in the logs as a bunch of cat errors for the various request files. On the local fence script side, all the mounts can be in the process of being unmounted so both the /sys/fs dirs and the mount itself can be removed while we're working. For both, when we're working with the /sys/fs files we read them without logging errors and then test that the dir still exists before using what we read. When fencing a mount, we stop if findmnt doesn't find the mount and then raise a umount error if the /sys/fs dir exists after umount fails. And while we're at it, we have each script's logging append instead of truncating (if, say, it's a log file instead of an interactive tty). 
Signed-off-by: Zach Brown --- tests/fenced-local-force-unmount.sh | 38 ++++++++++++++--------------- utils/fenced/scoutfs-fenced | 34 ++++++++++++-------------- 2 files changed, 33 insertions(+), 39 deletions(-) diff --git a/tests/fenced-local-force-unmount.sh b/tests/fenced-local-force-unmount.sh index f5553be1..cf879b85 100755 --- a/tests/fenced-local-force-unmount.sh +++ b/tests/fenced-local-force-unmount.sh @@ -8,36 +8,34 @@ echo "$0 running rid '$SCOUTFS_FENCED_REQ_RID' ip '$SCOUTFS_FENCED_REQ_IP' args '$@'" -log() { - echo "$@" > /dev/stderr +echo_fail() { + echo "$@" >> /dev/stderr exit 1 } -echo_fail() { - echo "$@" > /dev/stderr - exit 1 +# silence error messages +quiet_cat() +{ + cat "$@" 2>/dev/null } rid="$SCOUTFS_FENCED_REQ_RID" +shopt -s nullglob for fs in /sys/fs/scoutfs/*; do - [ ! -d "$fs" ] && continue + fs_rid="$(quiet_cat $fs/rid)" + nr="$(quiet_cat $fs/data_device_maj_min)" + [ ! -d "$fs" -o "$fs_rid" != "$rid" ] && continue - fs_rid="$(cat $fs/rid)" || \ - echo_fail "failed to get rid in $fs" - if [ "$fs_rid" != "$rid" ]; then - continue - fi - - nr="$(cat $fs/data_device_maj_min)" || \ - echo_fail "failed to get data device major:minor in $fs" - - mnts=$(findmnt -l -n -t scoutfs -o TARGET -S $nr) || \ + mnt=$(findmnt -l -n -t scoutfs -o TARGET -S $nr) || \ echo_fail "findmnt -t scoutfs -S $nr failed" - for mnt in $mnts; do - umount -f "$mnt" || \ - echo_fail "umout -f $mnt failed" - done + [ -z "$mnt" ] && continue + + if ! umount -qf "$mnt"; then + if [ -d "$fs" ]; then + echo_fail "umount -qf $mnt failed" + fi + fi done exit 0 diff --git a/utils/fenced/scoutfs-fenced b/utils/fenced/scoutfs-fenced index fa866e25..5070de4f 100755 --- a/utils/fenced/scoutfs-fenced +++ b/utils/fenced/scoutfs-fenced @@ -7,7 +7,7 @@ message_output() error_message() { - message_output "$@" >&2 + message_output "$@" >> /dev/stderr } error_exit() @@ -62,31 +62,27 @@ test -x "$SCOUTFS_FENCED_RUN" || \ # files disappear. 
# -# generate failure messages to stderr while still echoing 0 for the caller -careful_cat() +# silence error messages +quiet_cat() { - local path="$@" - - cat "$@" || echo 0 + cat "$@" 2>/dev/null } while sleep $SCOUTFS_FENCED_DELAY; do + shopt -s nullglob for fence in /sys/fs/scoutfs/*/fence/*; do - # catches unmatched regex when no dirs - if [ ! -d "$fence" ]; then - continue - fi - - # skip requests that have been handled - if [ "$(careful_cat $fence/fenced)" == 1 -o \ - "$(careful_cat $fence/error)" == 1 ]; then - continue - fi srv=$(basename $(dirname $(dirname $fence))) - rid="$(cat $fence/rid)" - ip="$(cat $fence/ipv4_addr)" - reason="$(cat $fence/reason)" + fenced="$(quiet_cat $fence/fenced)" + error="$(quiet_cat $fence/error)" + rid="$(quiet_cat $fence/rid)" + ip="$(quiet_cat $fence/ipv4_addr)" + reason="$(quiet_cat $fence/reason)" + + # request dirs can linger then disappear after fenced/error is set + if [ ! -d "$fence" -o "$fenced" == "1" -o "$error" == "1" ]; then + continue + fi log_message "server $srv fencing rid $rid at IP $ip for $reason"