Handle disappearing fencing requests and targets

The userspace fencing process wasn't careful about handling underlying
directories that disappear while it was working.

On the server/fenced side, fencing requests can linger after they've
been resolved by writing 1 to fenced or error.  The script could come
back around to see the directory before the server finally removes it,
causing all later uses of the request dir to fail.  We saw this in the
logs as a bunch of cat errors for the various request files.

On the local fence script side, all the mounts can be in the process of
being unmounted so both the /sys/fs dirs and the mount itself can be
removed while we're working.

For both, when we're working with the /sys/fs files we read them without
logging errors and then test that the dir still exists before using what
we read.  When fencing a mount, we stop if findmnt doesn't find the
mount and then raise a umount error only if the /sys/fs dir still exists
after umount fails.

And while we're at it, we have each script's logging append instead of
truncating (if, say, it's a log file instead of an interactive tty).

Signed-off-by: Zach Brown <zab@versity.com>
This commit is contained in:
Zach Brown
2025-11-13 12:15:43 -08:00
parent fd80c17ab6
commit 8ddf9b8c8c
2 changed files with 33 additions and 39 deletions

View File

@@ -8,36 +8,34 @@
echo "$0 running rid '$SCOUTFS_FENCED_REQ_RID' ip '$SCOUTFS_FENCED_REQ_IP' args '$@'" echo "$0 running rid '$SCOUTFS_FENCED_REQ_RID' ip '$SCOUTFS_FENCED_REQ_IP' args '$@'"
log() { echo_fail() {
echo "$@" > /dev/stderr echo "$@" >> /dev/stderr
exit 1 exit 1
} }
echo_fail() { # silence error messages
echo "$@" > /dev/stderr quiet_cat()
exit 1 {
cat "$@" 2>/dev/null
} }
rid="$SCOUTFS_FENCED_REQ_RID" rid="$SCOUTFS_FENCED_REQ_RID"
shopt -s nullglob
for fs in /sys/fs/scoutfs/*; do for fs in /sys/fs/scoutfs/*; do
[ ! -d "$fs" ] && continue fs_rid="$(quiet_cat $fs/rid)"
nr="$(quiet_cat $fs/data_device_maj_min)"
[ ! -d "$fs" -o "$fs_rid" != "$rid" ] && continue
fs_rid="$(cat $fs/rid)" || \ mnt=$(findmnt -l -n -t scoutfs -o TARGET -S $nr) || \
echo_fail "failed to get rid in $fs"
if [ "$fs_rid" != "$rid" ]; then
continue
fi
nr="$(cat $fs/data_device_maj_min)" || \
echo_fail "failed to get data device major:minor in $fs"
mnts=$(findmnt -l -n -t scoutfs -o TARGET -S $nr) || \
echo_fail "findmnt -t scoutfs -S $nr failed" echo_fail "findmnt -t scoutfs -S $nr failed"
for mnt in $mnts; do [ -z "$mnt" ] && continue
umount -f "$mnt" || \
echo_fail "umout -f $mnt failed" if ! umount -qf "$mnt"; then
done if [ -d "$fs" ]; then
echo_fail "umount -qf $mnt failed"
fi
fi
done done
exit 0 exit 0

View File

@@ -7,7 +7,7 @@ message_output()
error_message() error_message()
{ {
message_output "$@" >&2 message_output "$@" >> /dev/stderr
} }
error_exit() error_exit()
@@ -62,31 +62,27 @@ test -x "$SCOUTFS_FENCED_RUN" || \
# files disappear. # files disappear.
# #
# generate failure messages to stderr while still echoing 0 for the caller # silence error messages
careful_cat() quiet_cat()
{ {
local path="$@" cat "$@" 2>/dev/null
cat "$@" || echo 0
} }
while sleep $SCOUTFS_FENCED_DELAY; do while sleep $SCOUTFS_FENCED_DELAY; do
shopt -s nullglob
for fence in /sys/fs/scoutfs/*/fence/*; do for fence in /sys/fs/scoutfs/*/fence/*; do
# catches unmatched regex when no dirs
if [ ! -d "$fence" ]; then
continue
fi
# skip requests that have been handled
if [ "$(careful_cat $fence/fenced)" == 1 -o \
"$(careful_cat $fence/error)" == 1 ]; then
continue
fi
srv=$(basename $(dirname $(dirname $fence))) srv=$(basename $(dirname $(dirname $fence)))
rid="$(cat $fence/rid)" fenced="$(quiet_cat $fence/fenced)"
ip="$(cat $fence/ipv4_addr)" error="$(quiet_cat $fence/error)"
reason="$(cat $fence/reason)" rid="$(quiet_cat $fence/rid)"
ip="$(quiet_cat $fence/ipv4_addr)"
reason="$(quiet_cat $fence/reason)"
# request dirs can linger then disappear after fenced/error is set
if [ ! -d "$fence" -o "$fenced" == "1" -o "$error" == "1" ]; then
continue
fi
log_message "server $srv fencing rid $rid at IP $ip for $reason" log_message "server $srv fencing rid $rid at IP $ip for $reason"