Handle disappearing fencing requests and targets

The userspace fencing process wasn't careful about handling underlying
directories that disappear while it was working.

On the server/fenced side, fencing requests can linger after they've
been resolved by writing 1 to fenced or error.  The script could come
back around to see the directory before the server finally removes it,
causing all later uses of the request dir to fail.  We saw this in the
logs as a bunch of cat errors for the various request files.

On the local fence script side, all the mounts can be in the process of
being unmounted so both the /sys/fs dirs and the mount itself can be
removed while we're working.

For both, when we're working with the /sys/fs files we read them without
logging errors and then test that the dir still exists before using what
we read.  When fencing a mount, we stop if findmnt doesn't find the
mount and then raise a umount error if the /sys/fs dir exists after
umount fails.

And while we're at it, we have each script's logging append instead of
truncating (if, say, it's a log file instead of an interactive tty).

Signed-off-by: Zach Brown <zab@versity.com>
This commit is contained in:
Zach Brown
2025-11-13 12:15:43 -08:00
parent fd80c17ab6
commit 8ddf9b8c8c
2 changed files with 33 additions and 39 deletions

View File

@@ -8,36 +8,34 @@
echo "$0 running rid '$SCOUTFS_FENCED_REQ_RID' ip '$SCOUTFS_FENCED_REQ_IP' args '$@'"
log() {
echo "$@" > /dev/stderr
echo_fail() {
echo "$@" >> /dev/stderr
exit 1
}
echo_fail() {
echo "$@" > /dev/stderr
exit 1
# silence error messages
quiet_cat()
{
cat "$@" 2>/dev/null
}
rid="$SCOUTFS_FENCED_REQ_RID"
shopt -s nullglob
for fs in /sys/fs/scoutfs/*; do
[ ! -d "$fs" ] && continue
fs_rid="$(quiet_cat $fs/rid)"
nr="$(quiet_cat $fs/data_device_maj_min)"
[ ! -d "$fs" -o "$fs_rid" != "$rid" ] && continue
fs_rid="$(cat $fs/rid)" || \
echo_fail "failed to get rid in $fs"
if [ "$fs_rid" != "$rid" ]; then
continue
fi
nr="$(cat $fs/data_device_maj_min)" || \
echo_fail "failed to get data device major:minor in $fs"
mnts=$(findmnt -l -n -t scoutfs -o TARGET -S $nr) || \
mnt=$(findmnt -l -n -t scoutfs -o TARGET -S $nr) || \
echo_fail "findmnt -t scoutfs -S $nr failed"
for mnt in $mnts; do
umount -f "$mnt" || \
echo_fail "umout -f $mnt failed"
done
[ -z "$mnt" ] && continue
if ! umount -qf "$mnt"; then
if [ -d "$fs" ]; then
echo_fail "umount -qf $mnt failed"
fi
fi
done
exit 0

View File

@@ -7,7 +7,7 @@ message_output()
error_message()
{
message_output "$@" >&2
message_output "$@" >> /dev/stderr
}
error_exit()
@@ -62,31 +62,27 @@ test -x "$SCOUTFS_FENCED_RUN" || \
# files disappear.
#
# generate failure messages to stderr while still echoing 0 for the caller
careful_cat()
# silence error messages
quiet_cat()
{
local path="$@"
cat "$@" || echo 0
cat "$@" 2>/dev/null
}
while sleep $SCOUTFS_FENCED_DELAY; do
shopt -s nullglob
for fence in /sys/fs/scoutfs/*/fence/*; do
# catches unmatched regex when no dirs
if [ ! -d "$fence" ]; then
continue
fi
# skip requests that have been handled
if [ "$(careful_cat $fence/fenced)" == 1 -o \
"$(careful_cat $fence/error)" == 1 ]; then
continue
fi
srv=$(basename $(dirname $(dirname $fence)))
rid="$(cat $fence/rid)"
ip="$(cat $fence/ipv4_addr)"
reason="$(cat $fence/reason)"
fenced="$(quiet_cat $fence/fenced)"
error="$(quiet_cat $fence/error)"
rid="$(quiet_cat $fence/rid)"
ip="$(quiet_cat $fence/ipv4_addr)"
reason="$(quiet_cat $fence/reason)"
# request dirs can linger then disappear after fenced/error is set
if [ ! -d "$fence" -o "$fenced" == "1" -o "$error" == "1" ]; then
continue
fi
log_message "server $srv fencing rid $rid at IP $ip for $reason"