From 004f693af3b3884b79fd6e392896b4ba9e97742f Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Thu, 21 Jan 2021 10:48:20 -0800 Subject: [PATCH 01/17] Add golden output for mount-unmount-race test Signed-off-by: Zach Brown --- tests/golden/mount-unmount-race | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 tests/golden/mount-unmount-race diff --git a/tests/golden/mount-unmount-race b/tests/golden/mount-unmount-race new file mode 100644 index 00000000..7094e557 --- /dev/null +++ b/tests/golden/mount-unmount-race @@ -0,0 +1,3 @@ +== create per mount files +== 30s of racing random mount/umount +== mounting any unmounted From 47a1ac92f70079fadca5ac66f0dd6cc9ab7bf1ac Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Thu, 21 Jan 2021 10:51:07 -0800 Subject: [PATCH 02/17] Update ino-path args in basic-posix-consistency The ino-path calls in basic-posix-consistency weren't updated for the recent change to scoutfs cli args. Signed-off-by: Zach Brown --- tests/tests/basic-posix-consistency.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/tests/basic-posix-consistency.sh b/tests/tests/basic-posix-consistency.sh index 9c0d2398..94c5bd0d 100644 --- a/tests/tests/basic-posix-consistency.sh +++ b/tests/tests/basic-posix-consistency.sh @@ -160,8 +160,8 @@ for i in $(seq 1 1); do mkdir -p $(dirname $lnk) ln "$T_D0/file" $lnk - scoutfs ino-path $ino "$T_M0" > "$T_TMP.0" - scoutfs ino-path $ino "$T_M1" > "$T_TMP.1" + scoutfs ino-path -p "$T_M0" $ino > "$T_TMP.0" + scoutfs ino-path -p "$T_M1" $ino > "$T_TMP.1" diff -u "$T_TMP.0" "$T_TMP.1" done done From 631801c45cd11c1a176cf001fd24d34b98461403 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Fri, 22 Jan 2021 09:45:27 -0800 Subject: [PATCH 03/17] Don't queue lock invalidation work during shutdown The lock invalidation work function needs to be careful not to requeue itself while we're shutting down or we can be left with invalidation functions racing with shutdown. 
Invalidation calls igrab so we can end up with an unmount warning that there are still inodes in use. Signed-off-by: Zach Brown --- kmod/src/lock.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/kmod/src/lock.c b/kmod/src/lock.c index 309a1152..6ab4081c 100644 --- a/kmod/src/lock.c +++ b/kmod/src/lock.c @@ -770,16 +770,6 @@ static void lock_invalidate_worker(struct work_struct *work) list_for_each_entry_safe(lock, tmp, &linfo->inv_list, inv_head) { nl = &lock->inv_nl; - /* skip if grace hasn't elapsed, record earliest */ - deadline = lock->grace_deadline; - if (ktime_before(now, deadline)) { - delay = min(delay, - nsecs_to_jiffies(ktime_to_ns( - ktime_sub(deadline, now)))); - scoutfs_inc_counter(linfo->sb, lock_grace_wait); - continue; - } - /* wait for reordered grant to finish */ if (lock->mode != nl->old_mode) continue; @@ -788,6 +778,15 @@ static void lock_invalidate_worker(struct work_struct *work) if (!lock_counts_match(nl->new_mode, lock->users)) continue; + /* skip if grace hasn't elapsed, record earliest */ + deadline = lock->grace_deadline; + if (!linfo->shutdown && ktime_before(now, deadline)) { + delay = min(delay, - nsecs_to_jiffies(ktime_to_ns( + ktime_sub(deadline, now)))); + scoutfs_inc_counter(linfo->sb, lock_grace_wait); + continue; + } /* set the new mode, no incompatible users during inval */ lock->mode = nl->new_mode; From 0354bb64c583aefa8e69f0012529f2c11230a724 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Fri, 22 Jan 2021 11:42:50 -0800 Subject: [PATCH 04/17] More carefully enable tracing in run-tests run-tests.sh has a -t argument which takes a whitespace separated string of globs of events to enable. This was hard to use and made it very easy to accidentally expand the globs at the wrong place in the script. This makes each -t argument specify a single word glob which is stored in an array so the glob isn't expanded until it's applied to the trace event path. 
We also add an error for -t globs that didn't match any events and add a message with the count of -t arguments and enabled events. Signed-off-by: Zach Brown --- tests/run-tests.sh | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/tests/run-tests.sh b/tests/run-tests.sh index 4963ff9a..59b58c4a 100755 --- a/tests/run-tests.sh +++ b/tests/run-tests.sh @@ -62,6 +62,7 @@ $(basename $0) options: | exist. Previous results will be deleted as each test runs. -s | Skip git repo checkouts. -t | Enabled trace events that match the given glob argument. + | Multiple options enable multiple globbed events. -X | xfstests git repo. Used by tests/xfstests.sh. -x | xfstests git branch to checkout and track. -y | xfstests ./check additional args @@ -77,6 +78,9 @@ done T_TRACE_DUMP="0" T_TRACE_PRINTK="0" +# array declarations to be able to use array ops +declare -a T_TRACE_GLOB + while true; do case $1 in -a) @@ -147,7 +151,7 @@ while true; do ;; -t) test -n "$2" || die "-t must have trace glob argument" - T_TRACE_GLOB="$2" + T_TRACE_GLOB+=("$2") shift ;; -X) @@ -314,12 +318,17 @@ if [ -n "$T_INSMOD" ]; then cmd insmod "$T_KMOD/src/scoutfs.ko" fi -if [ -n "$T_TRACE_GLOB" ]; then - msg "enabling trace events" +nr_globs=${#T_TRACE_GLOB[@]} +if [ $nr_globs -gt 0 ]; then echo 0 > /sys/kernel/debug/tracing/events/scoutfs/enable - for g in $T_TRACE_GLOB; do + + for g in "${T_TRACE_GLOB[@]}"; do for e in /sys/kernel/debug/tracing/events/scoutfs/$g/enable; do - echo 1 > $e + if test -w "$e"; then + echo 1 > "$e" + else + die "-t glob '$g' matched no scoutfs events" + fi done done @@ -329,6 +338,9 @@ if [ -n "$T_TRACE_GLOB" ]; then cmd cat /sys/kernel/debug/tracing/set_event cmd grep . 
/sys/kernel/debug/tracing/options/trace_printk \ /proc/sys/kernel/ftrace_dump_on_oops + + nr_events=$(cat /sys/kernel/debug/tracing/set_event | wc -l) + msg "enabled $nr_events trace events from $nr_globs -t globs" fi # From 0607dfdac8d6ea3d251acd891957947edea3d0b3 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Fri, 22 Jan 2021 12:37:18 -0800 Subject: [PATCH 05/17] Enable and collect trace_printk Weirdly, run-tests was treating trace_printk not as an option to enable trace_printk() traces but as an option to print trace events to the console with printk? That's not a thing. Make -P really enable trace_printk tracing and collect it as we would enabled trace events. It needs to be treated separately from the -t options that enable trace events. While we're at it treat the -P trace dumping option as a stand-alone option that works without -t arguments. Signed-off-by: Zach Brown --- tests/run-tests.sh | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/tests/run-tests.sh b/tests/run-tests.sh index 59b58c4a..e3920a75 100755 --- a/tests/run-tests.sh +++ b/tests/run-tests.sh @@ -53,7 +53,7 @@ $(basename $0) options: -m | Run mkfs on the device before mounting and running | tests. Implies unmounting existing mounts first. -n | The number of devices and mounts to test. - -P | Output trace events with printk as they're generated. + -P | Enable trace_printk. -p | Exit script after preparing mounts only, don't run tests. -q | Specify the quorum count needed to mount. This is | used when running mkfs and is needed by a few tests. @@ -332,17 +332,23 @@ if [ $nr_globs -gt 0 ]; then done done - echo "$T_TRACE_DUMP" > /proc/sys/kernel/ftrace_dump_on_oops - echo "$T_TRACE_PRINTK" > /sys/kernel/debug/tracing/options/trace_printk - - cmd cat /sys/kernel/debug/tracing/set_event - cmd grep . 
/sys/kernel/debug/tracing/options/trace_printk \ - /proc/sys/kernel/ftrace_dump_on_oops - nr_events=$(cat /sys/kernel/debug/tracing/set_event | wc -l) msg "enabled $nr_events trace events from $nr_globs -t globs" fi +if [ -n "$T_TRACE_PRINTK" ]; then + echo "$T_TRACE_PRINTK" > /sys/kernel/debug/tracing/options/trace_printk +fi + +if [ -n "$T_TRACE_DUMP" ]; then + echo "$T_TRACE_DUMP" > /proc/sys/kernel/ftrace_dump_on_oops +fi + +# always describe tracing in the logs +cmd cat /sys/kernel/debug/tracing/set_event +cmd grep . /sys/kernel/debug/tracing/options/trace_printk \ + /proc/sys/kernel/ftrace_dump_on_oops + # # mount concurrently so that a quorum is present to elect the leader and # start a server. @@ -529,9 +535,10 @@ msg "all tests run: $passed passed, $skipped skipped, $failed failed" unmount_all -if [ -n "$T_TRACE_GLOB" ]; then +if [ -n "$T_TRACE_GLOB" -o -n "$T_TRACE_PRINTK" ]; then msg "saving traces and disabling tracing" echo 0 > /sys/kernel/debug/tracing/events/scoutfs/enable + echo 0 > /sys/kernel/debug/tracing/options/trace_printk cat /sys/kernel/debug/tracing/trace > "$T_RESULTS/traces" fi From 7a965372105c6220ed715399ee59bb03564516dc Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Fri, 22 Jan 2021 14:59:23 -0800 Subject: [PATCH 06/17] Leave mounts mounted if run-tests fails We can lose interesting state if the mounts are unmounted as tests fail, only unmount if all the tests pass. 
Signed-off-by: Zach Brown --- tests/run-tests.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/run-tests.sh b/tests/run-tests.sh index e3920a75..648dffa9 100755 --- a/tests/run-tests.sh +++ b/tests/run-tests.sh @@ -533,7 +533,6 @@ done msg "all tests run: $passed passed, $skipped skipped, $failed failed" -unmount_all if [ -n "$T_TRACE_GLOB" -o -n "$T_TRACE_PRINTK" ]; then msg "saving traces and disabling tracing" @@ -544,13 +543,14 @@ fi if [ "$skipped" == 0 -a "$failed" == 0 ]; then msg "all tests passed" + unmount_all exit 0 fi if [ "$skipped" != 0 ]; then - msg "$skipped tests skipped, check skip.log" + msg "$skipped tests skipped, check skip.log, still mounted" fi if [ "$failed" != 0 ]; then - msg "$failed tests failed, check fail.log" + msg "$failed tests failed, check fail.log, still mounted" fi exit 1 From 098fc420bef6d33df89f8182463fc81ae71b607f Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Fri, 22 Jan 2021 15:34:28 -0800 Subject: [PATCH 07/17] Add some item cache page tracing Signed-off-by: Zach Brown --- kmod/src/item.c | 4 ++++ kmod/src/scoutfs_trace.h | 47 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) diff --git a/kmod/src/item.c b/kmod/src/item.c index 60cec43a..2b03c39f 100644 --- a/kmod/src/item.c +++ b/kmod/src/item.c @@ -1494,6 +1494,8 @@ retry: rbtree_erase(&rd->node, &root); rbtree_insert(&rd->node, par, pnode, &cinf->pg_root); lru_accessed(sb, cinf, rd); + trace_scoutfs_item_read_page(sb, key, &rd->start, + &rd->end); continue; } @@ -2345,6 +2347,8 @@ retry: write_lock(&pg->rwlock); pgi = trim_page_intersection(sb, cinf, pg, right, start, end); + trace_scoutfs_item_invalidate_page(sb, start, end, + &pg->start, &pg->end, pgi); BUG_ON(pgi == PGI_DISJOINT); /* walk wouldn't ret disjoint */ if (pgi == PGI_INSIDE) { diff --git a/kmod/src/scoutfs_trace.h b/kmod/src/scoutfs_trace.h index 580cb6a2..ca0471e4 100644 --- a/kmod/src/scoutfs_trace.h +++ b/kmod/src/scoutfs_trace.h @@ -2421,6 
+2421,53 @@ TRACE_EVENT(scoutfs_alloc_move, __entry->ret) ); +TRACE_EVENT(scoutfs_item_read_page, + TP_PROTO(struct super_block *sb, struct scoutfs_key *key, + struct scoutfs_key *pg_start, struct scoutfs_key *pg_end), + TP_ARGS(sb, key, pg_start, pg_end), + TP_STRUCT__entry( + SCSB_TRACE_FIELDS + sk_trace_define(key) + sk_trace_define(pg_start) + sk_trace_define(pg_end) + ), + TP_fast_assign( + SCSB_TRACE_ASSIGN(sb); + sk_trace_assign(key, key); + sk_trace_assign(pg_start, pg_start); + sk_trace_assign(pg_end, pg_end); + ), + TP_printk(SCSBF" key "SK_FMT" pg_start "SK_FMT" pg_end "SK_FMT, + SCSB_TRACE_ARGS, sk_trace_args(key), sk_trace_args(pg_start), + sk_trace_args(pg_end)) +); + +TRACE_EVENT(scoutfs_item_invalidate_page, + TP_PROTO(struct super_block *sb, struct scoutfs_key *start, + struct scoutfs_key *end, struct scoutfs_key *pg_start, + struct scoutfs_key *pg_end, int pgi), + TP_ARGS(sb, start, end, pg_start, pg_end, pgi), + TP_STRUCT__entry( + SCSB_TRACE_FIELDS + sk_trace_define(start) + sk_trace_define(end) + sk_trace_define(pg_start) + sk_trace_define(pg_end) + __field(int, pgi) + ), + TP_fast_assign( + SCSB_TRACE_ASSIGN(sb); + sk_trace_assign(start, start); + sk_trace_assign(end, end); + sk_trace_assign(pg_start, pg_start); + sk_trace_assign(pg_end, pg_end); + __entry->pgi = pgi; + ), + TP_printk(SCSBF" start "SK_FMT" end "SK_FMT" pg_start "SK_FMT" pg_end "SK_FMT" pgi %d", + SCSB_TRACE_ARGS, sk_trace_args(start), sk_trace_args(end), + sk_trace_args(pg_start), sk_trace_args(pg_end), __entry->pgi) +); + #endif /* _TRACE_SCOUTFS_H */ /* This part must be outside protection */ From da5911c311955038e5ed24a7ea7d7ef0c6717a69 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Fri, 22 Jan 2021 18:41:06 -0800 Subject: [PATCH 08/17] Use d_materialise_unique to splice dir dentries When we're splicing in dentries in lookup we can be splicing the result of changes on other nodes into a stale dcache. 
The stale dcache might contain dir entries and the dcache does not allow aliased directories. Use d_materialise_unique() to splice in dir inodes so that we remove all aliased dentries which must be stale. We can still use d_splice_alias() for all other inode types. Any existing stale dentries will fail revalidation before they're used. Signed-off-by: Zach Brown --- kmod/src/dir.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/kmod/src/dir.c b/kmod/src/dir.c index 8fdb29d0..0bfc0e7b 100644 --- a/kmod/src/dir.c +++ b/kmod/src/dir.c @@ -463,7 +463,18 @@ out: else inode = scoutfs_iget(sb, ino); - return d_splice_alias(inode, dentry); + /* + * We can't splice dir aliases into the dcache. dir entries + * might have changed on other nodes so our dcache could still + * contain them, rather than having been moved in rename. For + * dirs, we use d_materialize_unique to remove any existing + * aliases which must be stale. Our inode numbers aren't reused + * so inodes pointed to by entries can't change types. + */ + if (!IS_ERR_OR_NULL(inode) && S_ISDIR(inode->i_mode)) + return d_materialise_unique(dentry, inode); + else + return d_splice_alias(inode, dentry); } /* From 8123b8fc357a081bb831ec70ca9174684787e694 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Mon, 25 Jan 2021 09:43:52 -0800 Subject: [PATCH 09/17] fix lock-conflicting-batch-commit conf output The test had a silly typo in the label it put on the time it took mounts to perform conflicting metadata changes. 
Signed-off-by: Zach Brown --- tests/tests/lock-conflicting-batch-commit.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tests/lock-conflicting-batch-commit.sh b/tests/tests/lock-conflicting-batch-commit.sh index 4f9d0ede..1b408ffc 100644 --- a/tests/tests/lock-conflicting-batch-commit.sh +++ b/tests/tests/lock-conflicting-batch-commit.sh @@ -50,7 +50,7 @@ for m in 0 1; do done wait CONF="$((SECONDS - START))" -echo "conf: $IND" >> $T_TMP.log +echo "conf: $CONF" >> $T_TMP.log if [ "$CONF" -gt "$((IND * 5))" ]; then t_fail "conflicting $CONF secs is more than 5x independent $IND secs" From 32e7978a6ec5d5f6f397b59738a75ae7cc534231 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Mon, 25 Jan 2021 09:50:13 -0800 Subject: [PATCH 10/17] Extend lock invalidate grace period The grace period is intended to let lock holders squeeze in more bulk work before another node pulls the lock out from under them. The length of the delay is a balance between getting more work done per lock hold and adding latency to ping-ponging workloads. The current grace period was too short. To do work in the conflicting case you often have to read the result that the other mount wrote as you invalidated their lock. The test was written in the LSM world where we'd effectively read a single level 0 1MB segment. In the btree world we're checking bloom blocks and reading the other mount's btree. It has more dependent read latency. So we turn up the grace period to let conflicting readers squeeze in more work before pulling the lock out from under them. This value was chosen to make lock-conflicting-batch-commit pass in guests sharing nvme metadata devices in debugging kernels. Signed-off-by: Zach Brown --- kmod/src/lock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kmod/src/lock.c b/kmod/src/lock.c index 6ab4081c..189af768 100644 --- a/kmod/src/lock.c +++ b/kmod/src/lock.c @@ -65,7 +65,7 @@ * relative to that lock state we resend. 
*/ -#define GRACE_PERIOD_KT ms_to_ktime(2) +#define GRACE_PERIOD_KT ms_to_ktime(10) /* * allocated per-super, freed on unmount. From 35ed1a2438624ee2f7913babd945bf7d7c05556d Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Mon, 25 Jan 2021 10:28:31 -0800 Subject: [PATCH 11/17] Add t_require_meta_size function Add a function that tests can use to skip when the metadata device isn't large enough. I thought we needed to avoid enospc in a particular test, but it turns out the test's failure was unrelated. So this isn't used for now but it seems nice to keep around. Signed-off-by: Zach Brown --- tests/funcs/require.sh | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/funcs/require.sh b/tests/funcs/require.sh index 17b757cf..2d0f36ae 100644 --- a/tests/funcs/require.sh +++ b/tests/funcs/require.sh @@ -23,3 +23,18 @@ t_require_mounts() { test "$T_NR_MOUNTS" -ge "$req" || \ t_skip "$req mounts required, only have $T_NR_MOUNTS" } + +# +# Require that the meta device be at least the size string argument, as +# parsed by numfmt using single char base 2 suffixes (iec).. 64G, etc. +# +t_require_meta_size() { + local dev="$T_META_DEVICE" + local req_iec="$1" + local req_bytes=$(numfmt --from=iec --to=none $req_iec) + local dev_bytes=$(blockdev --getsize64 $dev) + local dev_iec=$(numfmt --from=auto --to=iec $dev_bytes) + + test "$dev_bytes" -ge "$req_bytes" || \ + t_skip "$dev must be at least $req_iec, is $dev_iec" +} From d39268bbc1a51bd06b4e099eb1a9db30692b1e7e Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Mon, 25 Jan 2021 15:27:13 -0800 Subject: [PATCH 12/17] Fix spurious EIO from scoutfs_srch_get_compact scoutfs_srch_get_compact() is building up a compaction request which has a list of srch files to read and sort and write into a new srch file. It finds input files by searching for a sufficient number of similar files: first any unsorted log files and then sorted log files that are around the same size. 
It finds the files by using btree next on the srch zone which has types for unsorted srch log files, sorted srch files, but also pending and busy compaction items. It was being far too cute about iterating over different key types. It was trying to adapt to finding the next key and was making assumptions about the order of key types. It didn't notice that the pending and busy key types followed log and sorted and would generate EIO when it ran into them and found their value length didn't match what it was expecting. Rework the next item ref parsing so that it returns -ENOENT if it gets an unexpected key type, then look for the next key type when checking enoent. Signed-off-by: Zach Brown --- kmod/src/srch.c | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/kmod/src/srch.c b/kmod/src/srch.c index 4c361dd1..00c4f0fd 100644 --- a/kmod/src/srch.c +++ b/kmod/src/srch.c @@ -1198,14 +1198,10 @@ int scoutfs_srch_get_compact(struct super_block *sb, for (;;scoutfs_key_inc(&key)) { ret = scoutfs_btree_next(sb, root, &key, &iref); - if (ret == -ENOENT) { - ret = 0; - sc->nr = 0; - goto out; - } - if (ret == 0) { - if (iref.val_len == sizeof(struct scoutfs_srch_file)) { + if (iref.key->sk_type != type) { + ret = -ENOENT; + } else if (iref.val_len == sizeof(sfl)) { key = *iref.key; memcpy(&sfl, iref.val, iref.val_len); } else { @@ -1213,24 +1209,25 @@ int scoutfs_srch_get_compact(struct super_block *sb, } scoutfs_btree_put_iref(&iref); } - if (ret < 0) + if (ret < 0) { + /* see if we ran out of log files or files entirely */ + if (ret == -ENOENT) { + sc->nr = 0; + if (type == SCOUTFS_SRCH_LOG_TYPE) { + type = SCOUTFS_SRCH_BLOCKS_TYPE; + init_srch_key(&key, type, 0, 0); + continue; + } else { + ret = 0; + } + } goto out; + } /* skip any files already being compacted */ if (scoutfs_spbm_test(&busy, le64_to_cpu(sfl.ref.blkno))) continue; - /* see if we ran out of log files or files entirely */ - if (key.sk_type != type) { - 
sc->nr = 0; - if (key.sk_type == SCOUTFS_SRCH_BLOCKS_TYPE) { - type = SCOUTFS_SRCH_BLOCKS_TYPE; - } else { - ret = 0; - goto out; - } - } - /* reset if we iterated into the next size category */ if (type == SCOUTFS_SRCH_BLOCKS_TYPE) { order = fls64(le64_to_cpu(sfl.blocks)) / From e9c3aa650157a5c07085e88f776a136aaa3809ef Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Thu, 21 Jan 2021 11:05:14 -0800 Subject: [PATCH 13/17] More carefully cancel server farewell work Farewell work is queued by farewell message processing. Server shutdown didn't properly wait for pending farewell work to finish before tearing down. As the server work destroyed the server's connection the farewell work could still be running and try to send responses down the socket. We make the server more carefully avoid queueing farewell work if it's in the process of shutting down and wait for farewell work to finish before destroying the server's resources. This fixed all manner of crashes that were seen in testing when a bunch of nodes unmounted, creating farewell work on the server as it itself unmounted and destroyed the server. Signed-off-by: Zach Brown --- kmod/src/server.c | 37 +++++++++++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/kmod/src/server.c b/kmod/src/server.c index 958343a1..ba4c9b4e 100644 --- a/kmod/src/server.c +++ b/kmod/src/server.c @@ -1096,6 +1096,20 @@ static int cancel_srch_compact(struct super_block *sb, u64 rid) return ret; } +/* + * Farewell processing is async to the request processing work. Shutdown + * waits for request processing to finish and then tears down the connection. + * We don't want to queue farewell processing once we start shutting down + * so that we don't have farewell processing racing with the connection + * being shut down. If a mount's farewell message is dropped by a server + * it will be processed by the next server. 
+ */ +static void queue_farewell_work(struct server_info *server) +{ + if (!server->shutting_down) + queue_work(server->wq, &server->farewell_work); +} + /* * Process an incoming greeting request in the server from the client. * We try to send responses to failed greetings so that the sender can @@ -1400,8 +1414,8 @@ out: if (ret < 0) stop_server(server); - else if (more_reqs && !server->shutting_down) - queue_work(server->wq, &server->farewell_work); + else if (more_reqs) + queue_farewell_work(server); } static void free_farewell_requests(struct super_block *sb, u64 rid) @@ -1455,7 +1469,7 @@ static int server_farewell(struct super_block *sb, list_add_tail(&fw->entry, &server->farewell_requests); mutex_unlock(&server->farewell_mutex); - queue_work(server->wq, &server->farewell_work); + queue_farewell_work(server); /* response will be sent later */ return 0; @@ -1618,12 +1632,17 @@ static void scoutfs_server_worker(struct work_struct *work) shutdown: scoutfs_info(sb, "server shutting down at "SIN_FMT, SIN_ARG(&sin)); - /* wait for request processing */ + + /* wait for farewell to finish sending messages */ + flush_work(&server->farewell_work); + + /* wait for requests to finish, no more requests */ scoutfs_net_shutdown(sb, conn); - /* wait for commit queued by request processing */ - flush_work(&server->commit_work); server->conn = NULL; + /* wait for extra queues by requests, won't find waiters */ + flush_work(&server->commit_work); + scoutfs_lock_server_destroy(sb); out: @@ -1696,8 +1715,9 @@ void scoutfs_server_stop(struct super_block *sb) DECLARE_SERVER_INFO(sb, server); stop_server(server); - /* XXX not sure both are needed */ + cancel_work_sync(&server->work); + cancel_work_sync(&server->farewell_work); cancel_work_sync(&server->commit_work); } @@ -1752,11 +1772,12 @@ void scoutfs_server_destroy(struct super_block *sb) /* wait for server work to wait for everything to shut down */ cancel_work_sync(&server->work); + /* farewell work triggers commits */ + 
cancel_work_sync(&server->farewell_work); /* recv work/compaction could have left commit_work queued */ cancel_work_sync(&server->commit_work); /* pending farewell requests are another server's problem */ - cancel_work_sync(&server->farewell_work); free_farewell_requests(sb, 0); trace_scoutfs_server_workqueue_destroy(sb, 0, 0); From 1fc706bf3f41baf1385c05acff6e36f8904a49a7 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Tue, 26 Jan 2021 10:30:04 -0800 Subject: [PATCH 14/17] Filter hrtimer slow messages from dmesg When running in debug kernels in guests we can really bog down things enough to trigger hrtimer warnings. I don't think there's much we can reasonably do about that. Signed-off-by: Zach Brown --- tests/funcs/filter.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/funcs/filter.sh b/tests/funcs/filter.sh index 65353a04..1429fec8 100644 --- a/tests/funcs/filter.sh +++ b/tests/funcs/filter.sh @@ -59,5 +59,8 @@ t_filter_dmesg() # some tests mount w/o options re="$re|scoutfs .* error: Required mount option \"metadev_path\" not found" + # in debugging kernels we can slow things down a bit + re="$re|hrtimer: interrupt took .*" + egrep -v "($re)" } From f81e4cb98ae60565bcda3e99f23bade2a264fae8 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Tue, 26 Jan 2021 10:35:39 -0800 Subject: [PATCH 15/17] Add whitespace to xfstests output message The message indicating that xfstests output was now being shown was mashed up against the previous passed stats and it was gross and I hated it. Signed-off-by: Zach Brown --- tests/tests/xfstests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tests/xfstests.sh b/tests/tests/xfstests.sh index 3d21ca39..274088e6 100644 --- a/tests/tests/xfstests.sh +++ b/tests/tests/xfstests.sh @@ -83,7 +83,7 @@ generic/375 # utils output change? update branch? 
EOF t_restore_output -echo "(showing output of xfstests)" +echo " (showing output of xfstests)" args="-E local.exclude ${T_XFSTESTS_ARGS:--g quick}" ./check $args From 5a90234c94ac8081f518a7a94853e4008aa1c046 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Tue, 26 Jan 2021 10:32:31 -0800 Subject: [PATCH 16/17] Use terminated test name when saving passed stats We've grown some test names that are prefixes of others (createmany-parallel, createmany-parallel-mounts). When we're searching for lines with the test name we have to search for the exact test name, by terminating the name with a space, instead of searching for a line that starts with the test name. This fixes strange output and saved passed stats for the names that share a prefix. Signed-off-by: Zach Brown --- tests/run-tests.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/run-tests.sh b/tests/run-tests.sh index 648dffa9..f2e5983f 100755 --- a/tests/run-tests.sh +++ b/tests/run-tests.sh @@ -452,7 +452,7 @@ for t in $tests; do # get stats from previous pass last="$T_RESULTS/last-passed-test-stats" - stats=$(grep -s "^$test_name" "$last" | cut -d " " -f 2-) + stats=$(grep -s "^$test_name " "$last" | cut -d " " -f 2-) test -n "$stats" && stats="last: $stats" printf " %-30s $stats" "$test_name" @@ -515,7 +515,7 @@ for t in $tests; do echo " passed: $stats" ((passed++)) # save stats for passed test - grep -s -v "^$test_name" "$last" > "$last.tmp" + grep -s -v "^$test_name " "$last" > "$last.tmp" echo "$test_name $stats" >> "$last.tmp" mv -f "$last.tmp" "$last" elif [ "$sts" == "$T_SKIP_STATUS" ]; then From ade539217e5a09955e903c5ced1dd7c70e82cf34 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Tue, 26 Jan 2021 12:58:02 -0800 Subject: [PATCH 17/17] Handle advance_seq being replayed in new server As a core principle, all server message processing needs to be safe to replay as servers shut down and requests are resent to new servers. The advance_seq handler got this wrong. 
It would only try to remove a trans_seq item for the seq sent by the client before inserting a new item for the next seq. This change could be committed before the reply was lost as the server shuts down. The next server would process the resent request but wouldn't find the old item for the seq that the client sent, and would ignore the new item that the previous server inserted. It would then insert another greater seq for the same client. This would leave behind a stale old trans_seq that would be returned as the last_seq which would forever limit the results that could be returned from the seq index walks. The fix is to always remove all previous seq items for the client before inserting a new one. This creates O(clients) server work, but it's minimal. This manifested as occasional simple-inode-index test failures (say 1 in 5?) which would trigger if the unmounts during previous tests happened to have advance_seq resent across server shutdowns. With this change the test now reliably passes. 
Signed-off-by: Zach Brown --- kmod/src/client.c | 8 +- kmod/src/scoutfs_trace.h | 18 ++--- kmod/src/server.c | 166 +++++++++++++++++++++------------------ 3 files changed, 100 insertions(+), 92 deletions(-) diff --git a/kmod/src/client.c b/kmod/src/client.c index ffe06de8..250872d8 100644 --- a/kmod/src/client.c +++ b/kmod/src/client.c @@ -121,16 +121,14 @@ int scoutfs_client_get_roots(struct super_block *sb, int scoutfs_client_advance_seq(struct super_block *sb, u64 *seq) { struct client_info *client = SCOUTFS_SB(sb)->client_info; - __le64 before = cpu_to_le64p(seq); - __le64 after; + __le64 leseq; int ret; ret = scoutfs_net_sync_request(sb, client->conn, SCOUTFS_NET_CMD_ADVANCE_SEQ, - &before, sizeof(before), - &after, sizeof(after)); + NULL, 0, &leseq, sizeof(leseq)); if (ret == 0) - *seq = le64_to_cpu(after); + *seq = le64_to_cpu(leseq); return ret; } diff --git a/kmod/src/scoutfs_trace.h b/kmod/src/scoutfs_trace.h index ca0471e4..5b02781e 100644 --- a/kmod/src/scoutfs_trace.h +++ b/kmod/src/scoutfs_trace.h @@ -1938,31 +1938,27 @@ DEFINE_EVENT(scoutfs_clock_sync_class, scoutfs_recv_clock_sync, ); TRACE_EVENT(scoutfs_trans_seq_advance, - TP_PROTO(struct super_block *sb, u64 rid, u64 prev_seq, - u64 next_seq), + TP_PROTO(struct super_block *sb, u64 rid, u64 trans_seq), - TP_ARGS(sb, rid, prev_seq, next_seq), + TP_ARGS(sb, rid, trans_seq), TP_STRUCT__entry( SCSB_TRACE_FIELDS __field(__u64, s_rid) - __field(__u64, prev_seq) - __field(__u64, next_seq) + __field(__u64, trans_seq) ), TP_fast_assign( SCSB_TRACE_ASSIGN(sb); __entry->s_rid = rid; - __entry->prev_seq = prev_seq; - __entry->next_seq = next_seq; + __entry->trans_seq = trans_seq; ), - TP_printk(SCSBF" rid %016llx prev_seq %llu next_seq %llu", - SCSB_TRACE_ARGS, __entry->s_rid, __entry->prev_seq, - __entry->next_seq) + TP_printk(SCSBF" rid %016llx trans_seq %llu\n", + SCSB_TRACE_ARGS, __entry->s_rid, __entry->trans_seq) ); -TRACE_EVENT(scoutfs_trans_seq_farewell, +TRACE_EVENT(scoutfs_trans_seq_remove, 
TP_PROTO(struct super_block *sb, u64 rid, u64 trans_seq), TP_ARGS(sb, rid, trans_seq), diff --git a/kmod/src/server.c b/kmod/src/server.c index ba4c9b4e..c73ddf71 100644 --- a/kmod/src/server.c +++ b/kmod/src/server.c @@ -649,79 +649,10 @@ static void init_trans_seq_key(struct scoutfs_key *key, u64 seq, u64 rid) } /* - * Give the client the next sequence number for their transaction. They - * provide their previous transaction sequence number that they've - * committed. - * - * We track the sequence numbers of transactions that clients have open. - * This limits the transaction sequence numbers that can be returned in - * the index of inodes by meta and data transaction numbers. We - * communicate the largest possible sequence number to clients via an - * rpc. - * - * The transaction sequence tracking is stored in a btree so it is - * shared across servers. Final entries are removed when processing a - * client's farewell or when it's removed. + * Remove all trans_seq items owned by the client rid, the caller holds + * the seq_rwsem. 
*/ -static int server_advance_seq(struct super_block *sb, - struct scoutfs_net_connection *conn, - u8 cmd, u64 id, void *arg, u16 arg_len) -{ - DECLARE_SERVER_INFO(sb, server); - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct scoutfs_super_block *super = &sbi->super; - __le64 their_seq; - __le64 next_seq; - u64 rid = scoutfs_net_client_rid(conn); - struct scoutfs_key key; - int ret; - - if (arg_len != sizeof(__le64)) { - ret = -EINVAL; - goto out; - } - memcpy(&their_seq, arg, sizeof(their_seq)); - - ret = scoutfs_server_hold_commit(sb); - if (ret) - goto out; - - down_write(&server->seq_rwsem); - - if (their_seq != 0) { - init_trans_seq_key(&key, le64_to_cpu(their_seq), rid); - ret = scoutfs_btree_delete(sb, &server->alloc, &server->wri, - &super->trans_seqs, &key); - if (ret < 0 && ret != -ENOENT) - goto unlock; - } - - next_seq = super->next_trans_seq; - le64_add_cpu(&super->next_trans_seq, 1); - - trace_scoutfs_trans_seq_advance(sb, rid, le64_to_cpu(their_seq), - le64_to_cpu(next_seq)); - - init_trans_seq_key(&key, le64_to_cpu(next_seq), rid); - ret = scoutfs_btree_insert(sb, &server->alloc, &server->wri, - &super->trans_seqs, &key, NULL, 0); -unlock: - up_write(&server->seq_rwsem); - ret = scoutfs_server_apply_commit(sb, ret); - -out: - return scoutfs_net_response(sb, conn, cmd, id, ret, - &next_seq, sizeof(next_seq)); -} - -/* - * Remove any transaction sequences owned by the client. They must have - * committed any final transaction by the time they get here via sending - * their farewell message. This can be called multiple times as the - * client's farewell is retransmitted so it's OK to not find any - * entries. This is called with the server commit rwsem held. 
- */ -static int remove_trans_seq(struct super_block *sb, u64 rid) +static int remove_trans_seq_locked(struct super_block *sb, u64 rid) { DECLARE_SERVER_INFO(sb, server); struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); @@ -730,8 +661,6 @@ static int remove_trans_seq(struct super_block *sb, u64 rid) struct scoutfs_key key; int ret = 0; - down_write(&server->seq_rwsem); - init_trans_seq_key(&key, 0, 0); for (;;) { @@ -746,17 +675,102 @@ static int remove_trans_seq(struct super_block *sb, u64 rid) scoutfs_btree_put_iref(&iref); if (le64_to_cpu(key.skts_rid) == rid) { - trace_scoutfs_trans_seq_farewell(sb, rid, + trace_scoutfs_trans_seq_remove(sb, rid, le64_to_cpu(key.skts_trans_seq)); ret = scoutfs_btree_delete(sb, &server->alloc, &server->wri, &super->trans_seqs, &key); - break; + if (ret < 0) + break; } scoutfs_key_inc(&key); } + return ret; +} + +/* + * Give the client the next sequence number for the transaction that + * they're opening. + * + * We track the sequence numbers of transactions that clients have open. + * This limits the transaction sequence numbers that can be returned in + * the index of inodes by meta and data transaction numbers. We + * communicate the largest possible sequence number to clients via an + * rpc. + * + * The transaction sequence tracking is stored in a btree so it is + * shared across servers. Final entries are removed when processing a + * client's farewell or when it's removed. We can be processing a + * resent request that was committed by a previous server before the + * reply was lost. At this point the client has no transactions open + * and may or may not have just finished one. To keep it simple we + * always remove any previous seq items, if there are any, and then + * insert a new item for the client at the next greatest seq.
+ */ +static int server_advance_seq(struct super_block *sb, + struct scoutfs_net_connection *conn, + u8 cmd, u64 id, void *arg, u16 arg_len) +{ + DECLARE_SERVER_INFO(sb, server); + struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); + struct scoutfs_super_block *super = &sbi->super; + u64 rid = scoutfs_net_client_rid(conn); + struct scoutfs_key key; + __le64 leseq = 0; + u64 seq; + int ret; + + if (arg_len != 0) { + ret = -EINVAL; + goto out; + } + + ret = scoutfs_server_hold_commit(sb); + if (ret) + goto out; + + down_write(&server->seq_rwsem); + + ret = remove_trans_seq_locked(sb, rid); + if (ret < 0) + goto unlock; + + seq = le64_to_cpu(super->next_trans_seq); + le64_add_cpu(&super->next_trans_seq, 1); + + trace_scoutfs_trans_seq_advance(sb, rid, seq); + + init_trans_seq_key(&key, seq, rid); + ret = scoutfs_btree_insert(sb, &server->alloc, &server->wri, + &super->trans_seqs, &key, NULL, 0); + if (ret == 0) + leseq = cpu_to_le64(seq); +unlock: + up_write(&server->seq_rwsem); + ret = scoutfs_server_apply_commit(sb, ret); + +out: + return scoutfs_net_response(sb, conn, cmd, id, ret, + &leseq, sizeof(leseq)); +} + +/* + * Remove any transaction sequences owned by the client who's sent a + * farewell. They must have committed any final transaction by the time + * they get here via sending their farewell message. This can be called + * multiple times as the client's farewell is retransmitted so it's OK + * to not find any entries. This is called with the server commit rwsem + * held. + */ +static int remove_trans_seq(struct super_block *sb, u64 rid) +{ + DECLARE_SERVER_INFO(sb, server); + int ret = 0; + + down_write(&server->seq_rwsem); + ret = remove_trans_seq_locked(sb, rid); up_write(&server->seq_rwsem); return ret;