Add nr_log_trees debugfs counter

Signed-off-by: Zach Brown <zab@versity.com>
Force merge creation timeout
2026-06-09 21:22:36 +00:00 · 2024-01-25 12:48:27 -08:00 · 2024-01-25 12:46:46 -08:00 · 2024-01-09 09:23:32 -08:00 · 2024-01-08 16:27:41 -08:00 · 2023-11-22 10:04:18 -08:00
21 changed files with 708 additions and 129 deletions
@@ -1,6 +1,18 @@
 Versity ScoutFS Release Notes
 =============================

+---
+v1.18
+\
+*Nov 7, 2023*
+
+Fixed a bug where background srch file compaction could stop making
+forward progress if a partial compaction operation was committed at a
+specific byte offset in a block.  This would cause srch file searches to
+become progressively more expensive over time.  Once this fix is running,
+background compaction will resume, bringing the cost of searches back
+down.
+
 ---
 v1.17
 \
@@ -68,6 +68,8 @@ struct forest_info {
 	struct delayed_work log_merge_dwork;

 	atomic64_t inode_count_delta;
+
+	struct dentry *dent;
 };

 #define DECLARE_FOREST_INFO(sb, name) \
@@ -750,6 +752,60 @@ resched:
 	queue_delayed_work(finf->workq, &finf->log_merge_dwork, delay);
 }

+/*
+ * Item callback that tallies each item it is handed.  arg points at the
+ * caller's u64 counter; the key, seq, flags and value are not examined.
+ * Always returns 0.
+ */
+static int count_log_trees(struct super_block *sb, struct scoutfs_key *key, u64 seq,
+			   u8 flags, void *val, int val_len, void *arg)
+{
+	u64 *nr = arg;
+
+	*nr += 1;
+
+	return 0;
+}
+
+/*
+ * debugfs read handler that counts the items in the logs_root btree.
+ * data is the super_block pointer.  A fresh copy of the super block is
+ * read and every item found under its logs_root is counted, with the
+ * total stored in *val.
+ *
+ * Returns 0 on success.  Any failure, including allocation failure, is
+ * reported to the reader as -EIO.
+ */
+static int debugfs_nr_log_trees_get(void *data, u64 *val)
+{
+	struct super_block *sb = data;
+	struct scoutfs_super_block *super;
+	struct scoutfs_key start;
+	struct scoutfs_key end;
+	struct scoutfs_key key;
+	u64 count = 0;
+	int ret;
+
+	super = kmalloc(sizeof(*super), GFP_NOFS);
+	if (!super) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = scoutfs_read_super(sb, super);
+	if (ret < 0)
+		goto out;
+
+	scoutfs_key_init_log_trees(&key, 0, 0);
+	for (;;) {
+		scoutfs_key_set_zeros(&start);
+		scoutfs_key_set_ones(&end);
+		ret = scoutfs_btree_read_items(sb, &super->logs_root, &key, &start, &end,
+					       count_log_trees, &count);
+		/*
+		 * Check for failure before looking at end.  end is set to
+		 * all ones just before the call, so testing it first could
+		 * mistake a failed read for having reached the last key and
+		 * report a partial count as success.
+		 */
+		if (ret < 0 && ret != -ENOENT)
+			goto out;
+		if (ret == -ENOENT || scoutfs_key_is_ones(&end))
+			break;
+
+		/* continue the walk just past the range that was covered */
+		key = end;
+		scoutfs_key_inc(&key);
+	}
+
+	*val = count;
+	ret = 0;
+out:
+	kfree(super);
+	return ret ? -EIO : 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(fops_nr_log_trees, debugfs_nr_log_trees_get, NULL, "%llu\n");
+
 int scoutfs_forest_setup(struct super_block *sb)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
@@ -770,6 +826,13 @@ int scoutfs_forest_setup(struct super_block *sb)
 			  scoutfs_forest_log_merge_worker);
 	sbi->forest_info = finf;

+	finf->dent = debugfs_create_file("nr_log_trees", S_IFREG|S_IRUSR, sbi->debug_root, sb,
+					 &fops_nr_log_trees);
+	if (IS_ERR(finf->dent)) {
+		ret = PTR_ERR(finf->dent);
+		goto out;
+	}
+
 	finf->workq = alloc_workqueue("scoutfs_log_merge", WQ_NON_REENTRANT |
 				      WQ_UNBOUND | WQ_HIGHPRI, 0);
 	if (!finf->workq) {
@@ -799,6 +862,8 @@ void scoutfs_forest_stop(struct super_block *sb)

 	if (finf && finf->workq) {
 		cancel_delayed_work_sync(&finf->log_merge_dwork);
+		if (!IS_ERR_OR_NULL(finf->dent))
+			debugfs_remove(finf->dent);
 		destroy_workqueue(finf->workq);
 	}
 }
@@ -439,6 +439,7 @@ DECLARE_EVENT_CLASS(scoutfs_trans_hold_release_class,
 		SCSB_TRACE_ASSIGN(sb);
 		__entry->journal_info = (unsigned long)journal_info;
 		__entry->holders = holders;
+		__entry->ret = ret;
 	),

 	TP_printk(SCSBF" journal_info 0x%0lx holders %d ret %d",
@@ -2799,6 +2800,81 @@ TRACE_EVENT(scoutfs_omap_should_delete,
 		  SCSB_TRACE_ARGS, __entry->ino, __entry->nlink, __entry->ret)
 );

+#define SSCF_FMT "[bo %llu bs %llu es %llu]"
+#define SSCF_FIELDS(pref)					\
+	__field(__u64, pref##_blkno)				\
+	__field(__u64, pref##_blocks)				\
+	__field(__u64, pref##_entries)
+#define SSCF_ASSIGN(pref, sfl)					\
+	__entry->pref##_blkno = le64_to_cpu((sfl)->ref.blkno);	\
+	__entry->pref##_blocks = le64_to_cpu((sfl)->blocks);	\
+	__entry->pref##_entries = le64_to_cpu((sfl)->entries);
+#define SSCF_ENTRY_ARGS(pref)					\
+	__entry->pref##_blkno,					\
+	__entry->pref##_blocks,					\
+	__entry->pref##_entries
+
+/*
+ * Records a srch compaction: its id, nr and flags, the output file, and
+ * the blk, pos and file of each of the four input slots.  All four
+ * slots are recorded regardless of nr.
+ */
+DECLARE_EVENT_CLASS(scoutfs_srch_compact_class,
+	TP_PROTO(struct super_block *sb, struct scoutfs_srch_compact *sc),
+
+	TP_ARGS(sb, sc),
+
+	TP_STRUCT__entry(
+		SCSB_TRACE_FIELDS
+		__field(__u64, id)
+		__field(__u8, nr)
+		__field(__u8, flags)
+		SSCF_FIELDS(out)
+		__field(__u64, in0_blk)
+		__field(__u64, in0_pos)
+		SSCF_FIELDS(in0)
+		__field(__u64, in1_blk)
+		__field(__u64, in1_pos)
+		SSCF_FIELDS(in1)
+		__field(__u64, in2_blk)
+		__field(__u64, in2_pos)
+		SSCF_FIELDS(in2)
+		__field(__u64, in3_blk)
+		__field(__u64, in3_pos)
+		SSCF_FIELDS(in3)
+	),
+
+	TP_fast_assign(
+		SCSB_TRACE_ASSIGN(sb);
+		__entry->id = le64_to_cpu(sc->id);
+		__entry->nr = sc->nr;
+		__entry->flags = sc->flags;
+		SSCF_ASSIGN(out, &sc->out)
+		/* each slot's blk and pos come from that same slot's input */
+		__entry->in0_blk = le64_to_cpu(sc->in[0].blk);
+		__entry->in0_pos = le64_to_cpu(sc->in[0].pos);
+		SSCF_ASSIGN(in0, &sc->in[0].sfl)
+		__entry->in1_blk = le64_to_cpu(sc->in[1].blk);
+		__entry->in1_pos = le64_to_cpu(sc->in[1].pos);
+		SSCF_ASSIGN(in1, &sc->in[1].sfl)
+		__entry->in2_blk = le64_to_cpu(sc->in[2].blk);
+		__entry->in2_pos = le64_to_cpu(sc->in[2].pos);
+		SSCF_ASSIGN(in2, &sc->in[2].sfl)
+		__entry->in3_blk = le64_to_cpu(sc->in[3].blk);
+		__entry->in3_pos = le64_to_cpu(sc->in[3].pos);
+		SSCF_ASSIGN(in3, &sc->in[3].sfl)
+	),
+
+	TP_printk(SCSBF" id %llu nr %u flags 0x%x out "SSCF_FMT" in0 b %llu p %llu "SSCF_FMT" in1 b %llu p %llu "SSCF_FMT" in2 b %llu p %llu "SSCF_FMT" in3 b %llu p %llu "SSCF_FMT,
+		  SCSB_TRACE_ARGS, __entry->id, __entry->nr, __entry->flags, SSCF_ENTRY_ARGS(out),
+		  __entry->in0_blk, __entry->in0_pos, SSCF_ENTRY_ARGS(in0),
+		  __entry->in1_blk, __entry->in1_pos, SSCF_ENTRY_ARGS(in1),
+		  __entry->in2_blk, __entry->in2_pos, SSCF_ENTRY_ARGS(in2),
+		  __entry->in3_blk, __entry->in3_pos, SSCF_ENTRY_ARGS(in3))
+);
+DEFINE_EVENT(scoutfs_srch_compact_class, scoutfs_srch_compact_client_send,
+	TP_PROTO(struct super_block *sb, struct scoutfs_srch_compact *sc),
+	TP_ARGS(sb, sc)
+);
+DEFINE_EVENT(scoutfs_srch_compact_class, scoutfs_srch_compact_client_recv,
+	TP_PROTO(struct super_block *sb, struct scoutfs_srch_compact *sc),
+	TP_ARGS(sb, sc)
+);
+
 #endif /* _TRACE_SCOUTFS_H */

 /* This part must be outside protection */
@@ -91,6 +91,7 @@ do {												\
 struct server_info {
 	struct super_block *sb;
 	spinlock_t lock;
+	seqlock_t seqlock;
 	wait_queue_head_t waitq;

 	struct workqueue_struct *wq;
@@ -132,11 +133,9 @@ struct server_info {
 	struct mutex mounted_clients_mutex;

 	/* stable super stored from commits, given in locks and rpcs */
-	seqcount_t stable_seqcount;
 	struct scoutfs_super_block stable_super;

 	/* serializing and get and set volume options */
-	seqcount_t volopt_seqcount;
 	struct mutex volopt_mutex;
 	struct scoutfs_volume_options volopt;

@@ -182,7 +181,7 @@ static bool get_volopt_val(struct server_info *server, int nr, u64 *val)
 	unsigned seq;

 	do {
-		seq = read_seqcount_begin(&server->volopt_seqcount);
+		seq = read_seqbegin(&server->seqlock);
 		if ((le64_to_cpu(server->volopt.set_bits) & bit)) {
 			is_set = true;
 			*val = le64_to_cpup(opt);
@@ -190,7 +189,7 @@ static bool get_volopt_val(struct server_info *server, int nr, u64 *val)
 			is_set = false;
 			*val = 0;
 		};
-	} while (read_seqcount_retry(&server->volopt_seqcount, seq));
+	} while (read_seqretry(&server->seqlock, seq));

 	return is_set;
 }
@@ -506,7 +505,7 @@ static void get_stable(struct super_block *sb, struct scoutfs_super_block *super
 	unsigned int seq;

 	do {
-		seq = read_seqcount_begin(&server->stable_seqcount);
+		seq = read_seqbegin(&server->seqlock);
 		if (super)
 			*super = server->stable_super;
 		if (roots) {
@@ -514,7 +513,7 @@ static void get_stable(struct super_block *sb, struct scoutfs_super_block *super
 			roots->logs_root = server->stable_super.logs_root;
 			roots->srch_root = server->stable_super.srch_root;
 		}
-	} while (read_seqcount_retry(&server->stable_seqcount, seq));
+	} while (read_seqretry(&server->seqlock, seq));
 }

 u64 scoutfs_server_seq(struct super_block *sb)
@@ -548,11 +547,9 @@ void scoutfs_server_set_seq_if_greater(struct super_block *sb, u64 seq)

 static void set_stable_super(struct server_info *server, struct scoutfs_super_block *super)
 {
-	preempt_disable();
-	write_seqcount_begin(&server->stable_seqcount);
+	write_seqlock(&server->seqlock);
 	server->stable_super = *super;
-	write_seqcount_end(&server->stable_seqcount);
-	preempt_enable();
+	write_sequnlock(&server->seqlock);
 }

 /*
@@ -1203,7 +1200,7 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
 			mutex_lock(&server->logs_mutex);

 			/* done if we timed out */
-			if (time_after(jiffies, timeo)) {
+			if (1 || time_after(jiffies, timeo)) {
 				ret = 0;
 				break;
 			}
@@ -1969,9 +1966,7 @@ static int server_srch_get_compact(struct super_block *sb,
 	ret = scoutfs_srch_get_compact(sb, &server->alloc, &server->wri,
 				       &super->srch_root, rid, sc);
 	mutex_unlock(&server->srch_mutex);
-	if (ret == 0 && sc->nr == 0)
-		ret = -ENOENT;
-	if (ret < 0)
+	if (ret < 0 || (ret == 0 && sc->nr == 0))
 		goto apply;

 	mutex_lock(&server->alloc_mutex);
@@ -3073,9 +3068,9 @@ static int server_get_volopt(struct super_block *sb, struct scoutfs_net_connecti
 	}

 	do {
-		seq = read_seqcount_begin(&server->volopt_seqcount);
+		seq = read_seqbegin(&server->seqlock);
 		volopt = server->volopt;
-	} while (read_seqcount_retry(&server->volopt_seqcount, seq));
+	} while (read_seqretry(&server->seqlock, seq));

 out:
 	return scoutfs_net_response(sb, conn, cmd, id, ret, &volopt, sizeof(volopt));
@@ -3144,12 +3139,12 @@ static int server_set_volopt(struct super_block *sb, struct scoutfs_net_connecti
 apply:
 	ret = server_apply_commit(sb, &hold, ret);

-	write_seqcount_begin(&server->volopt_seqcount);
+	write_seqlock(&server->seqlock);
 	if (ret == 0)
 		server->volopt = super->volopt;
 	else
 		super->volopt = server->volopt;
-	write_seqcount_end(&server->volopt_seqcount);
+	write_sequnlock(&server->seqlock);

 	mutex_unlock(&server->volopt_mutex);
 out:
@@ -3192,12 +3187,12 @@ static int server_clear_volopt(struct super_block *sb, struct scoutfs_net_connec

 	ret = server_apply_commit(sb, &hold, ret);

-	write_seqcount_begin(&server->volopt_seqcount);
+	write_seqlock(&server->seqlock);
 	if (ret == 0)
 		server->volopt = super->volopt;
 	else
 		super->volopt = server->volopt;
-	write_seqcount_end(&server->volopt_seqcount);
+	write_sequnlock(&server->seqlock);

 	mutex_unlock(&server->volopt_mutex);
 out:
@@ -4336,9 +4331,9 @@ static void scoutfs_server_worker(struct work_struct *work)
 	}

 	/* update volume options early, possibly for use during startup */
-	write_seqcount_begin(&server->volopt_seqcount);
+	write_seqlock(&server->seqlock);
 	server->volopt = super->volopt;
-	write_seqcount_end(&server->volopt_seqcount);
+	write_sequnlock(&server->seqlock);

 	atomic64_set(&server->seq_atomic, le64_to_cpu(super->seq));
 	set_stable_super(server, super);
@@ -4478,6 +4473,7 @@ int scoutfs_server_setup(struct super_block *sb)

 	server->sb = sb;
 	spin_lock_init(&server->lock);
+	seqlock_init(&server->seqlock);
 	init_waitqueue_head(&server->waitq);
 	INIT_WORK(&server->work, scoutfs_server_worker);
 	server->status = SERVER_DOWN;
@@ -4492,8 +4488,6 @@ int scoutfs_server_setup(struct super_block *sb)
 	INIT_WORK(&server->log_merge_free_work, server_log_merge_free_work);
 	mutex_init(&server->srch_mutex);
 	mutex_init(&server->mounted_clients_mutex);
-	seqcount_init(&server->stable_seqcount);
-	seqcount_init(&server->volopt_seqcount);
 	mutex_init(&server->volopt_mutex);
 	INIT_WORK(&server->fence_pending_recov_work, fence_pending_recov_worker);
 	INIT_DELAYED_WORK(&server->reclaim_dwork, reclaim_worker);
@@ -30,6 +30,9 @@
 #include "client.h"
 #include "counters.h"
 #include "scoutfs_trace.h"
+#include "triggers.h"
+#include "sysfs.h"
+#include "msg.h"

 /*
 * This srch subsystem gives us a way to find inodes that have a given
@@ -68,10 +71,14 @@ struct srch_info {
 	atomic_t shutdown;
 	struct workqueue_struct *workq;
 	struct delayed_work compact_dwork;
+	struct scoutfs_sysfs_attrs ssa;
+	atomic_t compact_delay_ms;
 };

 #define DECLARE_SRCH_INFO(sb, name) \
 	struct srch_info *name = SCOUTFS_SB(sb)->srch_info
+#define DECLARE_SRCH_INFO_KOBJ(kobj, name) \
+	DECLARE_SRCH_INFO(SCOUTFS_SYSFS_ATTRS_SB(kobj), name)

 #define SRE_FMT "%016llx.%llu.%llu"
 #define SRE_ARG(sre)						\
@@ -520,6 +527,95 @@ out:
 	return ret;
 }

+/*
+ * Append one encoded entry to the end of the block and account for it
+ * in the block and file entry counts.  Padded entries are appended in
+ * matching pairs after an existing entry.  Every reader cancels a pair
+ * out (the second encoding looks like a deletion of the first), so the
+ * pairs are never visible in the first/last bounds of the block or
+ * file.
+ *
+ * Returns 0 once the entry has been appended.  Otherwise the
+ * non-positive return of encode_entry() is passed back and neither the
+ * block nor the file counts are updated.
+ */
+static int append_padded_entry(struct scoutfs_srch_file *sfl, u64 blk,
+			       struct scoutfs_srch_block *srb, struct scoutfs_srch_entry *sre)
+{
+	int bytes;
+
+	bytes = encode_entry(srb->entries + le32_to_cpu(srb->entry_bytes),
+			     sre, &srb->tail);
+	if (bytes <= 0)
+		return bytes;
+
+	srb->tail = *sre;
+	le32_add_cpu(&srb->entry_nr, 1);
+	le32_add_cpu(&srb->entry_bytes, bytes);
+	le64_add_cpu(&sfl->entries, 1);
+
+	return 0;
+}
+
+/*
+ * Testing-trigger helper that builds a very specific layout of encoded
+ * entries: the last entry in the block must start exactly at the
+ * SCOUTFS_SRCH_BLOCK_SAFE_BYTES offset.
+ *
+ * The block holds a single existing entry when this is called, so the
+ * rest of the block is ours to fill.  We fill it with pairs of matching
+ * entries.  Readers (searches and merges alike) interpret each pair as
+ * a creation followed by a deletion and drop both.  The pairs reuse the
+ * hash of the block's existing entry but set the inode to an impossibly
+ * large number so that they don't interfere with anything.
+ *
+ * Landing on the exact offset depends on controlling how many bytes
+ * each pair encodes to.  Changing all the bytes of the ino and id
+ * between entries gives a 20 byte pair: 2+8+8 for the first entry and 2
+ * for its matching second entry.  For the last entry to start at the
+ * safe offset the final pair has to end 2 bytes past it, because those
+ * 2 bytes are the encoding of the pair's second entry.
+ *
+ * So before each pair we look at how far the current offset is from
+ * that target of 2 past the safe offset.  If the distance is a multiple
+ * of 20 we encode a full 20 byte pair.  If not, we drop a byte and
+ * encode 19, which slowly brings the distance back to a multiple of 20
+ * while still encoding large entries.
+ */
+static void pad_entries_at_safe(struct scoutfs_srch_file *sfl, u64 blk,
+				struct scoutfs_srch_block *srb)
+{
+	const u32 target = SCOUTFS_SRCH_BLOCK_SAFE_BYTES + 2;
+	struct scoutfs_srch_entry sre;
+	u64 hash = le64_to_cpu(srb->tail.hash);
+	u64 ino = le64_to_cpu(srb->tail.ino) | (1ULL << 62);
+	u64 id = le64_to_cpu(srb->tail.id);
+	s32 remaining;
+	int ret;
+
+	for (;;) {
+		remaining = target - le32_to_cpu(srb->entry_bytes);
+		if (remaining <= 0)
+			break;
+
+		/* top ino byte always flips, the id byte picks a 20 or 19 byte pair */
+		ino ^= 1ULL << (7 * 8);
+		id ^= 1ULL << (((remaining % 20) == 0 ? 7 : 6) * 8);
+
+		sre.hash = cpu_to_le64(hash);
+		sre.ino = cpu_to_le64(ino);
+		sre.id = cpu_to_le64(id);
+
+		/* the same entry twice: creation then apparent deletion */
+		ret = append_padded_entry(sfl, blk, srb, &sre);
+		if (ret == 0)
+			ret = append_padded_entry(sfl, blk, srb, &sre);
+		BUG_ON(ret != 0);
+	}
+}
+
 /*
 * The caller is dropping an ino/id because the tracking rbtree is full.
 * This loses information so we can't return any entries at or after the
@@ -987,6 +1083,9 @@ int scoutfs_srch_rotate_log(struct super_block *sb,
 	struct scoutfs_key key;
 	int ret;

+	if (sfl->ref.blkno && !force && scoutfs_trigger(sb, SRCH_FORCE_LOG_ROTATE))
+		force = true;
+
 	if (sfl->ref.blkno == 0 ||
 	    (!force && le64_to_cpu(sfl->blocks) < SCOUTFS_SRCH_LOG_BLOCK_LIMIT))
 		return 0;
@@ -1462,7 +1561,7 @@ static int kway_merge(struct super_block *sb,
 		      struct scoutfs_block_writer *wri,
 		      struct scoutfs_srch_file *sfl,
 		      kway_get_t kway_get, kway_advance_t kway_adv,
-		      void **args, int nr)
+		      void **args, int nr, bool logs_input)
 {
 	DECLARE_SRCH_INFO(sb, srinf);
 	struct scoutfs_srch_block *srb = NULL;
@@ -1567,6 +1666,15 @@ static int kway_merge(struct super_block *sb,
 				blk++;
 			}

+			/* end sorted block on _SAFE offset for testing */
+			if (bl && le32_to_cpu(srb->entry_nr) == 1 && logs_input &&
+			    scoutfs_trigger(sb, SRCH_COMPACT_LOGS_PAD_SAFE)) {
+				pad_entries_at_safe(sfl, blk, srb);
+				scoutfs_block_put(sb, bl);
+				bl = NULL;
+				blk++;
+			}
+
 			scoutfs_inc_counter(sb, srch_compact_entry);

 		} else {
@@ -1609,6 +1717,8 @@ static int kway_merge(struct super_block *sb,
 			empty++;
 			ret = 0;
 		} else if (ret < 0) {
+			if (ret == -ENOANO) /* just testing trigger */
+				ret = 0;
 			goto out;
 		}

@@ -1816,7 +1926,7 @@ static int compact_logs(struct super_block *sb,
 	}

 	ret = kway_merge(sb, alloc, wri, &sc->out, kway_get_page, kway_adv_page,
-			 args, nr_pages);
+			 args, nr_pages, true);
 	if (ret < 0)
 		goto out;

@@ -1874,12 +1984,18 @@ static int kway_get_reader(struct super_block *sb,
 	srb = rdr->bl->data;

 	if (rdr->pos > SCOUTFS_SRCH_BLOCK_SAFE_BYTES ||
-	    rdr->skip >= SCOUTFS_SRCH_BLOCK_SAFE_BYTES ||
+	    rdr->skip > SCOUTFS_SRCH_BLOCK_SAFE_BYTES ||
 	    rdr->skip >= le32_to_cpu(srb->entry_bytes)) {
 		/* XXX inconsistency */
 		return -EIO;
 	}

+	if (rdr->decoded_bytes == 0 && rdr->pos == SCOUTFS_SRCH_BLOCK_SAFE_BYTES &&
+	    scoutfs_trigger(sb, SRCH_MERGE_STOP_SAFE)) {
+		/* only used in testing */
+		return -ENOANO;
+	}
+
 	/* decode entry, possibly skipping start of the block */
 	while (rdr->decoded_bytes == 0 || rdr->pos < rdr->skip) {
 		ret = decode_entry(srb->entries + rdr->pos,
@@ -1969,7 +2085,7 @@ static int compact_sorted(struct super_block *sb,
 	}

 	ret = kway_merge(sb, alloc, wri, &sc->out, kway_get_reader,
-			 kway_adv_reader, args, nr);
+			 kway_adv_reader, args, nr, false);

 	sc->flags |= SCOUTFS_SRCH_COMPACT_FLAG_DONE;
 	for (i = 0; i < nr; i++) {
@@ -2098,8 +2214,15 @@ static int delete_files(struct super_block *sb, struct scoutfs_alloc *alloc,
 	return ret;
 }

-/* wait 10s between compact attempts on error, immediate after success */
-#define SRCH_COMPACT_DELAY_MS (10 * MSEC_PER_SEC)
+/*
+ * Queue the compaction work unless shutdown has been set.  immediate
+ * queues it with no delay, otherwise the work is delayed by the current
+ * compact_delay_ms value.
+ */
+static void queue_compact_work(struct srch_info *srinf, bool immediate)
+{
+	unsigned long delay = 0;
+
+	if (atomic_read(&srinf->shutdown))
+		return;
+
+	if (!immediate)
+		delay = msecs_to_jiffies(atomic_read(&srinf->compact_delay_ms));
+
+	queue_delayed_work(srinf->workq, &srinf->compact_dwork, delay);
+}

 /*
 * Get a compaction operation from the server, sort the entries from the
@@ -2127,7 +2250,6 @@ static void scoutfs_srch_compact_worker(struct work_struct *work)
 	struct super_block *sb = srinf->sb;
 	struct scoutfs_block_writer wri;
 	struct scoutfs_alloc alloc;
-	unsigned long delay;
 	int ret;
 	int err;

@@ -2140,6 +2262,8 @@ static void scoutfs_srch_compact_worker(struct work_struct *work)
 	scoutfs_block_writer_init(sb, &wri);

 	ret = scoutfs_client_srch_get_compact(sb, sc);
+	if (ret >= 0)
+		trace_scoutfs_srch_compact_client_recv(sb, sc);
 	if (ret < 0 || sc->nr == 0)
 		goto out;

@@ -2168,6 +2292,7 @@ commit:
 	sc->meta_freed = alloc.freed;
 	sc->flags |= ret < 0 ? SCOUTFS_SRCH_COMPACT_FLAG_ERROR : 0;

+	trace_scoutfs_srch_compact_client_send(sb, sc);
 	err = scoutfs_client_srch_commit_compact(sb, sc);
 	if (err < 0 && ret == 0)
 		ret = err;
@@ -2178,14 +2303,56 @@ out:
 		scoutfs_inc_counter(sb, srch_compact_error);

 	scoutfs_block_writer_forget_all(sb, &wri);
-	if (!atomic_read(&srinf->shutdown)) {
-		delay = ret == 0 ? 0 : msecs_to_jiffies(SRCH_COMPACT_DELAY_MS);
-		queue_delayed_work(srinf->workq, &srinf->compact_dwork, delay);
-	}
+	queue_compact_work(srinf, sc->nr > 0 && ret == 0);

 	kfree(sc);
 }

+/* sysfs show method: format the current compact_delay_ms value into buf */
+static ssize_t compact_delay_ms_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+	DECLARE_SRCH_INFO_KOBJ(kobj, srinf);
+	int delay_ms = atomic_read(&srinf->compact_delay_ms);
+
+	return snprintf(buf, PAGE_SIZE, "%u", delay_ms);
+}
+
+#define MIN_COMPACT_DELAY_MS MSEC_PER_SEC
+#define DEF_COMPACT_DELAY_MS (10 * MSEC_PER_SEC)
+#define MAX_COMPACT_DELAY_MS (60 * MSEC_PER_SEC)
+
+/*
+ * sysfs store method: set the delay used between compaction attempts.
+ * The written text is copied into a bounded, null terminated buffer and
+ * parsed as an unsigned integer whose base is chosen by its prefix.  The
+ * value must fall within MIN_COMPACT_DELAY_MS and MAX_COMPACT_DELAY_MS
+ * inclusive, otherwise an error is logged and -EINVAL is returned.
+ *
+ * On success the delay is stored, the delayed compaction work is
+ * cancelled and requeued with the new delay, and count is returned.
+ */
+static ssize_t compact_delay_ms_store(struct kobject *kobj, struct kobj_attribute *attr,
+				      const char *buf, size_t count)
+{
+	struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
+	DECLARE_SRCH_INFO(sb, srinf);
+	char nullterm[30]; /* more than enough for octal -U64_MAX */
+	size_t len;
+	u64 val;
+	int ret;
+
+	len = min(count, sizeof(nullterm) - 1);
+	memcpy(nullterm, buf, len);
+	nullterm[len] = '\0';
+
+	/* val is a u64, parse it with the helper that takes a u64 pointer */
+	ret = kstrtou64(nullterm, 0, &val);
+	if (ret < 0 || val < MIN_COMPACT_DELAY_MS || val > MAX_COMPACT_DELAY_MS) {
+		scoutfs_err(sb, "invalid compact_delay_ms value, must be between %lu and %lu",
+			    MIN_COMPACT_DELAY_MS, MAX_COMPACT_DELAY_MS);
+		return -EINVAL;
+	}
+
+	atomic_set(&srinf->compact_delay_ms, val);
+	cancel_delayed_work(&srinf->compact_dwork);
+	queue_compact_work(srinf, false);
+
+	return count;
+}
+SCOUTFS_ATTR_RW(compact_delay_ms);
+
+static struct attribute *srch_attrs[] = {
+	SCOUTFS_ATTR_PTR(compact_delay_ms),
+	NULL,
+};
+
 void scoutfs_srch_destroy(struct super_block *sb)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
@@ -2202,6 +2369,8 @@ void scoutfs_srch_destroy(struct super_block *sb)
 		destroy_workqueue(srinf->workq);
 	}

+	scoutfs_sysfs_destroy_attrs(sb, &srinf->ssa);
+
 	kfree(srinf);
 	sbi->srch_info = NULL;
 }
@@ -2219,8 +2388,15 @@ int scoutfs_srch_setup(struct super_block *sb)
 	srinf->sb = sb;
 	atomic_set(&srinf->shutdown, 0);
 	INIT_DELAYED_WORK(&srinf->compact_dwork, scoutfs_srch_compact_worker);
+	scoutfs_sysfs_init_attrs(sb, &srinf->ssa);
+	atomic_set(&srinf->compact_delay_ms, DEF_COMPACT_DELAY_MS);
+
 	sbi->srch_info = srinf;

+	ret = scoutfs_sysfs_create_attrs(sb, &srinf->ssa, srch_attrs, "srch");
+	if (ret < 0)
+		goto out;
+
 	srinf->workq = alloc_workqueue("scoutfs_srch_compact",
 				       WQ_NON_REENTRANT | WQ_UNBOUND |
 				       WQ_HIGHPRI, 0);
@@ -2229,8 +2405,7 @@ int scoutfs_srch_setup(struct super_block *sb)
 		goto out;
 	}

-	queue_delayed_work(srinf->workq, &srinf->compact_dwork,
-			   msecs_to_jiffies(SRCH_COMPACT_DELAY_MS));
+	queue_compact_work(srinf, false);

 	ret = 0;
 out:
@@ -39,6 +39,9 @@ struct scoutfs_triggers {

 static char *names[] = {
 	[SCOUTFS_TRIGGER_BLOCK_REMOVE_STALE] = "block_remove_stale",
+	[SCOUTFS_TRIGGER_SRCH_COMPACT_LOGS_PAD_SAFE] = "srch_compact_logs_pad_safe",
+	[SCOUTFS_TRIGGER_SRCH_FORCE_LOG_ROTATE] = "srch_force_log_rotate",
+	[SCOUTFS_TRIGGER_SRCH_MERGE_STOP_SAFE] = "srch_merge_stop_safe",
 	[SCOUTFS_TRIGGER_STATFS_LOCK_PURGE] = "statfs_lock_purge",
 };

@@ -3,6 +3,9 @@

 enum scoutfs_trigger {
 	SCOUTFS_TRIGGER_BLOCK_REMOVE_STALE,
+	SCOUTFS_TRIGGER_SRCH_COMPACT_LOGS_PAD_SAFE,
+	SCOUTFS_TRIGGER_SRCH_FORCE_LOG_ROTATE,
+	SCOUTFS_TRIGGER_SRCH_MERGE_STOP_SAFE,
 	SCOUTFS_TRIGGER_STATFS_LOCK_PURGE,
 	SCOUTFS_TRIGGER_NR,
 };
@@ -25,8 +25,9 @@ All options can be seen by running with -h.
 This script is built to test multi-node systems on one host by using
 different mounts of the same devices.  The script creates a fake block
 device in front of each fs block device for each mount that will be
-tested.  Currently it will create free loop devices and will mount on
-/mnt/test.[0-9].
+tested.  It will create predictable device mapper devices and mount
+them on /mnt/test.N.  These static device names and mount paths limit
+the script to a single execution per host.

 All tests will be run by default.  Particular tests can be included or
 excluded by providing test name regular expressions with the -I and -E
@@ -104,8 +105,8 @@ used during the test.

 | Variable         | Description          | Origin          | Example           |
 | ---------------- | -------------------  | --------------- | ----------------- |
-| T\_MB[0-9]       | per-mount meta bdev  | created per run | /dev/loop0        |
-| T\_DB[0-9]       | per-mount data bdev  | created per run | /dev/loop1        |
+| T\_MB[0-9]       | per-mount meta bdev  | created per run | /dev/mapper/\_scoutfs\_test\_meta\_[0-9]        |
+| T\_DB[0-9]       | per-mount data bdev  | created per run | /dev/mapper/\_scoutfs\_test\_data\_[0-9]        |
 | T\_D[0-9]        | per-mount test dir   | made for test   | /mnt/test.[0-9]/t |
 | T\_META\_DEVICE  | main FS meta bdev    | -M              | /dev/vda          |
 | T\_DATA\_DEVICE  | main FS data bdev    | -D              | /dev/vdb          |
@@ -6,6 +6,61 @@ t_filter_fs()
 	    -e 's@Device: [a-fA-F0-9]*h/[0-9]*d@Device: 0h/0d@g'
 }

+#
+# We can hit a spurious kasan warning that was fixed upstream:
+#
+#  e504e74cc3a2 x86/unwind/orc: Disable KASAN checking in the ORC unwinder, part 2
+#
+# KASAN can get mad when the unwinder doesn't find ORC metadata and
+# wanders up without using frames and hits the KASAN stack red zones.
+# We can ignore these messages.
+#
+# They're bracketed by:
+# [ 2687.690127] ==================================================================
+# [ 2687.691366] BUG: KASAN: stack-out-of-bounds in get_reg+0x1bc/0x230
+# ...
+# [ 2687.706220] ==================================================================
+# [ 2687.707284] Disabling lock debugging due to kernel taint
+#
+# That final lock debugging message may not be included.
+#
+ignore_harmless_unwind_kasan_stack_oob()
+{
+	awk '
+	# Drop KASAN stack-out-of-bounds reports raised in get_reg.
+	#
+	# state 0: passing lines through
+	# state 1: holding a separator line, waiting for the line after it
+	# state 2: inside a matching report, dropping until the next separator
+	# state 3: past the closing separator, still dropping any lock
+	#          debugging message that follows it
+	BEGIN {
+		sep = "=================================================================="
+		state = 0
+		mark = 0
+	}
+	{
+		if (!state && $0 ~ sep) {
+			state = 1
+			mark = NR
+			held = $0
+		}
+		if (state == 1 && NR == mark + 1) {
+			if ($0 ~ /KASAN: stack-out-of-bounds in get_reg/) {
+				state = 2
+			} else {
+				state = 0
+				print held
+			}
+			held = ""
+		}
+		if (state == 2 && $0 ~ sep) {
+			state = 3
+			mark = NR
+		}
+		if (state == 3 && NR > mark && $0 !~ /Disabling lock debugging/) {
+			state = 0
+		}
+		if (!state) {
+			print $0
+		}
+	}
+	END {
+		if (held) {
+			print held
+		}
+	}
+	'
+}
+
 #
 # Filter out expected messages.  Putting messages here implies that
 # tests aren't relying on messages to discover failures.. they're
@@ -86,10 +141,12 @@ t_filter_dmesg()
 	re="$re|scoutfs .* critical transaction commit failure.*"

 	# change-devices causes loop device resizing
+	re="$re|loop: module loaded"
 	re="$re|loop[0-9].* detected capacity change from.*"

 	# ignore systemd-journal rotating
 	re="$re|systemd-journald.*"

-	egrep -v "($re)" 
+	egrep -v "($re)" | \
+		ignore_harmless_unwind_kasan_stack_oob
 }
@@ -265,6 +265,15 @@ t_trigger_get() {
 	cat "$(t_trigger_path "$nr")/$which"
 }

+# Write val to the trigger file named which under the trigger path for nr.
+t_trigger_set() {
+	local which="$1"
+	local nr="$2"
+	local val="$3"
+
+	echo "$val" > "$(t_trigger_path "$nr")/$which"
+}
+
 t_trigger_show() {
 	local which="$1"
 	local string="$2"
@@ -276,9 +285,8 @@ t_trigger_show() {
 t_trigger_arm_silent() {
 	local which="$1"
 	local nr="$2"
-	local path=$(t_trigger_path "$nr")

-	echo 1 > "$path/$which"
+	t_trigger_set "$which" "$nr" 1
 }

 t_trigger_arm() {
@@ -1,3 +1,4 @@
 == measure initial createmany
 == measure initial createmany
 == measure two concurrent createmany runs
+== cleanup
@@ -1,3 +1,4 @@
+== setting longer hung task timeout
 == creating fragmented extents
 == unlink file with moved extents to free extents per block
 == cleanup
@@ -0,0 +1,37 @@
+== initialize per-mount values
+== arm compaction triggers
+trigger srch_compact_logs_pad_safe armed: 1
+trigger srch_merge_stop_safe armed: 1
+trigger srch_compact_logs_pad_safe armed: 1
+trigger srch_merge_stop_safe armed: 1
+trigger srch_compact_logs_pad_safe armed: 1
+trigger srch_merge_stop_safe armed: 1
+trigger srch_compact_logs_pad_safe armed: 1
+trigger srch_merge_stop_safe armed: 1
+trigger srch_compact_logs_pad_safe armed: 1
+trigger srch_merge_stop_safe armed: 1
+== compact more often
+== create padded sorted inputs by forcing log rotation
+trigger srch_force_log_rotate armed: 1
+trigger srch_force_log_rotate armed: 1
+trigger srch_force_log_rotate armed: 1
+trigger srch_force_log_rotate armed: 1
+trigger srch_compact_logs_pad_safe armed: 1
+trigger srch_force_log_rotate armed: 1
+trigger srch_force_log_rotate armed: 1
+trigger srch_force_log_rotate armed: 1
+trigger srch_force_log_rotate armed: 1
+trigger srch_compact_logs_pad_safe armed: 1
+trigger srch_force_log_rotate armed: 1
+trigger srch_force_log_rotate armed: 1
+trigger srch_force_log_rotate armed: 1
+trigger srch_force_log_rotate armed: 1
+trigger srch_compact_logs_pad_safe armed: 1
+trigger srch_force_log_rotate armed: 1
+trigger srch_force_log_rotate armed: 1
+trigger srch_force_log_rotate armed: 1
+trigger srch_force_log_rotate armed: 1
+trigger srch_compact_logs_pad_safe armed: 1
+== compaction of padded should stop at safe
+== verify no compaction errors
+== cleanup
@@ -326,16 +326,10 @@ unmount_all() {
 		cmd wait $p
 	done

-	# delete all temp meta devices
-	for dev in $(losetup --associated "$T_META_DEVICE" | cut -d : -f 1); do
-		if [ -e "$dev" ]; then
-			cmd losetup -d "$dev"
-		fi
-	done
-	# delete all temp data devices
-	for dev in $(losetup --associated "$T_DATA_DEVICE" | cut -d : -f 1); do
-		if [ -e "$dev" ]; then
-			cmd losetup -d "$dev"
+	# delete all temp devices
+	for dev in /dev/mapper/_scoutfs_test_*; do
+		if [ -b "$dev" ]; then
+			cmd dmsetup remove $dev
 		fi
 	done
 }
@@ -434,6 +428,12 @@ $T_UTILS/fenced/scoutfs-fenced > "$T_FENCED_LOG" 2>&1 &
 fenced_pid=$!
 fenced_log "started fenced pid $fenced_pid in the background"

+# setup dm tables
+echo "0 $(blockdev --getsz $T_META_DEVICE) linear $T_META_DEVICE 0" > \
+	$T_RESULTS/dmtable.meta
+echo "0 $(blockdev --getsz $T_DATA_DEVICE) linear $T_DATA_DEVICE 0" > \
+	$T_RESULTS/dmtable.data
+
 #
 # mount concurrently so that a quorum is present to elect the leader and
 # start a server.
@@ -442,10 +442,13 @@ msg "mounting $T_NR_MOUNTS mounts on meta $T_META_DEVICE data $T_DATA_DEVICE"
 pids=""
 for i in $(seq 0 $((T_NR_MOUNTS - 1))); do

-	meta_dev=$(losetup --find --show $T_META_DEVICE)
-	test -b "$meta_dev" || die "failed to create temp device $meta_dev"
-	data_dev=$(losetup --find --show $T_DATA_DEVICE)
-	test -b "$data_dev" || die "failed to create temp device $data_dev"
+	name="_scoutfs_test_meta_$i"
+	cmd dmsetup create "$name" --table "$(cat $T_RESULTS/dmtable.meta)"
+	meta_dev="/dev/mapper/$name"
+
+	name="_scoutfs_test_data_$i"
+	cmd dmsetup create "$name" --table "$(cat $T_RESULTS/dmtable.data)"
+	data_dev="/dev/mapper/$name"

 	dir="/mnt/test.$i"
 	test -d "$dir" || cmd mkdir -p "$dir"
@@ -14,6 +14,7 @@ offline-extent-waiting.sh
 move-blocks.sh
 large-fragmented-free.sh
 enospc.sh
+srch-safe-merge-pos.sh
 srch-basic-functionality.sh
 simple-xattr-unit.sh
 totl-xattr-tag.sh
@@ -1,6 +1,7 @@
 #include <unistd.h>
 #include <stdlib.h>
 #include <stdio.h>
+#include <stdarg.h>
 #include <errno.h>
 #include <string.h>
 #include <sys/stat.h>
@@ -35,10 +36,10 @@ struct opts {
 	unsigned int dry_run:1,
 		     ls_output:1,
 		     quiet:1,
-		     user_xattr:1,
-		     same_srch_xattr:1,
-		     group_srch_xattr:1,
-		     unique_srch_xattr:1;
+		     xattr_set:1,
+		     xattr_file:1,
+		     xattr_group:1;
+	char *xattr_name;
 };

 struct stats {
@@ -149,12 +150,31 @@ static void free_dir(struct dir *dir)
 	free(dir);
 }

+static size_t snprintf_off(void *buf, size_t sz, size_t off, char *fmt, ...)
+{
+	va_list ap;
+	int ret;
+
+	if (off >= sz)
+		return sz;
+
+	va_start(ap, fmt);
+	ret = vsnprintf(buf + off, sz - off, fmt, ap);
+	va_end(ap);
+
+	if (ret <= 0)
+		return sz;
+
+	return off + ret;
+}
+
 static void create_dir(struct dir *dir, struct opts *opts,
 		       struct stats *stats)
 {
 	struct str_list *s;
-	char name[100];
+	char name[256]; /* max len and null term */
 	char val = 'v';
+	size_t off;
 	int rc;
 	int i;

@@ -175,29 +195,21 @@ static void create_dir(struct dir *dir, struct opts *opts,
 		rc = mknod(s->str, S_IFREG | 0644, 0);
 		error_exit(rc, "mknod %s failed"ERRF, s->str, ERRA);

-		rc = 0;
-		if (rc == 0 && opts->user_xattr) {
-			strcpy(name, "user.scoutfs_bcp");
-			rc = setxattr(s->str, name, &val, 1, 0);
-		}
-		if (rc == 0 && opts->same_srch_xattr) {
-			strcpy(name, "scoutfs.srch.scoutfs_bcp");
-			rc = setxattr(s->str, name, &val, 1, 0);
-		}
-		if (rc == 0 && opts->group_srch_xattr) {
-			snprintf(name, sizeof(name),
-				 "scoutfs.srch.scoutfs_bcp.group.%lu",
-				 stats->files / 10000);
-			rc = setxattr(s->str, name, &val, 1, 0);
-		}
-		if (rc == 0 && opts->unique_srch_xattr) {
-			snprintf(name, sizeof(name),
-				 "scoutfs.srch.scoutfs_bcp.unique.%lu",
-				 stats->files);
+		if (opts->xattr_set) {
+			off = snprintf_off(name, sizeof(name), 0, "%s", opts->xattr_name);
+			if (opts->xattr_file)
+				off = snprintf_off(name, sizeof(name), off,
+						   "-f-%lu", stats->files);
+			if (opts->xattr_group)
+				off = snprintf_off(name, sizeof(name), off,
+						   "-g-%lu", stats->files / 10000);
+
+			error_exit(off >= sizeof(name), "xattr name longer than 255 bytes");
+
 			rc = setxattr(s->str, name, &val, 1, 0);
+			error_exit(rc, "setxattr %s %s failed"ERRF, s->str, name, ERRA);
 		}

-		error_exit(rc, "setxattr %s %s failed"ERRF, s->str, name, ERRA);

 		stats->files++;
 		rate_banner(opts, stats);
@@ -365,11 +377,10 @@ static void usage(void)
 	       " -d DIR | create all files in DIR top level directory\n"
 	       " -n     | dry run, only parse, don't create any files\n"
 	       " -q     | quiet, don't regularly print rates\n"
+	       " -F     | append \"-f-NR\" file nr to xattr name, requires -X\n"
+	       " -G     | append \"-g-NR\" file nr/10000 to xattr name, requires -X\n"
 	       " -L     | parse ls output; only reg, skip meta, paths at ./\n"
-	       " -X     | set the same user. xattr name in all files\n"
-	       " -S     | set the same .srch. xattr name in all files\n"
-	       " -G     | set a .srch. xattr name shared by groups of files\n"
-	       " -U     | set a unique .srch. xattr name in all files\n");
+	       " -X NAM | set named xattr in all files\n");
 }

 int main(int argc, char **argv)
@@ -386,7 +397,7 @@ int main(int argc, char **argv)

 	memset(&opts, 0, sizeof(opts));

-        while ((c = getopt(argc, argv, "d:nqLXSGU")) != -1) {
+        while ((c = getopt(argc, argv, "d:nqFGLX:")) != -1) {
                switch(c) {
                case 'd':
                        top_dir = strdup(optarg);
@@ -397,20 +408,19 @@ int main(int argc, char **argv)
                case 'q':
                        opts.quiet = 1;
                        break;
+                case 'F':
+                        opts.xattr_file = 1;
+                        break;
+                case 'G':
+                        opts.xattr_group = 1;
+                        break;
                case 'L':
                        opts.ls_output = 1;
                        break;
                case 'X':
-                        opts.user_xattr = 1;
-                        break;
-                case 'S':
-                        opts.same_srch_xattr = 1;
-                        break;
-                case 'G':
-                        opts.group_srch_xattr = 1;
-                        break;
-                case 'U':
-                        opts.unique_srch_xattr = 1;
+			opts.xattr_set = 1;
+			opts.xattr_name = strdup(optarg);
+			error_exit(!opts.xattr_name, "error allocating xattr name");
                        break;
                case '?':
                        printf("Unknown option '%c'\n", optopt);
@@ -419,6 +429,11 @@ int main(int argc, char **argv)
                }
        }

+	error_exit(opts.xattr_file && !opts.xattr_set,
+		   "must specify xattr -X when appending file nr with -F");
+	error_exit(opts.xattr_group && !opts.xattr_set,
+		   "must specify xattr -X when appending file nr with -G");
+
 	if (!opts.dry_run) {
 		error_exit(!top_dir,
 			   "must specify top level directory with -d");
@@ -11,8 +11,13 @@ FILE="$T_D0/file"
 # final block as we truncated past it.
 #
 echo "== truncate writes zeroed partial end of file block"
-yes | dd of="$FILE" bs=8K count=1 status=none
+yes | dd of="$FILE" bs=8K count=1 status=none iflag=fullblock
 sync
+
+# not passing iflag=fullblock causes the file occasionally to just be
+# 4K, so just to be safe we should at least check size once
+test `stat --printf="%s\n" "$FILE"` -eq 8192 || t_fail "test file incorrect start size"
+
 truncate -s 6K "$FILE"
 truncate -s 12K "$FILE"
 echo 3 > /proc/sys/vm/drop_caches
@@ -7,9 +7,11 @@ t_require_mounts 2

 COUNT=50000

-# Prep dirs for test. Each mount needs to make their own parent dir for
-# the createmany run, otherwise both dirs will end up in the same inode
-# group, causing updates to bounce that lock around.
+#
+# Prep dirs for test.  We have per-directory inode number allocators so
+# by putting each createmany in a per-mount dir they get their own inode
+# number region and cluster locks.
+#
 echo "== measure initial createmany"
 mkdir -p $T_D0/dir/0
 mkdir $T_D1/dir/1
@@ -17,18 +19,20 @@ mkdir $T_D1/dir/1
 echo "== measure initial createmany"
 START=$SECONDS
 createmany -o "$T_D0/file_" $COUNT >> $T_TMP.full
+sync
 SINGLE=$((SECONDS - START))
 echo single $SINGLE >> $T_TMP.full

 echo "== measure two concurrent createmany runs"
 START=$SECONDS
-createmany -o $T_D0/dir/0/file $COUNT > /dev/null &
+(cd $T_D0/dir/0; createmany -o ./file_ $COUNT > /dev/null) &
 pids="$!"
-createmany -o $T_D1/dir/1/file $COUNT > /dev/null &
+(cd $T_D1/dir/1; createmany -o ./file_ $COUNT > /dev/null) &
 pids="$pids $!"
 for p in $pids; do
        wait $p
 done
+sync
 BOTH=$((SECONDS - START))
 echo both $BOTH >> $T_TMP.full

@@ -41,7 +45,10 @@ echo both $BOTH >> $T_TMP.full
 # synchronized operation.
 FACTOR=200
 if [ "$BOTH" -gt $(($SINGLE*$FACTOR)) ]; then
-	echo "both createmany took $BOTH sec, more than $FACTOR x single $SINGLE sec"
+	t_fail "both createmany took $BOTH sec, more than $FACTOR x single $SINGLE sec"
 fi

+echo "== cleanup"
+find $T_D0/dir -delete
+
 t_pass
@@ -10,6 +10,30 @@ EXTENTS_PER_BTREE_BLOCK=600
 EXTENTS_PER_LIST_BLOCK=8192
 FREED_EXTENTS=$((EXTENTS_PER_BTREE_BLOCK * EXTENTS_PER_LIST_BLOCK))

+#
+# This test specifically creates a pathologically sparse file that will
+# be as expensive as possible to free.  This is usually fine on
+# dedicated or reasonable hardware, but trying to run this in
+# virtualized debug kernels can take a very long time.  This test is
+# about making sure that the server doesn't fail, not that the platform
+# can handle the scale of work that our btree formats happen to require
+# while execution is bogged down with use-after-free memory reference
+# tracking.  So we give the test a lot more breathing room before
+# deciding that it's hung.
+#
+echo "== setting longer hung task timeout"
+if [ -w /proc/sys/kernel/hung_task_timeout_secs ]; then
+	secs=$(cat /proc/sys/kernel/hung_task_timeout_secs)
+	test "$secs" -gt 0 || \
+		t_fail "confusing value '$secs' from /proc/sys/kernel/hung_task_timeout_secs"
+	restore_hung_task_timeout()
+	{
+		echo "$secs" > /proc/sys/kernel/hung_task_timeout_secs
+	}
+	trap restore_hung_task_timeout EXIT
+	echo "$((secs * 5))" > /proc/sys/kernel/hung_task_timeout_secs
+fi
+
 echo "== creating fragmented extents"
 fragmented_data_extents $FREED_EXTENTS $EXTENTS_PER_BTREE_BLOCK "$T_D0/alloc" "$T_D0/move"

@@ -9,6 +9,7 @@ LOG=340000
 LIM=1000000

 SEQF="%.20g"
+SXA="scoutfs.srch.test-srch-basic-functionality"

 t_require_commands touch rm setfattr scoutfs find_xattrs

@@ -27,20 +28,20 @@ diff_srch_find()

 echo "== create new xattrs"
 touch "$T_D0/"{create,update}
-setfattr -n scoutfs.srch.test -v 1 "$T_D0/"{create,update} 2>&1 | t_filter_fs
-diff_srch_find scoutfs.srch.test
+setfattr -n $SXA -v 1 "$T_D0/"{create,update} 2>&1 | t_filter_fs
+diff_srch_find $SXA

 echo "== update existing xattr"
-setfattr -n scoutfs.srch.test -v 2 "$T_D0/update" 2>&1 | t_filter_fs
-diff_srch_find scoutfs.srch.test
+setfattr -n $SXA -v 2 "$T_D0/update" 2>&1 | t_filter_fs
+diff_srch_find $SXA

 echo "== remove an xattr"
-setfattr -x scoutfs.srch.test "$T_D0/create" 2>&1 | t_filter_fs
-diff_srch_find scoutfs.srch.test
+setfattr -x $SXA "$T_D0/create" 2>&1 | t_filter_fs
+diff_srch_find $SXA

 echo "== remove xattr with files"
 rm -f "$T_D0/"{create,update}
-diff_srch_find scoutfs.srch.test
+diff_srch_find $SXA

 echo "== trigger small log merges by rotating single block with unmount"
 sv=$(t_server_nr)
@@ -56,7 +57,7 @@ while [ "$i" -lt "8" ]; do

 		eval path="\$T_D${nr}/single-block-$i"
 		touch "$path"
-		setfattr -n scoutfs.srch.single-block-logs -v $i "$path"
+		setfattr -n $SXA -v $i "$path"
 		t_umount $nr
 		t_mount $nr

@@ -65,51 +66,51 @@ while [ "$i" -lt "8" ]; do
 done
 # wait for srch compaction worker delay
 sleep 10
-rm -rf "$T_D0/single-block-*"
+find "$T_D0" -type f -name 'single-block-*' -delete

 echo "== create entries in current log"
 DIR="$T_D0/dir"
 NR=$((LOG / 4))
 mkdir -p "$DIR"
-seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -S -d "$DIR" > /dev/null
-diff_srch_find scoutfs.srch.scoutfs_bcp
+seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -X $SXA -d "$DIR" > /dev/null
+diff_srch_find $SXA

 echo "== delete small fraction"
-seq -f "$DIR/f-$SEQF" 1 7 $NR | xargs setfattr -x scoutfs.srch.scoutfs_bcp
-diff_srch_find scoutfs.srch.scoutfs_bcp
+seq -f "$DIR/f-$SEQF" 1 7 $NR | xargs setfattr -x $SXA
+diff_srch_find $SXA

 echo "== remove files"
 rm -rf "$DIR"
-diff_srch_find scoutfs.srch.scoutfs_bcp
+diff_srch_find $SXA

 echo "== create entries that exceed one log"
 NR=$((LOG * 3 / 2))
 mkdir -p "$DIR"
-seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -S -d "$DIR" > /dev/null
-diff_srch_find scoutfs.srch.scoutfs_bcp
+seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -X $SXA -d "$DIR" > /dev/null
+diff_srch_find $SXA

 echo "== delete fractions in phases"
 for i in $(seq 1 3); do
-	seq -f "$DIR/f-$SEQF" $i 3 $NR | xargs setfattr -x scoutfs.srch.scoutfs_bcp
-	diff_srch_find scoutfs.srch.scoutfs_bcp
+	seq -f "$DIR/f-$SEQF" $i 3 $NR | xargs setfattr -x $SXA
+	diff_srch_find $SXA
 done

 echo "== remove files"
 rm -rf "$DIR"
-diff_srch_find scoutfs.srch.scoutfs_bcp
+diff_srch_find $SXA

 echo "== create entries for exceed search entry limit"
 NR=$((LIM * 3 / 2))
 mkdir -p "$DIR"
-seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -S -d "$DIR" > /dev/null
-diff_srch_find scoutfs.srch.scoutfs_bcp
+seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -X $SXA -d "$DIR" > /dev/null
+diff_srch_find $SXA

 echo "== delete half"
-seq -f "$DIR/f-$SEQF" 1 2 $NR | xargs setfattr -x scoutfs.srch.scoutfs_bcp
-diff_srch_find scoutfs.srch.scoutfs_bcp
+seq -f "$DIR/f-$SEQF" 1 2 $NR | xargs setfattr -x $SXA
+diff_srch_find $SXA

 echo "== entirely remove third batch"
 rm -rf "$DIR"
-diff_srch_find scoutfs.srch.scoutfs_bcp
+diff_srch_find $SXA

 t_pass
@@ -0,0 +1,90 @@
+#
+# There was a bug where srch file compaction could get stuck if a
+# partial compaction finished at the specific _SAFE_BYTES offset in a
+# block.  Resuming from that position would return an error and
+# compaction would stop making forward progress.
+#
+# We use triggers to pad the output of log compaction to end on the safe
+# offset and then cause compaction of those padded inputs to stop at the
+# safe offset.  Continuation will either succeed or return errors.  
+#
+
+# forcing rotation, so just a few
+NR=10
+SEQF="%.20g"
+COMPACT_NR=4
+
+echo "== initialize per-mount values"
+declare -a err
+declare -a compact_delay
+for nr in $(t_fs_nrs); do
+	err[$nr]=$(t_counter srch_compact_error $nr)
+	compact_delay[$nr]=$(cat $(t_sysfs_path $nr)/srch/compact_delay_ms)
+done
+restore_compact_delay()
+{
+	for nr in $(t_fs_nrs); do
+		echo ${compact_delay[$nr]} > $(t_sysfs_path $nr)/srch/compact_delay_ms
+	done
+}
+trap restore_compact_delay EXIT
+
+echo "== arm compaction triggers"
+for nr in $(t_fs_nrs); do
+	t_trigger_arm srch_compact_logs_pad_safe $nr
+	t_trigger_arm srch_merge_stop_safe $nr
+done
+
+echo "== compact more often"
+for nr in $(t_fs_nrs); do
+	echo 1000 > $(t_sysfs_path $nr)/srch/compact_delay_ms
+done
+
+echo "== create padded sorted inputs by forcing log rotation"
+sv=$(t_server_nr)
+for i in $(seq 1 $COMPACT_NR); do
+	for j in $(seq 1 $COMPACT_NR); do
+		t_trigger_arm srch_force_log_rotate $sv
+
+		seq -f "f-$i-$j-$SEQF" 1 10 | \
+			bulk_create_paths -X "scoutfs.srch.t-srch-safe-merge-pos" -d "$T_D0" > \
+			/dev/null
+		sync
+
+		test "$(t_trigger_get srch_force_log_rotate $sv)" == "0" || \
+			t_fail "srch_force_log_rotate didn't trigger"
+	done
+
+	padded=0
+	while test $padded == 0 && sleep .5; do
+		for nr in $(t_fs_nrs); do
+			if [ "$(t_trigger_get srch_compact_logs_pad_safe $nr)" == "0" ]; then
+				t_trigger_arm srch_compact_logs_pad_safe $nr
+				padded=1
+				break
+			fi
+			test "$(t_counter srch_compact_error $nr)" == "${err[$nr]}" || \
+				t_fail "srch_compact_error counter increased on mount $nr"
+		done
+	done
+done
+
+echo "== compaction of padded should stop at safe"
+sleep 2
+for nr in $(t_fs_nrs); do
+	if [ "$(t_trigger_get srch_merge_stop_safe $nr)" == "0" ]; then
+		break
+	fi
+done
+
+echo "== verify no compaction errors"
+sleep 2
+for nr in $(t_fs_nrs); do
+	test "$(t_counter srch_compact_error $nr)" == "${err[$nr]}" || \
+		t_fail "srch_compact_error counter increased on mount $nr"
+done
+
+echo "== cleanup"
+find "$T_D0" -type f -name 'f-*' -delete
+
+t_pass
Author	SHA1	Message	Date
Zach Brown	55f0a0ded4	Add nr_log_trees debugfs counter Signed-off-by: Zach Brown <zab@versity.com>	2024-01-25 12:48:27 -08:00
Zach Brown	dcfd22e4b1	Force merge creation timeout Signed-off-by: Zach Brown <zab@versity.com>	2024-01-25 12:46:46 -08:00
Zach Brown	8d3e6883c6	Merge pull request #159 from versity/auke/trans_hold Fix ret output for scoutfs_trans_hold trace pt.	2024-01-09 09:23:32 -08:00
Auke Kok	8747dae61c	Fix ret output for scoutfs_trans_hold trace pt. Signed-off-by: Auke Kok <auke.kok@versity.com>	2024-01-08 16:27:41 -08:00
Zach Brown	fffcf4a9bb	Merge pull request #158 from versity/zab/kasan_stack_oob_get_reg Ignore spurious KASAN unwind warning	2023-11-22 10:04:18 -08:00
Zach Brown	b552406427	Ignore spurious KASAN unwind warning KASAN could raise a spurious warning if the unwinder started in code without ORC metadata and tried to access in the KASAN stack frame redzones. This was fixed upstream but we can rarely see it in older kernels. We can ignore these messages. Signed-off-by: Zach Brown <zab@versity.com>	2023-11-21 12:25:16 -08:00
Zach Brown	d812599e6b	Merge pull request #157 from versity/zab/dmsetup_test_devices Zab/dmsetup test devices	2023-11-21 10:13:02 -08:00
Zach Brown	03ab5cedb6	clean up createmany-parallel-mounts test This test is trying to make sure that concurrent work isn't much, much, slower than individual work. It does this by timing creating a bunch of files in a dir on a mount and then timing doing the same in two mounts concurrently. But it messed up the concurrency pretty badly. It had the concurrent createmany tasks creating files with a full path. That means that every create is trying to read all the parent directories. The way inode number allocation works means that one of the mounts is likely to be getting a write lock that includes a shared parent. This created a ton of cluster lock contention between the two tasks. Then it didn't sync the creates between phases. It could be accidentally recording the time it took to write out the dirty single creates as time taken during the parallel creates. By syncing between phases and having the createmany tasks create files relative to their per-mount directories we actually perform concurrent work and test that we're not creating contention outside of the task load. This became a problem as we switched from loopback devices to device mapper devices. The loopback writers were using buffered writes so we were masking the io cost of constantly invalidating and refilling the item cache by turning the reads into memory copies out of the page cache. While we're in here we actually clean up the created files and then use t_fail to fail the test while the files still exist so they can be examined. Signed-off-by: Zach Brown <zab@versity.com>	2023-11-15 15:12:57 -08:00
Zach Brown	2b94cd6468	Add loop module kernel message filter Now that we're not setting up per-mount loopback devices we can not have the loop module loaded until tests are running. Signed-off-by: Zach Brown <zab@versity.com>	2023-11-15 13:39:38 -08:00
Zach Brown	5507ee5351	Use device-mapper for per-mount test devices We don't directly mount the underlying devices for each mount because the kernel notices multiple mounts and doesn't setup a new super block for each. Previously the script used loopback devices to create the local shared block construct 'cause it was easy. This introduced corruption of blocks that saw concurrent read and write IOs. The buffered kernel file IO paths that loopback eventually degrades into by default (via splice) could have buffered readers copying out of pages without the page lock while writers modified the page. This manifested as occasional crc failures of blocks that we knowingly issue concurrent reads and writes to from multiple mounts (the quorum and super blocks). This changes the script to use device-mapper linear passthrough devices. Their IOs don't hit a caching layer and don't provide an opportunity to corrupt blocks. Signed-off-by: Zach Brown <zab@versity.com>	2023-11-15 13:39:38 -08:00
Zach Brown	1600a121d9	Merge pull request #156 from versity/zab/large_fragmented_free_hung_task Extend hung task timeout for large-fragmented-free	2023-11-15 09:49:13 -08:00
Zach Brown	6daf24ff37	Extend hung task timeout for large-fragmented-free Our large fragmented free test creates pathological file extents which are as expensive as possible to free. We know that debugging kernels can take a long time to do this so we can extend the hung task timeout. Signed-off-by: Zach Brown <zab@versity.com>	2023-11-14 15:01:37 -08:00
Zach Brown	cd5d9ff3e0	Merge pull request #154 from versity/zab/srch_test_fixes Zab/srch test fixes	2023-11-13 09:47:46 -08:00
Zach Brown	d94e49eb63	Fix quoted glob in srch-basic-functionality One of the phases of this test wanted to delete files but got the glob quoting wrong. This didn't matter for the original test but when we changed the test to use its own xattr name then those existing undeleted files got confused with other files in later phases of the test. This changes the test to delete the files with a more reliable find pattern instead of using shell glob expansion. Signed-off-by: Zach Brown <zab@versity.com>	2023-11-09 14:16:36 -08:00
Zach Brown	1dbe408539	Add tracing of srch compact struct communication Signed-off-by: Zach Brown <zab@versity.com>	2023-11-09 14:16:33 -08:00
Zach Brown	bf21699ad7	bulk_create_paths test tool takes xattr name Previously the bulk_create_paths test tool used the same xattr name for each category of xattrs it was creating. This created a problem where two tests got their xattrs confused with each other. The first test created a bunch of srch xattrs, failed, and didn't clean up after itself. The second test saw these search xattrs as its own and got very confused when there were far more srch xattrs than it thought it had created. This lets each test specify the srch xattr names that are created by bulk_create_paths so that tests can work with their xattrs independent of each other. Signed-off-by: Zach Brown <zab@versity.com>	2023-11-09 14:15:44 -08:00
Zach Brown	c7c67a173d	Specifically wait for compaction in srch test We just added a test to try and get srch compaction stuck by having an input file continue at a specific offset. To exercise the bug the test needs to perform 6 compactions. It needs to merge 4 sets of logs into 4 sorted files, it needs to make partial progress merging those 4 sorted files into another file, and then finally attempt to continue compacting from the partial progress offset. The first version of the test didn't necessarily ensure that these compactions happened. It created far too many log files then just waited for time to pass. If the host was slow then the mounts may not make it through the initial logs to try and compact the sorted files. The triggers wouldn't fire and the test would fail. These changes much more carefully orchestrate and watch the various steps of compaction to make sure that we trigger the bug. Signed-off-by: Zach Brown <zab@versity.com>	2023-11-09 14:13:13 -08:00
Zach Brown	0d10189f58	Make srch compact request delay tunable Add a sysfs file for getting and setting the delay between srch compaction requests from the client. We'll use this in testing to ensure compaction runs promptly. Signed-off-by: Zach Brown <zab@versity.com>	2023-11-09 14:13:07 -08:00
Zach Brown	6b88f3268e	Merge pull request #153 from versity/zab/v1.18 v1.18 Release	2023-11-08 10:57:56 -08:00
Zach Brown	4b2afa61b8	v1.18 Release Finish the release notes for the 1.18 release. Signed-off-by: Zach Brown <zab@versity.com>	2023-11-07 16:01:59 -08:00
Zach Brown	222ba2cede	Merge pull request #152 from versity/zab/stuck_srch_compact Zab/stuck srch compact	2023-11-07 15:56:39 -08:00
Zach Brown	c7e97eeb1f	Allow srch compaction from _SAFE_BYTES Compacting sorted srch files can take multiple transactions because they can be very large. Each transaction resumes at a byte offset in a block where the previous transaction stopped. The resuming code tests that the byte offsets are sane but had a mistake in testing the offset to skip to. It returned an error if the compaction resumed from the last possible safe offset for decoding entries. If a system is unlucky enough to have a compaction transaction stop at just this offset then compaction stops making forward progress as each attempt to resume returns an error. The fix allows continuation from this last safe offset while returning errors for attempts to continue past that offset. This matches all the encoding code which allows encoding the last entry in the block at this offset. Signed-off-by: Zach Brown <zab@versity.com>	2023-11-07 12:34:00 -08:00
Zach Brown	21c070b42d	Add test for srch continuation safe pos errors Add a test for srch compaction getting stuck hitting errors continuing a partial operation. It ensures that a block has an encoded entry at the _SAFE_BYTES offset, that an operation stops precisely at that offset, and then watches for errors. Signed-off-by: Zach Brown <zab@versity.com>	2023-11-07 12:34:00 -08:00
Zach Brown	77fbf92968	Add t_trigger_set helper Add a helper to arm or disarm a trigger with a value argument. Signed-off-by: Zach Brown <zab@versity.com>	2023-11-07 12:12:10 -08:00
Zach Brown	d5c699c3b4	Don't respond with ENOENT for no srch compaction The srch compaction request building function and the srch compaction worker both have logic to recognize a valid response with no input files indicating that there's no work to do. The server unfortunately translated nr == 0 into ENOENT and sent that error response to the client. This caused the client to increment error counters in the common case when there's no compaction work to perform. We'd like the error counter to reflect actual errors, we're about to check it in a test, so let's fix this up so the server sends a successful response with nr == 0 to indicate that there's no work to do. Signed-off-by: Zach Brown <zab@versity.com>	2023-11-07 10:30:38 -08:00
Zach Brown	b56b8e502c	Merge pull request #145 from versity/zab/server_seqlock Use seqlock instead of seqcount in server	2023-10-24 14:36:56 -07:00
Zach Brown	5ff372561d	Merge pull request #146 from versity/auke/truncatedd Ensure dd creates the full 8K input test file.	2023-10-24 10:10:11 -07:00
Zach Brown	bdecee5e5d	Merge pull request #147 from versity/zab/v1.17 v1.17 Release	2023-10-24 09:52:36 -07:00
Auke Kok	707e1b2d59	Ensure dd creates the full 8K input test file. Without `iflag=fullblock` we encounter sporadic cases where the input file to the truncate test isn't fully written to 8K and ends up to be only 4K. The subsequent truncate tests then fail. We add a check to the input test file size just to be sure in the future. Signed-off-by: Auke Kok <auke.kok@versity.com>	2023-10-23 17:04:19 -04:00
Zach Brown	006f429f72	Use seqlock instead of seqcount in server The server had a few lower level seqcounts that it used to protect state. One user got it wrong by forgetting to disable pre-emption around writers. Debug kernels warned as write_seqcount_begin() was called without preemption disabled. We fix that user and make it easier to get right in the future by having one higher level seqlock and using that consistently for seq read begin/retry and write lock/unlock patterns. Signed-off-by: Zach Brown <zab@versity.com>	2023-10-19 15:43:15 -07:00