From bad1c602f92391835e4c93469795f8311ebedbfd Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Tue, 8 Dec 2020 13:53:22 -0800 Subject: [PATCH 01/29] server hold_commit returns void When we moved to the current allocator we fixed up the server commit path to initialize the pair of allocators as a commit is finished rather than before it starts. This removed all the error cases from hold_commit. Remove the error handling from hold_commit calls to make the system just a bit simpler. Signed-off-by: Zach Brown --- kmod/src/server.c | 50 ++++++++++++----------------------------------- kmod/src/server.h | 2 +- 2 files changed, 13 insertions(+), 39 deletions(-) diff --git a/kmod/src/server.c b/kmod/src/server.c index 4eeefccd..872af0a7 100644 --- a/kmod/src/server.c +++ b/kmod/src/server.c @@ -187,15 +187,13 @@ static void stop_server(struct server_info *server) * (lock_server) and which are not called directly by the server core * (async timeout work). */ -int scoutfs_server_hold_commit(struct super_block *sb) +void scoutfs_server_hold_commit(struct super_block *sb) { DECLARE_SERVER_INFO(sb, server); scoutfs_inc_counter(sb, server_commit_hold); down_read(&server->commit_rwsem); - - return 0; } /* @@ -394,9 +392,7 @@ static int server_alloc_inodes(struct super_block *sb, memcpy(&lecount, arg, arg_len); - ret = scoutfs_server_hold_commit(sb); - if (ret) - goto out; + scoutfs_server_hold_commit(sb); spin_lock(&sbi->next_ino_lock); ino = le64_to_cpu(super->next_ino); @@ -404,7 +400,7 @@ static int server_alloc_inodes(struct super_block *sb, le64_add_cpu(&super->next_ino, nr); spin_unlock(&sbi->next_ino_lock); - ret = scoutfs_server_apply_commit(sb, ret); + ret = scoutfs_server_apply_commit(sb, 0); if (ret == 0) { ial.ino = cpu_to_le64(ino); ial.nr = cpu_to_le64(nr); @@ -606,9 +602,7 @@ static int server_get_log_trees(struct super_block *sb, goto out; } - ret = scoutfs_server_hold_commit(sb); - if (ret) - goto out; + scoutfs_server_hold_commit(sb); mutex_lock(&server->logs_mutex); @@ -717,11 +711,7 @@ static int server_commit_log_trees(struct super_block *sb, /* don't modify the caller's log_trees */ memcpy(<, arg, sizeof(struct scoutfs_log_trees)); - ret = scoutfs_server_hold_commit(sb); - if (ret < 0) { - scoutfs_err(sb, "server error preparing commit: %d", ret); - goto out; - } + scoutfs_server_hold_commit(sb); mutex_lock(&server->logs_mutex); @@ -952,9 +942,7 @@ static int server_advance_seq(struct super_block *sb, goto out; } - ret = scoutfs_server_hold_commit(sb); - if (ret) - goto out; + scoutfs_server_hold_commit(sb); down_write(&server->seq_rwsem); @@ -1151,9 +1139,7 @@ static int server_srch_get_compact(struct super_block *sb, goto out; } - ret = scoutfs_server_hold_commit(sb); - if (ret) - goto out; + scoutfs_server_hold_commit(sb); mutex_lock(&server->srch_mutex); ret = scoutfs_srch_get_compact(sb, &server->alloc, &server->wri, @@ -1215,9 +1201,7 @@ static int server_srch_commit_compact(struct super_block *sb, } sc = arg; - ret = scoutfs_server_hold_commit(sb); - if (ret) - goto out; + scoutfs_server_hold_commit(sb); mutex_lock(&server->srch_mutex); ret = scoutfs_srch_commit_compact(sb, &server->alloc, &server->wri, @@ -1347,9 +1331,7 @@ static int server_set_volopt(struct super_block *sb, struct scoutfs_net_connecti mutex_lock(&server->volopt_mutex); - ret = scoutfs_server_hold_commit(sb); - if (ret) - goto unlock; + scoutfs_server_hold_commit(sb); if (le64_to_cpu(volopt->set_bits) & SCOUTFS_VOLOPT_DATA_ALLOC_ZONE_BLOCKS_BIT) { opt = le64_to_cpu(volopt->data_alloc_zone_blocks); @@ -1389,7 
+1371,6 @@ apply: super->volopt = server->volopt; write_seqcount_end(&server->volopt_seqcount); -unlock: mutex_unlock(&server->volopt_mutex); out: return scoutfs_net_response(sb, conn, cmd, id, ret, NULL, 0); @@ -1419,9 +1400,7 @@ static int server_clear_volopt(struct super_block *sb, struct scoutfs_net_connec mutex_lock(&server->volopt_mutex); - ret = scoutfs_server_hold_commit(sb); - if (ret) - goto unlock; + scoutfs_server_hold_commit(sb); for (i = 0, bit = 1, opt = first_valopt(&super->volopt); i < 64; i++, bit <<= 1, opt++) { if (le64_to_cpu(volopt->set_bits) & bit) { @@ -1439,7 +1418,6 @@ static int server_clear_volopt(struct super_block *sb, struct scoutfs_net_connec super->volopt = server->volopt; write_seqcount_end(&server->volopt_seqcount); -unlock: mutex_unlock(&server->volopt_mutex); out: return scoutfs_net_response(sb, conn, cmd, id, ret, NULL, 0); @@ -1652,9 +1630,7 @@ static int server_greeting(struct super_block *sb, } if (gr->server_term == 0) { - ret = scoutfs_server_hold_commit(sb); - if (ret < 0) - goto send_err; + scoutfs_server_hold_commit(sb); ret = insert_mounted_client(sb, le64_to_cpu(gr->rid), le64_to_cpu(gr->flags), &conn->peername); @@ -1727,9 +1703,7 @@ static int reclaim_rid(struct super_block *sb, u64 rid) { int ret; - ret = scoutfs_server_hold_commit(sb); - if (ret < 0) - return ret; + scoutfs_server_hold_commit(sb); /* delete mounted client last, recovery looks for it */ ret = scoutfs_lock_server_farewell(sb, rid) ?: diff --git a/kmod/src/server.h b/kmod/src/server.h index 8d31a271..40450aa9 100644 --- a/kmod/src/server.h +++ b/kmod/src/server.h @@ -62,7 +62,7 @@ int scoutfs_server_lock_response(struct super_block *sb, u64 rid, u64 id, struct scoutfs_net_lock *nl); int scoutfs_server_lock_recover_request(struct super_block *sb, u64 rid, struct scoutfs_key *key); -int scoutfs_server_hold_commit(struct super_block *sb); +void scoutfs_server_hold_commit(struct super_block *sb); int scoutfs_server_apply_commit(struct super_block *sb, int err); void scoutfs_server_recov_finish(struct super_block *sb, u64 rid, int which); From 9051ceb6fc6e02e54d8f6e616aec98b85257f7be Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Tue, 8 Dec 2020 14:10:41 -0800 Subject: [PATCH 02/29] Add core seq to the super block Add a new seq field to the super block which will be the source of all incremented seqs throughout the system. We give out incremented seqs to callers with an atomic64_t in memory which is synced back to the super block as we commit transactions in the server. 
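As a rough userspace sketch of the idea, with C11 atomics standing in for the kernel's atomic64_t (the names here are illustrative, not the patch's):

    #include <stdatomic.h>
    #include <stdint.h>

    /* in-memory source of seqs, loaded from the super block at startup */
    static _Atomic uint64_t seq_atomic;

    /* hand out the next increasing seq to a caller */
    static uint64_t next_seq(void)
    {
            /* fetch_add returns the old value, +1 gives the seq handed out */
            return atomic_fetch_add(&seq_atomic, 1) + 1;
    }

    /* a commit copies the current value back into the persistent super */
    static void commit_sync_seq(uint64_t *super_seq)
    {
            *super_seq = atomic_load(&seq_atomic);
    }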
Signed-off-by: Zach Brown --- kmod/src/format.h | 1 + kmod/src/server.c | 34 ++++++++++++++++++++++++++++++++++ kmod/src/server.h | 4 ++++ 3 files changed, 39 insertions(+) diff --git a/kmod/src/format.h b/kmod/src/format.h index 924a1842..ee45a422 100644 --- a/kmod/src/format.h +++ b/kmod/src/format.h @@ -688,6 +688,7 @@ struct scoutfs_super_block { __le64 version; __le64 flags; __u8 uuid[SCOUTFS_UUID_BYTES]; + __le64 seq; __le64 next_ino; __le64 next_trans_seq; __le64 total_meta_blocks; /* both static and dynamic */ diff --git a/kmod/src/server.c b/kmod/src/server.c index 872af0a7..dfeeac15 100644 --- a/kmod/src/server.c +++ b/kmod/src/server.c @@ -65,6 +65,9 @@ struct server_info { u64 term; struct scoutfs_net_connection *conn; + /* synced with superblock seq on commits */ + atomic64_t seq_atomic; + /* request processing coordinates shared commits */ struct rw_semaphore commit_rwsem; struct llist_head commit_waiters; @@ -248,6 +251,35 @@ static void get_roots(struct super_block *sb, } while (read_seqcount_retry(&server->roots_seqcount, seq)); } +u64 scoutfs_server_seq(struct super_block *sb) +{ + DECLARE_SERVER_INFO(sb, server); + + return atomic64_read(&server->seq_atomic); +} + +u64 scoutfs_server_next_seq(struct super_block *sb) +{ + DECLARE_SERVER_INFO(sb, server); + + return atomic64_inc_return(&server->seq_atomic); +} + +void scoutfs_server_set_seq_if_greater(struct super_block *sb, u64 seq) +{ + DECLARE_SERVER_INFO(sb, server); + u64 expect; + u64 was; + + expect = atomic64_read(&server->seq_atomic); + while (seq > expect) { + was = atomic64_cmpxchg(&server->seq_atomic, expect, seq); + if (was == expect) + break; + expect = was; + } +} + static void set_roots(struct server_info *server, struct scoutfs_btree_root *fs_root, struct scoutfs_btree_root *logs_root, @@ -333,6 +365,7 @@ static void scoutfs_server_commit_func(struct work_struct *work) goto out; } + super->seq = cpu_to_le64(atomic64_read(&server->seq_atomic)); super->server_meta_avail[server->other_ind ^ 1] = server->alloc.avail; super->server_meta_freed[server->other_ind ^ 1] = server->alloc.freed; @@ -2258,6 +2291,7 @@ static void scoutfs_server_worker(struct work_struct *work) server->volopt = super->volopt; write_seqcount_end(&server->volopt_seqcount); + atomic64_set(&server->seq_atomic, le64_to_cpu(super->seq)); set_roots(server, &super->fs_root, &super->logs_root, &super->srch_root); scoutfs_block_writer_init(sb, &server->wri); diff --git a/kmod/src/server.h b/kmod/src/server.h index 40450aa9..79fcb443 100644 --- a/kmod/src/server.h +++ b/kmod/src/server.h @@ -71,6 +71,10 @@ int scoutfs_server_send_omap_request(struct super_block *sb, u64 rid, int scoutfs_server_send_omap_response(struct super_block *sb, u64 rid, u64 id, struct scoutfs_open_ino_map *map, int err); +u64 scoutfs_server_seq(struct super_block *sb); +u64 scoutfs_server_next_seq(struct super_block *sb); +void scoutfs_server_set_seq_if_greater(struct super_block *sb, u64 seq); + struct sockaddr_in; struct scoutfs_quorum_elected_info; int scoutfs_server_start(struct super_block *sb, u64 term); From 05ae756b74233d7ca284cd11a782e88e50e1d5fc Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Tue, 8 Dec 2020 14:21:28 -0800 Subject: [PATCH 03/29] Get trans seq from core seq Get the next seq for a client transaction from the core seq in the super block. Remove its specific next_trans_seq field. While making this change we switch to only using le64 in the network message payloads, the rest of the processing now uses natural u64s. 
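A sketch of that endian convention, using glibc's htole64()/le64toh() where kernel code would use cpu_to_le64()/le64_to_cpu() (the payload struct and names are hypothetical):

    #include <endian.h>
    #include <stdint.h>

    /* wire payloads keep little-endian fields, __le64 in the kernel */
    struct seq_payload {
            uint64_t seq;
    };

    /* convert only at the wire boundary, natural u64s everywhere else */
    static void fill_seq_payload(struct seq_payload *p, uint64_t seq)
    {
            p->seq = htole64(seq);
    }

    static uint64_t read_seq_payload(const struct seq_payload *p)
    {
            return le64toh(p->seq);
    }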
Signed-off-by: Zach Brown --- kmod/src/format.h | 1 - kmod/src/server.c | 5 ++--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/kmod/src/format.h b/kmod/src/format.h index ee45a422..f750f638 100644 --- a/kmod/src/format.h +++ b/kmod/src/format.h @@ -690,7 +690,6 @@ struct scoutfs_super_block { __u8 uuid[SCOUTFS_UUID_BYTES]; __le64 seq; __le64 next_ino; - __le64 next_trans_seq; __le64 total_meta_blocks; /* both static and dynamic */ __le64 first_meta_blkno; /* first dynamically allocated */ __le64 last_meta_blkno; diff --git a/kmod/src/server.c b/kmod/src/server.c index dfeeac15..dbdfd823 100644 --- a/kmod/src/server.c +++ b/kmod/src/server.c @@ -983,8 +983,7 @@ static int server_advance_seq(struct super_block *sb, if (ret < 0) goto unlock; - seq = le64_to_cpu(super->next_trans_seq); - le64_add_cpu(&super->next_trans_seq, 1); + seq = scoutfs_server_next_seq(sb); trace_scoutfs_trans_seq_advance(sb, rid, seq); @@ -1058,7 +1057,7 @@ static int server_get_last_seq(struct super_block *sb, last_seq = key.skts_trans_seq; } else if (ret == -ENOENT) { - last_seq = super->next_trans_seq; + last_seq = cpu_to_le64(scoutfs_server_seq(sb)); ret = 0; } From 3c69861c038b44ebd34a440dec94df1f47ebed7d Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Tue, 8 Dec 2020 14:59:27 -0800 Subject: [PATCH 04/29] Use core seq for lock write_seq Rename the write_version lock field to write_seq and get it from the core seq in the super block. We're doing this to create a relationship between a client transaction's seq and a lock's write_seq. New transactions will have a greater seq than all previously granted write locks and new write locks will have a greater seq than all open transactions. This will be used to resolve ambiguities in item merging as transaction seqs are written out of order and write locks span transactions. 
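A toy single-threaded model of that ordering, with hypothetical names; because locks and transactions draw seqs from the same counter, their relative order is total:

    #include <assert.h>
    #include <stdint.h>

    static uint64_t core_seq;

    static uint64_t next_seq(void)
    {
            return ++core_seq;
    }

    int main(void)
    {
            uint64_t write_seq = next_seq(); /* write lock granted now */
            uint64_t trans_seq = next_seq(); /* transaction opened later */

            /* the newer transaction always sorts after the older lock */
            assert(trans_seq > write_seq);
            return 0;
    }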
Signed-off-by: Zach Brown --- kmod/src/format.h | 2 +- kmod/src/item.c | 10 +++++----- kmod/src/lock.c | 4 ++-- kmod/src/lock.h | 4 ++-- kmod/src/lock_server.c | 26 ++++++++------------------ kmod/src/lock_server.h | 2 +- kmod/src/omap.c | 14 ++++++-------- kmod/src/server.c | 3 ++- 8 files changed, 27 insertions(+), 38 deletions(-) diff --git a/kmod/src/format.h b/kmod/src/format.h index f750f638..585cb91f 100644 --- a/kmod/src/format.h +++ b/kmod/src/format.h @@ -943,7 +943,7 @@ struct scoutfs_net_roots { struct scoutfs_net_lock { struct scoutfs_key key; - __le64 write_version; + __le64 write_seq; __u8 old_mode; __u8 new_mode; __u8 __pad[6]; diff --git a/kmod/src/item.c b/kmod/src/item.c index 2b03c39f..d2e0f566 100644 --- a/kmod/src/item.c +++ b/kmod/src/item.c @@ -1816,7 +1816,7 @@ int scoutfs_item_dirty(struct super_block *sb, struct scoutfs_key *key, ret = -ENOENT; } else { mark_item_dirty(sb, cinf, pg, NULL, item); - item->liv.vers = cpu_to_le64(lock->write_version); + item->liv.vers = cpu_to_le64(lock->write_seq); ret = 0; } @@ -1836,7 +1836,7 @@ static int item_create(struct super_block *sb, struct scoutfs_key *key, { DECLARE_ITEM_CACHE_INFO(sb, cinf); struct scoutfs_log_item_value liv = { - .vers = cpu_to_le64(lock->write_version), + .vers = cpu_to_le64(lock->write_seq), }; struct cached_item *found; struct cached_item *item; @@ -1911,7 +1911,7 @@ int scoutfs_item_update(struct super_block *sb, struct scoutfs_key *key, { DECLARE_ITEM_CACHE_INFO(sb, cinf); struct scoutfs_log_item_value liv = { - .vers = cpu_to_le64(lock->write_version), + .vers = cpu_to_le64(lock->write_seq), }; struct cached_item *item; struct cached_item *found; @@ -1978,7 +1978,7 @@ static int item_delete(struct super_block *sb, struct scoutfs_key *key, { DECLARE_ITEM_CACHE_INFO(sb, cinf); struct scoutfs_log_item_value liv = { - .vers = cpu_to_le64(lock->write_version), + .vers = cpu_to_le64(lock->write_seq), }; struct cached_item *item; struct cached_page *pg; @@ -2020,7 +2020,7 @@ static int item_delete(struct super_block *sb, struct scoutfs_key *key, erase_item(pg, item); } else { /* must emit deletion to clobber old persistent item */ - item->liv.vers = cpu_to_le64(lock->write_version); + item->liv.vers = cpu_to_le64(lock->write_seq); item->liv.flags |= SCOUTFS_LOG_ITEM_FLAG_DELETION; item->deletion = 1; pg->erased_bytes += item->val_len; diff --git a/kmod/src/lock.c b/kmod/src/lock.c index 50a33d26..36227eae 100644 --- a/kmod/src/lock.c +++ b/kmod/src/lock.c @@ -730,7 +730,7 @@ static void lock_grant_worker(struct work_struct *work) lock->request_pending = 0; lock->mode = nl->new_mode; - lock->write_version = le64_to_cpu(nl->write_version); + lock->write_seq = le64_to_cpu(nl->write_seq); if (lock_count_match_exists(nl->new_mode, lock->waiters)) extend_grace(sb, lock); @@ -988,7 +988,7 @@ int scoutfs_lock_recover_request(struct super_block *sb, u64 net_id, for (i = 0; lock && i < SCOUTFS_NET_LOCK_MAX_RECOVER_NR; i++) { nlr->locks[i].key = lock->start; - nlr->locks[i].write_version = cpu_to_le64(lock->write_version); + nlr->locks[i].write_seq = cpu_to_le64(lock->write_seq); nlr->locks[i].old_mode = lock->mode; nlr->locks[i].new_mode = lock->mode; diff --git a/kmod/src/lock.h b/kmod/src/lock.h index 40f8f5b9..d043f9fc 100644 --- a/kmod/src/lock.h +++ b/kmod/src/lock.h @@ -13,7 +13,7 @@ struct scoutfs_omap_lock; /* - * A few fields (start, end, refresh_gen, write_version, granted_mode) + * A few fields (start, end, refresh_gen, write_seq, granted_mode) * are referenced by code outside lock.c. 
*/ struct scoutfs_lock { @@ -23,7 +23,7 @@ struct scoutfs_lock { struct rb_node node; struct rb_node range_node; u64 refresh_gen; - u64 write_version; + u64 write_seq; u64 dirty_trans_seq; struct list_head lru_head; wait_queue_head_t waitq; diff --git a/kmod/src/lock_server.c b/kmod/src/lock_server.c index 09ce48d7..3012d5d7 100644 --- a/kmod/src/lock_server.c +++ b/kmod/src/lock_server.c @@ -81,8 +81,6 @@ struct lock_server_info { struct scoutfs_alloc *alloc; struct scoutfs_block_writer *wri; - - atomic64_t write_version; }; #define DECLARE_LOCK_SERVER_INFO(sb, name) \ @@ -479,7 +477,7 @@ static int process_waiting_requests(struct super_block *sb, struct client_lock_entry *req_tmp; struct client_lock_entry *gr; struct client_lock_entry *gr_tmp; - u64 wv; + u64 seq; int ret; BUG_ON(!mutex_is_locked(&snode->mutex)); @@ -532,8 +530,9 @@ static int process_waiting_requests(struct super_block *sb, if (nl.new_mode == SCOUTFS_LOCK_WRITE || nl.new_mode == SCOUTFS_LOCK_WRITE_ONLY) { - wv = atomic64_inc_return(&inf->write_version); - nl.write_version = cpu_to_le64(wv); + /* doesn't commit seq update, recovered with locks */ + seq = scoutfs_server_next_seq(sb); + nl.write_seq = cpu_to_le64(seq); } ret = scoutfs_server_lock_response(sb, req->rid, @@ -609,14 +608,6 @@ int scoutfs_lock_server_finished_recovery(struct super_block *sb) return ret; } -static void set_max_write_version(struct lock_server_info *inf, u64 new) -{ - u64 old; - - while (new > (old = atomic64_read(&inf->write_version)) && - (atomic64_cmpxchg(&inf->write_version, old, new) != old)); -} - /* * We sent a lock recover request to the client when we received its * greeting while in recovery. Here we instantiate all the locks it @@ -680,9 +671,9 @@ int scoutfs_lock_server_recover_response(struct super_block *sb, u64 rid, put_server_lock(inf, snode); - /* make sure next write lock is greater than all recovered */ - set_max_write_version(inf, - le64_to_cpu(nlr->locks[i].write_version)); + /* make sure next core seq is greater than all lock write seq */ + scoutfs_server_set_seq_if_greater(sb, + le64_to_cpu(nlr->locks[i].write_seq)); } /* send request for next batch of keys */ @@ -800,7 +791,7 @@ static void lock_server_tseq_show(struct seq_file *m, */ int scoutfs_lock_server_setup(struct super_block *sb, struct scoutfs_alloc *alloc, - struct scoutfs_block_writer *wri, u64 max_vers) + struct scoutfs_block_writer *wri) { struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); struct lock_server_info *inf; @@ -815,7 +806,6 @@ int scoutfs_lock_server_setup(struct super_block *sb, scoutfs_tseq_tree_init(&inf->tseq_tree, lock_server_tseq_show); inf->alloc = alloc; inf->wri = wri; - atomic64_set(&inf->write_version, max_vers); /* inc_return gives +1 */ inf->tseq_dentry = scoutfs_tseq_create("server_locks", sbi->debug_root, &inf->tseq_tree); diff --git a/kmod/src/lock_server.h b/kmod/src/lock_server.h index e77f116f..60ce31ce 100644 --- a/kmod/src/lock_server.h +++ b/kmod/src/lock_server.h @@ -13,7 +13,7 @@ int scoutfs_lock_server_farewell(struct super_block *sb, u64 rid); int scoutfs_lock_server_setup(struct super_block *sb, struct scoutfs_alloc *alloc, - struct scoutfs_block_writer *wri, u64 max_vers); + struct scoutfs_block_writer *wri); void scoutfs_lock_server_destroy(struct super_block *sb); #endif diff --git a/kmod/src/omap.c b/kmod/src/omap.c index 3dfcbea8..bbe80976 100644 --- a/kmod/src/omap.c +++ b/kmod/src/omap.c @@ -137,11 +137,10 @@ struct omap_request { /* * In each inode group cluster lock we store data to track the open ino * map which 
tracks all the inodes that the cluster lock covers. When - * the version shows that the map is stale we send a request to update - * it. + * the seq shows that the map is stale we send a request to update it. */ struct scoutfs_omap_lock_data { - u64 version; + u64 seq; bool req_in_flight; wait_queue_head_t waitq; struct scoutfs_open_ino_map map; @@ -833,8 +832,7 @@ static bool omap_req_in_flight(struct scoutfs_lock *lock, struct scoutfs_omap_lo /* * Make sure the map covered by the cluster lock is current. The caller * holds the cluster lock so once we store lock_data on the cluster lock - * it won't be freed and the write_version in the cluster lock won't - * change. + * it won't be freed and the write_seq in the cluster lock won't change. * * The omap_spinlock protects the omap_data in the cluster lock. We * have to drop it if we have to block to allocate lock_data, send a @@ -861,7 +859,7 @@ static int get_current_lock_data(struct super_block *sb, struct scoutfs_lock *lo } if (lock->omap_data == NULL) { - ldata->version = lock->write_version - 1; /* ensure refresh */ + ldata->seq = lock->write_seq - 1; /* ensure refresh */ init_waitqueue_head(&ldata->waitq); lock->omap_data = ldata; @@ -871,7 +869,7 @@ static int get_current_lock_data(struct super_block *sb, struct scoutfs_lock *lo } } - while (ldata->version != lock->write_version) { + while (ldata->seq != lock->write_seq) { /* only one waiter sends a request at a time */ if (!ldata->req_in_flight) { ldata->req_in_flight = true; @@ -891,7 +889,7 @@ static int get_current_lock_data(struct super_block *sb, struct scoutfs_lock *lo if (send_req) { ldata->req_in_flight = false; if (ret == 0) - ldata->version = lock->write_version; + ldata->seq = lock->write_seq; wake_up(&ldata->waitq); if (ret < 0) goto out; diff --git a/kmod/src/server.c b/kmod/src/server.c index dbdfd823..0dafff15 100644 --- a/kmod/src/server.c +++ b/kmod/src/server.c @@ -2319,8 +2319,9 @@ static void scoutfs_server_worker(struct work_struct *work) scoutfs_err(sb, "server couldn't find max item vers: %d", ret); goto shutdown; } + scoutfs_server_set_seq_if_greater(sb, max_vers); - ret = scoutfs_lock_server_setup(sb, &server->alloc, &server->wri, max_vers) ?: + ret = scoutfs_lock_server_setup(sb, &server->alloc, &server->wri) ?: start_recovery(sb); if (ret) goto shutdown; From 65c39e5f970c2c22b52f16f3c437bfde092d8655 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Tue, 8 Dec 2020 15:15:17 -0800 Subject: [PATCH 05/29] Item seq is max of trans and lock write_seq Rename the item version to seq and set it to the max of the transaction seq and the lock's write_seq. This lets btree item merging choose a seq such that all dirty items written in future commits must have greater seqs. It can drop the seqs from items written to the fs tree during btree merging, knowing that there aren't any older items out in transactions that could be mistaken for newer items. Signed-off-by: Zach Brown --- kmod/src/forest.c | 39 ++++++++++++++++++------------------ kmod/src/forest.h | 8 ++++---- kmod/src/format.h | 4 ++-- kmod/src/item.c | 45 ++++++++++++++++++++++++++++-------------- kmod/src/lock_server.c | 1 + kmod/src/server.c | 8 ++++---- 6 files changed, 60 insertions(+), 45 deletions(-) diff --git a/kmod/src/forest.c b/kmod/src/forest.c index 9047c223..f88ac5e2 100644 --- a/kmod/src/forest.c +++ b/kmod/src/forest.c @@ -37,9 +37,9 @@ * * The log btrees are modified by multiple transactions over time so * there is no consistent ordering relationship between the items in - * different btrees.
Each item in a log btree stores a version number - * for the item. Readers check log btrees for the most recent version - * that it should use. + * different btrees. Each item in a log btree stores a seq for the + * item. Readers check log btrees for the most recent seq that it + * should use. * * The item cache reads items in bulk from stable btrees, and writes a * transaction's worth of dirty items into the item log btree. @@ -249,7 +249,7 @@ static int forest_read_items(struct super_block *sb, struct scoutfs_key *key, * If we hit stale blocks and retry we can call the callback for * duplicate items. This is harmless because the items are stable while * the caller holds their cluster lock and the caller has to filter out - * item versions anyway. + * item seqs anyway. */ int scoutfs_forest_read_items(struct super_block *sb, struct scoutfs_lock *lock, @@ -426,29 +426,29 @@ out: /* * The caller is commiting items in the transaction and has found the - * greatest item version amongst them. We store it in the log_trees root + * greatest item seq amongst them. We store it in the log_trees root * to send to the server. */ -void scoutfs_forest_set_max_vers(struct super_block *sb, u64 max_vers) +void scoutfs_forest_set_max_seq(struct super_block *sb, u64 max_seq) { DECLARE_FOREST_INFO(sb, finf); - finf->our_log.max_item_vers = cpu_to_le64(max_vers); + finf->our_log.max_item_seq = cpu_to_le64(max_seq); } /* - * The server is calling during setup to find the greatest item version + * The server is calling during setup to find the greatest item seq * amongst all the log tree roots. They have the authoritative current * super. * - * Item versions are only used to compare items in log trees, not in the - * main fs tree. All we have to do is find the greatest version amongst - * the log_trees so that new locks will have a write_version greater - * than all the items in the log_trees. + * Item seqs are only used to compare items in log trees, not in the + * main fs tree. All we have to do is find the greatest seq amongst the + * log_trees so that the core seq will have a greater seq than all the + * items in the log_trees. 
*/ -int scoutfs_forest_get_max_vers(struct super_block *sb, - struct scoutfs_super_block *super, - u64 *vers) +int scoutfs_forest_get_max_seq(struct super_block *sb, + struct scoutfs_super_block *super, + u64 *seq) { struct scoutfs_log_trees *lt; SCOUTFS_BTREE_ITEM_REF(iref); @@ -456,7 +456,7 @@ int scoutfs_forest_get_max_vers(struct super_block *sb, int ret; scoutfs_key_init_log_trees(<k, 0, 0); - *vers = 0; + *seq = 0; for (;; scoutfs_key_inc(<k)) { ret = scoutfs_btree_next(sb, &super->logs_root, <k, &iref); @@ -464,8 +464,7 @@ int scoutfs_forest_get_max_vers(struct super_block *sb, if (iref.val_len == sizeof(struct scoutfs_log_trees)) { ltk = *iref.key; lt = iref.val; - *vers = max(*vers, - le64_to_cpu(lt->max_item_vers)); + *seq = max(*seq, le64_to_cpu(lt->max_item_seq)); } else { ret = -EIO; } @@ -534,7 +533,7 @@ void scoutfs_forest_init_btrees(struct super_block *sb, memset(&finf->our_log, 0, sizeof(finf->our_log)); finf->our_log.item_root = lt->item_root; finf->our_log.bloom_ref = lt->bloom_ref; - finf->our_log.max_item_vers = lt->max_item_vers; + finf->our_log.max_item_seq = lt->max_item_seq; finf->our_log.rid = lt->rid; finf->our_log.nr = lt->nr; finf->srch_file = lt->srch_file; @@ -564,7 +563,7 @@ void scoutfs_forest_get_btrees(struct super_block *sb, lt->item_root = finf->our_log.item_root; lt->bloom_ref = finf->our_log.bloom_ref; lt->srch_file = finf->srch_file; - lt->max_item_vers = finf->our_log.max_item_vers; + lt->max_item_seq = finf->our_log.max_item_seq; scoutfs_block_put(sb, finf->srch_bl); finf->srch_bl = NULL; diff --git a/kmod/src/forest.h b/kmod/src/forest.h index b73ea7a4..3ca50670 100644 --- a/kmod/src/forest.h +++ b/kmod/src/forest.h @@ -23,10 +23,10 @@ int scoutfs_forest_read_items(struct super_block *sb, scoutfs_forest_item_cb cb, void *arg); int scoutfs_forest_set_bloom_bits(struct super_block *sb, struct scoutfs_lock *lock); -void scoutfs_forest_set_max_vers(struct super_block *sb, u64 max_vers); -int scoutfs_forest_get_max_vers(struct super_block *sb, - struct scoutfs_super_block *super, - u64 *vers); +void scoutfs_forest_set_max_seq(struct super_block *sb, u64 max_seq); +int scoutfs_forest_get_max_seq(struct super_block *sb, + struct scoutfs_super_block *super, + u64 *seq); int scoutfs_forest_insert_list(struct super_block *sb, struct scoutfs_btree_item_list *lst); int scoutfs_forest_srch_add(struct super_block *sb, u64 hash, u64 ino, u64 id); diff --git a/kmod/src/format.h b/kmod/src/format.h index 585cb91f..156732d4 100644 --- a/kmod/src/format.h +++ b/kmod/src/format.h @@ -449,13 +449,13 @@ struct scoutfs_log_trees { struct scoutfs_srch_file srch_file; __le64 data_alloc_zone_blocks; __le64 data_alloc_zones[SCOUTFS_DATA_ALLOC_ZONE_LE64S]; - __le64 max_item_vers; + __le64 max_item_seq; __le64 rid; __le64 nr; }; struct scoutfs_log_item_value { - __le64 vers; + __le64 seq; __u8 flags; __u8 __pad[7]; __u8 data[]; diff --git a/kmod/src/item.c b/kmod/src/item.c index d2e0f566..9fb08463 100644 --- a/kmod/src/item.c +++ b/kmod/src/item.c @@ -1308,10 +1308,10 @@ static struct active_reader *active_rbtree_walk(struct rb_root *root, * on our root and aren't in dirty or lru lists. * * We need to store deletion items here as we read items from all the - * btrees so that they can override older versions of the items. The - * deletion items will be deleted before we insert the pages into the - * cache. We don't insert old versions of items into the tree here so - * that the trees don't have to compare versions. + * btrees so that they can override older items. 
The deletion items + * will be deleted before we insert the pages into the cache. We don't + * insert old versions of items into the tree here so that the trees + * don't have to compare seqs. */ static int read_page_item(struct super_block *sb, struct scoutfs_key *key, struct scoutfs_log_item_value *liv, void *val, @@ -1331,7 +1331,7 @@ static int read_page_item(struct super_block *sb, struct scoutfs_key *key, pg = page_rbtree_walk(sb, root, key, key, NULL, NULL, &p_par, &p_pnode); found = item_rbtree_walk(&pg->item_root, key, NULL, &par, &pnode); - if (found && (le64_to_cpu(found->liv.vers) >= le64_to_cpu(liv->vers))) + if (found && (le64_to_cpu(found->liv.seq) >= le64_to_cpu(liv->seq))) return 0; if (!page_has_room(pg, val_len)) { @@ -1783,6 +1783,21 @@ out: return ret; } +/* + * An item's seq is greater of the client transaction's seq and the + * lock's write_seq. This ensures that multiple commits in one lock + * grant will have increasing seqs, and new locks in open commits will + * also increase the seqs. It lets us limit the inputs of item merging + * to the last stable seq and ensure that all the items in open + * transactions and granted locks will have greater seqs. + */ +static __le64 item_seq(struct super_block *sb, struct scoutfs_lock *lock) +{ + struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); + + return cpu_to_le64(max(sbi->trans_seq, lock->write_seq)); +} + /* * Mark the item dirty. Dirtying while holding a transaction pins the * page holding the item and guarantees that the item can be deleted or @@ -1816,7 +1831,7 @@ int scoutfs_item_dirty(struct super_block *sb, struct scoutfs_key *key, ret = -ENOENT; } else { mark_item_dirty(sb, cinf, pg, NULL, item); - item->liv.vers = cpu_to_le64(lock->write_seq); + item->liv.seq = item_seq(sb, lock); ret = 0; } @@ -1836,7 +1851,7 @@ static int item_create(struct super_block *sb, struct scoutfs_key *key, { DECLARE_ITEM_CACHE_INFO(sb, cinf); struct scoutfs_log_item_value liv = { - .vers = cpu_to_le64(lock->write_seq), + .seq = item_seq(sb, lock), }; struct cached_item *found; struct cached_item *item; @@ -1911,7 +1926,7 @@ int scoutfs_item_update(struct super_block *sb, struct scoutfs_key *key, { DECLARE_ITEM_CACHE_INFO(sb, cinf); struct scoutfs_log_item_value liv = { - .vers = cpu_to_le64(lock->write_seq), + .seq = item_seq(sb, lock), }; struct cached_item *item; struct cached_item *found; @@ -1946,7 +1961,7 @@ int scoutfs_item_update(struct super_block *sb, struct scoutfs_key *key, if (val_len < found->val_len) pg->erased_bytes += found->val_len - val_len; found->val_len = val_len; - found->liv.vers = liv.vers; + found->liv.seq = liv.seq; mark_item_dirty(sb, cinf, pg, NULL, found); } else { item = alloc_item(pg, key, &liv, val, val_len); @@ -1978,7 +1993,7 @@ static int item_delete(struct super_block *sb, struct scoutfs_key *key, { DECLARE_ITEM_CACHE_INFO(sb, cinf); struct scoutfs_log_item_value liv = { - .vers = cpu_to_le64(lock->write_seq), + .seq = item_seq(sb, lock), }; struct cached_item *item; struct cached_page *pg; @@ -2020,7 +2035,7 @@ static int item_delete(struct super_block *sb, struct scoutfs_key *key, erase_item(pg, item); } else { /* must emit deletion to clobber old persistent item */ - item->liv.vers = cpu_to_le64(lock->write_seq); + item->liv.seq = liv.seq; item->liv.flags |= SCOUTFS_LOG_ITEM_FLAG_DELETION; item->deletion = 1; pg->erased_bytes += item->val_len; @@ -2106,7 +2121,7 @@ int scoutfs_item_write_dirty(struct super_block *sb) struct page *page; LIST_HEAD(pages); LIST_HEAD(pos); - u64 max_vers = 0; + u64 
max_seq = 0; int val_len; int bytes; int off; @@ -2171,7 +2186,7 @@ int scoutfs_item_write_dirty(struct super_block *sb) val_len = sizeof(item->liv) + item->val_len; bytes = offsetof(struct scoutfs_btree_item_list, val[val_len]); - max_vers = max(max_vers, le64_to_cpu(item->liv.vers)); + max_seq = max(max_seq, le64_to_cpu(item->liv.seq)); if (off + bytes > PAGE_SIZE) { page = second; @@ -2201,8 +2216,8 @@ int scoutfs_item_write_dirty(struct super_block *sb) read_unlock(&pg->rwlock); } - /* store max item vers in forest's log_trees */ - scoutfs_forest_set_max_vers(sb, max_vers); + /* store max item seq in forest's log_trees */ + scoutfs_forest_set_max_seq(sb, max_seq); /* write all the dirty items into log btree blocks */ ret = scoutfs_forest_insert_list(sb, first); diff --git a/kmod/src/lock_server.c b/kmod/src/lock_server.c index 3012d5d7..5a3a0cd7 100644 --- a/kmod/src/lock_server.c +++ b/kmod/src/lock_server.c @@ -518,6 +518,7 @@ static int process_waiting_requests(struct super_block *sb, nl.key = snode->key; nl.new_mode = req->mode; + nl.write_seq = 0; /* see if there's an existing compatible grant to replace */ gr = find_entry(snode, &snode->granted, req->rid); diff --git a/kmod/src/server.c b/kmod/src/server.c index 0dafff15..76c45abf 100644 --- a/kmod/src/server.c +++ b/kmod/src/server.c @@ -2250,7 +2250,7 @@ static void scoutfs_server_worker(struct work_struct *work) struct scoutfs_net_connection *conn = NULL; DECLARE_WAIT_QUEUE_HEAD(waitq); struct sockaddr_in sin; - u64 max_vers; + u64 max_seq; int ret; trace_scoutfs_server_work_enter(sb, 0, 0); @@ -2314,12 +2314,12 @@ static void scoutfs_server_worker(struct work_struct *work) le64_to_cpu(server->meta_avail->total_len)) swap(server->meta_avail, server->meta_freed); - ret = scoutfs_forest_get_max_vers(sb, super, &max_vers); + ret = scoutfs_forest_get_max_seq(sb, super, &max_seq); if (ret) { - scoutfs_err(sb, "server couldn't find max item vers: %d", ret); + scoutfs_err(sb, "server couldn't find max item seq: %d", ret); goto shutdown; } - scoutfs_server_set_seq_if_greater(sb, max_vers); + scoutfs_server_set_seq_if_greater(sb, max_seq); ret = scoutfs_lock_server_setup(sb, &server->alloc, &server->wri) ?: start_recovery(sb); From d7f8896fac58b4372b90ceb045ab3a07bdfc7866 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Tue, 8 Dec 2020 15:31:14 -0800 Subject: [PATCH 06/29] Add scoutfs_btree_parent_range Add a btree helper for finding the range of keys which are found in leaves referenced by the last parent block when searching for a given key. 
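A sketch of how a hypothetical caller might use the helper, with the signature added below (the surrounding variables are illustrative):

    struct scoutfs_key start;
    struct scoutfs_key end;
    int ret;

    /* bound the keys reachable through the last parent on the way to key */
    ret = scoutfs_btree_parent_range(sb, &super->fs_root, &key, &start, &end);
    if (ret == 0) {
            /* every leaf item under that parent falls within [start, end] */
    }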
Signed-off-by: Zach Brown --- kmod/src/btree.c | 36 +++++++++++++++++++++++++++++++++++- kmod/src/btree.h | 6 ++++++ 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/kmod/src/btree.c b/kmod/src/btree.c index b9b02696..f9e6795f 100644 --- a/kmod/src/btree.c +++ b/kmod/src/btree.c @@ -83,6 +83,7 @@ enum btree_walk_flags { BTW_ALLOC = (1 << 3), /* allocate a new block for 0 ref, requires dirty */ BTW_INSERT = (1 << 4), /* walking to insert, try splitting */ BTW_DELETE = (1 << 5), /* walking to delete, try joining */ + BTW_PAR_RNG = (1 << 6), /* return range through final parent */ }; /* total length of the value payload */ @@ -1098,7 +1099,8 @@ static int btree_walk(struct super_block *sb, unsigned int nr; int ret; - if (WARN_ON_ONCE((flags & BTW_DIRTY) && (!alloc || !wri))) + if (WARN_ON_ONCE((flags & BTW_DIRTY) && (!alloc || !wri)) || + WARN_ON_ONCE((flags & BTW_PAR_RNG) && !kr)) return -EINVAL; /* all ops come through walk and walk calls all reads */ @@ -1144,6 +1146,12 @@ restart: trace_scoutfs_btree_walk(sb, root, key, flags, level, ref); + /* par range set by ref to last parent block */ + if (level < 2 && (flags & BTW_PAR_RNG)) { + ret = 0; + break; + } + ret = get_ref_block(sb, alloc, wri, flags, ref, &bl); if (ret) break; @@ -1742,3 +1750,29 @@ int scoutfs_btree_insert_list(struct super_block *sb, out: return ret; } + +/* + * Descend towards the leaf that would contain the key. As we arrive at + * the last parent block, set start and end to the range of keys that + * could be found through traversal of that last parent. + * + * If the tree is too short for parent blocks then the max key range + * is returned. + */ +int scoutfs_btree_parent_range(struct super_block *sb, + struct scoutfs_btree_root *root, + struct scoutfs_key *key, + struct scoutfs_key *start, + struct scoutfs_key *end) +{ + struct btree_walk_key_range kr; + int ret; + + ret = btree_walk(sb, NULL, NULL, root, BTW_PAR_RNG, key, 0, NULL, &kr); + if (ret == -ENOENT) + ret = 0; + + *start = kr.start; + *end = kr.end; + return ret; +} diff --git a/kmod/src/btree.h b/kmod/src/btree.h index 79d4de58..697f34c3 100644 --- a/kmod/src/btree.h +++ b/kmod/src/btree.h @@ -82,6 +82,12 @@ int scoutfs_btree_insert_list(struct super_block *sb, struct scoutfs_btree_root *root, struct scoutfs_btree_item_list *lst); +int scoutfs_btree_parent_range(struct super_block *sb, + struct scoutfs_btree_root *root, + struct scoutfs_key *key, + struct scoutfs_key *start, + struct scoutfs_key *end); + void scoutfs_btree_put_iref(struct scoutfs_btree_item_ref *iref); #endif From b6d0a45f6da5d477a3d08e38a8df7c19c943b086 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Wed, 9 Dec 2020 09:49:06 -0800 Subject: [PATCH 07/29] Add btree_{get,set}_parent Add calls for working with subtrees built around references to blocks in the last level of parents. This will let the server farm out btree merging work where concurrency is built around safely working with all the items and leaves that fall under a given parent block. 
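A sketch of the round trip a server worker might make, assuming it modifies the subtree off to the side before splicing it back (the surrounding variables are hypothetical):

    struct scoutfs_btree_root subtree;
    int ret;

    /* take the subtree rooted at the last parent on the way to key */
    ret = scoutfs_btree_get_parent(sb, &super->fs_root, &key, &subtree);
    if (ret == 0) {
            /* ... work on items under subtree with a private allocator ... */

            /* splice the possibly rewritten subtree back into the fs tree */
            ret = scoutfs_btree_set_parent(sb, alloc, wri, &super->fs_root,
                                           &key, &subtree);
    }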
Signed-off-by: Zach Brown --- kmod/src/btree.c | 105 ++++++++++++++++++++++++++++++++++----- kmod/src/btree.h | 10 ++++ kmod/src/scoutfs_trace.h | 36 ++++++++++++++ 3 files changed, 138 insertions(+), 13 deletions(-) diff --git a/kmod/src/btree.c b/kmod/src/btree.c index f9e6795f..1b83c498 100644 --- a/kmod/src/btree.c +++ b/kmod/src/btree.c @@ -84,6 +84,8 @@ enum btree_walk_flags { BTW_INSERT = (1 << 4), /* walking to insert, try splitting */ BTW_DELETE = (1 << 5), /* walking to delete, try joining */ BTW_PAR_RNG = (1 << 6), /* return range through final parent */ + BTW_GET_PAR = (1 << 7), /* get reference to final parent */ + BTW_SET_PAR = (1 << 8), /* override reference to final parent */ }; /* total length of the value payload */ @@ -1083,7 +1085,8 @@ static int btree_walk(struct super_block *sb, int flags, struct scoutfs_key *key, unsigned int val_len, struct scoutfs_block **bl_ret, - struct btree_walk_key_range *kr) + struct btree_walk_key_range *kr, + struct scoutfs_btree_root *par_root) { struct scoutfs_block *par_bl = NULL; struct scoutfs_block *bl = NULL; @@ -1100,7 +1103,8 @@ static int btree_walk(struct super_block *sb, int ret; if (WARN_ON_ONCE((flags & BTW_DIRTY) && (!alloc || !wri)) || - WARN_ON_ONCE((flags & BTW_PAR_RNG) && !kr)) + WARN_ON_ONCE((flags & BTW_PAR_RNG) && !kr) || + WARN_ON_ONCE((flags & (BTW_GET_PAR|BTW_SET_PAR)) && !par_root)) return -EINVAL; /* all ops come through walk and walk calls all reads */ @@ -1127,7 +1131,14 @@ restart: ret = 0; if (!root->height) { - if (!(flags & BTW_INSERT)) { + if (flags & BTW_GET_PAR) { + memset(par_root, 0, sizeof(*par_root)); + *root = *par_root; + ret = 0; + } else if (flags & BTW_SET_PAR) { + *root = *par_root; + ret = 0; + } else if (!(flags & BTW_INSERT)) { ret = -ENOENT; } else { ret = get_ref_block(sb, alloc, wri, BTW_ALLOC | BTW_DIRTY, &root->ref, &bl); @@ -1152,6 +1163,29 @@ restart: break; } + if (level < 2 && (flags & BTW_GET_PAR)) { + par_root->ref = *ref; + par_root->height = level + 1; + ret = 0; + break; + } + + if (level < 2 && (flags & BTW_SET_PAR)) { + if (ref == &root->ref) { + /* single parent block is replaced, can shrink/grow */ + *root = *par_root; + } else { + /* subtree replacing one of parents must match height */ + if (par_root->height != level + 1) { + ret = -EINVAL; + break; + } + *ref = par_root->ref; + } + ret = 0; + break; + } + ret = get_ref_block(sb, alloc, wri, flags, ref, &bl); if (ret) break; @@ -1300,7 +1334,7 @@ int scoutfs_btree_lookup(struct super_block *sb, if (WARN_ON_ONCE(iref->key)) return -EINVAL; - ret = btree_walk(sb, NULL, NULL, root, 0, key, 0, &bl, NULL); + ret = btree_walk(sb, NULL, NULL, root, 0, key, 0, &bl, NULL, NULL); if (ret == 0) { bt = bl->data; @@ -1352,7 +1386,7 @@ int scoutfs_btree_insert(struct super_block *sb, return -EINVAL; ret = btree_walk(sb, alloc, wri, root, BTW_DIRTY | BTW_INSERT, key, - val_len, &bl, NULL); + val_len, &bl, NULL, NULL); if (ret == 0) { bt = bl->data; @@ -1414,7 +1448,7 @@ int scoutfs_btree_update(struct super_block *sb, return -EINVAL; ret = btree_walk(sb, alloc, wri, root, BTW_DIRTY | BTW_INSERT, key, - val_len, &bl, NULL); + val_len, &bl, NULL, NULL); if (ret == 0) { bt = bl->data; @@ -1456,7 +1490,7 @@ int scoutfs_btree_force(struct super_block *sb, return -EINVAL; ret = btree_walk(sb, alloc, wri, root, BTW_DIRTY | BTW_INSERT, key, - val_len, &bl, NULL); + val_len, &bl, NULL, NULL); if (ret == 0) { bt = bl->data; @@ -1494,7 +1528,7 @@ int scoutfs_btree_delete(struct super_block *sb, scoutfs_inc_counter(sb, btree_delete); ret = 
btree_walk(sb, alloc, wri, root, BTW_DELETE | BTW_DIRTY, key, - 0, &bl, NULL); + 0, &bl, NULL, NULL); if (ret == 0) { bt = bl->data; @@ -1558,7 +1592,7 @@ static int btree_iter(struct super_block *sb,struct scoutfs_btree_root *root, for (;;) { ret = btree_walk(sb, NULL, NULL, root, flags, &walk_key, - 0, &bl, &kr); + 0, &bl, &kr, NULL); if (ret < 0) break; bt = bl->data; @@ -1631,7 +1665,8 @@ int scoutfs_btree_dirty(struct super_block *sb, scoutfs_inc_counter(sb, btree_dirty); - ret = btree_walk(sb, alloc, wri, root, BTW_DIRTY, key, 0, &bl, NULL); + ret = btree_walk(sb, alloc, wri, root, BTW_DIRTY, key, 0, &bl, + NULL, NULL); if (ret == 0) { bt = bl->data; @@ -1667,7 +1702,7 @@ int scoutfs_btree_read_items(struct super_block *sb, struct scoutfs_block *bl; int ret; - ret = btree_walk(sb, NULL, NULL, root, 0, key, 0, &bl, &kr); + ret = btree_walk(sb, NULL, NULL, root, 0, key, 0, &bl, &kr, NULL); if (ret < 0) goto out; bt = bl->data; @@ -1722,7 +1757,7 @@ int scoutfs_btree_insert_list(struct super_block *sb, while (lst) { ret = btree_walk(sb, alloc, wri, root, BTW_DIRTY | BTW_INSERT, - &lst->key, lst->val_len, &bl, &kr); + &lst->key, lst->val_len, &bl, &kr, NULL); if (ret < 0) goto out; bt = bl->data; @@ -1768,7 +1803,8 @@ int scoutfs_btree_parent_range(struct super_block *sb, struct btree_walk_key_range kr; int ret; - ret = btree_walk(sb, NULL, NULL, root, BTW_PAR_RNG, key, 0, NULL, &kr); + ret = btree_walk(sb, NULL, NULL, root, BTW_PAR_RNG, key, 0, NULL, + &kr, NULL); if (ret == -ENOENT) ret = 0; @@ -1776,3 +1812,46 @@ int scoutfs_btree_parent_range(struct super_block *sb, *end = kr.end; return ret; } + +/* + * Initialize the caller's root as a subtree whose ref points to the + * last parent found as we traverse towards the leaf containing the key. + * If the tree is too small to have multiple blocks at the final parent + * level then the caller's root will be initialized to equal full input + * root. If the tree is empty then the par root will also be empty. + */ +int scoutfs_btree_get_parent(struct super_block *sb, + struct scoutfs_btree_root *root, + struct scoutfs_key *key, + struct scoutfs_btree_root *par_root) +{ + return btree_walk(sb, NULL, NULL, root, BTW_GET_PAR, key, 0, NULL, + NULL, par_root); +} + +/* + * Dirty a path towards the leaf block containing the key. As we reach + * the reference to the final parent block override it with the ref in + * the caller's block. If the tree only has a single block at the final + * parent level, or a single leaf block, then the entire tree is + * replaced with the caller's root. + * + * This manages allocs and frees while dirtying blocks in the path to + * the ref, but it doesn't account for allocating the blocks that are + * referenced by the ref nor freeing blocks referenced by the old ref + * that's overwritten. Keeping allocators in sync with the result of + * the ref override is the responsibility of the caller. 
+ */ +int scoutfs_btree_set_parent(struct super_block *sb, + struct scoutfs_alloc *alloc, + struct scoutfs_block_writer *wri, + struct scoutfs_btree_root *root, + struct scoutfs_key *key, + struct scoutfs_btree_root *par_root) +{ + + trace_scoutfs_btree_set_parent(sb, root, key, par_root); + + return btree_walk(sb, alloc, wri, root, BTW_DIRTY | BTW_SET_PAR, + key, 0, NULL, NULL, par_root); +} diff --git a/kmod/src/btree.h b/kmod/src/btree.h index 697f34c3..7f906a84 100644 --- a/kmod/src/btree.h +++ b/kmod/src/btree.h @@ -87,6 +87,16 @@ int scoutfs_btree_parent_range(struct super_block *sb, struct scoutfs_key *key, struct scoutfs_key *start, struct scoutfs_key *end); +int scoutfs_btree_get_parent(struct super_block *sb, + struct scoutfs_btree_root *root, + struct scoutfs_key *key, + struct scoutfs_btree_root *par_root); +int scoutfs_btree_set_parent(struct super_block *sb, + struct scoutfs_alloc *alloc, + struct scoutfs_block_writer *wri, + struct scoutfs_btree_root *root, + struct scoutfs_key *key, + struct scoutfs_btree_root *par_root); void scoutfs_btree_put_iref(struct scoutfs_btree_item_ref *iref); diff --git a/kmod/src/scoutfs_trace.h b/kmod/src/scoutfs_trace.h index 7dce85f0..e0814d0c 100644 --- a/kmod/src/scoutfs_trace.h +++ b/kmod/src/scoutfs_trace.h @@ -1644,6 +1644,42 @@ TRACE_EVENT(scoutfs_btree_walk, __entry->level, __entry->ref_blkno, __entry->ref_seq) ); +TRACE_EVENT(scoutfs_btree_set_parent, + TP_PROTO(struct super_block *sb, + struct scoutfs_btree_root *root, struct scoutfs_key *key, + struct scoutfs_btree_root *par_root), + + TP_ARGS(sb, root, key, par_root), + + TP_STRUCT__entry( + SCSB_TRACE_FIELDS + __field(__u64, root_blkno) + __field(__u64, root_seq) + __field(__u8, root_height) + sk_trace_define(key) + __field(__u64, par_root_blkno) + __field(__u64, par_root_seq) + __field(__u8, par_root_height) + ), + + TP_fast_assign( + SCSB_TRACE_ASSIGN(sb); + __entry->root_blkno = le64_to_cpu(root->ref.blkno); + __entry->root_seq = le64_to_cpu(root->ref.seq); + __entry->root_height = root->height; + sk_trace_assign(key, key); + __entry->par_root_blkno = le64_to_cpu(par_root->ref.blkno); + __entry->par_root_seq = le64_to_cpu(par_root->ref.seq); + __entry->par_root_height = par_root->height; + ), + + TP_printk(SCSBF" root blkno %llu seq %llu height %u, key "SK_FMT", par_root blkno %llu seq %llu height %u", + SCSB_TRACE_ARGS, __entry->root_blkno, __entry->root_seq, + __entry->root_height, sk_trace_args(key), + __entry->par_root_blkno, __entry->par_root_seq, + __entry->par_root_height) +); + TRACE_EVENT(scoutfs_online_offline_blocks, TP_PROTO(struct inode *inode, s64 on_delta, s64 off_delta, u64 on_now, u64 off_now), From 3a03a6a20cc2691a0485c1b710c889f09d28492b Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Wed, 9 Dec 2020 14:09:52 -0800 Subject: [PATCH 08/29] Add SUBTREE btree walk flag to restrict join/merge Add a BTW_SUBTREE flag to btree_walk() to restrict splitting or joining of the root block. When clients are merging into the root built from a reference to the last parent in the fs tree we want to be careful that we maintain a single root block that can be spliced back into the fs tree. We specifically check that the root block remain within the split/join thresholds. If it falls out of compliance we return an error so that it can be spliced back into the fs tree and then split/joined with its siblings. 
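A sketch of the recovery a caller might perform; merge_into_subtree() is a hypothetical stand-in for whatever operation walked the subtree with BTW_SUBTREE, and set_parent comes from the previous patch:

    ret = merge_into_subtree(sb, alloc, wri, &subtree, &key); /* hypothetical */
    if (ret == -ERANGE) {
            /* splice the subtree back so the full fs tree can split/join it */
            ret = scoutfs_btree_set_parent(sb, alloc, wri, &fs_root,
                                           &key, &subtree);
    }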
Signed-off-by: Zach Brown --- kmod/src/btree.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/kmod/src/btree.c b/kmod/src/btree.c index 1b83c498..603f4110 100644 --- a/kmod/src/btree.c +++ b/kmod/src/btree.c @@ -86,6 +86,7 @@ enum btree_walk_flags { BTW_PAR_RNG = (1 << 6), /* return range through final parent */ BTW_GET_PAR = (1 << 7), /* get reference to final parent */ BTW_SET_PAR = (1 << 8), /* override reference to final parent */ + BTW_SUBTREE = (1 << 9), /* root is parent subtree, return -ERANGE if split/join */ }; /* total length of the value payload */ @@ -1209,6 +1210,17 @@ restart: break; } + /* + * join/split won't check subtree parent root, let + * caller know when it needs to be split/joined. + */ + if ((flags & BTW_SUBTREE) && level == 1 && + (!total_above_join_low_water(bt) || + !mid_free_item_room(bt, sizeof(struct scoutfs_block_ref)))) { + ret = -ERANGE; + break; + } + /* * Splitting and joining can add or remove parents or * change the parent item we use to reach the child From 0538c882bcb63088be1498e0901719c2205e3fb8 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Tue, 15 Jun 2021 15:15:57 -0700 Subject: [PATCH 09/29] Add btree_merge() Add a btree function for merging the items in a range from a number of read-only input btrees into a destination btree. Signed-off-by: Zach Brown --- kmod/src/btree.c | 282 +++++++++++++++++++++++++++++++++++++++ kmod/src/btree.h | 25 ++++ kmod/src/counters.h | 8 ++ kmod/src/scoutfs_trace.h | 80 +++++++++++ 4 files changed, 395 insertions(+) diff --git a/kmod/src/btree.c b/kmod/src/btree.c index 603f4110..03ad5b96 100644 --- a/kmod/src/btree.c +++ b/kmod/src/btree.c @@ -1867,3 +1867,285 @@ int scoutfs_btree_set_parent(struct super_block *sb, key, 0, NULL, NULL, par_root); } + +struct merge_pos { + struct rb_node node; + struct scoutfs_btree_root *root; + struct scoutfs_key key; + unsigned int val_len; + u8 val[SCOUTFS_BTREE_MAX_VAL_LEN]; +}; + +/* + * Find the next item in the mpos's root after its key and make sure + * that it's in its sorted position in the rbtree. We're responsible + * for freeing the mpos if we don't put it back in the pos_root. This + * happens naturally when its item_root has no more items to + * merge.
+ */ +static int reset_mpos(struct super_block *sb, struct rb_root *pos_root, + struct merge_pos *mpos, struct scoutfs_key *end, + scoutfs_btree_merge_cmp_t merge_cmp) +{ + SCOUTFS_BTREE_ITEM_REF(iref); + struct merge_pos *walk; + struct rb_node *parent; + struct rb_node **node; + int key_cmp; + int val_cmp; + int ret; + +restart: + if (!RB_EMPTY_NODE(&mpos->node)) { + rb_erase(&mpos->node, pos_root); + RB_CLEAR_NODE(&mpos->node); + } + + /* find the next item in the root within end */ + ret = scoutfs_btree_next(sb, mpos->root, &mpos->key, &iref); + if (ret == 0) { + if (scoutfs_key_compare(iref.key, end) > 0) { + ret = -ENOENT; + } else { + mpos->key = *iref.key; + mpos->val_len = iref.val_len; + memcpy(mpos->val, iref.val, iref.val_len); + } + scoutfs_btree_put_iref(&iref); + } + if (ret < 0) { + kfree(mpos); + if (ret == -ENOENT) + ret = 0; + goto out; + } + +rewalk: + /* sort merge items by key then oldest to newest */ + node = &pos_root->rb_node; + parent = NULL; + while (*node) { + parent = *node; + walk = container_of(*node, struct merge_pos, node); + + key_cmp = scoutfs_key_compare(&mpos->key, &walk->key); + val_cmp = merge_cmp(mpos->val, mpos->val_len, + walk->val, walk->val_len); + + /* drop old versions of logged keys as we discover them */ + if (key_cmp == 0) { + scoutfs_inc_counter(sb, btree_merge_drop_old); + if (val_cmp < 0) { + scoutfs_key_inc(&mpos->key); + goto restart; + } else { + BUG_ON(val_cmp == 0); + rb_erase(&walk->node, pos_root); + kfree(walk); + goto rewalk; + } + } + + if ((key_cmp ?: val_cmp) < 0) + node = &(*node)->rb_left; + else + node = &(*node)->rb_right; + } + + rb_link_node(&mpos->node, parent, node); + rb_insert_color(&mpos->node, pos_root); + ret = 0; +out: + return ret; +} + +static struct merge_pos *first_mpos(struct rb_root *root) +{ + struct rb_node *node = rb_first(root); + if (node) + return container_of(node, struct merge_pos, node); + return NULL; +} + +/* + * Merge items from a number of read-only input roots into a writable + * destination root. The order of the input roots doesn't matter, the + * items are merged in sorted key order. + * + * The merge_cmp callback determines the order that the input items are + * merged in. The is_del callback determines if a merging item should + * be removed from the destination. + * + * subtree indicates that the destination root is in fact one of many + * parent blocks and shouldn't be split or allowed to fall below the + * join low water mark. + * + * drop_val indicates the initial length of the value that should be + * dropped when merging items into destination items. + * + * -ERANGE is returned if the merge doesn't fully exhaust the range, due + * to allocators running low or needing to join/split the parent. + * *next_ret is set to the next key which hasn't been merged so that the + * caller can retry with a new allocator and subtree. 
+ */ +int scoutfs_btree_merge(struct super_block *sb, + struct scoutfs_alloc *alloc, + struct scoutfs_block_writer *wri, + struct scoutfs_key *start, + struct scoutfs_key *end, + struct scoutfs_key *next_ret, + struct scoutfs_btree_root *root, + struct list_head *inputs, + scoutfs_btree_merge_cmp_t merge_cmp, + scoutfs_btree_merge_is_del_t merge_is_del, bool subtree, + int drop_val, int dirty_limit, int alloc_low) +{ + struct scoutfs_btree_root_head *rhead; + struct rb_root pos_root = RB_ROOT; + struct scoutfs_btree_item *item; + struct scoutfs_btree_block *bt; + struct scoutfs_block *bl = NULL; + struct btree_walk_key_range kr; + struct scoutfs_avl_node *par; + struct merge_pos *mpos; + struct merge_pos *tmp; + int walk_val_len; + int walk_flags; + bool is_del; + int cmp; + int ret; + + trace_scoutfs_btree_merge(sb, root, start, end); + scoutfs_inc_counter(sb, btree_merge); + + list_for_each_entry(rhead, inputs, head) { + mpos = kmalloc(sizeof(*mpos), GFP_NOFS); + if (!mpos) { + ret = -ENOMEM; + goto out; + } + + RB_CLEAR_NODE(&mpos->node); + mpos->key = *start; + mpos->root = &rhead->root; + + ret = reset_mpos(sb, &pos_root, mpos, end, merge_cmp); + if (ret < 0) + goto out; + } + + walk_flags = BTW_DIRTY; + if (subtree) + walk_flags |= BTW_SUBTREE; + walk_val_len = 0; + + while ((mpos = first_mpos(&pos_root))) { + + if (scoutfs_block_writer_dirty_bytes(sb, wri) >= dirty_limit) { + scoutfs_inc_counter(sb, btree_merge_dirty_limit); + ret = -ERANGE; + *next_ret = mpos->key; + goto out; + } + + if (scoutfs_alloc_meta_low(sb, alloc, alloc_low)) { + scoutfs_inc_counter(sb, btree_merge_alloc_low); + ret = -ERANGE; + *next_ret = mpos->key; + goto out; + } + + scoutfs_block_put(sb, bl); + bl = NULL; + ret = btree_walk(sb, alloc, wri, root, walk_flags, + &mpos->key, walk_val_len, &bl, &kr, NULL); + if (ret < 0) { + if (ret == -ERANGE) + *next_ret = mpos->key; + goto out; + } + bt = bl->data; + scoutfs_inc_counter(sb, btree_merge_walk); + + for (; mpos; mpos = first_mpos(&pos_root)) { + + /* val must have at least what we need to drop */ + if (mpos->val_len < drop_val) { + ret = -EIO; + goto out; + } + + /* walk to new leaf if we exceed parent ref key */ + if (scoutfs_key_compare(&mpos->key, &kr.end) > 0) + break; + + /* see if there's an existing item */ + item = leaf_item_hash_search(sb, bt, &mpos->key); + is_del = merge_is_del(mpos->val, mpos->val_len); + + trace_scoutfs_btree_merge_items(sb, mpos->root, + &mpos->key, mpos->val_len, + item ? root : NULL, + item ? item_key(item) : NULL, + item ? 
item_val_len(item) : 0, is_del); + + /* rewalk and split if ins/update needs room */ + if (!is_del && !mid_free_item_room(bt, mpos->val_len)) { + walk_flags |= BTW_INSERT; + walk_val_len = mpos->val_len; + break; + } + + /* insert missing non-deletion merge items */ + if (!item && !is_del) { + scoutfs_avl_search(&bt->item_root, + cmp_key_item, &mpos->key, + &cmp, &par, NULL, NULL); + create_item(bt, &mpos->key, + mpos->val + drop_val, + mpos->val_len - drop_val, par, cmp); + scoutfs_inc_counter(sb, btree_merge_insert); + } + + /* update existing items */ + if (item && !is_del) { + update_item_value(bt, item, + mpos->val + drop_val, + mpos->val_len - drop_val); + scoutfs_inc_counter(sb, btree_merge_update); + } + + /* delete if merge item was deletion */ + if (item && is_del) { + /* rewalk and join if non-root falls under low water mark */ + if (root->ref.blkno != bt->hdr.blkno && + !total_above_join_low_water(bt)) { + walk_flags |= BTW_DELETE; + break; + } + delete_item(bt, item, NULL); + scoutfs_inc_counter(sb, btree_merge_delete); + } + + /* reset walk args now that we're not split/join */ + walk_flags &= ~(BTW_INSERT | BTW_DELETE); + walk_val_len = 0; + + /* finished with this merge item */ + scoutfs_key_inc(&mpos->key); + ret = reset_mpos(sb, &pos_root, mpos, end, merge_cmp); + if (ret < 0) + goto out; + mpos = NULL; + } + } + + ret = 0; +out: + scoutfs_block_put(sb, bl); + rbtree_postorder_for_each_entry_safe(mpos, tmp, &pos_root, node) { + kfree(mpos); + } + + return ret; +} diff --git a/kmod/src/btree.h b/kmod/src/btree.h index 7f906a84..02228290 100644 --- a/kmod/src/btree.h +++ b/kmod/src/btree.h @@ -98,6 +98,31 @@ int scoutfs_btree_set_parent(struct super_block *sb, struct scoutfs_key *key, struct scoutfs_btree_root *par_root); +/* merge input is a list of roots */ +struct scoutfs_btree_root_head { + struct list_head head; + struct scoutfs_btree_root root; +}; +/* + * Compare the values of merge input items whose keys are equal to + * determine their merge order. 
+ */ +typedef int (*scoutfs_btree_merge_cmp_t)(void *a_val, int a_val_len, + void *b_val, int b_val_len); +/* whether merging item should be removed from destination */ +typedef bool (*scoutfs_btree_merge_is_del_t)(void *val, int val_len); +int scoutfs_btree_merge(struct super_block *sb, + struct scoutfs_alloc *alloc, + struct scoutfs_block_writer *wri, + struct scoutfs_key *start, + struct scoutfs_key *end, + struct scoutfs_key *next_ret, + struct scoutfs_btree_root *root, + struct list_head *input_list, + scoutfs_btree_merge_cmp_t merge_cmp, + scoutfs_btree_merge_is_del_t merge_is_del, bool subtree, + int drop_val, int dirty_limit, int alloc_low); + void scoutfs_btree_put_iref(struct scoutfs_btree_item_ref *iref); #endif diff --git a/kmod/src/counters.h b/kmod/src/counters.h index 7cb5a331..9e9e9f5e 100644 --- a/kmod/src/counters.h +++ b/kmod/src/counters.h @@ -44,6 +44,14 @@ EXPAND_COUNTER(btree_insert) \ EXPAND_COUNTER(btree_leaf_item_hash_search) \ EXPAND_COUNTER(btree_lookup) \ + EXPAND_COUNTER(btree_merge) \ + EXPAND_COUNTER(btree_merge_alloc_low) \ + EXPAND_COUNTER(btree_merge_delete) \ + EXPAND_COUNTER(btree_merge_dirty_limit) \ + EXPAND_COUNTER(btree_merge_drop_old) \ + EXPAND_COUNTER(btree_merge_insert) \ + EXPAND_COUNTER(btree_merge_update) \ + EXPAND_COUNTER(btree_merge_walk) \ EXPAND_COUNTER(btree_next) \ EXPAND_COUNTER(btree_prev) \ EXPAND_COUNTER(btree_split) \ diff --git a/kmod/src/scoutfs_trace.h b/kmod/src/scoutfs_trace.h index e0814d0c..2be1014a 100644 --- a/kmod/src/scoutfs_trace.h +++ b/kmod/src/scoutfs_trace.h @@ -1680,6 +1680,86 @@ TRACE_EVENT(scoutfs_btree_set_parent, __entry->par_root_height) ); +TRACE_EVENT(scoutfs_btree_merge, + TP_PROTO(struct super_block *sb, struct scoutfs_btree_root *root, + struct scoutfs_key *start, struct scoutfs_key *end), + + TP_ARGS(sb, root, start, end), + + TP_STRUCT__entry( + SCSB_TRACE_FIELDS + __field(__u64, root_blkno) + __field(__u64, root_seq) + __field(__u8, root_height) + sk_trace_define(start) + sk_trace_define(end) + ), + + TP_fast_assign( + SCSB_TRACE_ASSIGN(sb); + __entry->root_blkno = le64_to_cpu(root->ref.blkno); + __entry->root_seq = le64_to_cpu(root->ref.seq); + __entry->root_height = root->height; + sk_trace_assign(start, start); + sk_trace_assign(end, end); + ), + + TP_printk(SCSBF" root blkno %llu seq %llu height %u start "SK_FMT" end "SK_FMT, + SCSB_TRACE_ARGS, __entry->root_blkno, __entry->root_seq, + __entry->root_height, sk_trace_args(start), + sk_trace_args(end)) +); + +TRACE_EVENT(scoutfs_btree_merge_items, + TP_PROTO(struct super_block *sb, + struct scoutfs_btree_root *m_root, + struct scoutfs_key *m_key, int m_val_len, + struct scoutfs_btree_root *f_root, + struct scoutfs_key *f_key, int f_val_len, + int is_del), + + TP_ARGS(sb, m_root, m_key, m_val_len, f_root, f_key, f_val_len, is_del), + + TP_STRUCT__entry( + SCSB_TRACE_FIELDS + __field(__u64, m_root_blkno) + __field(__u64, m_root_seq) + __field(__u8, m_root_height) + sk_trace_define(m_key) + __field(int, m_val_len) + __field(__u64, f_root_blkno) + __field(__u64, f_root_seq) + __field(__u8, f_root_height) + sk_trace_define(f_key) + __field(int, f_val_len) + __field(int, is_del) + ), + + TP_fast_assign( + SCSB_TRACE_ASSIGN(sb); + __entry->m_root_blkno = m_root ? + le64_to_cpu(m_root->ref.blkno) : 0; + __entry->m_root_seq = m_root ? le64_to_cpu(m_root->ref.seq) : 0; + __entry->m_root_height = m_root ? m_root->height : 0; + sk_trace_assign(m_key, m_key); + __entry->m_val_len = m_val_len; + __entry->f_root_blkno = f_root ? 
+				le64_to_cpu(f_root->ref.blkno) : 0;
+		__entry->f_root_seq = f_root ? le64_to_cpu(f_root->ref.seq) : 0;
+		__entry->f_root_height = f_root ? f_root->height : 0;
+		sk_trace_assign(f_key, f_key);
+		__entry->f_val_len = f_val_len;
+		__entry->is_del = !!is_del;
+	),
+
+	TP_printk(SCSBF" merge item root blkno %llu seq %llu height %u key "SK_FMT" val_len %d, fs item root blkno %llu seq %llu height %u key "SK_FMT" val_len %d, is_del %d",
+		  SCSB_TRACE_ARGS, __entry->m_root_blkno, __entry->m_root_seq,
+		  __entry->m_root_height, sk_trace_args(m_key),
+		  __entry->m_val_len, __entry->f_root_blkno,
+		  __entry->f_root_seq, __entry->f_root_height,
+		  sk_trace_args(f_key), __entry->f_val_len, __entry->is_del)
+);
+
 TRACE_EVENT(scoutfs_online_offline_blocks,
 	TP_PROTO(struct inode *inode, s64 on_delta, s64 off_delta,
 		 u64 on_now, u64 off_now),

From d8478ed6f131b6cfa93a22aa3bb3ace3f1dd3ae7 Mon Sep 17 00:00:00 2001
From: Zach Brown
Date: Fri, 18 Dec 2020 09:24:27 -0800
Subject: [PATCH 10/29] Add scoutfs_btree_rebalance()

Add a btree call that just dirties the path down to a leaf block,
joining and splitting along the way so that the blocks in the path
satisfy the balance constraints.

Signed-off-by: Zach Brown
---
 kmod/src/btree.c | 24 ++++++++++++++++++++++++
 kmod/src/btree.h |  5 +++++
 2 files changed, 29 insertions(+)

diff --git a/kmod/src/btree.c b/kmod/src/btree.c
index 03ad5b96..f2e16924 100644
--- a/kmod/src/btree.c
+++ b/kmod/src/btree.c
@@ -1868,6 +1868,30 @@ int scoutfs_btree_set_parent(struct super_block *sb,
 			  key, 0, NULL, NULL, par_root);
 }
 
+/*
+ * Descend to the leaf, making sure that all the blocks conform to the
+ * balance constraints.  Blocks below the low threshold will be joined.
+ * This is called to split blocks that were too large for insertions,
+ * but those insertions were in a distant context and we don't bother
+ * communicating the val_len back here.  We just try to insert a max
+ * value.
+ *
+ * This always dirties all the way to the leaf.  It could be made more
+ * efficient with more btree walk flags to walk and check for blocks
+ * that need balancing, and then walks that don't dirty unless they need
+ * to join/split.
+ */
+int scoutfs_btree_rebalance(struct super_block *sb,
+			    struct scoutfs_alloc *alloc,
+			    struct scoutfs_block_writer *wri,
+			    struct scoutfs_btree_root *root,
+			    struct scoutfs_key *key)
+{
+	return btree_walk(sb, alloc, wri, root,
+			  BTW_DIRTY | BTW_INSERT | BTW_DELETE,
+			  key, SCOUTFS_BTREE_MAX_VAL_LEN, NULL, NULL, NULL);
+}
+
 struct merge_pos {
 	struct rb_node node;
 	struct scoutfs_btree_root *root;
diff --git a/kmod/src/btree.h b/kmod/src/btree.h
index 02228290..63447669 100644
--- a/kmod/src/btree.h
+++ b/kmod/src/btree.h
@@ -97,6 +97,11 @@ int scoutfs_btree_set_parent(struct super_block *sb,
 			    struct scoutfs_btree_root *root,
 			    struct scoutfs_key *key,
 			    struct scoutfs_btree_root *par_root);
+int scoutfs_btree_rebalance(struct super_block *sb,
+			    struct scoutfs_alloc *alloc,
+			    struct scoutfs_block_writer *wri,
+			    struct scoutfs_btree_root *root,
+			    struct scoutfs_key *key);
 
 /* merge input is a list of roots */
 struct scoutfs_btree_root_head {

From 082924df1a754a6f3e07cd93587dcfef3f304832 Mon Sep 17 00:00:00 2001
From: Zach Brown
Date: Fri, 18 Dec 2020 09:26:32 -0800
Subject: [PATCH 11/29] Add scoutfs_key_is_ones()

Add a quick inline for testing that a key is all ones.
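As an illustration only (not something added by this patch), the helper
pairs naturally with scoutfs_key_inc() when walking a key range:
testing for all ones tells the caller that the keyspace is exhausted
before an increment would wrap.  process_key() here is a hypothetical
stand-in for per-key work:

	struct scoutfs_key key = start;

	for (;;) {
		process_key(&key);		/* hypothetical per-key work */
		if (scoutfs_key_is_ones(&key))
			break;			/* can't advance past all ones */
		scoutfs_key_inc(&key);
	}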
Signed-off-by: Zach Brown --- kmod/src/key.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/kmod/src/key.h b/kmod/src/key.h index 5ea4dd4c..66a4c84a 100644 --- a/kmod/src/key.h +++ b/kmod/src/key.h @@ -108,6 +108,16 @@ static inline void scoutfs_key_set_ones(struct scoutfs_key *key) memset(key->__pad, 0, sizeof(key->__pad)); } +static inline bool scoutfs_key_is_ones(struct scoutfs_key *key) +{ + return key->sk_zone == U8_MAX && + key->_sk_first == cpu_to_le64(U64_MAX) && + key->sk_type == U8_MAX && + key->_sk_second == cpu_to_le64(U64_MAX) && + key->_sk_third == cpu_to_le64(U64_MAX) && + key->_sk_fourth == U8_MAX; +} + /* * Return a -1/0/1 comparison of keys. * From 298a6a8865b0e99dc77a76912eb5a063951ebca1 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Fri, 18 Dec 2020 09:31:35 -0800 Subject: [PATCH 12/29] Add server get_stable_trans_seq() Extract part of the get_last_seq handler into a call that finds the last stable client transaction seq. Log merging needs this to determine a cutoff for stable items in log btrees. Signed-off-by: Zach Brown --- kmod/src/server.c | 68 ++++++++++++++++++++++++++++++----------------- 1 file changed, 43 insertions(+), 25 deletions(-) diff --git a/kmod/src/server.c b/kmod/src/server.c index 76c45abf..d89a83ca 100644 --- a/kmod/src/server.c +++ b/kmod/src/server.c @@ -1021,6 +1021,43 @@ static int remove_trans_seq(struct super_block *sb, u64 rid) return ret; } +/* + * Give the caller the last seq before outstanding client commits. All + * seqs up to and including this are stable, new client transactions can + * only have greater seqs. + */ +static int get_stable_trans_seq(struct super_block *sb, u64 *last_seq_ret) +{ + DECLARE_SERVER_INFO(sb, server); + struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); + struct scoutfs_super_block *super = &sbi->super; + SCOUTFS_BTREE_ITEM_REF(iref); + struct scoutfs_key key; + u64 last_seq = 0; + int ret; + + down_read(&server->seq_rwsem); + + init_trans_seq_key(&key, 0, 0); + ret = scoutfs_btree_next(sb, &super->trans_seqs, &key, &iref); + if (ret == 0) { + last_seq = le64_to_cpu(iref.key->skts_trans_seq) - 1; + scoutfs_btree_put_iref(&iref); + + } else if (ret == -ENOENT) { + last_seq = scoutfs_server_seq(sb) - 1; + ret = 0; + } + + up_read(&server->seq_rwsem); + + if (ret < 0) + last_seq = 0; + + *last_seq_ret = last_seq; + return ret; +} + /* * Give the calling client the last valid trans_seq that it can return * in results from the indices of trans seqs to inodes. 
These indices @@ -1033,13 +1070,9 @@ static int server_get_last_seq(struct super_block *sb, struct scoutfs_net_connection *conn, u8 cmd, u64 id, void *arg, u16 arg_len) { - DECLARE_SERVER_INFO(sb, server); - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct scoutfs_super_block *super = &sbi->super; - SCOUTFS_BTREE_ITEM_REF(iref); u64 rid = scoutfs_net_client_rid(conn); - struct scoutfs_key key; - __le64 last_seq = 0; + u64 last_seq = 0; + __le64 leseq; int ret; if (arg_len != 0) { @@ -1047,27 +1080,12 @@ static int server_get_last_seq(struct super_block *sb, goto out; } - down_read(&server->seq_rwsem); - - init_trans_seq_key(&key, 0, 0); - ret = scoutfs_btree_next(sb, &super->trans_seqs, &key, &iref); - if (ret == 0) { - key = *iref.key; - scoutfs_btree_put_iref(&iref); - last_seq = key.skts_trans_seq; - - } else if (ret == -ENOENT) { - last_seq = cpu_to_le64(scoutfs_server_seq(sb)); - ret = 0; - } - - le64_add_cpu(&last_seq, -1ULL); - trace_scoutfs_trans_seq_last(sb, rid, le64_to_cpu(last_seq)); - - up_read(&server->seq_rwsem); + ret = get_stable_trans_seq(sb, &last_seq); out: + trace_scoutfs_trans_seq_last(sb, rid, last_seq); + leseq = cpu_to_le64(last_seq); return scoutfs_net_response(sb, conn, cmd, id, ret, - &last_seq, sizeof(last_seq)); + &leseq, sizeof(leseq)); } static int server_lock(struct super_block *sb, From 4d3ea3b59b0f7c58659976a98c7f846af5cd77a0 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Fri, 18 Dec 2020 11:37:18 -0800 Subject: [PATCH 13/29] Add format support for log btree merging Add the format specification for the upcoming btree merging. Log btrees gain a finalized field, we add the super btree root and all the items that the server will use to coordinate merging amongst clients, and we add the two client net messages which the server will implement. Signed-off-by: Zach Brown --- kmod/src/format.h | 84 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) diff --git a/kmod/src/format.h b/kmod/src/format.h index 156732d4..aa6abc88 100644 --- a/kmod/src/format.h +++ b/kmod/src/format.h @@ -452,8 +452,11 @@ struct scoutfs_log_trees { __le64 max_item_seq; __le64 rid; __le64 nr; + __le64 flags; }; +#define SCOUTFS_LOG_TREES_FINALIZED (1ULL << 0) + struct scoutfs_log_item_value { __le64 seq; __u8 flags; @@ -490,6 +493,78 @@ struct scoutfs_bloom_block { member_sizeof(struct scoutfs_bloom_block, bits[0]) * 8) #define SCOUTFS_FOREST_BLOOM_FUNC_BITS (SCOUTFS_BLOCK_LG_SHIFT + 3) +/* + * A private server btree item which records the status of a log merge + * operation that is in progress. + */ +struct scoutfs_log_merge_status { + struct scoutfs_key next_range_key; + __le64 nr_requests; + __le64 nr_complete; + __le64 last_seq; + __le64 seq; +}; + +/* + * A request is sent to the client and stored in a server btree item to + * record resources that would be reclaimed if the client failed. It + * has all the inputs needed for the client to perform its portion of a + * merge. + */ +struct scoutfs_log_merge_request { + struct scoutfs_alloc_list_head meta_avail; + struct scoutfs_alloc_list_head meta_freed; + struct scoutfs_btree_root logs_root; + struct scoutfs_btree_root root; + struct scoutfs_key start; + struct scoutfs_key end; + __le64 last_seq; + __le64 rid; + __le64 seq; + __le64 flags; +}; + +/* request root is subtree of fs root at parent, restricted merging modifications */ +#define SCOUTFS_LOG_MERGE_REQUEST_SUBTREE (1ULL << 0) + +/* + * The output of a client's merge of log btree items into a subtree + * rooted at a parent in the fs_root. 
+ * The client sends it to the server, who stores it in a btree item
+ * for later splicing/rebalancing.
+ */
+struct scoutfs_log_merge_complete {
+	struct scoutfs_alloc_list_head meta_avail;
+	struct scoutfs_alloc_list_head meta_freed;
+	struct scoutfs_btree_root root;
+	struct scoutfs_key start;
+	struct scoutfs_key end;
+	struct scoutfs_key remain;
+	__le64 rid;
+	__le64 seq;
+	__le64 flags;
+};
+
+/* merge failed, ignore completion and reclaim stored request */
+#define SCOUTFS_LOG_MERGE_COMP_ERROR	(1ULL << 0)
+/* merge didn't complete range, restart from remain */
+#define SCOUTFS_LOG_MERGE_COMP_REMAIN	(1ULL << 1)
+
+/*
+ * Range items record the ranges of the fs keyspace that still need to
+ * be merged.  They're added as a merge starts, removed as requests are
+ * sent and added back if the request didn't consume its entire range.
+ */
+struct scoutfs_log_merge_range {
+	struct scoutfs_key start;
+	struct scoutfs_key end;
+};
+
+struct scoutfs_log_merge_freeing {
+	struct scoutfs_btree_root root;
+	struct scoutfs_key key;
+	__le64 seq;
+};
+
 /*
  * Keys are first sorted by major key zones.
  */
@@ -504,6 +579,12 @@ struct scoutfs_bloom_block {
 #define SCOUTFS_SRCH_ZONE			9
 #define SCOUTFS_FREE_EXTENT_BLKNO_ZONE		10
 #define SCOUTFS_FREE_EXTENT_ORDER_ZONE		11
+/* Items only stored in log merge server btrees */
+#define SCOUTFS_LOG_MERGE_STATUS_ZONE		12
+#define SCOUTFS_LOG_MERGE_RANGE_ZONE		13
+#define SCOUTFS_LOG_MERGE_REQUEST_ZONE		14
+#define SCOUTFS_LOG_MERGE_COMPLETE_ZONE		15
+#define SCOUTFS_LOG_MERGE_FREEING_ZONE		16
 
 /* inode index zone */
 #define SCOUTFS_INODE_INDEX_META_SEQ_TYPE	1
@@ -703,6 +784,7 @@ struct scoutfs_super_block {
 	struct scoutfs_alloc_list_head server_meta_freed[2];
 	struct scoutfs_btree_root fs_root;
 	struct scoutfs_btree_root logs_root;
+	struct scoutfs_btree_root log_merge;
 	struct scoutfs_btree_root trans_seqs;
 	struct scoutfs_btree_root mounted_clients;
 	struct scoutfs_btree_root srch_root;
@@ -895,6 +977,8 @@ enum scoutfs_net_cmd {
 	SCOUTFS_NET_CMD_LOCK_RECOVER,
 	SCOUTFS_NET_CMD_SRCH_GET_COMPACT,
 	SCOUTFS_NET_CMD_SRCH_COMMIT_COMPACT,
+	SCOUTFS_NET_CMD_GET_LOG_MERGE,
+	SCOUTFS_NET_CMD_COMMIT_LOG_MERGE,
 	SCOUTFS_NET_CMD_OPEN_INO_MAP,
 	SCOUTFS_NET_CMD_GET_VOLOPT,
 	SCOUTFS_NET_CMD_SET_VOLOPT,

From 9c2122f7ded1ac9a7103110fac04ced4e6c34a83 Mon Sep 17 00:00:00 2001
From: Zach Brown
Date: Fri, 18 Dec 2020 11:40:13 -0800
Subject: [PATCH 14/29] Add server btree merge processing

This adds the server processing side of the btree merge functionality.
The client isn't yet sending the log_merge messages so no merging will
be performed.

The bulk of the work happens as the server processes a get_log_merge
message to build a merge request for the client.  It starts a log merge
if one isn't in flight.  If one is in flight it checks to see if it
should be spliced and maybe finished.  In the common case it finds the
next range to be merged and sends the request to the client to process.

The commit_log_merge handler is the completion side of that request.
If the request failed then we unwind its resources based on the stored
request item.  If it succeeds we record it in an item for get_
processing to splice eventually.

Then we modify two existing server code paths.  First, get_log_trees
doesn't just create or use a single existing log btree for a client
mount.  If the existing log btree is large enough it sets its finalized
flag and advances the nr to use a new log btree.  That makes the old
finalized log btree available for merging.
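To make that concrete, here is the shape of the new finalize path,
condensed from the get_log_trees changes in this patch; the height > 2
test is the "large enough" check:

	if (lt.item_root.height > 2 && !have_fin) {
		fin = lt;	/* read-only finalized copy of the open log btree */
		le64_add_cpu(&fin.flags, SCOUTFS_LOG_TREES_FINALIZED);
		/* ... store fin, then reset lt's item root and bloom ref ... */
		le64_add_cpu(&lt.nr, 1);	/* advance to a new log btree */
	}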
Then we need to be a bit more careful when reclaiming the open log btree for a client. We can't use next to find the only open log btree, we use prev to find the last and make sure that it isn't already finalized. Signed-off-by: Zach Brown --- kmod/src/alloc.h | 10 + kmod/src/scoutfs_trace.h | 110 ++++ kmod/src/server.c | 1141 +++++++++++++++++++++++++++++++++++++- 3 files changed, 1232 insertions(+), 29 deletions(-) diff --git a/kmod/src/alloc.h b/kmod/src/alloc.h index 1e245c5f..9130d086 100644 --- a/kmod/src/alloc.h +++ b/kmod/src/alloc.h @@ -55,6 +55,16 @@ #define SCOUTFS_SERVER_DATA_FILL_LO \ (1ULL * 1024 * 1024 * 1024 >> SCOUTFS_BLOCK_SM_SHIFT) +/* + * Log merge meta allocations are only used for one request and will + * never use more than the dirty limit. + */ +#define SCOUTFS_LOG_MERGE_DIRTY_BYTE_LIMIT (64ULL * 1024 * 1024) +/* a few extra blocks for alloc blocks */ +#define SCOUTFS_SERVER_MERGE_FILL_TARGET \ + ((SCOUTFS_LOG_MERGE_DIRTY_BYTE_LIMIT >> SCOUTFS_BLOCK_LG_SHIFT) + 4) +#define SCOUTFS_SERVER_MERGE_FILL_LO SCOUTFS_SERVER_MERGE_FILL_TARGET + /* * Each of the server meta_alloc roots will try to keep a minimum amount * of free blocks. The server will swap roots when its current avail diff --git a/kmod/src/scoutfs_trace.h b/kmod/src/scoutfs_trace.h index 2be1014a..4e58ef7a 100644 --- a/kmod/src/scoutfs_trace.h +++ b/kmod/src/scoutfs_trace.h @@ -2016,6 +2016,116 @@ TRACE_EVENT(scoutfs_trans_seq_last, SCSB_TRACE_ARGS, __entry->s_rid, __entry->trans_seq) ); +TRACE_EVENT(scoutfs_get_log_merge_status, + TP_PROTO(struct super_block *sb, u64 rid, struct scoutfs_key *next_range_key, + u64 nr_requests, u64 nr_complete, u64 last_seq, u64 seq), + + TP_ARGS(sb, rid, next_range_key, nr_requests, nr_complete, last_seq, seq), + + TP_STRUCT__entry( + SCSB_TRACE_FIELDS + __field(__u64, s_rid) + sk_trace_define(next_range_key) + __field(__u64, nr_requests) + __field(__u64, nr_complete) + __field(__u64, last_seq) + __field(__u64, seq) + ), + + TP_fast_assign( + SCSB_TRACE_ASSIGN(sb); + __entry->s_rid = rid; + sk_trace_assign(next_range_key, next_range_key); + __entry->nr_requests = nr_requests; + __entry->nr_complete = nr_complete; + __entry->last_seq = last_seq; + __entry->seq = seq; + ), + + TP_printk(SCSBF" rid %016llx next_range_key "SK_FMT" nr_requests %llu nr_complete %llu last_seq %llu seq %llu", + SCSB_TRACE_ARGS, __entry->s_rid, sk_trace_args(next_range_key), + __entry->nr_requests, __entry->nr_complete, __entry->last_seq, __entry->seq) +); + +TRACE_EVENT(scoutfs_get_log_merge_request, + TP_PROTO(struct super_block *sb, u64 rid, + struct scoutfs_btree_root *root, struct scoutfs_key *start, + struct scoutfs_key *end, u64 last_seq, u64 seq), + + TP_ARGS(sb, rid, root, start, end, last_seq, seq), + + TP_STRUCT__entry( + SCSB_TRACE_FIELDS + __field(__u64, s_rid) + __field(__u64, root_blkno) + __field(__u64, root_seq) + __field(__u8, root_height) + sk_trace_define(start) + sk_trace_define(end) + __field(__u64, last_seq) + __field(__u64, seq) + ), + + TP_fast_assign( + SCSB_TRACE_ASSIGN(sb); + __entry->s_rid = rid; + __entry->root_blkno = le64_to_cpu(root->ref.blkno); + __entry->root_seq = le64_to_cpu(root->ref.seq); + __entry->root_height = root->height; + sk_trace_assign(start, start); + sk_trace_assign(end, end); + __entry->last_seq = last_seq; + __entry->seq = seq; + ), + + TP_printk(SCSBF" rid %016llx root blkno %llu seq %llu height %u start "SK_FMT" end "SK_FMT" last_seq %llu seq %llu", + SCSB_TRACE_ARGS, __entry->s_rid, __entry->root_blkno, + __entry->root_seq, __entry->root_height, 
+		  sk_trace_args(start), sk_trace_args(end), __entry->last_seq,
+		  __entry->seq)
+);
+
+TRACE_EVENT(scoutfs_get_log_merge_complete,
+	TP_PROTO(struct super_block *sb, u64 rid,
+		 struct scoutfs_btree_root *root, struct scoutfs_key *start,
+		 struct scoutfs_key *end, struct scoutfs_key *remain,
+		 u64 seq, u64 flags),
+
+	TP_ARGS(sb, rid, root, start, end, remain, seq, flags),
+
+	TP_STRUCT__entry(
+		SCSB_TRACE_FIELDS
+		__field(__u64, s_rid)
+		__field(__u64, root_blkno)
+		__field(__u64, root_seq)
+		__field(__u8, root_height)
+		sk_trace_define(start)
+		sk_trace_define(end)
+		sk_trace_define(remain)
+		__field(__u64, seq)
+		__field(__u64, flags)
+	),
+
+	TP_fast_assign(
+		SCSB_TRACE_ASSIGN(sb);
+		__entry->s_rid = rid;
+		__entry->root_blkno = le64_to_cpu(root->ref.blkno);
+		__entry->root_seq = le64_to_cpu(root->ref.seq);
+		__entry->root_height = root->height;
+		sk_trace_assign(start, start);
+		sk_trace_assign(end, end);
+		sk_trace_assign(remain, remain);
+		__entry->seq = seq;
+		__entry->flags = flags;
+	),
+
+	TP_printk(SCSBF" rid %016llx root blkno %llu seq %llu height %u start "SK_FMT" end "SK_FMT" remain "SK_FMT" seq %llu flags 0x%llx",
+		  SCSB_TRACE_ARGS, __entry->s_rid, __entry->root_blkno,
+		  __entry->root_seq, __entry->root_height,
+		  sk_trace_args(start), sk_trace_args(end),
+		  sk_trace_args(remain), __entry->seq, __entry->flags)
+);
+
 DECLARE_EVENT_CLASS(scoutfs_forest_bloom_class,
 	TP_PROTO(struct super_block *sb, struct scoutfs_key *key,
 		 u64 rid, u64 nr, u64 blkno, u64 seq, unsigned int count),
diff --git a/kmod/src/server.c b/kmod/src/server.c
index d89a83ca..15d3f6ed 100644
--- a/kmod/src/server.c
+++ b/kmod/src/server.c
@@ -96,6 +96,8 @@ struct server_info {
 	struct scoutfs_block_writer wri;
 
 	struct mutex logs_mutex;
+	struct work_struct log_merge_free_work;
+
 	struct mutex srch_mutex;
 	struct mutex mounted_clients_mutex;
@@ -604,6 +606,35 @@ static void set_extent_zone_bits(struct super_block *sb, void *cb_arg, struct sc
 	mod_extent_bits(cba->zones, cba->zone_blocks, ext->start, ext->len, true);
 }
 
+static int find_log_trees_item(struct super_block *sb,
+			       struct scoutfs_btree_root *logs_root,
+			       bool call_next, u64 rid, u64 nr,
+			       struct scoutfs_log_trees *lt_ret)
+{
+	SCOUTFS_BTREE_ITEM_REF(iref);
+	struct scoutfs_key key;
+	int ret;
+
+	scoutfs_key_init_log_trees(&key, rid, nr);
+	if (call_next)
+		ret = scoutfs_btree_next(sb, logs_root, &key, &iref);
+	else
+		ret = scoutfs_btree_prev(sb, logs_root, &key, &iref);
+	if (ret == 0) {
+		if (iref.val_len == sizeof(struct scoutfs_log_trees)) {
+			if (le64_to_cpu(iref.key->sklt_rid) != rid)
+				ret = -ENOENT;
+			else
+				memcpy(lt_ret, iref.val, iref.val_len);
+		} else {
+			ret = -EIO;
+		}
+		scoutfs_btree_put_iref(&iref);
+	}
+
+	return ret;
+}
+
 /*
  * Give the client roots to all the trees that they'll use to build
  * their transaction.
@@ -613,6 +644,9 @@ static void set_extent_zone_bits(struct super_block *sb, void *cb_arg, struct sc
  * trees back into the core allocators.  They were committed with the
  * previous transaction so they're stable and can now be reused, even by
  * the server in this commit.
+ *
+ * If the committed log trees are large enough we finalize them and make
+ * them available to log merging.
 */
static int server_get_log_trees(struct super_block *sb,
				struct scoutfs_net_connection *conn,
@@ -624,10 +658,12 @@ static int server_get_log_trees(struct super_block *sb,
 	__le64 exclusive[SCOUTFS_DATA_ALLOC_ZONE_LE64S];
 	__le64 vacant[SCOUTFS_DATA_ALLOC_ZONE_LE64S];
 	struct alloc_extent_cb_args cba;
-	SCOUTFS_BTREE_ITEM_REF(iref);
+	struct scoutfs_log_trees fin;
 	struct scoutfs_log_trees lt;
 	struct scoutfs_key key;
+	bool have_fin = false;
 	u64 data_zone_blocks;
+	u64 nr;
 	int ret;
 
 	if (arg_len != 0) {
@@ -639,32 +675,55 @@ static int server_get_log_trees(struct super_block *sb,
 
 	mutex_lock(&server->logs_mutex);
 
-	scoutfs_key_init_log_trees(&key, rid, U64_MAX);
-
-	ret = scoutfs_btree_prev(sb, &super->logs_root, &key, &iref);
+	/* see if we already have a finalized root from the rid */
+	ret = find_log_trees_item(sb, &super->logs_root, true, rid, 0, &lt);
 	if (ret < 0 && ret != -ENOENT)
 		goto unlock;
-	if (ret == 0) {
-		if (iref.val_len == sizeof(struct scoutfs_log_trees)) {
-			key = *iref.key;
-			memcpy(&lt, iref.val, iref.val_len);
-			if (le64_to_cpu(key.sklt_rid) != rid)
-				ret = -ENOENT;
-		} else {
-			ret = -EIO;
-		}
-		scoutfs_btree_put_iref(&iref);
-		if (ret == -EIO)
-			goto unlock;
+	if (ret == 0 && le64_to_cpu(lt.flags) & SCOUTFS_LOG_TREES_FINALIZED)
+		have_fin = true;
+
+	/* use the last non-finalized root, or start a new one */
+	ret = find_log_trees_item(sb, &super->logs_root, false, rid, U64_MAX,
+				  &lt);
+	if (ret < 0 && ret != -ENOENT)
+		goto unlock;
+	if (ret == 0 && le64_to_cpu(lt.flags) & SCOUTFS_LOG_TREES_FINALIZED) {
+		ret = -ENOENT;
+		nr = le64_to_cpu(lt.nr) + 1;
+	} else if (ret == -ENOENT) {
+		nr = 1;
 	}
 
-	/* initialize new roots if we don't have any */
+	/* initialize a new root if we don't have a non-finalized one */
 	if (ret == -ENOENT) {
-		key.sklt_rid = cpu_to_le64(rid);
-		key.sklt_nr = cpu_to_le64(1);
 		memset(&lt, 0, sizeof(lt));
-		lt.rid = key.sklt_rid;
-		lt.nr = key.sklt_nr;
+		lt.rid = cpu_to_le64(rid);
+		lt.nr = cpu_to_le64(nr);
+	}
+
+	/* finalize an existing root when it's large enough and we don't have one */
+	if (lt.item_root.height > 2 && !have_fin) {
+		fin = lt;
+		memset(&fin.meta_avail, 0, sizeof(fin.meta_avail));
+		memset(&fin.meta_freed, 0, sizeof(fin.meta_freed));
+		memset(&fin.data_avail, 0, sizeof(fin.data_avail));
+		memset(&fin.data_freed, 0, sizeof(fin.data_freed));
+		memset(&fin.srch_file, 0, sizeof(fin.srch_file));
+		le64_add_cpu(&fin.flags, SCOUTFS_LOG_TREES_FINALIZED);
+
+		scoutfs_key_init_log_trees(&key, le64_to_cpu(fin.rid),
+					   le64_to_cpu(fin.nr));
+		ret = scoutfs_btree_update(sb, &server->alloc, &server->wri,
+					   &super->logs_root, &key, &fin,
+					   sizeof(fin));
+		if (ret < 0)
+			goto unlock;
+
+		memset(&lt.item_root, 0, sizeof(lt.item_root));
+		memset(&lt.bloom_ref, 0, sizeof(lt.bloom_ref));
+		lt.max_item_seq = 0;
+		le64_add_cpu(&lt.nr, 1);
+		lt.flags = 0;
 	}
 
 	if (get_volopt_val(server, SCOUTFS_VOLOPT_DATA_ALLOC_ZONE_BLOCKS_NR, &data_zone_blocks)) {
@@ -708,6 +767,8 @@ static int server_get_log_trees(struct super_block *sb,
 	}
 
 	/* update client's log tree's item */
+	scoutfs_key_init_log_trees(&key, le64_to_cpu(lt.rid),
+				   le64_to_cpu(lt.nr));
 	ret = scoutfs_btree_force(sb, &server->alloc, &server->wri,
 				  &super->logs_root, &key, &lt, sizeof(lt));
 unlock:
@@ -811,8 +872,9 @@ static int server_get_roots(struct super_block *sb,
 
 /*
  * A client is being evicted so we want to reclaim resources from their
- * log tree items.  The item trees and bloom refs stay around to be read
- * and eventually merged and we reclaim all the allocator items.
+ * open log tree item.
+ * The item tree and bloom ref stay around to be read and we finalize
+ * the tree so that it will be merged.  We reclaim all the allocator
+ * items.
  *
  * The caller holds the commit rwsem which means we do all this work in
  * one server commit.  We'll need to keep the total amount of blocks in
@@ -826,7 +888,7 @@
  * We can return an error without fully reclaiming all the log item's
  * referenced data.
  */
-static int reclaim_log_trees(struct super_block *sb, u64 rid)
+static int reclaim_open_log_tree(struct super_block *sb, u64 rid)
 {
 	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
 	DECLARE_SERVER_INFO(sb, server);
@@ -838,14 +900,16 @@ static int reclaim_log_trees(struct super_block *sb, u64 rid)
 
 	mutex_lock(&server->logs_mutex);
 
-	/* find the client's existing item */
-	scoutfs_key_init_log_trees(&key, rid, 0);
-	ret = scoutfs_btree_next(sb, &super->logs_root, &key, &iref);
+	/* find the client's last open log_tree */
+	scoutfs_key_init_log_trees(&key, rid, U64_MAX);
+	ret = scoutfs_btree_prev(sb, &super->logs_root, &key, &iref);
 	if (ret == 0) {
 		if (iref.val_len == sizeof(struct scoutfs_log_trees)) {
 			key = *iref.key;
 			memcpy(&lt, iref.val, iref.val_len);
-			if (le64_to_cpu(key.sklt_rid) != rid)
+			if ((le64_to_cpu(key.sklt_rid) != rid) ||
+			    (le64_to_cpu(lt.flags) &
+			     SCOUTFS_LOG_TREES_FINALIZED))
 				ret = -ENOENT;
 		} else {
 			ret = -EIO;
 		}
@@ -876,6 +940,7 @@
 
 	/* the mount is no longer writing to the zones */
 	zero_data_alloc_zone_bits(&lt);
+	le64_add_cpu(&lt.flags, SCOUTFS_LOG_TREES_FINALIZED);
 
 	err = scoutfs_btree_update(sb, &server->alloc, &server->wri,
 				   &super->logs_root, &key, &lt, sizeof(lt));
@@ -1275,6 +1340,910 @@ out:
 	return scoutfs_net_response(sb, conn, cmd, id, ret, NULL, 0);
 }
 
+/*
+ * Log merge range items are stored at the starting fs key of the range.
+ * The only fs key field that doesn't hold information is the zone, so
+ * we use the zone to differentiate all types that we store in the log
+ * merge tree.
+ */
+static void init_log_merge_key(struct scoutfs_key *key, u8 zone, u64 first,
+			       u64 second)
+{
+	*key = (struct scoutfs_key) {
+		.sk_zone = zone,
+		._sk_first = cpu_to_le64(first),
+		._sk_second = cpu_to_le64(second),
+	};
+}
+
+static int next_log_merge_item_key(struct super_block *sb, struct scoutfs_btree_root *root,
+				   u8 zone, struct scoutfs_key *key, void *val, size_t val_len)
+{
+	SCOUTFS_BTREE_ITEM_REF(iref);
+	int ret;
+
+	ret = scoutfs_btree_next(sb, root, key, &iref);
+	if (ret == 0) {
+		if (iref.key->sk_zone != zone)
+			ret = -ENOENT;
+		else if (iref.val_len != val_len)
+			ret = -EIO;
+		else
+			memcpy(val, iref.val, val_len);
+		scoutfs_btree_put_iref(&iref);
+	}
+
+	return ret;
+}
+
+static int next_log_merge_item(struct super_block *sb,
+			       struct scoutfs_btree_root *root,
+			       u8 zone, u64 first, u64 second,
+			       void *val, size_t val_len)
+{
+	struct scoutfs_key key;
+
+	init_log_merge_key(&key, zone, first, second);
+	return next_log_merge_item_key(sb, root, zone, &key, val, val_len);
+}
+
+/*
+ * We start a log merge operation if there are any finalized log trees
+ * whose greatest seq is within the last stable seq.  This is called by
+ * every client's get_log_merge handler at a relatively low frequency
+ * until a merge starts.
+ */ +static int start_log_merge(struct super_block *sb, + struct scoutfs_super_block *super, + struct scoutfs_log_merge_status *stat_ret) +{ + struct server_info *server = SCOUTFS_SB(sb)->server_info; + struct scoutfs_log_merge_status stat; + struct scoutfs_log_merge_range rng; + SCOUTFS_BTREE_ITEM_REF(iref); + struct scoutfs_log_trees *lt; + struct scoutfs_key key; + u64 last_seq; + bool start; + int ret; + int err; + + scoutfs_key_init_log_trees(&key, 0, 0); + + ret = get_stable_trans_seq(sb, &last_seq); + if (ret < 0) + goto out; + + scoutfs_key_init_log_trees(&key, 0, 0); + for (start = false; !start; scoutfs_key_inc(&key)) { + ret = scoutfs_btree_next(sb, &super->logs_root, &key, &iref); + if (ret == 0) { + if (iref.val_len == sizeof(*lt)) { + key = *iref.key; + lt = iref.val; + if ((le64_to_cpu(lt->flags) & + SCOUTFS_LOG_TREES_FINALIZED) && + (le64_to_cpu(lt->max_item_seq) <= + last_seq)) { + start = true; + } + } else { + ret = -EIO; + } + scoutfs_btree_put_iref(&iref); + } + if (ret < 0) + goto out; + } + + if (!start) { + ret = -ENOENT; + goto out; + } + + /* add an initial full-range */ + scoutfs_key_set_zeros(&rng.start); + scoutfs_key_set_ones(&rng.end); + key = rng.start; + key.sk_zone = SCOUTFS_LOG_MERGE_RANGE_ZONE; + ret = scoutfs_btree_insert(sb, &server->alloc, &server->wri, + &super->log_merge, &key, &rng, sizeof(rng)); + if (ret < 0) + goto out; + + /* and add the merge status item */ + scoutfs_key_set_zeros(&stat.next_range_key); + stat.nr_requests = 0; + stat.nr_complete = 0; + stat.last_seq = cpu_to_le64(last_seq); + stat.seq = cpu_to_le64(scoutfs_server_next_seq(sb)); + + init_log_merge_key(&key, SCOUTFS_LOG_MERGE_STATUS_ZONE, 0, 0); + ret = scoutfs_btree_insert(sb, &server->alloc, &server->wri, + &super->log_merge, &key, + &stat, sizeof(stat)); + if (ret < 0) { + key = rng.start; + key.sk_zone = SCOUTFS_LOG_MERGE_RANGE_ZONE; + err = scoutfs_btree_delete(sb, &server->alloc, &server->wri, + &super->log_merge, &key); + BUG_ON(err); /* inconsistent */ + } + + /* queue free to see if there's lingering items to process */ + if (ret == 0) + queue_work(server->wq, &server->log_merge_free_work); +out: + if (ret == 0) + *stat_ret = stat; + return ret; +} + +/* Requests drain once we get this many completions to splice */ +#define LOG_MERGE_SPLICE_BATCH 8 + +/* + * Splice the completed subtrees from the clients back into the fs log + * tree as parents. Once they're spliced in, try and rebalance a path + * through them in case they need to be split or joined before the rest + * of their range can be processed. + * + * It's only safe to splice in merged parents when all the requests have + * drained and no requests are relying on stable key ranges of parents + * in the fs root. + * + * It doesn't matter that the fs tree produced by these subtree splices + * itself contains inconsistent items because the subtrees can contain + * fragments of transactions. The read-only finalized log btrees that + * are the source of the spliced items are still preferred by readers. + * It's only once all the finalized items have been merged, and all + * transactions are consistent, that we remove the finalized log trees + * and the fs tree items are used. + * + * As we splice in the subtrees we're implicitly allocating all the + * blocks referenced by the new subtree, and freeing all the blocks + * referenced by the old subtree that's overwritten. 
+ * These allocs and frees were performed by the client as it did cow
+ * updates and were stored in the allocators that were sent with the
+ * completion.  We merge in those allocators as we splice in the
+ * subtree.
+ *
+ * We can add back any remaining ranges for any partial completions and
+ * reset the next range key if there's still work to do.  If the
+ * operation is complete then we tear down the input log_trees items and
+ * delete the status.
+ */
+static int splice_log_merge_completions(struct super_block *sb,
+					struct scoutfs_log_merge_status *stat,
+					bool no_ranges)
+{
+	struct server_info *server = SCOUTFS_SB(sb)->server_info;
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
+	struct scoutfs_super_block *super = &sbi->super;
+	struct scoutfs_log_merge_complete comp;
+	struct scoutfs_log_merge_freeing fr;
+	struct scoutfs_log_merge_range rng;
+	struct scoutfs_log_trees lt = {{{0,}}};
+	SCOUTFS_BTREE_ITEM_REF(iref);
+	struct scoutfs_key key;
+	u64 seq;
+	int ret;
+
+	/* mustn't rebalance fs tree parents while reqs rely on their key bounds */
+	if (WARN_ON_ONCE(le64_to_cpu(stat->nr_requests) > 0))
+		return -EIO;
+
+	/*
+	 * Splice in all the completed subtrees at the initial parent
+	 * blocks in the main fs_tree before rebalancing any of them.
+	 */
+	for (seq = 0; ; seq++) {
+
+		ret = next_log_merge_item(sb, &super->log_merge,
+					  SCOUTFS_LOG_MERGE_COMPLETE_ZONE, seq,
+					  0, &comp, sizeof(comp));
+		if (ret < 0) {
+			if (ret == -ENOENT) {
+				ret = 0;
+				break;
+			}
+			goto out;
+		}
+
+		seq = le64_to_cpu(comp.seq);
+
+		ret = scoutfs_btree_set_parent(sb, &server->alloc, &server->wri,
+					       &super->fs_root, &comp.start,
+					       &comp.root);
+		if (ret < 0)
+			goto out;
+
+		mutex_lock(&server->alloc_mutex);
+		ret = scoutfs_alloc_splice_list(sb, &server->alloc,
+						&server->wri,
+						server->other_freed,
+						&comp.meta_avail) ?:
+		      scoutfs_alloc_splice_list(sb, &server->alloc,
+						&server->wri,
+						server->other_freed,
+						&comp.meta_freed);
+		mutex_unlock(&server->alloc_mutex);
+		if (ret < 0)
+			goto out;
+
+		/* clear allocators */
+		memset(&comp.meta_avail, 0, sizeof(comp.meta_avail));
+		memset(&comp.meta_freed, 0, sizeof(comp.meta_freed));
+
+		init_log_merge_key(&key, SCOUTFS_LOG_MERGE_COMPLETE_ZONE,
+				   seq, 0);
+		ret = scoutfs_btree_update(sb, &server->alloc, &server->wri,
+					   &super->log_merge, &key,
+					   &comp, sizeof(comp));
+		if (ret < 0)
+			goto out;
+	}
+
+	/*
+	 * Now with all the parent blocks spliced in, rebalance items
+	 * amongst parents that needed to split/join and delete the
+	 * completion items, possibly returning ranges to process.
+	 */
+	for (seq = 0; ; seq++) {
+		ret = next_log_merge_item(sb, &super->log_merge,
+					  SCOUTFS_LOG_MERGE_COMPLETE_ZONE, seq,
+					  0, &comp, sizeof(comp));
+		if (ret < 0) {
+			if (ret == -ENOENT) {
+				ret = 0;
+				break;
+			}
+			goto out;
+		}
+
+		seq = le64_to_cpu(comp.seq);
+
+		/* balance when there was a remaining key range */
+		if (le64_to_cpu(comp.flags) & SCOUTFS_LOG_MERGE_COMP_REMAIN) {
+			ret = scoutfs_btree_rebalance(sb, &server->alloc,
+						      &server->wri,
+						      &super->fs_root,
+						      &comp.start);
+			if (ret < 0)
+				goto out;
+
+			rng.start = comp.remain;
+			rng.end = comp.end;
+
+			key = rng.start;
+			key.sk_zone = SCOUTFS_LOG_MERGE_RANGE_ZONE;
+			ret = scoutfs_btree_insert(sb, &server->alloc,
+						   &server->wri,
+						   &super->log_merge, &key,
+						   &rng, sizeof(rng));
+			if (ret < 0)
+				goto out;
+			no_ranges = false;
+		}
+
+		/* delete the completion item */
+		init_log_merge_key(&key, SCOUTFS_LOG_MERGE_COMPLETE_ZONE,
+				   seq, 0);
+		ret = scoutfs_btree_delete(sb, &server->alloc, &server->wri,
+					   &super->log_merge,
+					   &key);
+		if (ret < 0)
+			goto out;
+	}
+
+	/* update the status once all completes are processed */
+	scoutfs_key_set_zeros(&stat->next_range_key);
+	stat->nr_complete = 0;
+
+	/* update counts and done if there are still ranges to process */
+	if (!no_ranges) {
+		init_log_merge_key(&key, SCOUTFS_LOG_MERGE_STATUS_ZONE, 0, 0);
+		ret = scoutfs_btree_update(sb, &server->alloc, &server->wri,
+					   &super->log_merge, &key,
+					   stat, sizeof(*stat));
+		goto out;
+	}
+
+	/* no more ranges, free blooms and add freeing items for free work */
+	lt.rid = 0;
+	lt.nr = 0;
+	for (;;) {
+		scoutfs_key_init_log_trees(&key, le64_to_cpu(lt.rid),
+					   le64_to_cpu(lt.nr) + 1);
+		ret = scoutfs_btree_next(sb, &super->logs_root, &key, &iref);
+		if (ret == 0) {
+			if (iref.val_len == sizeof(lt)) {
+				key = *iref.key;
+				memcpy(&lt, iref.val, sizeof(lt));
+			} else {
+				ret = -EIO;
+			}
+			scoutfs_btree_put_iref(&iref);
+		}
+		if (ret < 0) {
+			if (ret == -ENOENT) {
+				ret = 0;
+				break;
+			}
+			goto out;
+		}
+
+		/* only free the inputs to the log merge that just finished */
+		if (!(le64_to_cpu(lt.flags) & SCOUTFS_LOG_TREES_FINALIZED) ||
+		    (le64_to_cpu(lt.max_item_seq) >
+		     le64_to_cpu(stat->last_seq)))
+			continue;
+
+		fr.root = lt.item_root;
+		scoutfs_key_set_zeros(&fr.key);
+		fr.seq = cpu_to_le64(scoutfs_server_next_seq(sb));
+		init_log_merge_key(&key, SCOUTFS_LOG_MERGE_FREEING_ZONE,
+				   le64_to_cpu(fr.seq), 0);
+		ret = scoutfs_btree_insert(sb, &server->alloc, &server->wri,
+					   &super->log_merge, &key,
+					   &fr, sizeof(fr));
+		if (ret < 0)
+			goto out;
+
+		if (lt.bloom_ref.blkno) {
+			ret = scoutfs_free_meta(sb, &server->alloc,
+						&server->wri,
+						le64_to_cpu(lt.bloom_ref.blkno));
+			if (ret < 0)
+				goto out;
+		}
+
+		scoutfs_key_init_log_trees(&key, le64_to_cpu(lt.rid),
+					   le64_to_cpu(lt.nr));
+		ret = scoutfs_btree_delete(sb, &server->alloc, &server->wri,
+					   &super->logs_root, &key);
+		if (ret < 0)
+			goto out;
+	}
+
+	init_log_merge_key(&key, SCOUTFS_LOG_MERGE_STATUS_ZONE, 0, 0);
+	ret = scoutfs_btree_delete(sb, &server->alloc, &server->wri,
+				   &super->log_merge, &key);
+	if (ret == 0)
+		queue_work(server->wq, &server->log_merge_free_work);
+out:
+	BUG_ON(ret); /* inconsistent */
+
+	return ret;
+}
+
+/*
+ * Search amongst the finalized log roots within the caller's merge seq looking
+ * for the earliest item within the caller's range.  The caller has taken
+ * care of locking.
+ */
+static int next_least_log_item(struct super_block *sb,
+			       struct scoutfs_btree_root *logs_root,
+			       u64 seq, struct scoutfs_key *start,
+			       struct scoutfs_key *end,
+			       struct scoutfs_key *next_ret)
+{
+	struct scoutfs_btree_root item_root;
+	struct scoutfs_log_trees *lt;
+	SCOUTFS_BTREE_ITEM_REF(iref);
+	struct scoutfs_key key;
+	int ret;
+
+	scoutfs_key_set_ones(next_ret);
+
+	for (scoutfs_key_init_log_trees(&key, 0, 0); ; scoutfs_key_inc(&key)) {
+
+		/* find the next finalized log root within the merge last_seq */
+		ret = scoutfs_btree_next(sb, logs_root, &key, &iref);
+		if (ret == 0) {
+			if (iref.val_len == sizeof(*lt)) {
+				key = *iref.key;
+				lt = iref.val;
+				if ((le64_to_cpu(lt->flags) &
+				     SCOUTFS_LOG_TREES_FINALIZED) &&
+				    (le64_to_cpu(lt->max_item_seq) <= seq))
+					item_root = lt->item_root;
+				else
+					item_root.ref.blkno = 0;
+			} else {
+				ret = -EIO;
+			}
+			scoutfs_btree_put_iref(&iref);
+		}
+		if (ret < 0) {
+			if (ret == -ENOENT)
+				ret = 0;
+			goto out;
+		}
+		if (item_root.ref.blkno == 0)
+			continue;
+
+		/* see if populated roots have item keys less than next */
+		ret = scoutfs_btree_next(sb, &item_root, start, &iref);
+		if (ret == 0) {
+			if (scoutfs_key_compare(iref.key, end) <= 0 &&
+			    scoutfs_key_compare(iref.key, next_ret) < 0)
+				*next_ret = *iref.key;
+			scoutfs_btree_put_iref(&iref);
+		}
+		if (ret < 0) {
+			if (ret == -ENOENT)
+				ret = 0;
+			else
+				goto out;
+		}
+	}
+
+out:
+	if (ret == 0 && scoutfs_key_is_ones(next_ret))
+		ret = -ENOENT;
+
+	return ret;
+}
+
+/*
+ * Once a merge is fully completed all of the finalized input log btrees
+ * are redundant and can be freed.
+ *
+ * As merging finishes and the status item is deleted, we also move all
+ * the finalized roots from log_trees items over into freeing items.
+ * This work is then kicked off which iterates over all the freeing
+ * items calling into the btree to free all its referenced blocks, with
+ * the key tracking partial progress.
+ *
+ * The freeing work is reasonably light.  We only read the btree blocks
+ * and add freed blocks to merge back into the core allocators.  The
+ * server can handle this load and we avoid the io overhead and
+ * complexity of farming it out to clients.
+ */ +static void server_log_merge_free_work(struct work_struct *work) +{ + struct server_info *server = container_of(work, struct server_info, + log_merge_free_work); + struct super_block *sb = server->sb; + struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super; + struct scoutfs_log_merge_freeing fr; + struct scoutfs_key key; + bool commit = false; + int ret = 0; + + /* shutdown waits for us, we'll eventually load set shutting_down */ + while (!server->shutting_down) { + scoutfs_server_hold_commit(sb); + mutex_lock(&server->logs_mutex); + commit = true; + + ret = next_log_merge_item(sb, &super->log_merge, + SCOUTFS_LOG_MERGE_FREEING_ZONE, + 0, 0, &fr, sizeof(fr)); + if (ret < 0) { + if (ret == -ENOENT) + ret = 0; + break; + } + + ret = scoutfs_btree_free_blocks(sb, &server->alloc, + &server->wri, &fr.key, + &fr.root, 10); + if (ret < 0) + break; + + /* freed blocks are in allocator, we *have* to update key */ + init_log_merge_key(&key, SCOUTFS_LOG_MERGE_FREEING_ZONE, + le64_to_cpu(fr.seq), 0); + if (scoutfs_key_is_ones(&fr.key)) + ret = scoutfs_btree_delete(sb, &server->alloc, + &server->wri, + &super->log_merge, &key); + else + ret = scoutfs_btree_update(sb, &server->alloc, + &server->wri, + &super->log_merge, &key, + &fr, sizeof(fr)); + /* freed blocks are in allocator, we *have* to update fr */ + BUG_ON(ret < 0); + + mutex_unlock(&server->logs_mutex); + ret = scoutfs_server_apply_commit(sb, ret); + commit = false; + if (ret < 0) + break; + } + + if (commit) { + mutex_unlock(&server->logs_mutex); + ret = scoutfs_server_apply_commit(sb, ret); + } + + if (ret < 0) { + scoutfs_err(sb, "server error freeing merged btree blocks: %d", + ret); + stop_server(server); + } + + /* not re-arming, regularly queued by the server during merging */ +} + +/* + * This will return ENOENT to the client if there is no work to do. 
+ */ +static int server_get_log_merge(struct super_block *sb, + struct scoutfs_net_connection *conn, + u8 cmd, u64 id, void *arg, u16 arg_len) +{ + DECLARE_SERVER_INFO(sb, server); + u64 rid = scoutfs_net_client_rid(conn); + struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); + struct scoutfs_super_block *super = &sbi->super; + struct scoutfs_log_merge_status stat; + struct scoutfs_log_merge_range rng; + struct scoutfs_log_merge_range remain; + struct scoutfs_log_merge_request req; + struct scoutfs_key par_start; + struct scoutfs_key par_end; + struct scoutfs_key next_key; + struct scoutfs_key key; + bool ins_rng; + bool del_remain; + bool del_req; + bool upd_stat; + bool no_ranges; + bool no_next; + int ret; + int err; + + if (arg_len != 0) + return -EINVAL; + + scoutfs_server_hold_commit(sb); + mutex_lock(&server->logs_mutex); + +restart: + memset(&req, 0, sizeof(req)); + ins_rng = false; + del_remain = false; + del_req = false; + upd_stat = false; + + /* get the status item, maybe creating a new one */ + ret = next_log_merge_item(sb, &super->log_merge, + SCOUTFS_LOG_MERGE_STATUS_ZONE, 0, 0, + &stat, sizeof(stat)); + if (ret == -ENOENT) + ret = start_log_merge(sb, super, &stat); + if (ret < 0) + goto out; + + trace_scoutfs_get_log_merge_status(sb, rid, &stat.next_range_key, + le64_to_cpu(stat.nr_requests), + le64_to_cpu(stat.nr_complete), + le64_to_cpu(stat.last_seq), + le64_to_cpu(stat.seq)); + + /* find the next range, always checking for splicing */ + for (;;) { + key = stat.next_range_key; + key.sk_zone = SCOUTFS_LOG_MERGE_RANGE_ZONE; + ret = next_log_merge_item_key(sb, &super->log_merge, SCOUTFS_LOG_MERGE_RANGE_ZONE, + &key, &rng, sizeof(rng)); + if (ret < 0 && ret != -ENOENT) + goto out; + + /* maybe splice now that we know if there's ranges */ + no_next = ret == -ENOENT; + no_ranges = scoutfs_key_is_zeros(&stat.next_range_key) && ret == -ENOENT; + if (le64_to_cpu(stat.nr_requests) == 0 && + (no_next || le64_to_cpu(stat.nr_complete) >= LOG_MERGE_SPLICE_BATCH)) { + ret = splice_log_merge_completions(sb, &stat, no_ranges); + if (ret < 0) + goto out; + /* splicing resets key and adds ranges, could finish status */ + goto restart; + } + + /* no ranges from next for requests, future attempts will create or splice */ + if (no_next) { + ret = -ENOENT; + goto out; + } + + /* see if we should back off after splicing might have deleted completions */ + if ((le64_to_cpu(stat.nr_requests) + + le64_to_cpu(stat.nr_complete)) >= LOG_MERGE_SPLICE_BATCH) { + ret = -ENOENT; + goto out; + } + + /* find the next logged item in the next range */ + ret = next_least_log_item(sb, &super->logs_root, + le64_to_cpu(stat.last_seq), + &rng.start, &rng.end, &next_key); + if (ret == 0) + break; + /* drop the range if it contained no logged items */ + if (ret == -ENOENT) { + key = rng.start; + key.sk_zone = SCOUTFS_LOG_MERGE_RANGE_ZONE; + ret = scoutfs_btree_delete(sb, &server->alloc, + &server->wri, + &super->log_merge, &key); + } + if (ret < 0) + goto out; + } + + /* start to build the request that's saved and sent to the client */ + req.logs_root = super->logs_root; + req.last_seq = stat.last_seq; + req.rid = cpu_to_le64(rid); + req.seq = cpu_to_le64(scoutfs_server_next_seq(sb)); + req.flags = 0; + if (super->fs_root.height > 2) + req.flags |= cpu_to_le64(SCOUTFS_LOG_MERGE_REQUEST_SUBTREE); + + /* find the fs_root parent block and its key range */ + ret = scoutfs_btree_get_parent(sb, &super->fs_root, &next_key, + &req.root) ?: + scoutfs_btree_parent_range(sb, &super->fs_root, &next_key, + &par_start, &par_end); + if 
(ret < 0)
+		goto out;
+
+	/* start from next item, don't exceed parent key range */
+	req.start = next_key;
+	req.end = rng.end;
+	if (scoutfs_key_compare(&par_end, &req.end) < 0)
+		req.end = par_end;
+
+	/* delete the old range */
+	key = rng.start;
+	key.sk_zone = SCOUTFS_LOG_MERGE_RANGE_ZONE;
+	ret = scoutfs_btree_delete(sb, &server->alloc, &server->wri,
+				   &super->log_merge, &key);
+	if (ret < 0)
+		goto out;
+	ins_rng = true;
+
+	/* add remaining range if we have to */
+	if (scoutfs_key_compare(&rng.end, &req.end) > 0) {
+		remain.start = req.end;
+		scoutfs_key_inc(&remain.start);
+		remain.end = rng.end;
+
+		key = remain.start;
+		key.sk_zone = SCOUTFS_LOG_MERGE_RANGE_ZONE;
+		ret = scoutfs_btree_insert(sb, &server->alloc, &server->wri,
+					   &super->log_merge, &key,
+					   &remain, sizeof(remain));
+		if (ret < 0)
+			goto out;
+		del_remain = true;
+	}
+
+	/* give the client an allocation pool to work with */
+	mutex_lock(&server->alloc_mutex);
+	ret = scoutfs_alloc_fill_list(sb, &server->alloc, &server->wri,
+				      &req.meta_avail, server->meta_avail,
+				      SCOUTFS_SERVER_MERGE_FILL_LO,
+				      SCOUTFS_SERVER_MERGE_FILL_TARGET);
+	mutex_unlock(&server->alloc_mutex);
+	if (ret < 0)
+		goto out;
+
+	/* save the request that will be sent to the client */
+	init_log_merge_key(&key, SCOUTFS_LOG_MERGE_REQUEST_ZONE, rid,
+			   le64_to_cpu(req.seq));
+	ret = scoutfs_btree_insert(sb, &server->alloc, &server->wri,
+				   &super->log_merge, &key,
+				   &req, sizeof(req));
+	if (ret < 0)
+		goto out;
+	del_req = true;
+
+	trace_scoutfs_get_log_merge_request(sb, rid, &req.root,
+					    &req.start, &req.end,
+					    le64_to_cpu(req.last_seq),
+					    le64_to_cpu(req.seq));
+
+	/* make sure next range avoids ranges for parent in use */
+	stat.next_range_key = par_end;
+	if (!scoutfs_key_is_ones(&stat.next_range_key))
+		scoutfs_key_inc(&stat.next_range_key);
+
+	/* update the status requests count */
+	le64_add_cpu(&stat.nr_requests, 1);
+	init_log_merge_key(&key, SCOUTFS_LOG_MERGE_STATUS_ZONE, 0, 0);
+	ret = scoutfs_btree_update(sb, &server->alloc, &server->wri,
+				   &super->log_merge, &key,
+				   &stat, sizeof(stat));
+	if (ret < 0)
+		goto out;
+	upd_stat = true;
+
+out:
+	if (ret < 0) {
+		/* undo any of our partial item changes */
+		if (upd_stat) {
+			le64_add_cpu(&stat.nr_requests, -1ULL);
+			init_log_merge_key(&key, SCOUTFS_LOG_MERGE_STATUS_ZONE,
+					   0, 0);
+			err = scoutfs_btree_update(sb, &server->alloc,
+						   &server->wri,
+						   &super->log_merge, &key,
+						   &stat, sizeof(stat));
+			BUG_ON(err); /* inconsistent */
+		}
+
+		if (del_req) {
+			init_log_merge_key(&key, SCOUTFS_LOG_MERGE_REQUEST_ZONE,
+					   rid, le64_to_cpu(req.seq));
+			err = scoutfs_btree_delete(sb, &server->alloc,
+						   &server->wri,
+						   &super->log_merge, &key);
+			BUG_ON(err); /* inconsistent */
+		}
+
+		if (del_remain) {
+			key = remain.start;
+			key.sk_zone = SCOUTFS_LOG_MERGE_RANGE_ZONE;
+			err = scoutfs_btree_delete(sb, &server->alloc,
+						   &server->wri,
+						   &super->log_merge, &key);
+			BUG_ON(err); /* inconsistent */
+		}
+
+		if (ins_rng) {
+			key = rng.start;
+			key.sk_zone = SCOUTFS_LOG_MERGE_RANGE_ZONE;
+			err = scoutfs_btree_insert(sb, &server->alloc,
+						   &server->wri,
+						   &super->log_merge, &key,
+						   &rng, sizeof(rng));
+			BUG_ON(err); /* inconsistent */
+		}
+
+		/* reclaim allocation if we failed */
+		mutex_lock(&server->alloc_mutex);
+		err = scoutfs_alloc_splice_list(sb, &server->alloc,
+						&server->wri,
+						server->other_freed,
+						&req.meta_avail);
+		mutex_unlock(&server->alloc_mutex);
+		BUG_ON(err); /* inconsistent */
+	}
+
+	mutex_unlock(&server->logs_mutex);
+	ret = scoutfs_server_apply_commit(sb, ret);
+
+	return scoutfs_net_response(sb, conn, cmd, id, ret,
+				    &req, sizeof(req));
+}
+
+/*
+ * Commit the client's log merge work.  Typically we store the
+ * completion so that we can later splice it back into the fs root and
+ * reclaim its allocators in a batch.  If it failed we reclaim it
+ * immediately.
+ */
+static int server_commit_log_merge(struct super_block *sb,
+				   struct scoutfs_net_connection *conn,
+				   u8 cmd, u64 id, void *arg, u16 arg_len)
+{
+	DECLARE_SERVER_INFO(sb, server);
+	u64 rid = scoutfs_net_client_rid(conn);
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
+	struct scoutfs_super_block *super = &sbi->super;
+	struct scoutfs_log_merge_request orig_req;
+	struct scoutfs_log_merge_complete *comp;
+	struct scoutfs_log_merge_status stat;
+	struct scoutfs_log_merge_range rng;
+	struct scoutfs_key key;
+	int ret;
+
+	scoutfs_key_set_zeros(&rng.end);
+
+	if (arg_len != sizeof(struct scoutfs_log_merge_complete))
+		return -EINVAL;
+	comp = arg;
+
+	trace_scoutfs_get_log_merge_complete(sb, rid, &comp->root,
+					     &comp->start, &comp->end,
+					     &comp->remain,
+					     le64_to_cpu(comp->seq),
+					     le64_to_cpu(comp->flags));
+
+	scoutfs_server_hold_commit(sb);
+	mutex_lock(&server->logs_mutex);
+
+	/* find the status of the current log merge */
+	ret = next_log_merge_item(sb, &super->log_merge,
+				  SCOUTFS_LOG_MERGE_STATUS_ZONE, 0, 0,
+				  &stat, sizeof(stat));
+	if (ret < 0) {
+		WARN_ON_ONCE(ret == -ENOENT); /* inconsistent */
+		goto out;
+	}
+
+	/* find the completion's original saved request */
+	ret = next_log_merge_item(sb, &super->log_merge,
+				  SCOUTFS_LOG_MERGE_REQUEST_ZONE,
+				  rid, le64_to_cpu(comp->seq),
+				  &orig_req, sizeof(orig_req));
+	if (WARN_ON_ONCE(ret == 0 && (comp->rid != orig_req.rid ||
+				      comp->seq != orig_req.seq)))
+		ret = -ENOENT; /* inconsistency */
+	if (ret < 0) {
+		WARN_ON_ONCE(ret == -ENOENT); /* inconsistency */
+		goto out;
+	}
+
+	/* delete the original request item */
+	init_log_merge_key(&key, SCOUTFS_LOG_MERGE_REQUEST_ZONE, rid,
+			   le64_to_cpu(orig_req.seq));
+	ret = scoutfs_btree_delete(sb, &server->alloc, &server->wri,
+				   &super->log_merge, &key);
+	if (ret < 0)
+		goto out;
+
+	if (le64_to_cpu(comp->flags) & SCOUTFS_LOG_MERGE_COMP_ERROR) {
+		/* restore the range and reclaim the allocator if it failed */
+		rng.start = orig_req.start;
+		rng.end = orig_req.end;
+
+		key = rng.start;
+		key.sk_zone = SCOUTFS_LOG_MERGE_RANGE_ZONE;
+		ret = scoutfs_btree_insert(sb, &server->alloc, &server->wri,
+					   &super->log_merge, &key,
+					   &rng, sizeof(rng));
+		if (ret < 0)
+			goto out;
+
+		mutex_lock(&server->alloc_mutex);
+		ret = scoutfs_alloc_splice_list(sb, &server->alloc,
+						&server->wri,
+						server->other_freed,
+						&orig_req.meta_avail) ?:
+		      scoutfs_alloc_splice_list(sb, &server->alloc,
+						&server->wri,
+						server->other_freed,
+						&orig_req.meta_freed);
+		mutex_unlock(&server->alloc_mutex);
+		if (ret < 0)
+			goto out;
+
+	} else {
+		/* otherwise store the completion for later splicing */
+		init_log_merge_key(&key, SCOUTFS_LOG_MERGE_COMPLETE_ZONE,
+				   le64_to_cpu(comp->seq), 0);
+		ret = scoutfs_btree_insert(sb, &server->alloc, &server->wri,
+					   &super->log_merge, &key,
+					   comp, sizeof(*comp));
+		if (ret < 0)
+			goto out;
+
+		le64_add_cpu(&stat.nr_complete, 1ULL);
+	}
+
+	/* and update the status counts */
+	le64_add_cpu(&stat.nr_requests, -1ULL);
+	init_log_merge_key(&key, SCOUTFS_LOG_MERGE_STATUS_ZONE, 0, 0);
+	ret = scoutfs_btree_update(sb, &server->alloc, &server->wri,
+				   &super->log_merge, &key,
+				   &stat, sizeof(stat));
+	if (ret < 0)
+		goto out;
+
+out:
+	mutex_unlock(&server->logs_mutex);
+	ret = scoutfs_server_apply_commit(sb, ret);
+	BUG_ON(ret < 0); /* inconsistent */
+
+	return scoutfs_net_response(sb, conn, cmd, id, ret, NULL, 0);
+}
+
 /* The server is receiving an omap response from the client */
 static int open_ino_map_response(struct super_block *sb, struct scoutfs_net_connection *conn,
 				 void *resp, unsigned int resp_len, int error, void *data)
@@ -1613,6 +2582,113 @@ static int cancel_srch_compact(struct super_block *sb, u64 rid)
 	return ret;
 }
 
+/*
+ * Clean up any log merge requests which have now been abandoned because
+ * their client was evicted.  This is always called on eviction and
+ * there may have been no merge in progress or our client had no
+ * outstanding requests.  For each pending request, we reclaim its
+ * allocators, delete its item, and update the status.
+ *
+ * The request we cancel might have been the last request which
+ * prevented batch processing, but we don't check that here.  This is in
+ * the client eviction path and we want that to be as light and
+ * responsive as possible so we can get back up and running.  The next
+ * client get_log_merge request will see that no more requests are
+ * outstanding.
+ *
+ * The caller holds a commit, but we're responsible for locking.
+ */
+static int cancel_log_merge(struct super_block *sb, u64 rid)
+{
+	DECLARE_SERVER_INFO(sb, server);
+	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
+	struct scoutfs_log_merge_status stat;
+	struct scoutfs_log_merge_request req;
+	struct scoutfs_log_merge_range rng;
+	struct scoutfs_key key;
+	bool update = false;
+	u64 seq;
+	int ret;
+
+	mutex_lock(&server->logs_mutex);
+
+	ret = next_log_merge_item(sb, &super->log_merge,
+				  SCOUTFS_LOG_MERGE_STATUS_ZONE, 0, 0,
+				  &stat, sizeof(stat));
+	if (ret < 0) {
+		if (ret == -ENOENT)
+			ret = 0;
+		goto out;
+	}
+
+	for (seq = 0; ; seq++) {
+		ret = next_log_merge_item(sb, &super->log_merge,
+					  SCOUTFS_LOG_MERGE_REQUEST_ZONE, rid,
+					  seq, &req, sizeof(req));
+		if (ret == 0 && le64_to_cpu(req.rid) != rid)
+			ret = -ENOENT;
+		if (ret < 0) {
+			if (ret == -ENOENT)
+				ret = 0;
+			break;
+		}
+
+		seq = le64_to_cpu(req.seq);
+
+		/* remove request item */
+		init_log_merge_key(&key, SCOUTFS_LOG_MERGE_REQUEST_ZONE, rid,
+				   le64_to_cpu(req.seq));
+		ret = scoutfs_btree_delete(sb, &server->alloc, &server->wri,
+					   &super->log_merge, &key);
+		if (ret < 0)
+			goto out;
+
+		/* restore range */
+		rng.start = req.start;
+		rng.end = req.end;
+
+		key = rng.start;
+		key.sk_zone = SCOUTFS_LOG_MERGE_RANGE_ZONE;
+		ret = scoutfs_btree_insert(sb, &server->alloc,
+					   &server->wri,
+					   &super->log_merge, &key,
+					   &rng, sizeof(rng));
+		if (ret < 0)
+			goto out;
+
+		/* reclaim allocator */
+		mutex_lock(&server->alloc_mutex);
+		ret = scoutfs_alloc_splice_list(sb, &server->alloc,
+						&server->wri,
+						server->other_freed,
+						&req.meta_avail) ?:
+		      scoutfs_alloc_splice_list(sb, &server->alloc,
+						&server->wri,
+						server->other_freed,
+						&req.meta_freed);
+		mutex_unlock(&server->alloc_mutex);
+		if (ret < 0)
+			goto out;
+
+		/* update count */
+		le64_add_cpu(&stat.nr_requests, -1ULL);
+		update = true;
+	}
+
+	if (update) {
+		/* and update the status counts */
+		init_log_merge_key(&key, SCOUTFS_LOG_MERGE_STATUS_ZONE, 0, 0);
+		ret = scoutfs_btree_update(sb, &server->alloc, &server->wri,
+					   &super->log_merge, &key,
+					   &stat, sizeof(stat));
+	}
+out:
+	mutex_unlock(&server->logs_mutex);
+
+	BUG_ON(ret < 0); /* XXX inconsistent */
+	return ret;
+}
+
 /*
  * Farewell processing is async to the request processing work.  Shutdown
  * waits for request processing to finish and then tears down the connection.
@@ -1758,8 +2834,9 @@ static int reclaim_rid(struct super_block *sb, u64 rid)
 	/* delete mounted client last, recovery looks for it */
 	ret = scoutfs_lock_server_farewell(sb, rid) ?:
 	      remove_trans_seq(sb, rid) ?:
-	      reclaim_log_trees(sb, rid) ?:
+	      reclaim_open_log_tree(sb, rid) ?:
 	      cancel_srch_compact(sb, rid) ?:
+	      cancel_log_merge(sb, rid) ?:
 	      scoutfs_omap_remove_rid(sb, rid) ?:
 	      delete_mounted_client(sb, rid);
 
@@ -1995,6 +3072,8 @@ static scoutfs_net_request_t server_req_funcs[] = {
 	[SCOUTFS_NET_CMD_LOCK]			= server_lock,
 	[SCOUTFS_NET_CMD_SRCH_GET_COMPACT]	= server_srch_get_compact,
 	[SCOUTFS_NET_CMD_SRCH_COMMIT_COMPACT]	= server_srch_commit_compact,
+	[SCOUTFS_NET_CMD_GET_LOG_MERGE]		= server_get_log_merge,
+	[SCOUTFS_NET_CMD_COMMIT_LOG_MERGE]	= server_commit_log_merge,
 	[SCOUTFS_NET_CMD_OPEN_INO_MAP]		= server_open_ino_map,
 	[SCOUTFS_NET_CMD_GET_VOLOPT]		= server_get_volopt,
 	[SCOUTFS_NET_CMD_SET_VOLOPT]		= server_set_volopt,
@@ -2367,6 +3446,8 @@ shutdown:
 	scoutfs_net_shutdown(sb, conn);
 	server->conn = NULL;
 
+	flush_work(&server->log_merge_free_work);
+
 	/* stop tracking recovery, cancel timer, flush any fencing */
 	scoutfs_recov_shutdown(sb);
 	flush_work(&server->fence_pending_recov_work);
@@ -2434,6 +3515,7 @@ void scoutfs_server_stop(struct super_block *sb)
 	cancel_work_sync(&server->work);
 	cancel_work_sync(&server->farewell_work);
 	cancel_work_sync(&server->commit_work);
+	cancel_work_sync(&server->log_merge_free_work);
 }
 
 int scoutfs_server_setup(struct super_block *sb)
@@ -2459,6 +3541,7 @@ int scoutfs_server_setup(struct super_block *sb)
 	INIT_WORK(&server->farewell_work, farewell_worker);
 	mutex_init(&server->alloc_mutex);
 	mutex_init(&server->logs_mutex);
+	INIT_WORK(&server->log_merge_free_work, server_log_merge_free_work);
 	mutex_init(&server->srch_mutex);
 	mutex_init(&server->mounted_clients_mutex);
 	seqcount_init(&server->roots_seqcount);

From 91acf92666cff8b7f5e26284510f3f657c161ee5 Mon Sep 17 00:00:00 2001
From: Zach Brown
Date: Fri, 18 Dec 2020 11:48:44 -0800
Subject: [PATCH 15/29] Add client btree merge processing

Add the client work which is regularly scheduled to ask the server for
log merging work to do.

The relatively simple client work gets a request from the server, finds
the log roots to merge given the request seq, performs the merge with a
btree call and callbacks, and commits the result to the server.
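In outline, each pass of the client worker added below makes one
request/complete round trip with the server; this sketch elides setup,
the walk that builds the input list, and error handling:

	struct scoutfs_log_merge_request req;
	struct scoutfs_log_merge_complete comp;

	ret = scoutfs_client_get_log_merge(sb, &req);
	/* ... gather finalized roots from req.logs_root into inputs ... */
	ret = scoutfs_btree_merge(sb, &alloc, &wri, &req.start, &req.end,
				  &next, &comp.root, &inputs,
				  merge_cmp, merge_is_del, ...);
	ret = scoutfs_client_commit_log_merge(sb, &comp);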
Signed-off-by: Zach Brown
---
 kmod/src/client.c |  20 ++++++
 kmod/src/client.h |   4 ++
 kmod/src/forest.c | 169 +++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 192 insertions(+), 1 deletion(-)

diff --git a/kmod/src/client.c b/kmod/src/client.c
index 7a4b4322..68fe4736 100644
--- a/kmod/src/client.c
+++ b/kmod/src/client.c
@@ -217,6 +217,26 @@ int scoutfs_client_srch_commit_compact(struct super_block *sb,
 				       res, sizeof(*res), NULL, 0);
 }
 
+int scoutfs_client_get_log_merge(struct super_block *sb,
+				 struct scoutfs_log_merge_request *req)
+{
+	struct client_info *client = SCOUTFS_SB(sb)->client_info;
+
+	return scoutfs_net_sync_request(sb, client->conn,
+					SCOUTFS_NET_CMD_GET_LOG_MERGE,
+					NULL, 0, req, sizeof(*req));
+}
+
+int scoutfs_client_commit_log_merge(struct super_block *sb,
+				    struct scoutfs_log_merge_complete *comp)
+{
+	struct client_info *client = SCOUTFS_SB(sb)->client_info;
+
+	return scoutfs_net_sync_request(sb, client->conn,
+					SCOUTFS_NET_CMD_COMMIT_LOG_MERGE,
+					comp, sizeof(*comp), NULL, 0);
+}
+
 int scoutfs_client_send_omap_response(struct super_block *sb, u64 id,
 				      struct scoutfs_open_ino_map *map)
 {
diff --git a/kmod/src/client.h b/kmod/src/client.h
index f8866abd..1cbcbc1d 100644
--- a/kmod/src/client.h
+++ b/kmod/src/client.h
@@ -22,6 +22,10 @@ int scoutfs_client_srch_get_compact(struct super_block *sb,
 				    struct scoutfs_srch_compact *sc);
 int scoutfs_client_srch_commit_compact(struct super_block *sb,
 				       struct scoutfs_srch_compact *res);
+int scoutfs_client_get_log_merge(struct super_block *sb,
+				 struct scoutfs_log_merge_request *req);
+int scoutfs_client_commit_log_merge(struct super_block *sb,
+				    struct scoutfs_log_merge_complete *comp);
 int scoutfs_client_send_omap_response(struct super_block *sb, u64 id,
 				      struct scoutfs_open_ino_map *map);
 int scoutfs_client_open_ino_map(struct super_block *sb, u64 group_nr,
diff --git a/kmod/src/forest.c b/kmod/src/forest.c
index f88ac5e2..37be80a0 100644
--- a/kmod/src/forest.c
+++ b/kmod/src/forest.c
@@ -52,6 +52,8 @@
  */
 
 struct forest_info {
+	struct super_block *sb;
+
 	struct mutex mutex;
 	struct scoutfs_alloc *alloc;
 	struct scoutfs_block_writer *wri;
@@ -60,6 +62,9 @@ struct forest_info {
 	struct mutex srch_mutex;
 	struct scoutfs_srch_file srch_file;
 	struct scoutfs_block *srch_bl;
+
+	struct workqueue_struct *workq;
+	struct delayed_work log_merge_dwork;
 };
 
 #define DECLARE_FOREST_INFO(sb, name) \
@@ -572,6 +577,149 @@ void scoutfs_forest_get_btrees(struct super_block *sb,
 			  &lt->bloom_ref);
 }
 
+/*
+ * Compare input items to merge by their log item value seq when their
+ * keys match.
+ */
+static int merge_cmp(void *a_val, int a_val_len, void *b_val, int b_val_len)
+{
+	struct scoutfs_log_item_value *a = a_val;
+	struct scoutfs_log_item_value *b = b_val;
+
+	/* sort merge item by seq */
+	return scoutfs_cmp(le64_to_cpu(a->seq), le64_to_cpu(b->seq));
+}
+
+static bool merge_is_del(void *val, int val_len)
+{
+	struct scoutfs_log_item_value *liv = val;
+
+	return !!(liv->flags & SCOUTFS_LOG_ITEM_FLAG_DELETION);
+}
+
+#define LOG_MERGE_DELAY_MS (5 * MSEC_PER_SEC)
+
+/*
+ * Regularly try to get a log merge request from the server. If we get
+ * a request we walk the log_trees items to find input trees and pass
+ * them to btree_merge. All of our work is done in dirty blocks
+ * allocated from available free blocks that the server gave us. If we
+ * hit an error then we drop our dirty blocks without writing them and
+ * send an error flag to the server so it can reclaim our allocators
+ * and ignore the rest of our work.
+ */
+static void scoutfs_forest_log_merge_worker(struct work_struct *work)
+{
+	struct forest_info *finf = container_of(work, struct forest_info,
+						log_merge_dwork.work);
+	struct super_block *sb = finf->sb;
+	struct scoutfs_btree_root_head *rhead = NULL;
+	struct scoutfs_btree_root_head *tmp;
+	struct scoutfs_log_merge_complete comp;
+	struct scoutfs_log_merge_request req;
+	struct scoutfs_log_trees *lt;
+	struct scoutfs_block_writer wri;
+	struct scoutfs_alloc alloc;
+	SCOUTFS_BTREE_ITEM_REF(iref);
+	struct scoutfs_key next;
+	struct scoutfs_key key;
+	unsigned long delay;
+	LIST_HEAD(inputs);
+	int ret;
+
+	ret = scoutfs_client_get_log_merge(sb, &req);
+	if (ret < 0)
+		goto resched;
+
+	comp.root = req.root;
+	comp.start = req.start;
+	comp.end = req.end;
+	comp.remain = req.end;
+	comp.rid = req.rid;
+	comp.seq = req.seq;
+	comp.flags = 0;
+
+	scoutfs_alloc_init(&alloc, &req.meta_avail, &req.meta_freed);
+	scoutfs_block_writer_init(sb, &wri);
+
+	/* find finalized input log trees up to last_seq */
+	for (scoutfs_key_init_log_trees(&key, 0, 0); ; scoutfs_key_inc(&key)) {
+
+		if (!rhead) {
+			rhead = kmalloc(sizeof(*rhead), GFP_NOFS);
+			if (!rhead) {
+				ret = -ENOMEM;
+				goto out;
+			}
+		}
+
+		ret = scoutfs_btree_next(sb, &req.logs_root, &key, &iref);
+		if (ret == 0) {
+			if (iref.val_len == sizeof(*lt)) {
+				key = *iref.key;
+				lt = iref.val;
+				if ((le64_to_cpu(lt->flags) &
+				     SCOUTFS_LOG_TREES_FINALIZED) &&
+				    (le64_to_cpu(lt->max_item_seq) <=
+				     le64_to_cpu(req.last_seq))) {
+					rhead->root = lt->item_root;
+					list_add_tail(&rhead->head, &inputs);
+					rhead = NULL;
+				}
+			} else {
+				ret = -EIO;
+			}
+			scoutfs_btree_put_iref(&iref);
+		}
+		if (ret < 0) {
+			if (ret == -ENOENT) {
+				ret = 0;
+				break;
+			}
+			goto out;
+		}
+	}
+
+	/* shouldn't be possible, but it's harmless */
+	if (list_empty(&inputs)) {
+		ret = 0;
+		goto out;
+	}
+
+	ret = scoutfs_btree_merge(sb, &alloc, &wri, &req.start, &req.end,
+				  &next, &comp.root, &inputs, merge_cmp,
+				  merge_is_del,
+				  !!(req.flags & cpu_to_le64(SCOUTFS_LOG_MERGE_REQUEST_SUBTREE)),
+				  sizeof(struct scoutfs_log_item_value),
+				  SCOUTFS_LOG_MERGE_DIRTY_BYTE_LIMIT, 10);
+	if (ret == -ERANGE) {
+		comp.remain = next;
+		le64_add_cpu(&comp.flags, SCOUTFS_LOG_MERGE_COMP_REMAIN);
+		ret = 0;
+	}
+
+out:
+	scoutfs_alloc_prepare_commit(sb, &alloc, &wri);
+	if (ret == 0)
+		ret = scoutfs_block_writer_write(sb, &wri);
+	scoutfs_block_writer_forget_all(sb, &wri);
+
+	comp.meta_avail = alloc.avail;
+	comp.meta_freed = alloc.freed;
+	if (ret < 0)
+		le64_add_cpu(&comp.flags, SCOUTFS_LOG_MERGE_COMP_ERROR);
+
+	ret = scoutfs_client_commit_log_merge(sb, &comp);
+
+	kfree(rhead);
+	list_for_each_entry_safe(rhead, tmp, &inputs, head)
+		kfree(rhead);
+
+resched:
+	delay = ret == 0 ? 0 : msecs_to_jiffies(LOG_MERGE_DELAY_MS);
+	queue_delayed_work(finf->workq, &finf->log_merge_dwork, delay);
+}
+
 int scoutfs_forest_setup(struct super_block *sb)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
@@ -585,10 +733,23 @@ int scoutfs_forest_setup(struct super_block *sb)
 	}
 
 	/* the finf fields will be setup as we open a transaction */
+	finf->sb = sb;
 	mutex_init(&finf->mutex);
 	mutex_init(&finf->srch_mutex);
-
+	INIT_DELAYED_WORK(&finf->log_merge_dwork,
+			  scoutfs_forest_log_merge_worker);
 	sbi->forest_info = finf;
+
+	finf->workq = alloc_workqueue("scoutfs_log_merge", WQ_NON_REENTRANT |
+				      WQ_UNBOUND | WQ_HIGHPRI, 0);
+	if (!finf->workq) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	queue_delayed_work(finf->workq, &finf->log_merge_dwork,
+			   msecs_to_jiffies(LOG_MERGE_DELAY_MS));
+
 	ret = 0;
 out:
 	if (ret)
@@ -604,6 +765,12 @@ void scoutfs_forest_destroy(struct super_block *sb)
 
 	if (finf) {
 		scoutfs_block_put(sb, finf->srch_bl);
+
+		if (finf->workq) {
+			cancel_delayed_work_sync(&finf->log_merge_dwork);
+			destroy_workqueue(finf->workq);
+		}
+
 		kfree(finf);
 		sbi->forest_info = NULL;
 	}

From 9711fef12210e62099bcaedb1771dba0e366a1af Mon Sep 17 00:00:00 2001
From: Zach Brown
Date: Fri, 18 Dec 2020 11:59:31 -0800
Subject: [PATCH 16/29] Update for core, trans, and item seq use

We now have a core seq number in the super that is advanced for
multiple users. The client transaction seq comes from the core seq so
we remove the trans_seq from the super. The item version is also
converted to use a seq that's derived from the core seq.

Signed-off-by: Zach Brown
---
 utils/src/mkfs.c  |  2 +-
 utils/src/print.c | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/utils/src/mkfs.c b/utils/src/mkfs.c
index 92dc0b50..bcf07357 100644
--- a/utils/src/mkfs.c
+++ b/utils/src/mkfs.c
@@ -236,7 +236,7 @@ static int do_mkfs(struct mkfs_args *args)
 	super->version = cpu_to_le64(SCOUTFS_INTEROP_VERSION);
 	uuid_generate(super->uuid);
 	super->next_ino = cpu_to_le64(SCOUTFS_ROOT_INO + 1);
-	super->next_trans_seq = cpu_to_le64(1);
+	super->seq = cpu_to_le64(1);
 	super->total_meta_blocks = cpu_to_le64(last_meta + 1);
 	super->first_meta_blkno = cpu_to_le64(next_meta);
 	super->last_meta_blkno = cpu_to_le64(last_meta);
diff --git a/utils/src/print.c b/utils/src/print.c
index 5fa57bdb..146d6582 100644
--- a/utils/src/print.c
+++ b/utils/src/print.c
@@ -210,8 +210,8 @@ static int print_logs_item(struct scoutfs_key *key, void *val,
 	/* only items in leaf blocks have values */
 	if (val) {
 		liv = val;
-		printf(" log_item_value: vers %llu flags %x\n",
-		       le64_to_cpu(liv->vers), liv->flags);
+		printf(" log_item_value: seq %llu flags %x\n",
+		       le64_to_cpu(liv->seq), liv->flags);
 
 		/* deletion items don't have values */
 		if (!(liv->flags & SCOUTFS_LOG_ITEM_FLAG_DELETION)) {
@@ -289,7 +289,7 @@ static int print_log_trees_item(struct scoutfs_key *key, void *val,
 	       " data_avail: "ALCROOT_F"\n"
 	       " data_freed: "ALCROOT_F"\n"
 	       " srch_file: "SRF_FMT"\n"
-	       " max_item_vers: %llu\n"
+	       " max_item_seq: %llu\n"
 	       " rid: %016llx\n"
 	       " nr: %llu\n"
 	       " data_alloc_zone_blocks: %llu\n"
@@ -304,7 +304,7 @@ static int print_log_trees_item(struct scoutfs_key *key, void *val,
 	       ALCROOT_A(&lt->data_avail),
 	       ALCROOT_A(&lt->data_freed),
 	       SRF_A(&lt->srch_file),
-	       le64_to_cpu(lt->max_item_vers),
+	       le64_to_cpu(lt->max_item_seq),
 	       le64_to_cpu(lt->rid),
 	       le64_to_cpu(lt->nr),
 	       le64_to_cpu(lt->data_alloc_zone_blocks));
@@ -878,7 +878,7 @@ static void print_super_block(struct scoutfs_super_block *super, u64 blkno)
 	printf(" flags: 0x%016llx\n", le64_to_cpu(super->flags));
 
 	/* XXX these are all in a crazy order */
-	printf(" next_ino %llu next_trans_seq %llu\n"
+	printf(" next_ino %llu seq %llu\n"
 	       " total_meta_blocks %llu first_meta_blkno %llu last_meta_blkno %llu\n"
 	       " total_data_blocks %llu first_data_blkno %llu last_data_blkno %llu\n"
 	       " meta_alloc[0]: "ALCROOT_F"\n"
@@ -893,7 +893,7 @@ static void print_super_block(struct scoutfs_super_block *super, u64 blkno)
 	       " trans_seqs root: height %u blkno %llu seq %llu\n"
 	       " fs_root btree root: height %u blkno %llu seq %llu\n",
 	       le64_to_cpu(super->next_ino),
-	       le64_to_cpu(super->next_trans_seq),
+	       le64_to_cpu(super->seq),
 	       le64_to_cpu(super->total_meta_blocks),
 	       le64_to_cpu(super->first_meta_blkno),
 	       le64_to_cpu(super->last_meta_blkno),

From c482204fcff69788d6eca5964d79544a1fece5bb Mon Sep 17 00:00:00 2001
From: Zach Brown
Date: Tue, 12 Jan 2021 14:48:35 -0800
Subject: [PATCH 17/29] Clean up btree root printing in superblock

Over time the printing of the btree roots embedded in the super block
has gotten a little out of hand. Add a helper macro for the printf
format and args and re-order them to match their order in the
superblock.

Signed-off-by: Zach Brown
---
 utils/src/print.c | 30 ++++++++++++++----------------
 1 file changed, 14 insertions(+), 16 deletions(-)

diff --git a/utils/src/print.c b/utils/src/print.c
index 146d6582..13589d35 100644
--- a/utils/src/print.c
+++ b/utils/src/print.c
@@ -859,6 +859,10 @@ out:
 	return ret;
 }
 
+#define BTR_FMT "blkno %llu seq %016llx height %u"
+#define BTR_ARG(rt) \
+	le64_to_cpu((rt)->ref.blkno), le64_to_cpu((rt)->ref.seq), (rt)->height
+
 static void print_super_block(struct scoutfs_super_block *super, u64 blkno)
 {
 	char uuid_str[37];
@@ -888,10 +892,11 @@ static void print_super_block(struct scoutfs_super_block *super, u64 blkno)
 	       " server_meta_avail[1]: "AL_HEAD_F"\n"
 	       " server_meta_freed[0]: "AL_HEAD_F"\n"
 	       " server_meta_freed[1]: "AL_HEAD_F"\n"
-	       " mounted_clients root: height %u blkno %llu seq %llu\n"
-	       " srch_root root: height %u blkno %llu seq %llu\n"
-	       " trans_seqs root: height %u blkno %llu seq %llu\n"
-	       " fs_root btree root: height %u blkno %llu seq %llu\n",
+	       " fs_root: "BTR_FMT"\n"
+	       " logs_root: "BTR_FMT"\n"
+	       " trans_seqs: "BTR_FMT"\n"
+	       " mounted_clients: "BTR_FMT"\n"
+	       " srch_root: "BTR_FMT"\n",
 	       le64_to_cpu(super->next_ino),
 	       le64_to_cpu(super->seq),
 	       le64_to_cpu(super->total_meta_blocks),
 	       le64_to_cpu(super->first_meta_blkno),
 	       le64_to_cpu(super->last_meta_blkno),
@@ -907,18 +912,11 @@ static void print_super_block(struct scoutfs_super_block *super, u64 blkno)
 	       AL_HEAD_A(&super->server_meta_avail[1]),
 	       AL_HEAD_A(&super->server_meta_freed[0]),
 	       AL_HEAD_A(&super->server_meta_freed[1]),
-	       super->mounted_clients.height,
-	       le64_to_cpu(super->mounted_clients.ref.blkno),
-	       le64_to_cpu(super->mounted_clients.ref.seq),
-	       super->srch_root.height,
-	       le64_to_cpu(super->srch_root.ref.blkno),
-	       le64_to_cpu(super->srch_root.ref.seq),
-	       super->trans_seqs.height,
-	       le64_to_cpu(super->trans_seqs.ref.blkno),
-	       le64_to_cpu(super->trans_seqs.ref.seq),
-	       super->fs_root.height,
-	       le64_to_cpu(super->fs_root.ref.blkno),
-	       le64_to_cpu(super->fs_root.ref.seq));
+	       BTR_ARG(&super->fs_root),
+	       BTR_ARG(&super->logs_root),
+	       BTR_ARG(&super->trans_seqs),
+	       BTR_ARG(&super->mounted_clients),
+	       BTR_ARG(&super->srch_root));
 
 	printf(" volume options:\n"
 	       " set_bits: %016llx\n",

From 3488b4e6e09e50d877a1095706fe6c4529d3323e Mon Sep 17 00:00:00 2001
From: Zach Brown
Date: Fri, 18 Dec 2020 14:57:46 -0800
Subject: [PATCH 18/29] Add scoutfs print support for log merge items

Add support for printing all the items in the log_merge tree that the
server uses to track log merging.
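The item printer leans on the existing paired format/arg helper macros
(SK_FMT/SK_ARG for keys, BTROOT_F/BTROOT_A for btree roots), the same
string-pasting pattern the previous patch introduced with
BTR_FMT/BTR_ARG. A minimal standalone illustration of the pattern (the
names here are hypothetical, not the utils' own):

	#define REF_FMT "blkno %llu seq %016llx"
	#define REF_ARG(ref) \
		(unsigned long long)le64_to_cpu((ref)->blkno), \
		(unsigned long long)le64_to_cpu((ref)->seq)

	printf("fs_root: "REF_FMT"\n", REF_ARG(&super->fs_root.ref));

Each use of the format macro in the string must be matched by exactly
one use of the arg macro in the argument list so the varargs stay in
sync.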
Signed-off-by: Zach Brown
---
 utils/src/print.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)

diff --git a/utils/src/print.c b/utils/src/print.c
index 13589d35..c6ea1fe0 100644
--- a/utils/src/print.c
+++ b/utils/src/print.c
@@ -292,6 +292,7 @@ static int print_log_trees_item(struct scoutfs_key *key, void *val,
 	       " max_item_seq: %llu\n"
 	       " rid: %016llx\n"
 	       " nr: %llu\n"
+	       " flags: %llx\n"
 	       " data_alloc_zone_blocks: %llu\n"
 	       " data_alloc_zones: ",
 	       AL_HEAD_A(&lt->meta_avail),
@@ -307,6 +308,7 @@ static int print_log_trees_item(struct scoutfs_key *key, void *val,
 	       le64_to_cpu(lt->max_item_seq),
 	       le64_to_cpu(lt->rid),
 	       le64_to_cpu(lt->nr),
+	       le64_to_cpu(lt->flags),
 	       le64_to_cpu(lt->data_alloc_zone_blocks));
 
 	for (i = 0; i < SCOUTFS_DATA_ALLOC_ZONE_LE64S; i++) {
@@ -383,6 +385,72 @@ static int print_mounted_client_entry(struct scoutfs_key *key, void *val,
 	return 0;
 }
 
+static int print_log_merge_item(struct scoutfs_key *key, void *val,
+				unsigned val_len, void *arg)
+{
+	struct scoutfs_log_merge_status *stat;
+	struct scoutfs_log_merge_range *rng;
+	struct scoutfs_log_merge_request *req;
+	struct scoutfs_log_merge_complete *comp;
+	struct scoutfs_log_merge_freeing *fr;
+
+	switch (key->sk_zone) {
+	case SCOUTFS_LOG_MERGE_STATUS_ZONE:
+		stat = val;
+		printf(" status: next_range_key "SK_FMT" nr_req %llu nr_comp %llu"
+		       " last_seq %llu seq %llu\n",
+		       SK_ARG(&stat->next_range_key),
+		       le64_to_cpu(stat->nr_requests),
+		       le64_to_cpu(stat->nr_complete),
+		       le64_to_cpu(stat->last_seq),
+		       le64_to_cpu(stat->seq));
+		break;
+	case SCOUTFS_LOG_MERGE_RANGE_ZONE:
+		rng = val;
+		printf(" range: start "SK_FMT" end "SK_FMT"\n",
+		       SK_ARG(&rng->start),
+		       SK_ARG(&rng->end));
+		break;
+	case SCOUTFS_LOG_MERGE_REQUEST_ZONE:
+		req = val;
+		printf(" request: logs_root "BTROOT_F" root "BTROOT_F" start "SK_FMT
+		       " end "SK_FMT" last_seq %llu rid %016llx seq %llu flags 0x%llx\n",
+		       BTROOT_A(&req->logs_root),
+		       BTROOT_A(&req->root),
+		       SK_ARG(&req->start),
+		       SK_ARG(&req->end),
+		       le64_to_cpu(req->last_seq),
+		       le64_to_cpu(req->rid),
+		       le64_to_cpu(req->seq),
+		       le64_to_cpu(req->flags));
+		break;
+	case SCOUTFS_LOG_MERGE_COMPLETE_ZONE:
+		comp = val;
+		printf(" complete: root "BTROOT_F" start "SK_FMT" end "SK_FMT
+		       " remain "SK_FMT" rid %016llx seq %llu flags %llx\n",
+		       BTROOT_A(&comp->root),
+		       SK_ARG(&comp->start),
+		       SK_ARG(&comp->end),
+		       SK_ARG(&comp->remain),
+		       le64_to_cpu(comp->rid),
+		       le64_to_cpu(comp->seq),
+		       le64_to_cpu(comp->flags));
+		break;
+	case SCOUTFS_LOG_MERGE_FREEING_ZONE:
+		fr = val;
+		printf(" freeing: root "BTROOT_F" key "SK_FMT" seq %llu\n",
+		       BTROOT_A(&fr->root),
+		       SK_ARG(&fr->key),
+		       le64_to_cpu(fr->seq));
+		break;
+	default:
+		printf(" (unknown log merge key zone %u)\n", key->sk_zone);
+		break;
+	}
+
+	return 0;
+}
+
 static int print_alloc_item(struct scoutfs_key *key, void *val,
 			    unsigned val_len, void *arg)
 {
@@ -894,6 +962,7 @@ static void print_super_block(struct scoutfs_super_block *super, u64 blkno)
 	       " server_meta_freed[1]: "AL_HEAD_F"\n"
 	       " fs_root: "BTR_FMT"\n"
 	       " logs_root: "BTR_FMT"\n"
+	       " log_merge: "BTR_FMT"\n"
 	       " trans_seqs: "BTR_FMT"\n"
 	       " mounted_clients: "BTR_FMT"\n"
 	       " srch_root: "BTR_FMT"\n",
@@ -914,6 +983,7 @@ static void print_super_block(struct scoutfs_super_block *super, u64 blkno)
 	       AL_HEAD_A(&super->server_meta_freed[1]),
 	       BTR_ARG(&super->fs_root),
 	       BTR_ARG(&super->logs_root),
+	       BTR_ARG(&super->log_merge),
 	       BTR_ARG(&super->trans_seqs),
 	       BTR_ARG(&super->mounted_clients),
 	       BTR_ARG(&super->srch_root));
@@ -971,6 +1041,11 @@ static int print_volume(int fd)
 	if (err && !ret)
 		ret = err;
 
+	err = print_btree(fd, super, "log_merge", &super->log_merge,
+			  print_log_merge_item, NULL);
+	if (err && !ret)
+		ret = err;
+
 	for (i = 0; i < array_size(super->server_meta_avail); i++) {
 		snprintf(str, sizeof(str), "server_meta_avail[%u]", i);
 		err = print_alloc_list_block(fd, str,

From 3d1a0f06c0fee730f40f3c06e9e396df2947a61f Mon Sep 17 00:00:00 2001
From: Zach Brown
Date: Wed, 23 Dec 2020 11:55:50 -0800
Subject: [PATCH 19/29] Add scoutfs_btree_free_blocks

Add a btree function for freeing all the blocks in a btree without
having to cow the blocks to track which refs have been freed. We use
a key from the caller to track which portions of the tree have been
freed.

Signed-off-by: Zach Brown
---
 kmod/src/btree.c         | 163 +++++++++++++++++++++++++++++++++++++++
 kmod/src/btree.h         |   6 ++
 kmod/src/scoutfs_trace.h |  42 ++++++++++
 3 files changed, 211 insertions(+)

diff --git a/kmod/src/btree.c b/kmod/src/btree.c
index f2e16924..96a20ef8 100644
--- a/kmod/src/btree.c
+++ b/kmod/src/btree.c
@@ -2173,3 +2173,166 @@ out:
 
 	return ret;
 }
+
+/*
+ * Free all the blocks referenced by a btree. The btree is only read;
+ * this does not update the blocks as it frees them. The caller
+ * ensures that these btrees aren't being modified.
+ *
+ * The caller's key tracks which blocks have been freed. It must be
+ * initialized to zeros before the first call to start freeing blocks.
+ * Once a block is freed the key is updated such that the freed block
+ * will not be read again.
+ *
+ * Returns 0 when progress has been made successfully, which includes
+ * partial progress. The key is set to all ones once we've freed all
+ * the blocks.
+ *
+ * This works by descending to the last parent block and freeing all its
+ * leaf blocks without reading them. As it descends it remembers the
+ * number of parent blocks which were traversed through their final
+ * child ref. If we free all the leaf blocks then all these parent
+ * blocks are no longer needed and can be freed. The caller's key is
+ * updated to past the subtree that we just freed and we retry the
+ * descent from the root through the next set of parents to the next set
+ * of leaf blocks to free.
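+ *
+ * A caller is expected to loop: call this to make progress, commit the
+ * resulting dirty blocks and frees, and call again with the same key
+ * until it is set to all ones. The alloc_low headroom is what lets us
+ * stop early with partial progress instead of exhausting the allocator
+ * mid-subtree.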
+ */
+int scoutfs_btree_free_blocks(struct super_block *sb,
+			      struct scoutfs_alloc *alloc,
+			      struct scoutfs_block_writer *wri,
+			      struct scoutfs_key *key,
+			      struct scoutfs_btree_root *root, int alloc_low)
+{
+	u64 blknos[SCOUTFS_BTREE_MAX_HEIGHT];
+	struct scoutfs_block *bl = NULL;
+	struct scoutfs_btree_item *item;
+	struct scoutfs_btree_block *bt;
+	struct scoutfs_block_ref ref;
+	struct scoutfs_avl_node *node;
+	struct scoutfs_avl_node *next;
+	struct scoutfs_key par_next;
+	int nr_par;
+	int level;
+	int ret;
+	int i;
+
+	if (WARN_ON_ONCE(root->height > ARRAY_SIZE(blknos)))
+		return -EIO; /* XXX corruption */
+
+	if (root->height == 0) {
+		scoutfs_key_set_ones(key);
+		return 0;
+	}
+
+	if (scoutfs_key_is_ones(key))
+		return 0;
+
+	/* just free a single leaf block */
+	if (root->height == 1) {
+		ret = scoutfs_free_meta(sb, alloc, wri,
+					le64_to_cpu(root->ref.blkno));
+		if (ret == 0) {
+			trace_scoutfs_btree_free_blocks_single(sb, root,
+					le64_to_cpu(root->ref.blkno));
+			scoutfs_key_set_ones(key);
+		}
+		goto out;
+	}
+
+	for (;;) {
+		/* start the walk at the root block */
+		level = root->height - 1;
+		ref = root->ref;
+		scoutfs_key_set_ones(&par_next);
+		nr_par = 0;
+
+		/* read blocks until we read the last parent */
+		for (;;) {
+			scoutfs_block_put(sb, bl);
+			bl = NULL;
+			ret = get_ref_block(sb, alloc, wri, 0, &ref, &bl);
+			if (ret < 0)
+				goto out;
+			bt = bl->data;
+
+			node = scoutfs_avl_search(&bt->item_root, cmp_key_item,
+						  key, NULL, NULL, &next, NULL);
+			if (node == NULL)
+				node = next;
+
+			/* should never descend into parent with no more refs */
+			if (WARN_ON_ONCE(node == NULL)) {
+				ret = -EIO;
+				goto out;
+			}
+
+			/* we'll free refs in the last parent */
+			if (level == 1)
+				break;
+
+			item = node_item(node);
+			next = scoutfs_avl_next(&bt->item_root, node);
+			if (next) {
+				/* didn't take last ref, still need parents */
+				nr_par = 0;
+				par_next = *item_key(item);
+				scoutfs_key_inc(&par_next);
+			} else {
+				/* final ref, could free after all leaves */
+				blknos[nr_par++] = le64_to_cpu(bt->hdr.blkno);
+			}
+
+			memcpy(&ref, item_val(bt, item), sizeof(ref));
+			level--;
+		}
+
+		/* free all leaf block refs in last parent */
+		while (node) {
+
+			/* make sure we can always free parents after leaves */
+			if (scoutfs_alloc_meta_low(sb, alloc,
+						   alloc_low + nr_par + 1)) {
+				ret = 0;
+				goto out;
+			}
+
+			item = node_item(node);
+			memcpy(&ref, item_val(bt, item), sizeof(ref));
+
+			trace_scoutfs_btree_free_blocks_leaf(sb, root,
+						le64_to_cpu(ref.blkno));
+			ret = scoutfs_free_meta(sb, alloc, wri,
+						le64_to_cpu(ref.blkno));
+			if (ret < 0)
+				goto out;
+
+			node = scoutfs_avl_next(&bt->item_root, node);
+			if (node) {
+				/* done with keys in child we just freed */
+				*key = *item_key(item);
+				scoutfs_key_inc(key);
+			}
+		}
+
+		/* now that leaves are freed, free any empty parents */
+		for (i = 0; i < nr_par; i++) {
+			trace_scoutfs_btree_free_blocks_parent(sb, root,
+							       blknos[i]);
+			ret = scoutfs_free_meta(sb, alloc, wri, blknos[i]);
+			BUG_ON(ret); /* checked meta low, freed should fit */
+		}
+
+		/* restart walk past the subtree we just freed */
+		*key = par_next;
+
+		/* but done if we just freed all parents down right spine */
+		if (scoutfs_key_is_ones(&par_next)) {
+			ret = 0;
+			goto out;
+		}
+	}
+
+out:
+	scoutfs_block_put(sb, bl);
+	return ret;
+}
diff --git a/kmod/src/btree.h b/kmod/src/btree.h
index 63447669..3d27fec2 100644
--- a/kmod/src/btree.h
+++ b/kmod/src/btree.h
@@ -128,6 +128,12 @@ int scoutfs_btree_merge(struct super_block *sb,
 			scoutfs_btree_merge_is_del_t merge_is_del,
 			bool subtree, int drop_val,
 			int dirty_limit, int alloc_low);
+int scoutfs_btree_free_blocks(struct super_block *sb,
+			      struct scoutfs_alloc *alloc,
+			      struct scoutfs_block_writer *wri,
+			      struct scoutfs_key *key,
+			      struct scoutfs_btree_root *root, int alloc_low);
+
 void scoutfs_btree_put_iref(struct scoutfs_btree_item_ref *iref);
 
 #endif
diff --git a/kmod/src/scoutfs_trace.h b/kmod/src/scoutfs_trace.h
index 4e58ef7a..fb5ea548 100644
--- a/kmod/src/scoutfs_trace.h
+++ b/kmod/src/scoutfs_trace.h
@@ -1760,6 +1760,48 @@ TRACE_EVENT(scoutfs_btree_merge_items,
 		  sk_trace_args(f_key), __entry->f_val_len, __entry->is_del)
 );
 
+DECLARE_EVENT_CLASS(scoutfs_btree_free_blocks,
+	TP_PROTO(struct super_block *sb, struct scoutfs_btree_root *root,
+		 u64 blkno),
+
+	TP_ARGS(sb, root, blkno),
+
+	TP_STRUCT__entry(
+		SCSB_TRACE_FIELDS
+		__field(__u64, root_blkno)
+		__field(__u64, root_seq)
+		__field(__u8, root_height)
+		__field(__u64, blkno)
+	),
+
+	TP_fast_assign(
+		SCSB_TRACE_ASSIGN(sb);
+		__entry->root_blkno = le64_to_cpu(root->ref.blkno);
+		__entry->root_seq = le64_to_cpu(root->ref.seq);
+		__entry->root_height = root->height;
+		__entry->blkno = blkno;
+	),
+
+	TP_printk(SCSBF" root blkno %llu seq %llu height %u, free blkno %llu",
+		  SCSB_TRACE_ARGS, __entry->root_blkno, __entry->root_seq,
+		  __entry->root_height, __entry->blkno)
+);
+DEFINE_EVENT(scoutfs_btree_free_blocks, scoutfs_btree_free_blocks_single,
+	TP_PROTO(struct super_block *sb, struct scoutfs_btree_root *root,
+		 u64 blkno),
+	TP_ARGS(sb, root, blkno)
+);
+DEFINE_EVENT(scoutfs_btree_free_blocks, scoutfs_btree_free_blocks_leaf,
+	TP_PROTO(struct super_block *sb, struct scoutfs_btree_root *root,
+		 u64 blkno),
+	TP_ARGS(sb, root, blkno)
+);
+DEFINE_EVENT(scoutfs_btree_free_blocks, scoutfs_btree_free_blocks_parent,
+	TP_PROTO(struct super_block *sb, struct scoutfs_btree_root *root,
+		 u64 blkno),
+	TP_ARGS(sb, root, blkno)
+);
+
 TRACE_EVENT(scoutfs_online_offline_blocks,
 	TP_PROTO(struct inode *inode, s64 on_delta, s64 off_delta,
 		 u64 on_now, u64 off_now),

From ff882a4c4fcf87c9b70446069987a839a5553904 Mon Sep 17 00:00:00 2001
From: Zach Brown
Date: Mon, 28 Dec 2020 13:30:12 -0800
Subject: [PATCH 20/29] Add btree total_above_join_low_water() test

Take the condition used to decide if a btree block needs to be joined
and put it in total_above_join_low_water() so that btree_merging will
be able to call it to see if the leaf block it's merging into needs to
be joined.

Signed-off-by: Zach Brown
---
 kmod/src/btree.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/kmod/src/btree.c b/kmod/src/btree.c
index 96a20ef8..fdcd8996 100644
--- a/kmod/src/btree.c
+++ b/kmod/src/btree.c
@@ -118,6 +118,11 @@ static unsigned int join_low_watermark(void)
 	return (SCOUTFS_BLOCK_LG_SIZE - sizeof(struct scoutfs_btree_block)) / 4;
 }
 
+static bool total_above_join_low_water(struct scoutfs_btree_block *bt)
+{
+	return le16_to_cpu(bt->total_item_bytes) >= join_low_watermark();
+}
+
 /*
  * return the integer percentages of total space the block could have
  * consumed by items that is currently consumed.
@@ -814,7 +819,7 @@ static int try_join(struct super_block *sb,
 	int to_move;
 	int ret;
 
-	if (le16_to_cpu(bt->total_item_bytes) >= join_low_watermark())
+	if (total_above_join_low_water(bt))
 		return 0;
 
 	scoutfs_inc_counter(sb, btree_join);

From 045b3ca8d4ddc74cd48e62daa06177555eba6889 Mon Sep 17 00:00:00 2001
From: Zach Brown
Date: Wed, 13 Jan 2021 14:02:20 -0800
Subject: [PATCH 21/29] Expand unused btree verifying walker

Previously we had an unused function that could be flipped on to
verify btree blocks during traversal. This refactors the block
verifier a bit to be called by a verifying walker. This will let
callers walk paths to leaves to verify the tree around operations,
rather than verification being performed during the next walk.

Signed-off-by: Zach Brown
---
 kmod/src/btree.c | 119 +++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 110 insertions(+), 9 deletions(-)

diff --git a/kmod/src/btree.c b/kmod/src/btree.c
index fdcd8996..ff28b5e8 100644
--- a/kmod/src/btree.c
+++ b/kmod/src/btree.c
@@ -913,13 +913,15 @@ static bool bad_avl_node_off(__le16 node_off, int nr)
  *  - call after leaf modification
  *  - padding is zero
  */
-static void verify_btree_block(struct super_block *sb,
+__attribute__((unused))
+static void verify_btree_block(struct super_block *sb, char *str,
 			       struct scoutfs_btree_block *bt, int level,
-			       struct scoutfs_key *start,
+			       bool last_ref, struct scoutfs_key *start,
 			       struct scoutfs_key *end)
 {
 	__le16 *buckets = leaf_item_hash_buckets(bt);
 	struct scoutfs_btree_item *item;
+	struct scoutfs_avl_node *node;
 	char *reason = NULL;
 	int first_val = 0;
 	int hashed = 0;
@@ -981,6 +983,12 @@ static void verify_btree_block(struct super_block *sb,
 			goto out;
 		}
 
+		if (level > 0 && le16_to_cpu(item->val_len) !=
+				 sizeof(struct scoutfs_block_ref)) {
+			reason = "parent item val not sizeof ref";
+			goto out;
+		}
+
 		if (le16_to_cpu(item->val_len) > SCOUTFS_BTREE_MAX_VAL_LEN) {
 			reason = "bad item val len";
 			goto out;
@@ -1001,6 +1009,15 @@ static void verify_btree_block(struct super_block *sb,
 		}
 	}
 
+	if (last_ref && level > 0 &&
+	    (node = scoutfs_avl_last(&bt->item_root)) != NULL) {
+		item = node_item(node);
+		if (scoutfs_key_compare(&item->key, end) != 0) {
+			reason = "final ref item key not range end";
+			goto out;
+		}
+	}
+
 	for (i = 0; level == 0 && i < SCOUTFS_BTREE_LEAF_ITEM_HASH_NR; i++) {
 		if (buckets[i] == 0)
 			continue;
@@ -1033,17 +1050,18 @@ out:
 	if (!reason)
 		return;
 
-	printk("found btree block inconsistency: %s\n", reason);
-	printk("start "SK_FMT" end "SK_FMT"\n", SK_ARG(start), SK_ARG(end));
+	printk("verifying btree %s: %s\n", str, reason);
+	printk("args: level %u last_ref %u start "SK_FMT" end "SK_FMT"\n",
+	       level, last_ref, SK_ARG(start), SK_ARG(end));
 	printk("calced: i %u tot %u hashed %u fv %u\n",
 	       i, tot, hashed, first_val);
-	printk("hdr: crc %x magic %x fsid %llx seq %llx blkno %llu\n",
+	printk("bt hdr: crc %x magic %x fsid %llx seq %llx blkno %llu\n",
 	       le32_to_cpu(bt->hdr.crc), le32_to_cpu(bt->hdr.magic),
 	       le64_to_cpu(bt->hdr.fsid), le64_to_cpu(bt->hdr.seq),
 	       le64_to_cpu(bt->hdr.blkno));
 	printk("item_root: node %u\n", le16_to_cpu(bt->item_root.node));
-	printk("nr %u tib %u mfl %u lvl %u\n",
+	printk("bt: nr %u tib %u mfl %u lvl %u\n",
 	       le16_to_cpu(bt->nr_items), le16_to_cpu(bt->total_item_bytes),
 	       le16_to_cpu(bt->mid_free_len), bt->level);
@@ -1060,6 +1078,92 @@ out:
 	BUG();
 }
 
+/*
+ * Walk from the root to the leaf, verifying the blocks traversed.
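+ *
+ * Each parent item bounds the keys that can appear in the child it
+ * references: start becomes one greater than the previous item's key
+ * (or stays at zeros for the first ref) and end becomes the item's own
+ * key, so the walk tightens the valid key range as it descends.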
+ */
+__attribute__((unused))
+static void verify_btree_walk(struct super_block *sb, char *str,
+			      struct scoutfs_btree_root *root,
+			      struct scoutfs_key *key)
+{
+	struct scoutfs_avl_node *next_node;
+	struct scoutfs_avl_node *node;
+	struct scoutfs_btree_item *item;
+	struct scoutfs_btree_item *prev;
+	struct scoutfs_block *bl = NULL;
+	struct scoutfs_btree_block *bt;
+	struct scoutfs_block_ref ref;
+	struct scoutfs_key start;
+	struct scoutfs_key end;
+	bool last_ref;
+	int level;
+	int ret;
+
+	if (root->height == 0 && root->ref.blkno != 0) {
+		WARN_ONCE(1, "invalid btree root height %u blkno %llu seq %016llx\n",
+			  root->height, le64_to_cpu(root->ref.blkno),
+			  le64_to_cpu(root->ref.seq));
+		return;
+	}
+
+	if (root->height == 0)
+		return;
+
+	scoutfs_key_set_zeros(&start);
+	scoutfs_key_set_ones(&end);
+	level = root->height;
+	ref = root->ref;
+	/* first parent last ref isn't all ones in subtrees */
+	last_ref = false;
+
+	while (level-- > 0) {
+		scoutfs_block_put(sb, bl);
+		bl = NULL;
+		ret = get_ref_block(sb, NULL, NULL, 0, &ref, &bl);
+		if (ret) {
+			printk("verifying btree %s: read error %d\n",
+			       str, ret);
+			break;
+		}
+		bt = bl->data;
+
+		verify_btree_block(sb, str, bt, level, last_ref, &start, &end);
+
+		if (level == 0)
+			break;
+
+		node = scoutfs_avl_search(&bt->item_root, cmp_key_item, key,
+					  NULL, NULL, &next_node, NULL);
+		item = node_item(node ?: next_node);
+
+		if (item == NULL) {
+			printk("verifying btree %s: no ref item\n", str);
+			printk("root: height %u blkno %llu seq %016llx\n",
+			       root->height, le64_to_cpu(root->ref.blkno),
+			       le64_to_cpu(root->ref.seq));
+			printk("walk level %u start "SK_FMT" end "SK_FMT"\n",
+			       level, SK_ARG(&start), SK_ARG(&end));
+
+			printk("block: level %u blkno %llu seq %016llx\n",
+			       bt->level, le64_to_cpu(bt->hdr.blkno),
+			       le64_to_cpu(bt->hdr.seq));
+			printk("key: "SK_FMT"\n", SK_ARG(key));
+			BUG();
+		}
+
+		if ((prev = prev_item(bt, item))) {
+			start = *item_key(prev);
+			scoutfs_key_inc(&start);
+		}
+		end = *item_key(item);
+
+		memcpy(&ref, item_val(bt, item), sizeof(ref));
+		last_ref = !next_item(bt, item);
+	}
+
+	scoutfs_block_put(sb, bl);
+}
+
 struct btree_walk_key_range {
 	struct scoutfs_key start;
 	struct scoutfs_key end;
@@ -1197,9 +1301,6 @@ restart:
 			break;
 		bt = bl->data;
 
-		if (0 && kr)
-			verify_btree_block(sb, bt, level, &kr->start, &kr->end);
-
 		/* XXX more aggressive block verification, before ref updates? */
 		if (bt->level != level) {
 			scoutfs_corruption(sb, SC_BTREE_BLOCK_LEVEL,

From 9febc6b5dc59a8cb64ad37f3a48f47b3021da476 Mon Sep 17 00:00:00 2001
From: Zach Brown
Date: Wed, 13 Jan 2021 14:33:44 -0800
Subject: [PATCH 22/29] Update btree block validator for 8byte alignment

The change to aligning values didn't update the btree block verifier's
total length calculation, and while we're in there we can also check
that values are correctly aligned.
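To make the accounting concrete: with values aligned to 8 bytes, a 13
byte value occupies round_up(13, 8) == 16 bytes of value space, so the
verifier's running total has to count item_len_bytes(13) rather than
sizeof(struct scoutfs_btree_item) + 13, and every val_off it sees must
be a multiple of SCOUTFS_BTREE_VALUE_ALIGN. This assumes
item_len_bytes() is the existing helper that rounds the value length
up to the alignment before adding the item struct overhead.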
Signed-off-by: Zach Brown
---
 kmod/src/btree.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/kmod/src/btree.c b/kmod/src/btree.c
index ff28b5e8..b87a5e72 100644
--- a/kmod/src/btree.c
+++ b/kmod/src/btree.c
@@ -928,7 +928,6 @@ static void verify_btree_block(struct super_block *sb, char *str,
 	int end_off;
 	int tot = 0;
 	int i = 0;
-	int j = 0;
 	int nr;
 
 	if (bt->level != level) {
@@ -967,8 +966,9 @@ static void verify_btree_block(struct super_block *sb, char *str,
 			goto out;
 		}
 
-		for (j = 0; j < sizeof(item->__pad); j++) {
-			WARN_ON_ONCE(item->__pad[j] != 0);
+		if (memchr_inv(item->__pad, '\0', sizeof(item->__pad))) {
+			reason = "item struct __pad isn't zero";
+			goto out;
 		}
 
 		if (scoutfs_key_compare(&item->key, start) < 0 ||
@@ -994,14 +994,18 @@ static void verify_btree_block(struct super_block *sb, char *str,
 			goto out;
 		}
 
+		if (le16_to_cpu(item->val_off) % SCOUTFS_BTREE_VALUE_ALIGN) {
+			reason = "item value not aligned";
+			goto out;
+		}
+
 		if (((int)le16_to_cpu(item->val_off) +
 		     le16_to_cpu(item->val_len)) > end_off) {
 			reason = "item value outside valid";
 			goto out;
 		}
 
-		tot += sizeof(struct scoutfs_btree_item) +
-		       le16_to_cpu(item->val_len);
+		tot += item_len_bytes(le16_to_cpu(item->val_len));
 
 		if (item->val_len != 0) {
 			first_val = min_t(int, first_val,

From 96d286d6e5d411621c8be96044631ffe17c8dbd3 Mon Sep 17 00:00:00 2001
From: Zach Brown
Date: Wed, 13 Jan 2021 15:22:01 -0800
Subject: [PATCH 23/29] Zero btree item padding as items are created

Item creation, which fills out a new item at the end of the array of
item structs at the start of the block, didn't explicitly zero the
item struct padding. It would only have been zero if the memory was
already zero, which is likely for new blocks, but isn't necessarily
true if the memory had previously been used by deleted values.

Signed-off-by: Zach Brown
---
 kmod/src/btree.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kmod/src/btree.c b/kmod/src/btree.c
index b87a5e72..808cbaa8 100644
--- a/kmod/src/btree.c
+++ b/kmod/src/btree.c
@@ -521,6 +521,7 @@ static void create_item(struct scoutfs_btree_block *bt,
 
 	item->val_off = insert_value(bt, ptr_off(bt, item), val, val_len);
 	item->val_len = cpu_to_le16(val_len);
+	memset(item->__pad, 0, sizeof(item->__pad));
 
 	le16_add_cpu(&bt->total_item_bytes, item_bytes(item));
 }

From c5c050bef0a091380f893f1dac5f7fa123ac6c85 Mon Sep 17 00:00:00 2001
From: Zach Brown
Date: Thu, 14 Jan 2021 12:10:17 -0800
Subject: [PATCH 24/29] Item cache might free null page on alloc error

The item cache allocates a page and a little tracking struct for each
cached page. If the page allocation fails it might try to free a null
page pointer, which isn't allowed.

Signed-off-by: Zach Brown
---
 kmod/src/item.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kmod/src/item.c b/kmod/src/item.c
index 9fb08463..f4c725d2 100644
--- a/kmod/src/item.c
+++ b/kmod/src/item.c
@@ -345,7 +345,8 @@ static struct cached_page *alloc_pg(struct super_block *sb, gfp_t gfp)
 	page = alloc_page(GFP_NOFS | gfp);
 	if (!page || !pg) {
 		kfree(pg);
-		__free_page(page);
+		if (page)
+			__free_page(page);
 		return NULL;
 	}

From d67db6662b27b7d76c159b1f839dbe2cce776135 Mon Sep 17 00:00:00 2001
From: Zach Brown
Date: Fri, 15 Jan 2021 13:36:16 -0800
Subject: [PATCH 25/29] Fix item cache val_len alignment math

Some item_val_len() callers were applying alignment twice, which isn't
needed. And additions to erased_bytes as value lengths change didn't
take alignment into account. They could end up double counting: a
val_len change within the alignment padding would be counted as it
happened and then counted again when the full aligned item was later
deleted. Additions to erased_bytes based on val_len should always
take alignment into account.

Signed-off-by: Zach Brown
---
 kmod/src/item.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/kmod/src/item.c b/kmod/src/item.c
index f4c725d2..d9cc2b2f 100644
--- a/kmod/src/item.c
+++ b/kmod/src/item.c
@@ -149,7 +149,8 @@ struct cached_item {
 
 static int item_val_bytes(int val_len)
 {
-	return round_up(offsetof(struct cached_item, val[val_len]), CACHED_ITEM_ALIGN);
+	return round_up(offsetof(struct cached_item, val[val_len]),
+			CACHED_ITEM_ALIGN);
 }
 
 /*
@@ -421,8 +422,7 @@ static struct cached_item *alloc_item(struct cached_page *pg,
 static void erase_item(struct cached_page *pg, struct cached_item *item)
 {
 	rbtree_erase(&item->node, &pg->item_root);
-	pg->erased_bytes += round_up(item_val_bytes(item->val_len),
-				     CACHED_ITEM_ALIGN);
+	pg->erased_bytes += item_val_bytes(item->val_len);
 }
 
 static void lru_add(struct super_block *sb, struct item_cache_info *cinf,
@@ -853,8 +853,7 @@ static void compact_page_items(struct super_block *sb,
 	for (from = first_item(&pg->item_root); from; from = next_item(from)) {
 		to = page_address(empty->page) + page_off;
-		page_off += round_up(item_val_bytes(from->val_len),
-				     CACHED_ITEM_ALIGN);
+		page_off += item_val_bytes(from->val_len);
 
 		/* copy the entire item, struct members and all */
 		memcpy(to, from, item_val_bytes(from->val_len));
@@ -1960,7 +1959,8 @@ int scoutfs_item_update(struct super_block *sb, struct scoutfs_key *key,
 		if (val_len)
 			memcpy(found->val, val, val_len);
 		if (val_len < found->val_len)
-			pg->erased_bytes += found->val_len - val_len;
+			pg->erased_bytes += item_val_bytes(found->val_len) -
+					    item_val_bytes(val_len);
 		found->val_len = val_len;
 		found->liv.seq = liv.seq;
 		mark_item_dirty(sb, cinf, pg, NULL, found);
@@ -2039,7 +2039,8 @@ static int item_delete(struct super_block *sb, struct scoutfs_key *key,
 		item->liv.seq = liv.seq;
 		item->liv.flags |= SCOUTFS_LOG_ITEM_FLAG_DELETION;
 		item->deletion = 1;
-		pg->erased_bytes += item->val_len;
+		pg->erased_bytes += item_val_bytes(item->val_len) -
+				    item_val_bytes(0);
 		item->val_len = 0;
 		mark_item_dirty(sb, cinf, pg, NULL, item);
 	}

From a1d46e1a926b90d965ec87a034e504d4a4b41b58 Mon Sep 17 00:00:00 2001
From: Zach Brown
Date: Thu, 10 Jun 2021 17:38:32 -0700
Subject: [PATCH 26/29] Fix mkfs btree item offset calculation

mkfs was miscalculating the offset of the start of the free region in
the center of blocks as it populated blocks with items. It was using
the length of the free region as its offset in the block. To find the
offset of the end of the free region in the block it has to be taken
relative to the end of the item array.
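A worked example with made-up offsets: if the item array currently
ends 512 bytes into the block and mid_free_len is 1024, the free
region spans bytes [512, 1536) and values pack downward from 1536.
After alloc_val() subtracts a 16 byte value from mid_free_len, the
value belongs at 512 + 1008 == 1520, the bottom of the value area.
The old calculation returned a block-relative offset of 1008, which
lands in the middle of the free region instead of at its end.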
Signed-off-by: Zach Brown
---
 utils/src/btree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils/src/btree.c b/utils/src/btree.c
index 9224f9de..201c47a5 100644
--- a/utils/src/btree.c
+++ b/utils/src/btree.c
@@ -40,7 +40,7 @@ static void *alloc_val(struct scoutfs_btree_block *bt, int len)
 {
 	le16_add_cpu(&bt->mid_free_len, -len);
 	le16_add_cpu(&bt->total_item_bytes, len);
 
-	return (void *)bt + le16_to_cpu(bt->mid_free_len);
+	return (void *)&bt->items[le16_to_cpu(bt->nr_items)] + le16_to_cpu(bt->mid_free_len);
 }
 
 /*

From a7828a64103fe2cd0c440f981df65d81e4b43a12 Mon Sep 17 00:00:00 2001
From: Zach Brown
Date: Mon, 14 Jun 2021 13:40:38 -0700
Subject: [PATCH 27/29] Add log merge item allocators to alloc detail

The alloc iterator needs to find and include the totals of the avail
and freed allocator list heads in the log merge items.

Signed-off-by: Zach Brown
---
 kmod/src/alloc.c  | 57 +++++++++++++++++++++++++++++++++++++++++++++++
 kmod/src/format.h |  1 +
 2 files changed, 58 insertions(+)

diff --git a/kmod/src/alloc.c b/kmod/src/alloc.c
index 92276395..d556112e 100644
--- a/kmod/src/alloc.c
+++ b/kmod/src/alloc.c
@@ -1272,9 +1272,15 @@ int scoutfs_alloc_foreach(struct super_block *sb,
 	struct scoutfs_block_ref refs[2] = {{0,}};
 	struct scoutfs_super_block *super = NULL;
 	struct scoutfs_srch_compact *sc;
+	struct scoutfs_log_merge_request *lmreq;
+	struct scoutfs_log_merge_complete *lmcomp;
 	struct scoutfs_log_trees lt;
 	SCOUTFS_BTREE_ITEM_REF(iref);
 	struct scoutfs_key key;
+	int expected;
+	u64 avail_tot;
+	u64 freed_tot;
+	u64 id;
 	int ret;
 
 	super = kmalloc(sizeof(struct scoutfs_super_block), GFP_NOFS);
@@ -1381,6 +1387,57 @@ retry:
 		scoutfs_key_inc(&key);
 	}
 
+	/* log merge allocators */
+	memset(&key, 0, sizeof(key));
+	key.sk_zone = SCOUTFS_LOG_MERGE_REQUEST_ZONE;
+	expected = sizeof(*lmreq);
+	id = 0;
+	avail_tot = 0;
+	freed_tot = 0;
+
+	for (;;) {
+		ret = scoutfs_btree_next(sb, &super->log_merge, &key, &iref);
+		if (ret == 0) {
+			if (iref.key->sk_zone != key.sk_zone) {
+				ret = -ENOENT;
+			} else if (iref.val_len == expected) {
+				key = *iref.key;
+				if (key.sk_zone == SCOUTFS_LOG_MERGE_REQUEST_ZONE) {
+					lmreq = iref.val;
+					id = le64_to_cpu(lmreq->rid);
+					avail_tot = le64_to_cpu(lmreq->meta_avail.total_nr);
+					freed_tot = le64_to_cpu(lmreq->meta_freed.total_nr);
+				} else {
+					lmcomp = iref.val;
+					id = le64_to_cpu(lmcomp->rid);
+					avail_tot = le64_to_cpu(lmcomp->meta_avail.total_nr);
+					freed_tot = le64_to_cpu(lmcomp->meta_freed.total_nr);
+				}
+			} else {
+				ret = -EIO;
+			}
+			scoutfs_btree_put_iref(&iref);
+		}
+		if (ret == -ENOENT) {
+			if (key.sk_zone == SCOUTFS_LOG_MERGE_REQUEST_ZONE) {
+				memset(&key, 0, sizeof(key));
+				key.sk_zone = SCOUTFS_LOG_MERGE_COMPLETE_ZONE;
+				expected = sizeof(*lmcomp);
+				continue;
+			}
+			break;
+		}
+		if (ret < 0)
+			goto out;
+
+		ret = cb(sb, arg, SCOUTFS_ALLOC_OWNER_LOG_MERGE, id, true, true, avail_tot) ?:
+		      cb(sb, arg, SCOUTFS_ALLOC_OWNER_LOG_MERGE, id, true, false, freed_tot);
+		if (ret < 0)
+			goto out;
+
+		scoutfs_key_inc(&key);
+	}
+
 	ret = 0;
 out:
 	if (ret == -ESTALE) {
diff --git a/kmod/src/format.h b/kmod/src/format.h
index aa6abc88..af2358a0 100644
--- a/kmod/src/format.h
+++ b/kmod/src/format.h
@@ -325,6 +325,7 @@ struct scoutfs_alloc_root {
 #define SCOUTFS_ALLOC_OWNER_SERVER 1
 #define SCOUTFS_ALLOC_OWNER_MOUNT 2
 #define SCOUTFS_ALLOC_OWNER_SRCH 3
+#define SCOUTFS_ALLOC_OWNER_LOG_MERGE 4
 
 struct scoutfs_mounted_client_btree_val {
 	union scoutfs_inet_addr addr;

From 5c3fdb48afc86951b2b41a1073372b065ffec677 Mon Sep 17 00:00:00 2001
From: Zach Brown
Date: Mon, 14 Jun 2021 17:10:16 -0700
Subject: [PATCH 28/29] Fix btree join item movement

Refilling a btree block by moving items from its siblings as it falls
under the join threshold had some pretty serious mistakes. It used
the target block's total item count instead of the sibling's when
deciding how many items to move. It didn't take item moving overruns
into account when deciding to compact so it could run out of
contiguous free space as it moved the last item. And once it
compacted it returned without moving because the return was meant to
be in the error case.

This is all fixed by correctly examining the sibling block to
determine if we should join a block up to 75% full or move a big chunk
over, compacting if the free space doesn't have room for an excessive
worst case overrun, and fixing the compaction error checking return
typo.

Signed-off-by: Zach Brown
---
 kmod/src/btree.c | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/kmod/src/btree.c b/kmod/src/btree.c
index 808cbaa8..46989385 100644
--- a/kmod/src/btree.c
+++ b/kmod/src/btree.c
@@ -108,10 +108,11 @@ static inline unsigned int item_bytes(struct scoutfs_btree_item *item)
 }
 
 /*
- * Join blocks when they both are 1/4 full.  This puts some distance
- * between the join threshold and the full threshold for splitting.
- * Blocks that just split or joined need to undergo a reasonable amount
- * of item modification before they'll split or join again.
+ * Refill blocks from their siblings when they're under 1/4 full.  This
+ * puts some distance between the join threshold and the full threshold
+ * for splitting.  Blocks that just split or joined need to undergo a
+ * reasonable amount of item modification before they'll split or join
+ * again.
  */
 static unsigned int join_low_watermark(void)
 {
@@ -815,6 +816,7 @@ static int try_join(struct super_block *sb,
 	struct scoutfs_btree_block *sib;
 	struct scoutfs_block *sib_bl;
 	struct scoutfs_block_ref *ref;
+	const unsigned int lwm = join_low_watermark();
 	unsigned int sib_tot;
 	bool move_right;
 	int to_move;
 	int ret;
@@ -840,18 +842,23 @@ static int try_join(struct super_block *sb,
 		return ret;
 	sib = sib_bl->data;
 
-	sib_tot = le16_to_cpu(bt->total_item_bytes);
-	if (sib_tot < join_low_watermark())
+	/* combine if resulting block would be up to 75% full, move big chunk otherwise */
+	sib_tot = le16_to_cpu(sib->total_item_bytes);
+	if (sib_tot <= lwm * 2)
 		to_move = sib_tot;
 	else
-		to_move = sib_tot - join_low_watermark();
+		to_move = lwm;
 
-	if (le16_to_cpu(bt->mid_free_len) < to_move) {
+	/* compact to make room for over-estimate of worst case move overrun */
+	if (le16_to_cpu(bt->mid_free_len) <
+	    (to_move + item_len_bytes(SCOUTFS_BTREE_MAX_VAL_LEN))) {
 		ret = compact_values(sb, bt);
-		if (ret < 0)
+		if (ret < 0) {
 			scoutfs_block_put(sb, sib_bl);
-		return ret;
+			return ret;
+		}
 	}
+
 	move_items(bt, sib, move_right, to_move);
 
 	/* update our parent's item */

From 28759f32698e9bc52a0697f3da1191604be68ef0 Mon Sep 17 00:00:00 2001
From: Zach Brown
Date: Wed, 16 Jun 2021 15:52:08 -0700
Subject: [PATCH 29/29] Rotate srch files as log trees items are reclaimed

The log merging work deletes log trees items once their item roots are
merged back into the fs root. Those deleted items could still have
populated srch files that would be lost. We force rotation of the
srch files in the items as they're reclaimed so that their contents
end up in rotated srch files that can be compacted.

Signed-off-by: Zach Brown
---
 kmod/src/server.c | 14 ++++++++++++--
 kmod/src/srch.c   |  5 +++--
 kmod/src/srch.h   |  2 +-
 3 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/kmod/src/server.c b/kmod/src/server.c
index 15d3f6ed..9e8307b8 100644
--- a/kmod/src/server.c
+++ b/kmod/src/server.c
@@ -823,7 +823,7 @@ static int server_commit_log_trees(struct super_block *sb,
 	/* try to rotate the srch log when big enough */
 	mutex_lock(&server->srch_mutex);
 	ret = scoutfs_srch_rotate_log(sb, &server->alloc, &server->wri,
-				      &super->srch_root, &lt.srch_file);
+				      &super->srch_root, &lt.srch_file, false);
 	mutex_unlock(&server->srch_mutex);
 	if (ret < 0) {
 		scoutfs_err(sb, "server error, rotating srch log: %d", ret);
@@ -922,6 +922,16 @@ static int reclaim_open_log_tree(struct super_block *sb, u64 rid)
 		goto out;
 	}
 
+	/* force srch log file rotation if it's populated */
+	mutex_lock(&server->srch_mutex);
+	ret = scoutfs_srch_rotate_log(sb, &server->alloc, &server->wri,
+				      &super->srch_root, &lt.srch_file, true);
+	mutex_unlock(&server->srch_mutex);
+	if (ret < 0) {
+		scoutfs_err(sb, "server error, reclaim rotating srch log: %d", ret);
+		goto out;
+	}
+
 	/*
 	 * All of these can return errors after having modified the
 	 * allocator trees.  We have to try and update the roots in the
@@ -944,7 +954,7 @@ static int reclaim_open_log_tree(struct super_block *sb, u64 rid)
 
 	err = scoutfs_btree_update(sb, &server->alloc, &server->wri,
 				   &super->logs_root, &key, &lt, sizeof(lt));
-	BUG_ON(err != 0); /* alloc and log item roots out of sync */
+	BUG_ON(err != 0); /* alloc, log, srch items out of sync */
 
 out:
 	mutex_unlock(&server->logs_mutex);
diff --git a/kmod/src/srch.c b/kmod/src/srch.c
index 372be7fe..9fbaaeb7 100644
--- a/kmod/src/srch.c
+++ b/kmod/src/srch.c
@@ -989,12 +989,13 @@ int scoutfs_srch_rotate_log(struct super_block *sb,
 			    struct scoutfs_alloc *alloc,
 			    struct scoutfs_block_writer *wri,
 			    struct scoutfs_btree_root *root,
-			    struct scoutfs_srch_file *sfl)
+			    struct scoutfs_srch_file *sfl, bool force)
 {
 	struct scoutfs_key key;
 	int ret;
 
-	if (le64_to_cpu(sfl->blocks) < SCOUTFS_SRCH_LOG_BLOCK_LIMIT)
+	if (sfl->ref.blkno == 0 ||
+	    (!force && le64_to_cpu(sfl->blocks) < SCOUTFS_SRCH_LOG_BLOCK_LIMIT))
 		return 0;
 
 	init_srch_key(&key, SCOUTFS_SRCH_LOG_TYPE,
diff --git a/kmod/src/srch.h b/kmod/src/srch.h
index 69448ab3..7f30f04c 100644
--- a/kmod/src/srch.h
+++ b/kmod/src/srch.h
@@ -37,7 +37,7 @@ int scoutfs_srch_rotate_log(struct super_block *sb,
 			    struct scoutfs_alloc *alloc,
 			    struct scoutfs_block_writer *wri,
 			    struct scoutfs_btree_root *root,
-			    struct scoutfs_srch_file *sfl);
+			    struct scoutfs_srch_file *sfl, bool force);
 int scoutfs_srch_get_compact(struct super_block *sb,
 			    struct scoutfs_alloc *alloc,
 			    struct scoutfs_block_writer *wri,