Compare commits

...

2 Commits

Author SHA1 Message Date
Zach Brown
5bea29a168 Use cwskip for the item cache
The use of pages in the item cache got us pretty far but it
fundamentally couldn't escape the contention around the global or
per-page read locks.  Some loads became bottlenecked in contention
in the item cache.   Worse, we were seeing inconsistency in the
per-cpu cached mappings of key ranges to pages.

All the users of items in the cache are transitioned from searching for
items in locked pages to searching for items in the cwskip list.  It's
fundamentally built around a seqlock-like begin/retry pattern so most of
the item work gets wrapped around search and retry helpers.

Without pages we no longer have a global list of dirty pages.   Instead
we have per-cpu lists of dirty items that are later sorted and handed to
the btree insertion iterator.   We take the opportunity to clean up that
interface now that it's very easy for us to iterate through the stable
list of dirty items.

Rather than a global lru of pages we have an algorithm for maintaining
items in rough groups of ages.  Shrinking randomly walks the cwskip list
looking for regions of sufficiently old items rather than walking a
precise global lru list of pages.

Signed-off-by: Zach Brown <zab@versity.com>
2021-12-23 15:11:54 -08:00
Zach Brown
7a999f2657 Add cwskip skip list
Add the cwskip list that is built for concurrent writers.   We're about
to use it to build the item cache around items instead of pages.

Signed-off-by: Zach Brown <zab@versity.com>
2021-12-23 15:11:54 -08:00
12 changed files with 1822 additions and 1949 deletions
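
The first commit message describes wrapping item work in seqlock-like begin/retry helpers around the cwskip list. As a rough sketch of that read-side pattern against the cwskip API added below -- the container, comparison callback, and lookup here (example_item, example_item_cmp, example_init, example_lookup) are invented for illustration, not the real item.c code, whose diff is suppressed further down:

#include "cwskip.h"

/* hypothetical container; the node must come last because of its trailing links[] */
struct example_item {
	struct scoutfs_key key;
	u64 seq;
	u64 val;
	struct scoutfs_cwskip_node node;
};

/* compare a search key (K) against a stored container (C) */
static int example_item_cmp(void *K, void *C)
{
	struct example_item *item = C;

	return scoutfs_key_compare(K, &item->key);
}

/* the root records where the node lives inside the container */
static void example_init(struct scoutfs_cwskip_root *root)
{
	scoutfs_cwskip_init_root(root, example_item_cmp,
				 offsetof(struct example_item, node));
}

/* copy a value out under a read section, retrying if the nodes were modified */
static int example_lookup(struct scoutfs_cwskip_root *root, struct scoutfs_key *key, u64 *val)
{
	struct scoutfs_cwskip_reader rd;
	struct example_item *item;
	void *cont;
	int node_cmp;
	int ret;

retry:
	scoutfs_cwskip_read_begin(root, key, NULL, &cont, &node_cmp, &rd);
	item = cont;
	if (item && node_cmp == 0) {
		*val = item->val;
		ret = 0;
	} else {
		ret = -ENOENT;
	}
	if (!scoutfs_cwskip_read_valid(&rd)) {
		scoutfs_cwskip_read_end(&rd);
		goto retry;
	}
	scoutfs_cwskip_read_end(&rd);

	return ret;
}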

View File

@@ -13,6 +13,7 @@ scoutfs-y += \
block.o \
btree.o \
client.o \
cwskip.o \
counters.o \
data.o \
dir.o \

View File

@@ -1875,12 +1875,11 @@ out:
* set in btree items. They're only used for fs items written through
* the item cache and forest of log btrees.
*/
int scoutfs_btree_insert_list(struct super_block *sb,
struct scoutfs_alloc *alloc,
struct scoutfs_block_writer *wri,
struct scoutfs_btree_root *root,
struct scoutfs_btree_item_list *lst)
int scoutfs_btree_insert_list(struct super_block *sb, struct scoutfs_alloc *alloc,
struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root,
scoutfs_btree_item_iter_cb iter_cb, void *pos, void *arg)
{
struct scoutfs_btree_item_desc desc;
struct scoutfs_btree_item *item;
struct btree_walk_key_range kr;
struct scoutfs_btree_block *bt;
@@ -1889,44 +1888,46 @@ int scoutfs_btree_insert_list(struct super_block *sb,
int cmp;
int ret = 0;
while (lst) {
pos = iter_cb(sb, &desc, pos, arg);
while (pos) {
ret = btree_walk(sb, alloc, wri, root, BTW_DIRTY | BTW_INSERT,
&lst->key, lst->val_len, &bl, &kr, NULL);
desc.key, desc.val_len, &bl, &kr, NULL);
if (ret < 0)
goto out;
bt = bl->data;
do {
item = leaf_item_hash_search(sb, bt, &lst->key);
item = leaf_item_hash_search(sb, bt, desc.key);
if (item) {
/* try to merge delta values, _NULL not deleted; merge will */
ret = scoutfs_forest_combine_deltas(&lst->key,
ret = scoutfs_forest_combine_deltas(desc.key,
item_val(bt, item),
item_val_len(item),
lst->val, lst->val_len);
desc.val, desc.val_len);
if (ret < 0) {
scoutfs_block_put(sb, bl);
goto out;
}
item->seq = cpu_to_le64(lst->seq);
item->flags = lst->flags;
item->seq = cpu_to_le64(desc.seq);
item->flags = desc.flags;
if (ret == 0)
update_item_value(bt, item, lst->val, lst->val_len);
update_item_value(bt, item, desc.val, desc.val_len);
else
ret = 0;
} else {
scoutfs_avl_search(&bt->item_root,
cmp_key_item, &lst->key,
cmp_key_item, desc.key,
&cmp, &par, NULL, NULL);
create_item(bt, &lst->key, lst->seq, lst->flags, lst->val,
lst->val_len, par, cmp);
create_item(bt, desc.key, desc.seq, desc.flags, desc.val,
desc.val_len, par, cmp);
}
lst = lst->next;
} while (lst && scoutfs_key_compare(&lst->key, &kr.end) <= 0 &&
mid_free_item_room(bt, lst->val_len));
pos = iter_cb(sb, &desc, pos, arg);
} while (pos && scoutfs_key_compare(desc.key, &kr.end) <= 0 &&
mid_free_item_room(bt, desc.val_len));
scoutfs_block_put(sb, bl);
}

View File

@@ -18,11 +18,24 @@ struct scoutfs_btree_item_ref {
#define SCOUTFS_BTREE_ITEM_REF(name) \
struct scoutfs_btree_item_ref name = {NULL,}
/* caller gives an item to the callback */
/* btree gives an item to caller */
typedef int (*scoutfs_btree_item_cb)(struct super_block *sb,
struct scoutfs_key *key, u64 seq, u8 flags,
void *val, int val_len, void *arg);
struct scoutfs_btree_item_desc {
struct scoutfs_key *key;
void *val;
u64 seq;
u8 flags;
unsigned val_len;
};
/* btree iterates through items from caller */
typedef void *(*scoutfs_btree_item_iter_cb)(struct super_block *sb,
struct scoutfs_btree_item_desc *desc,
void *pos, void *arg);
/* simple singly-linked list of items */
struct scoutfs_btree_item_list {
struct scoutfs_btree_item_list *next;
@@ -78,11 +91,9 @@ int scoutfs_btree_read_items(struct super_block *sb,
struct scoutfs_key *start,
struct scoutfs_key *end,
scoutfs_btree_item_cb cb, void *arg);
int scoutfs_btree_insert_list(struct super_block *sb,
struct scoutfs_alloc *alloc,
struct scoutfs_block_writer *wri,
struct scoutfs_btree_root *root,
struct scoutfs_btree_item_list *lst);
int scoutfs_btree_insert_list(struct super_block *sb, struct scoutfs_alloc *alloc,
struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root,
scoutfs_btree_item_iter_cb iter_cb, void *pos, void *arg);
int scoutfs_btree_parent_range(struct super_block *sb,
struct scoutfs_btree_root *root,
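
The new iter_cb/pos/arg interface replaces the singly-linked item list: judging by the insertion loop in btree.c above, each call to the callback fills the desc for the next item and returns a non-NULL cursor for it, NULL ends the walk, and the caller's initial pos acts as the starting cursor. A sketch of a producer under those assumptions, with an invented array-backed source (example_dirty_item, example_dirty_array, and example_insert_iter are hypothetical):

/* hypothetical flat array of pre-sorted dirty items */
struct example_dirty_item {
	struct scoutfs_key key;
	u64 seq;
	u8 flags;
	unsigned val_len;
	void *val;
};

struct example_dirty_array {
	struct example_dirty_item *items;
	unsigned long nr;
};

/*
 * pos is an opaque cursor: NULL to start, otherwise the item described
 * by the previous call.  Fill desc for the next item and return a
 * cursor for it, or NULL once the array is exhausted.
 */
static void *example_insert_iter(struct super_block *sb, struct scoutfs_btree_item_desc *desc,
				 void *pos, void *arg)
{
	struct example_dirty_array *da = arg;
	struct example_dirty_item *di = pos;

	di = di ? di + 1 : da->items;
	if (di >= da->items + da->nr)
		return NULL;

	desc->key = &di->key;
	desc->seq = di->seq;
	desc->flags = di->flags;
	desc->val = di->val;
	desc->val_len = di->val_len;

	return di;
}

A producer like this would be handed straight through, e.g. scoutfs_forest_insert_list(sb, example_insert_iter, NULL, &da), with the NULL pos meaning the walk starts at the first item.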

View File

@@ -90,36 +90,27 @@
EXPAND_COUNTER(forest_read_items) \
EXPAND_COUNTER(forest_roots_next_hint) \
EXPAND_COUNTER(forest_set_bloom_bits) \
EXPAND_COUNTER(item_alloc_bytes) \
EXPAND_COUNTER(item_clear_dirty) \
EXPAND_COUNTER(item_create) \
EXPAND_COUNTER(item_delete) \
EXPAND_COUNTER(item_delta) \
EXPAND_COUNTER(item_delta_written) \
EXPAND_COUNTER(item_dirty) \
EXPAND_COUNTER(item_free_bytes) \
EXPAND_COUNTER(item_invalidate) \
EXPAND_COUNTER(item_invalidate_page) \
EXPAND_COUNTER(item_invalidate_item) \
EXPAND_COUNTER(item_lookup) \
EXPAND_COUNTER(item_mark_dirty) \
EXPAND_COUNTER(item_next) \
EXPAND_COUNTER(item_page_accessed) \
EXPAND_COUNTER(item_page_alloc) \
EXPAND_COUNTER(item_page_clear_dirty) \
EXPAND_COUNTER(item_page_compact) \
EXPAND_COUNTER(item_page_free) \
EXPAND_COUNTER(item_page_lru_add) \
EXPAND_COUNTER(item_page_lru_remove) \
EXPAND_COUNTER(item_page_mark_dirty) \
EXPAND_COUNTER(item_page_rbtree_walk) \
EXPAND_COUNTER(item_page_split) \
EXPAND_COUNTER(item_pcpu_add_replaced) \
EXPAND_COUNTER(item_pcpu_page_hit) \
EXPAND_COUNTER(item_pcpu_page_miss) \
EXPAND_COUNTER(item_pcpu_page_miss_keys) \
EXPAND_COUNTER(item_read_pages_split) \
EXPAND_COUNTER(item_shrink_page) \
EXPAND_COUNTER(item_shrink_page_dirty) \
EXPAND_COUNTER(item_shrink_page_reader) \
EXPAND_COUNTER(item_shrink_page_trylock) \
EXPAND_COUNTER(item_shrink) \
EXPAND_COUNTER(item_shrink_all) \
EXPAND_COUNTER(item_shrink_exhausted) \
EXPAND_COUNTER(item_shrink_read_search) \
EXPAND_COUNTER(item_shrink_removed) \
EXPAND_COUNTER(item_shrink_searched) \
EXPAND_COUNTER(item_shrink_skipped) \
EXPAND_COUNTER(item_shrink_write_search) \
EXPAND_COUNTER(item_update) \
EXPAND_COUNTER(item_write_dirty) \
EXPAND_COUNTER(lock_alloc) \

584
kmod/src/cwskip.c Normal file
View File

@@ -0,0 +1,584 @@
/*
* Copyright (C) 2021 Versity Software, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/rcupdate.h>
#include <linux/random.h>
#include "cwskip.h"
/*
* This skip list is built to allow concurrent modification and limit
* contention to the region of the list around the modification. All
* node references are protected by RCU. Each node has a write_seq
* that works like a seqlock; the big differences are that we nest them
* and use trylock to acquire them.
*
* Readers sample the write_seqs of nodes containing links as they
* traverse them, verifying that the node hasn't been modified before
* traversing to the node referenced by the link.
*
* Writers remember the seqs of all the nodes they traversed to end up
* at their final node. They try to acquire the lock of all the nodes
* needed to modify the list at a given height. Their trylocks will
* fail if any of the nodes have changed since their traversal.
*
* The interface is built around references to adjacent pairs of nodes
* and their sequence numbers. This lets readers and writers traverse
* through their local region of the list until they hit contention and
* must start over with a full search.
*
* The caller is responsible for allocating and freeing nodes. The
* interface is built around the caller's objects, which each have embedded
* nodes.
*/
/*
* node_off is the positive offset of the cwskip node within the
* container structs stored in the list. The node_off is subtracted
* from node pointers to give the caller a pointer to their stored
* container struct.
*/
void scoutfs_cwskip_init_root(struct scoutfs_cwskip_root *root, scoutfs_cwskip_cmp_t cmp_fn,
unsigned long node_off)
{
memset(root, 0, sizeof(*root));
root->cmp_fn = cmp_fn;
root->node_off = node_off;
}
/* This is completely racy and should be used accordingly. */
bool scoutfs_cwskip_empty(struct scoutfs_cwskip_root *root)
{
int i;
for (i = 0; i < SCOUTFS_CWSKIP_MAX_HEIGHT; i++) {
if (root->node.links[i] != NULL)
return false;
}
return true;
}
/*
* Return a random height between 1 and max height, inclusive. Using
* ffs means that each greater height relies on all lower height bits
* being clear and we get the height distribution we want: 1 = 1/2,
* 2 = 1/4, 3 = 1/8, etc.
*/
int scoutfs_cwskip_rand_height(void)
{
return ffs(prandom_u32() | (1 << (SCOUTFS_CWSKIP_MAX_HEIGHT - 1)));
}
static void *node_container(struct scoutfs_cwskip_root *root, struct scoutfs_cwskip_node *node)
{
return node ? (void *)((unsigned long)node - root->node_off) : NULL;
}
/*
* Set the caller's containers for the given nodes. There isn't a
* previous container when the previous node is the root's static
* full-height node.
*/
static void set_containers(struct scoutfs_cwskip_root *root, struct scoutfs_cwskip_node *prev,
struct scoutfs_cwskip_node *node, void **prev_cont, void **node_cont)
{
if (prev_cont)
*prev_cont = (prev != &root->node) ? node_container(root, prev) : NULL;
if (node_cont)
*node_cont = node_container(root, node);
}
static struct scoutfs_cwskip_node *node_read_begin(struct scoutfs_cwskip_node *node,
unsigned int *seq)
{
if (node) {
*seq = READ_ONCE(node->write_seq) & ~1U;
smp_rmb();
} else {
*seq = 1; /* caller shouldn't use if we return null, being careful */
}
return node;
}
static bool node_read_retry(struct scoutfs_cwskip_node *node, unsigned int seq)
{
if (node) {
smp_rmb();
return READ_ONCE(node->write_seq) != seq;
}
return false;
}
/*
* write_seq is only an int to reduce the size of nodes and full-height
* seq arrays; it could be a long if archs have trouble with int
* cmpxchg.
*/
static bool __node_trylock(struct scoutfs_cwskip_node *node, unsigned int seq)
{
if (seq & 1)
return false;
return cmpxchg(&node->write_seq, seq, seq + 1) == seq;
}
static bool node_trylock(struct scoutfs_cwskip_node *node, unsigned int seq)
{
bool locked = __node_trylock(node, seq);
if (locked)
smp_wmb();
return locked;
}
static void __node_unlock(struct scoutfs_cwskip_node *node)
{
node->write_seq++;
}
static void node_unlock(struct scoutfs_cwskip_node *node)
{
__node_unlock(node);
smp_wmb();
}
/* return -1/1 to go left/right, never 0 */
static int random_cmp(void *K, void *C)
{
return (int)(prandom_u32() & 2) - 1;
}
static void cwskip_search(struct scoutfs_cwskip_root *root, void *key, int *node_cmp,
struct scoutfs_cwskip_reader *rd, struct scoutfs_cwskip_writer *wr,
unsigned int *prev_seqs)
{
struct scoutfs_cwskip_node *prev;
struct scoutfs_cwskip_node *node;
scoutfs_cwskip_cmp_t cmp_fn;
unsigned int prev_seq;
unsigned int node_seq;
int level;
int cmp;
if (key == NULL)
cmp_fn = random_cmp;
else
cmp_fn = root->cmp_fn;
restart:
prev = node_read_begin(&root->node, &prev_seq);
node = NULL;
node_seq = 1;
cmp = -1;
level = SCOUTFS_CWSKIP_MAX_HEIGHT - 1;
while (prev && level >= 0) {
node = node_read_begin(prev->links[level], &node_seq);
if (!node) {
cmp = -1;
level--;
continue;
}
cmp = cmp_fn(key, node_container(root, node));
if (cmp > 0) {
if (node_read_retry(prev, prev_seq))
goto restart;
prev = node;
prev_seq = node_seq;
node = NULL;
continue;
}
if (wr) {
wr->prevs[level] = prev;
prev_seqs[level] = prev_seq;
}
level--;
}
rd->prev = prev;
rd->prev_seq = prev_seq;
rd->node = node;
rd->node_seq = node_seq;
*node_cmp = cmp;
}
static void init_reader(struct scoutfs_cwskip_reader *rd, struct scoutfs_cwskip_root *root)
{
memset(rd, 0, sizeof(struct scoutfs_cwskip_reader));
rd->root = root;
}
/*
* Find and return nodes that surround the search key.
*
* Either prev or node can be null if there are no nodes before or after
* the search key. *node_cmp is set to the final comparison of the key
* and the returned node's container key; it will be 0 if an exact match
* is found.
*
* This starts an RCU read critical section and is fully concurrent with
* both other readers and writers. The nodes won't be freed until
* after the section so it's always safe to reference them but their
* contents might be nonsense if they're modified during the read.
* Nothing learned from the list during the read section should have an
* effect until after _read_valid has said it was OK.
*
* _read_valid can be called after referencing the nodes to see if they
* were stable during the read. _read_next can be used to iterate
* forward through the list without repeating the search. The caller
* must always call a matching _read_end once they're done.
*/
void scoutfs_cwskip_read_begin(struct scoutfs_cwskip_root *root, void *key, void **prev_cont,
void **node_cont, int *node_cmp, struct scoutfs_cwskip_reader *rd)
__acquires(RCU) /* :/ */
{
init_reader(rd, root);
rcu_read_lock();
cwskip_search(root, key, node_cmp, rd, NULL, NULL);
set_containers(root, rd->prev, rd->node, prev_cont, node_cont);
}
/*
* Returns true if the nodes referenced by the reader haven't been
* modified and any references of them were consistent. This does not
* end the reader critical section and can be called multiple times.
*/
bool scoutfs_cwskip_read_valid(struct scoutfs_cwskip_reader *rd)
{
return !(node_read_retry(rd->prev, rd->prev_seq) ||
node_read_retry(rd->node, rd->node_seq));
}
/*
* Advance from the current prev/node to the next pair of nodes in the
* list. prev_cont is set to what node_cont was before the call.
* node_cont is set to the next node after the current node_cont.
*
* This returns true if it found a next node and that its load of the
* next pointer from node was valid and stable. Returning false means
* that the caller should retry. There could be more items in the list.
*/
bool scoutfs_cwskip_read_next(struct scoutfs_cwskip_reader *rd, void **prev_cont, void **node_cont)
{
struct scoutfs_cwskip_node *next;
unsigned int next_seq;
bool valid_next;
next = rd->node ? node_read_begin(rd->node->links[0], &next_seq) : NULL;
valid_next = scoutfs_cwskip_read_valid(rd) && next;
if (valid_next) {
rd->prev = rd->node;
rd->prev_seq = rd->node_seq;
rd->node = next;
rd->node_seq = next_seq;
set_containers(rd->root, rd->prev, rd->node, prev_cont, node_cont);
}
return valid_next;
}
/*
* End the critical section started with _read_begin.
*/
void scoutfs_cwskip_read_end(struct scoutfs_cwskip_reader *rd)
__releases(RCU) /* :/ */
{
rcu_read_unlock();
}
/*
* Higher locks are more likely to cause contention so we unlock them
* first.
*/
static void writer_unlock(struct scoutfs_cwskip_writer *wr)
{
int i;
for (i = wr->locked_height - 1; i >= 0; i--) {
if (i == 0 || (wr->prevs[i - 1] != wr->prevs[i]))
__node_unlock(wr->prevs[i]);
}
if (wr->node_locked)
__node_unlock(wr->node);
smp_wmb();
wr->locked_height = 0;
wr->node_locked = false;
}
/*
* A search traversal has saved all the previous nodes at each level.
*
* We try to acquire the write_seq locks for all the prevs up to height
* from the seqs that we read during the search. The search was
* protected by read sections so the prevs represent a consistent
* version of the list at some point in the past. If nodes have been
* locked since we read them we won't be able to acquire the locks.
* Nodes aren't re-inserted after removal so we shouldn't see nodes in
* multiple places (which would deadlock).
*
* The same node can be in multiple prev slots. We're careful to only
* try locking the lowest duplicate slot.
*
* We lock from the highest level down. This only matters when there's
* contention. The higher nodes are more likely to see contention so
* we want trylock to fail early to avoid useless locking churn on lower
* nodes.
*/
static bool writer_trylock(struct scoutfs_cwskip_writer *wr, unsigned int *prev_seqs, int height)
{
int i;
if (WARN_ON_ONCE(wr->locked_height != 0) ||
WARN_ON_ONCE(height < 1 || height > ARRAY_SIZE(wr->prevs)))
return false;
for (i = height - 1; i >= 0; i--) {
if ((i == 0 || wr->prevs[i - 1] != wr->prevs[i]) &&
!__node_trylock(wr->prevs[i], prev_seqs[i]))
break;
wr->locked_height++;
}
if (i >= 0) {
writer_unlock(wr);
return false;
}
/* paranoid debugging verification */
for (i = 0; i < wr->locked_height; i++) {
BUG_ON(wr->prevs[i]->height <= i);
BUG_ON(wr->node && i < wr->node->height && wr->prevs[i]->links[i] != wr->node);
}
smp_mb();
return true;
}
static void init_writer(struct scoutfs_cwskip_writer *wr, struct scoutfs_cwskip_root *root)
{
memset(wr, 0, sizeof(struct scoutfs_cwskip_writer));
wr->root = root;
}
/*
* Search for and return references to the two nodes that surround the
* search key, with the nodes locked.
*
* Either node can be null if there are no nodes before or after the
* search key. We still hold a lock on the static root node if the
* search key falls before the first node in the list.
*
* If lock_height is 0 then the caller is saying that they just want to
* lock the surrounding nodes and not modify their position in the list.
* We only lock those two nodes. Any greater lock_height represents a
* height that we need to lock so the caller can insert an allocated
* node with that height.
*
* The caller can use the writer context to iterate through locked nodes
* via the lowest level list that contains all nodes. If they hit a
* node that's higher than the locked height in the writer then they
* have to unlock and restart because we don't have the previous node
* for that height. We set a min level that we lock to reduce the
* possibility of hitting higher nodes and retrying.
*/
#define MIN_LOCKED_HEIGHT 4
void scoutfs_cwskip_write_begin(struct scoutfs_cwskip_root *root, void *key, int lock_height,
void **prev_cont, void **node_cont, int *node_cmp,
struct scoutfs_cwskip_writer *wr)
__acquires(RCU) /* :/ */
{
unsigned int prev_seqs[SCOUTFS_CWSKIP_MAX_HEIGHT];
struct scoutfs_cwskip_reader rd;
int node_height;
int use_height;
bool locked;
BUG_ON(WARN_ON_ONCE(lock_height < 0 || lock_height > SCOUTFS_CWSKIP_MAX_HEIGHT));
do {
init_reader(&rd, root);
init_writer(wr, root);
rcu_read_lock();
cwskip_search(root, key, node_cmp, &rd, wr, prev_seqs);
wr->node = rd.node;
if (wr->node) {
/* _trylock of prevs will issue barrier on success */
if (!__node_trylock(wr->node, rd.node_seq)) {
rcu_read_unlock();
locked = false;
continue;
}
wr->node_locked = true;
node_height = wr->node->height;
} else {
node_height = 0;
}
if (lock_height > 0)
use_height = max3(MIN_LOCKED_HEIGHT, node_height, lock_height);
else
use_height = 1;
locked = writer_trylock(wr, prev_seqs, use_height);
if (!locked)
rcu_read_unlock();
} while (!locked);
set_containers(root, wr->prevs[0], wr->node, prev_cont, node_cont);
}
/*
* Insert a new node between the writer's two locked nodes. The
* inserting node is locked and replaces the existing node in the writer
* which is unlocked.
*
* The next node may not exist. The previous nodes will always exist
* though they may be the static root node.
*
* The inserting node is visible to readers the moment we store the
* first link to it in previous nodes. We first lock it with a write
* barrier so that any readers will retry if they visit it before all
* its links are updated and it's unlocked.
*
* We don't unlock prevs that are higher than the inserting node. This
* lets the caller continue iterating through nodes that are higher than
* insertion but still under the locked height.
*/
void scoutfs_cwskip_write_insert(struct scoutfs_cwskip_writer *wr,
struct scoutfs_cwskip_node *ins)
{
struct scoutfs_cwskip_node *node = wr->node;
int i;
BUG_ON(ins->height > wr->locked_height);
node_trylock(ins, ins->write_seq);
for (i = 0; i < ins->height; i++) {
ins->links[i] = wr->prevs[i]->links[i];
wr->prevs[i]->links[i] = ins;
}
if (node)
node_unlock(node);
wr->node = ins;
wr->node_locked = true;
}
/*
* Remove the node in the writer from the list. The writer's node
* pointer is not advanced because we don't want this to be able to fail
* if trylock on the next node fails. The caller can call _write_next
* on this writer and it will try to iterate from prevs[0].
*
* The caller's removal argument must be the node pointer in the writer.
* This is redundant but meant to communicate to the caller that they're
* responsible for the node after removing it (presumably queueing it
* for freeing before _write_end leaves rcu).
*
* Readers can be traversing our node as we modify its pointers and can
* read a temporarily inconsistent state. We have the node locked so
* the reader will immediately retry once they check the seqs after
* hitting our node that's being removed.
*/
void scoutfs_cwskip_write_remove(struct scoutfs_cwskip_writer *wr,
struct scoutfs_cwskip_node *node)
{
int i;
BUG_ON(node != wr->node);
BUG_ON(node->height > wr->locked_height);
for (i = 0; i < node->height; i++) {
wr->prevs[i]->links[i] = node->links[i];
node->links[i] = NULL;
}
node_unlock(node);
wr->node = NULL;
wr->node_locked = false;
}
/*
* Advance through the list by setting prevs to node and node to the
* next node in the list after locking it. Returns true only if there
* was a next node that we were able to lock. Returning false can mean
* that we weren't able to lock the next node and the caller should
* retry a full search.
*
* This may be called after _write_remove clears node so we try to
* iterate from prev if there is no node.
*
* If lock_height is greater than zero then the caller needs at least
* that lock_height to insert a node of that height. If locked_height
* doesn't cover it then we return false so the caller can retry
* _write_begin with the needed height.
*
* Like insertion, we don't unlock prevs higher than the height of the
* next node. They're not strictly needed to modify the next node but
* we want to keep them locked so the caller can continue to iterate
* through nodes up to the locked height.
*/
bool scoutfs_cwskip_write_next(struct scoutfs_cwskip_writer *wr, int lock_height,
void **prev_cont, void **node_cont)
{
struct scoutfs_cwskip_node *next;
int i;
if (WARN_ON_ONCE(lock_height < 0 || lock_height > SCOUTFS_CWSKIP_MAX_HEIGHT))
return false;
if (wr->node)
next = rcu_dereference(wr->node->links[0]);
else
next = rcu_dereference(wr->prevs[0]->links[0]);
if (!next ||
(lock_height > wr->locked_height) ||
(lock_height > 0 && next->height > wr->locked_height) ||
!__node_trylock(next, next->write_seq))
return false;
if (!wr->node) {
/* set next as missing node */
wr->node = next;
wr->node_locked = true;
} else {
/* existing node becomes prevs for its height */
__node_unlock(wr->prevs[0]);
for (i = 0; i < wr->node->height; i++)
wr->prevs[i] = wr->node;
wr->node = next;
}
smp_wmb(); /* next locked and prev unlocked */
set_containers(wr->root, wr->prevs[0], wr->node, prev_cont, node_cont);
return true;
}
void scoutfs_cwskip_write_end(struct scoutfs_cwskip_writer *wr)
__releases(RCU) /* :/ */
{
writer_unlock(wr);
rcu_read_unlock();
}
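
The write-side comments above (for _write_begin and _write_insert) describe asking for a lock height large enough to link in a node of a freshly chosen random height. A hypothetical insert-or-update built on that API, reusing the example_item container and example_item_cmp from the sketch after the commit list; the allocation sizing and GFP_NOFS flag are assumptions rather than the item cache's actual policy:

static int example_insert(struct scoutfs_cwskip_root *root, struct scoutfs_key *key, u64 val)
{
	struct scoutfs_cwskip_writer wr;
	struct example_item *found;
	struct example_item *ins;
	void *cont;
	int node_cmp;
	int height;
	size_t size;

	height = scoutfs_cwskip_rand_height();
	size = offsetof(struct example_item, node) +
	       offsetof(struct scoutfs_cwskip_node, links) +
	       height * sizeof(struct scoutfs_cwskip_node *);
	ins = kzalloc(size, GFP_NOFS);
	if (!ins)
		return -ENOMEM;
	ins->key = *key;
	ins->val = val;
	ins->node.height = height;

	/* lock enough levels to be able to link in a node of this height */
	scoutfs_cwskip_write_begin(root, key, height, NULL, &cont, &node_cmp, &wr);
	found = cont;
	if (found && node_cmp == 0) {
		/* already present: update in place under the node's lock */
		found->val = val;
		kfree(ins);
	} else {
		/* becomes visible to readers as its links are stored */
		scoutfs_cwskip_write_insert(&wr, &ins->node);
	}
	scoutfs_cwskip_write_end(&wr);

	return 0;
}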

68
kmod/src/cwskip.h Normal file
View File

@@ -0,0 +1,68 @@
#ifndef _SCOUTFS_CWSKIP_H_
#define _SCOUTFS_CWSKIP_H_
/* A billion seems like a lot. */
#define SCOUTFS_CWSKIP_MAX_HEIGHT 30
struct scoutfs_cwskip_node {
int height;
unsigned int write_seq;
struct scoutfs_cwskip_node *links[];
};
#define SCOUTFS_CWSKIP_FULL_NODE_BYTES \
offsetof(struct scoutfs_cwskip_node, links[SCOUTFS_CWSKIP_MAX_HEIGHT + 1])
typedef int (*scoutfs_cwskip_cmp_t)(void *K, void *C);
struct scoutfs_cwskip_root {
scoutfs_cwskip_cmp_t cmp_fn;
unsigned long node_off;
union {
struct scoutfs_cwskip_node node;
__u8 __full_root_node[SCOUTFS_CWSKIP_FULL_NODE_BYTES];
};
};
struct scoutfs_cwskip_reader {
struct scoutfs_cwskip_root *root;
struct scoutfs_cwskip_node *prev;
struct scoutfs_cwskip_node *node;
unsigned int prev_seq;
unsigned int node_seq;
};
/*
* The full height prevs array makes these pretty enormous :/.
*/
struct scoutfs_cwskip_writer {
struct scoutfs_cwskip_root *root;
bool node_locked;
int locked_height;
struct scoutfs_cwskip_node *node;
struct scoutfs_cwskip_node *prevs[SCOUTFS_CWSKIP_MAX_HEIGHT];
};
void scoutfs_cwskip_init_root(struct scoutfs_cwskip_root *root, scoutfs_cwskip_cmp_t cmp_fn,
unsigned long node_off);
bool scoutfs_cwskip_empty(struct scoutfs_cwskip_root *root);
int scoutfs_cwskip_rand_height(void);
void scoutfs_cwskip_read_begin(struct scoutfs_cwskip_root *root, void *key, void **prev_cont,
void **node_cont, int *node_cmp, struct scoutfs_cwskip_reader *rd);
bool scoutfs_cwskip_read_valid(struct scoutfs_cwskip_reader *rd);
bool scoutfs_cwskip_read_next(struct scoutfs_cwskip_reader *rd, void **prev_cont, void **node_cont);
void scoutfs_cwskip_read_end(struct scoutfs_cwskip_reader *rd);
void scoutfs_cwskip_write_begin(struct scoutfs_cwskip_root *root, void *key, int lock_height,
void **prev_cont, void **node_cont, int *node_cmp,
struct scoutfs_cwskip_writer *wr);
void scoutfs_cwskip_write_insert(struct scoutfs_cwskip_writer *wr,
struct scoutfs_cwskip_node *ins);
void scoutfs_cwskip_write_remove(struct scoutfs_cwskip_writer *wr,
struct scoutfs_cwskip_node *node);
bool scoutfs_cwskip_write_next(struct scoutfs_cwskip_writer *wr, int lock_height,
void **prev_cont, void **node_cont);
void scoutfs_cwskip_write_end(struct scoutfs_cwskip_writer *wr);
#endif
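
The _read_next comment in cwskip.c notes that a false return can mean either contention or the end of the list. A hypothetical range scan over the same example_item container from the earlier sketches, restarting the whole walk whenever the reader turns out to be invalid (the retry policy here is an assumption, not the item cache's actual range code):

static long example_count_range(struct scoutfs_cwskip_root *root, struct scoutfs_key *key,
				struct scoutfs_key *end)
{
	struct scoutfs_cwskip_reader rd;
	struct example_item *item;
	void *cont;
	int node_cmp;
	long count;

retry:
	count = 0;
	scoutfs_cwskip_read_begin(root, key, NULL, &cont, &node_cmp, &rd);
	item = cont;
	while (item && scoutfs_key_compare(&item->key, end) <= 0) {
		count++;
		if (!scoutfs_cwskip_read_next(&rd, NULL, &cont)) {
			/* false can mean contention or end of list, check which */
			if (!scoutfs_cwskip_read_valid(&rd)) {
				scoutfs_cwskip_read_end(&rd);
				goto retry;
			}
			break;
		}
		item = cont;
	}
	if (!scoutfs_cwskip_read_valid(&rd)) {
		scoutfs_cwskip_read_end(&rd);
		goto retry;
	}
	scoutfs_cwskip_read_end(&rd);

	return count;
}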

View File

@@ -494,13 +494,13 @@ out:
return ret;
}
int scoutfs_forest_insert_list(struct super_block *sb,
struct scoutfs_btree_item_list *lst)
int scoutfs_forest_insert_list(struct super_block *sb, scoutfs_btree_item_iter_cb cb,
void *pos, void *arg)
{
DECLARE_FOREST_INFO(sb, finf);
return scoutfs_btree_insert_list(sb, finf->alloc, finf->wri,
&finf->our_log.item_root, lst);
&finf->our_log.item_root, cb, pos, arg);
}
/*

View File

@@ -29,8 +29,8 @@ void scoutfs_forest_set_max_seq(struct super_block *sb, u64 max_seq);
int scoutfs_forest_get_max_seq(struct super_block *sb,
struct scoutfs_super_block *super,
u64 *seq);
int scoutfs_forest_insert_list(struct super_block *sb,
struct scoutfs_btree_item_list *lst);
int scoutfs_forest_insert_list(struct super_block *sb, scoutfs_btree_item_iter_cb cb,
void *pos, void *arg);
int scoutfs_forest_srch_add(struct super_block *sb, u64 hash, u64 ino, u64 id);
void scoutfs_forest_inc_inode_count(struct super_block *sb);

File diff suppressed because it is too large

View File

@@ -26,7 +26,7 @@ int scoutfs_item_delete_force(struct super_block *sb,
struct scoutfs_key *key,
struct scoutfs_lock *lock);
u64 scoutfs_item_dirty_pages(struct super_block *sb);
u64 scoutfs_item_dirty_bytes(struct super_block *sb);
int scoutfs_item_write_dirty(struct super_block *sb);
int scoutfs_item_write_done(struct super_block *sb);
bool scoutfs_item_range_cached(struct super_block *sb,

View File

@@ -403,24 +403,24 @@ TRACE_EVENT(scoutfs_sync_fs,
);
TRACE_EVENT(scoutfs_trans_write_func,
TP_PROTO(struct super_block *sb, u64 dirty_block_bytes, u64 dirty_item_pages),
TP_PROTO(struct super_block *sb, u64 dirty_block_bytes, u64 dirty_item_bytes),
TP_ARGS(sb, dirty_block_bytes, dirty_item_pages),
TP_ARGS(sb, dirty_block_bytes, dirty_item_bytes),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, dirty_block_bytes)
__field(__u64, dirty_item_pages)
__field(__u64, dirty_item_bytes)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->dirty_block_bytes = dirty_block_bytes;
__entry->dirty_item_pages = dirty_item_pages;
__entry->dirty_item_bytes = dirty_item_bytes;
),
TP_printk(SCSBF" dirty_block_bytes %llu dirty_item_pages %llu",
SCSB_TRACE_ARGS, __entry->dirty_block_bytes, __entry->dirty_item_pages)
TP_printk(SCSBF" dirty_block_bytes %llu dirty_item_bytes %llu",
SCSB_TRACE_ARGS, __entry->dirty_block_bytes, __entry->dirty_item_bytes)
);
DECLARE_EVENT_CLASS(scoutfs_trans_hold_release_class,

View File

@@ -207,7 +207,7 @@ void scoutfs_trans_write_func(struct work_struct *work)
}
trace_scoutfs_trans_write_func(sb, scoutfs_block_writer_dirty_bytes(sb, &tri->wri),
scoutfs_item_dirty_pages(sb));
scoutfs_item_dirty_bytes(sb));
if (tri->deadline_expired)
scoutfs_inc_counter(sb, trans_commit_timer);
@@ -422,16 +422,18 @@ static void release_holders(struct super_block *sb)
*/
static bool commit_before_hold(struct super_block *sb, struct trans_info *tri)
{
u64 dirty_blocks = (scoutfs_item_dirty_bytes(sb) >> SCOUTFS_BLOCK_LG_SHIFT) + 1;
/*
* In theory each dirty item page could be straddling two full
* blocks, requiring 4 allocations for each item cache page.
* That's much too conservative, typically many dirty item cache
* pages that are near each other all land in one block. This
* In theory each dirty item could be added to a full block that
* has to split, requiring 2 meta block allocs for each dirty
* item. That's much too conservative, typically many dirty
* items that are near each other all land in one block. This
* rough estimate is still so far beyond what typically happens
* that it accounts for having to dirty parent blocks and
* whatever dirtying is done during the transaction hold.
*/
if (scoutfs_alloc_meta_low(sb, &tri->alloc, scoutfs_item_dirty_pages(sb) * 2)) {
if (scoutfs_alloc_meta_low(sb, &tri->alloc, dirty_blocks * 4)) {
scoutfs_inc_counter(sb, trans_commit_dirty_meta_full);
return true;
}
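
As a rough worked example of the new estimate (assuming SCOUTFS_BLOCK_LG_SHIFT corresponds to 64KiB metadata blocks): 1MiB of dirty item bytes gives (1048576 >> 16) + 1 = 17 dirty blocks, so commit_before_hold() returns true and forces a commit once the allocator has fewer than 17 * 4 = 68 free metadata blocks, where the old code scaled the reserve with the number of dirty item cache pages instead.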