Use cwskip for the item cache

The use of pages in the item cache got us pretty far, but it
fundamentally couldn't escape contention on the global and per-page
read locks.  Some loads became bottlenecked on lock contention in the
item cache.  Worse, we were seeing inconsistency in the per-cpu cached
mappings of key ranges to pages.

All users of cached items are transitioned from searching for items in
locked pages to searching for items in the cwskip list.  The cwskip
list is built around a seqlock-like begin/retry pattern, so most of the
item work is wrapped in search and retry helpers.
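
The item.c rewrite is too large to show below, but the read side has
roughly the shape of this sketch.  It borrows the kernel's seqcount API
to illustrate the begin/retry idea; the real cwskip list carries its
own sequence counts, and item_cache, cac->seqcount and
cwskip_search_and_copy() are stand-in names rather than the actual
code.

    #include <linux/seqlock.h>

    /*
     * Illustrative only: re-run the search if a concurrent writer
     * modified the region of the cwskip list that we read.  The
     * item_cache fields and cwskip helpers here are hypothetical.
     */
    static int item_lookup_sketch(struct item_cache *cac,
                                  struct scoutfs_key *key,
                                  void *val, int val_len)
    {
            unsigned int seq;
            int ret;

            do {
                    seq = read_seqcount_begin(&cac->seqcount);
                    ret = cwskip_search_and_copy(cac, key, val, val_len);
            } while (read_seqcount_retry(&cac->seqcount, seq));

            return ret;
    }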

Without pages we no longer have a global list of dirty pages.  Instead
we have per-cpu lists of dirty items that are later sorted and handed
to the btree insertion iterator.  We take the opportunity to clean up
that interface now that it's easy to iterate through the stable list of
dirty items.
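
The exact iterator contract is part of the suppressed item.c changes,
but one plausible reading of the insertion loop in btree.c below is
that the callback fills the descriptor with the next dirty item and
returns an opaque cursor for it, returning NULL at the end.  Under that
assumption, a caller iterating a sorted array might look like this
hypothetical sketch:

    /*
     * Hypothetical scoutfs_btree_item_iter_cb: 'pos' is the previously
     * returned element (NULL on the first call); the callback copies
     * the next element into *desc and returns it, or returns NULL when
     * the array is exhausted.  This is one possible contract, not the
     * real item cache implementation.
     */
    struct dirty_item_array {
            struct scoutfs_btree_item_desc *descs;
            unsigned long nr;
    };

    static void *dirty_array_iter_cb(struct super_block *sb,
                                     struct scoutfs_btree_item_desc *desc,
                                     void *pos, void *arg)
    {
            struct dirty_item_array *arr = arg;
            struct scoutfs_btree_item_desc *cur = pos;
            struct scoutfs_btree_item_desc *next;

            next = cur ? cur + 1 : arr->descs;
            if (next >= arr->descs + arr->nr)
                    return NULL;

            *desc = *next;
            return next;
    }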

Rather than a global LRU of pages, we have an algorithm that maintains
items in rough age groups.  Shrinking randomly walks the cwskip list
looking for regions of sufficiently old items rather than walking a
precise global LRU list of pages.
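
Again with hypothetical helper names (the real shrinker is in the
suppressed item.c diff), the shrink pass could be sketched as starting
at a random item and reclaiming clean items that are older than an age
cutoff:

    /*
     * Sketch of age-based shrinking; cwskip_random_node(),
     * cwskip_next(), item_age(), item_is_dirty() and item_free() are
     * made-up stand-ins for the real code.
     */
    static long shrink_old_items_sketch(struct item_cache *cac,
                                        unsigned long nr_to_scan,
                                        u64 age_cutoff)
    {
            struct cached_item *item = cwskip_random_node(cac);
            long freed = 0;

            while (item && nr_to_scan--) {
                    struct cached_item *next = cwskip_next(cac, item);

                    /* only reclaim sufficiently old, clean items */
                    if (item_age(item) >= age_cutoff && !item_is_dirty(item))
                            freed += item_free(cac, item);
                    item = next;
            }

            return freed;
    }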

Signed-off-by: Zach Brown <zab@versity.com>

@@ -1875,12 +1875,11 @@ out:
* set in btree items. They're only used for fs items written through
* the item cache and forest of log btrees.
*/
int scoutfs_btree_insert_list(struct super_block *sb,
struct scoutfs_alloc *alloc,
struct scoutfs_block_writer *wri,
struct scoutfs_btree_root *root,
struct scoutfs_btree_item_list *lst)
int scoutfs_btree_insert_list(struct super_block *sb, struct scoutfs_alloc *alloc,
struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root,
scoutfs_btree_item_iter_cb iter_cb, void *pos, void *arg)
{
struct scoutfs_btree_item_desc desc;
struct scoutfs_btree_item *item;
struct btree_walk_key_range kr;
struct scoutfs_btree_block *bt;
@@ -1889,44 +1888,46 @@ int scoutfs_btree_insert_list(struct super_block *sb,
int cmp;
int ret = 0;
while (lst) {
pos = iter_cb(sb, &desc, pos, arg);
while (pos) {
ret = btree_walk(sb, alloc, wri, root, BTW_DIRTY | BTW_INSERT,
&lst->key, lst->val_len, &bl, &kr, NULL);
desc.key, desc.val_len, &bl, &kr, NULL);
if (ret < 0)
goto out;
bt = bl->data;
do {
item = leaf_item_hash_search(sb, bt, &lst->key);
item = leaf_item_hash_search(sb, bt, desc.key);
if (item) {
/* try to merge delta values, _NULL not deleted; merge will */
ret = scoutfs_forest_combine_deltas(&lst->key,
ret = scoutfs_forest_combine_deltas(desc.key,
item_val(bt, item),
item_val_len(item),
lst->val, lst->val_len);
desc.val, desc.val_len);
if (ret < 0) {
scoutfs_block_put(sb, bl);
goto out;
}
item->seq = cpu_to_le64(lst->seq);
item->flags = lst->flags;
item->seq = cpu_to_le64(desc.seq);
item->flags = desc.flags;
if (ret == 0)
update_item_value(bt, item, lst->val, lst->val_len);
update_item_value(bt, item, desc.val, desc.val_len);
else
ret = 0;
} else {
scoutfs_avl_search(&bt->item_root,
cmp_key_item, &lst->key,
cmp_key_item, desc.key,
&cmp, &par, NULL, NULL);
create_item(bt, &lst->key, lst->seq, lst->flags, lst->val,
lst->val_len, par, cmp);
create_item(bt, desc.key, desc.seq, desc.flags, desc.val,
desc.val_len, par, cmp);
}
lst = lst->next;
} while (lst && scoutfs_key_compare(&lst->key, &kr.end) <= 0 &&
mid_free_item_room(bt, lst->val_len));
pos = iter_cb(sb, &desc, pos, arg);
} while (pos && scoutfs_key_compare(desc.key, &kr.end) <= 0 &&
mid_free_item_room(bt, desc.val_len));
scoutfs_block_put(sb, bl);
}


@@ -18,11 +18,24 @@ struct scoutfs_btree_item_ref {
#define SCOUTFS_BTREE_ITEM_REF(name) \
struct scoutfs_btree_item_ref name = {NULL,}
/* caller gives an item to the callback */
/* btree gives an item to caller */
typedef int (*scoutfs_btree_item_cb)(struct super_block *sb,
struct scoutfs_key *key, u64 seq, u8 flags,
void *val, int val_len, void *arg);
struct scoutfs_btree_item_desc {
struct scoutfs_key *key;
void *val;
u64 seq;
u8 flags;
unsigned val_len;
};
/* btree iterates through items from caller */
typedef void *(*scoutfs_btree_item_iter_cb)(struct super_block *sb,
struct scoutfs_btree_item_desc *desc,
void *pos, void *arg);
/* simple singly-linked list of items */
struct scoutfs_btree_item_list {
struct scoutfs_btree_item_list *next;
@@ -78,11 +91,9 @@ int scoutfs_btree_read_items(struct super_block *sb,
struct scoutfs_key *start,
struct scoutfs_key *end,
scoutfs_btree_item_cb cb, void *arg);
int scoutfs_btree_insert_list(struct super_block *sb,
struct scoutfs_alloc *alloc,
struct scoutfs_block_writer *wri,
struct scoutfs_btree_root *root,
struct scoutfs_btree_item_list *lst);
int scoutfs_btree_insert_list(struct super_block *sb, struct scoutfs_alloc *alloc,
struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root,
scoutfs_btree_item_iter_cb iter_cb, void *pos, void *arg);
int scoutfs_btree_parent_range(struct super_block *sb,
struct scoutfs_btree_root *root,


@@ -90,36 +90,27 @@
EXPAND_COUNTER(forest_read_items) \
EXPAND_COUNTER(forest_roots_next_hint) \
EXPAND_COUNTER(forest_set_bloom_bits) \
EXPAND_COUNTER(item_alloc_bytes) \
EXPAND_COUNTER(item_clear_dirty) \
EXPAND_COUNTER(item_create) \
EXPAND_COUNTER(item_delete) \
EXPAND_COUNTER(item_delta) \
EXPAND_COUNTER(item_delta_written) \
EXPAND_COUNTER(item_dirty) \
EXPAND_COUNTER(item_free_bytes) \
EXPAND_COUNTER(item_invalidate) \
EXPAND_COUNTER(item_invalidate_page) \
EXPAND_COUNTER(item_invalidate_item) \
EXPAND_COUNTER(item_lookup) \
EXPAND_COUNTER(item_mark_dirty) \
EXPAND_COUNTER(item_next) \
EXPAND_COUNTER(item_page_accessed) \
EXPAND_COUNTER(item_page_alloc) \
EXPAND_COUNTER(item_page_clear_dirty) \
EXPAND_COUNTER(item_page_compact) \
EXPAND_COUNTER(item_page_free) \
EXPAND_COUNTER(item_page_lru_add) \
EXPAND_COUNTER(item_page_lru_remove) \
EXPAND_COUNTER(item_page_mark_dirty) \
EXPAND_COUNTER(item_page_rbtree_walk) \
EXPAND_COUNTER(item_page_split) \
EXPAND_COUNTER(item_pcpu_add_replaced) \
EXPAND_COUNTER(item_pcpu_page_hit) \
EXPAND_COUNTER(item_pcpu_page_miss) \
EXPAND_COUNTER(item_pcpu_page_miss_keys) \
EXPAND_COUNTER(item_read_pages_split) \
EXPAND_COUNTER(item_shrink_page) \
EXPAND_COUNTER(item_shrink_page_dirty) \
EXPAND_COUNTER(item_shrink_page_reader) \
EXPAND_COUNTER(item_shrink_page_trylock) \
EXPAND_COUNTER(item_shrink) \
EXPAND_COUNTER(item_shrink_all) \
EXPAND_COUNTER(item_shrink_exhausted) \
EXPAND_COUNTER(item_shrink_read_search) \
EXPAND_COUNTER(item_shrink_removed) \
EXPAND_COUNTER(item_shrink_searched) \
EXPAND_COUNTER(item_shrink_skipped) \
EXPAND_COUNTER(item_shrink_write_search) \
EXPAND_COUNTER(item_update) \
EXPAND_COUNTER(item_write_dirty) \
EXPAND_COUNTER(lock_alloc) \


@@ -494,13 +494,13 @@ out:
return ret;
}
int scoutfs_forest_insert_list(struct super_block *sb,
struct scoutfs_btree_item_list *lst)
int scoutfs_forest_insert_list(struct super_block *sb, scoutfs_btree_item_iter_cb cb,
void *pos, void *arg)
{
DECLARE_FOREST_INFO(sb, finf);
return scoutfs_btree_insert_list(sb, finf->alloc, finf->wri,
&finf->our_log.item_root, lst);
&finf->our_log.item_root, cb, pos, arg);
}
/*


@@ -29,8 +29,8 @@ void scoutfs_forest_set_max_seq(struct super_block *sb, u64 max_seq);
int scoutfs_forest_get_max_seq(struct super_block *sb,
struct scoutfs_super_block *super,
u64 *seq);
int scoutfs_forest_insert_list(struct super_block *sb,
struct scoutfs_btree_item_list *lst);
int scoutfs_forest_insert_list(struct super_block *sb, scoutfs_btree_item_iter_cb cb,
void *pos, void *arg);
int scoutfs_forest_srch_add(struct super_block *sb, u64 hash, u64 ino, u64 id);
void scoutfs_forest_inc_inode_count(struct super_block *sb);
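
/*
 * Hypothetical caller of the reworked interface, tying together the
 * sketches from the commit message above; the real caller is in the
 * suppressed item.c diff.  dirty_array_iter_cb and struct
 * dirty_item_array are the made-up examples from that sketch, handed
 * through with a NULL starting cursor.
 */
static int write_dirty_items_sketch(struct super_block *sb,
                                    struct dirty_item_array *arr)
{
        return scoutfs_forest_insert_list(sb, dirty_array_iter_cb, NULL, arr);
}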

File diff suppressed because it is too large


@@ -26,7 +26,7 @@ int scoutfs_item_delete_force(struct super_block *sb,
struct scoutfs_key *key,
struct scoutfs_lock *lock);
u64 scoutfs_item_dirty_pages(struct super_block *sb);
u64 scoutfs_item_dirty_bytes(struct super_block *sb);
int scoutfs_item_write_dirty(struct super_block *sb);
int scoutfs_item_write_done(struct super_block *sb);
bool scoutfs_item_range_cached(struct super_block *sb,


@@ -403,24 +403,24 @@ TRACE_EVENT(scoutfs_sync_fs,
);
TRACE_EVENT(scoutfs_trans_write_func,
TP_PROTO(struct super_block *sb, u64 dirty_block_bytes, u64 dirty_item_pages),
TP_PROTO(struct super_block *sb, u64 dirty_block_bytes, u64 dirty_item_bytes),
TP_ARGS(sb, dirty_block_bytes, dirty_item_pages),
TP_ARGS(sb, dirty_block_bytes, dirty_item_bytes),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, dirty_block_bytes)
__field(__u64, dirty_item_pages)
__field(__u64, dirty_item_bytes)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->dirty_block_bytes = dirty_block_bytes;
__entry->dirty_item_pages = dirty_item_pages;
__entry->dirty_item_bytes = dirty_item_bytes;
),
TP_printk(SCSBF" dirty_block_bytes %llu dirty_item_pages %llu",
SCSB_TRACE_ARGS, __entry->dirty_block_bytes, __entry->dirty_item_pages)
TP_printk(SCSBF" dirty_block_bytes %llu dirty_item_bytes %llu",
SCSB_TRACE_ARGS, __entry->dirty_block_bytes, __entry->dirty_item_bytes)
);
DECLARE_EVENT_CLASS(scoutfs_trans_hold_release_class,


@@ -207,7 +207,7 @@ void scoutfs_trans_write_func(struct work_struct *work)
}
trace_scoutfs_trans_write_func(sb, scoutfs_block_writer_dirty_bytes(sb, &tri->wri),
scoutfs_item_dirty_pages(sb));
scoutfs_item_dirty_bytes(sb));
if (tri->deadline_expired)
scoutfs_inc_counter(sb, trans_commit_timer);
@@ -422,16 +422,18 @@ static void release_holders(struct super_block *sb)
*/
static bool commit_before_hold(struct super_block *sb, struct trans_info *tri)
{
u64 dirty_blocks = (scoutfs_item_dirty_bytes(sb) >> SCOUTFS_BLOCK_LG_SHIFT) + 1;
/*
* In theory each dirty item page could be straddling two full
* blocks, requiring 4 allocations for each item cache page.
* That's much too conservative, typically many dirty item cache
* pages that are near each other all land in one block. This
* In theory each dirty item could be added to a full block that
* has to split, requiring 2 meta block allocs for each dirty
* item. That's much too conservative, typically many dirty
* items that are near each other all land in one block. This
* rough estimate is still so far beyond what typically happens
* that it accounts for having to dirty parent blocks and
* whatever dirtying is done during the transaction hold.
*/
if (scoutfs_alloc_meta_low(sb, &tri->alloc, scoutfs_item_dirty_pages(sb) * 2)) {
if (scoutfs_alloc_meta_low(sb, &tri->alloc, dirty_blocks * 4)) {
scoutfs_inc_counter(sb, trans_commit_dirty_meta_full);
return true;
}
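
/*
 * Worked example of the new reserve estimate, for illustration only.
 * Assuming 64KiB large metadata blocks (SCOUTFS_BLOCK_LG_SHIFT == 16
 * is an assumption, not taken from this diff), 1MiB of dirty item
 * bytes gives (1048576 >> 16) + 1 = 17 dirty blocks and a reserve of
 * 17 * 4 = 68 metadata blocks, versus the old two blocks per dirty
 * item page.
 */
static u64 estimated_meta_reserve(u64 dirty_item_bytes, unsigned int block_lg_shift)
{
        u64 dirty_blocks = (dirty_item_bytes >> block_lg_shift) + 1;

        return dirty_blocks * 4;
}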