scoutfs: commit while enough meta for dirty items

Dirty items in a client transaction are stored in OS pages.  When the
transaction is committed, each item is stored in its position in a
dirty btree block in the client's existing log btree.  Allocators are
refilled between transaction commits, so a given commit must have
sufficient meta allocator space (avail blocks and unused freed entries)
for all the btree blocks that are dirtied.
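
As a rough sketch of that constraint (illustrative only; the names and
types below are not scoutfs code), a commit that cows some number of
btree blocks needs both an available block and room to record a free
for each of them:

  /* Illustrative model, not scoutfs structures. */
  struct meta_alloc_model {
  	unsigned long avail_blocks;	/* blocks the client may allocate */
  	unsigned long freed_room;	/* freed-list entries still unused */
  };

  /*
   * A commit that dirties dirty_blocks btree blocks does roughly one
   * cow allocation and records one free per block, so both counters
   * must cover the estimate before the commit starts.
   */
  static int commit_has_enough_meta(const struct meta_alloc_model *m,
  				    unsigned long dirty_blocks)
  {
  	return m->avail_blocks >= dirty_blocks &&
  	       m->freed_room >= dirty_blocks;
  }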

The number of btree blocks that are written, and thus the number of
cow allocations and frees, depends on the number of blocks in the log
btree and the distribution of dirty items amongst those blocks.  In a
typical load items will be near each other, and many dirty items in
smaller kernel pages will be stored in fewer, larger btree blocks.

But under the wrong circumstances the ratio of dirty pages to dirty
blocks can be much smaller.  With a very large directory and random
entry renames you can easily have one btree block dirtied for every
page of dirty items.
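
To put rough numbers on that ratio (the 4 KiB page and 64 KiB btree
block sizes here are assumptions for illustration, not taken from this
patch):

  #include <stdio.h>

  /*
   * Toy arithmetic: when items cluster, many ~4 KiB dirty pages pack
   * into each ~64 KiB dirty btree block; with random renames across a
   * huge directory each dirty page can land in its own block.
   */
  int main(void)
  {
  	unsigned long dirty_pages = 2048;		/* 8 MiB of dirty item pages */
  	unsigned long clustered = dirty_pages / 16;	/* ~16 pages per block */
  	unsigned long scattered = dirty_pages;		/* one block per page */

  	printf("clustered: ~%lu dirty btree blocks\n", clustered);
  	printf("scattered: ~%lu dirty btree blocks\n", scattered);
  	return 0;
  }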

Our existing meta allocator fill targets and the number of dirty item
cache pages we allowed did not properly take this into account.  It
was possible (and, it turned out, relatively easy to test for with a
huge directory and random renames) to run out of meta avail blocks
while storing dirty items in dirtied btree blocks.

This rebalances our targets and thresholds to make it more likely that
we'll have enough allocator resources to commit dirty items.  Instead
of having an arbitrary limit on the number of dirty item cache pages,
we require that a given number of allocator blocks be available for
each dirty item cache page.
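
In sketch form (the real check is the scoutfs_alloc_meta_low() call in
the acquired_hold() hunk below, and the estimate of two dirtied blocks
per dirty page comes from that hunk's comment), the hold path now
forces a commit once the dirty page count outgrows the available meta
blocks:

  /*
   * Simplified model of the new hold-time check: stop accepting more
   * dirtying and commit once the meta avail can no longer cover an
   * estimated two block cows per dirty item cache page.
   */
  static int should_commit_now(unsigned long dirty_item_pages,
  			       unsigned long meta_avail_blocks)
  {
  	return meta_avail_blocks < dirty_item_pages * 2;
  }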

We require a decent number of available blocks for each dirty page, so
we increase the server's target number of blocks to give the client so
that it can still build large transactions.
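
For context, the server side is a simple low-water/target refill
scheme; the patch raises the target by expressing it (and the lo
threshold) in terms of SCOUTFS_ALLOC_LIST_MAX_BLOCKS rather than fixed
byte counts.  A rough stand-in for that policy (not the server code
itself):

  /*
   * Toy model of the refill policy described in the server hunk below:
   * when the client's meta avail falls below the lo threshold, the
   * server tops it back up to the fill target.
   */
  static unsigned long meta_blocks_to_send(unsigned long client_avail,
  					   unsigned long fill_lo,
  					   unsigned long fill_target)
  {
  	if (client_avail >= fill_lo)
  		return 0;
  	return fill_target - client_avail;
  }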

This code is conservative and should not be a problem in practice, but
it's theoretically possible to build a log btree and set of dirty items
that would dirty more blocks than this code assumes.  We will probably
revisit this as we add proper support for ENOSPC.

Signed-off-by: Zach Brown <zab@versity.com>

commit 9375b9d3b7
parent ae286bf837
Author: Zach Brown
Date:   2020-11-13 13:39:13 -08:00

4 changed files with 29 additions and 16 deletions


@@ -41,11 +41,15 @@
 /*
  * Fill client alloc roots to the target when they fall below the lo
  * threshold.
+ *
+ * We're giving the client the most available meta blocks we can so that
+ * it has the freedom to build large transactions before worrying that
+ * it might run out of meta allocs during commits.
  */
 #define SCOUTFS_SERVER_META_FILL_TARGET \
-	(256ULL * 1024 * 1024 >> SCOUTFS_BLOCK_LG_SHIFT)
+	SCOUTFS_ALLOC_LIST_MAX_BLOCKS
 #define SCOUTFS_SERVER_META_FILL_LO \
-	(64ULL * 1024 * 1024 >> SCOUTFS_BLOCK_LG_SHIFT)
+	(SCOUTFS_ALLOC_LIST_MAX_BLOCKS / 2)
 #define SCOUTFS_SERVER_DATA_FILL_TARGET \
 	(4ULL * 1024 * 1024 * 1024 >> SCOUTFS_BLOCK_SM_SHIFT)
 #define SCOUTFS_SERVER_DATA_FILL_LO \


@@ -2042,18 +2042,11 @@ int scoutfs_item_delete_force(struct super_block *sb, struct scoutfs_key *key,
 	return item_delete(sb, key, lock, SCOUTFS_LOCK_WRITE_ONLY, true);
 }
 
-/*
- * Give a rough idea of the number of bytes that would need to be
- * written to commit the current dirty items.  Reporting the total item
- * dirty bytes wouldn't be accurate because they're written into btree
- * pages.  The number of dirty pages holding the dirty items is
- * comparable.  This could probably use some tuning.
- */
-u64 scoutfs_item_dirty_bytes(struct super_block *sb)
+u64 scoutfs_item_dirty_pages(struct super_block *sb)
 {
 	DECLARE_ITEM_CACHE_INFO(sb, cinf);
 
-	return (u64)atomic_read(&cinf->dirty_pages) << PAGE_SHIFT;
+	return (u64)atomic_read(&cinf->dirty_pages);
 }
 
 static int cmp_pg_start(void *priv, struct list_head *A, struct list_head *B)


@@ -24,7 +24,7 @@ int scoutfs_item_delete_force(struct super_block *sb,
 			      struct scoutfs_key *key,
 			      struct scoutfs_lock *lock);
 
-u64 scoutfs_item_dirty_bytes(struct super_block *sb);
+u64 scoutfs_item_dirty_pages(struct super_block *sb);
 int scoutfs_item_write_dirty(struct super_block *sb);
 int scoutfs_item_write_done(struct super_block *sb);
 bool scoutfs_item_range_cached(struct super_block *sb,


@@ -170,7 +170,7 @@ void scoutfs_trans_write_func(struct work_struct *work)
 		  scoutfs_block_writer_dirty_bytes(sb, &tri->wri));
 
 	if (!scoutfs_block_writer_has_dirty(sb, &tri->wri) &&
-	    !scoutfs_item_dirty_bytes(sb)) {
+	    !scoutfs_item_dirty_pages(sb)) {
 		if (sbi->trans_deadline_expired) {
 			/*
 			 * If we're not writing data then we only advance the
@@ -369,14 +369,30 @@ static bool acquired_hold(struct super_block *sb,
 	items = tri->reserved_items + cnt->items;
 	vals = tri->reserved_vals + cnt->vals;
 
-	/* XXX arbitrarily limit to 8 meg transactions */
-	if (scoutfs_item_dirty_bytes(sb) >= (8 * 1024 * 1024)) {
+	/*
+	 * In theory each dirty item page could be straddling two full
+	 * blocks, requiring 4 allocations for each item cache page.
+	 * That's much too conservative, typically many dirty item cache
+	 * pages that are near each other all land in one block.  This
+	 * rough estimate is still so far beyond what typically happens
+	 * that it accounts for having to dirty parent blocks and
+	 * whatever dirtying is done during the transaction hold.
+	 */
+	if (scoutfs_alloc_meta_low(sb, &tri->alloc,
+				   scoutfs_item_dirty_pages(sb) * 2)) {
 		scoutfs_inc_counter(sb, trans_commit_dirty_meta_full);
 		queue_trans_work(sbi);
 		goto out;
 	}
 
-	if (scoutfs_alloc_meta_low(sb, &tri->alloc, 8)) {
+	/*
+	 * Extent modifications can use meta allocators without creating
+	 * dirty items so we have to check the meta alloc specifically.
+	 * The size of the client's avail and freed roots are bound so
+	 * we're unlikely to need very many block allocations per
+	 * transaction hold.  XXX This should be more precisely tuned.
+	 */
+	if (scoutfs_alloc_meta_low(sb, &tri->alloc, 16)) {
 		scoutfs_inc_counter(sb, trans_commit_meta_alloc_low);
 		queue_trans_work(sbi);
 		goto out;