More thoroughly integrate compaction

The first pass at compaction just kicked a thread any time we added a
segment that brought its level's count over the limit.  Tasks could
create dirty items and write level0 segments regardless of the progress
of compaction.

This ties the writing rate to compaction.  Writers have to wait to hold
a transaction until the dirty item count is under a segment and there
are no level0 segments.  Usually more level0 segments would be allowed,
but for now we're aggressively pushing compaction; we'll relax this
later.
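
The admission check is easy to model in isolation.  The sketch below is
only an illustration of the condition described above, under assumed
sizes; SEGMENT_SIZE, WORST_CASE_DIRTY, and writer_may_hold() are
hypothetical stand-ins for the real constants and for hold_acquired()
in the hunk further down.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SEGMENT_SIZE	 (1024 * 1024)	/* assumed segment size */
#define WORST_CASE_DIRTY (2 * 4096)	/* assumed worst case dirtied per holder */

/*
 * A writer may hold the transaction only when there are no level0
 * segments waiting for compaction and when every current holder, plus
 * this one, could dirty its worst case and still fit in one segment.
 */
static bool writer_may_hold(int holders, uint64_t dirty_bytes,
			    uint64_t level0_segments)
{
	uint64_t worst = (uint64_t)(holders + 1) * WORST_CASE_DIRTY;

	if (level0_segments > 0)
		return false;

	return dirty_bytes + worst <= SEGMENT_SIZE;
}

int main(void)
{
	/* room in the segment and no level0 segments: the writer proceeds */
	printf("%d\n", writer_may_hold(3, 64 * 1024, 0));
	/* a level0 segment exists: the writer waits for compaction to drain it */
	printf("%d\n", writer_may_hold(0, 0, 1));
	return 0;
}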

This also more forcefully ensures that compaction makes forward
progress.  We kick the compaction thread when a level's segment count
exceeds its limit, when a holder has to wait for level0 to drain, or
when a compaction completes successfully.  We tweak
scoutfs_manifest_next_compact() to return 0 when there's no compaction
work to do so that the compaction thread can exit without triggering
another compaction.
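
The return convention can be shown with a small userspace model.  This
is an assumption-laden sketch, not the kernel code; next_compact(),
kick_compact(), and the counters stand in for
scoutfs_manifest_next_compact(), scoutfs_compact_kick(), and the real
manifest state in the diff below.

#include <stdio.h>

static int candidates = 2;	/* pretend the manifest has two compaction candidates */
static int kicks;		/* how many times the worker re-queued itself */

/* 1: found a candidate, 0: nothing to compact, <0 would be -errno */
static int next_compact(void)
{
	return candidates > 0 ? 1 : 0;
}

static void kick_compact(void)
{
	kicks++;
}

/* one execution of the worker: compact at most one candidate */
static void compact_worker(void)
{
	if (next_compact() <= 0)
		return;		/* no work or error: exit without re-kicking */

	candidates--;		/* stand-in for performing one compaction */
	kick_compact();		/* success: queue another pass for any leftovers */
}

int main(void)
{
	compact_worker();
	compact_worker();
	compact_worker();
	printf("candidates left %d, re-kicks %d\n", candidates, kicks);
	return 0;
}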

For clarity we also kick off a sync after compaction so that we don't
sit around with a dirty manifest until the next sync.  This may not be
wise.

Signed-off-by: Zach Brown <zab@versity.com>
Author: Zach Brown
Date: 2017-01-12 11:35:55 -08:00
parent aad5a34290
commit 3f812fa9a7
5 changed files with 126 additions and 40 deletions

compact.c

@@ -22,6 +22,7 @@
#include "cmp.h"
#include "compact.h"
#include "manifest.h"
#include "trans.h"
#include "scoutfs_trace.h"
/*
@@ -470,15 +471,21 @@ static void scoutfs_compact_func(struct work_struct *work)
INIT_LIST_HEAD(&curs.csegs);
ret = scoutfs_manifest_next_compact(sb, (void *)&curs) ?:
read_segments(sb, &curs) ?:
ret = scoutfs_manifest_next_compact(sb, (void *)&curs);
if (ret <= 0)
goto out;
ret = read_segments(sb, &curs) ?:
compact_segments(sb, &curs, &results) ?:
write_segments(sb, &results) ?:
update_manifest(sb, &curs, &results);
if (ret)
if (ret) {
free_result_segnos(sb, &results);
} else {
scoutfs_sync_fs(sb, 0);
scoutfs_compact_kick(sb);
}
out:
free_csegs(&curs.csegs);
free_csegs(&results);

manifest.c

@@ -24,6 +24,7 @@
#include "cmp.h"
#include "compact.h"
#include "manifest.h"
#include "trans.h"
#include "scoutfs_trace.h"
/*
@@ -142,19 +143,41 @@ static u64 get_level_count(struct manifest *mani,
return count;
}
static void add_level_count(struct manifest *mani,
struct scoutfs_super_block *super, u8 level,
s64 val)
static bool past_limit(struct manifest *mani, u8 level, u64 count)
{
write_seqcount_begin(&mani->seqcount);
le64_add_cpu(&super->manifest.level_counts[level], val);
write_seqcount_end(&mani->seqcount);
return count > mani->level_limits[level];
}
static bool level_full(struct manifest *mani,
struct scoutfs_super_block *super, u8 level)
{
return get_level_count(mani, super, level) > mani->level_limits[level];
return past_limit(mani, level, get_level_count(mani, super, level));
}
static void add_level_count(struct super_block *sb, struct manifest *mani,
struct scoutfs_super_block *super, u8 level,
s64 val)
{
bool was_full;
bool now_full;
u64 count;
write_seqcount_begin(&mani->seqcount);
count = le64_to_cpu(super->manifest.level_counts[level]);
was_full = past_limit(mani, level, count);
count += val;
now_full = past_limit(mani, level, count);
super->manifest.level_counts[level] = cpu_to_le64(count);
write_seqcount_end(&mani->seqcount);
if (was_full && !now_full)
scoutfs_trans_wake_holders(sb);
if (now_full)
scoutfs_compact_kick(sb);
}
/*
@@ -199,11 +222,7 @@ int scoutfs_manifest_add(struct super_block *sb, struct kvec *first,
ret = PTR_ERR(ment);
} else {
mani->nr_levels = max_t(u8, mani->nr_levels, level + 1);
add_level_count(mani, super, level, 1);
if (level_full(mani, super, level))
scoutfs_compact_kick(sb);
add_level_count(sb, mani, super, level, 1);
ret = 0;
}
@@ -250,7 +269,7 @@ int scoutfs_manifest_del(struct super_block *sb, struct kvec *first, u64 seq,
ret = scoutfs_treap_delete(mani->treap, &skey);
if (ret == 0)
add_level_count(mani, super, level, -1ULL);
add_level_count(sb, mani, super, level, -1ULL);
return ret;
}
@@ -618,6 +637,15 @@ int scoutfs_manifest_dirty_ring(struct super_block *sb)
return 0;
}
u64 scoutfs_manifest_level_count(struct super_block *sb, u8 level)
{
DECLARE_MANIFEST(sb, mani);
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_super_block *super = &sbi->super;
return get_level_count(mani, super, level);
}
/*
* Give the caller the segments that will be involved in the next
* compaction.
@@ -636,6 +664,8 @@ int scoutfs_manifest_dirty_ring(struct super_block *sb)
* compaction caller's data and let it do its thing. It'll allocate and
* free segments and update the manifest.
*
* Returns 1 if there's compaction work to do, 0 if not, or -errno.
*
* XXX this will get a lot more clever:
* - ensuring concurrent compactions don't overlap
* - prioritize segments with deletion or incremental records
@@ -769,7 +799,7 @@ done:
scoutfs_kvec_memcpy_truncate(mani->compact_keys[level], ment_last);
scoutfs_kvec_be_inc(mani->compact_keys[level]);
ret = 0;
ret = 1;
out:
up_write(&mani->rwsem);
return ret;

manifest.h

@@ -16,6 +16,7 @@ int scoutfs_manifest_unlock(struct super_block *sb);
int scoutfs_manifest_read_items(struct super_block *sb, struct kvec *key,
struct kvec *until);
u64 scoutfs_manifest_level_count(struct super_block *sb, u8 level);
int scoutfs_manifest_next_compact(struct super_block *sb, void *data);
int scoutfs_manifest_setup(struct super_block *sb);

trans.c

@@ -28,6 +28,7 @@
#include "seg.h"
#include "alloc.h"
#include "treap.h"
#include "compact.h"
#include "scoutfs_trace.h"
/*
@@ -210,9 +211,56 @@ int scoutfs_file_fsync(struct file *file, loff_t start, loff_t end,
}
/*
* The first holders race to try and allocate the segment that will be
* written by the next commit.
* I think the holder that creates the most dirty item data is
* symlinking, which can create all the entry items and a symlink target
* item with a full 4k path. We go a little nuts and just set it to two
* blocks.
*
* XXX This divides the segment size to set the hard limit on the number of
* concurrent holders so we'll want this to be more precise.
*/
#define MOST_DIRTY (2 * SCOUTFS_BLOCK_SIZE)
/*
* We're able to hold the transaction if the current dirty item bytes
* and the presumed worst case item dirtying of all the holders,
* including us, all fit in a segment.
*/
static bool hold_acquired(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
long bytes;
int with_us;
int holds;
int before;
holds = atomic_read(&sbi->trans_holds);
for (;;) {
/* transaction is being committed */
if (holds < 0)
return false;
/* only hold when there's no level 0 segments, XXX for now */
if (scoutfs_manifest_level_count(sb, 0) > 0) {
scoutfs_compact_kick(sb);
return false;
}
/* see if we all would fill the segment */
with_us = holds + 1;
bytes = (with_us * MOST_DIRTY) + scoutfs_item_dirty_bytes(sb);
if (bytes > SCOUTFS_SEGMENT_SIZE) {
scoutfs_sync_fs(sb, 0);
return false;
}
before = atomic_cmpxchg(&sbi->trans_holds, holds, with_us);
if (before == holds)
return true;
holds = before;
}
}
int scoutfs_hold_trans(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
@@ -220,37 +268,36 @@ int scoutfs_hold_trans(struct super_block *sb)
if (current == sbi->trans_task)
return 0;
return wait_event_interruptible(sbi->trans_hold_wq,
atomic_add_unless(&sbi->trans_holds, 1, -1));
return wait_event_interruptible(sbi->trans_hold_wq, hold_acquired(sb));
}
/*
* As we release we kick off a commit if we have a segment's worth of
* dirty items.
*
* Right now it's conservatively kicking off writes at ~95% full blocks.
* This leaves a lot of slop for the largest item bytes created by a
* holder and overrun by concurrent holders (who aren't accounted
* today).
*
* It should more precisely know the worst case item byte consumption of
* holders and only kick off a write when someone tries to hold who
* might fill the segment.
* As we release we'll almost certainly have dirtied less than the
* worst case dirty assumption that holders might be throttled waiting
* for. We always try and wake blocked holders in case they now have
* room to dirty.
*/
void scoutfs_release_trans(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
unsigned int target = (SCOUTFS_SEGMENT_SIZE * 95 / 100);
if (current == sbi->trans_task)
return;
if (atomic_sub_return(1, &sbi->trans_holds) == 0) {
if (scoutfs_item_dirty_bytes(sb) >= target)
scoutfs_sync_fs(sb, 0);
atomic_dec(&sbi->trans_holds);
wake_up(&sbi->trans_hold_wq);
}
wake_up(&sbi->trans_hold_wq);
}
/*
* This is called to wake people waiting on holders when the conditions
* that they're waiting on change: levels being full, dirty count falling
* under a segment, or holders falling to 0.
*/
void scoutfs_trans_wake_holders(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
wake_up(&sbi->trans_hold_wq);
}
int scoutfs_setup_trans(struct super_block *sb)

trans.h

@@ -8,6 +8,7 @@ int scoutfs_file_fsync(struct file *file, loff_t start, loff_t end,
int scoutfs_hold_trans(struct super_block *sb);
void scoutfs_release_trans(struct super_block *sb);
void scoutfs_trans_wake_holders(struct super_block *sb);
int scoutfs_setup_trans(struct super_block *sb);
void scoutfs_shutdown_trans(struct super_block *sb);