From f7701177d2c6c686a2ffd552470a1de95e1b7bee Mon Sep 17 00:00:00 2001
From: Zach Brown
Date: Wed, 21 Jun 2017 20:47:11 -0700
Subject: [PATCH] scoutfs: throttle addition of level 0 segments

Writers can add level 0 segments much faster (~20x) than compaction can
compact them down into the lower levels.  Without a limit on the number
of level 0 segments, item reading can try to read an extraordinary
number of level 0 segments and wedge the box in nonreclaimable page
allocations.

Signed-off-by: Zach Brown
---
 kmod/src/manifest.c | 56 +++++++++++++++++++++++++++++++++++++--------
 kmod/src/manifest.h |  2 ++
 kmod/src/net.c      | 42 +++++++++++++++++++++++++++++++---
 3 files changed, 88 insertions(+), 12 deletions(-)

diff --git a/kmod/src/manifest.c b/kmod/src/manifest.c
index df6ba43f..564b7259 100644
--- a/kmod/src/manifest.c
+++ b/kmod/src/manifest.c
@@ -45,9 +45,13 @@ struct manifest {
 	/* calculated on mount, const thereafter */
 	u64 level_limits[SCOUTFS_MANIFEST_MAX_LEVEL + 1];
 
+	unsigned long flags;
+
 	struct scoutfs_key_buf *compact_keys[SCOUTFS_MANIFEST_MAX_LEVEL + 1];
 };
 
+#define MANI_FLAG_LEVEL0_FULL (1 << 0)
+
 #define DECLARE_MANIFEST(sb, name) \
 	struct manifest *name = SCOUTFS_SB(sb)->manifest
 
@@ -109,6 +113,46 @@ static bool cmp_range_ment(struct scoutfs_key_buf *key,
 	return scoutfs_key_compare_ranges(key, end, &first, &last);
 }
 
+/*
+ * Change the level count under the manifest lock.  We then maintain a
+ * bit that can be tested outside the lock to determine if the caller
+ * should wait for level 0 segments to drain.
+ */
+static void add_level_count(struct super_block *sb, int level, s64 val)
+{
+	DECLARE_MANIFEST(sb, mani);
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
+	struct scoutfs_super_block *super = &sbi->super;
+	__le64 count;
+	int full;
+
+	le64_add_cpu(&super->manifest.level_counts[level], val);
+
+	if (level == 0) {
+		count = super->manifest.level_counts[level];
+		full = test_bit(MANI_FLAG_LEVEL0_FULL, &mani->flags);
+		if (count && !full)
+			set_bit(MANI_FLAG_LEVEL0_FULL, &mani->flags);
+		else if (!count && full)
+			clear_bit(MANI_FLAG_LEVEL0_FULL, &mani->flags);
+	}
+}
+
+/*
+ * Return whether or not level 0 is full of segments.  It's safe to use
+ * this as a wait_event condition because it doesn't block.
+ *
+ * Callers rely on the spin locks in wait queues to synchronize testing
+ * this as a sleeping condition with addition to the wait queue and
+ * waking of the waitqueue.
+ */
+bool scoutfs_manifest_level0_full(struct super_block *sb)
+{
+	DECLARE_MANIFEST(sb, mani);
+
+	return test_bit(MANI_FLAG_LEVEL0_FULL, &mani->flags);
+}
+
 /*
  * Insert a new manifest entry in the ring.  The ring allocates a new
  * node for us and we fill it.
@@ -121,8 +165,6 @@ int scoutfs_manifest_add(struct super_block *sb,
 			 u8 level)
 {
 	DECLARE_MANIFEST(sb, mani);
-	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
-	struct scoutfs_super_block *super = &sbi->super;
 	struct scoutfs_manifest_entry *ment;
 	struct scoutfs_key_buf ment_first;
 	struct scoutfs_key_buf ment_last;
@@ -154,7 +196,7 @@ int scoutfs_manifest_add(struct super_block *sb,
 	scoutfs_key_copy(&ment_last, last);
 
 	mani->nr_levels = max_t(u8, mani->nr_levels, level + 1);
-	le64_add_cpu(&super->manifest.level_counts[level], 1);
+	add_level_count(sb, level, 1);
 
 	return 0;
 }
@@ -168,8 +210,6 @@ int scoutfs_manifest_add_ment(struct super_block *sb,
 			      struct scoutfs_manifest_entry *add)
 {
 	DECLARE_MANIFEST(sb, mani);
-	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
-	struct scoutfs_super_block *super = &sbi->super;
 	struct scoutfs_manifest_entry *ment;
 	struct manifest_search_key skey;
 	struct scoutfs_key_buf first;
@@ -195,7 +235,7 @@ int scoutfs_manifest_add_ment(struct super_block *sb,
 	memcpy(ment, add, bytes);
 
 	mani->nr_levels = max_t(u8, mani->nr_levels, add->level + 1);
-	le64_add_cpu(&super->manifest.level_counts[add->level], 1);
+	add_level_count(sb, add->level, 1);
 
 	return 0;
 }
@@ -229,8 +269,6 @@ int scoutfs_manifest_del(struct super_block *sb, struct scoutfs_key_buf *first,
 			 u64 seq, u8 level)
 {
 	DECLARE_MANIFEST(sb, mani);
-	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
-	struct scoutfs_super_block *super = &sbi->super;
 	struct scoutfs_manifest_entry *ment;
 	struct manifest_search_key skey;
 	struct scoutfs_key_buf last;
@@ -248,7 +286,7 @@ int scoutfs_manifest_del(struct super_block *sb, struct scoutfs_key_buf *first,
 			     le64_to_cpu(ment->seq), first, &last);
 
 	scoutfs_ring_delete(&mani->ring, ment);
-	le64_add_cpu(&super->manifest.level_counts[level], -1ULL);
+	add_level_count(sb, level, -1ULL);
 
 	return 0;
 }
diff --git a/kmod/src/manifest.h b/kmod/src/manifest.h
index b65860a2..46aef7d1 100644
--- a/kmod/src/manifest.h
+++ b/kmod/src/manifest.h
@@ -45,6 +45,8 @@ int scoutfs_manifest_add_ment_ref(struct super_block *sb,
 
 int scoutfs_manifest_next_compact(struct super_block *sb, void *data);
 
+bool scoutfs_manifest_level0_full(struct super_block *sb);
+
 int scoutfs_manifest_setup(struct super_block *sb);
 void scoutfs_manifest_destroy(struct super_block *sb);
 
diff --git a/kmod/src/net.c b/kmod/src/net.c
index 76d9ca3a..91189304 100644
--- a/kmod/src/net.c
+++ b/kmod/src/net.c
@@ -84,6 +84,9 @@ struct net_info {
 	struct llist_head ring_commit_waiters;
 	struct work_struct ring_commit_work;
 
+	/* level 0 segment addition waits for it to clear */
+	wait_queue_head_t waitq;
+
 	/* server tracks seq use */
 	spinlock_t seq_lock;
 	struct list_head pending_seqs;
@@ -422,6 +425,20 @@ static struct send_buf *process_bulk_alloc(struct super_block *sb, void *req,
 	return sbuf;
 }
 
+/*
+ * This is new segments arriving.  It needs to wait for level 0 to be
+ * free.  It has relatively little visibility into the manifest, though.
+ * We don't want it to block holding commits because that'll stop
+ * manifest updates from emptying level 0.
+ *
+ * Maybe the easiest way is to protect the level counts with a seqlock,
+ * or whatever.
+ */
+
+/*
+ * The sender has written their level 0 segment and has given us its
+ * details.  We wait for there to be room in level 0 before adding it.
+ */
 static struct send_buf *process_record_segment(struct super_block *sb,
 					       void *req, int req_len)
 {
@@ -443,9 +460,18 @@ static struct send_buf *process_record_segment(struct super_block *sb,
 		goto out;
 	}
 
+retry:
 	down_read(&nti->ring_commit_rwsem);
-
 	scoutfs_manifest_lock(sb);
+
+	if (scoutfs_manifest_level0_full(sb)) {
+		scoutfs_manifest_unlock(sb);
+		up_read(&nti->ring_commit_rwsem);
+		/* XXX waits indefinitely? io errors? */
+		wait_event(nti->waitq, !scoutfs_manifest_level0_full(sb));
+		goto retry;
+	}
+
 	ret = scoutfs_manifest_add_ment(sb, ment);
 	scoutfs_manifest_unlock(sb);
 
@@ -1446,20 +1472,29 @@ int scoutfs_net_get_compaction(struct super_block *sb, void *curs)
  * In the future we'd encode the manifest and segnos in requests sent to
  * the server who'd update the manifest and allocator in request
  * processing.
+ *
+ * As we finish a compaction we wake level0 writers if it opened up
+ * space in level 0.
  */
 int scoutfs_net_finish_compaction(struct super_block *sb, void *curs,
 				  void *list)
 {
 	DECLARE_NET_INFO(sb, nti);
 	struct commit_waiter cw;
+	bool level0_was_full;
 	int ret;
 
 	down_read(&nti->ring_commit_rwsem);
-	ret = scoutfs_compact_commit(sb, curs, list);
+	level0_was_full = scoutfs_manifest_level0_full(sb);
 
-	if (ret == 0)
+	ret = scoutfs_compact_commit(sb, curs, list);
+	if (ret == 0) {
 		queue_commit_work(nti, &cw);
+		if (level0_was_full && !scoutfs_manifest_level0_full(sb))
+			wake_up(&nti->waitq);
+	}
+
 	up_read(&nti->ring_commit_rwsem);
 
 	if (ret == 0)
@@ -2150,6 +2185,7 @@ int scoutfs_net_setup(struct super_block *sb)
 	init_rwsem(&nti->ring_commit_rwsem);
 	init_llist_head(&nti->ring_commit_waiters);
 	INIT_WORK(&nti->ring_commit_work, scoutfs_net_ring_commit_func);
+	init_waitqueue_head(&nti->waitq);
 	spin_lock_init(&nti->seq_lock);
 	INIT_LIST_HEAD(&nti->pending_seqs);
 	INIT_LIST_HEAD(&nti->active_socks);
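
The throttle boils down to a "wait while level 0 is occupied, wake when compaction
drains it" handshake: writers test the full bit outside the manifest lock, sleep on
nti->waitq while it is set, and scoutfs_net_finish_compaction() wakes them once its
commit empties level 0.  Below is a minimal userspace sketch of that handshake, not
scoutfs code: a pthread mutex and condition variable stand in for the manifest lock
and the kernel waitqueue, and every name in it (level0_count, add_level0_count,
writer, compactor) is made up for illustration.

/*
 * Minimal userspace sketch of the level 0 throttle, not scoutfs code.
 * A pthread mutex and condvar stand in for the manifest lock and the
 * net_info waitq; all names here are illustrative.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t drained = PTHREAD_COND_INITIALIZER;
static unsigned long level0_count;
static bool level0_full;

/* like add_level_count(): adjust the count and maintain the full bit */
static void add_level0_count(long val)
{
	level0_count += val;
	level0_full = (level0_count != 0);
	if (!level0_full)
		pthread_cond_broadcast(&drained); /* like wake_up(&nti->waitq) */
}

/* like process_record_segment(): wait for room before adding a segment */
static void *writer(void *arg)
{
	long id = (long)(intptr_t)arg;
	int i;

	for (i = 0; i < 5; i++) {
		pthread_mutex_lock(&lock);
		while (level0_full) /* like the wait_event() + retry loop */
			pthread_cond_wait(&drained, &lock);
		add_level0_count(1);
		printf("writer %ld added a segment, level 0 count %lu\n",
		       id, level0_count);
		pthread_mutex_unlock(&lock);
	}
	return NULL;
}

/* like scoutfs_net_finish_compaction(): drain level 0 and wake writers */
static void *compactor(void *arg)
{
	(void)arg;
	for (;;) {
		usleep(1000); /* compaction is much slower than writers */
		pthread_mutex_lock(&lock);
		if (level0_count)
			add_level0_count(-(long)level0_count);
		pthread_mutex_unlock(&lock);
	}
	return NULL;
}

int main(void)
{
	pthread_t writers[3], c;
	long i;

	pthread_create(&c, NULL, compactor, NULL);
	for (i = 0; i < 3; i++)
		pthread_create(&writers[i], NULL, writer, (void *)(intptr_t)i);
	for (i = 0; i < 3; i++)
		pthread_join(writers[i], NULL);
	return 0; /* process exit ends the endless compactor thread */
}

The level0_was_full test in the patch corresponds to broadcasting only when the
count actually drops to zero, so a compaction that leaves level 0 occupied doesn't
wake writers just to put them straight back to sleep.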