From c4663ea1a17f847fc857e4ecb5d2fe77ad5f89cd Mon Sep 17 00:00:00 2001
From: Zach Brown
Date: Tue, 6 Oct 2020 15:10:00 -0700
Subject: [PATCH] scoutfs: compact items in item cache pages

The first pass of the item cache didn't try to reclaim freed space at
all. It would leave behind very sparse pages, the oldest of which would
be reclaimed by memory pressure. While this worked, it created much
more stress on the system than necessary. Splitting a page with only
one key also makes it hard to calculate the boundaries of the split
pages, given that both the start and end keys could fall on the single
item.

This adds a header field which tracks the free space in item cache
pages. Free space is created before the alloc offset by removing items
from the rbtree, but also by shrinking item values when updating or
deleting items. If we try to split a page with sufficient free space
to insert the largest possible item, then we compact the page instead
of splitting it. We copy the items into the front of an unused page
and swap the pages.

Signed-off-by: Zach Brown
---
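
[Reviewer note, not part of the commit message: below is a minimal
userspace sketch of the scheme. The toy_* names are invented stand-ins
for cached_page and cached_item; the kernel code keeps items in an
rbtree and swaps struct page pointers, while this toy walks its buffer
linearly and marks erased items with a flag. The central move is the
same: copy the live items to the front of a spare page, swap the
pages, and reset the counters.]

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define TOY_PAGE_SIZE	4096u
#define TOY_ALIGN(x)	(((x) + 7u) & ~7u)	/* stands in for CACHED_ITEM_ALIGN */

struct toy_item {
	unsigned int val_len;
	unsigned int erased;	/* stands in for removal from the rbtree */
};

struct toy_page {
	char *page;		/* stands in for pg->page */
	unsigned int page_off;	/* tail allocation offset */
	unsigned int erased_bytes;	/* interior space freed by erases */
};

static unsigned int toy_bytes(unsigned int val_len)
{
	return TOY_ALIGN(sizeof(struct toy_item) + val_len);
}

static struct toy_item *toy_insert(struct toy_page *pg, unsigned int val_len)
{
	struct toy_item *it;

	if (pg->page_off + toy_bytes(val_len) > TOY_PAGE_SIZE)
		return NULL;	/* no tail room: caller compacts or splits */

	it = (struct toy_item *)(pg->page + pg->page_off);
	it->val_len = val_len;
	it->erased = 0;
	pg->page_off += toy_bytes(val_len);
	return it;
}

static void toy_erase(struct toy_page *pg, struct toy_item *it)
{
	it->erased = 1;
	pg->erased_bytes += toy_bytes(it->val_len);
}

/* the analogue of compact_page_items(): pack live items into a spare page */
static void toy_compact(struct toy_page *pg, struct toy_page *spare)
{
	unsigned int off = 0;
	unsigned int spare_off = 0;
	char *tmp;

	while (off < pg->page_off) {
		struct toy_item *it = (struct toy_item *)(pg->page + off);
		unsigned int sz = toy_bytes(it->val_len);

		if (!it->erased) {
			memcpy(spare->page + spare_off, it, sz);
			spare_off += sz;
		}
		off += sz;
	}

	tmp = pg->page;		/* the swap(pg->page, empty->page) step */
	pg->page = spare->page;
	spare->page = tmp;
	pg->page_off = spare_off;
	pg->erased_bytes = 0;
}

int main(void)
{
	struct toy_page pg = { .page = calloc(1, TOY_PAGE_SIZE) };
	struct toy_page spare = { .page = calloc(1, TOY_PAGE_SIZE) };
	struct toy_item *it;
	unsigned int i;

	/* fill the page, then erase every other item to leave holes */
	for (i = 0; (it = toy_insert(&pg, 100)); i++) {
		if (i & 1)
			toy_erase(&pg, it);
	}

	assert(toy_insert(&pg, 100) == NULL);	/* tail exhausted */
	toy_compact(&pg, &spare);
	assert(toy_insert(&pg, 100) != NULL);	/* holes reclaimed */

	printf("after compact: page_off=%u erased_bytes=%u\n",
	       pg.page_off, pg.erased_bytes);
	free(pg.page);
	free(spare.page);
	return 0;
}

[The in-kernel compact_page_items() below additionally rebuilds the
item rbtree and the page's dirty list while it copies.]
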
 kmod/src/counters.h |   1 +
 kmod/src/item.c     | 110 +++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 100 insertions(+), 11 deletions(-)

diff --git a/kmod/src/counters.h b/kmod/src/counters.h
index 140a07f5..2f4098cc 100644
--- a/kmod/src/counters.h
+++ b/kmod/src/counters.h
@@ -85,6 +85,7 @@
 	EXPAND_COUNTER(item_page_accessed)			\
 	EXPAND_COUNTER(item_page_alloc)				\
 	EXPAND_COUNTER(item_page_clear_dirty)			\
+	EXPAND_COUNTER(item_page_compact)			\
 	EXPAND_COUNTER(item_page_free)				\
 	EXPAND_COUNTER(item_page_lru_add)			\
 	EXPAND_COUNTER(item_page_lru_remove)			\
diff --git a/kmod/src/item.c b/kmod/src/item.c
index fade9084..120c07c1 100644
--- a/kmod/src/item.c
+++ b/kmod/src/item.c
@@ -129,6 +129,7 @@ struct cached_page {
 	struct list_head dirty_list;
 	struct page *page;
 	unsigned int page_off;
+	unsigned int erased_bytes;
 	atomic_t refcount;
 };
 
@@ -416,6 +417,13 @@ static struct cached_item *alloc_item(struct cached_page *pg,
 	return item;
 }
 
+static void erase_item(struct cached_page *pg, struct cached_item *item)
+{
+	rbtree_erase(&item->node, &pg->item_root);
+	pg->erased_bytes += round_up(item_val_bytes(item->val_len),
+				     CACHED_ITEM_ALIGN);
+}
+
 static void lru_add(struct super_block *sb, struct item_cache_info *cinf,
 		    struct cached_page *pg)
 {
@@ -649,7 +657,8 @@ static void erase_page_items(struct cached_page *pg,
 		if (scoutfs_key_compare(&item->key, end) > 0)
 			break;
 
-		rbtree_erase(&item->node, &pg->item_root);
+
+		erase_item(pg, item);
 	}
 }
 
@@ -702,7 +711,7 @@ static void move_page_items(struct super_block *sb,
 		to->persistent = from->persistent;
 		to->deletion = from->deletion;
 
-		rbtree_erase(&from->node, &left->item_root);
+		erase_item(left, from);
 	}
 }
 
@@ -775,6 +784,64 @@ static int trim_page_intersection(struct super_block *sb,
 	return PGI_BISECT;
 }
 
+/*
+ * The caller wants to allocate an item in the page but there isn't room
+ * at the page_off.  If erasing items has left sufficient internal free
+ * space we can pack the existing items to the start of the page to make
+ * room for the insertion.
+ *
+ * The caller's empty pg is only used for its page struct, which we swap
+ * with our old fragmented page.  We don't touch its pg struct.
+ *
+ * This is a coarse bulk way of dealing with free space, as opposed to
+ * specifically tracking internal free regions and using them to satisfy
+ * item allocations.
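+ *
+ * For example, with holes left by erased or shrunk items (sizes are
+ * illustrative only):
+ *
+ *   before:  |itemA|.hole.|itemB|.hole.|itemC|...free tail...|
+ *                                             ^page_off
+ *   after:   |itemA|itemB|itemC|........free tail...........|
+ *                               ^page_off, erased_bytes == 0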
+ */
+static void compact_page_items(struct super_block *sb,
+			       struct cached_page *pg,
+			       struct cached_page *empty)
+{
+	struct cached_item *from;
+	struct cached_item *to;
+	struct rb_root item_root = RB_ROOT;
+	struct rb_node *par = NULL;
+	struct rb_node **pnode = &item_root.rb_node;
+	unsigned int page_off = 0;
+	LIST_HEAD(dirty_list);
+
+	if (pg->erased_bytes < item_val_bytes(SCOUTFS_MAX_VAL_SIZE))
+		return;
+
+	if (WARN_ON_ONCE(empty->page_off != 0) ||
+	    WARN_ON_ONCE(!RB_EMPTY_ROOT(&empty->item_root)) ||
+	    WARN_ON_ONCE(!list_empty(&empty->dirty_list)))
+		return;
+
+	scoutfs_inc_counter(sb, item_page_compact);
+
+	for (from = first_item(&pg->item_root); from; from = next_item(from)) {
+		to = page_address(empty->page) + page_off;
+		page_off += round_up(item_val_bytes(from->val_len),
+				     CACHED_ITEM_ALIGN);
+
+		/* copy the entire item, struct members and all */
+		memcpy(to, from, item_val_bytes(from->val_len));
+
+		rbtree_insert(&to->node, par, pnode, &item_root);
+		par = &to->node;
+		pnode = &to->node.rb_right;
+
+		if (to->dirty)
+			list_add_tail(&to->dirty_head, &dirty_list);
+	}
+
+	pg->item_root = item_root;
+	list_replace(&dirty_list, &pg->dirty_list);
+	swap(pg->page, empty->page);
+	pg->page_off = page_off;
+	pg->erased_bytes = 0;
+}
+
 /*
  * This behaves a little differently than the other walks because we
  * want to minimize compares and there are only simple searching and
@@ -1028,6 +1095,9 @@ static int try_split_page(struct super_block *sb, struct item_cache_info *cinf,
 
 	write_lock(&pg->rwlock);
 
+	if (!page_has_room(pg, val_len))
+		compact_page_items(sb, pg, left);
+
 	if (page_has_room(pg, val_len)) {
 		write_unlock(&cinf->rwlock);
 		write_unlock(&pg->rwlock);
@@ -1207,8 +1277,8 @@ static int read_page_item(struct super_block *sb, struct scoutfs_key *key,
 {
 	DECLARE_ITEM_CACHE_INFO(sb, cinf);
 	struct rb_root *root = arg;
-	struct cached_page *right;
-	struct cached_page *left;
+	struct cached_page *right = NULL;
+	struct cached_page *left = NULL;
 	struct cached_page *pg;
 	struct cached_item *found;
 	struct cached_item *item;
@@ -1222,10 +1292,19 @@ static int read_page_item(struct super_block *sb, struct scoutfs_key *key,
 	if (found && (le64_to_cpu(found->liv.vers) >= le64_to_cpu(liv->vers)))
 		return 0;
 
+	if (!page_has_room(pg, val_len)) {
+		left = alloc_pg(sb, 0);
+		/* split needs multiple items, sparse may not have enough */
+		if (!left)
+			return -ENOMEM;
+		compact_page_items(sb, pg, left);
+	}
+
 	item = alloc_item(pg, key, liv, val, val_len);
 	if (!item) {
 		/* simpler split of private pages, no locking/dirty/lru */
-		left = alloc_pg(sb, 0);
+		if (!left)
+			left = alloc_pg(sb, 0);
 		right = alloc_pg(sb, 0);
 		if (!left || !right) {
 			put_pg(sb, left);
@@ -1247,6 +1326,9 @@
 		item = alloc_item(pg, key, liv, val, val_len);
 		found = item_rbtree_walk(&pg->item_root, key, NULL, &par, &pnode);
+
+		left = NULL;
+		right = NULL;
 	}
 
 	/* if deleted a deletion item will be required */
 
 	rbtree_insert(&item->node, par, pnode, &pg->item_root);
 	if (found)
-		rbtree_erase(&found->node, &pg->item_root);
+		erase_item(pg, found);
+
+	put_pg(sb, left);
+	put_pg(sb, right);
 
 	return 0;
 }
@@ -1348,7 +1433,7 @@ static int read_pages(struct super_block *sb, struct item_cache_info *cinf,
 	/* drop deletion items, we don't need them in the cache */
 	for_each_item_safe(&pg->item_root, item, item_tmp) {
 		if (item->deletion)
-			rbtree_erase(&item->node, &pg->item_root);
+			erase_item(pg, item);
 	}
 }
 
@@ -1740,7 +1825,7 @@ static int item_create(struct super_block *sb, struct scoutfs_key *key,
 	if (found) {
 		item->persistent = found->persistent;
 		clear_item_dirty(sb, cinf, pg, found);
-		rbtree_erase(&found->node, &pg->item_root);
+		erase_item(pg, found);
 	}
 
 	if (force)
@@ -1811,6 +1896,8 @@ int scoutfs_item_update(struct super_block *sb, struct scoutfs_key *key,
 		if (val_len <= found->val_len) {
 			if (val_len)
 				memcpy(found->val, val, val_len);
+			if (val_len < found->val_len)
+				pg->erased_bytes += found->val_len - val_len;
 			found->val_len = val_len;
 			found->liv.vers = liv.vers;
 			mark_item_dirty(sb, cinf, pg, NULL, found);
@@ -1821,7 +1908,7 @@ int scoutfs_item_update(struct super_block *sb, struct scoutfs_key *key,
 		mark_item_dirty(sb, cinf, pg, NULL, item);
 
 		clear_item_dirty(sb, cinf, pg, found);
-		rbtree_erase(&found->node, &pg->item_root);
+		erase_item(pg, found);
 	}
 
 	ret = 0;
@@ -1883,12 +1970,13 @@ static int item_delete(struct super_block *sb, struct scoutfs_key *key,
 	if (!item->persistent) {
 		/* can just forget items that aren't yet persistent */
 		clear_item_dirty(sb, cinf, pg, item);
-		rbtree_erase(&item->node, &pg->item_root);
+		erase_item(pg, item);
 	} else {
 		/* must emit deletion to clobber old persistent item */
 		item->liv.vers = cpu_to_le64(lock->write_version);
 		item->liv.flags |= SCOUTFS_LOG_ITEM_FLAG_DELETION;
 		item->deletion = 1;
+		pg->erased_bytes += item->val_len;
 		item->val_len = 0;
 		mark_item_dirty(sb, cinf, pg, NULL, item);
 	}
@@ -2115,7 +2203,7 @@ retry:
 
 		/* free deletion items */
 		if (item->deletion)
-			rbtree_erase(&item->node, &pg->item_root);
+			erase_item(pg, item);
 		else
 			item->persistent = 1;
 	}
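
[Reviewer note, not part of the patch: a toy walk-through of the
erased_bytes accounting above, using invented sizes. Assume 8-byte
CACHED_ITEM_ALIGN and a hypothetical 40-byte item header, so that
item_val_bytes(n) is 40 + n.]

	insert val_len=20     page_off     += round_up(40 + 20, 8) == 64
	update to val_len=4   erased_bytes += 20 - 4               == 16
	delete (persistent)   erased_bytes += 4, item becomes a deletion item
	erase_item() later    erased_bytes += round_up(40 + 0, 8)  == 40

The counter only has to be good enough to gate compaction:
compact_page_items() recomputes page_off from the live items and resets
erased_bytes to zero, so any rounding slop from the update and delete
paths washes out at the next compaction.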