scoutfs: switch block cache to rbtree

Switch the block cache from indexing blocks in a radix tree to using an
rbtree.  We lose RCU lookups, but we gain the ability to move blocks in
and out of the cache without allocation failures.  And we no longer
have the problem of being unable to index large block numbers with a
32-bit long radix key.

Signed-off-by: Zach Brown <zab@versity.com>
Author:    Zach Brown
Date:      2020-02-20 13:59:03 -08:00
Committer: Zach Brown
Commit:    809d4be58e
Parent:    05a8573054
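The core of the tradeoff is visible in the hunks below: radix tree
insertion has to allocate internal nodes, so it can fail and forces the
preload dance this patch removes, while linking an embedded rb_node is
pure pointer manipulation.  A minimal sketch of the two insertion
shapes (illustrative only, not scoutfs code; the helper and struct
names here are made up):

#include <linux/gfp.h>
#include <linux/radix-tree.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/types.h>

/*
 * Old shape: radix_tree_insert() may need to allocate internal nodes,
 * so callers preload outside the lock and can still fail with -ENOMEM.
 * The index is an unsigned long, which can't hold large u64 block
 * numbers on 32-bit kernels.
 */
static int radix_cache_insert(struct radix_tree_root *radix,
                              spinlock_t *lock, unsigned long blkno,
                              void *item)
{
        int ret;

        ret = radix_tree_preload(GFP_NOFS);
        if (ret)
                return ret;
        spin_lock(lock);
        ret = radix_tree_insert(radix, blkno, item);
        spin_unlock(lock);
        radix_tree_preload_end();
        return ret;
}

/*
 * New shape: the rb_node is embedded in the cached item, so linking it
 * under the lock is just pointer writes with no failure path.  (This
 * sketch ignores duplicate keys; the patch's walk_block_rbtree()
 * returns the existing block instead of inserting.)
 */
struct cached_block {
        struct rb_node node;
        u64 blkno;
};

static void rbtree_cache_insert(struct rb_root *root, spinlock_t *lock,
                                struct cached_block *cb)
{
        struct rb_node **link, *parent = NULL;
        struct cached_block *it;

        spin_lock(lock);
        link = &root->rb_node;
        while (*link) {
                parent = *link;
                it = rb_entry(parent, struct cached_block, node);
                if (cb->blkno < it->blkno)
                        link = &parent->rb_left;
                else
                        link = &parent->rb_right;
        }
        /* pure pointer writes: no allocation, nothing to fail */
        rb_link_node(&cb->node, parent, link);
        rb_insert_color(&cb->node, root);
        spin_unlock(lock);
}

What's lost is on the lookup side: radix_tree_lookup() could run under
rcu_read_lock(), while the rbtree walk in the patch has to take the
spinlock, which is the tradeoff the commit message calls out.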

@@ -19,6 +19,7 @@
 #include <linux/sched.h>
 #include <linux/bio.h>
 #include <linux/blkdev.h>
+#include <linux/rbtree.h>

 #include "format.h"
 #include "super.h"
@@ -45,7 +46,7 @@
 struct block_info {
         struct super_block *sb;
         spinlock_t lock;
-        struct radix_tree_root radix;
+        struct rb_root root;
         struct list_head lru_list;
         u64 lru_nr;
         u64 lru_move_counter;
@@ -63,7 +64,7 @@ enum {
         BLOCK_BIT_NEW,          /* newly allocated, contents undefined */
         BLOCK_BIT_DIRTY,        /* dirty, writer will write */
         BLOCK_BIT_ERROR,        /* saw IO error */
-        BLOCK_BIT_DELETED,      /* has been deleted from radix tree */
+        BLOCK_BIT_DELETED,      /* has been deleted from rbtree */
         BLOCK_BIT_PAGE_ALLOC,   /* page (possibly high order) allocation */
         BLOCK_BIT_VIRT,         /* mapped virt allocation */
         BLOCK_BIT_CRC_VALID,    /* crc has been verified */
@@ -72,6 +73,7 @@ enum {
 struct block_private {
         struct scoutfs_block bl;
+        struct rb_node node;
         struct super_block *sb;
         atomic_t refcount;
         union {
@@ -180,6 +182,7 @@ static struct block_private *block_alloc(struct super_block *sb, u64 blkno)
         }

         bp->bl.blkno = blkno;
+        RB_CLEAR_NODE(&bp->node);
         bp->sb = sb;
         atomic_set(&bp->refcount, 1);
         INIT_LIST_HEAD(&bp->lru_entry);
@@ -249,9 +252,39 @@ static void block_put(struct super_block *sb, struct block_private *bp)
         }
 }

+static struct block_private *walk_block_rbtree(struct rb_root *root,
+                                               u64 blkno,
+                                               struct block_private *ins)
+{
+        struct rb_node **node = &root->rb_node;
+        struct rb_node *parent = NULL;
+        struct block_private *bp;
+        int cmp;
+
+        while (*node) {
+                parent = *node;
+                bp = container_of(*node, struct block_private, node);
+
+                cmp = scoutfs_cmp_u64s(bp->bl.blkno, blkno);
+                if (cmp == 0)
+                        return bp;
+                else if (cmp < 0)
+                        node = &(*node)->rb_left;
+                else
+                        node = &(*node)->rb_right;
+        }
+
+        if (ins) {
+                rb_link_node(&ins->node, parent, node);
+                rb_insert_color(&ins->node, root);
+                return ins;
+        }
+
+        return NULL;
+}
+
 /*
- * Add a new block into the cache. The caller holds the lock and has
- * preloaded the radix.
+ * Add a new block into the cache. The caller holds the lock.
  */
 static void block_insert(struct super_block *sb, struct block_private *bp,
                          u64 blkno)
@@ -260,9 +293,10 @@ static void block_insert(struct super_block *sb, struct block_private *bp,
         assert_spin_locked(&binf->lock);
         BUG_ON(!list_empty(&bp->lru_entry));
+        BUG_ON(!RB_EMPTY_NODE(&bp->node));

         atomic_inc(&bp->refcount);
-        radix_tree_insert(&binf->radix, blkno, bp);
+        walk_block_rbtree(&binf->root, blkno, bp);
         list_add_tail(&bp->lru_entry, &binf->lru_list);
         bp->lru_moved = ++binf->lru_move_counter;
         binf->lru_nr++;
@@ -310,11 +344,10 @@ static void block_remove(struct super_block *sb, struct block_private *bp)
 {
         DECLARE_BLOCK_INFO(sb, binf);

         assert_spin_locked(&binf->lock);

         if (!test_and_set_bit(BLOCK_BIT_DELETED, &bp->bits)) {
                 BUG_ON(list_empty(&bp->lru_entry));
-                radix_tree_delete(&binf->radix, bp->bl.blkno);
+                rb_erase(&bp->node, &binf->root);
+                RB_CLEAR_NODE(&bp->node);
                 list_del_init(&bp->lru_entry);
                 binf->lru_nr--;
                 block_put(sb, bp);
@@ -328,19 +361,18 @@ static void block_remove_all(struct super_block *sb)
 {
         DECLARE_BLOCK_INFO(sb, binf);
         struct block_private *bp;
+        struct rb_node *node;

         spin_lock(&binf->lock);
-        while (radix_tree_gang_lookup(&binf->radix, (void **)&bp, 0, 1) == 1) {
+        for (node = rb_first(&binf->root); node; ) {
+                bp = container_of(node, struct block_private, node);
+                node = rb_next(node);
                 wait_event(binf->waitq, atomic_read(&bp->io_count) == 0);
                 block_remove(sb, bp);
         }
         spin_unlock(&binf->lock);

         WARN_ON_ONCE(!list_empty(&binf->lru_list));
         WARN_ON_ONCE(binf->lru_nr != 0);
-        WARN_ON_ONCE(binf->radix.rnode != NULL);
+        WARN_ON_ONCE(!RB_EMPTY_ROOT(&binf->root));
 }

 /*
@@ -457,8 +489,8 @@ static int block_submit_bio(struct super_block *sb, struct block_private *bp,
 /*
  * Return a reference to a cached block in the system, allocating a new
- * block if one isn't found in the radix. Its contents are undefined if
- * it's newly allocated.
+ * block if one isn't found in the rbtree. Its contents are undefined
+ * if it's newly allocated.
  */
 static struct block_private *block_get(struct super_block *sb, u64 blkno)
 {
@@ -467,11 +499,11 @@ static struct block_private *block_get(struct super_block *sb, u64 blkno)
         struct block_private *bp;
         int ret;

-        rcu_read_lock();
-        bp = radix_tree_lookup(&binf->radix, blkno);
+        spin_lock(&binf->lock);
+        bp = walk_block_rbtree(&binf->root, blkno, NULL);
         if (bp)
                 atomic_inc(&bp->refcount);
-        rcu_read_unlock();
+        spin_unlock(&binf->lock);

         /* drop failed reads that interrupted waiters abandoned */
         if (bp && (test_bit(BLOCK_BIT_ERROR, &bp->bits) &&
@@ -490,20 +522,15 @@ static struct block_private *block_get(struct super_block *sb, u64 blkno)
                 goto out;
         }

-        ret = radix_tree_preload(GFP_NOFS);
-        if (ret)
-                goto out;
-
-        /* could use slot instead of lookup/insert */
+        /* could refactor to insert in one walk */
         spin_lock(&binf->lock);
-        found = radix_tree_lookup(&binf->radix, blkno);
+        found = walk_block_rbtree(&binf->root, blkno, NULL);
         if (found) {
                 atomic_inc(&found->refcount);
         } else {
                 block_insert(sb, bp, blkno);
         }
         spin_unlock(&binf->lock);
-        radix_tree_preload_end();

         if (found) {
                 block_put(sb, bp);
@@ -850,17 +877,8 @@ int scoutfs_block_setup(struct super_block *sb)
 {
         struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
         struct block_info *binf;
-        loff_t size;
         int ret;

-        /* we store blknos in longs in the radix */
-        size = i_size_read(sb->s_bdev->bd_inode);
-        if ((size >> SCOUTFS_BLOCK_SHIFT) >= LONG_MAX) {
-                scoutfs_err(sb, "Cant reference all blocks in %llu byte device with %u bit long radix tree indexes",
-                            size, BITS_PER_LONG);
-                return -EINVAL;
-        }
-
         binf = kzalloc(sizeof(struct block_info), GFP_KERNEL);
         if (!binf) {
                 ret = -ENOMEM;
@@ -869,7 +887,7 @@ int scoutfs_block_setup(struct super_block *sb)
         binf->sb = sb;
         spin_lock_init(&binf->lock);
-        INIT_RADIX_TREE(&binf->radix, GFP_ATOMIC); /* insertion preloads */
+        binf->root = RB_ROOT;
         INIT_LIST_HEAD(&binf->lru_list);
         init_waitqueue_head(&binf->waitq);
         binf->shrinker.shrink = block_shrink;
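
The last hunk can drop the old device-size check because rbtree keys
are full u64 blknos rather than unsigned long radix indexes.  A rough
sketch of the limit the removed check enforced (the 4 KiB block size
below is an assumption for illustration; the real value is
SCOUTFS_BLOCK_SHIFT from format.h):

#include <linux/kernel.h>
#include <linux/types.h>

/*
 * Illustrative arithmetic only.  Radix tree indexes are unsigned
 * longs, so a 32-bit kernel can only index blknos up to LONG_MAX
 * (about 2^31).  Assuming 4 KiB blocks (a block shift of 12 -- an
 * assumption, not the scoutfs constant), the cache tops out around
 * 2^31 * 2^12 bytes = 8 TiB of device, so larger devices had to be
 * rejected at setup.  An rbtree keyed on the full u64 blkno has no
 * such cap.
 */
static bool radix_can_index_device(loff_t size, unsigned int block_shift)
{
        return (size >> block_shift) < LONG_MAX;
}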