diff --git a/kmod/src/Makefile b/kmod/src/Makefile
index 5f23e87d..f4b293ed 100644
--- a/kmod/src/Makefile
+++ b/kmod/src/Makefile
@@ -1,3 +1,4 @@
 obj-$(CONFIG_SCOUTFS_FS) := scoutfs.o
 
-scoutfs-y += block.o dir.o inode.o item.o msg.o super.o
+scoutfs-y += block.o chunk.o crc.o dir.o inode.o item.o manifest.o msg.o \
+	      ring.o super.o
diff --git a/kmod/src/chunk.c b/kmod/src/chunk.c
new file mode 100644
index 00000000..6b5758af
--- /dev/null
+++ b/kmod/src/chunk.c
@@ -0,0 +1,53 @@
+/*
+ * Copyright (C) 2016 Versity Software, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "super.h"
+#include "format.h"
+#include "inode.h"
+#include "dir.h"
+#include "msg.h"
+#include "block.h"
+
+/*
+ * Apply one bitmap entry from the ring to the in-memory chunk allocation
+ * bitmap.  The entry carries two 64bit words which are stored, still
+ * little-endian, at word indexes 'offset' and 'offset + 1'.
+ *
+ * The offset comes straight from disk, so a word is only stored when its
+ * index falls inside the array that read_supers() sized at one u64 per
+ * 64 chunks.  Words that land outside are dropped rather than written
+ * past the end of the allocation.
+ */
+void scoutfs_set_chunk_alloc_bits(struct super_block *sb,
+				  struct scoutfs_ring_bitmap *bm)
+{
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
+	/* must match how read_supers() sized chunk_alloc_bits */
+	u64 nr_words = DIV_ROUND_UP(sbi->super.total_chunks, 64);
+	/* offset is a __le32 on disk, so it needs the 32bit conversion */
+	u64 off = le32_to_cpu(bm->offset);
+
+	/* XXX report corruption once callers can handle an error */
+
+	if (off < nr_words)
+		sbi->chunk_alloc_bits[off] = bm->bits[0];
+	if (off + 1 < nr_words)
+		sbi->chunk_alloc_bits[off + 1] = bm->bits[1];
+}
diff --git a/kmod/src/chunk.h b/kmod/src/chunk.h
new file mode 100644
index 00000000..b2cb6ff7
--- /dev/null
+++ b/kmod/src/chunk.h
@@ -0,0 +1,7 @@
+#ifndef _SCOUTFS_CHUNK_H_
+#define _SCOUTFS_CHUNK_H_
+
+void scoutfs_set_chunk_alloc_bits(struct super_block *sb,
+				  struct scoutfs_ring_bitmap *bm);
+
+#endif
diff --git a/kmod/src/format.h b/kmod/src/format.h
index d27748c0..bafaef80 100644
--- a/kmod/src/format.h
+++ b/kmod/src/format.h
@@ -24,6 +24,7 @@
 #define SCOUTFS_CHUNK_SHIFT 22
 #define SCOUTFS_CHUNK_SIZE (1 << SCOUTFS_CHUNK_SHIFT)
 #define SCOUTFS_CHUNK_BLOCK_SHIFT (SCOUTFS_CHUNK_SHIFT - SCOUTFS_BLOCK_SHIFT)
+#define SCOUTFS_CHUNK_BLOCK_MASK ((1 << SCOUTFS_CHUNK_BLOCK_SHIFT) - 1)
 #define SCOUTFS_BLOCKS_PER_CHUNK (1 << SCOUTFS_CHUNK_BLOCK_SHIFT)
 
 /*
@@ -93,6 +94,10 @@ struct scoutfs_ring_map_block {
 	__le64 blknos[0];
 } __packed;
 
+#define SCOUTFS_RING_MAP_BLOCKS \
+	((SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_ring_map_block)) / \
+		sizeof(__le64))
+
 struct scoutfs_ring_entry {
 	u8 type;
 	__le16 len;
@@ -112,15 +117,11 @@ struct scoutfs_ring_block {
 } __packed;
 
 enum {
-	SCOUTFS_RING_REMOVE_MANIFEST = 0,
-	SCOUTFS_RING_ADD_MANIFEST,
+	SCOUTFS_RING_ADD_MANIFEST = 0,
+	SCOUTFS_RING_DEL_MANIFEST,
 	SCOUTFS_RING_BITMAP,
 };
 
-struct scoutfs_ring_remove_manifest {
-	__le64 blkno;
-} __packed;
-
 /*
  * Including both keys might make the manifest too large. It might be
  * better to only include one key and infer a block's range from the
@@ -128,7 +129,7 @@ struct scoutfs_ring_remove_manifest {
  * isn't unused key space between blocks in a level. We might search
  * blocks when we didn't need to.
  */
-struct scoutfs_ring_add_manifest {
+struct scoutfs_ring_manifest_entry {
 	__le64 blkno;
 	__le64 seq;
 	__u8 level;
@@ -136,6 +137,13 @@ struct scoutfs_ring_add_manifest {
 	struct scoutfs_key last;
 } __packed;
 
+struct scoutfs_ring_del_manifest {
+	__le64 blkno;
+} __packed;
+
+/* 2^22 * 10^13 > 2^64 */
+#define SCOUTFS_MAX_LEVEL 13
+
 struct scoutfs_ring_bitmap {
 	__le32 offset;
 	__le64 bits[2];
diff --git a/kmod/src/manifest.c b/kmod/src/manifest.c
new file mode 100644
index 00000000..48fc7999
--- /dev/null
+++ b/kmod/src/manifest.c
@@ -0,0 +1,272 @@
+/*
+ * Copyright (C) 2016 Versity Software, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include
+#include
+#include
+
+#include "super.h"
+#include "format.h"
+#include "manifest.h"
+#include "key.h"
+
+/*
+ * The manifest organizes log segment blocks into a tree structure.
+ *
+ * Each level of the tree contains an ordered list of log segments whose
+ * item keys don't overlap.  The first level (level 0) of the tree is
+ * the exception whose segments can have key ranges that overlap.
+ *
+ * We also store pointers to the manifest entries in a radix tree
+ * indexed by their block number so that we can easily find existing
+ * entries for deletion.  A node stays in the radix from the time it is
+ * added until it is deleted or the manifest is destroyed.
+ *
+ * Level 0 segments are stored in the list with the most recent at the
+ * head of the list.  Level 0's rb tree will always be empty.
+ */
+struct scoutfs_manifest {
+	spinlock_t lock;
+
+	struct radix_tree_root blkno_radix;
+	struct list_head level_zero;
+
+	struct scoutfs_level {
+		struct rb_root root;
+	} levels[SCOUTFS_MAX_LEVEL + 1];
+};
+
+struct scoutfs_manifest_node {
+	struct rb_node node;
+	struct list_head head;
+
+	struct scoutfs_ring_manifest_entry ment;
+};
+
+/*
+ * Link a node into a level's rb tree, ordered by the first key of its
+ * segment.  A node whose first key compares equal to an existing node
+ * is placed to its right.
+ */
+static void insert_mnode(struct rb_root *root,
+			 struct scoutfs_manifest_node *ins)
+{
+	struct rb_node **node = &root->rb_node;
+	struct scoutfs_manifest_node *mnode;
+	struct rb_node *parent = NULL;
+	int cmp;
+
+	while (*node) {
+		parent = *node;
+		mnode = rb_entry(*node, struct scoutfs_manifest_node, node);
+
+		cmp = scoutfs_key_cmp(&ins->ment.first, &mnode->ment.first);
+		if (cmp < 0)
+			node = &(*node)->rb_left;
+		else
+			node = &(*node)->rb_right;
+	}
+
+	rb_link_node(&ins->node, parent, node);
+	rb_insert_color(&ins->node, root);
+}
+
+/*
+ * Find the node for a block and unlink it from its level's list or rb
+ * tree.  The node is left in the blkno radix: the caller either links
+ * it back in at a new level or removes it from the radix and frees it.
+ * Returns NULL if the block isn't in the manifest.  Callers hold the
+ * manifest lock.
+ */
+static struct scoutfs_manifest_node *delete_mnode(struct scoutfs_manifest *mani,
+						  u64 blkno)
+{
+	struct scoutfs_manifest_node *mnode;
+
+	mnode = radix_tree_lookup(&mani->blkno_radix, blkno);
+	if (mnode) {
+		if (!list_empty(&mnode->head))
+			list_del_init(&mnode->head);
+		if (!RB_EMPTY_NODE(&mnode->node)) {
+			rb_erase(&mnode->node,
+				 &mani->levels[mnode->ment.level].root);
+			RB_CLEAR_NODE(&mnode->node);
+		}
+	}
+
+	return mnode;
+}
+
+/*
+ * This is called during ring replay.  Because of the way the ring works
+ * we can get deletion entries for segments that we don't yet have
+ * in the replayed ring state.
+ *
+ * The node is removed from the radix under the lock before it is freed
+ * so that neither later lookups nor scoutfs_destroy_manifest() can find
+ * a freed node.
+ */
+void scoutfs_delete_manifest(struct super_block *sb, u64 blkno)
+{
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
+	struct scoutfs_manifest *mani = sbi->mani;
+	struct scoutfs_manifest_node *mnode;
+
+	spin_lock(&mani->lock);
+	mnode = delete_mnode(mani, blkno);
+	if (mnode)
+		radix_tree_delete(&mani->blkno_radix, blkno);
+	spin_unlock(&mani->lock);
+
+	kfree(mnode);
+}
+
+/*
+ * This is called during ring replay to reconstruct the manifest state
+ * from the ring entries.  Moving segments between levels is recorded
+ * with a single ring entry so we always try to look up the segment in
+ * the manifest before we add it to the manifest.
+ *
+ * A segment that isn't in the manifest yet gets a new node which is
+ * also inserted into the blkno radix so that later moves, deletions,
+ * and scoutfs_destroy_manifest() can find it.  The node and the radix
+ * preload are allocated with the lock dropped because they can sleep.
+ *
+ * Returns 0 on success, -EIO if the entry's level is out of range, or
+ * a negative errno if allocation or radix insertion fails.
+ */
+int scoutfs_add_manifest(struct super_block *sb,
+			 struct scoutfs_ring_manifest_entry *ment)
+{
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
+	struct scoutfs_manifest *mani = sbi->mani;
+	struct scoutfs_manifest_node *mnode;
+	u64 blkno = le64_to_cpu(ment->blkno);
+	bool preloaded = false;
+	int ret;
+
+	/* the level indexes mani->levels[] and comes straight from disk */
+	if (ment->level > SCOUTFS_MAX_LEVEL)
+		return -EIO;
+
+	spin_lock(&mani->lock);
+
+	mnode = delete_mnode(mani, blkno);
+	if (!mnode) {
+		spin_unlock(&mani->lock);
+
+		mnode = kmalloc(sizeof(struct scoutfs_manifest_node),
+				GFP_NOFS);
+		if (!mnode)
+			return -ENOMEM; /* XXX hmm, fatal? prealloc?*/
+
+		INIT_LIST_HEAD(&mnode->head);
+		RB_CLEAR_NODE(&mnode->node);
+
+		ret = radix_tree_preload(GFP_NOFS);
+		if (ret) {
+			kfree(mnode);
+			return ret;
+		}
+		preloaded = true;
+
+		spin_lock(&mani->lock);
+
+		/* -EEXIST if another add got in while the lock was dropped */
+		ret = radix_tree_insert(&mani->blkno_radix, blkno, mnode);
+		if (ret) {
+			spin_unlock(&mani->lock);
+			radix_tree_preload_end();
+			kfree(mnode);
+			return ret;
+		}
+	}
+
+	mnode->ment = *ment;
+	if (ment->level)
+		insert_mnode(&mani->levels[ment->level].root, mnode);
+	else
+		list_add(&mnode->head, &mani->level_zero);
+
+	spin_unlock(&mani->lock);
+	if (preloaded)
+		radix_tree_preload_end();
+
+	return 0;
+}
+
+/*
+ * Allocate and initialize an empty manifest and store it in the sb
+ * info.  Returns 0 on success or -ENOMEM.
+ */
+int scoutfs_setup_manifest(struct super_block *sb)
+{
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
+	struct scoutfs_manifest *mani;
+	int i;
+
+	mani = kmalloc(sizeof(struct scoutfs_manifest), GFP_KERNEL);
+	if (!mani)
+		return -ENOMEM;
+
+	spin_lock_init(&mani->lock);
+	/* insertion runs under the spinlock and relies on preloading */
+	INIT_RADIX_TREE(&mani->blkno_radix, GFP_ATOMIC);
+	INIT_LIST_HEAD(&mani->level_zero);
+
+	for (i = 0; i < ARRAY_SIZE(mani->levels); i++)
+		mani->levels[i].root = RB_ROOT;
+
+	sbi->mani = mani;
+
+	return 0;
+}
+
+/*
+ * This is called once the manifest will no longer be used.  We iterate
+ * over the blkno radix deleting radix entries and freeing manifest
+ * nodes.
+ *
+ * scoutfs_kill_sb() calls this unconditionally, so it has to cope with
+ * a manifest that was never set up.
+ */
+void scoutfs_destroy_manifest(struct super_block *sb)
+{
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
+	struct scoutfs_manifest *mani;
+	struct scoutfs_manifest_node *mnodes[16];
+	unsigned long first_index = 0;
+	int ret;
+	int i;
+
+	if (!sbi || !sbi->mani)
+		return;
+	mani = sbi->mani;
+
+	for (;;) {
+		ret = radix_tree_gang_lookup(&mani->blkno_radix,
+					     (void **)mnodes, first_index,
+					     ARRAY_SIZE(mnodes));
+		if (!ret)
+			break;
+
+		for (i = 0; i < ret; i++) {
+			first_index = le64_to_cpu(mnodes[i]->ment.blkno);
+			radix_tree_delete(&mani->blkno_radix, first_index);
+			kfree(mnodes[i]);
+		}
+		first_index++;
+	}
+
+	kfree(sbi->mani);
+	sbi->mani = NULL;
+}
diff --git a/kmod/src/manifest.h b/kmod/src/manifest.h
new file mode 100644
index 00000000..a7685c5d
--- /dev/null
+++ b/kmod/src/manifest.h
@@ -0,0 +1,11 @@
+#ifndef _SCOUTFS_MANIFEST_H_
+#define _SCOUTFS_MANIFEST_H_
+
+int scoutfs_setup_manifest(struct super_block *sb);
+void scoutfs_destroy_manifest(struct super_block *sb);
+
+int scoutfs_add_manifest(struct super_block *sb,
+			 struct scoutfs_ring_manifest_entry *ment);
+void scoutfs_delete_manifest(struct super_block *sb, u64 blkno);
+
+#endif
diff --git a/kmod/src/ring.c b/kmod/src/ring.c
new file mode 100644
index 00000000..aeee472b
--- /dev/null
+++ b/kmod/src/ring.c
@@ -0,0 +1,174 @@
+/*
+ * Copyright (C) 2016 Versity Software, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include
+#include
+#include
+
+#include "format.h"
+#include "dir.h"
+#include "inode.h"
+#include "key.h"
+#include "item.h"
+#include "super.h"
+#include "manifest.h"
+#include "chunk.h"
+#include "block.h"
+
+/*
+ * Replay the entries in one ring block into the manifest and the chunk
+ * allocation bitmap.
+ *
+ * Each entry is a header followed by 'len' bytes of payload.  The entry
+ * count and lengths come from disk, so every entry is checked against
+ * the end of the buffer before it is used.  Entries with unknown types
+ * are skipped.
+ *
+ * Returns 0 on success, -EIO if an entry doesn't fit in the block or is
+ * too short for its type, or the error from a failed manifest addition.
+ * Replay stops at the first error.
+ */
+static int replay_ring_block(struct super_block *sb, struct buffer_head *bh)
+{
+	struct scoutfs_ring_block *ring = (void *)bh->b_data;
+	struct scoutfs_ring_entry *ent = (void *)(ring + 1);
+	void *end = (void *)bh->b_data + bh->b_size;
+	struct scoutfs_ring_manifest_entry *ment;
+	struct scoutfs_ring_del_manifest *del;
+	struct scoutfs_ring_bitmap *bm;
+	unsigned int len;
+	int ret;
+	int i;
+
+	/* XXX verify */
+
+	for (i = 0; i < le16_to_cpu(ring->nr_entries); i++) {
+		/* both the header and its payload have to be in the block */
+		if ((void *)(ent + 1) > end)
+			return -EIO;
+		len = le16_to_cpu(ent->len);
+		if (len > end - (void *)(ent + 1))
+			return -EIO;
+
+		switch(ent->type) {
+		case SCOUTFS_RING_ADD_MANIFEST:
+			if (len < sizeof(*ment))
+				return -EIO;
+			ment = (void *)(ent + 1);
+			ret = scoutfs_add_manifest(sb, ment);
+			if (ret)
+				return ret;
+			break;
+		case SCOUTFS_RING_DEL_MANIFEST:
+			if (len < sizeof(*del))
+				return -EIO;
+			del = (void *)(ent + 1);
+			scoutfs_delete_manifest(sb, le64_to_cpu(del->blkno));
+			break;
+		case SCOUTFS_RING_BITMAP:
+			if (len < sizeof(*bm))
+				return -EIO;
+			bm = (void *)(ent + 1);
+			scoutfs_set_chunk_alloc_bits(sb, bm);
+			break;
+		default:
+			/* XXX */
+			break;
+		}
+
+		ent = (void *)(ent + 1) + len;
+	}
+
+	return 0;
+}
+
+/*
+ * Read a given logical ring block.
+ *
+ * Each ring map block entry maps a chunk's worth of ring blocks.  The
+ * logical block number is split into a chunk index and a block offset
+ * within the chunk.  The chunk index selects a map block, counted from
+ * the super's ring_map_blkno, and the entry within it that holds the
+ * chunk's first block number.
+ *
+ * Returns NULL if the map block can't be read, otherwise the result of
+ * reading the ring block.  The caller drops the reference with brelse().
+ */
+static struct buffer_head *read_ring_block(struct super_block *sb, u64 block)
+{
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
+	struct scoutfs_super_block *super = &sbi->super;
+	struct scoutfs_ring_map_block *map;
+	struct buffer_head *bh;
+	u64 ring_chunk;
+	u32 ring_block;
+	u64 blkno;
+	u64 div;
+	u32 rem;
+
+	ring_block = block & SCOUTFS_CHUNK_BLOCK_MASK;
+	ring_chunk = block >> SCOUTFS_CHUNK_BLOCK_SHIFT;
+
+	div = div_u64_rem(ring_chunk, SCOUTFS_RING_MAP_BLOCKS, &rem);
+
+	bh = scoutfs_read_block(sb, le64_to_cpu(super->ring_map_blkno) + div);
+	if (!bh)
+		return NULL;
+
+	/* XXX verify map block */
+
+	map = (void *)bh->b_data;
+	blkno = le64_to_cpu(map->blknos[rem]) + ring_block;
+	brelse(bh);
+
+	return scoutfs_read_block(sb, blkno);
+}
+
+/*
+ * Replay every active ring block, starting at the super's
+ * ring_first_block and wrapping to block 0 at ring_total_blocks, to
+ * rebuild the manifest and the chunk allocation bitmap.
+ *
+ * Returns 0 on success, including when the ring has no active blocks,
+ * -EIO if a ring block can't be read, or the error from replaying a
+ * block.
+ */
+int scoutfs_replay_ring(struct super_block *sb)
+{
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
+	struct scoutfs_super_block *super = &sbi->super;
+	struct buffer_head *bh;
+	u64 block;
+	u64 i;
+	int ret = 0;
+
+	/* XXX read-ahead map blocks and each set of ring blocks */
+
+	block = le64_to_cpu(super->ring_first_block);
+	for (i = 0; i < le64_to_cpu(super->ring_active_blocks); i++) {
+		bh = read_ring_block(sb, block);
+		if (!bh) {
+			ret = -EIO;
+			break;
+		}
+
+		ret = replay_ring_block(sb, bh);
+		brelse(bh);
+		if (ret)
+			break;
+
+		if (++block == le64_to_cpu(super->ring_total_blocks))
+			block = 0;
+	}
+
+	return ret;
+}
diff --git a/kmod/src/ring.h b/kmod/src/ring.h
new file mode 100644
index 00000000..b50b67e3
--- /dev/null
+++ b/kmod/src/ring.h
@@ -0,0 +1,6 @@
+#ifndef _SCOUTFS_RING_H_
+#define _SCOUTFS_RING_H_
+
+int scoutfs_replay_ring(struct super_block *sb);
+
+#endif
diff --git a/kmod/src/super.c b/kmod/src/super.c
index 27e12c52..3f9d001f 100644
--- a/kmod/src/super.c
+++ b/kmod/src/super.c
@@ -24,6 +24,8 @@
 #include "dir.h"
 #include "msg.h"
 #include "block.h"
+#include "manifest.h"
+#include "ring.h"
 
 static const struct super_operations scoutfs_super_ops = {
 	.alloc_inode = scoutfs_alloc_inode,
@@ -35,6 +37,7 @@ static int read_supers(struct super_block *sb)
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct scoutfs_super_block *super;
 	struct buffer_head *bh = NULL;
+	unsigned long bytes;
 	int found = -1;
 	int i;
 
@@ -80,6 +83,17 @@ static int read_supers(struct super_block *sb)
 	atomic64_set(&sbi->next_ino, SCOUTFS_ROOT_INO + 1);
 	atomic64_set(&sbi->next_blkno, 2);
 
+	/* Initialize all the sb info fields which depend on the supers. */
+
+	/* XXX nothing in this patch vfree()s this; scoutfs_kill_sb() should */
+	bytes = DIV_ROUND_UP(sbi->super.total_chunks, 64) * sizeof(u64);
+	sbi->chunk_alloc_bits = vmalloc(bytes);
+	if (!sbi->chunk_alloc_bits)
+		return -ENOMEM;
+
+	/* the alloc bits default to all free then ring entries update them */
+	memset(sbi->chunk_alloc_bits, 0xff, bytes);
+
 	return 0;
 }
 
@@ -111,6 +125,14 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
 	if (ret)
 		return ret;
 
+	ret = scoutfs_setup_manifest(sb);
+	if (ret)
+		return ret;
+
+	ret = scoutfs_replay_ring(sb);
+	if (ret)
+		return ret;
+
 	inode = scoutfs_iget(sb, SCOUTFS_ROOT_INO);
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
@@ -130,6 +152,7 @@ static struct dentry *scoutfs_mount(struct file_system_type *fs_type, int flags,
 
 static void scoutfs_kill_sb(struct super_block *sb)
 {
+	scoutfs_destroy_manifest(sb);
 	kill_block_super(sb);
 	kfree(sb->s_fs_info);
 }
diff --git a/kmod/src/super.h b/kmod/src/super.h
index 538dd773..d7229877 100644
--- a/kmod/src/super.h
+++ b/kmod/src/super.h
@@ -4,6 +4,8 @@
 #include
 #include "format.h"
 
+struct scoutfs_manifest;
+
 struct scoutfs_sb_info {
 	struct scoutfs_super_block super;
 
@@ -13,6 +15,10 @@ struct scoutfs_sb_info {
 	spinlock_t item_lock;
 	struct rb_root item_root;
 	struct rb_root dirty_item_root;
+
+	struct scoutfs_manifest *mani;
+
+	__le64 *chunk_alloc_bits;
 };
 
 static inline struct scoutfs_sb_info *SCOUTFS_SB(struct super_block *sb)