mkfs LSM segment and ring structures

Make a new file system by writing a root inode in a segment and storing
a manifest entry in the ring that references the segment.

Signed-off-by: Zach Brown <zab@versity.com>
This commit is contained in:
Zach Brown
2016-12-03 12:17:47 -08:00
parent 9d3fe27929
commit c96b833a36
2 changed files with 192 additions and 313 deletions

View File

@@ -6,9 +6,23 @@
/* super block id */
#define SCOUTFS_SUPER_ID 0x2e736674756f6373ULL /* "scoutfs." */
/*
* The super block and ring blocks are fixed 4k.
*/
#define SCOUTFS_BLOCK_SHIFT 12
#define SCOUTFS_BLOCK_SIZE (1 << SCOUTFS_BLOCK_SHIFT)
#define SCOUTFS_BLOCK_MASK (SCOUTFS_BLOCK_SIZE - 1)
#define SCOUTFS_BLOCKS_PER_PAGE (PAGE_SIZE / SCOUTFS_BLOCK_SIZE)
/*
* FS data is stored in segments, for now they're fixed size. They'll
* be dynamic.
*/
#define SCOUTFS_SEGMENT_SHIFT 20
#define SCOUTFS_SEGMENT_SIZE (1 << SCOUTFS_SEGMENT_SHIFT)
#define SCOUTFS_SEGMENT_MASK (SCOUTFS_SEGMENT_SIZE - 1)
#define SCOUTFS_SEGMENT_PAGES (SCOUTFS_SEGMENT_SIZE / PAGE_SIZE)
#define SCOUTFS_SEGMENT_BLOCKS (SCOUTFS_SEGMENT_SIZE / SCOUTFS_BLOCK_SIZE)
#define SCOUTFS_PAGES_PER_BLOCK (SCOUTFS_BLOCK_SIZE / PAGE_SIZE)
#define SCOUTFS_BLOCK_PAGE_ORDER (SCOUTFS_BLOCK_SHIFT - PAGE_SHIFT)
@@ -23,6 +37,8 @@
#define SCOUTFS_MAX_TRANS_BLOCKS (128 * 1024 * 1024 / SCOUTFS_BLOCK_SIZE)
#define SCOUTFS_MAX_KEY_BYTES 255
/*
* This header is found at the start of every block so that we can
* verify that it's what we were looking for. The crc and padding
@@ -37,6 +53,67 @@ struct scoutfs_block_header {
__le64 blkno;
} __packed;
/*
 * Header found at the start of every entry stored in a ring block.
 * Specific entry types (e.g. struct scoutfs_ring_add_manifest) embed
 * this as their first member.
 */
struct scoutfs_ring_entry_header {
/* which kind of entry follows, e.g. SCOUTFS_RING_ADD_MANIFEST */
__u8 type;
/*
 * length of the entry in bytes, little-endian.
 * NOTE(review): mkfs stores sizeof() of the fixed struct here, without
 * the trailing key bytes -- confirm which length readers expect.
 */
__le16 len;
} __packed;
#define SCOUTFS_RING_ADD_MANIFEST 1
/*
 * Ring entry that adds a manifest entry referencing a segment.  The
 * fixed fields are followed directly by the bytes of the first key and
 * then the bytes of the last key.
 */
struct scoutfs_ring_add_manifest {
/* common entry header; type is SCOUTFS_RING_ADD_MANIFEST */
struct scoutfs_ring_entry_header eh;
/* number of the segment this manifest entry references */
__le64 segno;
/* sequence number recorded for the entry */
__le64 seq;
/* byte length of the first key that trails the struct */
__le16 first_key_len;
/* byte length of the last key that trails the first key */
__le16 last_key_len;
/* presumably the manifest level of the segment -- TODO confirm */
__u8 level;
/* first and last key bytes */
} __packed;
/*
* This is absurdly huge. If there was only ever 1 item per segment and
* 2^64 items the tree could get this deep.
*/
#define SCOUTFS_MANIFEST_MAX_LEVEL 20
/*
 * A ring block: the common block header, a count of entries, and then
 * the entries themselves packed after the count.
 */
struct scoutfs_ring_block {
/* common block header shared by all blocks */
struct scoutfs_block_header hdr;
/* number of entries stored in this block, little-endian */
__le32 nr_entries;
/*
 * Address of the first entry.  Entries such as add_manifest carry
 * trailing key bytes, so this is presumably walked by entry length
 * rather than indexed as a fixed-size array -- TODO confirm.
 */
struct scoutfs_ring_entry_header entries[0];
} __packed;
/*
 * Describes one item stored in a segment: where its key and value
 * bytes live and how long they are.
 */
struct scoutfs_segment_item {
/* sequence number of the item */
__le64 seq;
/* byte offset of the key; mkfs computes it from the start of the segment */
__le32 key_off;
/* byte offset of the value; mkfs computes it from the start of the segment */
__le32 val_off;
/* length of the key in bytes */
__le16 key_len;
/* length of the value in bytes */
__le16 val_len;
} __packed;
/*
* Each large segment starts with a segment block that describes the
* rest of the blocks that make up the segment.
*/
struct scoutfs_segment_block {
/* checksum; NOTE(review): what it covers is not visible here -- confirm */
__le32 crc;
/* explicit padding so the following 64bit fields stay aligned */
__le32 _padding;
/* number of this segment */
__le64 segno;
/* presumably the greatest item seq stored in the segment -- TODO confirm */
__le64 max_seq;
/* number of scoutfs_segment_item structs that follow */
__le32 nr_items;
/* item array with gaps so they don't cross 4k blocks */
/* packed keys */
/* packed vals */
} __packed;
/*
 * The first block in the segment has the header and items: this is the
 * number of whole item structs that fit in a 4k block after the segment
 * block header.  The division truncates, any remainder goes unused.
 */
#define SCOUTFS_SEGMENT_FIRST_BLOCK_ITEMS \
((SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_segment_block)) / \
sizeof(struct scoutfs_segment_item))
/*
 * The rest of the header blocks are full of items: the number of whole
 * item structs that fit in a 4k block with no header.
 */
#define SCOUTFS_SEGMENT_ITEMS_PER_BLOCK \
(SCOUTFS_BLOCK_SIZE / sizeof(struct scoutfs_segment_item))
/*
* Block references include the sequence number so that we can detect
* readers racing with writers and so that we can tell that we don't
@@ -97,7 +174,7 @@ struct scoutfs_buddy_root {
*/
struct scoutfs_key {
__le64 inode;
__u8 type;
u8 type;
__le64 offset;
} __packed;
@@ -118,8 +195,13 @@ struct scoutfs_key {
#define SCOUTFS_MAX_ITEM_LEN 512
/*
 * Key that identifies an inode item.
 */
struct scoutfs_inode_key {
/* key type, SCOUTFS_INODE_KEY for inode items */
__u8 type;
/*
 * inode number, stored big-endian unlike most fields in this format.
 * NOTE(review): presumably so that byte-wise key comparison sorts by
 * inode number -- confirm.
 */
__be64 ino;
} __packed;
struct scoutfs_btree_root {
__u8 height;
u8 height;
struct scoutfs_block_ref ref;
} __packed;
@@ -180,6 +262,11 @@ struct scoutfs_btree_item {
#define SCOUTFS_UUID_BYTES 16
/*
* The ring fields describe the statically allocated ring log. The
* head and tail indexes are logical 4k block offsets inside the ring.
* The head block should contain the seq.
*/
struct scoutfs_super_block {
struct scoutfs_block_header hdr;
__le64 id;
@@ -187,6 +274,11 @@ struct scoutfs_super_block {
__le64 next_ino;
__le64 total_blocks;
__le64 free_blocks;
__le64 ring_blkno;
__le64 ring_blocks;
__le64 ring_head_index;
__le64 ring_tail_index;
__le64 ring_head_seq;
__le64 buddy_blocks;
struct scoutfs_buddy_root buddy_root;
struct scoutfs_btree_root btree_root;

View File

@@ -43,278 +43,70 @@ static int write_block(int fd, u64 blkno, struct scoutfs_super_block *super,
return 0;
}
/* Return SCOUTFS_BUDDY_BLKNO advanced past the super's count of buddy blocks. */
static u64 first_blkno(struct scoutfs_super_block *super)
{
	u64 nr_buddy = le64_to_cpu(super->buddy_blocks);

	return SCOUTFS_BUDDY_BLKNO + nr_buddy;
}
/* Return the block number of the final block counted by the super. */
static u64 last_blkno(struct scoutfs_super_block *super)
{
	u64 total = le64_to_cpu(super->total_blocks);

	return total - 1;
}
/*
 * Return the bit offset in the block bitmap at which the bitmap for the
 * given order begins.  Order 0 begins at bit 0.
 */
static int order_off(int order)
{
	return order == 0 ? 0 :
	       (2 * SCOUTFS_BUDDY_ORDER0_BITS) -
	       (SCOUTFS_BUDDY_ORDER0_BITS / (1 << (order - 1)));
}
/*
 * Return the bit offset in the block bitmap of bit 'nr' within the
 * given order's bitmap.
 */
static int order_nr(int order, int nr)
{
	int base = order_off(order);

	return base + nr;
}
/*
 * Record that 'nr' is now set in the given order: lower first_set for
 * the order if 'nr' is at or before the current first set index.
 */
static void set_order_nr(struct scoutfs_buddy_block *bud, int order, u16 nr)
{
	u16 lowest = le16_to_cpu(bud->first_set[order]);

	if (lowest < nr)
		return;

	bud->first_set[order] = cpu_to_le16(nr);
}
/*
 * Record that 'nr' was cleared in the given order.  Nothing changes
 * unless 'nr' was the first set index, in which case first_set is
 * advanced to the next set index, or U16_MAX if there is none.
 */
static void clear_order_nr(struct scoutfs_buddy_block *bud, int order, u16 nr)
{
	u16 lowest = le16_to_cpu(bud->first_set[order]);
	int limit;
	int next;

	if (lowest != nr)
		return;

	if (!bud->level) {
		/* level 0: search the rest of this order's region of the bitmap */
		limit = order_off(order + 1);
		next = find_next_bit_le(bud->bits, limit,
					order_nr(order, lowest) + 1);
		if (next < limit)
			next -= order_off(order);
		else
			next = U16_MAX;
	} else {
		/* other levels: search later slots for one with this order free */
		next = nr + 1;
		while (next < SCOUTFS_BUDDY_SLOTS &&
		       !(le16_to_cpu(bud->slots[next].free_orders) &
			 (1 << order)))
			next++;
		if (next == SCOUTFS_BUDDY_SLOTS)
			next = U16_MAX;
	}

	bud->first_set[order] = cpu_to_le16(next);
}
/*
 * Iterate over every bit that differs between 'old' and 'new', lowest
 * bit first.  On each iteration 'nr' is the bit number and 'bit' is the
 * corresponding mask (1 << nr).  'tmp' is caller-provided scratch that
 * holds the bits still to be visited.
 *
 * Every argument is parenthesized so that expression arguments such as
 * 'a | b' are combined as a whole rather than being split by the
 * higher-precedence '^'.
 */
#define for_each_changed_bit(nr, bit, old, new, tmp)			\
	for ((tmp) = (old) ^ (new);					\
	     (tmp) && ((nr) = ffs(tmp) - 1, (bit) = 1 << (nr), 1);	\
	     (tmp) ^= (bit))
/*
* Set a slot's free_orders value and update first_set for each order
* that it changes. Returns true if the slot's free_orders was changed.
* Figure out how many blocks the ring will need. This goes crazy
* with the variables to make the calculation clear.
*
* XXX just a place holder. The real calculation is more like:
*
* - max size add manifest entries for all segments
* - (some day) allocator entries for all segments
* - ring block header overhead
* - ring block unused tail space overhead
*
*/
static int set_slot_free_orders(struct scoutfs_buddy_block *bud, u16 sl,
u16 free_orders)
static u64 calc_ring_blocks(u64 size)
{
u16 old = le16_to_cpu(bud->slots[sl].free_orders);
int order;
int tmp;
int bit;
u64 first_seg_blocks;
u64 max_entry_bytes;
u64 total_bytes;
u64 blocks;
u64 segs;
if (old == free_orders)
return 0;
segs = size >> SCOUTFS_SEGMENT_SHIFT;
max_entry_bytes = sizeof(struct scoutfs_ring_add_manifest) +
(2 * SCOUTFS_MAX_KEY_BYTES);
total_bytes = (segs * max_entry_bytes) * 4;
blocks = DIV_ROUND_UP(total_bytes, SCOUTFS_BLOCK_SIZE);
for_each_changed_bit(order, bit, old, free_orders, tmp) {
if (old & bit)
clear_order_nr(bud, order, sl);
else
set_order_nr(bud, order, sl);
}
first_seg_blocks = SCOUTFS_SEGMENT_BLOCKS -
(SCOUTFS_SUPER_BLKNO + SCOUTFS_SUPER_NR);
bud->slots[sl].free_orders = cpu_to_le16(free_orders);
return 1;
}
/* Test bit 'nr' of the given order in the block's bitmap. */
static int test_buddy_bit(struct scoutfs_buddy_block *bud, int order, int nr)
{
	int bit = order_nr(order, nr);

	return test_bit_le(bit, bud->bits);
}
/*
 * Set bit 'nr' of the given order in the block's bitmap.  first_set is
 * only updated when the bit was previously clear.
 */
static void set_buddy_bit(struct scoutfs_buddy_block *bud, int order, int nr)
{
	int bit = order_nr(order, nr);

	if (test_and_set_bit_le(bit, bud->bits))
		return;

	set_order_nr(bud, order, nr);
}
/*
 * Clear bit 'nr' of the given order in the block's bitmap.  first_set
 * is only updated when the bit was previously set.
 */
static void clear_buddy_bit(struct scoutfs_buddy_block *bud, int order, int nr)
{
	int bit = order_nr(order, nr);

	if (!test_and_clear_bit_le(bit, bud->bits))
		return;

	clear_order_nr(bud, order, nr);
}
/*
 * Free bit 'nr' at the given order.  While the buddy of the bit is
 * also set, clear the buddy and move up an order; finally set the bit
 * at the order where merging stopped.
 */
static void free_order_bit(struct scoutfs_buddy_block *bud, int order, int nr)
{
	int cur = order;

	while (cur < SCOUTFS_BUDDY_ORDERS - 2 &&
	       test_buddy_bit(bud, cur, nr ^ 1)) {
		clear_buddy_bit(bud, cur, nr ^ 1);
		nr >>= 1;
		cur++;
	}

	set_buddy_bit(bud, cur, nr);
}
/*
 * Build a bitmask with a bit set for every order whose first_set entry
 * in the block is not U16_MAX.
 */
static u16 calc_free_orders(struct scoutfs_buddy_block *bud)
{
	u16 mask = 0;
	int order = 0;

	while (order < SCOUTFS_BUDDY_ORDERS) {
		if (le16_to_cpu(bud->first_set[order]) != U16_MAX)
			mask |= 1 << order;
		order++;
	}

	return mask;
}
static void init_buddy_block(struct scoutfs_buddy_block *bud, int level)
{
int i;
memset(bud, 0, SCOUTFS_BLOCK_SIZE);
for (i = 0; i < array_size(bud->first_set); i++)
bud->first_set[i] = cpu_to_le16(U16_MAX);
bud->level = level;
return max(first_seg_blocks, blocks);
}
/*
* Write either the left-most or right-most buddy bitmap leaf in the
* allocator and then ascend writing parent blocks to the root.
*
* If we're writing the left leaf then blk is the first free blk. If
* we're writing the right leaf then blk is the last usable blk.
*
* If we're writing the left leaf then we don't actually write the root
* block. We record the free_orders for the first child block from the
* root block. When we write the right leaf we'll ascend into the root
* block and initialize the free_order of the first slot for the path to
* the left leaf.
*
* We initialize free_orders in all the unused slots so that the kernel
* can try to descend into them when searching by size and will
* initialize new full blocks.
* Make a new file system by writing:
* - super blocks
* - ring block with manifest entry
* - segment with root inode
*/
/*
 * Write one leaf buddy block and then its parent blocks, ascending
 * towards the root.
 *
 * fd, super: passed through to write_block() for every block written.
 * binf: supplies the starting block number of each level (blknos[]) and
 *       the height of the tree.
 * bud: caller-provided block buffer, re-initialized for every level.
 * blk: block index inside the allocator; the first free bit when 'left'
 *      is set, otherwise the last usable bit.
 * left: non-zero to write the left-most path (stops before the root),
 *       zero to write the right-most path (includes the root).
 * free_orders: read when the right path reaches the root to fill slot 0;
 *              on success set to the free orders of the last block written.
 *
 * Returns 0 on success or the non-zero return of write_block().
 */
static int write_buddy_blocks(int fd, struct scoutfs_super_block *super,
struct buddy_info *binf,
struct scoutfs_buddy_block *bud, u64 blk,
int left, u16 *free_orders)
{
u64 blkno;
int level;
int first;
int last;
int ret;
u16 free;
u16 full;
int sl;
int i;
/* pick the range of order 0 bits in the leaf that are free */
if (left) {
first = blk;
last = SCOUTFS_BUDDY_ORDER0_BITS - 1;
} else {
first = 0;
last = blk % SCOUTFS_BUDDY_ORDER0_BITS;
}
/* write the leaf block */
level = 0;
init_buddy_block(bud, level);
/* freeing each bit in turn merges buddies up into higher orders */
for (i = first; i <= last; i++)
free_order_bit(bud, 0, i);
/* from here on blk is the index of the block within its level */
blk = blk / SCOUTFS_BUDDY_ORDER0_BITS;
/*
 * NOTE(review): two block numbers are reserved per buddy block index;
 * presumably each block has a pair of on-disk locations -- confirm.
 */
blkno = binf->blknos[level] + (blk * 2);
ret = write_block(fd, blkno, super, &bud->hdr);
if (ret)
return ret;
free = calc_free_orders(bud);
/*
 * NOTE(review): 'full' is used as a free_orders bitmask below but is
 * assigned a bit count; looks like it relies on ORDER0_BITS being a
 * single-bit value that marks only the highest order -- confirm.
 */
full = SCOUTFS_BUDDY_ORDER0_BITS;
/* write parents, stopping before root if left */
while (++level < (left ? binf->height - 1 : binf->height)) {
/* slot in this parent that references the child we just wrote */
sl = blk % SCOUTFS_BUDDY_SLOTS;
blk = blk / SCOUTFS_BUDDY_SLOTS;
blkno = binf->blknos[level] + (blk * 2);
init_buddy_block(bud, level);
/* set full until right spine, 0th in root from left */
for (i = 0; i < sl; i++)
set_slot_free_orders(bud, i, full);
if (!left && level == (binf->height - 1)) {
set_slot_free_orders(bud, 0, *free_orders);
bud->slots[0].seq = super->hdr.seq;
}
set_slot_free_orders(bud, sl, free);
bud->slots[sl].seq = super->hdr.seq;
/* init full slots in full parents down the left spine */
/*
 * NOTE(review): this starts at sl, so on the left path it overwrites
 * the 'free' value just stored in slot sl with 'full'; sl + 1 may have
 * been intended -- confirm.
 */
for (i = sl; left && i < SCOUTFS_BUDDY_SLOTS; i++)
set_slot_free_orders(bud, i, full);
ret = write_block(fd, blkno, super, &bud->hdr);
if (ret)
return ret;
free = calc_free_orders(bud);
}
/* hand the free orders of the top-most block written back to the caller */
*free_orders = free;
return 0;
}
static int write_new_fs(char *path, int fd)
{
struct scoutfs_super_block *super;
struct scoutfs_inode_key *ikey;
struct scoutfs_inode *inode;
struct scoutfs_btree_block *bt;
struct scoutfs_btree_item *item;
struct scoutfs_key root_key;
struct buddy_info binf;
struct scoutfs_segment_block *sblk;
struct scoutfs_ring_block *ring;
struct scoutfs_segment_item *item;
struct scoutfs_ring_add_manifest *am;
struct timeval tv;
char uuid_str[37];
unsigned int i;
u64 limit;
u64 size;
u64 blkno;
u64 count;
u64 total_blocks;
u16 free_orders;
void *buf;
u64 ring_blocks;
int ret;
gettimeofday(&tv, NULL);
buf = malloc(SCOUTFS_BLOCK_SIZE);
super = malloc(SCOUTFS_BLOCK_SIZE);
if (!buf || !super) {
super = calloc(1, SCOUTFS_BLOCK_SIZE);
ring = calloc(1, SCOUTFS_BLOCK_SIZE);
sblk = calloc(1, SCOUTFS_SEGMENT_SIZE);
if (!super || !ring || !sblk) {
ret = -errno;
fprintf(stderr, "failed to allocate a block: %s (%d)\n",
fprintf(stderr, "failed to allocate block mem: %s (%d)\n",
strerror(errno), errno);
goto out;
}
@@ -326,14 +118,16 @@ static int write_new_fs(char *path, int fd)
goto out;
}
/* the block limit is totally arbitrary */
/* require space for one segment */
limit = SCOUTFS_SEGMENT_SIZE * 2;
if (size < limit) {
fprintf(stderr, "%llu byte device too small for min %llu byte fs\n",
size, limit);
goto out;
}
total_blocks = size / SCOUTFS_BLOCK_SIZE;
buddy_init(&binf, total_blocks);
root_key.inode = cpu_to_le64(SCOUTFS_ROOT_INO);
root_key.type = SCOUTFS_INODE_KEY;
root_key.offset = 0;
ring_blocks = calc_ring_blocks(size);
/* first initialize the super so we can use it to build structures */
memset(super, 0, SCOUTFS_BLOCK_SIZE);
@@ -343,34 +137,28 @@ static int write_new_fs(char *path, int fd)
uuid_generate(super->uuid);
super->next_ino = cpu_to_le64(SCOUTFS_ROOT_INO + 1);
super->total_blocks = cpu_to_le64(total_blocks);
super->buddy_blocks = cpu_to_le64(binf.buddy_blocks);
super->ring_blkno = cpu_to_le64(SCOUTFS_SUPER_BLKNO + 2);
super->ring_blocks = cpu_to_le64(ring_blocks);
super->ring_head_seq = cpu_to_le64(1);
/* require space for two leaf blocks for writing left/right paths */
count = last_blkno(super) - first_blkno(super) + 1;
limit = (SCOUTFS_BUDDY_ORDER0_BITS * 2);
if (count < limit) {
fprintf(stderr, "%llu byte device only has room for %llu %u byte fs blocks, needs at least %llu fs blocks\n",
size, count, SCOUTFS_BLOCK_SIZE, limit);
goto out;
}
/* write seg with root inode */
sblk->segno = cpu_to_le64(1);
sblk->max_seq = cpu_to_le64(1);
sblk->nr_items = cpu_to_le32(1);
blkno = first_blkno(super);
item = (void *)(sblk + 1);
ikey = (void *)(item + 1);
inode = (void *)(ikey + 1);
/* write a btree leaf root inode item */
memset(buf, 0, SCOUTFS_BLOCK_SIZE);
bt = buf;
bt->nr_items = cpu_to_le16(1);
bt->free_end = cpu_to_le16(SCOUTFS_BLOCK_SIZE - sizeof(*item) -
sizeof(*inode));
bt->free_reclaim = 0;
bt->item_offs[0] = bt->free_end;
item = (void *)bt + le16_to_cpu(bt->free_end);
item->seq = cpu_to_le64(1);
item->key = root_key;
item->key_off = cpu_to_le32((long)ikey - (long)sblk);
item->val_off = cpu_to_le32((long)inode - (long)sblk);
item->key_len = cpu_to_le16(sizeof(struct scoutfs_inode_key));
item->val_len = cpu_to_le16(sizeof(struct scoutfs_inode));
inode = (void *)(item + 1);
ikey->type = SCOUTFS_INODE_KEY;
ikey->ino = cpu_to_be64(SCOUTFS_ROOT_INO);
inode->nlink = cpu_to_le32(2);
inode->mode = cpu_to_le32(0755 | 0040000);
inode->atime.sec = cpu_to_le64(tv.tv_sec);
@@ -380,38 +168,35 @@ static int write_new_fs(char *path, int fd)
inode->mtime.sec = inode->atime.sec;
inode->mtime.nsec = inode->atime.nsec;
ret = write_block(fd, blkno, super, &bt->hdr);
ret = pwrite(fd, sblk, SCOUTFS_SEGMENT_SIZE,
1 << SCOUTFS_SEGMENT_SHIFT);
if (ret != SCOUTFS_SEGMENT_SIZE) {
ret = -EIO;
goto out;
}
/* write the ring block with the manifest entry pointing to seg */
ring->nr_entries = cpu_to_le32(1);
am = (void *)ring->entries;
am->eh.type = SCOUTFS_RING_ADD_MANIFEST;
am->eh.len = cpu_to_le16(sizeof(struct scoutfs_ring_add_manifest));
am->segno = cpu_to_le64(1);
am->seq = cpu_to_le64(1);
am->first_key_len = cpu_to_le16(sizeof(struct scoutfs_inode_key));
am->last_key_len = cpu_to_le16(sizeof(struct scoutfs_inode_key));
am->level = 1;
ikey = (void *)(am + 1);
ikey->type = SCOUTFS_INODE_KEY;
ikey->ino = cpu_to_be64(SCOUTFS_ROOT_INO);
ikey = (void *)(ikey + 1);
ikey->type = SCOUTFS_INODE_KEY;
ikey->ino = cpu_to_be64(SCOUTFS_ROOT_INO);
ret = write_block(fd, le64_to_cpu(super->ring_blkno), super,
&ring->hdr);
if (ret)
goto out;
/* blkno is now first free */
blkno++;
/* the super references the btree block */
super->btree_root.height = 1;
super->btree_root.ref.blkno = bt->hdr.blkno;
super->btree_root.ref.seq = bt->hdr.seq;
/* free_blocks reflects the fs blocks, not buddy blocks */
super->free_blocks = cpu_to_le64(total_blocks - blkno);
/* write left-most buddy block and all full parents, not root */
ret = write_buddy_blocks(fd, super, &binf, buf,
blkno - first_blkno(super),
1, &free_orders);
if (ret)
goto out;
/* write right-most buddy and parents and the root */
ret = write_buddy_blocks(fd, super, &binf, buf,
last_blkno(super) - first_blkno(super),
0, &free_orders);
if (ret)
goto out;
/* the super references the buddy leaf block */
super->buddy_root.height = binf.height;
super->buddy_root.slot.seq = super->hdr.seq;
super->buddy_root.slot.free_orders = cpu_to_le16(free_orders);
/* write the two super blocks */
for (i = 0; i < SCOUTFS_SUPER_NR; i++) {
@@ -433,18 +218,20 @@ static int write_new_fs(char *path, int fd)
printf("Created scoutfs filesystem:\n"
" total blocks: %llu\n"
" buddy blocks: %llu\n"
" ring blocks: %llu\n"
" fsid: %llx\n"
" uuid: %s\n",
total_blocks, le64_to_cpu(super->buddy_blocks),
le64_to_cpu(super->hdr.fsid), uuid_str);
total_blocks, ring_blocks, le64_to_cpu(super->hdr.fsid),
uuid_str);
ret = 0;
out:
if (super)
free(super);
if (buf)
free(buf);
if (ring)
free(ring);
if (sblk)
free(sblk);
return ret;
}