From 6a97aa3c9a182c1bcfdc465b7ab71a183183060b Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Mon, 25 Jul 2016 13:45:34 -0700 Subject: [PATCH] Add support for the radix buddy bitmaps Update mkfs and print to support the buddy allocator that's indexed by radix blocks. Signed-off-by: Zach Brown --- utils/src/format.h | 106 +++++++++++++++---------------- utils/src/mkfs.c | 153 +++++++++++++++++++++++++++++++++++++++------ utils/src/print.c | 125 +++++++++++++++++++++++++----------- utils/src/sparse.h | 5 ++ 4 files changed, 278 insertions(+), 111 deletions(-) diff --git a/utils/src/format.h b/utils/src/format.h index e3112b3e..4c23d6e4 100644 --- a/utils/src/format.h +++ b/utils/src/format.h @@ -19,7 +19,8 @@ */ #define SCOUTFS_SUPER_BLKNO ((64 * 1024) >> SCOUTFS_BLOCK_SHIFT) #define SCOUTFS_SUPER_NR 2 -#define SCOUTFS_BUDDY_BLKNO (SCOUTFS_SUPER_BLKNO + SCOUTFS_SUPER_NR) +#define SCOUTFS_BUDDY_BM_BLKNO (SCOUTFS_SUPER_BLKNO + SCOUTFS_SUPER_NR) +#define SCOUTFS_BUDDY_BM_NR 2 /* * This header is found at the start of every block so that we can @@ -35,6 +36,52 @@ struct scoutfs_block_header { __le64 blkno; } __packed; +/* + * Block references include the sequence number so that we can detect + * readers racing with writers and so that we can tell that we don't + * need to follow a reference when traversing based on seqs. + */ +struct scoutfs_block_ref { + __le64 blkno; + __le64 seq; +} __packed; + +struct scoutfs_bitmap_block { + struct scoutfs_block_header hdr; + __le64 bits[0]; +} __packed; + +/* + * Track allocations from BLOCK_SIZE to (BLOCK_SIZE << (..._ORDERS - 1)). + */ +#define SCOUTFS_BUDDY_ORDERS 8 + +struct scoutfs_buddy_block { + struct scoutfs_block_header hdr; + __le32 order_counts[SCOUTFS_BUDDY_ORDERS]; + __le64 bits[0]; +} __packed; + +/* + * If we had log2(raw bits) orders we'd fully use all of the raw bits in + * the block. We're close enough that the amount of space wasted at the + * end (~1/256th of the block, ~64 bytes) isn't worth worrying about. 
+ */ +#define SCOUTFS_BUDDY_ORDER0_BITS \ + (((SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_buddy_block)) * 8) / 2) + +struct scoutfs_buddy_indirect { + struct scoutfs_block_header hdr; + struct scoutfs_buddy_slot { + __u8 free_orders; + struct scoutfs_block_ref ref; + } slots[0]; +} __packed; + +#define SCOUTFS_BUDDY_SLOTS \ + ((SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_buddy_block)) / \ + sizeof(struct scoutfs_buddy_slot)) + /* * We should be able to make the offset smaller if neither dirents nor * data items use the full 64 bits. @@ -57,16 +104,6 @@ struct scoutfs_key { #define SCOUTFS_MAX_ITEM_LEN 2048 -/* - * Block references include the sequence number so that we can detect - * readers racing with writers and so that we can tell that we don't - * need to follow a reference when traversing based on seqs. - */ -struct scoutfs_block_ref { - __le64 blkno; - __le64 seq; -} __packed; - struct scoutfs_treap_root { __le16 off; } __packed; @@ -109,48 +146,6 @@ struct scoutfs_btree_item { #define SCOUTFS_UUID_BYTES 16 -/* - * Arbitrarily choose a reasonably fine grained 64byte chunk. This is a - * balance between write amplification of writing chunks with a single - * modified bit, storage overhead of partial blocks losing a chunk to - * make room for the block header and having a pos field per chunk, and - * runtime memory overhead of a bit per chunk. - */ -#define SCOUTFS_BUDDY_CHUNK_LE64S 8 -#define SCOUTFS_BUDDY_CHUNK_BYTES (SCOUTFS_BUDDY_CHUNK_LE64S * 8) -#define SCOUTFS_BUDDY_CHUNK_BITS (SCOUTFS_BUDDY_CHUNK_BYTES * 8) - -/* - * After the pair of super blocks are a preallocated ring of blocks - * which record modified regions of the buddy bitmap allocator. - * - * The seq's header needs to match the unwrapped ring index of the - * block. 
- */ -struct scoutfs_buddy_block { - struct scoutfs_block_header hdr; - u8 nr_chunks; - struct scoutfs_buddy_chunk { - __le32 pos; - __le64 bits[SCOUTFS_BUDDY_CHUNK_LE64S]; - } __packed chunks[0]; -} __packed; - -#define SCOUTFS_BUDDY_CHUNKS_PER_BLOCK \ - ((SCOUTFS_BLOCK_SIZE - offsetof(struct scoutfs_buddy_block, chunks)) /\ - SCOUTFS_BUDDY_CHUNK_BYTES) - - -/* - * The super is stored in a pair of blocks in the first chunk on the - * device. - * - * The ring map blocks describe the chunks that make up the ring. - * - * The rest of the ring fields describe the state of the ring blocks - * that are stored in their chunks. The active portion of the ring - * describes the current state of the system and is replayed on mount. - */ struct scoutfs_super_block { struct scoutfs_block_header hdr; __le64 id; @@ -158,10 +153,9 @@ struct scoutfs_super_block { __le64 next_ino; __le64 total_blocks; __le32 buddy_blocks; - __le32 buddy_sweep_bit; - __le64 buddy_head; - __le64 buddy_tail; struct scoutfs_btree_root btree_root; + struct scoutfs_block_ref buddy_ind_ref; + struct scoutfs_block_ref buddy_bm_ref; } __packed; #define SCOUTFS_ROOT_INO 1 diff --git a/utils/src/mkfs.c b/utils/src/mkfs.c index 48e6e1b0..83ad7ef3 100644 --- a/utils/src/mkfs.c +++ b/utils/src/mkfs.c @@ -22,10 +22,13 @@ /* * Update the block's header and write it out. */ -static int write_block(int fd, u64 blkno, struct scoutfs_block_header *hdr) +static int write_block(int fd, u64 blkno, struct scoutfs_super_block *super, + struct scoutfs_block_header *hdr) { ssize_t ret; + if (super) + *hdr = super->hdr; hdr->blkno = cpu_to_le64(blkno); hdr->crc = cpu_to_le32(crc_block(hdr)); @@ -40,21 +43,84 @@ static int write_block(int fd, u64 blkno, struct scoutfs_block_header *hdr) } /* - * Calculate the number of buddy blocks that are needed to track the - * allocation of a device with the given byte size. We need an even - * number of buddy blocks that contain 8 bits for every device block. 
This - * is a bit overly conservative in that it doesn't subtract the buddy - * blocks and super block from the calculation. + * Calculate the number of buddy blocks that are needed to manage + * allocation of a device with the given number of total blocks. + * + * We need a little bit of overhead to write each transaction's dirty + * buddy blocks to free space. We chose 16MB for now which is wild + * overkill and should be dependent on the max transaction size. */ static u32 calc_buddy_blocks(u64 total_blocks) { - u64 buddy_bits = total_blocks * 8; - u64 chunks = DIV_ROUND_UP(buddy_bits, SCOUTFS_BUDDY_CHUNK_BITS); - u64 blocks = DIV_ROUND_UP(chunks, SCOUTFS_BUDDY_CHUNKS_PER_BLOCK); + return DIV_ROUND_UP(total_blocks, SCOUTFS_BUDDY_ORDER0_BITS) + + ((16 * 1024 * 1024) / SCOUTFS_BLOCK_SIZE); +} - /* XXX check u32 overflow? */ +static u32 first_blkno(struct scoutfs_super_block *super) +{ + return SCOUTFS_BUDDY_BM_BLKNO + SCOUTFS_BUDDY_BM_NR + + le32_to_cpu(super->buddy_blocks); +} - return round_up(blocks, 2); +/* the starting bit offset in the block bitmap of an order's bitmap */ +static int order_off(int order) +{ + if (order == 0) + return 0; + + return (2 * SCOUTFS_BUDDY_ORDER0_BITS) - + (SCOUTFS_BUDDY_ORDER0_BITS / (1 << (order - 1))); +} + +/* the bit offset in the block bitmap of an order's bit */ +static int order_nr(int order, int nr) +{ + return order_off(order) + nr; +} + +static int test_buddy_bit(struct scoutfs_buddy_block *bud, int order, int nr) +{ + return test_bit_le(order_nr(order, nr), bud->bits); +} + +static void set_buddy_bit(struct scoutfs_buddy_block *bud, int order, int nr) +{ + if (!test_and_set_bit_le(order_nr(order, nr), bud->bits)) + le32_add_cpu(&bud->order_counts[order], 1); +} + +static void clear_buddy_bit(struct scoutfs_buddy_block *bud, int order, int nr) +{ + if (test_and_clear_bit_le(order_nr(order, nr), bud->bits)) + le32_add_cpu(&bud->order_counts[order], -1); +} + +/* merge lower order buddies as we free, up to the highest order */ 
+static void free_order_bit(struct scoutfs_buddy_block *bud, int order, int nr) +{ + int i; + + for (i = order; i < SCOUTFS_BUDDY_ORDERS - 1; i++) { + + if (!test_buddy_bit(bud, i, nr ^ 1)) + break; + + clear_buddy_bit(bud, i, nr ^ 1); + nr >>= 1; + } + + set_buddy_bit(bud, i, nr); +} + +static u8 calc_free_orders(struct scoutfs_buddy_block *bud) +{ + u8 free = 0; + int i; + + for (i = 0; i < SCOUTFS_BUDDY_ORDERS; i++) + free |= (!!bud->order_counts[i]) << i; + + return free; } static int write_new_fs(char *path, int fd) @@ -63,6 +129,9 @@ static int write_new_fs(char *path, int fd) struct scoutfs_inode *inode; struct scoutfs_btree_block *bt; struct scoutfs_btree_item *item; + struct scoutfs_buddy_block *bud; + struct scoutfs_buddy_indirect *ind; + struct scoutfs_bitmap_block *bm; struct scoutfs_key root_key; struct timeval tv; char uuid_str[37]; @@ -71,6 +140,7 @@ static int write_new_fs(char *path, int fd) u64 blkno; u64 total_blocks; u64 buddy_blocks; + u8 free_orders; void *buf; int ret; @@ -105,9 +175,6 @@ static int write_new_fs(char *path, int fd) root_key.type = SCOUTFS_INODE_KEY; root_key.offset = 0; - /* start with the block after the supers */ - blkno = SCOUTFS_SUPER_BLKNO + SCOUTFS_SUPER_NR; - /* first initialize the super so we can use it to build structures */ memset(super, 0, SCOUTFS_BLOCK_SIZE); pseudo_random_bytes(&super->hdr.fsid, sizeof(super->hdr.fsid)); @@ -118,10 +185,11 @@ static int write_new_fs(char *path, int fd) super->total_blocks = cpu_to_le64(total_blocks); super->buddy_blocks = cpu_to_le32(buddy_blocks); + blkno = first_blkno(super); + /* write a btree leaf root inode item */ memset(buf, 0, SCOUTFS_BLOCK_SIZE); bt = buf; - bt->hdr = super->hdr; bt->nr_items = cpu_to_le16(1); item = (void *)(bt + 1); @@ -148,19 +216,68 @@ static int write_new_fs(char *path, int fd) ((char *)(inode + 1) - (char *)bt)); bt->tail_free = bt->total_free; - ret = write_block(fd, blkno, &bt->hdr); + ret = write_block(fd, blkno, super, &bt->hdr); if (ret) goto 
out; - /* make sure the super references everything we just wrote */ + /* the super references the btree block */ super->btree_root.height = 1; super->btree_root.ref.blkno = bt->hdr.blkno; super->btree_root.ref.seq = bt->hdr.seq; + /* free all the blocks in the first buddy block after btree block */ + memset(buf, 0, SCOUTFS_BLOCK_SIZE); + bud = buf; + for (i = 1; i < min(total_blocks - first_blkno(super), + SCOUTFS_BUDDY_ORDER0_BITS); i++) + free_order_bit(bud, 0, i); + free_orders = calc_free_orders(bud); + + blkno = SCOUTFS_BUDDY_BM_BLKNO + SCOUTFS_BUDDY_BM_NR; + ret = write_block(fd, blkno, super, &bud->hdr); + if (ret) + goto out; + + /* an indirect buddy block references the buddy bitmap block */ + memset(buf, 0, SCOUTFS_BLOCK_SIZE); + ind = buf; + for (i = 0; i < SCOUTFS_BUDDY_SLOTS; i++) { + ind->slots[i].free_orders = 0; + ind->slots[i].ref = (struct scoutfs_block_ref){0,}; + } + ind->slots[0].free_orders = free_orders; + ind->slots[0].ref.seq = super->hdr.seq; + ind->slots[0].ref.blkno = cpu_to_le64(blkno); + + blkno++; + ret = write_block(fd, blkno, super, &ind->hdr); + if (ret) + goto out; + + /* the super references the buddy indirect block */ + super->buddy_ind_ref.blkno = ind->hdr.blkno; + super->buddy_ind_ref.seq = ind->hdr.seq; + + /* a bitmap block records the two used buddy blocks */ + memset(buf, 0, SCOUTFS_BLOCK_SIZE); + bm = buf; + memset(bm->bits, 0xff, SCOUTFS_BLOCK_SIZE - + offsetof(struct scoutfs_bitmap_block, bits)); + bm->bits[0] = cpu_to_le64(~0ULL << 2); /* two low order bits clear */ + + ret = write_block(fd, SCOUTFS_BUDDY_BM_BLKNO, super, &bm->hdr); + if (ret) + goto out; + + /* the super references the buddy bitmap block */ + super->buddy_bm_ref.blkno = bm->hdr.blkno; + super->buddy_bm_ref.seq = bm->hdr.seq; + /* write the two super blocks */ for (i = 0; i < SCOUTFS_SUPER_NR; i++) { super->hdr.seq = cpu_to_le64(i + 1); - ret = write_block(fd, SCOUTFS_SUPER_BLKNO + i, &super->hdr); + ret = write_block(fd, SCOUTFS_SUPER_BLKNO + i, 
NULL, + &super->hdr); if (ret) goto out; } diff --git a/utils/src/print.c b/utils/src/print.c index 775b5e52..8b99bb27 100644 --- a/utils/src/print.c +++ b/utils/src/print.c @@ -177,46 +177,90 @@ static int print_btree_block(int fd, __le64 blkno, u8 level) return ret; } +static int print_buddy_block(int fd, struct scoutfs_super_block *super, + u64 blkno) +{ + struct scoutfs_buddy_block *bud; + int i; + + bud = read_block(fd, blkno); + if (!bud) + return -ENOMEM; + + printf("buddy blkno %llu\n", blkno); + print_block_header(&bud->hdr); + printf(" order_counts:"); + for (i = 0; i < SCOUTFS_BUDDY_ORDERS; i++) + printf(" %u", le32_to_cpu(bud->order_counts[i])); + printf("\n"); + + free(bud); + + return 0; +} + static int print_buddy_blocks(int fd, struct scoutfs_super_block *super) { - struct scoutfs_buddy_chunk *chunk; - struct scoutfs_buddy_block *bb; + struct scoutfs_buddy_indirect *ind; + struct scoutfs_buddy_slot *slot; u64 blkno; - u64 blocks; - u64 head; - u64 tail; + int ret = 0; + int err; int i; - int j; - blocks = le32_to_cpu(super->buddy_blocks); - head = le64_to_cpu(super->buddy_head); - tail = le64_to_cpu(super->buddy_tail); + blkno = le64_to_cpu(super->buddy_ind_ref.blkno); + ind = read_block(fd, blkno); + if (!ind) + return -ENOMEM; - /* XXX make sure values are sane */ + printf("buddy indirect blkno %llu\n", blkno); + print_block_header(&ind->hdr); - for (; head < tail; head++) { + for (i = 0; i < SCOUTFS_BUDDY_SLOTS; i++) { + slot = &ind->slots[i]; - blkno = SCOUTFS_BUDDY_BLKNO + (head % blocks); - bb = read_block(fd, blkno); - if (!bb) - return -ENOMEM; + /* only print slots with non-zero fields */ + if (!slot->free_orders && !slot->ref.seq && !slot->ref.blkno) + continue; - printf("buddy blkno %llu\n", blkno); - print_block_header(&bb->hdr); - printf(" nr_chunks %u\n", bb->nr_chunks); - for (i = 0; i < bb->nr_chunks; i++) { - chunk = &bb->chunks[i]; - - printf(" [%u]: pos %u bits ", - i, le32_to_cpu(chunk->pos)); - for (j = 0; j < 
SCOUTFS_BUDDY_CHUNK_LE64S; j++) - printf("%016llx", le64_to_cpu(chunk->bits[j])); - printf("\n"); - } - - free(bb); + printf(" slot[%u]: free_orders: %x ref: seq %llu blkno %llu\n", + i, slot->free_orders, le64_to_cpu(slot->ref.seq), + le64_to_cpu(slot->ref.blkno)); } + for (i = 0; i < SCOUTFS_BUDDY_SLOTS; i++) { + slot = &ind->slots[i]; + + if (!slot->free_orders && !slot->ref.seq && !slot->ref.blkno) + continue; + + err = print_buddy_block(fd, super, + le64_to_cpu(slot->ref.blkno)); + if (err && !ret) + ret = err; + } + + free(ind); + + return ret; +} + + +static int print_bitmap_block(int fd, struct scoutfs_super_block *super) +{ + struct scoutfs_bitmap_block *bm; + u64 blkno; + + blkno = le64_to_cpu(super->buddy_bm_ref.blkno); + bm = read_block(fd, blkno); + if (!bm) + return -ENOMEM; + + printf("bitmap blkno %llu\n", blkno); + print_block_header(&bm->hdr); + + free(bm); + return 0; } @@ -240,15 +284,16 @@ static int print_super_blocks(int fd) print_block_header(&super->hdr); printf(" id %llx uuid %s\n", le64_to_cpu(super->id), uuid_str); - printf(" next_ino %llu total_blocks %llu buddy_blocks %u " - "buddy_sweep_bit %u\n" - " buddy_head %llu buddy_tail %llu\n", + printf(" next_ino %llu total_blocks %llu buddy_blocks %u\n", le64_to_cpu(super->next_ino), le64_to_cpu(super->total_blocks), - le32_to_cpu(super->buddy_blocks), - le32_to_cpu(super->buddy_sweep_bit), - le64_to_cpu(super->buddy_head), - le64_to_cpu(super->buddy_tail)); + le32_to_cpu(super->buddy_blocks)); + printf(" buddy_bm_ref: seq %llu blkno %llu\n", + le64_to_cpu(super->buddy_bm_ref.seq), + le64_to_cpu(super->buddy_bm_ref.blkno)); + printf(" buddy_ind_ref: seq %llu blkno %llu\n", + le64_to_cpu(super->buddy_ind_ref.seq), + le64_to_cpu(super->buddy_ind_ref.blkno)); printf(" btree_root: height %u seq %llu blkno %llu\n", super->btree_root.height, le64_to_cpu(super->btree_root.ref.seq), @@ -262,7 +307,13 @@ static int print_super_blocks(int fd) super = &recent; - ret = print_buddy_blocks(fd, super); + 
err = print_bitmap_block(fd, super); + if (err && !ret) + ret = err; + + err = print_buddy_blocks(fd, super); + if (err && !ret) + ret = err; if (super->btree_root.height) { err = print_btree_block(fd, super->btree_root.ref.blkno, diff --git a/utils/src/sparse.h b/utils/src/sparse.h index 7842aca2..24956150 100644 --- a/utils/src/sparse.h +++ b/utils/src/sparse.h @@ -104,4 +104,9 @@ __gen_functions(cast, be) #error "machine is neither BIG_ENDIAN nor LITTLE_ENDIAN" #endif +static inline void le32_add_cpu(__le32 *val, u32 delta) +{ + *val = cpu_to_le32(le32_to_cpu(*val) + delta); +} + #endif