Add support for the radix buddy bitmaps

Update mkfs and print to support the buddy allocator that's indexed by
radix blocks.

Signed-off-by: Zach Brown <zab@versity.com>
This commit is contained in:
Zach Brown
2016-07-25 13:45:34 -07:00
parent 4b86256904
commit 6a97aa3c9a
4 changed files with 278 additions and 111 deletions

View File

@@ -19,7 +19,8 @@
*/
#define SCOUTFS_SUPER_BLKNO ((64 * 1024) >> SCOUTFS_BLOCK_SHIFT)
#define SCOUTFS_SUPER_NR 2
#define SCOUTFS_BUDDY_BLKNO (SCOUTFS_SUPER_BLKNO + SCOUTFS_SUPER_NR)
#define SCOUTFS_BUDDY_BM_BLKNO (SCOUTFS_SUPER_BLKNO + SCOUTFS_SUPER_NR)
#define SCOUTFS_BUDDY_BM_NR 2
/*
* This header is found at the start of every block so that we can
@@ -35,6 +36,52 @@ struct scoutfs_block_header {
__le64 blkno;
} __packed;
/*
 * Block references include the sequence number so that we can detect
 * readers racing with writers and so that we can tell that we don't
 * need to follow a reference when traversing based on seqs.
 *
 * The struct is packed and both fields are stored little-endian.
 */
struct scoutfs_block_ref {
/* block number of the referenced block */
__le64 blkno;
/* sequence number recorded with the reference; mkfs copies it from the
 * referenced block's header */
__le64 seq;
} __packed;
/*
 * A block that holds nothing but the common block header followed by
 * little-endian 64bit bitmap words that fill the rest of the block.
 *
 * mkfs writes one of these at SCOUTFS_BUDDY_BM_BLKNO with every bit set
 * except the two low bits of the first word, which it clears to record
 * the two buddy blocks that it wrote.
 * NOTE(review): presumably a clear bit means "in use" -- confirm against
 * the allocator that consumes this bitmap.
 */
struct scoutfs_bitmap_block {
struct scoutfs_block_header hdr;
/* bitmap words; the array length is implied by the block size */
__le64 bits[0];
} __packed;
/*
 * Track allocations by order.  There are SCOUTFS_BUDDY_ORDERS orders,
 * numbered 0 through SCOUTFS_BUDDY_ORDERS - 1, so the tracked sizes run
 * from BLOCK_SIZE to (BLOCK_SIZE << (SCOUTFS_BUDDY_ORDERS - 1)).
 *
 * Each order has its own bitmap region inside bits[] and a matching
 * entry in order_counts[] that counts how many bits are set in that
 * order's region.  The mkfs helpers set a bit when freeing, so a set
 * bit marks a free allocation of that order.
 */
#define SCOUTFS_BUDDY_ORDERS 8
struct scoutfs_buddy_block {
struct scoutfs_block_header hdr;
/* number of set bits in each order's bitmap, little-endian */
__le32 order_counts[SCOUTFS_BUDDY_ORDERS];
/* all of the per-order bitmaps, packed one after another */
__le64 bits[0];
} __packed;
/*
 * The number of bits in the order 0 bitmap of a buddy block.
 *
 * Each higher order needs half as many bits as the order below it, so
 * all of the orders together need just under twice the order 0 bits.
 * Halving the raw bit count of the block's payload leaves room for
 * every order's bitmap.
 *
 * If we had log2(raw bits) orders we'd fully use all of the raw bits in
 * the block.  We're close enough that the amount of space wasted at the
 * end (~1/256th of the block, ~64 bytes) isn't worth worrying about.
 */
#define SCOUTFS_BUDDY_ORDER0_BITS \
(((SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_buddy_block)) * 8) / 2)
/*
 * An indirect block is an array of slots that follows the common block
 * header.  Each slot references one buddy block and summarizes which
 * orders that buddy block has free.
 */
struct scoutfs_buddy_indirect {
struct scoutfs_block_header hdr;
struct scoutfs_buddy_slot {
/* bitmask of orders: mkfs sets bit N when the referenced buddy
 * block has a non-zero order_counts[N] */
__u8 free_orders;
/* reference to the buddy block; all zeros when the slot is unused */
struct scoutfs_block_ref ref;
} slots[0];
} __packed;
/*
 * The number of slots in an indirect block.
 *
 * NOTE(review): this subtracts sizeof(struct scoutfs_buddy_block) even
 * though the slots live in a struct scoutfs_buddy_indirect.  The buddy
 * block struct is the larger of the two (it also has order_counts[]), so
 * this only under-counts the slots that would fit.  Confirm whether that
 * is intended before changing it; the value is part of the on-disk
 * format.
 */
#define SCOUTFS_BUDDY_SLOTS \
((SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_buddy_block)) / \
sizeof(struct scoutfs_buddy_slot))
/*
* We should be able to make the offset smaller if neither dirents nor
* data items use the full 64 bits.
@@ -57,16 +104,6 @@ struct scoutfs_key {
#define SCOUTFS_MAX_ITEM_LEN 2048
/*
* Block references include the sequence number so that we can detect
* readers racing with writers and so that we can tell that we don't
* need to follow a reference when traversing based on seqs.
*/
struct scoutfs_block_ref {
__le64 blkno;
__le64 seq;
} __packed;
struct scoutfs_treap_root {
__le16 off;
} __packed;
@@ -109,48 +146,6 @@ struct scoutfs_btree_item {
#define SCOUTFS_UUID_BYTES 16
/*
* Arbitrarily choose a reasonably fine grained 64byte chunk. This is a
* balance between write amplification of writing chunks with a single
* modified bit, storage overhead of partial blocks losing a chunk to
* make room for the block header and having a pos field per chunk, and
* runtime memory overhead of a bit per chunk.
*/
#define SCOUTFS_BUDDY_CHUNK_LE64S 8
#define SCOUTFS_BUDDY_CHUNK_BYTES (SCOUTFS_BUDDY_CHUNK_LE64S * 8)
#define SCOUTFS_BUDDY_CHUNK_BITS (SCOUTFS_BUDDY_CHUNK_BYTES * 8)
/*
* After the pair of super blocks are a preallocated ring of blocks
* which record modified regions of the buddy bitmap allocator.
*
* The seq's header needs to match the unwrapped ring index of the
* block.
*/
struct scoutfs_buddy_block {
struct scoutfs_block_header hdr;
u8 nr_chunks;
struct scoutfs_buddy_chunk {
__le32 pos;
__le64 bits[SCOUTFS_BUDDY_CHUNK_LE64S];
} __packed chunks[0];
} __packed;
#define SCOUTFS_BUDDY_CHUNKS_PER_BLOCK \
((SCOUTFS_BLOCK_SIZE - offsetof(struct scoutfs_buddy_block, chunks)) /\
SCOUTFS_BUDDY_CHUNK_BYTES)
/*
* The super is stored in a pair of blocks in the first chunk on the
* device.
*
* The ring map blocks describe the chunks that make up the ring.
*
* The rest of the ring fields describe the state of the ring blocks
* that are stored in their chunks. The active portion of the ring
* describes the current state of the system and is replayed on mount.
*/
struct scoutfs_super_block {
struct scoutfs_block_header hdr;
__le64 id;
@@ -158,10 +153,9 @@ struct scoutfs_super_block {
__le64 next_ino;
__le64 total_blocks;
__le32 buddy_blocks;
__le32 buddy_sweep_bit;
__le64 buddy_head;
__le64 buddy_tail;
struct scoutfs_btree_root btree_root;
struct scoutfs_block_ref buddy_ind_ref;
struct scoutfs_block_ref buddy_bm_ref;
} __packed;
#define SCOUTFS_ROOT_INO 1

View File

@@ -22,10 +22,13 @@
/*
* Update the block's header and write it out.
*/
static int write_block(int fd, u64 blkno, struct scoutfs_block_header *hdr)
static int write_block(int fd, u64 blkno, struct scoutfs_super_block *super,
struct scoutfs_block_header *hdr)
{
ssize_t ret;
if (super)
*hdr = super->hdr;
hdr->blkno = cpu_to_le64(blkno);
hdr->crc = cpu_to_le32(crc_block(hdr));
@@ -40,21 +43,84 @@ static int write_block(int fd, u64 blkno, struct scoutfs_block_header *hdr)
}
/*
* Calculate the number of buddy blocks that are needed to track the
* allocation of a device with the given byte size. We need an even
* number of buddy blocks that contain 8 bits for every device block. This
* is a bit overly conservative in that it doesn't subtract the buddy
* blocks and super block from the calculation.
* Calculate the number of buddy blocks that are needed to manage
* allocation of a device with the given number of total blocks.
*
* We need a little bit of overhead to write each transaction's dirty
* buddy blocks to free space. We chose 16MB for now which is wild
* overkill and should be dependent on the max transaction size.
*/
static u32 calc_buddy_blocks(u64 total_blocks)
{
u64 buddy_bits = total_blocks * 8;
u64 chunks = DIV_ROUND_UP(buddy_bits, SCOUTFS_BUDDY_CHUNK_BITS);
u64 blocks = DIV_ROUND_UP(chunks, SCOUTFS_BUDDY_CHUNKS_PER_BLOCK);
return DIV_ROUND_UP(total_blocks, SCOUTFS_BUDDY_ORDER0_BITS) +
((16 * 1024 * 1024) / SCOUTFS_BLOCK_SIZE);
}
/*
 * Return the first block number after the fixed metadata at the front
 * of the device: the buddy bitmap blocks and then the number of buddy
 * blocks recorded in the super.
 *
 * XXX check u32 overflow?
 */
static u32 first_blkno(struct scoutfs_super_block *super)
{
	u32 nr_buddy = le32_to_cpu(super->buddy_blocks);

	return SCOUTFS_BUDDY_BM_BLKNO + SCOUTFS_BUDDY_BM_NR + nr_buddy;
}
return round_up(blocks, 2);
/*
 * Return the bit offset in a buddy block's bitmap at which the given
 * order's bitmap starts.  Order 0 starts at bit 0 and every following
 * order's region is half the size of the one before it.
 */
static int order_off(int order)
{
	int divisor;

	if (order == 0)
		return 0;

	divisor = 1 << (order - 1);

	return (2 * SCOUTFS_BUDDY_ORDER0_BITS) -
	       (SCOUTFS_BUDDY_ORDER0_BITS / divisor);
}
/*
 * Translate bit 'nr' within the given order's bitmap into its absolute
 * bit offset within the buddy block's bitmap.
 */
static int order_nr(int order, int nr)
{
	int base = order_off(order);

	return base + nr;
}
/* Return the state of bit 'nr' in the given order's bitmap. */
static int test_buddy_bit(struct scoutfs_buddy_block *bud, int order, int nr)
{
	int bit = order_nr(order, nr);

	return test_bit_le(bit, bud->bits);
}
/*
 * Set bit 'nr' in the given order's bitmap.  The order's count is only
 * incremented when the bit was previously clear so that the count keeps
 * matching the number of set bits.
 */
static void set_buddy_bit(struct scoutfs_buddy_block *bud, int order, int nr)
{
	int bit = order_nr(order, nr);

	if (test_and_set_bit_le(bit, bud->bits))
		return;

	le32_add_cpu(&bud->order_counts[order], 1);
}
/*
 * Clear bit 'nr' in the given order's bitmap.  The order's count is
 * only decremented when the bit was previously set so that the count
 * keeps matching the number of set bits.
 */
static void clear_buddy_bit(struct scoutfs_buddy_block *bud, int order, int nr)
{
	int bit = order_nr(order, nr);

	if (!test_and_clear_bit_le(bit, bud->bits))
		return;

	le32_add_cpu(&bud->order_counts[order], -1);
}
/*
 * Free bit 'nr' of the given order, merging with free buddies on the
 * way up.  While the buddy of the bit being freed is itself free, the
 * buddy is cleared and the pair is carried up to the next order.  The
 * bit is finally set at the order where merging stopped, which is at
 * most the highest order.
 */
static void free_order_bit(struct scoutfs_buddy_block *bud, int order, int nr)
{
	int cur = order;

	while (cur < SCOUTFS_BUDDY_ORDERS - 1) {
		int buddy = nr ^ 1;

		if (!test_buddy_bit(bud, cur, buddy))
			break;

		clear_buddy_bit(bud, cur, buddy);
		nr >>= 1;
		cur++;
	}

	set_buddy_bit(bud, cur, nr);
}
/*
 * Build a bitmask with bit N set for every order N whose count in the
 * buddy block is non-zero.
 */
static u8 calc_free_orders(struct scoutfs_buddy_block *bud)
{
	u8 mask = 0;
	int order;

	for (order = 0; order < SCOUTFS_BUDDY_ORDERS; order++) {
		if (bud->order_counts[order])
			mask |= 1 << order;
	}

	return mask;
}
static int write_new_fs(char *path, int fd)
@@ -63,6 +129,9 @@ static int write_new_fs(char *path, int fd)
struct scoutfs_inode *inode;
struct scoutfs_btree_block *bt;
struct scoutfs_btree_item *item;
struct scoutfs_buddy_block *bud;
struct scoutfs_buddy_indirect *ind;
struct scoutfs_bitmap_block *bm;
struct scoutfs_key root_key;
struct timeval tv;
char uuid_str[37];
@@ -71,6 +140,7 @@ static int write_new_fs(char *path, int fd)
u64 blkno;
u64 total_blocks;
u64 buddy_blocks;
u8 free_orders;
void *buf;
int ret;
@@ -105,9 +175,6 @@ static int write_new_fs(char *path, int fd)
root_key.type = SCOUTFS_INODE_KEY;
root_key.offset = 0;
/* start with the block after the supers */
blkno = SCOUTFS_SUPER_BLKNO + SCOUTFS_SUPER_NR;
/* first initialize the super so we can use it to build structures */
memset(super, 0, SCOUTFS_BLOCK_SIZE);
pseudo_random_bytes(&super->hdr.fsid, sizeof(super->hdr.fsid));
@@ -118,10 +185,11 @@ static int write_new_fs(char *path, int fd)
super->total_blocks = cpu_to_le64(total_blocks);
super->buddy_blocks = cpu_to_le32(buddy_blocks);
blkno = first_blkno(super);
/* write a btree leaf root inode item */
memset(buf, 0, SCOUTFS_BLOCK_SIZE);
bt = buf;
bt->hdr = super->hdr;
bt->nr_items = cpu_to_le16(1);
item = (void *)(bt + 1);
@@ -148,19 +216,68 @@ static int write_new_fs(char *path, int fd)
((char *)(inode + 1) - (char *)bt));
bt->tail_free = bt->total_free;
ret = write_block(fd, blkno, &bt->hdr);
ret = write_block(fd, blkno, super, &bt->hdr);
if (ret)
goto out;
/* make sure the super references everything we just wrote */
/* the super references the btree block */
super->btree_root.height = 1;
super->btree_root.ref.blkno = bt->hdr.blkno;
super->btree_root.ref.seq = bt->hdr.seq;
/* free all the blocks in the first buddy block after btree block */
memset(buf, 0, SCOUTFS_BLOCK_SIZE);
bud = buf;
for (i = 1; i < min(total_blocks - first_blkno(super),
SCOUTFS_BUDDY_ORDER0_BITS); i++)
free_order_bit(bud, 0, i);
free_orders = calc_free_orders(bud);
blkno = SCOUTFS_BUDDY_BM_BLKNO + SCOUTFS_BUDDY_BM_NR;
ret = write_block(fd, blkno, super, &bud->hdr);
if (ret)
goto out;
/* an indirect buddy block references the buddy bitmap block */
memset(buf, 0, SCOUTFS_BLOCK_SIZE);
ind = buf;
for (i = 0; i < SCOUTFS_BUDDY_SLOTS; i++) {
ind->slots[i].free_orders = 0;
ind->slots[i].ref = (struct scoutfs_block_ref){0,};
}
ind->slots[0].free_orders = free_orders;
ind->slots[0].ref.seq = super->hdr.seq;
ind->slots[0].ref.blkno = cpu_to_le64(blkno);
blkno++;
ret = write_block(fd, blkno, super, &ind->hdr);
if (ret)
goto out;
/* the super references the buddy indirect block */
super->buddy_ind_ref.blkno = ind->hdr.blkno;
super->buddy_ind_ref.seq = ind->hdr.seq;
/* a bitmap block records the two used buddy blocks */
memset(buf, 0, SCOUTFS_BLOCK_SIZE);
bm = buf;
memset(bm->bits, 0xff, SCOUTFS_BLOCK_SIZE -
offsetof(struct scoutfs_bitmap_block, bits));
bm->bits[0] = cpu_to_le64(~0ULL << 2); /* two low order bits clear */
ret = write_block(fd, SCOUTFS_BUDDY_BM_BLKNO, super, &bm->hdr);
if (ret)
goto out;
/* the super references the buddy bitmap block */
super->buddy_bm_ref.blkno = bm->hdr.blkno;
super->buddy_bm_ref.seq = bm->hdr.seq;
/* write the two super blocks */
for (i = 0; i < SCOUTFS_SUPER_NR; i++) {
super->hdr.seq = cpu_to_le64(i + 1);
ret = write_block(fd, SCOUTFS_SUPER_BLKNO + i, &super->hdr);
ret = write_block(fd, SCOUTFS_SUPER_BLKNO + i, NULL,
&super->hdr);
if (ret)
goto out;
}

View File

@@ -177,46 +177,90 @@ static int print_btree_block(int fd, __le64 blkno, u8 level)
return ret;
}
/*
 * Read the buddy block at blkno and print its block header and the
 * count for each order on one line.
 *
 * Returns 0 on success or -ENOMEM if the block couldn't be read.
 */
static int print_buddy_block(int fd, struct scoutfs_super_block *super,
			     u64 blkno)
{
	struct scoutfs_buddy_block *bud = read_block(fd, blkno);
	int order = 0;

	if (!bud)
		return -ENOMEM;

	printf("buddy blkno %llu\n", blkno);
	print_block_header(&bud->hdr);

	printf(" order_counts:");
	while (order < SCOUTFS_BUDDY_ORDERS) {
		printf(" %u", le32_to_cpu(bud->order_counts[order]));
		order++;
	}
	printf("\n");

	free(bud);

	return 0;
}
static int print_buddy_blocks(int fd, struct scoutfs_super_block *super)
{
struct scoutfs_buddy_chunk *chunk;
struct scoutfs_buddy_block *bb;
struct scoutfs_buddy_indirect *ind;
struct scoutfs_buddy_slot *slot;
u64 blkno;
u64 blocks;
u64 head;
u64 tail;
int ret = 0;
int err;
int i;
int j;
blocks = le32_to_cpu(super->buddy_blocks);
head = le64_to_cpu(super->buddy_head);
tail = le64_to_cpu(super->buddy_tail);
blkno = le64_to_cpu(super->buddy_ind_ref.blkno);
ind = read_block(fd, blkno);
if (!ind)
return -ENOMEM;
/* XXX make sure values are sane */
printf("buddy indirect blkno %llu\n", blkno);
print_block_header(&ind->hdr);
for (; head < tail; head++) {
for (i = 0; i < SCOUTFS_BUDDY_SLOTS; i++) {
slot = &ind->slots[i];
blkno = SCOUTFS_BUDDY_BLKNO + (head % blocks);
bb = read_block(fd, blkno);
if (!bb)
return -ENOMEM;
/* only print slots with non-zero fields */
if (!slot->free_orders && !slot->ref.seq && !slot->ref.blkno)
continue;
printf("buddy blkno %llu\n", blkno);
print_block_header(&bb->hdr);
printf(" nr_chunks %u\n", bb->nr_chunks);
for (i = 0; i < bb->nr_chunks; i++) {
chunk = &bb->chunks[i];
printf(" [%u]: pos %u bits ",
i, le32_to_cpu(chunk->pos));
for (j = 0; j < SCOUTFS_BUDDY_CHUNK_LE64S; j++)
printf("%016llx", le64_to_cpu(chunk->bits[j]));
printf("\n");
}
free(bb);
printf(" slot[%u]: free_orders: %x ref: seq %llu blkno %llu\n",
i, slot->free_orders, le64_to_cpu(slot->ref.seq),
le64_to_cpu(slot->ref.blkno));
}
for (i = 0; i < SCOUTFS_BUDDY_SLOTS; i++) {
slot = &ind->slots[i];
if (!slot->free_orders && !slot->ref.seq && !slot->ref.blkno)
continue;
err = print_buddy_block(fd, super,
le64_to_cpu(slot->ref.blkno));
if (err && !ret)
ret = err;
}
free(ind);
return ret;
}
/*
 * Read the bitmap block that the super references and print its block
 * header.
 *
 * Returns 0 on success or -ENOMEM if the block couldn't be read.
 */
static int print_bitmap_block(int fd, struct scoutfs_super_block *super)
{
	u64 blkno = le64_to_cpu(super->buddy_bm_ref.blkno);
	struct scoutfs_bitmap_block *bm = read_block(fd, blkno);

	if (!bm)
		return -ENOMEM;

	printf("bitmap blkno %llu\n", blkno);
	print_block_header(&bm->hdr);

	free(bm);

	return 0;
}
@@ -240,15 +284,16 @@ static int print_super_blocks(int fd)
print_block_header(&super->hdr);
printf(" id %llx uuid %s\n",
le64_to_cpu(super->id), uuid_str);
printf(" next_ino %llu total_blocks %llu buddy_blocks %u "
"buddy_sweep_bit %u\n"
" buddy_head %llu buddy_tail %llu\n",
printf(" next_ino %llu total_blocks %llu buddy_blocks %u\n",
le64_to_cpu(super->next_ino),
le64_to_cpu(super->total_blocks),
le32_to_cpu(super->buddy_blocks),
le32_to_cpu(super->buddy_sweep_bit),
le64_to_cpu(super->buddy_head),
le64_to_cpu(super->buddy_tail));
le32_to_cpu(super->buddy_blocks));
printf(" buddy_bm_ref: seq %llu blkno %llu\n",
le64_to_cpu(super->buddy_bm_ref.seq),
le64_to_cpu(super->buddy_bm_ref.blkno));
printf(" buddy_ind_ref: seq %llu blkno %llu\n",
le64_to_cpu(super->buddy_ind_ref.seq),
le64_to_cpu(super->buddy_ind_ref.blkno));
printf(" btree_root: height %u seq %llu blkno %llu\n",
super->btree_root.height,
le64_to_cpu(super->btree_root.ref.seq),
@@ -262,7 +307,13 @@ static int print_super_blocks(int fd)
super = &recent;
ret = print_buddy_blocks(fd, super);
err = print_bitmap_block(fd, super);
if (err && !ret)
ret = err;
err = print_buddy_blocks(fd, super);
if (err && !ret)
ret = err;
if (super->btree_root.height) {
err = print_btree_block(fd, super->btree_root.ref.blkno,

View File

@@ -104,4 +104,9 @@ __gen_functions(cast, be)
#error "machine is neither BIG_ENDIAN nor LITTLE_ENDIAN"
#endif
/*
 * Add a cpu-endian delta to a little-endian 32bit value in place.  The
 * addition is unsigned so a delta of -1 wraps and decrements the value.
 */
static inline void le32_add_cpu(__le32 *val, u32 delta)
{
	u32 sum = le32_to_cpu(*val) + delta;

	*val = cpu_to_le32(sum);
}
#endif