diff --git a/kmod/src/alloc.c b/kmod/src/alloc.c index cf2a4598..82364641 100644 --- a/kmod/src/alloc.c +++ b/kmod/src/alloc.c @@ -29,8 +29,8 @@ * The core allocator uses extent items in btrees rooted in the super. * Each free extent is stored in two items. The first item is indexed * by block location and is used to merge adjacent extents when freeing. - * The second item is indexed by length and is used to find large - * extents to allocate from. + * The second item is indexed by the order of the length and is used to + * find large extents to allocate from. * * Free extent always consumes the front of the largest extent. This * attempts to discourage fragmentation by given smaller freed extents @@ -67,25 +67,52 @@ */ /* - * Free extents don't have flags and are stored in two indexes sorted by - * block location and by length, largest first. The block location key - * is set to the final block in the extent so that we can find - * intersections by calling _next() iterators starting with the block - * we're searching for. + * Return the order of the length of a free extent, which we define as + * floor(log_8_(len)): 0..7 = 0, 8..63 = 1, etc. (len 0 is treated as 1.) */ -static void init_ext_key(struct scoutfs_key *key, int type, u64 start, u64 len) +static u64 free_extent_order(u64 len) +{ + return (fls64(len | 1) - 1) / 3; +} + +/* + * The smallest (non-zero) length that maps to the same order as the + * given length, i.e. 8^order: lengths 8..63 all return 8. + */ +static u64 smallest_order_length(u64 len) +{ + return 1ULL << (free_extent_order(len) * 3); +} + +/* + * Free extents don't have flags and are stored in two indexes sorted by + * block location and by length order, largest first. The location key + * field is set to the final block in the extent so that we can find + * intersections by calling _next() with the start of the range we're + * searching for. + * + * We never store 0 length extents but we do build keys for searching + * the order index from 0,0 without having to map it to a real extent. 
+ */ +static void init_ext_key(struct scoutfs_key *key, int zone, u64 start, u64 len) { *key = (struct scoutfs_key) { - .sk_zone = SCOUTFS_FREE_EXTENT_ZONE, - .sk_type = type, + .sk_zone = zone, }; - if (type == SCOUTFS_FREE_EXTENT_BLKNO_TYPE) { + if (len == 0) { + /* we only use 0 len extents for magic 0,0 order lookups */ + WARN_ON_ONCE(zone != SCOUTFS_FREE_EXTENT_ORDER_ZONE || start != 0); + return; + } + + if (zone == SCOUTFS_FREE_EXTENT_BLKNO_ZONE) { key->skfb_end = cpu_to_le64(start + len - 1); key->skfb_len = cpu_to_le64(len); - } else if (type == SCOUTFS_FREE_EXTENT_LEN_TYPE) { - key->skfl_neglen = cpu_to_le64(-len); - key->skfl_blkno = cpu_to_le64(start); + } else if (zone == SCOUTFS_FREE_EXTENT_ORDER_ZONE) { + key->skfo_revord = cpu_to_le64(U64_MAX - free_extent_order(len)); + key->skfo_end = cpu_to_le64(start + len - 1); + key->skfo_len = cpu_to_le64(len); } else { BUG(); } @@ -93,23 +120,27 @@ static void init_ext_key(struct scoutfs_key *key, int type, u64 start, u64 len) static void ext_from_key(struct scoutfs_extent *ext, struct scoutfs_key *key) { - if (key->sk_type == SCOUTFS_FREE_EXTENT_BLKNO_TYPE) { + if (key->sk_zone == SCOUTFS_FREE_EXTENT_BLKNO_ZONE) { ext->start = le64_to_cpu(key->skfb_end) - le64_to_cpu(key->skfb_len) + 1; ext->len = le64_to_cpu(key->skfb_len); } else { - ext->start = le64_to_cpu(key->skfl_blkno); - ext->len = -le64_to_cpu(key->skfl_neglen); + ext->start = le64_to_cpu(key->skfo_end) - + le64_to_cpu(key->skfo_len) + 1; + ext->len = le64_to_cpu(key->skfo_len); } ext->map = 0; ext->flags = 0; + + /* we never store 0 length extents */ + WARN_ON_ONCE(ext->len == 0); } struct alloc_ext_args { struct scoutfs_alloc *alloc; struct scoutfs_block_writer *wri; struct scoutfs_alloc_root *root; - int type; + int zone; }; static int alloc_ext_next(struct super_block *sb, void *arg, @@ -120,13 +151,13 @@ static int alloc_ext_next(struct super_block *sb, void *arg, struct scoutfs_key key; int ret; - init_ext_key(&key, args->type, start, len); 
+ init_ext_key(&key, args->zone, start, len); ret = scoutfs_btree_next(sb, &args->root->root, &key, &iref); if (ret == 0) { if (iref.val_len != 0) ret = -EIO; - else if (iref.key->sk_type != args->type) + else if (iref.key->sk_zone != args->zone) ret = -ENOENT; else ext_from_key(ext, iref.key); @@ -139,19 +170,19 @@ static int alloc_ext_next(struct super_block *sb, void *arg, return ret; } -static int other_type(int type) +static int other_zone(int zone) { - if (type == SCOUTFS_FREE_EXTENT_BLKNO_TYPE) - return SCOUTFS_FREE_EXTENT_LEN_TYPE; - else if (type == SCOUTFS_FREE_EXTENT_LEN_TYPE) - return SCOUTFS_FREE_EXTENT_BLKNO_TYPE; + if (zone == SCOUTFS_FREE_EXTENT_BLKNO_ZONE) + return SCOUTFS_FREE_EXTENT_ORDER_ZONE; + else if (zone == SCOUTFS_FREE_EXTENT_ORDER_ZONE) + return SCOUTFS_FREE_EXTENT_BLKNO_ZONE; else BUG(); } /* * Insert an extent along with its matching item which is indexed by - * opposite of its len or blkno. If we succeed we update the root's + * the opposite of its order or blkno. If we succeed we update the root's * record of the total length of all the stored extents. 
 */ static int alloc_ext_insert(struct super_block *sb, void *arg, @@ -167,8 +198,8 @@ static int alloc_ext_insert(struct super_block *sb, void *arg, if (WARN_ON_ONCE(map || flags)) return -EINVAL; - init_ext_key(&key, args->type, start, len); - init_ext_key(&other, other_type(args->type), start, len); + init_ext_key(&key, args->zone, start, len); + init_ext_key(&other, other_zone(args->zone), start, len); ret = scoutfs_btree_insert(sb, args->alloc, args->wri, &args->root->root, &key, NULL, 0); @@ -196,8 +227,8 @@ static int alloc_ext_remove(struct super_block *sb, void *arg, int ret; int err; - init_ext_key(&key, args->type, start, len); - init_ext_key(&other, other_type(args->type), start, len); + init_ext_key(&key, args->zone, start, len); + init_ext_key(&other, other_zone(args->zone), start, len); ret = scoutfs_btree_delete(sb, args->alloc, args->wri, &args->root->root, &key); @@ -619,7 +650,7 @@ int scoutfs_dalloc_return_cached(struct super_block *sb, .alloc = alloc, .wri = wri, .root = &dalloc->root, - .type = SCOUTFS_FREE_EXTENT_BLKNO_TYPE, + .zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE, }; int ret = 0; @@ -655,7 +686,7 @@ int scoutfs_alloc_data(struct super_block *sb, struct scoutfs_alloc *alloc, .alloc = alloc, .wri = wri, .root = &dalloc->root, - .type = SCOUTFS_FREE_EXTENT_LEN_TYPE, + .zone = SCOUTFS_FREE_EXTENT_ORDER_ZONE, }; struct scoutfs_extent ext; u64 len; @@ -728,7 +759,7 @@ int scoutfs_free_data(struct super_block *sb, struct scoutfs_alloc *alloc, .alloc = alloc, .wri = wri, .root = root, - .type = SCOUTFS_FREE_EXTENT_BLKNO_TYPE, + .zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE, }; int ret; @@ -772,19 +803,19 @@ int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc, while (moved < total) { args.root = src; - args.type = SCOUTFS_FREE_EXTENT_LEN_TYPE; + args.zone = SCOUTFS_FREE_EXTENT_ORDER_ZONE; ret = scoutfs_ext_alloc(sb, &alloc_ext_ops, &args, 0, 0, total - moved, &ext); if (ret < 0) break; args.root = dst; - args.type = 
SCOUTFS_FREE_EXTENT_BLKNO_TYPE; + args.zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE; ret = scoutfs_ext_insert(sb, &alloc_ext_ops, &args, ext.start, ext.len, ext.map, ext.flags); if (ret < 0) { args.root = src; - args.type = SCOUTFS_FREE_EXTENT_BLKNO_TYPE; + args.zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE; err = scoutfs_ext_insert(sb, &alloc_ext_ops, &args, ext.start, ext.len, ext.map, ext.flags); @@ -852,7 +883,7 @@ out: * a list block and all the btree blocks that store extent items. * * At most, an extent operation can dirty down three paths of the tree - * to modify a blkno item and two distant len items. We can grow and + * to modify a blkno item and two distant order items. We can grow and * split the root, and then those three paths could share blocks but each * modify two leaf blocks. */ @@ -901,7 +932,7 @@ int scoutfs_alloc_fill_list(struct super_block *sb, .alloc = alloc, .wri = wri, .root = root, - .type = SCOUTFS_FREE_EXTENT_LEN_TYPE, + .zone = SCOUTFS_FREE_EXTENT_ORDER_ZONE, }; struct scoutfs_alloc_list_block *lblk; struct scoutfs_block *bl = NULL; @@ -958,7 +989,7 @@ int scoutfs_alloc_empty_list(struct super_block *sb, .alloc = alloc, .wri = wri, .root = root, - .type = SCOUTFS_FREE_EXTENT_BLKNO_TYPE, + .zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE, }; struct scoutfs_alloc_list_block *lblk = NULL; struct scoutfs_block *bl = NULL; diff --git a/kmod/src/format.h b/kmod/src/format.h index bf77f92a..49acf4e9 100644 --- a/kmod/src/format.h +++ b/kmod/src/format.h @@ -203,11 +203,12 @@ struct scoutfs_key { #define skmc_rid _sk_first /* free extents by blkno */ -#define skfb_end _sk_second -#define skfb_len _sk_third -/* free extents by len */ -#define skfl_neglen _sk_second -#define skfl_blkno _sk_third +#define skfb_end _sk_first +#define skfb_len _sk_second +/* free extents by order */ +#define skfo_revord _sk_first +#define skfo_end _sk_second +#define skfo_len _sk_third struct scoutfs_avl_root { __le16 node; @@ -493,7 +494,8 @@ struct scoutfs_bloom_block { #define 
SCOUTFS_TRANS_SEQ_ZONE 7 #define SCOUTFS_MOUNTED_CLIENT_ZONE 8 #define SCOUTFS_SRCH_ZONE 9 -#define SCOUTFS_FREE_EXTENT_ZONE 10 +#define SCOUTFS_FREE_EXTENT_BLKNO_ZONE 10 +#define SCOUTFS_FREE_EXTENT_ORDER_ZONE 11 /* inode index zone */ #define SCOUTFS_INODE_INDEX_META_SEQ_TYPE 1 @@ -521,10 +523,6 @@ struct scoutfs_bloom_block { #define SCOUTFS_SRCH_PENDING_TYPE 3 #define SCOUTFS_SRCH_BUSY_TYPE 4 -/* free extents in allocator btrees in client and server, by blkno or len */ -#define SCOUTFS_FREE_EXTENT_BLKNO_TYPE 1 -#define SCOUTFS_FREE_EXTENT_LEN_TYPE 2 - /* file data extents have start and len in key */ struct scoutfs_data_extent_val { __le64 blkno; diff --git a/utils/src/mkfs.c b/utils/src/mkfs.c index c646c8ae..8b293996 100644 --- a/utils/src/mkfs.c +++ b/utils/src/mkfs.c @@ -57,6 +57,15 @@ static int write_block(int fd, u32 magic, __le64 fsid, u64 seq, u64 blkno, return 0; } +/* + * Return the order of the length of a free extent, which we define as + * floor(log_8_(len)): 0..7 = 0, 8..63 = 1, etc. Must match alloc.c in kmod. + */ +static u64 free_extent_order(u64 len) +{ + return (flsll(len | 1) - 1) / 3; +} + /* * Write the single btree block that contains the blkno and len indexed * items to store the given extent, and update the root to point to it. 
@@ -72,19 +81,16 @@ static int write_alloc_root(int fd, __le64 fsid, root->total_len = cpu_to_le64(len); memset(&key, 0, sizeof(key)); - key.sk_zone = SCOUTFS_FREE_EXTENT_ZONE; - key.sk_type = SCOUTFS_FREE_EXTENT_BLKNO_TYPE; - key.skii_ino = cpu_to_le64(SCOUTFS_ROOT_INO); + key.sk_zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE; key.skfb_end = cpu_to_le64(start + len - 1); key.skfb_len = cpu_to_le64(len); btree_append_item(bt, &key, NULL, 0); memset(&key, 0, sizeof(key)); - key.sk_zone = SCOUTFS_FREE_EXTENT_ZONE; - key.sk_type = SCOUTFS_FREE_EXTENT_LEN_TYPE; - key.skii_ino = cpu_to_le64(SCOUTFS_ROOT_INO); - key.skfl_neglen = cpu_to_le64(-len); - key.skfl_blkno = cpu_to_le64(start); + key.sk_zone = SCOUTFS_FREE_EXTENT_ORDER_ZONE; + key.skfo_revord = cpu_to_le64(U64_MAX - free_extent_order(len)); + key.skfo_end = cpu_to_le64(start + len - 1); + key.skfo_len = cpu_to_le64(len); btree_append_item(bt, &key, NULL, 0); return write_block(fd, SCOUTFS_BLOCK_MAGIC_BTREE, fsid, seq, blkno, diff --git a/utils/src/print.c b/utils/src/print.c index e08f2741..6e82251b 100644 --- a/utils/src/print.c +++ b/utils/src/print.c @@ -362,17 +362,17 @@ static int print_mounted_client_entry(struct scoutfs_key *key, void *val, static int print_alloc_item(struct scoutfs_key *key, void *val, unsigned val_len, void *arg) { - if (key->sk_type == SCOUTFS_FREE_EXTENT_BLKNO_TYPE) + if (key->sk_zone == SCOUTFS_FREE_EXTENT_BLKNO_ZONE) printf(" free extent: blkno %llu len %llu end %llu\n", le64_to_cpu(key->skfb_end) - le64_to_cpu(key->skfb_len) + 1, le64_to_cpu(key->skfb_len), le64_to_cpu(key->skfb_end)); else - printf(" free extent: blkno %llu len %llu neglen %lld\n", - le64_to_cpu(key->skfl_blkno), - -le64_to_cpu(key->skfl_neglen), - (long long)le64_to_cpu(key->skfl_neglen)); + printf(" free extent: blkno %llu len %llu order %llu\n", + le64_to_cpu(key->skfo_end) - le64_to_cpu(key->skfo_len) + 1, + le64_to_cpu(key->skfo_len), + (unsigned long long)(U64_MAX - le64_to_cpu(key->skfo_revord))); /* revord is U64_MAX - order */ return 0; }