mirror of
https://github.com/versity/scoutfs.git
synced 2026-01-08 04:55:21 +00:00
Index free extents by order of length
Allocators store free extents in two items, one sorted by their blkno position and the other by their precise length. The length index makes it easy to search for precise extent lengths, but it makes it hard to search for a large extent within a given blkno region. Skipping in the blkno dimension has to be done for every precise length value. We don't need that level of precision. If we index the extents by a coarser order of the length then we have a fixed number of orders in which we have to skip in the blkno dimension when searching within a specific region. This changes the length item to be stored at the log base 8 order of the length of the extents. This groups extents into orders that are close to the human-friendly base 10 orders of magnitude. With this change the order field in the key no longer stores the precise extent length. To preserve the length of the extent we need to use another field. The only 64-bit field remaining is the first, which has a higher comparison priority than the type. So we use the highest comparison priority zone field to differentiate the position and order indexes and can now use all three 64-bit fields in the key. Finally, we have to be careful when constructing a key to use _next when searching for a large extent. Previously keys were relying on the magic property that building a key from an extent length of 0 ended up at the key value -0 = 0. That only worked because we never stored zero length extents. We now store extents whose order is zero, so we can't use the negative trick anymore. We explicitly treat 0 length extents carefully when building keys and we subtract the order from U64_MAX to store the orders from largest to smallest. Signed-off-by: Zach Brown <zab@versity.com>
This commit is contained in:
109
kmod/src/alloc.c
109
kmod/src/alloc.c
@@ -29,8 +29,8 @@
|
||||
* The core allocator uses extent items in btrees rooted in the super.
|
||||
* Each free extent is stored in two items. The first item is indexed
|
||||
* by block location and is used to merge adjacent extents when freeing.
|
||||
* The second item is indexed by length and is used to find large
|
||||
* extents to allocate from.
|
||||
* The second item is indexed by the order of the length and is used to
|
||||
* find large extents to allocate from.
|
||||
*
|
||||
* Free extent always consumes the front of the largest extent. This
|
||||
* attempts to discourage fragmentation by giving smaller freed extents
|
||||
@@ -67,25 +67,52 @@
|
||||
*/
|
||||
|
||||
/*
|
||||
* Free extents don't have flags and are stored in two indexes sorted by
|
||||
* block location and by length, largest first. The block location key
|
||||
* is set to the final block in the extent so that we can find
|
||||
* intersections by calling _next() iterators starting with the block
|
||||
* we're searching for.
|
||||
* Return the order of the length of a free extent, which we define as
|
||||
* floor(log_8_(len)): 0..7 = 0, 8..63 = 1, etc.
|
||||
*/
|
||||
static void init_ext_key(struct scoutfs_key *key, int type, u64 start, u64 len)
|
||||
static u64 free_extent_order(u64 len)
|
||||
{
|
||||
return (fls64(len | 1) - 1) / 3;
|
||||
}
|
||||
|
||||
/*
|
||||
* The smallest (non-zero) length that will be mapped to the same order
|
||||
* as the given length.
|
||||
*/
|
||||
static u64 smallest_order_length(u64 len)
|
||||
{
|
||||
return 1ULL << (free_extent_order(len) * 3);
|
||||
}
|
||||
|
||||
/*
|
||||
* Free extents don't have flags and are stored in two indexes sorted by
|
||||
* block location and by length order, largest first. The location key
|
||||
* field is set to the final block in the extent so that we can find
|
||||
* intersections by calling _next() with the start of the range we're
|
||||
* searching for.
|
||||
*
|
||||
* We never store 0 length extents but we do build keys for searching
|
||||
* the order index from 0,0 without having to map it to a real extent.
|
||||
*/
|
||||
static void init_ext_key(struct scoutfs_key *key, int zone, u64 start, u64 len)
|
||||
{
|
||||
*key = (struct scoutfs_key) {
|
||||
.sk_zone = SCOUTFS_FREE_EXTENT_ZONE,
|
||||
.sk_type = type,
|
||||
.sk_zone = zone,
|
||||
};
|
||||
|
||||
if (type == SCOUTFS_FREE_EXTENT_BLKNO_TYPE) {
|
||||
if (len == 0) {
|
||||
/* we only use 0 len extents for magic 0,0 order lookups */
|
||||
WARN_ON_ONCE(zone != SCOUTFS_FREE_EXTENT_ORDER_ZONE || start != 0);
|
||||
return;
|
||||
}
|
||||
|
||||
if (zone == SCOUTFS_FREE_EXTENT_BLKNO_ZONE) {
|
||||
key->skfb_end = cpu_to_le64(start + len - 1);
|
||||
key->skfb_len = cpu_to_le64(len);
|
||||
} else if (type == SCOUTFS_FREE_EXTENT_LEN_TYPE) {
|
||||
key->skfl_neglen = cpu_to_le64(-len);
|
||||
key->skfl_blkno = cpu_to_le64(start);
|
||||
} else if (zone == SCOUTFS_FREE_EXTENT_ORDER_ZONE) {
|
||||
key->skfo_revord = cpu_to_le64(U64_MAX - free_extent_order(len));
|
||||
key->skfo_end = cpu_to_le64(start + len - 1);
|
||||
key->skfo_len = cpu_to_le64(len);
|
||||
} else {
|
||||
BUG();
|
||||
}
|
||||
@@ -93,23 +120,27 @@ static void init_ext_key(struct scoutfs_key *key, int type, u64 start, u64 len)
|
||||
|
||||
static void ext_from_key(struct scoutfs_extent *ext, struct scoutfs_key *key)
|
||||
{
|
||||
if (key->sk_type == SCOUTFS_FREE_EXTENT_BLKNO_TYPE) {
|
||||
if (key->sk_zone == SCOUTFS_FREE_EXTENT_BLKNO_ZONE) {
|
||||
ext->start = le64_to_cpu(key->skfb_end) -
|
||||
le64_to_cpu(key->skfb_len) + 1;
|
||||
ext->len = le64_to_cpu(key->skfb_len);
|
||||
} else {
|
||||
ext->start = le64_to_cpu(key->skfl_blkno);
|
||||
ext->len = -le64_to_cpu(key->skfl_neglen);
|
||||
ext->start = le64_to_cpu(key->skfo_end) -
|
||||
le64_to_cpu(key->skfo_len) + 1;
|
||||
ext->len = le64_to_cpu(key->skfo_len);
|
||||
}
|
||||
ext->map = 0;
|
||||
ext->flags = 0;
|
||||
|
||||
/* we never store 0 length extents */
|
||||
WARN_ON_ONCE(ext->len == 0);
|
||||
}
|
||||
|
||||
struct alloc_ext_args {
|
||||
struct scoutfs_alloc *alloc;
|
||||
struct scoutfs_block_writer *wri;
|
||||
struct scoutfs_alloc_root *root;
|
||||
int type;
|
||||
int zone;
|
||||
};
|
||||
|
||||
static int alloc_ext_next(struct super_block *sb, void *arg,
|
||||
@@ -120,13 +151,13 @@ static int alloc_ext_next(struct super_block *sb, void *arg,
|
||||
struct scoutfs_key key;
|
||||
int ret;
|
||||
|
||||
init_ext_key(&key, args->type, start, len);
|
||||
init_ext_key(&key, args->zone, start, len);
|
||||
|
||||
ret = scoutfs_btree_next(sb, &args->root->root, &key, &iref);
|
||||
if (ret == 0) {
|
||||
if (iref.val_len != 0)
|
||||
ret = -EIO;
|
||||
else if (iref.key->sk_type != args->type)
|
||||
else if (iref.key->sk_zone != args->zone)
|
||||
ret = -ENOENT;
|
||||
else
|
||||
ext_from_key(ext, iref.key);
|
||||
@@ -139,19 +170,19 @@ static int alloc_ext_next(struct super_block *sb, void *arg,
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int other_type(int type)
|
||||
static int other_zone(int zone)
|
||||
{
|
||||
if (type == SCOUTFS_FREE_EXTENT_BLKNO_TYPE)
|
||||
return SCOUTFS_FREE_EXTENT_LEN_TYPE;
|
||||
else if (type == SCOUTFS_FREE_EXTENT_LEN_TYPE)
|
||||
return SCOUTFS_FREE_EXTENT_BLKNO_TYPE;
|
||||
if (zone == SCOUTFS_FREE_EXTENT_BLKNO_ZONE)
|
||||
return SCOUTFS_FREE_EXTENT_ORDER_ZONE;
|
||||
else if (zone == SCOUTFS_FREE_EXTENT_ORDER_ZONE)
|
||||
return SCOUTFS_FREE_EXTENT_BLKNO_ZONE;
|
||||
else
|
||||
BUG();
|
||||
}
|
||||
|
||||
/*
|
||||
* Insert an extent along with its matching item which is indexed by
|
||||
* opposite of its len or blkno. If we succeed we update the root's
|
||||
* opposite of its order or blkno. If we succeed we update the root's
|
||||
* record of the total length of all the stored extents.
|
||||
*/
|
||||
static int alloc_ext_insert(struct super_block *sb, void *arg,
|
||||
@@ -167,8 +198,8 @@ static int alloc_ext_insert(struct super_block *sb, void *arg,
|
||||
if (WARN_ON_ONCE(map || flags))
|
||||
return -EINVAL;
|
||||
|
||||
init_ext_key(&key, args->type, start, len);
|
||||
init_ext_key(&other, other_type(args->type), start, len);
|
||||
init_ext_key(&key, args->zone, start, len);
|
||||
init_ext_key(&other, other_zone(args->zone), start, len);
|
||||
|
||||
ret = scoutfs_btree_insert(sb, args->alloc, args->wri,
|
||||
&args->root->root, &key, NULL, 0);
|
||||
@@ -196,8 +227,8 @@ static int alloc_ext_remove(struct super_block *sb, void *arg,
|
||||
int ret;
|
||||
int err;
|
||||
|
||||
init_ext_key(&key, args->type, start, len);
|
||||
init_ext_key(&other, other_type(args->type), start, len);
|
||||
init_ext_key(&key, args->zone, start, len);
|
||||
init_ext_key(&other, other_zone(args->zone), start, len);
|
||||
|
||||
ret = scoutfs_btree_delete(sb, args->alloc, args->wri,
|
||||
&args->root->root, &key);
|
||||
@@ -619,7 +650,7 @@ int scoutfs_dalloc_return_cached(struct super_block *sb,
|
||||
.alloc = alloc,
|
||||
.wri = wri,
|
||||
.root = &dalloc->root,
|
||||
.type = SCOUTFS_FREE_EXTENT_BLKNO_TYPE,
|
||||
.zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE,
|
||||
};
|
||||
int ret = 0;
|
||||
|
||||
@@ -655,7 +686,7 @@ int scoutfs_alloc_data(struct super_block *sb, struct scoutfs_alloc *alloc,
|
||||
.alloc = alloc,
|
||||
.wri = wri,
|
||||
.root = &dalloc->root,
|
||||
.type = SCOUTFS_FREE_EXTENT_LEN_TYPE,
|
||||
.zone = SCOUTFS_FREE_EXTENT_ORDER_ZONE,
|
||||
};
|
||||
struct scoutfs_extent ext;
|
||||
u64 len;
|
||||
@@ -728,7 +759,7 @@ int scoutfs_free_data(struct super_block *sb, struct scoutfs_alloc *alloc,
|
||||
.alloc = alloc,
|
||||
.wri = wri,
|
||||
.root = root,
|
||||
.type = SCOUTFS_FREE_EXTENT_BLKNO_TYPE,
|
||||
.zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE,
|
||||
};
|
||||
int ret;
|
||||
|
||||
@@ -772,19 +803,19 @@ int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
|
||||
|
||||
while (moved < total) {
|
||||
args.root = src;
|
||||
args.type = SCOUTFS_FREE_EXTENT_LEN_TYPE;
|
||||
args.zone = SCOUTFS_FREE_EXTENT_ORDER_ZONE;
|
||||
ret = scoutfs_ext_alloc(sb, &alloc_ext_ops, &args,
|
||||
0, 0, total - moved, &ext);
|
||||
if (ret < 0)
|
||||
break;
|
||||
|
||||
args.root = dst;
|
||||
args.type = SCOUTFS_FREE_EXTENT_BLKNO_TYPE;
|
||||
args.zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE;
|
||||
ret = scoutfs_ext_insert(sb, &alloc_ext_ops, &args, ext.start,
|
||||
ext.len, ext.map, ext.flags);
|
||||
if (ret < 0) {
|
||||
args.root = src;
|
||||
args.type = SCOUTFS_FREE_EXTENT_BLKNO_TYPE;
|
||||
args.zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE;
|
||||
err = scoutfs_ext_insert(sb, &alloc_ext_ops, &args,
|
||||
ext.start, ext.len, ext.map,
|
||||
ext.flags);
|
||||
@@ -852,7 +883,7 @@ out:
|
||||
* a list block and all the btree blocks that store extent items.
|
||||
*
|
||||
* At most, an extent operation can dirty down three paths of the tree
|
||||
* to modify a blkno item and two distant len items. We can grow and
|
||||
* to modify a blkno item and two distant order items. We can grow and
|
||||
* split the root, and then those three paths could share blocks but each
|
||||
* modify two leaf blocks.
|
||||
*/
|
||||
@@ -901,7 +932,7 @@ int scoutfs_alloc_fill_list(struct super_block *sb,
|
||||
.alloc = alloc,
|
||||
.wri = wri,
|
||||
.root = root,
|
||||
.type = SCOUTFS_FREE_EXTENT_LEN_TYPE,
|
||||
.zone = SCOUTFS_FREE_EXTENT_ORDER_ZONE,
|
||||
};
|
||||
struct scoutfs_alloc_list_block *lblk;
|
||||
struct scoutfs_block *bl = NULL;
|
||||
@@ -958,7 +989,7 @@ int scoutfs_alloc_empty_list(struct super_block *sb,
|
||||
.alloc = alloc,
|
||||
.wri = wri,
|
||||
.root = root,
|
||||
.type = SCOUTFS_FREE_EXTENT_BLKNO_TYPE,
|
||||
.zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE,
|
||||
};
|
||||
struct scoutfs_alloc_list_block *lblk = NULL;
|
||||
struct scoutfs_block *bl = NULL;
|
||||
|
||||
@@ -203,11 +203,12 @@ struct scoutfs_key {
|
||||
#define skmc_rid _sk_first
|
||||
|
||||
/* free extents by blkno */
|
||||
#define skfb_end _sk_second
|
||||
#define skfb_len _sk_third
|
||||
/* free extents by len */
|
||||
#define skfl_neglen _sk_second
|
||||
#define skfl_blkno _sk_third
|
||||
#define skfb_end _sk_first
|
||||
#define skfb_len _sk_second
|
||||
/* free extents by order */
|
||||
#define skfo_revord _sk_first
|
||||
#define skfo_end _sk_second
|
||||
#define skfo_len _sk_third
|
||||
|
||||
struct scoutfs_avl_root {
|
||||
__le16 node;
|
||||
@@ -493,7 +494,8 @@ struct scoutfs_bloom_block {
|
||||
#define SCOUTFS_TRANS_SEQ_ZONE 7
|
||||
#define SCOUTFS_MOUNTED_CLIENT_ZONE 8
|
||||
#define SCOUTFS_SRCH_ZONE 9
|
||||
#define SCOUTFS_FREE_EXTENT_ZONE 10
|
||||
#define SCOUTFS_FREE_EXTENT_BLKNO_ZONE 10
|
||||
#define SCOUTFS_FREE_EXTENT_ORDER_ZONE 11
|
||||
|
||||
/* inode index zone */
|
||||
#define SCOUTFS_INODE_INDEX_META_SEQ_TYPE 1
|
||||
@@ -521,10 +523,6 @@ struct scoutfs_bloom_block {
|
||||
#define SCOUTFS_SRCH_PENDING_TYPE 3
|
||||
#define SCOUTFS_SRCH_BUSY_TYPE 4
|
||||
|
||||
/* free extents in allocator btrees in client and server, by blkno or len */
|
||||
#define SCOUTFS_FREE_EXTENT_BLKNO_TYPE 1
|
||||
#define SCOUTFS_FREE_EXTENT_LEN_TYPE 2
|
||||
|
||||
/* file data extents have start and len in key */
|
||||
struct scoutfs_data_extent_val {
|
||||
__le64 blkno;
|
||||
|
||||
@@ -57,6 +57,15 @@ static int write_block(int fd, u32 magic, __le64 fsid, u64 seq, u64 blkno,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return the order of the length of a free extent, which we define as
|
||||
* floor(log_8_(len)): 0..7 = 0, 8..63 = 1, etc.
|
||||
*/
|
||||
static u64 free_extent_order(u64 len)
|
||||
{
|
||||
return (flsll(len | 1) - 1) / 3;
|
||||
}
|
||||
|
||||
/*
|
||||
* Write the single btree block that contains the blkno and order indexed
|
||||
* items to store the given extent, and update the root to point to it.
|
||||
@@ -72,19 +81,16 @@ static int write_alloc_root(int fd, __le64 fsid,
|
||||
root->total_len = cpu_to_le64(len);
|
||||
|
||||
memset(&key, 0, sizeof(key));
|
||||
key.sk_zone = SCOUTFS_FREE_EXTENT_ZONE;
|
||||
key.sk_type = SCOUTFS_FREE_EXTENT_BLKNO_TYPE;
|
||||
key.skii_ino = cpu_to_le64(SCOUTFS_ROOT_INO);
|
||||
key.sk_zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE;
|
||||
key.skfb_end = cpu_to_le64(start + len - 1);
|
||||
key.skfb_len = cpu_to_le64(len);
|
||||
btree_append_item(bt, &key, NULL, 0);
|
||||
|
||||
memset(&key, 0, sizeof(key));
|
||||
key.sk_zone = SCOUTFS_FREE_EXTENT_ZONE;
|
||||
key.sk_type = SCOUTFS_FREE_EXTENT_LEN_TYPE;
|
||||
key.skii_ino = cpu_to_le64(SCOUTFS_ROOT_INO);
|
||||
key.skfl_neglen = cpu_to_le64(-len);
|
||||
key.skfl_blkno = cpu_to_le64(start);
|
||||
key.sk_zone = SCOUTFS_FREE_EXTENT_ORDER_ZONE;
|
||||
key.skfo_revord = cpu_to_le64(U64_MAX - free_extent_order(len));
|
||||
key.skfo_end = cpu_to_le64(start + len - 1);
|
||||
key.skfo_len = cpu_to_le64(len);
|
||||
btree_append_item(bt, &key, NULL, 0);
|
||||
|
||||
return write_block(fd, SCOUTFS_BLOCK_MAGIC_BTREE, fsid, seq, blkno,
|
||||
|
||||
@@ -362,17 +362,17 @@ static int print_mounted_client_entry(struct scoutfs_key *key, void *val,
|
||||
static int print_alloc_item(struct scoutfs_key *key, void *val,
|
||||
unsigned val_len, void *arg)
|
||||
{
|
||||
if (key->sk_type == SCOUTFS_FREE_EXTENT_BLKNO_TYPE)
|
||||
if (key->sk_zone == SCOUTFS_FREE_EXTENT_BLKNO_ZONE)
|
||||
printf(" free extent: blkno %llu len %llu end %llu\n",
|
||||
le64_to_cpu(key->skfb_end) -
|
||||
le64_to_cpu(key->skfb_len) + 1,
|
||||
le64_to_cpu(key->skfb_len),
|
||||
le64_to_cpu(key->skfb_end));
|
||||
else
|
||||
printf(" free extent: blkno %llu len %llu neglen %lld\n",
|
||||
le64_to_cpu(key->skfl_blkno),
|
||||
-le64_to_cpu(key->skfl_neglen),
|
||||
(long long)le64_to_cpu(key->skfl_neglen));
|
||||
printf(" free extent: blkno %llu len %llu order %llu\n",
|
||||
le64_to_cpu(key->skfo_end) - le64_to_cpu(key->skfo_len) + 1,
|
||||
le64_to_cpu(key->skfo_len),
|
||||
(long long)(U64_MAX - le64_to_cpu(key->skfo_revord)));
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user