mirror of
https://github.com/versity/scoutfs.git
synced 2026-01-08 21:03:12 +00:00
Merge pull request #40 from versity/zab/data_alloc_zones
Zab/data alloc zones
This commit is contained in:
@@ -42,6 +42,7 @@ scoutfs-y += \
|
||||
trans.o \
|
||||
triggers.o \
|
||||
tseq.o \
|
||||
volopt.o \
|
||||
xattr.o
|
||||
|
||||
#
|
||||
|
||||
316
kmod/src/alloc.c
316
kmod/src/alloc.c
@@ -29,8 +29,8 @@
|
||||
* The core allocator uses extent items in btrees rooted in the super.
|
||||
* Each free extent is stored in two items. The first item is indexed
|
||||
* by block location and is used to merge adjacent extents when freeing.
|
||||
* The second item is indexed by length and is used to find large
|
||||
* extents to allocate from.
|
||||
* The second item is indexed by the order of the length and is used to
|
||||
* find large extents to allocate from.
|
||||
*
|
||||
* Free extent always consumes the front of the largest extent. This
|
||||
* attempts to discourage fragmentation by giving smaller freed extents
|
||||
@@ -67,25 +67,52 @@
|
||||
*/
|
||||
|
||||
/*
|
||||
* Free extents don't have flags and are stored in two indexes sorted by
|
||||
* block location and by length, largest first. The block location key
|
||||
* is set to the final block in the extent so that we can find
|
||||
* intersections by calling _next() iterators starting with the block
|
||||
* we're searching for.
|
||||
* Return the order of the length of a free extent, which we define as
|
||||
* floor(log_8_(len)): 0..7 = 0, 8..63 = 1, etc.
|
||||
*/
|
||||
static void init_ext_key(struct scoutfs_key *key, int type, u64 start, u64 len)
|
||||
/* Order of a free extent length: floor(log8(len)), with a length of 0 treated as order 0. */
static u64 free_extent_order(u64 len)
{
	/* or-ing in the low bit gives fls64() a set bit even for 0 without changing the order */
	int high_bit = fls64(len | 1) - 1;

	/* each order spans three bits of length: 0..7 = 0, 8..63 = 1, ... */
	return high_bit / 3;
}
|
||||
|
||||
/*
|
||||
* The smallest (non-zero) length that will be mapped to the same order
|
||||
* as the given length.
|
||||
*/
|
||||
static u64 smallest_order_length(u64 len)
|
||||
{
|
||||
return 1ULL << (free_extent_order(len) * 3);
|
||||
}
|
||||
|
||||
/*
|
||||
* Free extents don't have flags and are stored in two indexes sorted by
|
||||
* block location and by length order, largest first. The location key
|
||||
* field is set to the final block in the extent so that we can find
|
||||
* intersections by calling _next() with the start of the range we're
|
||||
* searching for.
|
||||
*
|
||||
* We never store 0 length extents but we do build keys for searching
|
||||
* the order index from 0,0 without having to map it to a real extent.
|
||||
*/
|
||||
static void init_ext_key(struct scoutfs_key *key, int zone, u64 start, u64 len)
|
||||
{
|
||||
*key = (struct scoutfs_key) {
|
||||
.sk_zone = SCOUTFS_FREE_EXTENT_ZONE,
|
||||
.sk_type = type,
|
||||
.sk_zone = zone,
|
||||
};
|
||||
|
||||
if (type == SCOUTFS_FREE_EXTENT_BLKNO_TYPE) {
|
||||
if (len == 0) {
|
||||
/* we only use 0 len extents for magic 0,0 order lookups */
|
||||
WARN_ON_ONCE(zone != SCOUTFS_FREE_EXTENT_ORDER_ZONE || start != 0);
|
||||
return;
|
||||
}
|
||||
|
||||
if (zone == SCOUTFS_FREE_EXTENT_BLKNO_ZONE) {
|
||||
key->skfb_end = cpu_to_le64(start + len - 1);
|
||||
key->skfb_len = cpu_to_le64(len);
|
||||
} else if (type == SCOUTFS_FREE_EXTENT_LEN_TYPE) {
|
||||
key->skfl_neglen = cpu_to_le64(-len);
|
||||
key->skfl_blkno = cpu_to_le64(start);
|
||||
} else if (zone == SCOUTFS_FREE_EXTENT_ORDER_ZONE) {
|
||||
key->skfo_revord = cpu_to_le64(U64_MAX - free_extent_order(len));
|
||||
key->skfo_end = cpu_to_le64(start + len - 1);
|
||||
key->skfo_len = cpu_to_le64(len);
|
||||
} else {
|
||||
BUG();
|
||||
}
|
||||
@@ -93,23 +120,27 @@ static void init_ext_key(struct scoutfs_key *key, int type, u64 start, u64 len)
|
||||
|
||||
static void ext_from_key(struct scoutfs_extent *ext, struct scoutfs_key *key)
|
||||
{
|
||||
if (key->sk_type == SCOUTFS_FREE_EXTENT_BLKNO_TYPE) {
|
||||
if (key->sk_zone == SCOUTFS_FREE_EXTENT_BLKNO_ZONE) {
|
||||
ext->start = le64_to_cpu(key->skfb_end) -
|
||||
le64_to_cpu(key->skfb_len) + 1;
|
||||
ext->len = le64_to_cpu(key->skfb_len);
|
||||
} else {
|
||||
ext->start = le64_to_cpu(key->skfl_blkno);
|
||||
ext->len = -le64_to_cpu(key->skfl_neglen);
|
||||
ext->start = le64_to_cpu(key->skfo_end) -
|
||||
le64_to_cpu(key->skfo_len) + 1;
|
||||
ext->len = le64_to_cpu(key->skfo_len);
|
||||
}
|
||||
ext->map = 0;
|
||||
ext->flags = 0;
|
||||
|
||||
/* we never store 0 length extents */
|
||||
WARN_ON_ONCE(ext->len == 0);
|
||||
}
|
||||
|
||||
struct alloc_ext_args {
|
||||
struct scoutfs_alloc *alloc;
|
||||
struct scoutfs_block_writer *wri;
|
||||
struct scoutfs_alloc_root *root;
|
||||
int type;
|
||||
int zone;
|
||||
};
|
||||
|
||||
static int alloc_ext_next(struct super_block *sb, void *arg,
|
||||
@@ -120,13 +151,13 @@ static int alloc_ext_next(struct super_block *sb, void *arg,
|
||||
struct scoutfs_key key;
|
||||
int ret;
|
||||
|
||||
init_ext_key(&key, args->type, start, len);
|
||||
init_ext_key(&key, args->zone, start, len);
|
||||
|
||||
ret = scoutfs_btree_next(sb, &args->root->root, &key, &iref);
|
||||
if (ret == 0) {
|
||||
if (iref.val_len != 0)
|
||||
ret = -EIO;
|
||||
else if (iref.key->sk_type != args->type)
|
||||
else if (iref.key->sk_zone != args->zone)
|
||||
ret = -ENOENT;
|
||||
else
|
||||
ext_from_key(ext, iref.key);
|
||||
@@ -139,19 +170,19 @@ static int alloc_ext_next(struct super_block *sb, void *arg,
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int other_type(int type)
|
||||
static int other_zone(int zone)
|
||||
{
|
||||
if (type == SCOUTFS_FREE_EXTENT_BLKNO_TYPE)
|
||||
return SCOUTFS_FREE_EXTENT_LEN_TYPE;
|
||||
else if (type == SCOUTFS_FREE_EXTENT_LEN_TYPE)
|
||||
return SCOUTFS_FREE_EXTENT_BLKNO_TYPE;
|
||||
if (zone == SCOUTFS_FREE_EXTENT_BLKNO_ZONE)
|
||||
return SCOUTFS_FREE_EXTENT_ORDER_ZONE;
|
||||
else if (zone == SCOUTFS_FREE_EXTENT_ORDER_ZONE)
|
||||
return SCOUTFS_FREE_EXTENT_BLKNO_ZONE;
|
||||
else
|
||||
BUG();
|
||||
}
|
||||
|
||||
/*
|
||||
* Insert an extent along with its matching item which is indexed by
|
||||
* opposite of its len or blkno. If we succeed we update the root's
|
||||
* opposite of its order or blkno. If we succeed we update the root's
|
||||
* record of the total length of all the stored extents.
|
||||
*/
|
||||
static int alloc_ext_insert(struct super_block *sb, void *arg,
|
||||
@@ -167,8 +198,8 @@ static int alloc_ext_insert(struct super_block *sb, void *arg,
|
||||
if (WARN_ON_ONCE(map || flags))
|
||||
return -EINVAL;
|
||||
|
||||
init_ext_key(&key, args->type, start, len);
|
||||
init_ext_key(&other, other_type(args->type), start, len);
|
||||
init_ext_key(&key, args->zone, start, len);
|
||||
init_ext_key(&other, other_zone(args->zone), start, len);
|
||||
|
||||
ret = scoutfs_btree_insert(sb, args->alloc, args->wri,
|
||||
&args->root->root, &key, NULL, 0);
|
||||
@@ -196,8 +227,8 @@ static int alloc_ext_remove(struct super_block *sb, void *arg,
|
||||
int ret;
|
||||
int err;
|
||||
|
||||
init_ext_key(&key, args->type, start, len);
|
||||
init_ext_key(&other, other_type(args->type), start, len);
|
||||
init_ext_key(&key, args->zone, start, len);
|
||||
init_ext_key(&other, other_zone(args->zone), start, len);
|
||||
|
||||
ret = scoutfs_btree_delete(sb, args->alloc, args->wri,
|
||||
&args->root->root, &key);
|
||||
@@ -619,7 +650,7 @@ int scoutfs_dalloc_return_cached(struct super_block *sb,
|
||||
.alloc = alloc,
|
||||
.wri = wri,
|
||||
.root = &dalloc->root,
|
||||
.type = SCOUTFS_FREE_EXTENT_BLKNO_TYPE,
|
||||
.zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE,
|
||||
};
|
||||
int ret = 0;
|
||||
|
||||
@@ -655,7 +686,7 @@ int scoutfs_alloc_data(struct super_block *sb, struct scoutfs_alloc *alloc,
|
||||
.alloc = alloc,
|
||||
.wri = wri,
|
||||
.root = &dalloc->root,
|
||||
.type = SCOUTFS_FREE_EXTENT_LEN_TYPE,
|
||||
.zone = SCOUTFS_FREE_EXTENT_ORDER_ZONE,
|
||||
};
|
||||
struct scoutfs_extent ext;
|
||||
u64 len;
|
||||
@@ -728,7 +759,7 @@ int scoutfs_free_data(struct super_block *sb, struct scoutfs_alloc *alloc,
|
||||
.alloc = alloc,
|
||||
.wri = wri,
|
||||
.root = root,
|
||||
.type = SCOUTFS_FREE_EXTENT_BLKNO_TYPE,
|
||||
.zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE,
|
||||
};
|
||||
int ret;
|
||||
|
||||
@@ -741,6 +772,95 @@ int scoutfs_free_data(struct super_block *sb, struct scoutfs_alloc *alloc,
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return the first zone bit that the extent intersects with.
|
||||
*/
|
||||
static int first_extent_zone(struct scoutfs_extent *ext, __le64 *zones, u64 zone_blocks)
|
||||
{
|
||||
int first;
|
||||
int last;
|
||||
int nr;
|
||||
|
||||
first = div64_u64(ext->start, zone_blocks);
|
||||
last = div64_u64(ext->start + ext->len - 1, zone_blocks);
|
||||
|
||||
nr = find_next_bit_le(zones, SCOUTFS_DATA_ALLOC_MAX_ZONES, first);
|
||||
if (nr <= last)
|
||||
return nr;
|
||||
|
||||
return SCOUTFS_DATA_ALLOC_MAX_ZONES;
|
||||
}
|
||||
|
||||
/*
|
||||
* Find an extent in specific zones to satisfy an allocation. We use
|
||||
* the order items to search for the largest extent that intersects with
|
||||
* the zones whose bits are set in the caller's bitmap.
|
||||
*/
|
||||
static int find_zone_extent(struct super_block *sb, struct scoutfs_alloc_root *root,
|
||||
__le64 *zones, u64 zone_blocks,
|
||||
struct scoutfs_extent *found_ret, u64 count,
|
||||
struct scoutfs_extent *ext_ret)
|
||||
{
|
||||
struct alloc_ext_args args = {
|
||||
.root = root,
|
||||
.zone = SCOUTFS_FREE_EXTENT_ORDER_ZONE,
|
||||
};
|
||||
struct scoutfs_extent found;
|
||||
struct scoutfs_extent ext;
|
||||
u64 start;
|
||||
u64 len;
|
||||
int nr;
|
||||
int ret;
|
||||
|
||||
/* don't bother when there are no bits set */
|
||||
if (find_next_bit_le(zones, SCOUTFS_DATA_ALLOC_MAX_ZONES, 0) ==
|
||||
SCOUTFS_DATA_ALLOC_MAX_ZONES)
|
||||
return -ENOENT;
|
||||
|
||||
/* start searching for largest extent from the first zone */
|
||||
len = smallest_order_length(SCOUTFS_BLOCK_SM_MAX);
|
||||
nr = 0;
|
||||
|
||||
for (;;) {
|
||||
/* search for extents in the next zone at our order */
|
||||
nr = find_next_bit_le(zones, SCOUTFS_DATA_ALLOC_MAX_ZONES, nr);
|
||||
if (nr >= SCOUTFS_DATA_ALLOC_MAX_ZONES) {
|
||||
/* wrap down to next smaller order if we run out of bits */
|
||||
len >>= 3;
|
||||
if (len == 0) {
|
||||
ret = -ENOENT;
|
||||
break;
|
||||
}
|
||||
nr = find_next_bit_le(zones, SCOUTFS_DATA_ALLOC_MAX_ZONES, 0);
|
||||
}
|
||||
|
||||
start = (u64)nr * zone_blocks;
|
||||
|
||||
ret = scoutfs_ext_next(sb, &alloc_ext_ops, &args, start, len, &found);
|
||||
if (ret < 0)
|
||||
break;
|
||||
|
||||
/* see if the next extent intersects any zones */
|
||||
nr = first_extent_zone(&found, zones, zone_blocks);
|
||||
if (nr < SCOUTFS_DATA_ALLOC_MAX_ZONES) {
|
||||
start = (u64)nr * zone_blocks;
|
||||
|
||||
ext.start = max(start, found.start);
|
||||
ext.len = min(count, found.start + found.len - ext.start);
|
||||
|
||||
*found_ret = found;
|
||||
*ext_ret = ext;
|
||||
ret = 0;
|
||||
break;
|
||||
}
|
||||
|
||||
/* continue searching past extent */
|
||||
nr = div64_u64(found.start + found.len - 1, zone_blocks) + 1;
|
||||
len = smallest_order_length(found.len);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Move extent items adding up to the requested total length from the
|
||||
@@ -751,6 +871,11 @@ int scoutfs_free_data(struct super_block *sb, struct scoutfs_alloc *alloc,
|
||||
* -ENOENT is returned if we run out of extents in the source tree
|
||||
* before moving the total.
|
||||
*
|
||||
* The caller can specify that extents in the source tree should first
|
||||
* be found based on their zone bitmaps. We'll first try to find
|
||||
* extents in the exclusive zones, then vacant zones, and then we'll
|
||||
* fall back to normal allocation that ignores zones.
|
||||
*
|
||||
* This first pass is not optimal because it performs full btree walks
|
||||
* per extent. We could optimize this with more clever btree item
|
||||
* manipulation functions which can iterate through src and dst blocks
|
||||
@@ -759,32 +884,77 @@ int scoutfs_free_data(struct super_block *sb, struct scoutfs_alloc *alloc,
|
||||
int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
|
||||
struct scoutfs_block_writer *wri,
|
||||
struct scoutfs_alloc_root *dst,
|
||||
struct scoutfs_alloc_root *src, u64 total)
|
||||
struct scoutfs_alloc_root *src, u64 total,
|
||||
__le64 *exclusive, __le64 *vacant, u64 zone_blocks)
|
||||
{
|
||||
struct alloc_ext_args args = {
|
||||
.alloc = alloc,
|
||||
.wri = wri,
|
||||
};
|
||||
struct scoutfs_extent found;
|
||||
struct scoutfs_extent ext;
|
||||
u64 moved = 0;
|
||||
u64 count;
|
||||
int ret = 0;
|
||||
int err;
|
||||
|
||||
if (zone_blocks == 0) {
|
||||
exclusive = NULL;
|
||||
vacant = NULL;
|
||||
}
|
||||
|
||||
while (moved < total) {
|
||||
args.root = src;
|
||||
args.type = SCOUTFS_FREE_EXTENT_LEN_TYPE;
|
||||
ret = scoutfs_ext_alloc(sb, &alloc_ext_ops, &args,
|
||||
0, 0, total - moved, &ext);
|
||||
count = total - moved;
|
||||
|
||||
if (exclusive) {
|
||||
/* first try to find extents in our exclusive zones */
|
||||
ret = find_zone_extent(sb, src, exclusive, zone_blocks,
|
||||
&found, count, &ext);
|
||||
if (ret == -ENOENT) {
|
||||
exclusive = NULL;
|
||||
continue;
|
||||
}
|
||||
} else if (vacant) {
|
||||
/* then try to find extents in vacant zones */
|
||||
ret = find_zone_extent(sb, src, vacant, zone_blocks,
|
||||
&found, count, &ext);
|
||||
if (ret == -ENOENT) {
|
||||
vacant = NULL;
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
/* otherwise fall back to finding extents anywhere */
|
||||
args.root = src;
|
||||
args.zone = SCOUTFS_FREE_EXTENT_ORDER_ZONE;
|
||||
ret = scoutfs_ext_next(sb, &alloc_ext_ops, &args, 0, 0, &found);
|
||||
if (ret == 0) {
|
||||
ext.start = found.start;
|
||||
ext.len = min(count, found.len);
|
||||
}
|
||||
}
|
||||
if (ret < 0)
|
||||
break;
|
||||
|
||||
/* searching set start/len, finish initializing alloced extent */
|
||||
ext.map = found.map ? ext.start - found.start + found.map : 0;
|
||||
ext.flags = found.flags;
|
||||
|
||||
/* remove the allocation from the found extent */
|
||||
args.root = src;
|
||||
args.zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE;
|
||||
ret = scoutfs_ext_remove(sb, &alloc_ext_ops, &args, ext.start, ext.len);
|
||||
if (ret < 0)
|
||||
break;
|
||||
|
||||
/* insert the allocated extent into the dest */
|
||||
args.root = dst;
|
||||
args.type = SCOUTFS_FREE_EXTENT_BLKNO_TYPE;
|
||||
args.zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE;
|
||||
ret = scoutfs_ext_insert(sb, &alloc_ext_ops, &args, ext.start,
|
||||
ext.len, ext.map, ext.flags);
|
||||
if (ret < 0) {
|
||||
/* and put it back in src if insertion failed */
|
||||
args.root = src;
|
||||
args.type = SCOUTFS_FREE_EXTENT_BLKNO_TYPE;
|
||||
args.zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE;
|
||||
err = scoutfs_ext_insert(sb, &alloc_ext_ops, &args,
|
||||
ext.start, ext.len, ext.map,
|
||||
ext.flags);
|
||||
@@ -852,7 +1022,7 @@ out:
|
||||
* a list block and all the btree blocks that store extent items.
|
||||
*
|
||||
* At most, an extent operation can dirty down three paths of the tree
|
||||
* to modify a blkno item and two distant len items. We can grow and
|
||||
* to modify a blkno item and two distant order items. We can grow and
|
||||
* split the root, and then those three paths could share blocks but each
|
||||
* modify two leaf blocks.
|
||||
*/
|
||||
@@ -901,7 +1071,7 @@ int scoutfs_alloc_fill_list(struct super_block *sb,
|
||||
.alloc = alloc,
|
||||
.wri = wri,
|
||||
.root = root,
|
||||
.type = SCOUTFS_FREE_EXTENT_LEN_TYPE,
|
||||
.zone = SCOUTFS_FREE_EXTENT_ORDER_ZONE,
|
||||
};
|
||||
struct scoutfs_alloc_list_block *lblk;
|
||||
struct scoutfs_block *bl = NULL;
|
||||
@@ -958,7 +1128,7 @@ int scoutfs_alloc_empty_list(struct super_block *sb,
|
||||
.alloc = alloc,
|
||||
.wri = wri,
|
||||
.root = root,
|
||||
.type = SCOUTFS_FREE_EXTENT_BLKNO_TYPE,
|
||||
.zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE,
|
||||
};
|
||||
struct scoutfs_alloc_list_block *lblk = NULL;
|
||||
struct scoutfs_block *bl = NULL;
|
||||
@@ -1227,3 +1397,63 @@ out:
|
||||
kfree(sc);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
struct foreach_cb_args {
|
||||
scoutfs_alloc_extent_cb_t cb;
|
||||
void *cb_arg;
|
||||
};
|
||||
|
||||
static int alloc_btree_extent_item_cb(struct super_block *sb, struct scoutfs_key *key,
|
||||
void *val, int val_len, void *arg)
|
||||
{
|
||||
struct foreach_cb_args *cba = arg;
|
||||
struct scoutfs_extent ext;
|
||||
|
||||
if (key->sk_zone != SCOUTFS_FREE_EXTENT_BLKNO_ZONE)
|
||||
return -ENOENT;
|
||||
|
||||
ext_from_key(&ext, key);
|
||||
cba->cb(sb, cba->cb_arg, &ext);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Call the caller's callback on each extent stored in the allocator's
|
||||
* btree. The callback sees extents called in order by starting blkno.
|
||||
*/
|
||||
int scoutfs_alloc_extents_cb(struct super_block *sb, struct scoutfs_alloc_root *root,
|
||||
scoutfs_alloc_extent_cb_t cb, void *cb_arg)
|
||||
{
|
||||
struct foreach_cb_args cba = {
|
||||
.cb = cb,
|
||||
.cb_arg = cb_arg,
|
||||
};
|
||||
struct scoutfs_key start;
|
||||
struct scoutfs_key end;
|
||||
struct scoutfs_key key;
|
||||
int ret;
|
||||
|
||||
init_ext_key(&key, SCOUTFS_FREE_EXTENT_BLKNO_ZONE, 0, 1);
|
||||
|
||||
for (;;) {
|
||||
/* will stop at order items before getting stuck in final block */
|
||||
BUILD_BUG_ON(SCOUTFS_FREE_EXTENT_BLKNO_ZONE > SCOUTFS_FREE_EXTENT_ORDER_ZONE);
|
||||
init_ext_key(&start, SCOUTFS_FREE_EXTENT_BLKNO_ZONE, 0, 1);
|
||||
init_ext_key(&end, SCOUTFS_FREE_EXTENT_ORDER_ZONE, 0, 1);
|
||||
|
||||
ret = scoutfs_btree_read_items(sb, &root->root, &key, &start, &end,
|
||||
alloc_btree_extent_item_cb, &cba);
|
||||
if (ret < 0 || end.sk_zone != SCOUTFS_FREE_EXTENT_BLKNO_ZONE) {
|
||||
if (ret == -ENOENT)
|
||||
ret = 0;
|
||||
break;
|
||||
}
|
||||
|
||||
key = end;
|
||||
scoutfs_key_inc(&key);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -125,7 +125,8 @@ int scoutfs_free_data(struct super_block *sb, struct scoutfs_alloc *alloc,
|
||||
int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
|
||||
struct scoutfs_block_writer *wri,
|
||||
struct scoutfs_alloc_root *dst,
|
||||
struct scoutfs_alloc_root *src, u64 total);
|
||||
struct scoutfs_alloc_root *src, u64 total,
|
||||
__le64 *exclusive, __le64 *vacant, u64 zone_blocks);
|
||||
|
||||
int scoutfs_alloc_fill_list(struct super_block *sb,
|
||||
struct scoutfs_alloc *alloc,
|
||||
@@ -153,4 +154,9 @@ typedef int (*scoutfs_alloc_foreach_cb_t)(struct super_block *sb, void *arg,
|
||||
int scoutfs_alloc_foreach(struct super_block *sb,
|
||||
scoutfs_alloc_foreach_cb_t cb, void *arg);
|
||||
|
||||
typedef void (*scoutfs_alloc_extent_cb_t)(struct super_block *sb, void *cb_arg,
|
||||
struct scoutfs_extent *ext);
|
||||
int scoutfs_alloc_extents_cb(struct super_block *sb, struct scoutfs_alloc_root *root,
|
||||
scoutfs_alloc_extent_cb_t cb, void *cb_arg);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -249,6 +249,33 @@ int scoutfs_client_open_ino_map(struct super_block *sb, u64 group_nr,
|
||||
&args, sizeof(args), map, sizeof(*map));
|
||||
}
|
||||
|
||||
/* The client is asking the server for the current volume options */
|
||||
int scoutfs_client_get_volopt(struct super_block *sb, struct scoutfs_volume_options *volopt)
|
||||
{
|
||||
struct client_info *client = SCOUTFS_SB(sb)->client_info;
|
||||
|
||||
return scoutfs_net_sync_request(sb, client->conn, SCOUTFS_NET_CMD_GET_VOLOPT,
|
||||
NULL, 0, volopt, sizeof(*volopt));
|
||||
}
|
||||
|
||||
/* The client is asking the server to update volume options */
|
||||
int scoutfs_client_set_volopt(struct super_block *sb, struct scoutfs_volume_options *volopt)
|
||||
{
|
||||
struct client_info *client = SCOUTFS_SB(sb)->client_info;
|
||||
|
||||
return scoutfs_net_sync_request(sb, client->conn, SCOUTFS_NET_CMD_SET_VOLOPT,
|
||||
volopt, sizeof(*volopt), NULL, 0);
|
||||
}
|
||||
|
||||
/* The client is asking the server to clear volume options */
|
||||
int scoutfs_client_clear_volopt(struct super_block *sb, struct scoutfs_volume_options *volopt)
|
||||
{
|
||||
struct client_info *client = SCOUTFS_SB(sb)->client_info;
|
||||
|
||||
return scoutfs_net_sync_request(sb, client->conn, SCOUTFS_NET_CMD_CLEAR_VOLOPT,
|
||||
volopt, sizeof(*volopt), NULL, 0);
|
||||
}
|
||||
|
||||
/* The client is receiving an invalidation request from the server */
|
||||
static int client_lock(struct super_block *sb,
|
||||
struct scoutfs_net_connection *conn, u8 cmd, u64 id,
|
||||
|
||||
@@ -26,6 +26,9 @@ int scoutfs_client_send_omap_response(struct super_block *sb, u64 id,
|
||||
struct scoutfs_open_ino_map *map);
|
||||
int scoutfs_client_open_ino_map(struct super_block *sb, u64 group_nr,
|
||||
struct scoutfs_open_ino_map *map);
|
||||
int scoutfs_client_get_volopt(struct super_block *sb, struct scoutfs_volume_options *volopt);
|
||||
int scoutfs_client_set_volopt(struct super_block *sb, struct scoutfs_volume_options *volopt);
|
||||
int scoutfs_client_clear_volopt(struct super_block *sb, struct scoutfs_volume_options *volopt);
|
||||
|
||||
int scoutfs_client_setup(struct super_block *sb);
|
||||
void scoutfs_client_destroy(struct super_block *sb);
|
||||
|
||||
@@ -203,11 +203,12 @@ struct scoutfs_key {
|
||||
#define skmc_rid _sk_first
|
||||
|
||||
/* free extents by blkno */
|
||||
#define skfb_end _sk_second
|
||||
#define skfb_len _sk_third
|
||||
/* free extents by len */
|
||||
#define skfl_neglen _sk_second
|
||||
#define skfl_blkno _sk_third
|
||||
#define skfb_end _sk_first
|
||||
#define skfb_len _sk_second
|
||||
/* free extents by order */
|
||||
#define skfo_revord _sk_first
|
||||
#define skfo_end _sk_second
|
||||
#define skfo_len _sk_third
|
||||
|
||||
struct scoutfs_avl_root {
|
||||
__le16 node;
|
||||
@@ -427,6 +428,10 @@ struct scoutfs_srch_compact {
|
||||
/* client -> server: compaction failed */
|
||||
#define SCOUTFS_SRCH_COMPACT_FLAG_ERROR (1 << 5)
|
||||
|
||||
#define SCOUTFS_DATA_ALLOC_MAX_ZONES 1024
|
||||
#define SCOUTFS_DATA_ALLOC_ZONE_BYTES DIV_ROUND_UP(SCOUTFS_DATA_ALLOC_MAX_ZONES, 8)
|
||||
#define SCOUTFS_DATA_ALLOC_ZONE_LE64S DIV_ROUND_UP(SCOUTFS_DATA_ALLOC_MAX_ZONES, 64)
|
||||
|
||||
/*
|
||||
* XXX I imagine we should rename these now that they've evolved to track
|
||||
* all the btrees that clients use during a transaction. It's not just
|
||||
@@ -440,6 +445,8 @@ struct scoutfs_log_trees {
|
||||
struct scoutfs_alloc_root data_avail;
|
||||
struct scoutfs_alloc_root data_freed;
|
||||
struct scoutfs_srch_file srch_file;
|
||||
__le64 data_alloc_zone_blocks;
|
||||
__le64 data_alloc_zones[SCOUTFS_DATA_ALLOC_ZONE_LE64S];
|
||||
__le64 max_item_vers;
|
||||
__le64 rid;
|
||||
__le64 nr;
|
||||
@@ -493,7 +500,8 @@ struct scoutfs_bloom_block {
|
||||
#define SCOUTFS_TRANS_SEQ_ZONE 7
|
||||
#define SCOUTFS_MOUNTED_CLIENT_ZONE 8
|
||||
#define SCOUTFS_SRCH_ZONE 9
|
||||
#define SCOUTFS_FREE_EXTENT_ZONE 10
|
||||
#define SCOUTFS_FREE_EXTENT_BLKNO_ZONE 10
|
||||
#define SCOUTFS_FREE_EXTENT_ORDER_ZONE 11
|
||||
|
||||
/* inode index zone */
|
||||
#define SCOUTFS_INODE_INDEX_META_SEQ_TYPE 1
|
||||
@@ -521,10 +529,6 @@ struct scoutfs_bloom_block {
|
||||
#define SCOUTFS_SRCH_PENDING_TYPE 3
|
||||
#define SCOUTFS_SRCH_BUSY_TYPE 4
|
||||
|
||||
/* free extents in allocator btrees in client and server, by blkno or len */
|
||||
#define SCOUTFS_FREE_EXTENT_BLKNO_TYPE 1
|
||||
#define SCOUTFS_FREE_EXTENT_LEN_TYPE 2
|
||||
|
||||
/* file data extents have start and len in key */
|
||||
struct scoutfs_data_extent_val {
|
||||
__le64 blkno;
|
||||
@@ -626,6 +630,42 @@ struct scoutfs_quorum_block {
|
||||
|
||||
#define SCOUTFS_QUORUM_BLOCK_LEADER (1 << 0)
|
||||
|
||||
/*
|
||||
* Tunable options that apply to the entire system. They can be set in
|
||||
* mkfs or in sysfs files which send an rpc to the server to make the
|
||||
* change. The super version defines the options that exist.
|
||||
*
|
||||
* @set_bits: bits for each 64bit starting offset after set_bits
|
||||
* indicate which logical option is set.
|
||||
*
|
||||
* @data_alloc_zone_blocks: if set, the data device is logically divided
|
||||
* into contiguous zones of this many blocks. Data allocation will try
|
||||
* to isolate allocated extents for each mount to their own zone. The
|
||||
* zone size must be larger than the data alloc high water mark and
|
||||
* large enough such that the number of zones is kept within its static
|
||||
* limit.
|
||||
*/
|
||||
struct scoutfs_volume_options {
|
||||
__le64 set_bits;
|
||||
__le64 data_alloc_zone_blocks;
|
||||
__le64 __future_expansion[63];
|
||||
};
|
||||
|
||||
#define scoutfs_volopt_nr(field) \
|
||||
((offsetof(struct scoutfs_volume_options, field) - \
|
||||
(offsetof(struct scoutfs_volume_options, set_bits) + \
|
||||
member_sizeof(struct scoutfs_volume_options, set_bits))) / sizeof(__le64))
|
||||
#define scoutfs_volopt_bit(field) \
|
||||
(1ULL << scoutfs_volopt_nr(field))
|
||||
|
||||
#define SCOUTFS_VOLOPT_DATA_ALLOC_ZONE_BLOCKS_NR \
|
||||
scoutfs_volopt_nr(data_alloc_zone_blocks)
|
||||
#define SCOUTFS_VOLOPT_DATA_ALLOC_ZONE_BLOCKS_BIT \
|
||||
scoutfs_volopt_bit(data_alloc_zone_blocks)
|
||||
|
||||
#define SCOUTFS_VOLOPT_EXPANSION_BITS \
|
||||
(~(scoutfs_volopt_bit(__future_expansion) - 1))
|
||||
|
||||
#define SCOUTFS_FLAG_IS_META_BDEV 0x01
|
||||
|
||||
struct scoutfs_super_block {
|
||||
@@ -652,6 +692,7 @@ struct scoutfs_super_block {
|
||||
struct scoutfs_btree_root trans_seqs;
|
||||
struct scoutfs_btree_root mounted_clients;
|
||||
struct scoutfs_btree_root srch_root;
|
||||
struct scoutfs_volume_options volopt;
|
||||
};
|
||||
|
||||
#define SCOUTFS_ROOT_INO 1
|
||||
@@ -841,6 +882,9 @@ enum scoutfs_net_cmd {
|
||||
SCOUTFS_NET_CMD_SRCH_GET_COMPACT,
|
||||
SCOUTFS_NET_CMD_SRCH_COMMIT_COMPACT,
|
||||
SCOUTFS_NET_CMD_OPEN_INO_MAP,
|
||||
SCOUTFS_NET_CMD_GET_VOLOPT,
|
||||
SCOUTFS_NET_CMD_SET_VOLOPT,
|
||||
SCOUTFS_NET_CMD_CLEAR_VOLOPT,
|
||||
SCOUTFS_NET_CMD_FAREWELL,
|
||||
SCOUTFS_NET_CMD_UNKNOWN,
|
||||
};
|
||||
|
||||
@@ -99,6 +99,11 @@ struct server_info {
|
||||
seqcount_t roots_seqcount;
|
||||
struct scoutfs_net_roots roots;
|
||||
|
||||
/* serializing get and set of volume options */
|
||||
seqcount_t volopt_seqcount;
|
||||
struct mutex volopt_mutex;
|
||||
struct scoutfs_volume_options volopt;
|
||||
|
||||
/* recovery timeout fences from work */
|
||||
struct work_struct fence_pending_recov_work;
|
||||
};
|
||||
@@ -114,6 +119,38 @@ struct server_client_info {
|
||||
struct list_head head;
|
||||
};
|
||||
|
||||
static __le64 *first_valopt(struct scoutfs_volume_options *valopt)
|
||||
{
|
||||
return &valopt->set_bits + 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* A server caller wants to know if a volume option is set and wants to
|
||||
* know it's value. This is quite early in the file to make it
|
||||
* available to all of the server paths.
|
||||
*/
|
||||
static bool get_volopt_val(struct server_info *server, int nr, u64 *val)
|
||||
{
|
||||
u64 bit = 1ULL << nr;
|
||||
__le64 *opt = first_valopt(&server->volopt) + nr;
|
||||
bool is_set = false;
|
||||
unsigned seq;
|
||||
|
||||
do {
|
||||
seq = read_seqcount_begin(&server->volopt_seqcount);
|
||||
if ((le64_to_cpu(server->volopt.set_bits) & bit)) {
|
||||
is_set = true;
|
||||
*val = le64_to_cpup(opt);
|
||||
} else {
|
||||
is_set = false;
|
||||
*val = 0;
|
||||
};
|
||||
} while (read_seqcount_retry(&server->volopt_seqcount, seq));
|
||||
|
||||
return is_set;
|
||||
}
|
||||
|
||||
|
||||
struct commit_waiter {
|
||||
struct completion comp;
|
||||
struct llist_node node;
|
||||
@@ -361,9 +398,9 @@ out:
|
||||
* Refill the destination root if it's fallen below the lo threshold by
|
||||
* moving from the src root to bring it up to the target.
|
||||
*/
|
||||
static int alloc_move_refill(struct super_block *sb,
|
||||
struct scoutfs_alloc_root *dst,
|
||||
struct scoutfs_alloc_root *src, u64 lo, u64 target)
|
||||
static int alloc_move_refill_zoned(struct super_block *sb, struct scoutfs_alloc_root *dst,
|
||||
struct scoutfs_alloc_root *src, u64 lo, u64 target,
|
||||
__le64 *exclusive, __le64 *vacant, u64 zone_blocks)
|
||||
{
|
||||
DECLARE_SERVER_INFO(sb, server);
|
||||
|
||||
@@ -372,7 +409,14 @@ static int alloc_move_refill(struct super_block *sb,
|
||||
|
||||
return scoutfs_alloc_move(sb, &server->alloc, &server->wri, dst, src,
|
||||
min(target - le64_to_cpu(dst->total_len),
|
||||
le64_to_cpu(src->total_len)));
|
||||
le64_to_cpu(src->total_len)),
|
||||
exclusive, vacant, zone_blocks);
|
||||
}
|
||||
|
||||
/*
 * Refill dst from src with the same lo and target thresholds as
 * alloc_move_refill_zoned() but with no zone bitmaps and a zone size of
 * zero.
 */
static inline int alloc_move_refill(struct super_block *sb, struct scoutfs_alloc_root *dst,
				    struct scoutfs_alloc_root *src, u64 lo, u64 target)
{
	int ret;

	ret = alloc_move_refill_zoned(sb, dst, src, lo, target,
				      NULL, NULL, 0);
	return ret;
}
|
||||
|
||||
static int alloc_move_empty(struct super_block *sb,
|
||||
@@ -382,7 +426,134 @@ static int alloc_move_empty(struct super_block *sb,
|
||||
DECLARE_SERVER_INFO(sb, server);
|
||||
|
||||
return scoutfs_alloc_move(sb, &server->alloc, &server->wri,
|
||||
dst, src, le64_to_cpu(src->total_len));
|
||||
dst, src, le64_to_cpu(src->total_len), NULL, NULL, 0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Set all the bits in the destination which overlap with the extent.
|
||||
*/
|
||||
static void mod_extent_bits(__le64 *bits, u64 zone_blocks, u64 blkno, u64 len, bool set)
|
||||
{
|
||||
u64 nr = div64_u64(blkno, zone_blocks);
|
||||
u64 last_nr = div64_u64(blkno + len - 1, zone_blocks);
|
||||
|
||||
if (WARN_ON_ONCE(len == 0))
|
||||
return;
|
||||
|
||||
while (nr <= last_nr) {
|
||||
if (set)
|
||||
set_bit_le(nr, bits);
|
||||
else
|
||||
clear_bit_le(nr, bits);
|
||||
|
||||
nr++;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Translate the bits in the source bitmap into extents and modify bits
|
||||
* in the destination that map those extents.
|
||||
*/
|
||||
static void mod_bitmap_bits(__le64 *dst, u64 dst_zone_blocks,
|
||||
__le64 *src, u64 src_zone_blocks, bool set)
|
||||
{
|
||||
int nr = 0;
|
||||
|
||||
for (;;) {
|
||||
nr = find_next_bit_le(src, SCOUTFS_DATA_ALLOC_MAX_ZONES, nr);
|
||||
if (nr >= SCOUTFS_DATA_ALLOC_MAX_ZONES)
|
||||
break;
|
||||
|
||||
mod_extent_bits(dst, dst_zone_blocks,
|
||||
(u64)nr * src_zone_blocks, src_zone_blocks, set);
|
||||
nr++;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Iterate over all the log_tree items and initialize the caller's zone
|
||||
* bitmaps. Exclusive bits are only found in the caller's items.
|
||||
* Vacant bits are not found in any items.
|
||||
*
|
||||
* The log_tree item zone bitmaps could have been stored with different
|
||||
* zone_blocks sizes. We translate the bits into block extents and
|
||||
* record overlaps with the current zone size.
|
||||
*
|
||||
* The caller has the log items locked.
|
||||
*/
|
||||
static int get_data_alloc_zone_bits(struct super_block *sb, u64 rid, __le64 *exclusive,
|
||||
__le64 *vacant, u64 zone_blocks)
|
||||
{
|
||||
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
|
||||
SCOUTFS_BTREE_ITEM_REF(iref);
|
||||
struct scoutfs_log_trees *lt;
|
||||
struct scoutfs_key key;
|
||||
int ret;
|
||||
|
||||
memset(exclusive, 0, SCOUTFS_DATA_ALLOC_ZONE_BYTES);
|
||||
memset(vacant, 0, SCOUTFS_DATA_ALLOC_ZONE_BYTES);
|
||||
|
||||
mod_extent_bits(vacant, zone_blocks, 0, le64_to_cpu(super->total_data_blocks), true);
|
||||
|
||||
scoutfs_key_init_log_trees(&key, 0, 0);
|
||||
for (;;) {
|
||||
ret = scoutfs_btree_next(sb, &super->logs_root, &key, &iref);
|
||||
if (ret == 0) {
|
||||
if (iref.val_len == sizeof(struct scoutfs_log_trees)) {
|
||||
lt = iref.val;
|
||||
|
||||
/* vacant bits have no bits found in items */
|
||||
mod_bitmap_bits(vacant, zone_blocks,
|
||||
lt->data_alloc_zones,
|
||||
le64_to_cpu(lt->data_alloc_zone_blocks),
|
||||
false);
|
||||
|
||||
/* exclusive bits are only found in caller's items */
|
||||
if (le64_to_cpu(iref.key->sklt_rid) == rid) {
|
||||
mod_bitmap_bits(exclusive, zone_blocks,
|
||||
lt->data_alloc_zones,
|
||||
le64_to_cpu(lt->data_alloc_zone_blocks),
|
||||
true);
|
||||
} else {
|
||||
mod_bitmap_bits(exclusive, zone_blocks,
|
||||
lt->data_alloc_zones,
|
||||
le64_to_cpu(lt->data_alloc_zone_blocks),
|
||||
false);
|
||||
}
|
||||
|
||||
key = *iref.key;
|
||||
scoutfs_key_inc(&key);
|
||||
} else {
|
||||
ret = -EIO;
|
||||
}
|
||||
scoutfs_btree_put_iref(&iref);
|
||||
}
|
||||
if (ret < 0) {
|
||||
if (ret == -ENOENT)
|
||||
ret = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void zero_data_alloc_zone_bits(struct scoutfs_log_trees *lt)
|
||||
{
|
||||
lt->data_alloc_zone_blocks = 0;
|
||||
memset(lt->data_alloc_zones, 0, sizeof(lt->data_alloc_zones));
|
||||
}
|
||||
|
||||
struct alloc_extent_cb_args {
|
||||
__le64 *zones;
|
||||
u64 zone_blocks;
|
||||
};
|
||||
|
||||
static void set_extent_zone_bits(struct super_block *sb, void *cb_arg, struct scoutfs_extent *ext)
|
||||
{
|
||||
struct alloc_extent_cb_args *cba = cb_arg;
|
||||
|
||||
mod_extent_bits(cba->zones, cba->zone_blocks, ext->start, ext->len, true);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -402,9 +573,13 @@ static int server_get_log_trees(struct super_block *sb,
|
||||
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
|
||||
u64 rid = scoutfs_net_client_rid(conn);
|
||||
DECLARE_SERVER_INFO(sb, server);
|
||||
__le64 exclusive[SCOUTFS_DATA_ALLOC_ZONE_LE64S];
|
||||
__le64 vacant[SCOUTFS_DATA_ALLOC_ZONE_LE64S];
|
||||
struct alloc_extent_cb_args cba;
|
||||
SCOUTFS_BTREE_ITEM_REF(iref);
|
||||
struct scoutfs_log_trees lt;
|
||||
struct scoutfs_key key;
|
||||
u64 data_zone_blocks;
|
||||
int ret;
|
||||
|
||||
if (arg_len != 0) {
|
||||
@@ -446,6 +621,14 @@ static int server_get_log_trees(struct super_block *sb,
|
||||
lt.nr = key.sklt_nr;
|
||||
}
|
||||
|
||||
if (get_volopt_val(server, SCOUTFS_VOLOPT_DATA_ALLOC_ZONE_BLOCKS_NR, &data_zone_blocks)) {
|
||||
ret = get_data_alloc_zone_bits(sb, rid, exclusive, vacant, data_zone_blocks);
|
||||
if (ret < 0)
|
||||
goto unlock;
|
||||
} else {
|
||||
data_zone_blocks = 0;
|
||||
}
|
||||
|
||||
/* return freed to server for emptying, refill avail */
|
||||
mutex_lock(&server->alloc_mutex);
|
||||
ret = scoutfs_alloc_splice_list(sb, &server->alloc, &server->wri,
|
||||
@@ -456,13 +639,28 @@ static int server_get_log_trees(struct super_block *sb,
|
||||
&lt.meta_avail, server->meta_avail,
|
||||
SCOUTFS_SERVER_META_FILL_LO,
|
||||
SCOUTFS_SERVER_META_FILL_TARGET) ?:
|
||||
alloc_move_refill(sb, &lt.data_avail, &super->data_alloc,
|
||||
SCOUTFS_SERVER_DATA_FILL_LO,
|
||||
SCOUTFS_SERVER_DATA_FILL_TARGET);
|
||||
alloc_move_refill_zoned(sb, &lt.data_avail, &super->data_alloc,
|
||||
SCOUTFS_SERVER_DATA_FILL_LO,
|
||||
SCOUTFS_SERVER_DATA_FILL_TARGET,
|
||||
exclusive, vacant, data_zone_blocks);
|
||||
mutex_unlock(&server->alloc_mutex);
|
||||
if (ret < 0)
|
||||
goto unlock;
|
||||
|
||||
/* record data alloc zone bits */
|
||||
zero_data_alloc_zone_bits(&lt);
|
||||
if (data_zone_blocks != 0) {
|
||||
cba.zones = lt.data_alloc_zones;
|
||||
cba.zone_blocks = data_zone_blocks;
|
||||
ret = scoutfs_alloc_extents_cb(sb, &lt.data_avail, set_extent_zone_bits, &cba);
|
||||
if (ret < 0) {
|
||||
zero_data_alloc_zone_bits(&lt);
|
||||
goto unlock;
|
||||
}
|
||||
|
||||
lt.data_alloc_zone_blocks = cpu_to_le64(data_zone_blocks);
|
||||
}
|
||||
|
||||
/* update client's log tree's item */
|
||||
ret = scoutfs_btree_force(sb, &server->alloc, &server->wri,
|
||||
&super->logs_root, &key, &lt, sizeof(lt));
|
||||
@@ -634,6 +832,9 @@ static int reclaim_log_trees(struct super_block *sb, u64 rid)
|
||||
alloc_move_empty(sb, &super->data_alloc, &lt.data_freed);
|
||||
mutex_unlock(&server->alloc_mutex);
|
||||
|
||||
/* the mount is no longer writing to the zones */
|
||||
zero_data_alloc_zone_bits(&lt);
|
||||
|
||||
err = scoutfs_btree_update(sb, &server->alloc, &server->wri,
|
||||
&super->logs_root, &key, &lt, sizeof(lt));
|
||||
BUG_ON(err != 0); /* alloc and log item roots out of sync */
|
||||
@@ -1075,6 +1276,156 @@ out:
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* The server is receiving a request for the current volume options */
|
||||
static int server_get_volopt(struct super_block *sb, struct scoutfs_net_connection *conn,
|
||||
u8 cmd, u64 id, void *arg, u16 arg_len)
|
||||
{
|
||||
DECLARE_SERVER_INFO(sb, server);
|
||||
struct scoutfs_volume_options volopt;
|
||||
unsigned seq;
|
||||
int ret = 0;
|
||||
|
||||
if (arg_len != 0) {
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
do {
|
||||
seq = read_seqcount_begin(&server->volopt_seqcount);
|
||||
volopt = server->volopt;
|
||||
} while (read_seqcount_retry(&server->volopt_seqcount, seq));
|
||||
|
||||
out:
|
||||
return scoutfs_net_response(sb, conn, cmd, id, ret, &volopt, sizeof(volopt));
|
||||
}
|
||||
|
||||
/*
|
||||
* The server is receiving a request to update volume options.
|
||||
*
|
||||
* The in-memory options that readers use is updated only once the
|
||||
* updated options are written in the super block.
|
||||
*/
|
||||
static int server_set_volopt(struct super_block *sb, struct scoutfs_net_connection *conn,
|
||||
u8 cmd, u64 id, void *arg, u16 arg_len)
|
||||
{
|
||||
DECLARE_SERVER_INFO(sb, server);
|
||||
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
|
||||
struct scoutfs_volume_options *volopt;
|
||||
u64 opt;
|
||||
u64 nr;
|
||||
int ret = 0;
|
||||
|
||||
if (arg_len != sizeof(struct scoutfs_volume_options)) {
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
volopt = arg;
|
||||
|
||||
if (le64_to_cpu(volopt->set_bits) & SCOUTFS_VOLOPT_EXPANSION_BITS) {
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
mutex_lock(&server->volopt_mutex);
|
||||
|
||||
ret = scoutfs_server_hold_commit(sb);
|
||||
if (ret)
|
||||
goto unlock;
|
||||
|
||||
if (le64_to_cpu(volopt->set_bits) & SCOUTFS_VOLOPT_DATA_ALLOC_ZONE_BLOCKS_BIT) {
|
||||
opt = le64_to_cpu(volopt->data_alloc_zone_blocks);
|
||||
if (opt < SCOUTFS_SERVER_DATA_FILL_TARGET) {
|
||||
scoutfs_err(sb, "setting data_alloc_zone_blocks to '%llu' failed, must be at least %llu mount data allocation target blocks",
|
||||
opt, SCOUTFS_SERVER_DATA_FILL_TARGET);
|
||||
ret = -EINVAL;
|
||||
goto apply;
|
||||
}
|
||||
|
||||
nr = div_u64(le64_to_cpu(super->total_data_blocks), SCOUTFS_DATA_ALLOC_MAX_ZONES);
|
||||
if (opt < nr) {
|
||||
scoutfs_err(sb, "setting data_alloc_zone_blocks to '%llu' failed, must be greater than %llu blocks which results in max %u zones",
|
||||
opt, nr, SCOUTFS_DATA_ALLOC_MAX_ZONES);
|
||||
ret = -EINVAL;
|
||||
goto apply;
|
||||
}
|
||||
|
||||
if (opt > le64_to_cpu(super->total_data_blocks)) {
|
||||
scoutfs_err(sb, "setting data_alloc_zone_blocks to '%llu' failed, must be at most %llu total data device blocks",
|
||||
opt, le64_to_cpu(super->total_data_blocks));
|
||||
ret = -EINVAL;
|
||||
goto apply;
|
||||
}
|
||||
|
||||
super->volopt.data_alloc_zone_blocks = volopt->data_alloc_zone_blocks;
|
||||
super->volopt.set_bits |= cpu_to_le64(SCOUTFS_VOLOPT_DATA_ALLOC_ZONE_BLOCKS_BIT);
|
||||
}
|
||||
|
||||
apply:
|
||||
ret = scoutfs_server_apply_commit(sb, ret);
|
||||
|
||||
write_seqcount_begin(&server->volopt_seqcount);
|
||||
if (ret == 0)
|
||||
server->volopt = super->volopt;
|
||||
else
|
||||
super->volopt = server->volopt;
|
||||
write_seqcount_end(&server->volopt_seqcount);
|
||||
|
||||
unlock:
|
||||
mutex_unlock(&server->volopt_mutex);
|
||||
out:
|
||||
return scoutfs_net_response(sb, conn, cmd, id, ret, NULL, 0);
|
||||
}
|
||||
|
||||
static int server_clear_volopt(struct super_block *sb, struct scoutfs_net_connection *conn,
|
||||
u8 cmd, u64 id, void *arg, u16 arg_len)
|
||||
{
|
||||
DECLARE_SERVER_INFO(sb, server);
|
||||
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
|
||||
struct scoutfs_volume_options *volopt;
|
||||
__le64 *opt;
|
||||
u64 bit;
|
||||
int ret = 0;
|
||||
int i;
|
||||
|
||||
if (arg_len != sizeof(struct scoutfs_volume_options)) {
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
volopt = arg;
|
||||
|
||||
if (le64_to_cpu(volopt->set_bits) & SCOUTFS_VOLOPT_EXPANSION_BITS) {
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
mutex_lock(&server->volopt_mutex);
|
||||
|
||||
ret = scoutfs_server_hold_commit(sb);
|
||||
if (ret)
|
||||
goto unlock;
|
||||
|
||||
for (i = 0, bit = 1, opt = first_valopt(&super->volopt); i < 64; i++, bit <<= 1, opt++) {
|
||||
if (le64_to_cpu(volopt->set_bits) & bit) {
|
||||
super->volopt.set_bits &= ~cpu_to_le64(bit);
|
||||
*opt = 0;
|
||||
}
|
||||
}
|
||||
|
||||
ret = scoutfs_server_apply_commit(sb, ret);
|
||||
|
||||
write_seqcount_begin(&server->volopt_seqcount);
|
||||
if (ret == 0)
|
||||
server->volopt = super->volopt;
|
||||
else
|
||||
super->volopt = server->volopt;
|
||||
write_seqcount_end(&server->volopt_seqcount);
|
||||
|
||||
unlock:
|
||||
mutex_unlock(&server->volopt_mutex);
|
||||
out:
|
||||
return scoutfs_net_response(sb, conn, cmd, id, ret, NULL, 0);
|
||||
}
|
||||
|
||||
static void init_mounted_client_key(struct scoutfs_key *key, u64 rid)
|
||||
{
|
||||
*key = (struct scoutfs_key) {
|
||||
@@ -1565,6 +1916,9 @@ static scoutfs_net_request_t server_req_funcs[] = {
|
||||
[SCOUTFS_NET_CMD_SRCH_GET_COMPACT] = server_srch_get_compact,
|
||||
[SCOUTFS_NET_CMD_SRCH_COMMIT_COMPACT] = server_srch_commit_compact,
|
||||
[SCOUTFS_NET_CMD_OPEN_INO_MAP] = server_open_ino_map,
|
||||
[SCOUTFS_NET_CMD_GET_VOLOPT] = server_get_volopt,
|
||||
[SCOUTFS_NET_CMD_SET_VOLOPT] = server_set_volopt,
|
||||
[SCOUTFS_NET_CMD_CLEAR_VOLOPT] = server_clear_volopt,
|
||||
[SCOUTFS_NET_CMD_FAREWELL] = server_farewell,
|
||||
};
|
||||
|
||||
@@ -1784,6 +2138,11 @@ static void scoutfs_server_worker(struct work_struct *work)
|
||||
if (ret < 0)
|
||||
goto shutdown;
|
||||
|
||||
/* update volume options early, possibly for use during startup */
|
||||
write_seqcount_begin(&server->volopt_seqcount);
|
||||
server->volopt = super->volopt;
|
||||
write_seqcount_end(&server->volopt_seqcount);
|
||||
|
||||
set_roots(server, &super->fs_root, &super->logs_root,
|
||||
&super->srch_root);
|
||||
scoutfs_block_writer_init(sb, &server->wri);
|
||||
@@ -1932,6 +2291,8 @@ int scoutfs_server_setup(struct super_block *sb)
|
||||
mutex_init(&server->srch_mutex);
|
||||
mutex_init(&server->mounted_clients_mutex);
|
||||
seqcount_init(&server->roots_seqcount);
|
||||
seqcount_init(&server->volopt_seqcount);
|
||||
mutex_init(&server->volopt_mutex);
|
||||
INIT_WORK(&server->fence_pending_recov_work, fence_pending_recov_worker);
|
||||
|
||||
server->wq = alloc_workqueue("scoutfs_server",
|
||||
|
||||
@@ -46,6 +46,7 @@
|
||||
#include "alloc.h"
|
||||
#include "recov.h"
|
||||
#include "omap.h"
|
||||
#include "volopt.h"
|
||||
#include "scoutfs_trace.h"
|
||||
|
||||
static struct dentry *scoutfs_debugfs_root;
|
||||
@@ -253,6 +254,7 @@ static void scoutfs_put_super(struct super_block *sb)
|
||||
scoutfs_lock_shutdown(sb);
|
||||
|
||||
scoutfs_shutdown_trans(sb);
|
||||
scoutfs_volopt_destroy(sb);
|
||||
scoutfs_client_destroy(sb);
|
||||
scoutfs_inode_destroy(sb);
|
||||
scoutfs_item_destroy(sb);
|
||||
@@ -601,6 +603,7 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
|
||||
scoutfs_server_setup(sb) ?:
|
||||
scoutfs_quorum_setup(sb) ?:
|
||||
scoutfs_client_setup(sb) ?:
|
||||
scoutfs_volopt_setup(sb) ?:
|
||||
scoutfs_lock_rid(sb, SCOUTFS_LOCK_WRITE, 0, sbi->rid,
|
||||
&sbi->rid_lock) ?:
|
||||
scoutfs_trans_get_log_trees(sb) ?:
|
||||
|
||||
@@ -28,6 +28,7 @@ struct forest_info;
|
||||
struct srch_info;
|
||||
struct recov_info;
|
||||
struct omap_info;
|
||||
struct volopt_info;
|
||||
|
||||
struct scoutfs_sb_info {
|
||||
struct super_block *sb;
|
||||
@@ -51,6 +52,7 @@ struct scoutfs_sb_info {
|
||||
struct forest_info *forest_info;
|
||||
struct srch_info *srch_info;
|
||||
struct omap_info *omap_info;
|
||||
struct volopt_info *volopt_info;
|
||||
struct item_cache_info *item_cache_info;
|
||||
|
||||
wait_queue_head_t trans_hold_wq;
|
||||
|
||||
188
kmod/src/volopt.c
Normal file
188
kmod/src/volopt.c
Normal file
@@ -0,0 +1,188 @@
|
||||
/*
|
||||
* Copyright (C) 2021 Versity Software, Inc. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License v2 as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*/
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/kobject.h>
|
||||
#include <linux/sysfs.h>
|
||||
|
||||
#include "super.h"
|
||||
#include "client.h"
|
||||
#include "volopt.h"
|
||||
|
||||
/*
|
||||
* Volume options are exposed through a sysfs directory. Getting and
|
||||
* setting the values sends rpcs to the server who owns the options in
|
||||
* the super block.
|
||||
*/
|
||||
|
||||
struct volopt_info {
|
||||
struct super_block *sb;
|
||||
struct scoutfs_sysfs_attrs ssa;
|
||||
};
|
||||
|
||||
#define DECLARE_VOLOPT_INFO(sb, name) \
|
||||
struct volopt_info *name = SCOUTFS_SB(sb)->volopt_info
|
||||
#define DECLARE_VOLOPT_INFO_KOBJ(kobj, name) \
|
||||
DECLARE_VOLOPT_INFO(SCOUTFS_SYSFS_ATTRS_SB(kobj), name)
|
||||
|
||||
/*
|
||||
* attribute arrays need to be dense but the options we export could
|
||||
* well become sparse over time. .store and .load are generic and we
|
||||
* have a lookup table to map the attributes array indexes to the number
|
||||
* and name of the option.
|
||||
*/
|
||||
static struct volopt_nr_name {
|
||||
int nr;
|
||||
char *name;
|
||||
} volopt_table[] = {
|
||||
{ SCOUTFS_VOLOPT_DATA_ALLOC_ZONE_BLOCKS_NR, "data_alloc_zone_blocks" },
|
||||
};
|
||||
|
||||
/* initialized by setup, pointer array is null terminated */
|
||||
static struct kobj_attribute volopt_attrs[ARRAY_SIZE(volopt_table)];
|
||||
static struct attribute *volopt_attr_ptrs[ARRAY_SIZE(volopt_table) + 1];
|
||||
|
||||
static void get_opt_data(struct kobj_attribute *attr, struct scoutfs_volume_options *volopt,
|
||||
u64 *bit, __le64 **opt)
|
||||
{
|
||||
size_t index = attr - &volopt_attrs[0];
|
||||
int nr = volopt_table[index].nr;
|
||||
|
||||
*bit = 1ULL << nr;
|
||||
*opt = &volopt->set_bits + 1 + nr;
|
||||
}
|
||||
|
||||
static ssize_t volopt_attr_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
|
||||
{
|
||||
DECLARE_VOLOPT_INFO_KOBJ(kobj, vinf);
|
||||
struct super_block *sb = vinf->sb;
|
||||
struct scoutfs_volume_options volopt;
|
||||
__le64 *opt;
|
||||
u64 bit;
|
||||
int ret;
|
||||
|
||||
ret = scoutfs_client_get_volopt(sb, &volopt);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
get_opt_data(attr, &volopt, &bit, &opt);
|
||||
|
||||
if (le64_to_cpu(volopt.set_bits) & bit) {
|
||||
return snprintf(buf, PAGE_SIZE, "%llu", le64_to_cpup(opt));
|
||||
} else {
|
||||
buf[0] = '\0';
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Writing to a volume option's sysfs file sets or clears the option by
 * sending a request to the server.  Input that starts with a newline
 * or carriage return clears the option, anything else is parsed by
 * kstrtoull() with base 0 as the option's new value.
 *
 * Returns the number of bytes written on success, 0 for an empty
 * write, -ERANGE if the input doesn't fit in the local buffer, or the
 * error from parsing the value or from the request.
 */
static ssize_t volopt_attr_store(struct kobject *kobj, struct kobj_attribute *attr,
				 const char *buf, size_t count)
{
	DECLARE_VOLOPT_INFO_KOBJ(kobj, vinf);
	struct super_block *sb = vinf->sb;
	struct scoutfs_volume_options volopt = {0,};
	char chars[32];
	__le64 *opt;
	u64 bit;
	u64 val;
	int ret;

	if (count == 0)
		return 0;
	/* leave room for the null terminator that kstrtoull() needs */
	if (count > sizeof(chars) - 1)
		return -ERANGE;

	/* both requests name the option with its bit in set_bits */
	get_opt_data(attr, &volopt, &bit, &opt);
	volopt.set_bits = cpu_to_le64(bit);

	if (buf[0] == '\n' || buf[0] == '\r') {
		ret = scoutfs_client_clear_volopt(sb, &volopt);
	} else {
		/* buf isn't copied with a terminator, build a terminated string */
		memcpy(chars, buf, count);
		chars[count] = '\0';
		ret = kstrtoull(chars, 0, &val);
		if (ret < 0)
			return ret;

		*opt = cpu_to_le64(val);

		ret = scoutfs_client_set_volopt(sb, &volopt);
	}

	if (ret == 0)
		ret = count;
	return ret;
}
|
||||
|
||||
/*
|
||||
* The volume option sysfs files are slim shims around RPCs so this
|
||||
* should be called after the client is setup and before it is torn
|
||||
* down.
|
||||
*/
|
||||
int scoutfs_volopt_setup(struct super_block *sb)
|
||||
{
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct volopt_info *vinf;
|
||||
int ret;
|
||||
int i;
|
||||
|
||||
/* persistent volume options are always a bitmap u64 then the 64 options */
|
||||
BUILD_BUG_ON(sizeof(struct scoutfs_volume_options) != (1 + 64) * 8);
|
||||
|
||||
vinf = kzalloc(sizeof(struct volopt_info), GFP_KERNEL);
|
||||
if (!vinf) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
scoutfs_sysfs_init_attrs(sb, &vinf->ssa);
|
||||
vinf->sb = sb;
|
||||
sbi->volopt_info = vinf;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(volopt_table); i++) {
|
||||
volopt_attrs[i] = (struct kobj_attribute) {
|
||||
.attr = { .name = volopt_table[i].name, .mode = S_IWUSR | S_IRUGO },
|
||||
.show = volopt_attr_show,
|
||||
.store = volopt_attr_store,
|
||||
};
|
||||
volopt_attr_ptrs[i] = &volopt_attrs[i].attr;
|
||||
}
|
||||
|
||||
BUILD_BUG_ON(ARRAY_SIZE(volopt_table) != ARRAY_SIZE(volopt_attr_ptrs) - 1);
|
||||
volopt_attr_ptrs[i] = NULL;
|
||||
|
||||
ret = scoutfs_sysfs_create_attrs(sb, &vinf->ssa, volopt_attr_ptrs, "volume_options");
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
out:
|
||||
if (ret)
|
||||
scoutfs_volopt_destroy(sb);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
void scoutfs_volopt_destroy(struct super_block *sb)
|
||||
{
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct volopt_info *vinf = SCOUTFS_SB(sb)->volopt_info;
|
||||
|
||||
if (vinf) {
|
||||
scoutfs_sysfs_destroy_attrs(sb, &vinf->ssa);
|
||||
kfree(vinf);
|
||||
sbi->volopt_info = NULL;
|
||||
}
|
||||
}
|
||||
7
kmod/src/volopt.h
Normal file
7
kmod/src/volopt.h
Normal file
@@ -0,0 +1,7 @@
|
||||
#ifndef _SCOUTFS_VOLOPT_H_
|
||||
#define _SCOUTFS_VOLOPT_H_
|
||||
|
||||
int scoutfs_volopt_setup(struct super_block *sb);
|
||||
void scoutfs_volopt_destroy(struct super_block *sb);
|
||||
|
||||
#endif
|
||||
@@ -66,6 +66,7 @@ $(basename $0) options:
|
||||
-X | xfstests git repo. Used by tests/xfstests.sh.
|
||||
-x | xfstests git branch to checkout and track.
|
||||
-y | xfstests ./check additional args
|
||||
-z <nr> | set data-alloc-zone-blocks in mkfs
|
||||
EOF
|
||||
}
|
||||
|
||||
@@ -169,6 +170,11 @@ while true; do
|
||||
T_XFSTESTS_ARGS="$2"
|
||||
shift
|
||||
;;
|
||||
-z)
	# the zone size is passed through to mkfs as its -z option
	test -n "$2" || die "-z must have data alloc zone blocks argument"
	T_DATA_ALLOC_ZONE_BLOCKS="-z $2"
	shift
	;;
|
||||
-h|-\?|--help)
|
||||
show_help
|
||||
exit 1
|
||||
@@ -319,7 +325,8 @@ if [ -n "$T_MKFS" ]; then
|
||||
done
|
||||
|
||||
msg "making new filesystem with $T_QUORUM quorum members"
|
||||
cmd scoutfs mkfs -f $quo "$T_META_DEVICE" "$T_DATA_DEVICE"
|
||||
cmd scoutfs mkfs -f $quo $T_DATA_ALLOC_ZONE_BLOCKS \
|
||||
"$T_META_DEVICE" "$T_DATA_DEVICE"
|
||||
fi
|
||||
|
||||
if [ -n "$T_INSMOD" ]; then
|
||||
|
||||
@@ -34,6 +34,40 @@ the server for the filesystem if it is elected leader.
|
||||
The assigned number must match one of the slots defined with \-Q options
|
||||
when the filesystem was created with mkfs. If the number assigned
|
||||
doesn't match a number created during mkfs then the mount will fail.
|
||||
.SH VOLUME OPTIONS
|
||||
Volume options are persistent options which are stored in the super
|
||||
block in the metadata device and which apply to all mounts of the volume.
|
||||
.sp
|
||||
Volume options may be initially specified as the volume is created
|
||||
as described in the mkfs command in
|
||||
.BR scoutfs (8).
|
||||
.sp
|
||||
Volume options may be changed at runtime by writing to files in sysfs
|
||||
while the volume is mounted. Volume options are found in the
|
||||
volume_options/ directory with a file for each option. Reading the
|
||||
file provides the current setting of the option and an empty string
|
||||
is returned if the option is not set. To set the option, write
|
||||
the new value of the option to the file. To clear the option, write
|
||||
a blank line with a newline to the file. The write syscall will
|
||||
return an error if the set operation fails and a message will be written
|
||||
to the console.
|
||||
.sp
|
||||
The following volume options are supported:
|
||||
.TP
|
||||
.B data_alloc_zone_blocks=<zone size in 4KiB blocks>
|
||||
When the data_alloc_zone_blocks option is set the data device is
|
||||
logically divided into zones of equal length as specified by the value
|
||||
of the option. The size of the zones must be greater than a minimum
|
||||
allocation pool size, large enough to result in no more than 1024 zones,
|
||||
and not more than the total number of blocks in the data device.
|
||||
.sp
|
||||
When set, the server will try to provide each mount with free data
|
||||
extents that don't share a zone with other mounts. When a mount has free
|
||||
extents in a given zone the server will try and find more free extents
|
||||
in that zone. When the mount is not in a zone, or its zone has no more
|
||||
free extents, the server will try and find free extents in a zone that
|
||||
no other mount currently occupies. The result is to try and produce
|
||||
write streams where only one mount is writing into each zone.
|
||||
.SH FURTHER READING
|
||||
A
|
||||
.B scoutfs
|
||||
|
||||
@@ -32,7 +32,7 @@ A path within a ScoutFS filesystem.
|
||||
.PD
|
||||
|
||||
.TP
|
||||
.BI "mkfs META-DEVICE DATA-DEVICE {-Q|--quorum-slot} NR,ADDR,PORT [-m|--max-meta-size SIZE] [-d|--max-data-size SIZE] [-f|--force]"
|
||||
.BI "mkfs META-DEVICE DATA-DEVICE {-Q|--quorum-slot} NR,ADDR,PORT [-m|--max-meta-size SIZE] [-d|--max-data-size SIZE] [-z|--data-alloc-zone-blocks BLOCKS] [-f|--force]"
|
||||
.sp
|
||||
Initialize a new ScoutFS filesystem on the target devices. Since ScoutFS uses
|
||||
separate block devices for its metadata and data storage, two are required.
|
||||
@@ -81,6 +81,10 @@ kibibytes, mebibytes, etc.
|
||||
.B "-d, --max-data-size SIZE"
|
||||
Same as previous, but for limiting the size of the data device.
|
||||
.TP
|
||||
.B "-z, --data-alloc-zone-blocks BLOCKS"
|
||||
Set the data_alloc_zone_blocks volume option, as described in
|
||||
.BR scoutfs (5).
|
||||
.TP
|
||||
.B "-f, --force"
|
||||
Ignore presence of existing data on the data and metadata devices.
|
||||
.RE
|
||||
|
||||
@@ -57,6 +57,15 @@ static int write_block(int fd, u32 magic, __le64 fsid, u64 seq, u64 blkno,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return the order of the length of a free extent, which we define as
|
||||
* floor(log_8_(len)): 0..7 = 0, 8..63 = 1, etc.
|
||||
*/
|
||||
static u64 free_extent_order(u64 len)
|
||||
{
|
||||
return (flsll(len | 1) - 1) / 3;
|
||||
}
|
||||
|
||||
/*
|
||||
* Write the single btree block that contains the blkno and len indexed
|
||||
* items to store the given extent, and update the root to point to it.
|
||||
@@ -72,30 +81,59 @@ static int write_alloc_root(int fd, __le64 fsid,
|
||||
root->total_len = cpu_to_le64(len);
|
||||
|
||||
memset(&key, 0, sizeof(key));
|
||||
key.sk_zone = SCOUTFS_FREE_EXTENT_ZONE;
|
||||
key.sk_type = SCOUTFS_FREE_EXTENT_BLKNO_TYPE;
|
||||
key.skii_ino = cpu_to_le64(SCOUTFS_ROOT_INO);
|
||||
key.sk_zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE;
|
||||
key.skfb_end = cpu_to_le64(start + len - 1);
|
||||
key.skfb_len = cpu_to_le64(len);
|
||||
btree_append_item(bt, &key, NULL, 0);
|
||||
|
||||
memset(&key, 0, sizeof(key));
|
||||
key.sk_zone = SCOUTFS_FREE_EXTENT_ZONE;
|
||||
key.sk_type = SCOUTFS_FREE_EXTENT_LEN_TYPE;
|
||||
key.skii_ino = cpu_to_le64(SCOUTFS_ROOT_INO);
|
||||
key.skfl_neglen = cpu_to_le64(-len);
|
||||
key.skfl_blkno = cpu_to_le64(start);
|
||||
key.sk_zone = SCOUTFS_FREE_EXTENT_ORDER_ZONE;
|
||||
key.skfo_revord = cpu_to_le64(U64_MAX - free_extent_order(len));
|
||||
key.skfo_end = cpu_to_le64(start + len - 1);
|
||||
key.skfo_len = cpu_to_le64(len);
|
||||
btree_append_item(bt, &key, NULL, 0);
|
||||
|
||||
return write_block(fd, SCOUTFS_BLOCK_MAGIC_BTREE, fsid, seq, blkno,
|
||||
SCOUTFS_BLOCK_LG_SHIFT, &bt->hdr);
|
||||
}
|
||||
|
||||
#define SCOUTFS_SERVER_DATA_FILL_TARGET \
|
||||
((4ULL * 1024 * 1024 * 1024) >> SCOUTFS_BLOCK_SM_SHIFT)
|
||||
/*
 * Check a requested data_alloc_zone_blocks value against the size of
 * the data device and print the reason to stderr if it can't be used.
 * A value of zero means the option isn't being set and is always
 * accepted.
 *
 * The zone size must be at least the mount data fill target, large
 * enough that the whole device, including a partial final zone, fits
 * in SCOUTFS_DATA_ALLOC_MAX_ZONES zones, and no larger than the device.
 *
 * Returns true if the value is invalid and false if it is acceptable.
 */
static bool invalid_data_alloc_zone_blocks(u64 total_data_blocks, u64 zone_blocks)
{
	u64 nr;

	if (zone_blocks == 0)
		return false;

	if (zone_blocks < SCOUTFS_SERVER_DATA_FILL_TARGET) {
		fprintf(stderr, "setting data_alloc_zone_blocks to '%llu' failed, must be at least %llu mount data allocation target blocks\n",
			zone_blocks, SCOUTFS_SERVER_DATA_FILL_TARGET);
		return true;
	}

	/* round up so that a partial final zone is counted against the max */
	nr = (total_data_blocks + SCOUTFS_DATA_ALLOC_MAX_ZONES - 1) /
	     SCOUTFS_DATA_ALLOC_MAX_ZONES;
	if (zone_blocks < nr) {
		fprintf(stderr, "setting data_alloc_zone_blocks to '%llu' failed, must be at least %llu blocks which results in max %u zones\n",
			zone_blocks, nr, SCOUTFS_DATA_ALLOC_MAX_ZONES);
		return true;
	}

	if (zone_blocks > total_data_blocks) {
		fprintf(stderr, "setting data_alloc_zone_blocks to '%llu' failed, must be at most %llu total data device blocks\n",
			zone_blocks, total_data_blocks);
		return true;
	}

	return false;
}
|
||||
|
||||
struct mkfs_args {
|
||||
char *meta_device;
|
||||
char *data_device;
|
||||
unsigned long long max_meta_size;
|
||||
unsigned long long max_data_size;
|
||||
u64 data_alloc_zone_blocks;
|
||||
bool force;
|
||||
int nr_slots;
|
||||
struct scoutfs_quorum_slot slots[SCOUTFS_QUORUM_MAX_SLOTS];
|
||||
@@ -210,6 +248,17 @@ static int do_mkfs(struct mkfs_args *args)
|
||||
member_sizeof(struct scoutfs_super_block, qconf.slots));
|
||||
memcpy(super->qconf.slots, args->slots, sizeof(args->slots));
|
||||
|
||||
if (invalid_data_alloc_zone_blocks(le64_to_cpu(super->total_data_blocks),
|
||||
args->data_alloc_zone_blocks)) {
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (args->data_alloc_zone_blocks) {
|
||||
super->volopt.set_bits |= cpu_to_le64(SCOUTFS_VOLOPT_DATA_ALLOC_ZONE_BLOCKS_BIT);
|
||||
super->volopt.data_alloc_zone_blocks = cpu_to_le64(args->data_alloc_zone_blocks);
|
||||
}
|
||||
|
||||
/* fs root starts with root inode and its index items */
|
||||
blkno = next_meta++;
|
||||
btree_init_root_single(&super->fs_root, bt, 1, blkno);
|
||||
@@ -471,6 +520,17 @@ static int parse_opt(int key, char *arg, struct argp_state *state)
|
||||
prev_val, args->max_data_size);
|
||||
break;
|
||||
}
|
||||
case 'z': /* data-alloc-zone-blocks */
|
||||
{
|
||||
ret = parse_u64(arg, &args->data_alloc_zone_blocks);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
if (args->data_alloc_zone_blocks == 0)
|
||||
argp_error(state, "must provide non-zero data-alloc-zone-blocks");
|
||||
|
||||
break;
|
||||
}
|
||||
case ARGP_KEY_ARG:
|
||||
if (!args->meta_device)
|
||||
args->meta_device = strdup_or_error(state, arg);
|
||||
@@ -501,6 +561,7 @@ static struct argp_option options[] = {
|
||||
{ "force", 'f', NULL, 0, "Overwrite existing data on block devices"},
|
||||
{ "max-meta-size", 'm', "SIZE", 0, "Use a size less than the base metadata device size (bytes or KMGTP units)"},
|
||||
{ "max-data-size", 'd', "SIZE", 0, "Use a size less than the base data device size (bytes or KMGTP units)"},
|
||||
{ "data-alloc-zone-blocks", 'z', "BLOCKS", 0, "Divide data device into block zones so each mounts writes to a zone (4KB blocks)"},
|
||||
{ NULL }
|
||||
};
|
||||
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
#define _GNU_SOURCE /* ffsll for glibc < 2.27 */
|
||||
#include <unistd.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
@@ -272,6 +273,9 @@ static int print_log_trees_item(struct scoutfs_key *key, void *val,
|
||||
unsigned val_len, void *arg)
|
||||
{
|
||||
struct scoutfs_log_trees *lt = val;
|
||||
u64 zones;
|
||||
int bit;
|
||||
int i;
|
||||
|
||||
printf(" rid %llu nr %llu\n",
|
||||
le64_to_cpu(key->sklt_rid), le64_to_cpu(key->sklt_nr));
|
||||
@@ -287,7 +291,9 @@ static int print_log_trees_item(struct scoutfs_key *key, void *val,
|
||||
" srch_file: "SRF_FMT"\n"
|
||||
" max_item_vers: %llu\n"
|
||||
" rid: %016llx\n"
|
||||
" nr: %llu\n",
|
||||
" nr: %llu\n"
|
||||
" data_alloc_zone_blocks: %llu\n"
|
||||
" data_alloc_zones: ",
|
||||
AL_HEAD_A(&lt->meta_avail),
|
||||
AL_HEAD_A(&lt->meta_freed),
|
||||
lt->item_root.height,
|
||||
@@ -300,7 +306,21 @@ static int print_log_trees_item(struct scoutfs_key *key, void *val,
|
||||
SRF_A(&lt->srch_file),
|
||||
le64_to_cpu(lt->max_item_vers),
|
||||
le64_to_cpu(lt->rid),
|
||||
le64_to_cpu(lt->nr));
|
||||
le64_to_cpu(lt->nr),
|
||||
le64_to_cpu(lt->data_alloc_zone_blocks));
|
||||
|
||||
for (i = 0; i < SCOUTFS_DATA_ALLOC_ZONE_LE64S; i++) {
|
||||
if (lt->data_alloc_zones[i] == 0)
|
||||
continue;
|
||||
|
||||
zones = le64_to_cpu(lt->data_alloc_zones[i]);
|
||||
while (zones) {
|
||||
bit = ffsll(zones) - 1;
|
||||
printf("%u ", (i * 64) + bit);
|
||||
zones ^= (1ULL << bit);
|
||||
}
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
return 0;
|
||||
@@ -362,17 +382,17 @@ static int print_mounted_client_entry(struct scoutfs_key *key, void *val,
|
||||
static int print_alloc_item(struct scoutfs_key *key, void *val,
|
||||
unsigned val_len, void *arg)
|
||||
{
|
||||
if (key->sk_type == SCOUTFS_FREE_EXTENT_BLKNO_TYPE)
|
||||
if (key->sk_zone == SCOUTFS_FREE_EXTENT_BLKNO_ZONE)
|
||||
printf(" free extent: blkno %llu len %llu end %llu\n",
|
||||
le64_to_cpu(key->skfb_end) -
|
||||
le64_to_cpu(key->skfb_len) + 1,
|
||||
le64_to_cpu(key->skfb_len),
|
||||
le64_to_cpu(key->skfb_end));
|
||||
else
|
||||
printf(" free extent: blkno %llu len %llu neglen %lld\n",
|
||||
le64_to_cpu(key->skfl_blkno),
|
||||
-le64_to_cpu(key->skfl_neglen),
|
||||
(long long)le64_to_cpu(key->skfl_neglen));
|
||||
printf(" free extent: blkno %llu len %llu order %llu\n",
|
||||
le64_to_cpu(key->skfo_end) - le64_to_cpu(key->skfo_len) + 1,
|
||||
le64_to_cpu(key->skfo_len),
|
||||
(long long)(U64_MAX - le64_to_cpu(key->skfo_revord)));
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -900,6 +920,14 @@ static void print_super_block(struct scoutfs_super_block *super, u64 blkno)
|
||||
le64_to_cpu(super->fs_root.ref.blkno),
|
||||
le64_to_cpu(super->fs_root.ref.seq));
|
||||
|
||||
printf(" volume options:\n"
|
||||
" set_bits: %016llx\n",
|
||||
le64_to_cpu(super->volopt.set_bits));
|
||||
if (le64_to_cpu(super->volopt.set_bits) & SCOUTFS_VOLOPT_DATA_ALLOC_ZONE_BLOCKS_BIT) {
|
||||
printf(" data_alloc_zone_blocks: %llu\n",
|
||||
le64_to_cpu(super->volopt.data_alloc_zone_blocks));
|
||||
}
|
||||
|
||||
printf(" quorum config version %llu\n",
|
||||
le64_to_cpu(super->qconf.version));
|
||||
for (i = 0; i < array_size(super->qconf.slots); i++) {
|
||||
|
||||
Reference in New Issue
Block a user