Merge pull request #40 from versity/zab/data_alloc_zones

Zab/data alloc zones
This commit is contained in:
Zach Brown
2021-05-24 13:00:48 -07:00
committed by GitHub
16 changed files with 1085 additions and 79 deletions

View File

@@ -42,6 +42,7 @@ scoutfs-y += \
trans.o \
triggers.o \
tseq.o \
volopt.o \
xattr.o
#

View File

@@ -29,8 +29,8 @@
* The core allocator uses extent items in btrees rooted in the super.
* Each free extent is stored in two items. The first item is indexed
* by block location and is used to merge adjacent extents when freeing.
* The second item is indexed by length and is used to find large
* extents to allocate from.
* The second item is indexed by the order of the length and is used to
* find large extents to allocate from.
*
* Free extent always consumes the front of the largest extent. This
* attempts to discourage fragmentation by giving smaller freed extents
@@ -67,25 +67,52 @@
*/
/*
* Free extents don't have flags and are stored in two indexes sorted by
* block location and by length, largest first. The block location key
* is set to the final block in the extent so that we can find
* intersections by calling _next() iterators starting with the block
* we're searching for.
* Return the order of the length of a free extent, which we define as
* floor(log_8_(len)): 0..7 = 0, 8..63 = 1, etc.
*/
static void init_ext_key(struct scoutfs_key *key, int type, u64 start, u64 len)
static u64 free_extent_order(u64 len)
{
	/*
	 * Each order spans three bits of length because 8 == 2^3.  The
	 * low bit is or-ed in so that a zero length still has a highest
	 * set bit and lands in order 0.
	 */
	int high_bit = fls64(len | 1) - 1;

	return high_bit / 3;
}
/*
 * Return the smallest non-zero length that shares an order with the
 * given length: 8 raised to the power of the length's order.
 */
static u64 smallest_order_length(u64 len)
{
	/* three bits of shift per order, matching free_extent_order() */
	u64 shift = free_extent_order(len) * 3;

	return 1ULL << shift;
}
/*
* Free extents don't have flags and are stored in two indexes sorted by
* block location and by length order, largest first. The location key
* field is set to the final block in the extent so that we can find
* intersections by calling _next() with the start of the range we're
* searching for.
*
* We never store 0 length extents but we do build keys for searching
* the order index from 0,0 without having to map it to a real extent.
*/
static void init_ext_key(struct scoutfs_key *key, int zone, u64 start, u64 len)
{
*key = (struct scoutfs_key) {
.sk_zone = SCOUTFS_FREE_EXTENT_ZONE,
.sk_type = type,
.sk_zone = zone,
};
if (type == SCOUTFS_FREE_EXTENT_BLKNO_TYPE) {
if (len == 0) {
/* we only use 0 len extents for magic 0,0 order lookups */
WARN_ON_ONCE(zone != SCOUTFS_FREE_EXTENT_ORDER_ZONE || start != 0);
return;
}
if (zone == SCOUTFS_FREE_EXTENT_BLKNO_ZONE) {
key->skfb_end = cpu_to_le64(start + len - 1);
key->skfb_len = cpu_to_le64(len);
} else if (type == SCOUTFS_FREE_EXTENT_LEN_TYPE) {
key->skfl_neglen = cpu_to_le64(-len);
key->skfl_blkno = cpu_to_le64(start);
} else if (zone == SCOUTFS_FREE_EXTENT_ORDER_ZONE) {
key->skfo_revord = cpu_to_le64(U64_MAX - free_extent_order(len));
key->skfo_end = cpu_to_le64(start + len - 1);
key->skfo_len = cpu_to_le64(len);
} else {
BUG();
}
@@ -93,23 +120,27 @@ static void init_ext_key(struct scoutfs_key *key, int type, u64 start, u64 len)
static void ext_from_key(struct scoutfs_extent *ext, struct scoutfs_key *key)
{
if (key->sk_type == SCOUTFS_FREE_EXTENT_BLKNO_TYPE) {
if (key->sk_zone == SCOUTFS_FREE_EXTENT_BLKNO_ZONE) {
ext->start = le64_to_cpu(key->skfb_end) -
le64_to_cpu(key->skfb_len) + 1;
ext->len = le64_to_cpu(key->skfb_len);
} else {
ext->start = le64_to_cpu(key->skfl_blkno);
ext->len = -le64_to_cpu(key->skfl_neglen);
ext->start = le64_to_cpu(key->skfo_end) -
le64_to_cpu(key->skfo_len) + 1;
ext->len = le64_to_cpu(key->skfo_len);
}
ext->map = 0;
ext->flags = 0;
/* we never store 0 length extents */
WARN_ON_ONCE(ext->len == 0);
}
struct alloc_ext_args {
struct scoutfs_alloc *alloc;
struct scoutfs_block_writer *wri;
struct scoutfs_alloc_root *root;
int type;
int zone;
};
static int alloc_ext_next(struct super_block *sb, void *arg,
@@ -120,13 +151,13 @@ static int alloc_ext_next(struct super_block *sb, void *arg,
struct scoutfs_key key;
int ret;
init_ext_key(&key, args->type, start, len);
init_ext_key(&key, args->zone, start, len);
ret = scoutfs_btree_next(sb, &args->root->root, &key, &iref);
if (ret == 0) {
if (iref.val_len != 0)
ret = -EIO;
else if (iref.key->sk_type != args->type)
else if (iref.key->sk_zone != args->zone)
ret = -ENOENT;
else
ext_from_key(ext, iref.key);
@@ -139,19 +170,19 @@ static int alloc_ext_next(struct super_block *sb, void *arg,
return ret;
}
static int other_type(int type)
static int other_zone(int zone)
{
if (type == SCOUTFS_FREE_EXTENT_BLKNO_TYPE)
return SCOUTFS_FREE_EXTENT_LEN_TYPE;
else if (type == SCOUTFS_FREE_EXTENT_LEN_TYPE)
return SCOUTFS_FREE_EXTENT_BLKNO_TYPE;
if (zone == SCOUTFS_FREE_EXTENT_BLKNO_ZONE)
return SCOUTFS_FREE_EXTENT_ORDER_ZONE;
else if (zone == SCOUTFS_FREE_EXTENT_ORDER_ZONE)
return SCOUTFS_FREE_EXTENT_BLKNO_ZONE;
else
BUG();
}
/*
* Insert an extent along with its matching item which is indexed by
* opposite of its len or blkno. If we succeed we update the root's
* opposite of its order or blkno. If we succeed we update the root's
* record of the total length of all the stored extents.
*/
static int alloc_ext_insert(struct super_block *sb, void *arg,
@@ -167,8 +198,8 @@ static int alloc_ext_insert(struct super_block *sb, void *arg,
if (WARN_ON_ONCE(map || flags))
return -EINVAL;
init_ext_key(&key, args->type, start, len);
init_ext_key(&other, other_type(args->type), start, len);
init_ext_key(&key, args->zone, start, len);
init_ext_key(&other, other_zone(args->zone), start, len);
ret = scoutfs_btree_insert(sb, args->alloc, args->wri,
&args->root->root, &key, NULL, 0);
@@ -196,8 +227,8 @@ static int alloc_ext_remove(struct super_block *sb, void *arg,
int ret;
int err;
init_ext_key(&key, args->type, start, len);
init_ext_key(&other, other_type(args->type), start, len);
init_ext_key(&key, args->zone, start, len);
init_ext_key(&other, other_zone(args->zone), start, len);
ret = scoutfs_btree_delete(sb, args->alloc, args->wri,
&args->root->root, &key);
@@ -619,7 +650,7 @@ int scoutfs_dalloc_return_cached(struct super_block *sb,
.alloc = alloc,
.wri = wri,
.root = &dalloc->root,
.type = SCOUTFS_FREE_EXTENT_BLKNO_TYPE,
.zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE,
};
int ret = 0;
@@ -655,7 +686,7 @@ int scoutfs_alloc_data(struct super_block *sb, struct scoutfs_alloc *alloc,
.alloc = alloc,
.wri = wri,
.root = &dalloc->root,
.type = SCOUTFS_FREE_EXTENT_LEN_TYPE,
.zone = SCOUTFS_FREE_EXTENT_ORDER_ZONE,
};
struct scoutfs_extent ext;
u64 len;
@@ -728,7 +759,7 @@ int scoutfs_free_data(struct super_block *sb, struct scoutfs_alloc *alloc,
.alloc = alloc,
.wri = wri,
.root = root,
.type = SCOUTFS_FREE_EXTENT_BLKNO_TYPE,
.zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE,
};
int ret;
@@ -741,6 +772,95 @@ int scoutfs_free_data(struct super_block *sb, struct scoutfs_alloc *alloc,
return ret;
}
/*
 * Return the number of the first zone whose bit is set in the bitmap
 * and which the extent intersects.  SCOUTFS_DATA_ALLOC_MAX_ZONES is
 * returned when none of the zones the extent covers have their bit set.
 */
static int first_extent_zone(struct scoutfs_extent *ext, __le64 *zones, u64 zone_blocks)
{
	/* zone numbers of the first and last blocks in the extent */
	int first = div64_u64(ext->start, zone_blocks);
	int last = div64_u64(ext->start + ext->len - 1, zone_blocks);
	/* first set bit at or after the extent's first zone */
	int nr = find_next_bit_le(zones, SCOUTFS_DATA_ALLOC_MAX_ZONES, first);

	return nr <= last ? nr : SCOUTFS_DATA_ALLOC_MAX_ZONES;
}
/*
 * Find an extent in specific zones to satisfy an allocation. We use
 * the order items to search for the largest extent that intersects with
 * the zones whose bits are set in the caller's bitmap.
 *
 * On success 0 is returned, *found_ret is set to the whole free extent
 * that was found, and *ext_ret describes the portion to allocate: it
 * starts at the later of the found extent's start and the start of the
 * first matching zone, and its length is limited to count and to the
 * end of the found extent.  Only the start and len fields of *ext_ret
 * are assigned here; the caller is expected to fill in the rest.
 *
 * -ENOENT is returned if no zone bits are set or if every order was
 * searched without finding an intersecting extent.  Other negative
 * errors come from scoutfs_ext_next().
 */
static int find_zone_extent(struct super_block *sb, struct scoutfs_alloc_root *root,
__le64 *zones, u64 zone_blocks,
struct scoutfs_extent *found_ret, u64 count,
struct scoutfs_extent *ext_ret)
{
/* only .root and .zone are set: this function only reads via scoutfs_ext_next() */
struct alloc_ext_args args = {
.root = root,
.zone = SCOUTFS_FREE_EXTENT_ORDER_ZONE,
};
struct scoutfs_extent found;
struct scoutfs_extent ext;
u64 start;
u64 len;
int nr;
int ret;
/* don't bother when there are no bits set */
if (find_next_bit_le(zones, SCOUTFS_DATA_ALLOC_MAX_ZONES, 0) ==
SCOUTFS_DATA_ALLOC_MAX_ZONES)
return -ENOENT;
/* start searching for largest extent from the first zone */
/* NOTE(review): presumably SCOUTFS_BLOCK_SM_MAX maps to the highest order in use -- confirm */
len = smallest_order_length(SCOUTFS_BLOCK_SM_MAX);
nr = 0;
for (;;) {
/* search for extents in the next zone at our order */
nr = find_next_bit_le(zones, SCOUTFS_DATA_ALLOC_MAX_ZONES, nr);
if (nr >= SCOUTFS_DATA_ALLOC_MAX_ZONES) {
/* wrap down to next smaller order if we run out of bits */
len >>= 3;
if (len == 0) {
ret = -ENOENT;
break;
}
/* at least one bit is set (checked above) so this finds a valid zone */
nr = find_next_bit_le(zones, SCOUTFS_DATA_ALLOC_MAX_ZONES, 0);
}
/* build the order search position from the start of the zone */
start = (u64)nr * zone_blocks;
ret = scoutfs_ext_next(sb, &alloc_ext_ops, &args, start, len, &found);
if (ret < 0)
break;
/* see if the next extent intersects any zones */
nr = first_extent_zone(&found, zones, zone_blocks);
if (nr < SCOUTFS_DATA_ALLOC_MAX_ZONES) {
/* trim the front of the extent to the first matching zone */
start = (u64)nr * zone_blocks;
ext.start = max(start, found.start);
ext.len = min(count, found.start + found.len - ext.start);
*found_ret = found;
*ext_ret = ext;
ret = 0;
break;
}
/* continue searching past extent */
/* resume from the zone after the extent's last block, at the extent's own order */
nr = div64_u64(found.start + found.len - 1, zone_blocks) + 1;
len = smallest_order_length(found.len);
}
return ret;
}
/*
* Move extent items adding up to the requested total length from the
@@ -751,6 +871,11 @@ int scoutfs_free_data(struct super_block *sb, struct scoutfs_alloc *alloc,
* -ENOENT is returned if we run out of extents in the source tree
* before moving the total.
*
* The caller can specify that extents in the source tree should first
* be found based on their zone bitmaps. We'll first try to find
* extents in the exclusive zones, then vacant zones, and then we'll
* fall back to normal allocation that ignores zones.
*
* This first pass is not optimal because it performs full btree walks
* per extent. We could optimize this with more clever btree item
* manipulation functions which can iterate through src and dst blocks
@@ -759,32 +884,77 @@ int scoutfs_free_data(struct super_block *sb, struct scoutfs_alloc *alloc,
int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
struct scoutfs_block_writer *wri,
struct scoutfs_alloc_root *dst,
struct scoutfs_alloc_root *src, u64 total)
struct scoutfs_alloc_root *src, u64 total,
__le64 *exclusive, __le64 *vacant, u64 zone_blocks)
{
struct alloc_ext_args args = {
.alloc = alloc,
.wri = wri,
};
struct scoutfs_extent found;
struct scoutfs_extent ext;
u64 moved = 0;
u64 count;
int ret = 0;
int err;
if (zone_blocks == 0) {
exclusive = NULL;
vacant = NULL;
}
while (moved < total) {
args.root = src;
args.type = SCOUTFS_FREE_EXTENT_LEN_TYPE;
ret = scoutfs_ext_alloc(sb, &alloc_ext_ops, &args,
0, 0, total - moved, &ext);
count = total - moved;
if (exclusive) {
/* first try to find extents in our exclusive zones */
ret = find_zone_extent(sb, src, exclusive, zone_blocks,
&found, count, &ext);
if (ret == -ENOENT) {
exclusive = NULL;
continue;
}
} else if (vacant) {
/* then try to find extents in vacant zones */
ret = find_zone_extent(sb, src, vacant, zone_blocks,
&found, count, &ext);
if (ret == -ENOENT) {
vacant = NULL;
continue;
}
} else {
/* otherwise fall back to finding extents anywhere */
args.root = src;
args.zone = SCOUTFS_FREE_EXTENT_ORDER_ZONE;
ret = scoutfs_ext_next(sb, &alloc_ext_ops, &args, 0, 0, &found);
if (ret == 0) {
ext.start = found.start;
ext.len = min(count, found.len);
}
}
if (ret < 0)
break;
/* searching set start/len, finish initializing alloced extent */
ext.map = found.map ? ext.start - found.start + found.map : 0;
ext.flags = found.flags;
/* remove the allocation from the found extent */
args.root = src;
args.zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE;
ret = scoutfs_ext_remove(sb, &alloc_ext_ops, &args, ext.start, ext.len);
if (ret < 0)
break;
/* insert the allocated extent into the dest */
args.root = dst;
args.type = SCOUTFS_FREE_EXTENT_BLKNO_TYPE;
args.zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE;
ret = scoutfs_ext_insert(sb, &alloc_ext_ops, &args, ext.start,
ext.len, ext.map, ext.flags);
if (ret < 0) {
/* and put it back in src if insertion failed */
args.root = src;
args.type = SCOUTFS_FREE_EXTENT_BLKNO_TYPE;
args.zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE;
err = scoutfs_ext_insert(sb, &alloc_ext_ops, &args,
ext.start, ext.len, ext.map,
ext.flags);
@@ -852,7 +1022,7 @@ out:
* a list block and all the btree blocks that store extent items.
*
* At most, an extent operation can dirty down three paths of the tree
* to modify a blkno item and two distant len items. We can grow and
* to modify a blkno item and two distant order items. We can grow and
* split the root, and then those three paths could share blocks but each
* modify two leaf blocks.
*/
@@ -901,7 +1071,7 @@ int scoutfs_alloc_fill_list(struct super_block *sb,
.alloc = alloc,
.wri = wri,
.root = root,
.type = SCOUTFS_FREE_EXTENT_LEN_TYPE,
.zone = SCOUTFS_FREE_EXTENT_ORDER_ZONE,
};
struct scoutfs_alloc_list_block *lblk;
struct scoutfs_block *bl = NULL;
@@ -958,7 +1128,7 @@ int scoutfs_alloc_empty_list(struct super_block *sb,
.alloc = alloc,
.wri = wri,
.root = root,
.type = SCOUTFS_FREE_EXTENT_BLKNO_TYPE,
.zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE,
};
struct scoutfs_alloc_list_block *lblk = NULL;
struct scoutfs_block *bl = NULL;
@@ -1227,3 +1397,63 @@ out:
kfree(sc);
return ret;
}
/*
 * Carries the caller's extent callback and its opaque argument through
 * the btree item walk to alloc_btree_extent_item_cb().
 */
struct foreach_cb_args {
/* called once for each extent found */
scoutfs_alloc_extent_cb_t cb;
/* passed unchanged to cb */
void *cb_arg;
};
/*
 * Per-item callback for the walk in scoutfs_alloc_extents_cb().  Items
 * in the blkno zone are translated to extents and handed to the
 * caller's callback.  Any other zone returns -ENOENT, which
 * scoutfs_alloc_extents_cb() treats as the successful end of the walk.
 * The item value is not used.
 */
static int alloc_btree_extent_item_cb(struct super_block *sb, struct scoutfs_key *key,
				      void *val, int val_len, void *arg)
{
	struct foreach_cb_args *cba = arg;
	struct scoutfs_extent ext;

	if (key->sk_zone == SCOUTFS_FREE_EXTENT_BLKNO_ZONE) {
		ext_from_key(&ext, key);
		cba->cb(sb, cba->cb_arg, &ext);
		return 0;
	}

	return -ENOENT;
}
/*
 * Call the caller's callback on each extent stored in the allocator's
 * btree. The callback sees extents called in order by starting blkno.
 *
 * Only items in the blkno zone are reported; the per-item callback
 * returns -ENOENT for any other zone and that is translated to a
 * return of 0 here.  Returns 0 on success or a negative error from
 * scoutfs_btree_read_items().
 */
int scoutfs_alloc_extents_cb(struct super_block *sb, struct scoutfs_alloc_root *root,
scoutfs_alloc_extent_cb_t cb, void *cb_arg)
{
struct foreach_cb_args cba = {
.cb = cb,
.cb_arg = cb_arg,
};
struct scoutfs_key start;
struct scoutfs_key end;
struct scoutfs_key key;
int ret;
/* begin reading from the first possible blkno item */
init_ext_key(&key, SCOUTFS_FREE_EXTENT_BLKNO_ZONE, 0, 1);
for (;;) {
/* will stop at order items before getting stuck in final block */
BUILD_BUG_ON(SCOUTFS_FREE_EXTENT_BLKNO_ZONE > SCOUTFS_FREE_EXTENT_ORDER_ZONE);
/* the range is rebuilt each pass: from the first blkno key up to an order zone key */
init_ext_key(&start, SCOUTFS_FREE_EXTENT_BLKNO_ZONE, 0, 1);
init_ext_key(&end, SCOUTFS_FREE_EXTENT_ORDER_ZONE, 0, 1);
/*
 * NOTE(review): scoutfs_btree_read_items() isn't visible here.  The
 * check of end.sk_zone below implies that it rewrites 'end' to the
 * last key that this read covered -- confirm.
 */
ret = scoutfs_btree_read_items(sb, &root->root, &key, &start, &end,
alloc_btree_extent_item_cb, &cba);
if (ret < 0 || end.sk_zone != SCOUTFS_FREE_EXTENT_BLKNO_ZONE) {
/* -ENOENT only means that we ran out of blkno items */
if (ret == -ENOENT)
ret = 0;
break;
}
/* continue reading from the key after the end of this read */
key = end;
scoutfs_key_inc(&key);
}
return ret;
}

View File

@@ -125,7 +125,8 @@ int scoutfs_free_data(struct super_block *sb, struct scoutfs_alloc *alloc,
int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
struct scoutfs_block_writer *wri,
struct scoutfs_alloc_root *dst,
struct scoutfs_alloc_root *src, u64 total);
struct scoutfs_alloc_root *src, u64 total,
__le64 *exclusive, __le64 *vacant, u64 zone_blocks);
int scoutfs_alloc_fill_list(struct super_block *sb,
struct scoutfs_alloc *alloc,
@@ -153,4 +154,9 @@ typedef int (*scoutfs_alloc_foreach_cb_t)(struct super_block *sb, void *arg,
int scoutfs_alloc_foreach(struct super_block *sb,
scoutfs_alloc_foreach_cb_t cb, void *arg);
typedef void (*scoutfs_alloc_extent_cb_t)(struct super_block *sb, void *cb_arg,
struct scoutfs_extent *ext);
int scoutfs_alloc_extents_cb(struct super_block *sb, struct scoutfs_alloc_root *root,
scoutfs_alloc_extent_cb_t cb, void *cb_arg);
#endif

View File

@@ -249,6 +249,33 @@ int scoutfs_client_open_ino_map(struct super_block *sb, u64 group_nr,
&args, sizeof(args), map, sizeof(*map));
}
/*
 * Send a synchronous GET_VOLOPT request to the server.  No request
 * payload is sent; the caller's volopt is handed to the request as the
 * buffer for the server's reply.  Returns the result of the request.
 */
int scoutfs_client_get_volopt(struct super_block *sb, struct scoutfs_volume_options *volopt)
{
	struct client_info *client = SCOUTFS_SB(sb)->client_info;
	int ret;

	ret = scoutfs_net_sync_request(sb, client->conn, SCOUTFS_NET_CMD_GET_VOLOPT,
				       NULL, 0, volopt, sizeof(*volopt));
	return ret;
}
/*
 * Send a synchronous SET_VOLOPT request to the server with the caller's
 * volopt as the request payload.  No reply payload is expected.
 * Returns the result of the request.
 */
int scoutfs_client_set_volopt(struct super_block *sb, struct scoutfs_volume_options *volopt)
{
	struct client_info *client = SCOUTFS_SB(sb)->client_info;
	int ret;

	ret = scoutfs_net_sync_request(sb, client->conn, SCOUTFS_NET_CMD_SET_VOLOPT,
				       volopt, sizeof(*volopt), NULL, 0);
	return ret;
}
/*
 * Send a synchronous CLEAR_VOLOPT request to the server with the
 * caller's volopt as the request payload.  No reply payload is
 * expected.  Returns the result of the request.
 */
int scoutfs_client_clear_volopt(struct super_block *sb, struct scoutfs_volume_options *volopt)
{
	struct client_info *client = SCOUTFS_SB(sb)->client_info;
	int ret;

	ret = scoutfs_net_sync_request(sb, client->conn, SCOUTFS_NET_CMD_CLEAR_VOLOPT,
				       volopt, sizeof(*volopt), NULL, 0);
	return ret;
}
/* The client is receiving an invalidation request from the server */
static int client_lock(struct super_block *sb,
struct scoutfs_net_connection *conn, u8 cmd, u64 id,

View File

@@ -26,6 +26,9 @@ int scoutfs_client_send_omap_response(struct super_block *sb, u64 id,
struct scoutfs_open_ino_map *map);
int scoutfs_client_open_ino_map(struct super_block *sb, u64 group_nr,
struct scoutfs_open_ino_map *map);
int scoutfs_client_get_volopt(struct super_block *sb, struct scoutfs_volume_options *volopt);
int scoutfs_client_set_volopt(struct super_block *sb, struct scoutfs_volume_options *volopt);
int scoutfs_client_clear_volopt(struct super_block *sb, struct scoutfs_volume_options *volopt);
int scoutfs_client_setup(struct super_block *sb);
void scoutfs_client_destroy(struct super_block *sb);

View File

@@ -203,11 +203,12 @@ struct scoutfs_key {
#define skmc_rid _sk_first
/* free extents by blkno */
#define skfb_end _sk_second
#define skfb_len _sk_third
/* free extents by len */
#define skfl_neglen _sk_second
#define skfl_blkno _sk_third
#define skfb_end _sk_first
#define skfb_len _sk_second
/* free extents by order */
#define skfo_revord _sk_first
#define skfo_end _sk_second
#define skfo_len _sk_third
struct scoutfs_avl_root {
__le16 node;
@@ -427,6 +428,10 @@ struct scoutfs_srch_compact {
/* client -> server: compaction failed */
#define SCOUTFS_SRCH_COMPACT_FLAG_ERROR (1 << 5)
#define SCOUTFS_DATA_ALLOC_MAX_ZONES 1024
#define SCOUTFS_DATA_ALLOC_ZONE_BYTES DIV_ROUND_UP(SCOUTFS_DATA_ALLOC_MAX_ZONES, 8)
#define SCOUTFS_DATA_ALLOC_ZONE_LE64S DIV_ROUND_UP(SCOUTFS_DATA_ALLOC_MAX_ZONES, 64)
/*
* XXX I imagine we should rename these now that they've evolved to track
* all the btrees that clients use during a transaction. It's not just
@@ -440,6 +445,8 @@ struct scoutfs_log_trees {
struct scoutfs_alloc_root data_avail;
struct scoutfs_alloc_root data_freed;
struct scoutfs_srch_file srch_file;
__le64 data_alloc_zone_blocks;
__le64 data_alloc_zones[SCOUTFS_DATA_ALLOC_ZONE_LE64S];
__le64 max_item_vers;
__le64 rid;
__le64 nr;
@@ -493,7 +500,8 @@ struct scoutfs_bloom_block {
#define SCOUTFS_TRANS_SEQ_ZONE 7
#define SCOUTFS_MOUNTED_CLIENT_ZONE 8
#define SCOUTFS_SRCH_ZONE 9
#define SCOUTFS_FREE_EXTENT_ZONE 10
#define SCOUTFS_FREE_EXTENT_BLKNO_ZONE 10
#define SCOUTFS_FREE_EXTENT_ORDER_ZONE 11
/* inode index zone */
#define SCOUTFS_INODE_INDEX_META_SEQ_TYPE 1
@@ -521,10 +529,6 @@ struct scoutfs_bloom_block {
#define SCOUTFS_SRCH_PENDING_TYPE 3
#define SCOUTFS_SRCH_BUSY_TYPE 4
/* free extents in allocator btrees in client and server, by blkno or len */
#define SCOUTFS_FREE_EXTENT_BLKNO_TYPE 1
#define SCOUTFS_FREE_EXTENT_LEN_TYPE 2
/* file data extents have start and len in key */
struct scoutfs_data_extent_val {
__le64 blkno;
@@ -626,6 +630,42 @@ struct scoutfs_quorum_block {
#define SCOUTFS_QUORUM_BLOCK_LEADER (1 << 0)
/*
 * Tunable options that apply to the entire system. They can be set in
 * mkfs or in sysfs files which send an rpc to the server to make the
 * change. The super version defines the options that exist.
 *
 * @set_bits: bits for each 64bit starting offset after set_bits
 * indicate which logical option is set.
 *
 * @data_alloc_zone_blocks: if set, the data device is logically divided
 * into contiguous zones of this many blocks. Data allocation will try
 * and isolate allocated extents for each mount to their own zone. The
 * zone size must be larger than the data alloc high water mark and
 * large enough such that the number of zones is kept within its static
 * limit.
 */
struct scoutfs_volume_options {
/* bit N is set when the Nth 64bit field after set_bits holds a valid value */
__le64 set_bits;
/* option 0: size in blocks of each data allocation zone */
__le64 data_alloc_zone_blocks;
/* reserved option slots; their set_bits are covered by SCOUTFS_VOLOPT_EXPANSION_BITS */
__le64 __future_expansion[63];
};
/*
 * scoutfs_volopt_nr() gives the index of an option field counted in
 * __le64 units from the end of set_bits, so the first field after
 * set_bits is option 0.  scoutfs_volopt_bit() is the set_bits mask for
 * that option.
 */
#define scoutfs_volopt_nr(field) \
((offsetof(struct scoutfs_volume_options, field) - \
(offsetof(struct scoutfs_volume_options, set_bits) + \
member_sizeof(struct scoutfs_volume_options, set_bits))) / sizeof(__le64))
#define scoutfs_volopt_bit(field) \
(1ULL << scoutfs_volopt_nr(field))
/* option number and set_bits mask of the data_alloc_zone_blocks option */
#define SCOUTFS_VOLOPT_DATA_ALLOC_ZONE_BLOCKS_NR \
scoutfs_volopt_nr(data_alloc_zone_blocks)
#define SCOUTFS_VOLOPT_DATA_ALLOC_ZONE_BLOCKS_BIT \
scoutfs_volopt_bit(data_alloc_zone_blocks)
/* every set_bits bit from the first __future_expansion slot upwards */
#define SCOUTFS_VOLOPT_EXPANSION_BITS \
(~(scoutfs_volopt_bit(__future_expansion) - 1))
#define SCOUTFS_FLAG_IS_META_BDEV 0x01
struct scoutfs_super_block {
@@ -652,6 +692,7 @@ struct scoutfs_super_block {
struct scoutfs_btree_root trans_seqs;
struct scoutfs_btree_root mounted_clients;
struct scoutfs_btree_root srch_root;
struct scoutfs_volume_options volopt;
};
#define SCOUTFS_ROOT_INO 1
@@ -841,6 +882,9 @@ enum scoutfs_net_cmd {
SCOUTFS_NET_CMD_SRCH_GET_COMPACT,
SCOUTFS_NET_CMD_SRCH_COMMIT_COMPACT,
SCOUTFS_NET_CMD_OPEN_INO_MAP,
SCOUTFS_NET_CMD_GET_VOLOPT,
SCOUTFS_NET_CMD_SET_VOLOPT,
SCOUTFS_NET_CMD_CLEAR_VOLOPT,
SCOUTFS_NET_CMD_FAREWELL,
SCOUTFS_NET_CMD_UNKNOWN,
};

View File

@@ -99,6 +99,11 @@ struct server_info {
seqcount_t roots_seqcount;
struct scoutfs_net_roots roots;
/* serializing getting and setting of volume options */
seqcount_t volopt_seqcount;
struct mutex volopt_mutex;
struct scoutfs_volume_options volopt;
/* recovery timeout fences from work */
struct work_struct fence_pending_recov_work;
};
@@ -114,6 +119,38 @@ struct server_client_info {
struct list_head head;
};
/*
 * Return a pointer to the first 64bit option value, which is the field
 * that immediately follows set_bits in the volume options struct.
 */
static __le64 *first_valopt(struct scoutfs_volume_options *valopt)
{
	__le64 *set_bits = &valopt->set_bits;

	return set_bits + 1;
}
/*
 * A server caller wants to know if a volume option is set and wants to
 * know its value.  The option is identified by its number, which is
 * its index in the 64bit fields that follow set_bits.  *val is set to
 * the option's value if its bit is set and to 0 otherwise.  The read is
 * retried under the volopt seqcount until it doesn't race with a
 * writer.  This is quite early in the file to make it available to all
 * of the server paths.
 */
static bool get_volopt_val(struct server_info *server, int nr, u64 *val)
{
	__le64 *opt = first_valopt(&server->volopt) + nr;
	u64 bit = 1ULL << nr;
	unsigned seq;
	bool is_set;

	do {
		seq = read_seqcount_begin(&server->volopt_seqcount);
		is_set = !!(le64_to_cpu(server->volopt.set_bits) & bit);
		*val = is_set ? le64_to_cpup(opt) : 0;
	} while (read_seqcount_retry(&server->volopt_seqcount, seq));

	return is_set;
}
struct commit_waiter {
struct completion comp;
struct llist_node node;
@@ -361,9 +398,9 @@ out:
* Refill the destination root if it's fallen below the lo threshold by
* moving from the src root to bring it up to the target.
*/
static int alloc_move_refill(struct super_block *sb,
struct scoutfs_alloc_root *dst,
struct scoutfs_alloc_root *src, u64 lo, u64 target)
static int alloc_move_refill_zoned(struct super_block *sb, struct scoutfs_alloc_root *dst,
struct scoutfs_alloc_root *src, u64 lo, u64 target,
__le64 *exclusive, __le64 *vacant, u64 zone_blocks)
{
DECLARE_SERVER_INFO(sb, server);
@@ -372,7 +409,14 @@ static int alloc_move_refill(struct super_block *sb,
return scoutfs_alloc_move(sb, &server->alloc, &server->wri, dst, src,
min(target - le64_to_cpu(dst->total_len),
le64_to_cpu(src->total_len)));
le64_to_cpu(src->total_len)),
exclusive, vacant, zone_blocks);
}
/*
 * Refill without any zone preference: this calls the zoned refill with
 * no exclusive or vacant bitmaps and a zone size of 0.
 */
static inline int alloc_move_refill(struct super_block *sb, struct scoutfs_alloc_root *dst,
struct scoutfs_alloc_root *src, u64 lo, u64 target)
{
return alloc_move_refill_zoned(sb, dst, src, lo, target, NULL, NULL, 0);
}
static int alloc_move_empty(struct super_block *sb,
@@ -382,7 +426,134 @@ static int alloc_move_empty(struct super_block *sb,
DECLARE_SERVER_INFO(sb, server);
return scoutfs_alloc_move(sb, &server->alloc, &server->wri,
dst, src, le64_to_cpu(src->total_len));
dst, src, le64_to_cpu(src->total_len), NULL, NULL, 0);
}
/*
 * Set or clear all the zone bits in the bitmap which overlap with the
 * extent, depending on the 'set' argument.
 *
 * The bitmap is assumed to hold SCOUTFS_DATA_ALLOC_MAX_ZONES bits, as
 * all the bitmaps passed in by this file do.  Zones past the end of the
 * bitmap are ignored rather than modified.  This matters when an extent
 * extends past the last zone, as can happen when translating the final
 * zone of a bitmap that was recorded with a different zone size.
 *
 * A zero length extent or zero zone size is a caller bug: we warn and
 * leave the bitmap untouched.
 */
static void mod_extent_bits(__le64 *bits, u64 zone_blocks, u64 blkno, u64 len, bool set)
{
	u64 nr;
	u64 last_nr;

	/* check before dividing so that a zero zone size can't fault */
	if (WARN_ON_ONCE(len == 0 || zone_blocks == 0))
		return;

	nr = div64_u64(blkno, zone_blocks);
	last_nr = div64_u64(blkno + len - 1, zone_blocks);

	/* never touch memory past the end of the fixed size bitmap */
	if (nr >= SCOUTFS_DATA_ALLOC_MAX_ZONES)
		return;
	if (last_nr >= SCOUTFS_DATA_ALLOC_MAX_ZONES)
		last_nr = SCOUTFS_DATA_ALLOC_MAX_ZONES - 1;

	while (nr <= last_nr) {
		if (set)
			set_bit_le(nr, bits);
		else
			clear_bit_le(nr, bits);

		nr++;
	}
}
/*
 * Walk the set bits in the source bitmap, treat each as a block extent
 * that is one source zone long, and set or clear the bits in the
 * destination bitmap that overlap with that extent.  The two bitmaps
 * can use different zone sizes.
 */
static void mod_bitmap_bits(__le64 *dst, u64 dst_zone_blocks,
			    __le64 *src, u64 src_zone_blocks, bool set)
{
	int nr;

	for (nr = find_next_bit_le(src, SCOUTFS_DATA_ALLOC_MAX_ZONES, 0);
	     nr < SCOUTFS_DATA_ALLOC_MAX_ZONES;
	     nr = find_next_bit_le(src, SCOUTFS_DATA_ALLOC_MAX_ZONES, nr + 1)) {
		mod_extent_bits(dst, dst_zone_blocks,
				(u64)nr * src_zone_blocks, src_zone_blocks, set);
	}
}
/*
 * Iterate over all the log_tree items and initialize the caller's zone
 * bitmaps.  Exclusive bits are only found in the caller's items.
 * Vacant bits are not found in any items.
 *
 * The log_tree item zone bitmaps could have been stored with different
 * zone_blocks sizes.  We translate the bits into block extents and
 * record overlaps with the current zone size.
 *
 * The zones used by other mounts are accumulated in a local bitmap and
 * are only removed from the exclusive bitmap once all the items have
 * been seen.  Clearing exclusive bits as items are visited would make
 * the result depend on item order: an item from another mount that was
 * visited before the caller's item would have nothing to clear and the
 * caller's bits would then be set over a shared zone.
 *
 * Returns 0 on success, -EIO if an item has an unexpected size, or a
 * negative error from reading the items.
 *
 * The caller has the log items locked.
 */
static int get_data_alloc_zone_bits(struct super_block *sb, u64 rid, __le64 *exclusive,
				    __le64 *vacant, u64 zone_blocks)
{
	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
	__le64 others[SCOUTFS_DATA_ALLOC_ZONE_LE64S];
	SCOUTFS_BTREE_ITEM_REF(iref);
	struct scoutfs_log_trees *lt;
	struct scoutfs_key key;
	u64 lt_zone_blocks;
	int ret;
	int i;

	memset(exclusive, 0, SCOUTFS_DATA_ALLOC_ZONE_BYTES);
	memset(others, 0, sizeof(others));

	/* every zone in the device starts out vacant */
	memset(vacant, 0, SCOUTFS_DATA_ALLOC_ZONE_BYTES);
	mod_extent_bits(vacant, zone_blocks, 0, le64_to_cpu(super->total_data_blocks), true);

	scoutfs_key_init_log_trees(&key, 0, 0);
	for (;;) {
		ret = scoutfs_btree_next(sb, &super->logs_root, &key, &iref);
		if (ret == 0) {
			if (iref.val_len == sizeof(struct scoutfs_log_trees)) {
				lt = iref.val;
				lt_zone_blocks = le64_to_cpu(lt->data_alloc_zone_blocks);

				/* vacant bits have no bits found in items */
				mod_bitmap_bits(vacant, zone_blocks, lt->data_alloc_zones,
						lt_zone_blocks, false);

				/* sort the item's zones into the caller's or other mounts' */
				if (le64_to_cpu(iref.key->sklt_rid) == rid)
					mod_bitmap_bits(exclusive, zone_blocks,
							lt->data_alloc_zones,
							lt_zone_blocks, true);
				else
					mod_bitmap_bits(others, zone_blocks,
							lt->data_alloc_zones,
							lt_zone_blocks, true);

				key = *iref.key;
				scoutfs_key_inc(&key);
			} else {
				ret = -EIO;
			}
			scoutfs_btree_put_iref(&iref);
		}
		if (ret < 0) {
			if (ret == -ENOENT)
				ret = 0;
			break;
		}
	}

	/* exclusive bits are only found in caller's items */
	for (i = 0; i < SCOUTFS_DATA_ALLOC_ZONE_LE64S; i++)
		exclusive[i] &= ~others[i];

	return ret;
}
/* Forget the zones recorded in a log_trees item: clear its bitmap and its zone size. */
static void zero_data_alloc_zone_bits(struct scoutfs_log_trees *lt)
{
	memset(lt->data_alloc_zones, 0, sizeof(lt->data_alloc_zones));
	lt->data_alloc_zone_blocks = 0;
}
/* Argument handed to set_extent_zone_bits() for each extent that is visited. */
struct alloc_extent_cb_args {
/* zone bitmap in which bits are set */
__le64 *zones;
/* number of blocks in each zone of the bitmap */
u64 zone_blocks;
};
static void set_extent_zone_bits(struct super_block *sb, void *cb_arg, struct scoutfs_extent *ext)
{
struct alloc_extent_cb_args *cba = cb_arg;
mod_extent_bits(cba->zones, cba->zone_blocks, ext->start, ext->len, true);
}
/*
@@ -402,9 +573,13 @@ static int server_get_log_trees(struct super_block *sb,
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
u64 rid = scoutfs_net_client_rid(conn);
DECLARE_SERVER_INFO(sb, server);
__le64 exclusive[SCOUTFS_DATA_ALLOC_ZONE_LE64S];
__le64 vacant[SCOUTFS_DATA_ALLOC_ZONE_LE64S];
struct alloc_extent_cb_args cba;
SCOUTFS_BTREE_ITEM_REF(iref);
struct scoutfs_log_trees lt;
struct scoutfs_key key;
u64 data_zone_blocks;
int ret;
if (arg_len != 0) {
@@ -446,6 +621,14 @@ static int server_get_log_trees(struct super_block *sb,
lt.nr = key.sklt_nr;
}
if (get_volopt_val(server, SCOUTFS_VOLOPT_DATA_ALLOC_ZONE_BLOCKS_NR, &data_zone_blocks)) {
ret = get_data_alloc_zone_bits(sb, rid, exclusive, vacant, data_zone_blocks);
if (ret < 0)
goto unlock;
} else {
data_zone_blocks = 0;
}
/* return freed to server for emptying, refill avail */
mutex_lock(&server->alloc_mutex);
ret = scoutfs_alloc_splice_list(sb, &server->alloc, &server->wri,
@@ -456,13 +639,28 @@ static int server_get_log_trees(struct super_block *sb,
&lt.meta_avail, server->meta_avail,
SCOUTFS_SERVER_META_FILL_LO,
SCOUTFS_SERVER_META_FILL_TARGET) ?:
alloc_move_refill(sb, &lt.data_avail, &super->data_alloc,
SCOUTFS_SERVER_DATA_FILL_LO,
SCOUTFS_SERVER_DATA_FILL_TARGET);
alloc_move_refill_zoned(sb, &lt.data_avail, &super->data_alloc,
SCOUTFS_SERVER_DATA_FILL_LO,
SCOUTFS_SERVER_DATA_FILL_TARGET,
exclusive, vacant, data_zone_blocks);
mutex_unlock(&server->alloc_mutex);
if (ret < 0)
goto unlock;
/* record data alloc zone bits */
zero_data_alloc_zone_bits(&lt);
if (data_zone_blocks != 0) {
cba.zones = lt.data_alloc_zones;
cba.zone_blocks = data_zone_blocks;
ret = scoutfs_alloc_extents_cb(sb, &lt.data_avail, set_extent_zone_bits, &cba);
if (ret < 0) {
zero_data_alloc_zone_bits(&lt);
goto unlock;
}
lt.data_alloc_zone_blocks = cpu_to_le64(data_zone_blocks);
}
/* update client's log tree's item */
ret = scoutfs_btree_force(sb, &server->alloc, &server->wri,
&super->logs_root, &key, &lt, sizeof(lt));
@@ -634,6 +832,9 @@ static int reclaim_log_trees(struct super_block *sb, u64 rid)
alloc_move_empty(sb, &super->data_alloc, &lt.data_freed);
mutex_unlock(&server->alloc_mutex);
/* the mount is no longer writing to the zones */
zero_data_alloc_zone_bits(&lt);
err = scoutfs_btree_update(sb, &server->alloc, &server->wri,
&super->logs_root, &key, &lt, sizeof(lt));
BUG_ON(err != 0); /* alloc and log item roots out of sync */
@@ -1075,6 +1276,156 @@ out:
return 0;
}
/*
 * The server is receiving a request for the current volume options.
 *
 * The request must not carry a payload; -EINVAL is sent in the response
 * if it does.  Otherwise a consistent copy of the in-memory options is
 * sampled under the volopt seqcount and sent back.
 */
static int server_get_volopt(struct super_block *sb, struct scoutfs_net_connection *conn,
			     u8 cmd, u64 id, void *arg, u16 arg_len)
{
	DECLARE_SERVER_INFO(sb, server);
	struct scoutfs_volume_options volopt;
	unsigned seq;
	int ret = 0;

	/*
	 * The response buffer is handed to scoutfs_net_response() on the
	 * error path as well.  NOTE(review): whether it ignores the payload
	 * when ret is an error isn't visible here, so make sure that we can
	 * never hand it uninitialized stack contents.
	 */
	memset(&volopt, 0, sizeof(volopt));

	if (arg_len != 0) {
		ret = -EINVAL;
		goto out;
	}

	do {
		seq = read_seqcount_begin(&server->volopt_seqcount);
		volopt = server->volopt;
	} while (read_seqcount_retry(&server->volopt_seqcount, seq));

out:
	return scoutfs_net_response(sb, conn, cmd, id, ret, &volopt, sizeof(volopt));
}
/*
 * The server is receiving a request to update volume options.
 *
 * The in-memory options that readers use is updated only once the
 * updated options are written in the super block.  If the commit fails
 * then the super block's options are restored from the in-memory copy.
 *
 * The request payload must be a full options struct and must not set
 * any of the reserved expansion bits, otherwise -EINVAL is returned in
 * the response.  data_alloc_zone_blocks is rejected with -EINVAL unless
 * it is at least the data fill target, large enough that the device
 * isn't divided into more than SCOUTFS_DATA_ALLOC_MAX_ZONES zones, and
 * no larger than the device.
 */
static int server_set_volopt(struct super_block *sb, struct scoutfs_net_connection *conn,
			     u8 cmd, u64 id, void *arg, u16 arg_len)
{
	DECLARE_SERVER_INFO(sb, server);
	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
	struct scoutfs_volume_options *volopt;
	u64 opt;
	u64 nr;
	int ret = 0;

	if (arg_len != sizeof(struct scoutfs_volume_options)) {
		ret = -EINVAL;
		goto out;
	}
	volopt = arg;

	if (le64_to_cpu(volopt->set_bits) & SCOUTFS_VOLOPT_EXPANSION_BITS) {
		ret = -EINVAL;
		goto out;
	}

	mutex_lock(&server->volopt_mutex);

	ret = scoutfs_server_hold_commit(sb);
	if (ret)
		goto unlock;

	if (le64_to_cpu(volopt->set_bits) & SCOUTFS_VOLOPT_DATA_ALLOC_ZONE_BLOCKS_BIT) {
		opt = le64_to_cpu(volopt->data_alloc_zone_blocks);
		if (opt < SCOUTFS_SERVER_DATA_FILL_TARGET) {
			scoutfs_err(sb, "setting data_alloc_zone_blocks to '%llu' failed, must be at least %llu mount data allocation target blocks",
				    opt, SCOUTFS_SERVER_DATA_FILL_TARGET);
			ret = -EINVAL;
			goto apply;
		}

		/*
		 * The smallest zone size that keeps the device within the
		 * max number of zones is the total divided by the max,
		 * rounded up.  Rounding down would accept a zone size that
		 * creates one zone too many whenever the total isn't a
		 * multiple of the max, and that zone wouldn't fit in the
		 * fixed size zone bitmaps.
		 */
		nr = div_u64(le64_to_cpu(super->total_data_blocks) +
			     SCOUTFS_DATA_ALLOC_MAX_ZONES - 1,
			     SCOUTFS_DATA_ALLOC_MAX_ZONES);
		if (opt < nr) {
			scoutfs_err(sb, "setting data_alloc_zone_blocks to '%llu' failed, must be greater than %llu blocks which results in max %u zones",
				    opt, nr, SCOUTFS_DATA_ALLOC_MAX_ZONES);
			ret = -EINVAL;
			goto apply;
		}

		if (opt > le64_to_cpu(super->total_data_blocks)) {
			scoutfs_err(sb, "setting data_alloc_zone_blocks to '%llu' failed, must be at most %llu total data device blocks",
				    opt, le64_to_cpu(super->total_data_blocks));
			ret = -EINVAL;
			goto apply;
		}

		super->volopt.data_alloc_zone_blocks = volopt->data_alloc_zone_blocks;
		super->volopt.set_bits |= cpu_to_le64(SCOUTFS_VOLOPT_DATA_ALLOC_ZONE_BLOCKS_BIT);
	}

apply:
	/* validation failures still have to release the held commit */
	ret = scoutfs_server_apply_commit(sb, ret);

	/* publish the new options on success, otherwise roll the super back */
	write_seqcount_begin(&server->volopt_seqcount);
	if (ret == 0)
		server->volopt = super->volopt;
	else
		super->volopt = server->volopt;
	write_seqcount_end(&server->volopt_seqcount);

unlock:
	mutex_unlock(&server->volopt_mutex);
out:
	return scoutfs_net_response(sb, conn, cmd, id, ret, NULL, 0);
}
/*
 * Clear every volume option whose bit is set in the request's set_bits.
 * Clearing removes the bit from the super's set_bits and zeroes the
 * option's value.  As with setting, the server's copy of the options is
 * only updated once the commit has been applied successfully.
 */
static int server_clear_volopt(struct super_block *sb, struct scoutfs_net_connection *conn,
			       u8 cmd, u64 id, void *arg, u16 arg_len)
{
	DECLARE_SERVER_INFO(sb, server);
	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
	struct scoutfs_volume_options *req = arg;
	__le64 *opts;
	u64 clear_bits;
	int ret;
	int i;

	/* the size is checked first so set_bits is only read from a full struct */
	if (arg_len != sizeof(struct scoutfs_volume_options) ||
	    (le64_to_cpu(req->set_bits) & SCOUTFS_VOLOPT_EXPANSION_BITS)) {
		ret = -EINVAL;
		goto out;
	}

	mutex_lock(&server->volopt_mutex);

	ret = scoutfs_server_hold_commit(sb);
	if (ret)
		goto unlock;

	/* bit i of set_bits corresponds to the i'th option value */
	clear_bits = le64_to_cpu(req->set_bits);
	opts = first_valopt(&super->volopt);
	for (i = 0; i < 64; i++) {
		if (clear_bits & (1ULL << i)) {
			super->volopt.set_bits &= ~cpu_to_le64(1ULL << i);
			opts[i] = 0;
		}
	}

	ret = scoutfs_server_apply_commit(sb, ret);

	/* a failed commit puts the super's options back to the server's copy */
	write_seqcount_begin(&server->volopt_seqcount);
	if (ret)
		super->volopt = server->volopt;
	else
		server->volopt = super->volopt;
	write_seqcount_end(&server->volopt_seqcount);

unlock:
	mutex_unlock(&server->volopt_mutex);
out:
	return scoutfs_net_response(sb, conn, cmd, id, ret, NULL, 0);
}
static void init_mounted_client_key(struct scoutfs_key *key, u64 rid)
{
*key = (struct scoutfs_key) {
@@ -1565,6 +1916,9 @@ static scoutfs_net_request_t server_req_funcs[] = {
[SCOUTFS_NET_CMD_SRCH_GET_COMPACT] = server_srch_get_compact,
[SCOUTFS_NET_CMD_SRCH_COMMIT_COMPACT] = server_srch_commit_compact,
[SCOUTFS_NET_CMD_OPEN_INO_MAP] = server_open_ino_map,
[SCOUTFS_NET_CMD_GET_VOLOPT] = server_get_volopt,
[SCOUTFS_NET_CMD_SET_VOLOPT] = server_set_volopt,
[SCOUTFS_NET_CMD_CLEAR_VOLOPT] = server_clear_volopt,
[SCOUTFS_NET_CMD_FAREWELL] = server_farewell,
};
@@ -1784,6 +2138,11 @@ static void scoutfs_server_worker(struct work_struct *work)
if (ret < 0)
goto shutdown;
/* update volume options early, possibly for use during startup */
write_seqcount_begin(&server->volopt_seqcount);
server->volopt = super->volopt;
write_seqcount_end(&server->volopt_seqcount);
set_roots(server, &super->fs_root, &super->logs_root,
&super->srch_root);
scoutfs_block_writer_init(sb, &server->wri);
@@ -1932,6 +2291,8 @@ int scoutfs_server_setup(struct super_block *sb)
mutex_init(&server->srch_mutex);
mutex_init(&server->mounted_clients_mutex);
seqcount_init(&server->roots_seqcount);
seqcount_init(&server->volopt_seqcount);
mutex_init(&server->volopt_mutex);
INIT_WORK(&server->fence_pending_recov_work, fence_pending_recov_worker);
server->wq = alloc_workqueue("scoutfs_server",

View File

@@ -46,6 +46,7 @@
#include "alloc.h"
#include "recov.h"
#include "omap.h"
#include "volopt.h"
#include "scoutfs_trace.h"
static struct dentry *scoutfs_debugfs_root;
@@ -253,6 +254,7 @@ static void scoutfs_put_super(struct super_block *sb)
scoutfs_lock_shutdown(sb);
scoutfs_shutdown_trans(sb);
scoutfs_volopt_destroy(sb);
scoutfs_client_destroy(sb);
scoutfs_inode_destroy(sb);
scoutfs_item_destroy(sb);
@@ -601,6 +603,7 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
scoutfs_server_setup(sb) ?:
scoutfs_quorum_setup(sb) ?:
scoutfs_client_setup(sb) ?:
scoutfs_volopt_setup(sb) ?:
scoutfs_lock_rid(sb, SCOUTFS_LOCK_WRITE, 0, sbi->rid,
&sbi->rid_lock) ?:
scoutfs_trans_get_log_trees(sb) ?:

View File

@@ -28,6 +28,7 @@ struct forest_info;
struct srch_info;
struct recov_info;
struct omap_info;
struct volopt_info;
struct scoutfs_sb_info {
struct super_block *sb;
@@ -51,6 +52,7 @@ struct scoutfs_sb_info {
struct forest_info *forest_info;
struct srch_info *srch_info;
struct omap_info *omap_info;
struct volopt_info *volopt_info;
struct item_cache_info *item_cache_info;
wait_queue_head_t trans_hold_wq;

188
kmod/src/volopt.c Normal file
View File

@@ -0,0 +1,188 @@
/*
* Copyright (C) 2021 Versity Software, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>
#include "super.h"
#include "client.h"
#include "volopt.h"
/*
* Volume options are exposed through a sysfs directory. Getting and
* setting the values sends rpcs to the server who owns the options in
* the super block.
*/
/* Per-mount volume option state, stored in the sb info's volopt_info pointer by setup. */
struct volopt_info {
/* the super block that setup was called with */
struct super_block *sb;
/* sysfs attribute state passed to the scoutfs_sysfs_*_attrs() helpers */
struct scoutfs_sysfs_attrs ssa;
};
/* declare a local volopt_info pointer called 'name', loaded from the super block's sb info */
#define DECLARE_VOLOPT_INFO(sb, name) \
struct volopt_info *name = SCOUTFS_SB(sb)->volopt_info
/* same as above, but the super block is found from a sysfs kobject with SCOUTFS_SYSFS_ATTRS_SB() */
#define DECLARE_VOLOPT_INFO_KOBJ(kobj, name) \
DECLARE_VOLOPT_INFO(SCOUTFS_SYSFS_ATTRS_SB(kobj), name)
/*
* attribute arrays need to be dense but the options we export could
* well become sparse over time. .show and .store are generic and we
* have a lookup table to map the attributes array indexes to the number
* and name of the option.
*/
static struct volopt_nr_name {
/* option number: selects the bit in set_bits and the index of the option's value */
int nr;
/* name of the option's sysfs file */
char *name;
} volopt_table[] = {
{ SCOUTFS_VOLOPT_DATA_ALLOC_ZONE_BLOCKS_NR, "data_alloc_zone_blocks" },
};
/*
* initialized by setup, pointer array is null terminated.  Both arrays
* are indexed in step with volopt_table, which is how the generic show
* and store callbacks map an attribute back to its option.
*/
static struct kobj_attribute volopt_attrs[ARRAY_SIZE(volopt_table)];
static struct attribute *volopt_attr_ptrs[ARRAY_SIZE(volopt_table) + 1];
/*
 * Map a sysfs attribute back to its volume option.  The attribute's
 * position in volopt_attrs gives its entry in volopt_table, and the
 * entry's option number gives both the option's bit in set_bits and
 * the location of its value in the __le64s that follow set_bits in
 * the caller's options struct.
 */
static void get_opt_data(struct kobj_attribute *attr, struct scoutfs_volume_options *volopt,
			 u64 *bit, __le64 **opt)
{
	const struct volopt_nr_name *ent = &volopt_table[attr - volopt_attrs];
	__le64 *first_opt = &volopt->set_bits + 1;

	*bit = 1ULL << ent->nr;
	*opt = first_opt + ent->nr;
}
/*
 * Read the current volume options through the client and format the
 * option behind this sysfs file.  An option that isn't set reads as an
 * empty string.  Returns the length of the formatted value or a
 * negative error from the client request.
 */
static ssize_t volopt_attr_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
	DECLARE_VOLOPT_INFO_KOBJ(kobj, vinf);
	struct super_block *sb = vinf->sb;
	struct scoutfs_volume_options volopt;
	__le64 *opt;
	u64 bit;
	int ret;

	ret = scoutfs_client_get_volopt(sb, &volopt);
	if (ret < 0)
		return ret;

	get_opt_data(attr, &volopt, &bit, &opt);

	/* options whose bit is clear have no value to show */
	if (!(le64_to_cpu(volopt.set_bits) & bit)) {
		buf[0] = '\0';
		return 0;
	}

	return snprintf(buf, PAGE_SIZE, "%llu", le64_to_cpup(opt));
}
/*
 * Set or clear the volume option behind this sysfs file.  A write that
 * starts with a newline or carriage return sends a clear request for
 * the option.  Anything else is parsed as an unsigned 64bit number and
 * sent as the option's new value.
 *
 * Returns the number of bytes written on success, 0 for an empty write,
 * -ERANGE if the input doesn't fit in the parsing buffer, or the error
 * from parsing or from the client request.
 */
static ssize_t volopt_attr_store(struct kobject *kobj, struct kobj_attribute *attr,
				 const char *buf, size_t count)
{
	DECLARE_VOLOPT_INFO_KOBJ(kobj, vinf);
	struct super_block *sb = vinf->sb;
	struct scoutfs_volume_options volopt = {0,};
	char chars[32];	/* char, not u8, to match kstrtoull()'s const char * argument */
	__le64 *opt;
	u64 bit;
	u64 val;
	int ret;

	if (count == 0)
		return 0;
	/* leave room for the null terminator added below */
	if (count > sizeof(chars) - 1)
		return -ERANGE;

	/* both requests identify the option by its bit in set_bits */
	get_opt_data(attr, &volopt, &bit, &opt);
	volopt.set_bits = cpu_to_le64(bit);

	if (buf[0] == '\n' || buf[0] == '\r') {
		ret = scoutfs_client_clear_volopt(sb, &volopt);
	} else {
		/* parse from a local copy that is explicitly null terminated at count bytes */
		memcpy(chars, buf, count);
		chars[count] = '\0';

		ret = kstrtoull(chars, 0, &val);
		if (ret < 0)
			return ret;

		*opt = cpu_to_le64(val);
		ret = scoutfs_client_set_volopt(sb, &volopt);
	}

	if (ret == 0)
		ret = count;
	return ret;
}
/*
 * Allocate the volume option info for this mount and create the
 * "volume_options" sysfs attributes.  The sysfs files are slim shims
 * around client RPCs so this has to be called after the client is set
 * up, and destroy has to be called before the client is torn down.
 *
 * Returns 0 on success, -ENOMEM if the info can't be allocated, or the
 * error from creating the sysfs attributes.
 */
int scoutfs_volopt_setup(struct super_block *sb)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct volopt_info *vinf;
	int ret;
	int idx;

	/* persistent volume options are always a bitmap u64 then the 64 options */
	BUILD_BUG_ON(sizeof(struct scoutfs_volume_options) != (1 + 64) * 8);
	/* the pointer array has one more slot than the table for its null terminator */
	BUILD_BUG_ON(ARRAY_SIZE(volopt_table) != ARRAY_SIZE(volopt_attr_ptrs) - 1);

	vinf = kzalloc(sizeof(struct volopt_info), GFP_KERNEL);
	if (vinf) {
		scoutfs_sysfs_init_attrs(sb, &vinf->ssa);
		vinf->sb = sb;
		sbi->volopt_info = vinf;

		/* every option gets the same generic show and store callbacks */
		for (idx = 0; idx < ARRAY_SIZE(volopt_table); idx++) {
			volopt_attrs[idx] = (struct kobj_attribute) {
				.attr = { .name = volopt_table[idx].name, .mode = S_IWUSR | S_IRUGO },
				.show = volopt_attr_show,
				.store = volopt_attr_store,
			};
			volopt_attr_ptrs[idx] = &volopt_attrs[idx].attr;
		}
		volopt_attr_ptrs[ARRAY_SIZE(volopt_table)] = NULL;

		ret = scoutfs_sysfs_create_attrs(sb, &vinf->ssa, volopt_attr_ptrs, "volume_options");
	} else {
		ret = -ENOMEM;
	}

	/* destroy is called on either failure; it does nothing if no info is stored */
	if (ret)
		scoutfs_volopt_destroy(sb);
	return ret;
}
/*
 * Tear down the sysfs attributes and free the volume option info.  This
 * does nothing if the sb info has no volopt_info stored, which lets
 * setup call it to clean up after its own failures.
 */
void scoutfs_volopt_destroy(struct super_block *sb)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct volopt_info *vinf = sbi->volopt_info;

	if (!vinf)
		return;

	scoutfs_sysfs_destroy_attrs(sb, &vinf->ssa);
	kfree(vinf);
	sbi->volopt_info = NULL;
}

7
kmod/src/volopt.h Normal file
View File

@@ -0,0 +1,7 @@
#ifndef _SCOUTFS_VOLOPT_H_
#define _SCOUTFS_VOLOPT_H_
/* allocate the per-mount volume option info and create its sysfs attributes; returns 0 or an error */
int scoutfs_volopt_setup(struct super_block *sb);
/* remove the sysfs attributes and free the info; does nothing if no info is stored */
void scoutfs_volopt_destroy(struct super_block *sb);
#endif

View File

@@ -66,6 +66,7 @@ $(basename $0) options:
-X | xfstests git repo. Used by tests/xfstests.sh.
-x | xfstests git branch to checkout and track.
-y | xfstests ./check additional args
-z <nr> | set data-alloc-zone-blocks in mkfs
EOF
}
@@ -169,6 +170,11 @@ while true; do
T_XFSTESTS_ARGS="$2"
shift
;;
-z)
test -n "$2" || die "-z must have nr mounts argument"
T_DATA_ALLOC_ZONE_BLOCKS="-z $2"
shift
;;
-h|-\?|--help)
show_help
exit 1
@@ -319,7 +325,8 @@ if [ -n "$T_MKFS" ]; then
done
msg "making new filesystem with $T_QUORUM quorum members"
cmd scoutfs mkfs -f $quo "$T_META_DEVICE" "$T_DATA_DEVICE"
cmd scoutfs mkfs -f $quo $T_DATA_ALLOC_ZONE_BLOCKS \
"$T_META_DEVICE" "$T_DATA_DEVICE"
fi
if [ -n "$T_INSMOD" ]; then

View File

@@ -34,6 +34,40 @@ the server for the filesystem if it is elected leader.
The assigned number must match one of the slots defined with \-Q options
when the filesystem was created with mkfs. If the number assigned
doesn't match a number created during mkfs then the mount will fail.
.SH VOLUME OPTIONS
Volume options are persistent options which are stored in the super
block in the metadata device and which apply to all mounts of the volume.
.sp
Volume options may be initially specified as the volume is created
as described in the mkfs command in
.BR scoutfs (8).
.sp
Volume options may be changed at runtime by writing to files in sysfs
while the volume is mounted. Volume options are found in the
volume_options/ directory with a file for each option. Reading the
file provides the current setting of the option and an empty string
is returned if the option is not set. To set the option, write
the new value of the option to the file. To clear the option, write
a blank line with a newline to the file. The write syscall will
return an error if the set operation fails and a message will be written
to the console.
.sp
The following volume options are supported:
.TP
.B data_alloc_zone_blocks=<zone size in 4KiB blocks>
When the data_alloc_zone_blocks option is set the data device is
logically divided into zones of equal length as specified by the value
of the option. The size of the zones must be greater than a minimum
allocation pool size, large enough to result in no more than 1024 zones,
and not more than the total number of blocks in the data device.
.sp
When set, the server will try to provide each mount with free data
extents that don't share a zone with other mounts. When a mount has free
extents in a given zone the server will try and find more free extents
in that zone. When the mount is not in a zone, or its zone has no more
free extents, the server will try and find free extents in a zone that
no other mount currently occupies. The result is to try and produce
write streams where only one mount is writing into each zone.
.SH FURTHER READING
A
.B scoutfs

View File

@@ -32,7 +32,7 @@ A path within a ScoutFS filesystem.
.PD
.TP
.BI "mkfs META-DEVICE DATA-DEVICE {-Q|--quorum-slot} NR,ADDR,PORT [-m|--max-meta-size SIZE] [-d|--max-data-size SIZE] [-f|--force]"
.BI "mkfs META-DEVICE DATA-DEVICE {-Q|--quorum-slot} NR,ADDR,PORT [-m|--max-meta-size SIZE] [-d|--max-data-size SIZE] [-z|--data-alloc-zone-blocks BLOCKS] [-f|--force]"
.sp
Initialize a new ScoutFS filesystem on the target devices. Since ScoutFS uses
separate block devices for its metadata and data storage, two are required.
@@ -81,6 +81,10 @@ kibibytes, mebibytes, etc.
.B "-d, --max-data-size SIZE"
Same as previous, but for limiting the size of the data device.
.TP
.B "-z, --data-alloc-zone-blocks BLOCKS"
Set the data_alloc_zone_blocks volume option, as described in
.BR scoutfs (5).
.TP
.B "-f, --force"
Ignore presence of existing data on the data and metadata devices.
.RE

View File

@@ -57,6 +57,15 @@ static int write_block(int fd, u32 magic, __le64 fsid, u64 seq, u64 blkno,
return 0;
}
/*
 * The order of a free extent's length is defined as
 * floor(log_8_(len)): lengths 0..7 have order 0, 8..63 have order 1,
 * and so on.
 */
static u64 free_extent_order(u64 len)
{
	/* setting the low bit gives a zero length the same order as a length of 1 */
	int high_bit = flsll(len | 1) - 1;

	return high_bit / 3;
}
/*
* Write the single btree block that contains the blkno and len indexed
* items to store the given extent, and update the root to point to it.
@@ -72,30 +81,59 @@ static int write_alloc_root(int fd, __le64 fsid,
root->total_len = cpu_to_le64(len);
memset(&key, 0, sizeof(key));
key.sk_zone = SCOUTFS_FREE_EXTENT_ZONE;
key.sk_type = SCOUTFS_FREE_EXTENT_BLKNO_TYPE;
key.skii_ino = cpu_to_le64(SCOUTFS_ROOT_INO);
key.sk_zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE;
key.skfb_end = cpu_to_le64(start + len - 1);
key.skfb_len = cpu_to_le64(len);
btree_append_item(bt, &key, NULL, 0);
memset(&key, 0, sizeof(key));
key.sk_zone = SCOUTFS_FREE_EXTENT_ZONE;
key.sk_type = SCOUTFS_FREE_EXTENT_LEN_TYPE;
key.skii_ino = cpu_to_le64(SCOUTFS_ROOT_INO);
key.skfl_neglen = cpu_to_le64(-len);
key.skfl_blkno = cpu_to_le64(start);
key.sk_zone = SCOUTFS_FREE_EXTENT_ORDER_ZONE;
key.skfo_revord = cpu_to_le64(U64_MAX - free_extent_order(len));
key.skfo_end = cpu_to_le64(start + len - 1);
key.skfo_len = cpu_to_le64(len);
btree_append_item(bt, &key, NULL, 0);
return write_block(fd, SCOUTFS_BLOCK_MAGIC_BTREE, fsid, seq, blkno,
SCOUTFS_BLOCK_LG_SHIFT, &bt->hdr);
}
/*
* 4GiB of bytes shifted down by SCOUTFS_BLOCK_SM_SHIFT.  mkfs uses this
* as the smallest data_alloc_zone_blocks value that it will accept.
*/
#define SCOUTFS_SERVER_DATA_FILL_TARGET \
((4ULL * 1024 * 1024 * 1024) >> SCOUTFS_BLOCK_SM_SHIFT)
/*
 * Check a requested data_alloc_zone_blocks value against the number of
 * blocks in the data device.  Returns true, after printing the reason
 * to stderr, if the value can't be used and false if it's acceptable.
 *
 * A value of 0 is accepted without any checks.  The zone size must be
 * at least SCOUTFS_SERVER_DATA_FILL_TARGET blocks, at least the total
 * blocks divided by SCOUTFS_DATA_ALLOC_MAX_ZONES, and no larger than
 * the total number of data blocks.
 */
static bool invalid_data_alloc_zone_blocks(u64 total_data_blocks, u64 zone_blocks)
{
	u64 nr;

	if (zone_blocks == 0)
		return false;

	/* each message ends in a newline so it doesn't run into later output */
	if (zone_blocks < SCOUTFS_SERVER_DATA_FILL_TARGET) {
		fprintf(stderr, "setting data_alloc_zone_blocks to '%llu' failed, must be at least %llu mount data allocation target blocks\n",
			zone_blocks, SCOUTFS_SERVER_DATA_FILL_TARGET);
		return true;
	}

	/* smaller zones than this would divide the device into more than the max zones */
	nr = total_data_blocks / SCOUTFS_DATA_ALLOC_MAX_ZONES;
	if (zone_blocks < nr) {
		fprintf(stderr, "setting data_alloc_zone_blocks to '%llu' failed, must be greater than %llu blocks which results in max %u zones\n",
			zone_blocks, nr, SCOUTFS_DATA_ALLOC_MAX_ZONES);
		return true;
	}

	if (zone_blocks > total_data_blocks) {
		fprintf(stderr, "setting data_alloc_zone_blocks to '%llu' failed, must be at most %llu total data device blocks\n",
			zone_blocks, total_data_blocks);
		return true;
	}

	return false;
}
struct mkfs_args {
char *meta_device;
char *data_device;
unsigned long long max_meta_size;
unsigned long long max_data_size;
u64 data_alloc_zone_blocks;
bool force;
int nr_slots;
struct scoutfs_quorum_slot slots[SCOUTFS_QUORUM_MAX_SLOTS];
@@ -210,6 +248,17 @@ static int do_mkfs(struct mkfs_args *args)
member_sizeof(struct scoutfs_super_block, qconf.slots));
memcpy(super->qconf.slots, args->slots, sizeof(args->slots));
if (invalid_data_alloc_zone_blocks(le64_to_cpu(super->total_data_blocks),
args->data_alloc_zone_blocks)) {
ret = -EINVAL;
goto out;
}
if (args->data_alloc_zone_blocks) {
super->volopt.set_bits |= cpu_to_le64(SCOUTFS_VOLOPT_DATA_ALLOC_ZONE_BLOCKS_BIT);
super->volopt.data_alloc_zone_blocks = cpu_to_le64(args->data_alloc_zone_blocks);
}
/* fs root starts with root inode and its index items */
blkno = next_meta++;
btree_init_root_single(&super->fs_root, bt, 1, blkno);
@@ -471,6 +520,17 @@ static int parse_opt(int key, char *arg, struct argp_state *state)
prev_val, args->max_data_size);
break;
}
case 'z': /* data-alloc-zone-blocks */
{
ret = parse_u64(arg, &args->data_alloc_zone_blocks);
if (ret)
return ret;
if (args->data_alloc_zone_blocks == 0)
argp_error(state, "must provide non-zero data-alloc-zone-blocks");
break;
}
case ARGP_KEY_ARG:
if (!args->meta_device)
args->meta_device = strdup_or_error(state, arg);
@@ -501,6 +561,7 @@ static struct argp_option options[] = {
{ "force", 'f', NULL, 0, "Overwrite existing data on block devices"},
{ "max-meta-size", 'm', "SIZE", 0, "Use a size less than the base metadata device size (bytes or KMGTP units)"},
{ "max-data-size", 'd', "SIZE", 0, "Use a size less than the base data device size (bytes or KMGTP units)"},
{ "data-alloc-zone-blocks", 'z', "BLOCKS", 0, "Divide data device into block zones so each mounts writes to a zone (4KB blocks)"},
{ NULL }
};

View File

@@ -1,3 +1,4 @@
#define _GNU_SOURCE /* ffsll for glibc < 2.27 */
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
@@ -272,6 +273,9 @@ static int print_log_trees_item(struct scoutfs_key *key, void *val,
unsigned val_len, void *arg)
{
struct scoutfs_log_trees *lt = val;
u64 zones;
int bit;
int i;
printf(" rid %llu nr %llu\n",
le64_to_cpu(key->sklt_rid), le64_to_cpu(key->sklt_nr));
@@ -287,7 +291,9 @@ static int print_log_trees_item(struct scoutfs_key *key, void *val,
" srch_file: "SRF_FMT"\n"
" max_item_vers: %llu\n"
" rid: %016llx\n"
" nr: %llu\n",
" nr: %llu\n"
" data_alloc_zone_blocks: %llu\n"
" data_alloc_zones: ",
AL_HEAD_A(&lt->meta_avail),
AL_HEAD_A(&lt->meta_freed),
lt->item_root.height,
@@ -300,7 +306,21 @@ static int print_log_trees_item(struct scoutfs_key *key, void *val,
SRF_A(&lt->srch_file),
le64_to_cpu(lt->max_item_vers),
le64_to_cpu(lt->rid),
le64_to_cpu(lt->nr));
le64_to_cpu(lt->nr),
le64_to_cpu(lt->data_alloc_zone_blocks));
for (i = 0; i < SCOUTFS_DATA_ALLOC_ZONE_LE64S; i++) {
if (lt->data_alloc_zones[i] == 0)
continue;
zones = le64_to_cpu(lt->data_alloc_zones[i]);
while (zones) {
bit = ffsll(zones) - 1;
printf("%u ", (i * 64) + bit);
zones ^= (1ULL << bit);
}
}
printf("\n");
}
return 0;
@@ -362,17 +382,17 @@ static int print_mounted_client_entry(struct scoutfs_key *key, void *val,
static int print_alloc_item(struct scoutfs_key *key, void *val,
unsigned val_len, void *arg)
{
if (key->sk_type == SCOUTFS_FREE_EXTENT_BLKNO_TYPE)
if (key->sk_zone == SCOUTFS_FREE_EXTENT_BLKNO_ZONE)
printf(" free extent: blkno %llu len %llu end %llu\n",
le64_to_cpu(key->skfb_end) -
le64_to_cpu(key->skfb_len) + 1,
le64_to_cpu(key->skfb_len),
le64_to_cpu(key->skfb_end));
else
printf(" free extent: blkno %llu len %llu neglen %lld\n",
le64_to_cpu(key->skfl_blkno),
-le64_to_cpu(key->skfl_neglen),
(long long)le64_to_cpu(key->skfl_neglen));
printf(" free extent: blkno %llu len %llu order %llu\n",
le64_to_cpu(key->skfo_end) - le64_to_cpu(key->skfo_len) + 1,
le64_to_cpu(key->skfo_len),
(long long)(U64_MAX - le64_to_cpu(key->skfo_revord)));
return 0;
}
@@ -900,6 +920,14 @@ static void print_super_block(struct scoutfs_super_block *super, u64 blkno)
le64_to_cpu(super->fs_root.ref.blkno),
le64_to_cpu(super->fs_root.ref.seq));
printf(" volume options:\n"
" set_bits: %016llx\n",
le64_to_cpu(super->volopt.set_bits));
if (le64_to_cpu(super->volopt.set_bits) & SCOUTFS_VOLOPT_DATA_ALLOC_ZONE_BLOCKS_BIT) {
printf(" data_alloc_zone_blocks: %llu\n",
le64_to_cpu(super->volopt.data_alloc_zone_blocks));
}
printf(" quorum config version %llu\n",
le64_to_cpu(super->qconf.version));
for (i = 0; i < array_size(super->qconf.slots); i++) {