mirror of
https://github.com/versity/scoutfs.git
synced 2026-01-05 11:45:09 +00:00
Merge pull request #160 from versity/zab/log_merging_speedups
Zab/log merging speedups
This commit is contained in:
439
kmod/src/btree.c
439
kmod/src/btree.c
@@ -2029,187 +2029,253 @@ int scoutfs_btree_rebalance(struct super_block *sb,
|
||||
key, SCOUTFS_BTREE_MAX_VAL_LEN, NULL, NULL, NULL);
|
||||
}
|
||||
|
||||
struct merge_pos {
|
||||
struct merged_range {
|
||||
struct scoutfs_key start;
|
||||
struct scoutfs_key end;
|
||||
struct rb_root root;
|
||||
int size;
|
||||
};
|
||||
|
||||
struct merged_item {
|
||||
struct rb_node node;
|
||||
struct scoutfs_btree_root *root;
|
||||
struct scoutfs_block *bl;
|
||||
struct scoutfs_btree_block *bt;
|
||||
struct scoutfs_avl_node *avl;
|
||||
struct scoutfs_key *key;
|
||||
struct scoutfs_key key;
|
||||
u64 seq;
|
||||
u8 flags;
|
||||
unsigned int val_len;
|
||||
u8 *val;
|
||||
u8 val[0];
|
||||
};
|
||||
|
||||
static struct merge_pos *first_mpos(struct rb_root *root)
|
||||
static inline struct merged_item *mitem_container(struct rb_node *node)
|
||||
{
|
||||
struct rb_node *node = rb_first(root);
|
||||
if (node)
|
||||
return container_of(node, struct merge_pos, node);
|
||||
return node ? container_of(node, struct merged_item, node) : NULL;
|
||||
}
|
||||
|
||||
/* Return the merged item with the smallest key, or NULL if the tree is empty. */
static inline struct merged_item *first_mitem(struct rb_root *root)
{
	struct rb_node *leftmost = rb_first(root);

	return mitem_container(leftmost);
}
|
||||
|
||||
/* Return the merged item with the largest key, or NULL if the tree is empty. */
static inline struct merged_item *last_mitem(struct rb_root *root)
{
	struct rb_node *rightmost = rb_last(root);

	return mitem_container(rightmost);
}
|
||||
|
||||
static inline struct merged_item *next_mitem(struct merged_item *mitem)
|
||||
{
|
||||
return mitem_container(mitem ? rb_next(&mitem->node) : NULL);
|
||||
}
|
||||
|
||||
static inline struct merged_item *prev_mitem(struct merged_item *mitem)
|
||||
{
|
||||
return mitem_container(mitem ? rb_prev(&mitem->node) : NULL);
|
||||
}
|
||||
|
||||
static struct merged_item *find_mitem(struct rb_root *root, struct scoutfs_key *key,
|
||||
struct rb_node **parent_ret, struct rb_node ***link_ret)
|
||||
{
|
||||
struct rb_node **node = &root->rb_node;
|
||||
struct rb_node *parent = NULL;
|
||||
struct merged_item *mitem;
|
||||
int cmp;
|
||||
|
||||
while (*node) {
|
||||
parent = *node;
|
||||
mitem = container_of(*node, struct merged_item, node);
|
||||
|
||||
cmp = scoutfs_key_compare(key, &mitem->key);
|
||||
|
||||
if (cmp < 0) {
|
||||
node = &(*node)->rb_left;
|
||||
} else if (cmp > 0) {
|
||||
node = &(*node)->rb_right;
|
||||
} else {
|
||||
*parent_ret = NULL;
|
||||
*link_ret = NULL;
|
||||
return mitem;
|
||||
}
|
||||
}
|
||||
|
||||
*parent_ret = parent;
|
||||
*link_ret = node;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static struct merge_pos *next_mpos(struct merge_pos *mpos)
|
||||
static void insert_mitem(struct merged_range *rng, struct merged_item *mitem,
|
||||
struct rb_node *parent, struct rb_node **link)
|
||||
{
|
||||
struct rb_node *node;
|
||||
|
||||
if (mpos && (node = rb_next(&mpos->node)))
|
||||
return container_of(node, struct merge_pos, node);
|
||||
else
|
||||
return NULL;
|
||||
rb_link_node(&mitem->node, parent, link);
|
||||
rb_insert_color(&mitem->node, &rng->root);
|
||||
rng->size += item_len_bytes(mitem->val_len);
|
||||
}
|
||||
|
||||
static void free_mpos(struct super_block *sb, struct merge_pos *mpos)
|
||||
static void replace_mitem(struct merged_range *rng, struct merged_item *victim,
|
||||
struct merged_item *new)
|
||||
{
|
||||
scoutfs_block_put(sb, mpos->bl);
|
||||
kfree(mpos);
|
||||
rb_replace_node(&victim->node, &new->node, &rng->root);
|
||||
RB_CLEAR_NODE(&victim->node);
|
||||
rng->size -= item_len_bytes(victim->val_len);
|
||||
rng->size += item_len_bytes(new->val_len);
|
||||
}
|
||||
|
||||
static void insert_mpos(struct rb_root *pos_root, struct merge_pos *ins)
|
||||
static void free_mitem(struct merged_range *rng, struct merged_item *mitem)
|
||||
{
|
||||
struct rb_node **node = &pos_root->rb_node;
|
||||
struct rb_node *parent = NULL;
|
||||
struct merge_pos *mpos;
|
||||
int cmp;
|
||||
if (IS_ERR_OR_NULL(mitem))
|
||||
return;
|
||||
|
||||
parent = NULL;
|
||||
while (*node) {
|
||||
parent = *node;
|
||||
mpos = container_of(*node, struct merge_pos, node);
|
||||
|
||||
/* sort merge items by key then newest to oldest */
|
||||
cmp = scoutfs_key_compare(ins->key, mpos->key) ?:
|
||||
-scoutfs_cmp(ins->seq, mpos->seq);
|
||||
|
||||
if (cmp < 0)
|
||||
node = &(*node)->rb_left;
|
||||
else
|
||||
node = &(*node)->rb_right;
|
||||
if (!RB_EMPTY_NODE(&mitem->node)) {
|
||||
rng->size -= item_len_bytes(mitem->val_len);
|
||||
rb_erase(&mitem->node, &rng->root);
|
||||
}
|
||||
|
||||
rb_link_node(&ins->node, parent, node);
|
||||
rb_insert_color(&ins->node, pos_root);
|
||||
kfree(mitem);
|
||||
}
|
||||
|
||||
static void trim_range_size(struct merged_range *rng, int merge_window)
|
||||
{
|
||||
struct merged_item *mitem;
|
||||
struct merged_item *tmp;
|
||||
|
||||
mitem = last_mitem(&rng->root);
|
||||
while (mitem && rng->size > merge_window) {
|
||||
|
||||
rng->end = mitem->key;
|
||||
scoutfs_key_dec(&rng->end);
|
||||
|
||||
tmp = mitem;
|
||||
mitem = prev_mitem(mitem);
|
||||
free_mitem(rng, tmp);
|
||||
}
|
||||
}
|
||||
|
||||
static void trim_range_end(struct merged_range *rng)
|
||||
{
|
||||
struct merged_item *mitem;
|
||||
struct merged_item *tmp;
|
||||
|
||||
mitem = last_mitem(&rng->root);
|
||||
while (mitem && scoutfs_key_compare(&mitem->key, &rng->end) > 0) {
|
||||
tmp = mitem;
|
||||
mitem = prev_mitem(mitem);
|
||||
free_mitem(rng, tmp);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Find the next item in the merge_pos root in the caller's range and
|
||||
* insert it into the rbtree sorted by key and version so that merging
|
||||
* can find the next newest item at the front of the rbtree. We free
|
||||
* the mpos on error or if there are no more items in the range.
|
||||
* Record and combine logged items from log roots for merging with the
|
||||
* writable destination root. The caller is responsible for trimming
|
||||
* the range if it gets too large or if the key range shrinks.
|
||||
*/
|
||||
static int reset_mpos(struct super_block *sb, struct rb_root *pos_root, struct merge_pos *mpos,
|
||||
struct scoutfs_key *start, struct scoutfs_key *end)
|
||||
static int merge_read_item(struct super_block *sb, struct scoutfs_key *key, u64 seq, u8 flags,
|
||||
void *val, int val_len, void *arg)
|
||||
{
|
||||
struct scoutfs_btree_item *item;
|
||||
struct scoutfs_avl_node *next;
|
||||
struct btree_walk_key_range kr;
|
||||
struct scoutfs_key walk_key;
|
||||
int ret = 0;
|
||||
struct merged_range *rng = arg;
|
||||
struct merged_item *mitem;
|
||||
struct merged_item *found;
|
||||
struct rb_node *parent;
|
||||
struct rb_node **link;
|
||||
int ret;
|
||||
|
||||
/* always erase before freeing or inserting */
|
||||
if (!RB_EMPTY_NODE(&mpos->node)) {
|
||||
rb_erase(&mpos->node, pos_root);
|
||||
RB_CLEAR_NODE(&mpos->node);
|
||||
}
|
||||
|
||||
/*
|
||||
* advance to next item via the avl tree. The caller's pos is
|
||||
* only ever incremented past the last key so we can use next to
|
||||
* iterate rather than using search to skip past multiple items.
|
||||
*/
|
||||
if (mpos->avl)
|
||||
mpos->avl = scoutfs_avl_next(&mpos->bt->item_root, mpos->avl);
|
||||
|
||||
/* find the next leaf with the key if we run out of items */
|
||||
walk_key = *start;
|
||||
while (!mpos->avl && !scoutfs_key_is_zeros(&walk_key)) {
|
||||
scoutfs_block_put(sb, mpos->bl);
|
||||
mpos->bl = NULL;
|
||||
ret = btree_walk(sb, NULL, NULL, mpos->root, BTW_NEXT, &walk_key,
|
||||
0, &mpos->bl, &kr, NULL);
|
||||
if (ret < 0) {
|
||||
if (ret == -ENOENT)
|
||||
ret = 0;
|
||||
free_mpos(sb, mpos);
|
||||
found = find_mitem(&rng->root, key, &parent, &link);
|
||||
if (found) {
|
||||
ret = scoutfs_forest_combine_deltas(key, found->val, found->val_len, val, val_len);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
if (ret > 0) {
|
||||
if (ret == SCOUTFS_DELTA_COMBINED) {
|
||||
scoutfs_inc_counter(sb, btree_merge_delta_combined);
|
||||
} else if (ret == SCOUTFS_DELTA_COMBINED_NULL) {
|
||||
scoutfs_inc_counter(sb, btree_merge_delta_null);
|
||||
free_mitem(rng, found);
|
||||
}
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
mpos->bt = mpos->bl->data;
|
||||
|
||||
mpos->avl = scoutfs_avl_search(&mpos->bt->item_root, cmp_key_item,
|
||||
start, NULL, NULL, &next, NULL) ?: next;
|
||||
if (mpos->avl == NULL)
|
||||
walk_key = kr.iter_next;
|
||||
if (found->seq >= seq) {
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
/* see if we're out of items within the range */
|
||||
item = node_item(mpos->avl);
|
||||
if (!item || scoutfs_key_compare(item_key(item), end) > 0) {
|
||||
free_mpos(sb, mpos);
|
||||
ret = 0;
|
||||
mitem = kmalloc(offsetof(struct merged_item, val[val_len]), GFP_NOFS);
|
||||
if (!mitem) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* insert the next item within range at its version */
|
||||
mpos->key = item_key(item);
|
||||
mpos->seq = le64_to_cpu(item->seq);
|
||||
mpos->flags = item->flags;
|
||||
mpos->val_len = item_val_len(item);
|
||||
mpos->val = item_val(mpos->bt, item);
|
||||
mitem->key = *key;
|
||||
mitem->seq = seq;
|
||||
mitem->flags = flags;
|
||||
mitem->val_len = val_len;
|
||||
if (val_len)
|
||||
memcpy(mitem->val, val, val_len);
|
||||
|
||||
if (found) {
|
||||
replace_mitem(rng, found, mitem);
|
||||
free_mitem(rng, found);
|
||||
} else {
|
||||
insert_mitem(rng, mitem, parent, link);
|
||||
}
|
||||
|
||||
insert_mpos(pos_root, mpos);
|
||||
ret = 0;
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* The caller has reset all the merge positions for all the input log
|
||||
* btree roots and wants the next logged item it should try and merge
|
||||
* with the items in the fs_root.
|
||||
* Read a range of merged items. The caller has set the key bounds of
|
||||
* the range. We read a merge window's worth of items from blocks in
|
||||
* each input btree.
|
||||
*
|
||||
* We look ahead in the logged item stream to see if we should merge any
|
||||
* older logged delta items into one result for the caller. We also
|
||||
* take this opportunity to skip and reset the mpos for any older
|
||||
* versions of the first item.
|
||||
* The caller can only use the smallest range that overlaps with all the
|
||||
* blocks that we read. We start reading from the range's start key so
|
||||
* it will always be present and we don't need to adjust it. The final
|
||||
* block we read from each input might not cover the range's end so it
|
||||
* needs to be adjusted.
|
||||
*
|
||||
* The end range can also shrink if we have to drop items because the
|
||||
* items exceeded the merge window size.
|
||||
*/
|
||||
static int next_resolved_mpos(struct super_block *sb, struct rb_root *pos_root,
|
||||
struct scoutfs_key *end, struct merge_pos **mpos_ret)
|
||||
static int read_merged_range(struct super_block *sb, struct merged_range *rng,
|
||||
struct list_head *inputs, int merge_window)
|
||||
{
|
||||
struct merge_pos *mpos;
|
||||
struct merge_pos *next;
|
||||
struct scoutfs_btree_root_head *rhead;
|
||||
struct scoutfs_key start;
|
||||
struct scoutfs_key end;
|
||||
struct scoutfs_key key;
|
||||
int ret = 0;
|
||||
int i;
|
||||
|
||||
while ((mpos = first_mpos(pos_root)) && (next = next_mpos(mpos)) &&
|
||||
!scoutfs_key_compare(mpos->key, next->key)) {
|
||||
list_for_each_entry(rhead, inputs, head) {
|
||||
key = rng->start;
|
||||
|
||||
ret = scoutfs_forest_combine_deltas(mpos->key, mpos->val, mpos->val_len,
|
||||
next->val, next->val_len);
|
||||
if (ret < 0)
|
||||
break;
|
||||
|
||||
/* reset advances to the next item */
|
||||
key = *mpos->key;
|
||||
scoutfs_key_inc(&key);
|
||||
|
||||
/* always skip next combined or older version */
|
||||
ret = reset_mpos(sb, pos_root, next, &key, end);
|
||||
if (ret < 0)
|
||||
break;
|
||||
|
||||
if (ret == SCOUTFS_DELTA_COMBINED) {
|
||||
scoutfs_inc_counter(sb, btree_merge_delta_combined);
|
||||
} else if (ret == SCOUTFS_DELTA_COMBINED_NULL) {
|
||||
scoutfs_inc_counter(sb, btree_merge_delta_null);
|
||||
/* if merging resulted in no info, skip current */
|
||||
ret = reset_mpos(sb, pos_root, mpos, &key, end);
|
||||
for (i = 0; i < merge_window; i += SCOUTFS_BLOCK_LG_SIZE) {
|
||||
start = key;
|
||||
end = rng->end;
|
||||
ret = scoutfs_btree_read_items(sb, &rhead->root, &key, &start, &end,
|
||||
merge_read_item, rng);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
if (scoutfs_key_compare(&end, &rng->end) >= 0)
|
||||
break;
|
||||
|
||||
key = end;
|
||||
scoutfs_key_inc(&key);
|
||||
}
|
||||
|
||||
if (scoutfs_key_compare(&end, &rng->end) < 0) {
|
||||
rng->end = end;
|
||||
trim_range_end(rng);
|
||||
}
|
||||
|
||||
if (rng->size > merge_window)
|
||||
trim_range_size(rng, merge_window);
|
||||
}
|
||||
|
||||
*mpos_ret = mpos;
|
||||
trace_scoutfs_btree_merge_read_range(sb, &rng->start, &rng->end, rng->size);
|
||||
ret = 0;
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -2226,6 +2292,13 @@ static int next_resolved_mpos(struct super_block *sb, struct rb_root *pos_root,
|
||||
* to allocators running low or needing to join/split the parent.
|
||||
* *next_ret is set to the next key which hasn't been merged so that the
|
||||
* caller can retry with a new allocator and subtree.
|
||||
*
|
||||
* The number of input roots can be immense. The merge_window specifies
|
||||
* the size of the set of merged items that we'll maintain as we iterate
|
||||
* over all the input roots. Once we've merged items into the window
|
||||
* from all the input roots the merged input items are then merged to
|
||||
* the writable destination root. It may take multiple passes of
|
||||
* windows of merged items to cover the input key range.
|
||||
*/
|
||||
int scoutfs_btree_merge(struct super_block *sb,
|
||||
struct scoutfs_alloc *alloc,
|
||||
@@ -2235,18 +2308,16 @@ int scoutfs_btree_merge(struct super_block *sb,
|
||||
struct scoutfs_key *next_ret,
|
||||
struct scoutfs_btree_root *root,
|
||||
struct list_head *inputs,
|
||||
bool subtree, int dirty_limit, int alloc_low)
|
||||
bool subtree, int dirty_limit, int alloc_low, int merge_window)
|
||||
{
|
||||
struct scoutfs_btree_root_head *rhead;
|
||||
struct rb_root pos_root = RB_ROOT;
|
||||
struct scoutfs_btree_item *item;
|
||||
struct scoutfs_btree_block *bt;
|
||||
struct scoutfs_block *bl = NULL;
|
||||
struct btree_walk_key_range kr;
|
||||
struct scoutfs_avl_node *par;
|
||||
struct scoutfs_key next;
|
||||
struct merge_pos *mpos;
|
||||
struct merge_pos *tmp;
|
||||
struct merged_item *mitem;
|
||||
struct merged_item *tmp;
|
||||
struct merged_range rng;
|
||||
int walk_val_len;
|
||||
int walk_flags;
|
||||
bool is_del;
|
||||
@@ -2257,49 +2328,59 @@ int scoutfs_btree_merge(struct super_block *sb,
|
||||
trace_scoutfs_btree_merge(sb, root, start, end);
|
||||
scoutfs_inc_counter(sb, btree_merge);
|
||||
|
||||
list_for_each_entry(rhead, inputs, head) {
|
||||
mpos = kzalloc(sizeof(*mpos), GFP_NOFS);
|
||||
if (!mpos) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
RB_CLEAR_NODE(&mpos->node);
|
||||
mpos->root = &rhead->root;
|
||||
|
||||
ret = reset_mpos(sb, &pos_root, mpos, start, end);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
}
|
||||
|
||||
walk_flags = BTW_DIRTY;
|
||||
if (subtree)
|
||||
walk_flags |= BTW_SUBTREE;
|
||||
walk_val_len = 0;
|
||||
|
||||
while ((ret = next_resolved_mpos(sb, &pos_root, end, &mpos)) == 0 && mpos) {
|
||||
rng.start = *start;
|
||||
rng.end = *end;
|
||||
rng.root = RB_ROOT;
|
||||
rng.size = 0;
|
||||
|
||||
ret = read_merged_range(sb, &rng, inputs, merge_window);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
for (;;) {
|
||||
/* read next window as it empties (and it is possible to read an empty range) */
|
||||
mitem = first_mitem(&rng.root);
|
||||
if (!mitem) {
|
||||
/* done if the read range hit the end */
|
||||
if (scoutfs_key_compare(&rng.end, end) >= 0)
|
||||
break;
|
||||
|
||||
/* read next batch of merged items */
|
||||
rng.start = rng.end;
|
||||
scoutfs_key_inc(&rng.start);
|
||||
rng.end = *end;
|
||||
ret = read_merged_range(sb, &rng, inputs, merge_window);
|
||||
if (ret < 0)
|
||||
break;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (scoutfs_block_writer_dirty_bytes(sb, wri) >= dirty_limit) {
|
||||
scoutfs_inc_counter(sb, btree_merge_dirty_limit);
|
||||
ret = -ERANGE;
|
||||
*next_ret = *mpos->key;
|
||||
*next_ret = mitem->key;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (scoutfs_alloc_meta_low(sb, alloc, alloc_low)) {
|
||||
scoutfs_inc_counter(sb, btree_merge_alloc_low);
|
||||
ret = -ERANGE;
|
||||
*next_ret = *mpos->key;
|
||||
*next_ret = mitem->key;
|
||||
goto out;
|
||||
}
|
||||
|
||||
scoutfs_block_put(sb, bl);
|
||||
bl = NULL;
|
||||
ret = btree_walk(sb, alloc, wri, root, walk_flags,
|
||||
mpos->key, walk_val_len, &bl, &kr, NULL);
|
||||
&mitem->key, walk_val_len, &bl, &kr, NULL);
|
||||
if (ret < 0) {
|
||||
if (ret == -ERANGE)
|
||||
*next_ret = *mpos->key;
|
||||
*next_ret = mitem->key;
|
||||
goto out;
|
||||
}
|
||||
bt = bl->data;
|
||||
@@ -2311,22 +2392,21 @@ int scoutfs_btree_merge(struct super_block *sb,
|
||||
continue;
|
||||
}
|
||||
|
||||
while ((ret = next_resolved_mpos(sb, &pos_root, end, &mpos)) == 0 && mpos) {
|
||||
|
||||
while (mitem) {
|
||||
/* walk to new leaf if we exceed parent ref key */
|
||||
if (scoutfs_key_compare(mpos->key, &kr.end) > 0)
|
||||
if (scoutfs_key_compare(&mitem->key, &kr.end) > 0)
|
||||
break;
|
||||
|
||||
/* see if there's an existing item */
|
||||
item = leaf_item_hash_search(sb, bt, mpos->key);
|
||||
is_del = !!(mpos->flags & SCOUTFS_ITEM_FLAG_DELETION);
|
||||
item = leaf_item_hash_search(sb, bt, &mitem->key);
|
||||
is_del = !!(mitem->flags & SCOUTFS_ITEM_FLAG_DELETION);
|
||||
|
||||
/* see if we're merging delta items */
|
||||
if (item && !is_del)
|
||||
delta = scoutfs_forest_combine_deltas(mpos->key,
|
||||
delta = scoutfs_forest_combine_deltas(&mitem->key,
|
||||
item_val(bt, item),
|
||||
item_val_len(item),
|
||||
mpos->val, mpos->val_len);
|
||||
mitem->val, mitem->val_len);
|
||||
else
|
||||
delta = 0;
|
||||
if (delta < 0) {
|
||||
@@ -2338,40 +2418,38 @@ int scoutfs_btree_merge(struct super_block *sb,
|
||||
scoutfs_inc_counter(sb, btree_merge_delta_null);
|
||||
}
|
||||
|
||||
trace_scoutfs_btree_merge_items(sb, mpos->root,
|
||||
mpos->key, mpos->val_len,
|
||||
trace_scoutfs_btree_merge_items(sb, &mitem->key, mitem->val_len,
|
||||
item ? root : NULL,
|
||||
item ? item_key(item) : NULL,
|
||||
item ? item_val_len(item) : 0, is_del);
|
||||
|
||||
/* rewalk and split if ins/update needs room */
|
||||
if (!is_del && !delta && !mid_free_item_room(bt, mpos->val_len)) {
|
||||
if (!is_del && !delta && !mid_free_item_room(bt, mitem->val_len)) {
|
||||
walk_flags |= BTW_INSERT;
|
||||
walk_val_len = mpos->val_len;
|
||||
walk_val_len = mitem->val_len;
|
||||
break;
|
||||
}
|
||||
|
||||
/* insert missing non-deletion merge items */
|
||||
if (!item && !is_del) {
|
||||
scoutfs_avl_search(&bt->item_root,
|
||||
cmp_key_item, mpos->key,
|
||||
scoutfs_avl_search(&bt->item_root, cmp_key_item, &mitem->key,
|
||||
&cmp, &par, NULL, NULL);
|
||||
create_item(bt, mpos->key, mpos->seq, mpos->flags,
|
||||
mpos->val, mpos->val_len, par, cmp);
|
||||
create_item(bt, &mitem->key, mitem->seq, mitem->flags,
|
||||
mitem->val, mitem->val_len, par, cmp);
|
||||
scoutfs_inc_counter(sb, btree_merge_insert);
|
||||
}
|
||||
|
||||
/* update existing items */
|
||||
if (item && !is_del && !delta) {
|
||||
item->seq = cpu_to_le64(mpos->seq);
|
||||
item->flags = mpos->flags;
|
||||
update_item_value(bt, item, mpos->val, mpos->val_len);
|
||||
item->seq = cpu_to_le64(mitem->seq);
|
||||
item->flags = mitem->flags;
|
||||
update_item_value(bt, item, mitem->val, mitem->val_len);
|
||||
scoutfs_inc_counter(sb, btree_merge_update);
|
||||
}
|
||||
|
||||
/* update combined delta item seq */
|
||||
if (delta == SCOUTFS_DELTA_COMBINED) {
|
||||
item->seq = cpu_to_le64(mpos->seq);
|
||||
item->seq = cpu_to_le64(mitem->seq);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -2403,21 +2481,18 @@ int scoutfs_btree_merge(struct super_block *sb,
|
||||
walk_flags &= ~(BTW_INSERT | BTW_DELETE);
|
||||
walk_val_len = 0;
|
||||
|
||||
/* finished with this key, skip any older items */
|
||||
next = *mpos->key;
|
||||
scoutfs_key_inc(&next);
|
||||
ret = reset_mpos(sb, &pos_root, mpos, &next, end);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
/* finished with this merged item */
|
||||
tmp = mitem;
|
||||
mitem = next_mitem(mitem);
|
||||
free_mitem(&rng, tmp);
|
||||
}
|
||||
}
|
||||
|
||||
ret = 0;
|
||||
out:
|
||||
scoutfs_block_put(sb, bl);
|
||||
rbtree_postorder_for_each_entry_safe(mpos, tmp, &pos_root, node) {
|
||||
free_mpos(sb, mpos);
|
||||
}
|
||||
rbtree_postorder_for_each_entry_safe(mitem, tmp, &rng.root, node)
|
||||
free_mitem(&rng, mitem);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -119,7 +119,7 @@ int scoutfs_btree_merge(struct super_block *sb,
|
||||
struct scoutfs_key *next_ret,
|
||||
struct scoutfs_btree_root *root,
|
||||
struct list_head *input_list,
|
||||
bool subtree, int dirty_limit, int alloc_low);
|
||||
bool subtree, int dirty_limit, int alloc_low, int merge_window);
|
||||
|
||||
int scoutfs_btree_free_blocks(struct super_block *sb,
|
||||
struct scoutfs_alloc *alloc,
|
||||
|
||||
@@ -145,6 +145,7 @@
|
||||
EXPAND_COUNTER(lock_shrink_work) \
|
||||
EXPAND_COUNTER(lock_unlock) \
|
||||
EXPAND_COUNTER(lock_wait) \
|
||||
EXPAND_COUNTER(log_merge_wait_timeout) \
|
||||
EXPAND_COUNTER(net_dropped_response) \
|
||||
EXPAND_COUNTER(net_send_bytes) \
|
||||
EXPAND_COUNTER(net_send_error) \
|
||||
|
||||
@@ -721,7 +721,8 @@ static void scoutfs_forest_log_merge_worker(struct work_struct *work)
|
||||
ret = scoutfs_btree_merge(sb, &alloc, &wri, &req.start, &req.end,
|
||||
&next, &comp.root, &inputs,
|
||||
!!(req.flags & cpu_to_le64(SCOUTFS_LOG_MERGE_REQUEST_SUBTREE)),
|
||||
SCOUTFS_LOG_MERGE_DIRTY_BYTE_LIMIT, 10);
|
||||
SCOUTFS_LOG_MERGE_DIRTY_BYTE_LIMIT, 10,
|
||||
(2 * 1024 * 1024));
|
||||
if (ret == -ERANGE) {
|
||||
comp.remain = next;
|
||||
le64_add_cpu(&comp.flags, SCOUTFS_LOG_MERGE_COMP_REMAIN);
|
||||
|
||||
@@ -33,6 +33,7 @@ enum {
|
||||
Opt_acl,
|
||||
Opt_data_prealloc_blocks,
|
||||
Opt_data_prealloc_contig_only,
|
||||
Opt_log_merge_wait_timeout_ms,
|
||||
Opt_metadev_path,
|
||||
Opt_noacl,
|
||||
Opt_orphan_scan_delay_ms,
|
||||
@@ -45,6 +46,7 @@ static const match_table_t tokens = {
|
||||
{Opt_acl, "acl"},
|
||||
{Opt_data_prealloc_blocks, "data_prealloc_blocks=%s"},
|
||||
{Opt_data_prealloc_contig_only, "data_prealloc_contig_only=%s"},
|
||||
{Opt_log_merge_wait_timeout_ms, "log_merge_wait_timeout_ms=%s"},
|
||||
{Opt_metadev_path, "metadev_path=%s"},
|
||||
{Opt_noacl, "noacl"},
|
||||
{Opt_orphan_scan_delay_ms, "orphan_scan_delay_ms=%s"},
|
||||
@@ -113,6 +115,10 @@ static void free_options(struct scoutfs_mount_options *opts)
|
||||
kfree(opts->metadev_path);
|
||||
}
|
||||
|
||||
#define MIN_LOG_MERGE_WAIT_TIMEOUT_MS 100UL
|
||||
#define DEFAULT_LOG_MERGE_WAIT_TIMEOUT_MS 500
|
||||
#define MAX_LOG_MERGE_WAIT_TIMEOUT_MS (60 * MSEC_PER_SEC)
|
||||
|
||||
#define MIN_ORPHAN_SCAN_DELAY_MS 100UL
|
||||
#define DEFAULT_ORPHAN_SCAN_DELAY_MS (10 * MSEC_PER_SEC)
|
||||
#define MAX_ORPHAN_SCAN_DELAY_MS (60 * MSEC_PER_SEC)
|
||||
@@ -126,11 +132,27 @@ static void init_default_options(struct scoutfs_mount_options *opts)
|
||||
|
||||
opts->data_prealloc_blocks = SCOUTFS_DATA_PREALLOC_DEFAULT_BLOCKS;
|
||||
opts->data_prealloc_contig_only = 1;
|
||||
opts->log_merge_wait_timeout_ms = DEFAULT_LOG_MERGE_WAIT_TIMEOUT_MS;
|
||||
opts->orphan_scan_delay_ms = -1;
|
||||
opts->quorum_heartbeat_timeout_ms = SCOUTFS_QUORUM_DEF_HB_TIMEO_MS;
|
||||
opts->quorum_slot_nr = -1;
|
||||
}
|
||||
|
||||
static int verify_log_merge_wait_timeout_ms(struct super_block *sb, int ret, int val)
|
||||
{
|
||||
if (ret < 0) {
|
||||
scoutfs_err(sb, "failed to parse log_merge_wait_timeout_ms value");
|
||||
return -EINVAL;
|
||||
}
|
||||
if (val < MIN_LOG_MERGE_WAIT_TIMEOUT_MS || val > MAX_LOG_MERGE_WAIT_TIMEOUT_MS) {
|
||||
scoutfs_err(sb, "invalid log_merge_wait_timeout_ms value %d, must be between %lu and %lu",
|
||||
val, MIN_LOG_MERGE_WAIT_TIMEOUT_MS, MAX_LOG_MERGE_WAIT_TIMEOUT_MS);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int verify_quorum_heartbeat_timeout_ms(struct super_block *sb, int ret, u64 val)
|
||||
{
|
||||
if (ret < 0) {
|
||||
@@ -196,6 +218,14 @@ static int parse_options(struct super_block *sb, char *options, struct scoutfs_m
|
||||
opts->data_prealloc_contig_only = nr;
|
||||
break;
|
||||
|
||||
case Opt_log_merge_wait_timeout_ms:
|
||||
ret = match_int(args, &nr);
|
||||
ret = verify_log_merge_wait_timeout_ms(sb, ret, nr);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
opts->log_merge_wait_timeout_ms = nr64;
|
||||
break;
|
||||
|
||||
case Opt_metadev_path:
|
||||
ret = parse_bdev_path(sb, &args[0], &opts->metadev_path);
|
||||
if (ret < 0)
|
||||
@@ -422,6 +452,43 @@ static ssize_t data_prealloc_contig_only_store(struct kobject *kobj, struct kobj
|
||||
}
|
||||
SCOUTFS_ATTR_RW(data_prealloc_contig_only);
|
||||
|
||||
static ssize_t log_merge_wait_timeout_ms_show(struct kobject *kobj, struct kobj_attribute *attr,
|
||||
char *buf)
|
||||
{
|
||||
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
|
||||
struct scoutfs_mount_options opts;
|
||||
|
||||
scoutfs_options_read(sb, &opts);
|
||||
|
||||
return snprintf(buf, PAGE_SIZE, "%u", opts.log_merge_wait_timeout_ms);
|
||||
}
|
||||
static ssize_t log_merge_wait_timeout_ms_store(struct kobject *kobj, struct kobj_attribute *attr,
|
||||
const char *buf, size_t count)
|
||||
{
|
||||
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
|
||||
DECLARE_OPTIONS_INFO(sb, optinf);
|
||||
char nullterm[30]; /* more than enough for octal -U64_MAX */
|
||||
int val;
|
||||
int len;
|
||||
int ret;
|
||||
|
||||
len = min(count, sizeof(nullterm) - 1);
|
||||
memcpy(nullterm, buf, len);
|
||||
nullterm[len] = '\0';
|
||||
|
||||
ret = kstrtoint(nullterm, 0, &val);
|
||||
ret = verify_log_merge_wait_timeout_ms(sb, ret, val);
|
||||
if (ret == 0) {
|
||||
write_seqlock(&optinf->seqlock);
|
||||
optinf->opts.log_merge_wait_timeout_ms = val;
|
||||
write_sequnlock(&optinf->seqlock);
|
||||
ret = count;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
SCOUTFS_ATTR_RW(log_merge_wait_timeout_ms);
|
||||
|
||||
static ssize_t metadev_path_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
|
||||
{
|
||||
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
|
||||
@@ -525,6 +592,7 @@ SCOUTFS_ATTR_RO(quorum_slot_nr);
|
||||
static struct attribute *options_attrs[] = {
|
||||
SCOUTFS_ATTR_PTR(data_prealloc_blocks),
|
||||
SCOUTFS_ATTR_PTR(data_prealloc_contig_only),
|
||||
SCOUTFS_ATTR_PTR(log_merge_wait_timeout_ms),
|
||||
SCOUTFS_ATTR_PTR(metadev_path),
|
||||
SCOUTFS_ATTR_PTR(orphan_scan_delay_ms),
|
||||
SCOUTFS_ATTR_PTR(quorum_heartbeat_timeout_ms),
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
struct scoutfs_mount_options {
|
||||
u64 data_prealloc_blocks;
|
||||
bool data_prealloc_contig_only;
|
||||
unsigned int log_merge_wait_timeout_ms;
|
||||
char *metadev_path;
|
||||
unsigned int orphan_scan_delay_ms;
|
||||
int quorum_slot_nr;
|
||||
|
||||
@@ -1747,21 +1747,41 @@ TRACE_EVENT(scoutfs_btree_merge,
|
||||
sk_trace_args(end))
|
||||
);
|
||||
|
||||
TRACE_EVENT(scoutfs_btree_merge_read_range,
|
||||
TP_PROTO(struct super_block *sb, struct scoutfs_key *start, struct scoutfs_key *end,
|
||||
int size),
|
||||
|
||||
TP_ARGS(sb, start, end, size),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
SCSB_TRACE_FIELDS
|
||||
sk_trace_define(start)
|
||||
sk_trace_define(end)
|
||||
__field(int, size)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
SCSB_TRACE_ASSIGN(sb);
|
||||
sk_trace_assign(start, start);
|
||||
sk_trace_assign(end, end);
|
||||
__entry->size = size;
|
||||
),
|
||||
|
||||
TP_printk(SCSBF" start "SK_FMT" end "SK_FMT" size %d",
|
||||
SCSB_TRACE_ARGS, sk_trace_args(start), sk_trace_args(end), __entry->size)
|
||||
);
|
||||
|
||||
TRACE_EVENT(scoutfs_btree_merge_items,
|
||||
TP_PROTO(struct super_block *sb,
|
||||
struct scoutfs_btree_root *m_root,
|
||||
struct scoutfs_key *m_key, int m_val_len,
|
||||
struct scoutfs_btree_root *f_root,
|
||||
struct scoutfs_key *f_key, int f_val_len,
|
||||
int is_del),
|
||||
|
||||
TP_ARGS(sb, m_root, m_key, m_val_len, f_root, f_key, f_val_len, is_del),
|
||||
TP_ARGS(sb, m_key, m_val_len, f_root, f_key, f_val_len, is_del),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
SCSB_TRACE_FIELDS
|
||||
__field(__u64, m_root_blkno)
|
||||
__field(__u64, m_root_seq)
|
||||
__field(__u8, m_root_height)
|
||||
sk_trace_define(m_key)
|
||||
__field(int, m_val_len)
|
||||
__field(__u64, f_root_blkno)
|
||||
@@ -1774,10 +1794,6 @@ TRACE_EVENT(scoutfs_btree_merge_items,
|
||||
|
||||
TP_fast_assign(
|
||||
SCSB_TRACE_ASSIGN(sb);
|
||||
__entry->m_root_blkno = m_root ?
|
||||
le64_to_cpu(m_root->ref.blkno) : 0;
|
||||
__entry->m_root_seq = m_root ? le64_to_cpu(m_root->ref.seq) : 0;
|
||||
__entry->m_root_height = m_root ? m_root->height : 0;
|
||||
sk_trace_assign(m_key, m_key);
|
||||
__entry->m_val_len = m_val_len;
|
||||
__entry->f_root_blkno = f_root ?
|
||||
@@ -1789,11 +1805,9 @@ TRACE_EVENT(scoutfs_btree_merge_items,
|
||||
__entry->is_del = !!is_del;
|
||||
),
|
||||
|
||||
TP_printk(SCSBF" merge item root blkno %llu seq %llu height %u key "SK_FMT" val_len %d, fs item root blkno %llu seq %llu height %u key "SK_FMT" val_len %d, is_del %d",
|
||||
SCSB_TRACE_ARGS, __entry->m_root_blkno, __entry->m_root_seq,
|
||||
__entry->m_root_height, sk_trace_args(m_key),
|
||||
__entry->m_val_len, __entry->f_root_blkno,
|
||||
__entry->f_root_seq, __entry->f_root_height,
|
||||
TP_printk(SCSBF" merge item key "SK_FMT" val_len %d, fs item root blkno %llu seq %llu height %u key "SK_FMT" val_len %d, is_del %d",
|
||||
SCSB_TRACE_ARGS, sk_trace_args(m_key), __entry->m_val_len,
|
||||
__entry->f_root_blkno, __entry->f_root_seq, __entry->f_root_height,
|
||||
sk_trace_args(f_key), __entry->f_val_len, __entry->is_del)
|
||||
);
|
||||
|
||||
@@ -2076,6 +2090,71 @@ TRACE_EVENT(scoutfs_trans_seq_last,
|
||||
SCSB_TRACE_ARGS, __entry->s_rid, __entry->trans_seq)
|
||||
);
|
||||
|
||||
TRACE_EVENT(scoutfs_server_finalize_items,
|
||||
TP_PROTO(struct super_block *sb, u64 rid, u64 item_rid, u64 item_nr, u64 item_flags,
|
||||
u64 item_get_trans_seq),
|
||||
|
||||
TP_ARGS(sb, rid, item_rid, item_nr, item_flags, item_get_trans_seq),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
SCSB_TRACE_FIELDS
|
||||
__field(__u64, c_rid)
|
||||
__field(__u64, item_rid)
|
||||
__field(__u64, item_nr)
|
||||
__field(__u64, item_flags)
|
||||
__field(__u64, item_get_trans_seq)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
SCSB_TRACE_ASSIGN(sb);
|
||||
__entry->c_rid = rid;
|
||||
__entry->item_rid = item_rid;
|
||||
__entry->item_nr = item_nr;
|
||||
__entry->item_flags = item_flags;
|
||||
__entry->item_get_trans_seq = item_get_trans_seq;
|
||||
),
|
||||
|
||||
TP_printk(SCSBF" rid %016llx item_rid %016llx item_nr %llu item_flags 0x%llx item_get_trans_seq %llu",
|
||||
SCSB_TRACE_ARGS, __entry->c_rid, __entry->item_rid, __entry->item_nr,
|
||||
__entry->item_flags, __entry->item_get_trans_seq)
|
||||
);
|
||||
|
||||
TRACE_EVENT(scoutfs_server_finalize_decision,
|
||||
TP_PROTO(struct super_block *sb, u64 rid, bool saw_finalized, bool others_active,
|
||||
bool ours_visible, bool finalize_ours, unsigned int delay_ms,
|
||||
u64 finalize_sent_seq),
|
||||
|
||||
TP_ARGS(sb, rid, saw_finalized, others_active, ours_visible, finalize_ours, delay_ms,
|
||||
finalize_sent_seq),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
SCSB_TRACE_FIELDS
|
||||
__field(__u64, c_rid)
|
||||
__field(bool, saw_finalized)
|
||||
__field(bool, others_active)
|
||||
__field(bool, ours_visible)
|
||||
__field(bool, finalize_ours)
|
||||
__field(unsigned int, delay_ms)
|
||||
__field(__u64, finalize_sent_seq)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
SCSB_TRACE_ASSIGN(sb);
|
||||
__entry->c_rid = rid;
|
||||
__entry->saw_finalized = saw_finalized;
|
||||
__entry->others_active = others_active;
|
||||
__entry->ours_visible = ours_visible;
|
||||
__entry->finalize_ours = finalize_ours;
|
||||
__entry->delay_ms = delay_ms;
|
||||
__entry->finalize_sent_seq = finalize_sent_seq;
|
||||
),
|
||||
|
||||
TP_printk(SCSBF" rid %016llx saw_finalized %u others_active %u ours_visible %u finalize_ours %u delay_ms %u finalize_sent_seq %llu",
|
||||
SCSB_TRACE_ARGS, __entry->c_rid, __entry->saw_finalized, __entry->others_active,
|
||||
__entry->ours_visible, __entry->finalize_ours, __entry->delay_ms,
|
||||
__entry->finalize_sent_seq)
|
||||
);
|
||||
|
||||
TRACE_EVENT(scoutfs_get_log_merge_status,
|
||||
TP_PROTO(struct super_block *sb, u64 rid, struct scoutfs_key *next_range_key,
|
||||
u64 nr_requests, u64 nr_complete, u64 seq),
|
||||
|
||||
@@ -148,6 +148,8 @@ struct server_info {
|
||||
struct scoutfs_quorum_config qconf;
|
||||
/* a running server maintains a private dirty super */
|
||||
struct scoutfs_super_block dirty_super;
|
||||
|
||||
u64 finalize_sent_seq;
|
||||
};
|
||||
|
||||
#define DECLARE_SERVER_INFO(sb, name) \
|
||||
@@ -413,6 +415,27 @@ static void server_hold_commit(struct super_block *sb, struct commit_hold *hold)
|
||||
wait_event(cusers->waitq, hold_commit(sb, server, cusers, hold));
|
||||
}
|
||||
|
||||
/*
|
||||
* Return the higher of the avail or freed used by the active commit
|
||||
* since this holder joined the commit. This is *not* the amount used
|
||||
* by the holder, we don't track per-holder alloc use.
|
||||
*/
|
||||
static u32 server_hold_alloc_used_since(struct super_block *sb, struct commit_hold *hold)
|
||||
{
|
||||
DECLARE_SERVER_INFO(sb, server);
|
||||
u32 avail_used;
|
||||
u32 freed_used;
|
||||
u32 avail_now;
|
||||
u32 freed_now;
|
||||
|
||||
scoutfs_alloc_meta_remaining(&server->alloc, &avail_now, &freed_now);
|
||||
|
||||
avail_used = hold->avail - avail_now;
|
||||
freed_used = hold->freed - freed_now;
|
||||
|
||||
return max(avail_used, freed_used);
|
||||
}
|
||||
|
||||
/*
|
||||
* This is called while holding the commit and returns once the commit
|
||||
* is successfully written. Many holders can all wait for all holders
|
||||
@@ -938,22 +961,24 @@ static int find_log_trees_item(struct super_block *sb,
|
||||
}
|
||||
|
||||
/*
|
||||
* Find the next log_trees item from the key. Fills the caller's log_trees and sets
|
||||
* the key past the returned log_trees for iteration. Returns 0 when done, > 0 for each
|
||||
* item, and -errno on fatal errors.
|
||||
* Find the log_trees item with the greatest nr for each rid. Fills the
|
||||
* caller's log_trees and sets the key before the returned log_trees for
|
||||
* the next iteration. Returns 0 when done, > 0 for each item, and
|
||||
* -errno on fatal errors.
|
||||
*/
|
||||
static int for_each_lt(struct super_block *sb, struct scoutfs_btree_root *root,
|
||||
struct scoutfs_key *key, struct scoutfs_log_trees *lt)
|
||||
static int for_each_rid_last_lt(struct super_block *sb, struct scoutfs_btree_root *root,
|
||||
struct scoutfs_key *key, struct scoutfs_log_trees *lt)
|
||||
{
|
||||
SCOUTFS_BTREE_ITEM_REF(iref);
|
||||
int ret;
|
||||
|
||||
ret = scoutfs_btree_next(sb, root, key, &iref);
|
||||
ret = scoutfs_btree_prev(sb, root, key, &iref);
|
||||
if (ret == 0) {
|
||||
if (iref.val_len == sizeof(struct scoutfs_log_trees)) {
|
||||
memcpy(lt, iref.val, iref.val_len);
|
||||
*key = *iref.key;
|
||||
scoutfs_key_inc(key);
|
||||
key->sklt_nr = 0;
|
||||
scoutfs_key_dec(key);
|
||||
ret = 1;
|
||||
} else {
|
||||
ret = -EIO;
|
||||
@@ -1048,21 +1073,13 @@ static int next_log_merge_item(struct super_block *sb,
|
||||
* abandoned log btree finalized. If it takes too long each client has
|
||||
* a chance to make forward progress before being asked to commit again.
|
||||
*
|
||||
* We're waiting on heavy state that is protected by mutexes and
|
||||
* transaction machinery. It's tricky to recreate that state for
|
||||
* lightweight condition tests that don't change task state. Instead of
|
||||
* trying to get that right, particularly as we unwind after success or
|
||||
* after timeouts, waiters use an unsatisfying poll. Short enough to
|
||||
* not add terrible latency, given how heavy and infrequent this already
|
||||
* is, and long enough to not melt the cpu. This could be tuned if it
|
||||
* becomes a problem.
|
||||
*
|
||||
* This can end up finalizing a new empty log btree if a new mount
|
||||
* happens to arrive at just the right time. That's fine, merging will
|
||||
* ignore and tear down the empty input.
|
||||
*/
|
||||
#define FINALIZE_POLL_MS (11)
|
||||
#define FINALIZE_TIMEOUT_MS (MSEC_PER_SEC / 2)
|
||||
#define FINALIZE_POLL_MIN_DELAY_MS 5U
|
||||
#define FINALIZE_POLL_MAX_DELAY_MS 100U
|
||||
#define FINALIZE_POLL_DELAY_GROWTH_PCT 150U
|
||||
static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_log_trees *lt,
|
||||
u64 rid, struct commit_hold *hold)
|
||||
{
|
||||
@@ -1070,8 +1087,10 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
|
||||
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
|
||||
struct scoutfs_log_merge_status stat;
|
||||
struct scoutfs_log_merge_range rng;
|
||||
struct scoutfs_mount_options opts;
|
||||
struct scoutfs_log_trees each_lt;
|
||||
struct scoutfs_log_trees fin;
|
||||
unsigned int delay_ms;
|
||||
unsigned long timeo;
|
||||
bool saw_finalized;
|
||||
bool others_active;
|
||||
@@ -1079,10 +1098,14 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
|
||||
bool ours_visible;
|
||||
struct scoutfs_key key;
|
||||
char *err_str = NULL;
|
||||
ktime_t start;
|
||||
int ret;
|
||||
int err;
|
||||
|
||||
timeo = jiffies + msecs_to_jiffies(FINALIZE_TIMEOUT_MS);
|
||||
scoutfs_options_read(sb, &opts);
|
||||
timeo = jiffies + msecs_to_jiffies(opts.log_merge_wait_timeout_ms);
|
||||
delay_ms = FINALIZE_POLL_MIN_DELAY_MS;
|
||||
start = ktime_get_raw();
|
||||
|
||||
for (;;) {
|
||||
/* nothing to do if there's already a merge in flight */
|
||||
@@ -1099,8 +1122,13 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
|
||||
saw_finalized = false;
|
||||
others_active = false;
|
||||
ours_visible = false;
|
||||
scoutfs_key_init_log_trees(&key, 0, 0);
|
||||
while ((ret = for_each_lt(sb, &super->logs_root, &key, &each_lt)) > 0) {
|
||||
scoutfs_key_init_log_trees(&key, U64_MAX, U64_MAX);
|
||||
while ((ret = for_each_rid_last_lt(sb, &super->logs_root, &key, &each_lt)) > 0) {
|
||||
|
||||
trace_scoutfs_server_finalize_items(sb, rid, le64_to_cpu(each_lt.rid),
|
||||
le64_to_cpu(each_lt.nr),
|
||||
le64_to_cpu(each_lt.flags),
|
||||
le64_to_cpu(each_lt.get_trans_seq));
|
||||
|
||||
if ((le64_to_cpu(each_lt.flags) & SCOUTFS_LOG_TREES_FINALIZED))
|
||||
saw_finalized = true;
|
||||
@@ -1125,6 +1153,10 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
|
||||
finalize_ours = (lt->item_root.height > 2) ||
|
||||
(le32_to_cpu(lt->meta_avail.flags) & SCOUTFS_ALLOC_FLAG_LOW);
|
||||
|
||||
trace_scoutfs_server_finalize_decision(sb, rid, saw_finalized, others_active,
|
||||
ours_visible, finalize_ours, delay_ms,
|
||||
server->finalize_sent_seq);
|
||||
|
||||
/* done if we're not finalizing and there's no finalized */
|
||||
if (!finalize_ours && !saw_finalized) {
|
||||
ret = 0;
|
||||
@@ -1132,12 +1164,13 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
|
||||
}
|
||||
|
||||
/* send sync requests soon to give time to commit */
|
||||
scoutfs_key_init_log_trees(&key, 0, 0);
|
||||
scoutfs_key_init_log_trees(&key, U64_MAX, U64_MAX);
|
||||
while (others_active &&
|
||||
(ret = for_each_lt(sb, &super->logs_root, &key, &each_lt)) > 0) {
|
||||
(ret = for_each_rid_last_lt(sb, &super->logs_root, &key, &each_lt)) > 0) {
|
||||
|
||||
if ((le64_to_cpu(each_lt.flags) & SCOUTFS_LOG_TREES_FINALIZED) ||
|
||||
(le64_to_cpu(each_lt.rid) == rid))
|
||||
(le64_to_cpu(each_lt.rid) == rid) ||
|
||||
(le64_to_cpu(each_lt.get_trans_seq) <= server->finalize_sent_seq))
|
||||
continue;
|
||||
|
||||
ret = scoutfs_net_submit_request_node(sb, server->conn,
|
||||
@@ -1157,6 +1190,8 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
|
||||
break;
|
||||
}
|
||||
|
||||
server->finalize_sent_seq = scoutfs_server_seq(sb);
|
||||
|
||||
/* Finalize ours if it's visible to others */
|
||||
if (ours_visible) {
|
||||
fin = *lt;
|
||||
@@ -1194,13 +1229,16 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
|
||||
if (ret < 0)
|
||||
err_str = "applying commit before waiting for finalized";
|
||||
|
||||
msleep(FINALIZE_POLL_MS);
|
||||
msleep(delay_ms);
|
||||
delay_ms = min(delay_ms * FINALIZE_POLL_DELAY_GROWTH_PCT / 100,
|
||||
FINALIZE_POLL_MAX_DELAY_MS);
|
||||
|
||||
server_hold_commit(sb, hold);
|
||||
mutex_lock(&server->logs_mutex);
|
||||
|
||||
/* done if we timed out */
|
||||
if (time_after(jiffies, timeo)) {
|
||||
scoutfs_inc_counter(sb, log_merge_wait_timeout);
|
||||
ret = 0;
|
||||
break;
|
||||
}
|
||||
@@ -1783,43 +1821,29 @@ out:
|
||||
* Give the caller the last seq before outstanding client commits. All
|
||||
* seqs up to and including this are stable, new client transactions can
|
||||
* only have greater seqs.
|
||||
*
|
||||
* For each rid, only its greatest log trees nr can be an open commit.
|
||||
* We look at the last log_trees item for each client rid and record its
|
||||
* trans seq if it hasn't been committed.
|
||||
*/
|
||||
static int get_stable_trans_seq(struct super_block *sb, u64 *last_seq_ret)
|
||||
{
|
||||
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
|
||||
DECLARE_SERVER_INFO(sb, server);
|
||||
SCOUTFS_BTREE_ITEM_REF(iref);
|
||||
struct scoutfs_log_trees *lt;
|
||||
struct scoutfs_log_trees lt;
|
||||
struct scoutfs_key key;
|
||||
u64 last_seq = 0;
|
||||
int ret;
|
||||
|
||||
last_seq = scoutfs_server_seq(sb) - 1;
|
||||
scoutfs_key_init_log_trees(&key, 0, 0);
|
||||
|
||||
mutex_lock(&server->logs_mutex);
|
||||
|
||||
for (;; scoutfs_key_inc(&key)) {
|
||||
ret = scoutfs_btree_next(sb, &super->logs_root, &key, &iref);
|
||||
if (ret == 0) {
|
||||
if (iref.val_len == sizeof(*lt)) {
|
||||
lt = iref.val;
|
||||
if ((le64_to_cpu(lt->get_trans_seq) >
|
||||
le64_to_cpu(lt->commit_trans_seq)) &&
|
||||
le64_to_cpu(lt->get_trans_seq) <= last_seq) {
|
||||
last_seq = le64_to_cpu(lt->get_trans_seq) - 1;
|
||||
}
|
||||
key = *iref.key;
|
||||
} else {
|
||||
ret = -EIO;
|
||||
}
|
||||
scoutfs_btree_put_iref(&iref);
|
||||
}
|
||||
if (ret < 0) {
|
||||
if (ret == -ENOENT) {
|
||||
ret = 0;
|
||||
break;
|
||||
}
|
||||
scoutfs_key_init_log_trees(&key, U64_MAX, U64_MAX);
|
||||
while ((ret = for_each_rid_last_lt(sb, &super->logs_root, &key, &lt)) > 0) {
|
||||
if ((le64_to_cpu(lt.get_trans_seq) > le64_to_cpu(lt.commit_trans_seq)) &&
|
||||
le64_to_cpu(lt.get_trans_seq) <= last_seq) {
|
||||
last_seq = le64_to_cpu(lt.get_trans_seq) - 1;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2471,9 +2495,11 @@ static void server_log_merge_free_work(struct work_struct *work)
|
||||
|
||||
while (!server_is_stopping(server)) {
|
||||
|
||||
server_hold_commit(sb, &hold);
|
||||
mutex_lock(&server->logs_mutex);
|
||||
commit = true;
|
||||
if (!commit) {
|
||||
server_hold_commit(sb, &hold);
|
||||
mutex_lock(&server->logs_mutex);
|
||||
commit = true;
|
||||
}
|
||||
|
||||
ret = next_log_merge_item(sb, &super->log_merge,
|
||||
SCOUTFS_LOG_MERGE_FREEING_ZONE,
|
||||
@@ -2520,12 +2546,14 @@ static void server_log_merge_free_work(struct work_struct *work)
|
||||
/* freed blocks are in allocator, we *have* to update fr */
|
||||
BUG_ON(ret < 0);
|
||||
|
||||
mutex_unlock(&server->logs_mutex);
|
||||
ret = server_apply_commit(sb, &hold, ret);
|
||||
commit = false;
|
||||
if (ret < 0) {
|
||||
err_str = "looping commit del/upd freeing item";
|
||||
break;
|
||||
if (server_hold_alloc_used_since(sb, &hold) >= COMMIT_HOLD_ALLOC_BUDGET / 2) {
|
||||
mutex_unlock(&server->logs_mutex);
|
||||
ret = server_apply_commit(sb, &hold, ret);
|
||||
commit = false;
|
||||
if (ret < 0) {
|
||||
err_str = "looping commit del/upd freeing item";
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4298,6 +4326,7 @@ static void scoutfs_server_worker(struct work_struct *work)
|
||||
scoutfs_info(sb, "server starting at "SIN_FMT, SIN_ARG(&sin));
|
||||
|
||||
scoutfs_block_writer_init(sb, &server->wri);
|
||||
server->finalize_sent_seq = 0;
|
||||
|
||||
/* first make sure no other servers are still running */
|
||||
ret = scoutfs_quorum_fence_leaders(sb, &server->qconf, server->term);
|
||||
|
||||
@@ -55,6 +55,19 @@ with initial sparse regions (perhaps by multiple threads writing to
|
||||
different regions) and wasted space isn't an issue (perhaps because the
|
||||
file population contains few small files).
|
||||
.TP
|
||||
.B log_merge_wait_timeout_ms=<number>
|
||||
This option sets the amount of time, in milliseconds, that log merge
|
||||
creation can wait before timing out. This setting is per-mount, only
|
||||
changes the behavior of that mount, and only affects the server when it
|
||||
is running in that mount.
|
||||
.sp
|
||||
This determines how long it may take for mounts to synchronize
|
||||
committing their log trees to create a log merge operation. Setting it
|
||||
too high can create long latencies in the event that a mount takes a
|
||||
long time to commit its log. Setting it too low can result in the
|
||||
creation of excessive numbers of log trees that are never merged. The
|
||||
default is 500 and it cannot be less than 100 or greater than 60000.
|
||||
.TP
|
||||
.B metadev_path=<device>
|
||||
The metadev_path option specifies the path to the block device that
|
||||
contains the filesystem's metadata.
|
||||
|
||||
Reference in New Issue
Block a user