mirror of https://github.com/versity/scoutfs.git
synced 2026-01-08 13:01:23 +00:00

Compare commits: zab/force_ ... zab/check (42 commits)
| Author | SHA1 | Date |
|---|---|---|
| | 6541ccfdd0 | |
| | b56836b395 | |
| | de8395a9cf | |
| | 94c608f281 | |
| | 84c1460e4f | |
| | e407f67fcc | |
| | 13c4b35bed | |
| | 67990a7007 | |
| | ba819be8f9 | |
| | 1b103184ca | |
| | c3890abd7b | |
| | 5ab38bfa48 | |
| | e9ad61b444 | |
| | 91bbf90f71 | |
| | b5630f540d | |
| | 90a4c82363 | |
| | f654fa0fda | |
| | 50168a2d2a | |
| | 3c0616524a | |
| | 8d3e6883c6 | |
| | 8747dae61c | |
| | fffcf4a9bb | |
| | b552406427 | |
| | d812599e6b | |
| | 03ab5cedb6 | |
| | 2b94cd6468 | |
| | 5507ee5351 | |
| | 1600a121d9 | |
| | 6daf24ff37 | |
| | cd5d9ff3e0 | |
| | d94e49eb63 | |
| | 1dbe408539 | |
| | bf21699ad7 | |
| | c7c67a173d | |
| | 0d10189f58 | |
| | 6b88f3268e | |
| | 4b2afa61b8 | |
| | 222ba2cede | |
| | c7e97eeb1f | |
| | 21c070b42d | |
| | 77fbf92968 | |
| | d5c699c3b4 | |
@@ -1,6 +1,32 @@
Versity ScoutFS Release Notes
=============================

---
v1.19 \
*Jan 30, 2024*

Added the log_merge_wait_timeout_ms mount option to set the timeout
for creating log merge operations. The previous timeout, now the
default, was too short for some systems and was resulting in consistent
timeouts which created an excessive number of log trees waiting to be
merged.

Improved performance of many in-mount server operations when there are a
large number of log trees waiting to be merged.

---
v1.18 \
*Nov 7, 2023*

Fixed a bug where background srch file compaction could stop making
forward progress if a partial compaction operation was committed at a
specific byte offset in a block. This would cause srch file searches to
be progressively more expensive over time. Once this fix is running
background compaction will resume, bringing the cost of searches back
down.

---
v1.17 \
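As the option-parsing changes below show, the new timeout is given in milliseconds on the mount command line (for example `-o log_merge_wait_timeout_ms=2000`), must fall between 100 and 60000 ms, and defaults to 500 ms; it is also exposed as a read-write sysfs attribute alongside the other option attributes, so it can be adjusted on a mounted filesystem. The example value here is illustrative only.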
439 kmod/src/btree.c
@@ -2029,187 +2029,253 @@ int scoutfs_btree_rebalance(struct super_block *sb,
key, SCOUTFS_BTREE_MAX_VAL_LEN, NULL, NULL, NULL);
}

struct merge_pos {
struct merged_range {
struct scoutfs_key start;
struct scoutfs_key end;
struct rb_root root;
int size;
};

struct merged_item {
struct rb_node node;
struct scoutfs_btree_root *root;
struct scoutfs_block *bl;
struct scoutfs_btree_block *bt;
struct scoutfs_avl_node *avl;
struct scoutfs_key *key;
struct scoutfs_key key;
u64 seq;
u8 flags;
unsigned int val_len;
u8 *val;
u8 val[0];
};

static struct merge_pos *first_mpos(struct rb_root *root)
static inline struct merged_item *mitem_container(struct rb_node *node)
{
struct rb_node *node = rb_first(root);
if (node)
return container_of(node, struct merge_pos, node);
return node ? container_of(node, struct merged_item, node) : NULL;
}

static inline struct merged_item *first_mitem(struct rb_root *root)
{
return mitem_container(rb_first(root));
}

static inline struct merged_item *last_mitem(struct rb_root *root)
{
return mitem_container(rb_last(root));
}

static inline struct merged_item *next_mitem(struct merged_item *mitem)
{
return mitem_container(mitem ? rb_next(&mitem->node) : NULL);
}

static inline struct merged_item *prev_mitem(struct merged_item *mitem)
{
return mitem_container(mitem ? rb_prev(&mitem->node) : NULL);
}

static struct merged_item *find_mitem(struct rb_root *root, struct scoutfs_key *key,
struct rb_node **parent_ret, struct rb_node ***link_ret)
{
struct rb_node **node = &root->rb_node;
struct rb_node *parent = NULL;
struct merged_item *mitem;
int cmp;

while (*node) {
parent = *node;
mitem = container_of(*node, struct merged_item, node);

cmp = scoutfs_key_compare(key, &mitem->key);

if (cmp < 0) {
node = &(*node)->rb_left;
} else if (cmp > 0) {
node = &(*node)->rb_right;
} else {
*parent_ret = NULL;
*link_ret = NULL;
return mitem;
}
}

*parent_ret = parent;
*link_ret = node;
return NULL;
}

static struct merge_pos *next_mpos(struct merge_pos *mpos)
static void insert_mitem(struct merged_range *rng, struct merged_item *mitem,
struct rb_node *parent, struct rb_node **link)
{
struct rb_node *node;

if (mpos && (node = rb_next(&mpos->node)))
return container_of(node, struct merge_pos, node);
else
return NULL;
rb_link_node(&mitem->node, parent, link);
rb_insert_color(&mitem->node, &rng->root);
rng->size += item_len_bytes(mitem->val_len);
}

static void free_mpos(struct super_block *sb, struct merge_pos *mpos)
static void replace_mitem(struct merged_range *rng, struct merged_item *victim,
struct merged_item *new)
{
scoutfs_block_put(sb, mpos->bl);
kfree(mpos);
rb_replace_node(&victim->node, &new->node, &rng->root);
RB_CLEAR_NODE(&victim->node);
rng->size -= item_len_bytes(victim->val_len);
rng->size += item_len_bytes(new->val_len);
}

static void insert_mpos(struct rb_root *pos_root, struct merge_pos *ins)
static void free_mitem(struct merged_range *rng, struct merged_item *mitem)
{
struct rb_node **node = &pos_root->rb_node;
struct rb_node *parent = NULL;
struct merge_pos *mpos;
int cmp;
if (IS_ERR_OR_NULL(mitem))
return;

parent = NULL;
while (*node) {
parent = *node;
mpos = container_of(*node, struct merge_pos, node);

/* sort merge items by key then newest to oldest */
cmp = scoutfs_key_compare(ins->key, mpos->key) ?:
-scoutfs_cmp(ins->seq, mpos->seq);

if (cmp < 0)
node = &(*node)->rb_left;
else
node = &(*node)->rb_right;
if (!RB_EMPTY_NODE(&mitem->node)) {
rng->size -= item_len_bytes(mitem->val_len);
rb_erase(&mitem->node, &rng->root);
}

rb_link_node(&ins->node, parent, node);
rb_insert_color(&ins->node, pos_root);
kfree(mitem);
}

static void trim_range_size(struct merged_range *rng, int merge_window)
{
struct merged_item *mitem;
struct merged_item *tmp;

mitem = last_mitem(&rng->root);
while (mitem && rng->size > merge_window) {

rng->end = mitem->key;
scoutfs_key_dec(&rng->end);

tmp = mitem;
mitem = prev_mitem(mitem);
free_mitem(rng, tmp);
}
}

static void trim_range_end(struct merged_range *rng)
{
struct merged_item *mitem;
struct merged_item *tmp;

mitem = last_mitem(&rng->root);
while (mitem && scoutfs_key_compare(&mitem->key, &rng->end) > 0) {
tmp = mitem;
mitem = prev_mitem(mitem);
free_mitem(rng, tmp);
}
}

/*
* Find the next item in the merge_pos root in the caller's range and
* insert it into the rbtree sorted by key and version so that merging
* can find the next newest item at the front of the rbtree. We free
* the mpos on error or if there are no more items in the range.
* Record and combine logged items from log roots for merging with the
* writable destination root. The caller is responsible for trimming
* the range if it gets too large or if the key range shrinks.
*/
static int reset_mpos(struct super_block *sb, struct rb_root *pos_root, struct merge_pos *mpos,
struct scoutfs_key *start, struct scoutfs_key *end)
static int merge_read_item(struct super_block *sb, struct scoutfs_key *key, u64 seq, u8 flags,
void *val, int val_len, void *arg)
{
struct scoutfs_btree_item *item;
struct scoutfs_avl_node *next;
struct btree_walk_key_range kr;
struct scoutfs_key walk_key;
int ret = 0;
struct merged_range *rng = arg;
struct merged_item *mitem;
struct merged_item *found;
struct rb_node *parent;
struct rb_node **link;
int ret;

/* always erase before freeing or inserting */
if (!RB_EMPTY_NODE(&mpos->node)) {
rb_erase(&mpos->node, pos_root);
RB_CLEAR_NODE(&mpos->node);
}

/*
* advance to next item via the avl tree. The caller's pos is
* only ever incremented past the last key so we can use next to
* iterate rather than using search to skip past multiple items.
*/
if (mpos->avl)
mpos->avl = scoutfs_avl_next(&mpos->bt->item_root, mpos->avl);

/* find the next leaf with the key if we run out of items */
walk_key = *start;
while (!mpos->avl && !scoutfs_key_is_zeros(&walk_key)) {
scoutfs_block_put(sb, mpos->bl);
mpos->bl = NULL;
ret = btree_walk(sb, NULL, NULL, mpos->root, BTW_NEXT, &walk_key,
0, &mpos->bl, &kr, NULL);
if (ret < 0) {
if (ret == -ENOENT)
ret = 0;
free_mpos(sb, mpos);
found = find_mitem(&rng->root, key, &parent, &link);
if (found) {
ret = scoutfs_forest_combine_deltas(key, found->val, found->val_len, val, val_len);
if (ret < 0)
goto out;
if (ret > 0) {
if (ret == SCOUTFS_DELTA_COMBINED) {
scoutfs_inc_counter(sb, btree_merge_delta_combined);
} else if (ret == SCOUTFS_DELTA_COMBINED_NULL) {
scoutfs_inc_counter(sb, btree_merge_delta_null);
free_mitem(rng, found);
}
ret = 0;
goto out;
}
mpos->bt = mpos->bl->data;

mpos->avl = scoutfs_avl_search(&mpos->bt->item_root, cmp_key_item,
start, NULL, NULL, &next, NULL) ?: next;
if (mpos->avl == NULL)
walk_key = kr.iter_next;
if (found->seq >= seq) {
ret = 0;
goto out;
}
}

/* see if we're out of items within the range */
item = node_item(mpos->avl);
if (!item || scoutfs_key_compare(item_key(item), end) > 0) {
free_mpos(sb, mpos);
ret = 0;
mitem = kmalloc(offsetof(struct merged_item, val[val_len]), GFP_NOFS);
if (!mitem) {
ret = -ENOMEM;
goto out;
}

/* insert the next item within range at its version */
mpos->key = item_key(item);
mpos->seq = le64_to_cpu(item->seq);
mpos->flags = item->flags;
mpos->val_len = item_val_len(item);
mpos->val = item_val(mpos->bt, item);
mitem->key = *key;
mitem->seq = seq;
mitem->flags = flags;
mitem->val_len = val_len;
if (val_len)
memcpy(mitem->val, val, val_len);

if (found) {
replace_mitem(rng, found, mitem);
free_mitem(rng, found);
} else {
insert_mitem(rng, mitem, parent, link);
}

insert_mpos(pos_root, mpos);
ret = 0;
out:
return ret;
}

/*
* The caller has reset all the merge positions for all the input log
* btree roots and wants the next logged item it should try and merge
* with the items in the fs_root.
* Read a range of merged items. The caller has set the key bounds of
* the range. We read a merge window's worth of items from blocks in
* each input btree.
*
* We look ahead in the logged item stream to see if we should merge any
* older logged delta items into one result for the caller. We also
* take this opportunity to skip and reset the mpos for any older
* versions of the first item.
* The caller can only use the smallest range that overlaps with all the
* blocks that we read. We start reading from the range's start key so
* it will always be present and we don't need to adjust it. The final
* block we read from each input might not cover the range's end so it
* needs to be adjusted.
*
* The end range can also shrink if we have to drop items because the
* items exceeded the merge window size.
*/
static int next_resolved_mpos(struct super_block *sb, struct rb_root *pos_root,
struct scoutfs_key *end, struct merge_pos **mpos_ret)
static int read_merged_range(struct super_block *sb, struct merged_range *rng,
struct list_head *inputs, int merge_window)
{
struct merge_pos *mpos;
struct merge_pos *next;
struct scoutfs_btree_root_head *rhead;
struct scoutfs_key start;
struct scoutfs_key end;
struct scoutfs_key key;
int ret = 0;
int i;

while ((mpos = first_mpos(pos_root)) && (next = next_mpos(mpos)) &&
!scoutfs_key_compare(mpos->key, next->key)) {
list_for_each_entry(rhead, inputs, head) {
key = rng->start;

ret = scoutfs_forest_combine_deltas(mpos->key, mpos->val, mpos->val_len,
next->val, next->val_len);
if (ret < 0)
break;

/* reset advances to the next item */
key = *mpos->key;
scoutfs_key_inc(&key);

/* always skip next combined or older version */
ret = reset_mpos(sb, pos_root, next, &key, end);
if (ret < 0)
break;

if (ret == SCOUTFS_DELTA_COMBINED) {
scoutfs_inc_counter(sb, btree_merge_delta_combined);
} else if (ret == SCOUTFS_DELTA_COMBINED_NULL) {
scoutfs_inc_counter(sb, btree_merge_delta_null);
/* if merging resulted in no info, skip current */
ret = reset_mpos(sb, pos_root, mpos, &key, end);
for (i = 0; i < merge_window; i += SCOUTFS_BLOCK_LG_SIZE) {
start = key;
end = rng->end;
ret = scoutfs_btree_read_items(sb, &rhead->root, &key, &start, &end,
merge_read_item, rng);
if (ret < 0)
goto out;

if (scoutfs_key_compare(&end, &rng->end) >= 0)
break;

key = end;
scoutfs_key_inc(&key);
}

if (scoutfs_key_compare(&end, &rng->end) < 0) {
rng->end = end;
trim_range_end(rng);
}

if (rng->size > merge_window)
trim_range_size(rng, merge_window);
}

*mpos_ret = mpos;
trace_scoutfs_btree_merge_read_range(sb, &rng->start, &rng->end, rng->size);
ret = 0;
out:
return ret;
}

@@ -2226,6 +2292,13 @@ static int next_resolved_mpos(struct super_block *sb, struct rb_root *pos_root,
* to allocators running low or needing to join/split the parent.
* *next_ret is set to the next key which hasn't been merged so that the
* caller can retry with a new allocator and subtree.
*
* The number of input roots can be immense. The merge_window specifies
* the size of the set of merged items that we'll maintain as we iterate
* over all the input roots. Once we've merged items into the window
* from all the input roots the merged input items are then merged to
* the writable destination root. It may take multiple passes of
* windows of merged items to cover the input key range.
*/
int scoutfs_btree_merge(struct super_block *sb,
struct scoutfs_alloc *alloc,
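The windowed merge described in the comment above can be pictured with a small stand-alone sketch: many sorted inputs are drained through a bounded in-memory window, and the window is refilled pass after pass until the whole range is covered. Everything below (the names, the WINDOW size, the integer inputs) is invented for illustration; it is not the kernel code, only the shape of the technique.

```c
/*
 * Illustrative only: a bounded-window k-way merge over sorted int arrays,
 * mirroring the idea of read_merged_range()/scoutfs_btree_merge() above.
 */
#include <stdio.h>
#include <limits.h>

#define WINDOW 4	/* max merged items held in memory per pass */

struct input {
	const int *vals;
	int nr;
	int pos;
};

/* fill win[] with up to WINDOW smallest unconsumed values greater than last */
static int read_window(struct input *ins, int nr_ins, int last, int *win)
{
	int count = 0;

	while (count < WINDOW) {
		int best = INT_MAX;
		int best_i = -1;
		int i;

		for (i = 0; i < nr_ins; i++) {
			if (ins[i].pos < ins[i].nr &&
			    ins[i].vals[ins[i].pos] > last &&
			    ins[i].vals[ins[i].pos] < best) {
				best = ins[i].vals[ins[i].pos];
				best_i = i;
			}
		}
		if (best_i < 0)
			break;
		ins[best_i].pos++;
		win[count++] = best;
	}
	return count;
}

int main(void)
{
	const int a[] = { 1, 4, 9, 12 };
	const int b[] = { 2, 3, 10, 15 };
	const int c[] = { 5, 6, 7, 8 };
	struct input ins[] = {
		{ a, 4, 0 }, { b, 4, 0 }, { c, 4, 0 },
	};
	int win[WINDOW];
	int last = INT_MIN;
	int nr;

	/* multiple passes of bounded windows cover the whole range */
	while ((nr = read_window(ins, 3, last, win)) > 0) {
		int i;

		for (i = 0; i < nr; i++)
			printf("%d ", win[i]);	/* "merge into destination" */
		last = win[nr - 1];
	}
	printf("\n");
	return 0;
}
```

Running it prints the fully merged sequence even though at most WINDOW values are ever held at once, which is the same trade the kernel code makes between memory and the number of passes over the inputs.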
@@ -2235,18 +2308,16 @@ int scoutfs_btree_merge(struct super_block *sb,
struct scoutfs_key *next_ret,
struct scoutfs_btree_root *root,
struct list_head *inputs,
bool subtree, int dirty_limit, int alloc_low)
bool subtree, int dirty_limit, int alloc_low, int merge_window)
{
struct scoutfs_btree_root_head *rhead;
struct rb_root pos_root = RB_ROOT;
struct scoutfs_btree_item *item;
struct scoutfs_btree_block *bt;
struct scoutfs_block *bl = NULL;
struct btree_walk_key_range kr;
struct scoutfs_avl_node *par;
struct scoutfs_key next;
struct merge_pos *mpos;
struct merge_pos *tmp;
struct merged_item *mitem;
struct merged_item *tmp;
struct merged_range rng;
int walk_val_len;
int walk_flags;
bool is_del;
@@ -2257,49 +2328,59 @@ int scoutfs_btree_merge(struct super_block *sb,
trace_scoutfs_btree_merge(sb, root, start, end);
scoutfs_inc_counter(sb, btree_merge);

list_for_each_entry(rhead, inputs, head) {
mpos = kzalloc(sizeof(*mpos), GFP_NOFS);
if (!mpos) {
ret = -ENOMEM;
goto out;
}

RB_CLEAR_NODE(&mpos->node);
mpos->root = &rhead->root;

ret = reset_mpos(sb, &pos_root, mpos, start, end);
if (ret < 0)
goto out;
}

walk_flags = BTW_DIRTY;
if (subtree)
walk_flags |= BTW_SUBTREE;
walk_val_len = 0;

while ((ret = next_resolved_mpos(sb, &pos_root, end, &mpos)) == 0 && mpos) {
rng.start = *start;
rng.end = *end;
rng.root = RB_ROOT;
rng.size = 0;

ret = read_merged_range(sb, &rng, inputs, merge_window);
if (ret < 0)
goto out;

for (;;) {
/* read next window as it empties (and it is possible to read an empty range) */
mitem = first_mitem(&rng.root);
if (!mitem) {
/* done if the read range hit the end */
if (scoutfs_key_compare(&rng.end, end) >= 0)
break;

/* read next batch of merged items */
rng.start = rng.end;
scoutfs_key_inc(&rng.start);
rng.end = *end;
ret = read_merged_range(sb, &rng, inputs, merge_window);
if (ret < 0)
break;
continue;
}

if (scoutfs_block_writer_dirty_bytes(sb, wri) >= dirty_limit) {
scoutfs_inc_counter(sb, btree_merge_dirty_limit);
ret = -ERANGE;
*next_ret = *mpos->key;
*next_ret = mitem->key;
goto out;
}

if (scoutfs_alloc_meta_low(sb, alloc, alloc_low)) {
scoutfs_inc_counter(sb, btree_merge_alloc_low);
ret = -ERANGE;
*next_ret = *mpos->key;
*next_ret = mitem->key;
goto out;
}

scoutfs_block_put(sb, bl);
bl = NULL;
ret = btree_walk(sb, alloc, wri, root, walk_flags,
mpos->key, walk_val_len, &bl, &kr, NULL);
&mitem->key, walk_val_len, &bl, &kr, NULL);
if (ret < 0) {
if (ret == -ERANGE)
*next_ret = *mpos->key;
*next_ret = mitem->key;
goto out;
}
bt = bl->data;
@@ -2311,22 +2392,21 @@ int scoutfs_btree_merge(struct super_block *sb,
continue;
}

while ((ret = next_resolved_mpos(sb, &pos_root, end, &mpos)) == 0 && mpos) {

while (mitem) {
/* walk to new leaf if we exceed parent ref key */
if (scoutfs_key_compare(mpos->key, &kr.end) > 0)
if (scoutfs_key_compare(&mitem->key, &kr.end) > 0)
break;

/* see if there's an existing item */
item = leaf_item_hash_search(sb, bt, mpos->key);
is_del = !!(mpos->flags & SCOUTFS_ITEM_FLAG_DELETION);
item = leaf_item_hash_search(sb, bt, &mitem->key);
is_del = !!(mitem->flags & SCOUTFS_ITEM_FLAG_DELETION);

/* see if we're merging delta items */
if (item && !is_del)
delta = scoutfs_forest_combine_deltas(mpos->key,
delta = scoutfs_forest_combine_deltas(&mitem->key,
item_val(bt, item),
item_val_len(item),
mpos->val, mpos->val_len);
mitem->val, mitem->val_len);
else
delta = 0;
if (delta < 0) {
@@ -2338,40 +2418,38 @@ int scoutfs_btree_merge(struct super_block *sb,
scoutfs_inc_counter(sb, btree_merge_delta_null);
}

trace_scoutfs_btree_merge_items(sb, mpos->root,
mpos->key, mpos->val_len,
trace_scoutfs_btree_merge_items(sb, &mitem->key, mitem->val_len,
item ? root : NULL,
item ? item_key(item) : NULL,
item ? item_val_len(item) : 0, is_del);

/* rewalk and split if ins/update needs room */
if (!is_del && !delta && !mid_free_item_room(bt, mpos->val_len)) {
if (!is_del && !delta && !mid_free_item_room(bt, mitem->val_len)) {
walk_flags |= BTW_INSERT;
walk_val_len = mpos->val_len;
walk_val_len = mitem->val_len;
break;
}

/* insert missing non-deletion merge items */
if (!item && !is_del) {
scoutfs_avl_search(&bt->item_root,
cmp_key_item, mpos->key,
scoutfs_avl_search(&bt->item_root, cmp_key_item, &mitem->key,
&cmp, &par, NULL, NULL);
create_item(bt, mpos->key, mpos->seq, mpos->flags,
mpos->val, mpos->val_len, par, cmp);
create_item(bt, &mitem->key, mitem->seq, mitem->flags,
mitem->val, mitem->val_len, par, cmp);
scoutfs_inc_counter(sb, btree_merge_insert);
}

/* update existing items */
if (item && !is_del && !delta) {
item->seq = cpu_to_le64(mpos->seq);
item->flags = mpos->flags;
update_item_value(bt, item, mpos->val, mpos->val_len);
item->seq = cpu_to_le64(mitem->seq);
item->flags = mitem->flags;
update_item_value(bt, item, mitem->val, mitem->val_len);
scoutfs_inc_counter(sb, btree_merge_update);
}

/* update combined delta item seq */
if (delta == SCOUTFS_DELTA_COMBINED) {
item->seq = cpu_to_le64(mpos->seq);
item->seq = cpu_to_le64(mitem->seq);
}

/*
@@ -2403,21 +2481,18 @@ int scoutfs_btree_merge(struct super_block *sb,
walk_flags &= ~(BTW_INSERT | BTW_DELETE);
walk_val_len = 0;

/* finished with this key, skip any older items */
next = *mpos->key;
scoutfs_key_inc(&next);
ret = reset_mpos(sb, &pos_root, mpos, &next, end);
if (ret < 0)
goto out;
/* finished with this merged item */
tmp = mitem;
mitem = next_mitem(mitem);
free_mitem(&rng, tmp);
}
}

ret = 0;
out:
scoutfs_block_put(sb, bl);
rbtree_postorder_for_each_entry_safe(mpos, tmp, &pos_root, node) {
free_mpos(sb, mpos);
}
rbtree_postorder_for_each_entry_safe(mitem, tmp, &rng.root, node)
free_mitem(&rng, mitem);

return ret;
}
@@ -119,7 +119,7 @@ int scoutfs_btree_merge(struct super_block *sb,
struct scoutfs_key *next_ret,
struct scoutfs_btree_root *root,
struct list_head *input_list,
bool subtree, int dirty_limit, int alloc_low);
bool subtree, int dirty_limit, int alloc_low, int merge_window);

int scoutfs_btree_free_blocks(struct super_block *sb,
struct scoutfs_alloc *alloc,

@@ -145,6 +145,7 @@
EXPAND_COUNTER(lock_shrink_work) \
EXPAND_COUNTER(lock_unlock) \
EXPAND_COUNTER(lock_wait) \
EXPAND_COUNTER(log_merge_wait_timeout) \
EXPAND_COUNTER(net_dropped_response) \
EXPAND_COUNTER(net_send_bytes) \
EXPAND_COUNTER(net_send_error) \

@@ -721,7 +721,8 @@ static void scoutfs_forest_log_merge_worker(struct work_struct *work)
ret = scoutfs_btree_merge(sb, &alloc, &wri, &req.start, &req.end,
&next, &comp.root, &inputs,
!!(req.flags & cpu_to_le64(SCOUTFS_LOG_MERGE_REQUEST_SUBTREE)),
SCOUTFS_LOG_MERGE_DIRTY_BYTE_LIMIT, 10);
SCOUTFS_LOG_MERGE_DIRTY_BYTE_LIMIT, 10,
(2 * 1024 * 1024));
if (ret == -ERANGE) {
comp.remain = next;
le64_add_cpu(&comp.flags, SCOUTFS_LOG_MERGE_COMP_REMAIN);

@@ -33,6 +33,7 @@ enum {
Opt_acl,
Opt_data_prealloc_blocks,
Opt_data_prealloc_contig_only,
Opt_log_merge_wait_timeout_ms,
Opt_metadev_path,
Opt_noacl,
Opt_orphan_scan_delay_ms,
@@ -45,6 +46,7 @@ static const match_table_t tokens = {
{Opt_acl, "acl"},
{Opt_data_prealloc_blocks, "data_prealloc_blocks=%s"},
{Opt_data_prealloc_contig_only, "data_prealloc_contig_only=%s"},
{Opt_log_merge_wait_timeout_ms, "log_merge_wait_timeout_ms=%s"},
{Opt_metadev_path, "metadev_path=%s"},
{Opt_noacl, "noacl"},
{Opt_orphan_scan_delay_ms, "orphan_scan_delay_ms=%s"},
@@ -113,6 +115,10 @@ static void free_options(struct scoutfs_mount_options *opts)
kfree(opts->metadev_path);
}

#define MIN_LOG_MERGE_WAIT_TIMEOUT_MS 100UL
#define DEFAULT_LOG_MERGE_WAIT_TIMEOUT_MS 500
#define MAX_LOG_MERGE_WAIT_TIMEOUT_MS (60 * MSEC_PER_SEC)

#define MIN_ORPHAN_SCAN_DELAY_MS 100UL
#define DEFAULT_ORPHAN_SCAN_DELAY_MS (10 * MSEC_PER_SEC)
#define MAX_ORPHAN_SCAN_DELAY_MS (60 * MSEC_PER_SEC)
@@ -126,11 +132,27 @@ static void init_default_options(struct scoutfs_mount_options *opts)

opts->data_prealloc_blocks = SCOUTFS_DATA_PREALLOC_DEFAULT_BLOCKS;
opts->data_prealloc_contig_only = 1;
opts->log_merge_wait_timeout_ms = DEFAULT_LOG_MERGE_WAIT_TIMEOUT_MS;
opts->orphan_scan_delay_ms = -1;
opts->quorum_heartbeat_timeout_ms = SCOUTFS_QUORUM_DEF_HB_TIMEO_MS;
opts->quorum_slot_nr = -1;
}

static int verify_log_merge_wait_timeout_ms(struct super_block *sb, int ret, int val)
{
if (ret < 0) {
scoutfs_err(sb, "failed to parse log_merge_wait_timeout_ms value");
return -EINVAL;
}
if (val < MIN_LOG_MERGE_WAIT_TIMEOUT_MS || val > MAX_LOG_MERGE_WAIT_TIMEOUT_MS) {
scoutfs_err(sb, "invalid log_merge_wait_timeout_ms value %d, must be between %lu and %lu",
val, MIN_LOG_MERGE_WAIT_TIMEOUT_MS, MAX_LOG_MERGE_WAIT_TIMEOUT_MS);
return -EINVAL;
}

return 0;
}

static int verify_quorum_heartbeat_timeout_ms(struct super_block *sb, int ret, u64 val)
{
if (ret < 0) {
@@ -196,6 +218,14 @@ static int parse_options(struct super_block *sb, char *options, struct scoutfs_m
opts->data_prealloc_contig_only = nr;
break;

case Opt_log_merge_wait_timeout_ms:
ret = match_int(args, &nr);
ret = verify_log_merge_wait_timeout_ms(sb, ret, nr);
if (ret < 0)
return ret;
opts->log_merge_wait_timeout_ms = nr;
break;

case Opt_metadev_path:
ret = parse_bdev_path(sb, &args[0], &opts->metadev_path);
if (ret < 0)
@@ -422,6 +452,43 @@ static ssize_t data_prealloc_contig_only_store(struct kobject *kobj, struct kobj
}
SCOUTFS_ATTR_RW(data_prealloc_contig_only);

static ssize_t log_merge_wait_timeout_ms_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
struct scoutfs_mount_options opts;

scoutfs_options_read(sb, &opts);

return snprintf(buf, PAGE_SIZE, "%u", opts.log_merge_wait_timeout_ms);
}
static ssize_t log_merge_wait_timeout_ms_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t count)
{
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
DECLARE_OPTIONS_INFO(sb, optinf);
char nullterm[30]; /* more than enough for octal -U64_MAX */
int val;
int len;
int ret;

len = min(count, sizeof(nullterm) - 1);
memcpy(nullterm, buf, len);
nullterm[len] = '\0';

ret = kstrtoint(nullterm, 0, &val);
ret = verify_log_merge_wait_timeout_ms(sb, ret, val);
if (ret == 0) {
write_seqlock(&optinf->seqlock);
optinf->opts.log_merge_wait_timeout_ms = val;
write_sequnlock(&optinf->seqlock);
ret = count;
}

return ret;
}
SCOUTFS_ATTR_RW(log_merge_wait_timeout_ms);

static ssize_t metadev_path_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
@@ -525,6 +592,7 @@ SCOUTFS_ATTR_RO(quorum_slot_nr);
static struct attribute *options_attrs[] = {
SCOUTFS_ATTR_PTR(data_prealloc_blocks),
SCOUTFS_ATTR_PTR(data_prealloc_contig_only),
SCOUTFS_ATTR_PTR(log_merge_wait_timeout_ms),
SCOUTFS_ATTR_PTR(metadev_path),
SCOUTFS_ATTR_PTR(orphan_scan_delay_ms),
SCOUTFS_ATTR_PTR(quorum_heartbeat_timeout_ms),

@@ -8,6 +8,7 @@
struct scoutfs_mount_options {
u64 data_prealloc_blocks;
bool data_prealloc_contig_only;
unsigned int log_merge_wait_timeout_ms;
char *metadev_path;
unsigned int orphan_scan_delay_ms;
int quorum_slot_nr;
@@ -439,6 +439,7 @@ DECLARE_EVENT_CLASS(scoutfs_trans_hold_release_class,
SCSB_TRACE_ASSIGN(sb);
__entry->journal_info = (unsigned long)journal_info;
__entry->holders = holders;
__entry->ret = ret;
),

TP_printk(SCSBF" journal_info 0x%0lx holders %d ret %d",
@@ -1746,21 +1747,41 @@ TRACE_EVENT(scoutfs_btree_merge,
sk_trace_args(end))
);

TRACE_EVENT(scoutfs_btree_merge_read_range,
TP_PROTO(struct super_block *sb, struct scoutfs_key *start, struct scoutfs_key *end,
int size),

TP_ARGS(sb, start, end, size),

TP_STRUCT__entry(
SCSB_TRACE_FIELDS
sk_trace_define(start)
sk_trace_define(end)
__field(int, size)
),

TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
sk_trace_assign(start, start);
sk_trace_assign(end, end);
__entry->size = size;
),

TP_printk(SCSBF" start "SK_FMT" end "SK_FMT" size %d",
SCSB_TRACE_ARGS, sk_trace_args(start), sk_trace_args(end), __entry->size)
);

TRACE_EVENT(scoutfs_btree_merge_items,
TP_PROTO(struct super_block *sb,
struct scoutfs_btree_root *m_root,
struct scoutfs_key *m_key, int m_val_len,
struct scoutfs_btree_root *f_root,
struct scoutfs_key *f_key, int f_val_len,
int is_del),

TP_ARGS(sb, m_root, m_key, m_val_len, f_root, f_key, f_val_len, is_del),
TP_ARGS(sb, m_key, m_val_len, f_root, f_key, f_val_len, is_del),

TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, m_root_blkno)
__field(__u64, m_root_seq)
__field(__u8, m_root_height)
sk_trace_define(m_key)
__field(int, m_val_len)
__field(__u64, f_root_blkno)
@@ -1773,10 +1794,6 @@ TRACE_EVENT(scoutfs_btree_merge_items,

TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->m_root_blkno = m_root ?
le64_to_cpu(m_root->ref.blkno) : 0;
__entry->m_root_seq = m_root ? le64_to_cpu(m_root->ref.seq) : 0;
__entry->m_root_height = m_root ? m_root->height : 0;
sk_trace_assign(m_key, m_key);
__entry->m_val_len = m_val_len;
__entry->f_root_blkno = f_root ?
@@ -1788,11 +1805,9 @@ TRACE_EVENT(scoutfs_btree_merge_items,
__entry->is_del = !!is_del;
),

TP_printk(SCSBF" merge item root blkno %llu seq %llu height %u key "SK_FMT" val_len %d, fs item root blkno %llu seq %llu height %u key "SK_FMT" val_len %d, is_del %d",
SCSB_TRACE_ARGS, __entry->m_root_blkno, __entry->m_root_seq,
__entry->m_root_height, sk_trace_args(m_key),
__entry->m_val_len, __entry->f_root_blkno,
__entry->f_root_seq, __entry->f_root_height,
TP_printk(SCSBF" merge item key "SK_FMT" val_len %d, fs item root blkno %llu seq %llu height %u key "SK_FMT" val_len %d, is_del %d",
SCSB_TRACE_ARGS, sk_trace_args(m_key), __entry->m_val_len,
__entry->f_root_blkno, __entry->f_root_seq, __entry->f_root_height,
sk_trace_args(f_key), __entry->f_val_len, __entry->is_del)
);

@@ -2075,6 +2090,71 @@ TRACE_EVENT(scoutfs_trans_seq_last,
SCSB_TRACE_ARGS, __entry->s_rid, __entry->trans_seq)
);

TRACE_EVENT(scoutfs_server_finalize_items,
TP_PROTO(struct super_block *sb, u64 rid, u64 item_rid, u64 item_nr, u64 item_flags,
u64 item_get_trans_seq),

TP_ARGS(sb, rid, item_rid, item_nr, item_flags, item_get_trans_seq),

TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, c_rid)
__field(__u64, item_rid)
__field(__u64, item_nr)
__field(__u64, item_flags)
__field(__u64, item_get_trans_seq)
),

TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->c_rid = rid;
__entry->item_rid = item_rid;
__entry->item_nr = item_nr;
__entry->item_flags = item_flags;
__entry->item_get_trans_seq = item_get_trans_seq;
),

TP_printk(SCSBF" rid %016llx item_rid %016llx item_nr %llu item_flags 0x%llx item_get_trans_seq %llu",
SCSB_TRACE_ARGS, __entry->c_rid, __entry->item_rid, __entry->item_nr,
__entry->item_flags, __entry->item_get_trans_seq)
);

TRACE_EVENT(scoutfs_server_finalize_decision,
TP_PROTO(struct super_block *sb, u64 rid, bool saw_finalized, bool others_active,
bool ours_visible, bool finalize_ours, unsigned int delay_ms,
u64 finalize_sent_seq),

TP_ARGS(sb, rid, saw_finalized, others_active, ours_visible, finalize_ours, delay_ms,
finalize_sent_seq),

TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, c_rid)
__field(bool, saw_finalized)
__field(bool, others_active)
__field(bool, ours_visible)
__field(bool, finalize_ours)
__field(unsigned int, delay_ms)
__field(__u64, finalize_sent_seq)
),

TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->c_rid = rid;
__entry->saw_finalized = saw_finalized;
__entry->others_active = others_active;
__entry->ours_visible = ours_visible;
__entry->finalize_ours = finalize_ours;
__entry->delay_ms = delay_ms;
__entry->finalize_sent_seq = finalize_sent_seq;
),

TP_printk(SCSBF" rid %016llx saw_finalized %u others_active %u ours_visible %u finalize_ours %u delay_ms %u finalize_sent_seq %llu",
SCSB_TRACE_ARGS, __entry->c_rid, __entry->saw_finalized, __entry->others_active,
__entry->ours_visible, __entry->finalize_ours, __entry->delay_ms,
__entry->finalize_sent_seq)
);

TRACE_EVENT(scoutfs_get_log_merge_status,
TP_PROTO(struct super_block *sb, u64 rid, struct scoutfs_key *next_range_key,
u64 nr_requests, u64 nr_complete, u64 seq),
@@ -2799,6 +2879,81 @@ TRACE_EVENT(scoutfs_omap_should_delete,
SCSB_TRACE_ARGS, __entry->ino, __entry->nlink, __entry->ret)
);

#define SSCF_FMT "[bo %llu bs %llu es %llu]"
#define SSCF_FIELDS(pref) \
__field(__u64, pref##_blkno) \
__field(__u64, pref##_blocks) \
__field(__u64, pref##_entries)
#define SSCF_ASSIGN(pref, sfl) \
__entry->pref##_blkno = le64_to_cpu((sfl)->ref.blkno); \
__entry->pref##_blocks = le64_to_cpu((sfl)->blocks); \
__entry->pref##_entries = le64_to_cpu((sfl)->entries);
#define SSCF_ENTRY_ARGS(pref) \
__entry->pref##_blkno, \
__entry->pref##_blocks, \
__entry->pref##_entries

DECLARE_EVENT_CLASS(scoutfs_srch_compact_class,
TP_PROTO(struct super_block *sb, struct scoutfs_srch_compact *sc),

TP_ARGS(sb, sc),

TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, id)
__field(__u8, nr)
__field(__u8, flags)
SSCF_FIELDS(out)
__field(__u64, in0_blk)
__field(__u64, in0_pos)
SSCF_FIELDS(in0)
__field(__u64, in1_blk)
__field(__u64, in1_pos)
SSCF_FIELDS(in1)
__field(__u64, in2_blk)
__field(__u64, in2_pos)
SSCF_FIELDS(in2)
__field(__u64, in3_blk)
__field(__u64, in3_pos)
SSCF_FIELDS(in3)
),

TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->id = le64_to_cpu(sc->id);
__entry->nr = sc->nr;
__entry->flags = sc->flags;
SSCF_ASSIGN(out, &sc->out)
__entry->in0_blk = le64_to_cpu(sc->in[0].blk);
__entry->in0_pos = le64_to_cpu(sc->in[0].pos);
SSCF_ASSIGN(in0, &sc->in[0].sfl)
__entry->in1_blk = le64_to_cpu(sc->in[0].blk);
__entry->in1_pos = le64_to_cpu(sc->in[0].pos);
SSCF_ASSIGN(in1, &sc->in[1].sfl)
__entry->in2_blk = le64_to_cpu(sc->in[0].blk);
__entry->in2_pos = le64_to_cpu(sc->in[0].pos);
SSCF_ASSIGN(in2, &sc->in[2].sfl)
__entry->in3_blk = le64_to_cpu(sc->in[0].blk);
__entry->in3_pos = le64_to_cpu(sc->in[0].pos);
SSCF_ASSIGN(in3, &sc->in[3].sfl)
),

TP_printk(SCSBF" id %llu nr %u flags 0x%x out "SSCF_FMT" in0 b %llu p %llu "SSCF_FMT" in1 b %llu p %llu "SSCF_FMT" in2 b %llu p %llu "SSCF_FMT" in3 b %llu p %llu "SSCF_FMT,
SCSB_TRACE_ARGS, __entry->id, __entry->nr, __entry->flags, SSCF_ENTRY_ARGS(out),
__entry->in0_blk, __entry->in0_pos, SSCF_ENTRY_ARGS(in0),
__entry->in1_blk, __entry->in1_pos, SSCF_ENTRY_ARGS(in1),
__entry->in2_blk, __entry->in2_pos, SSCF_ENTRY_ARGS(in2),
__entry->in3_blk, __entry->in3_pos, SSCF_ENTRY_ARGS(in3))
);
DEFINE_EVENT(scoutfs_srch_compact_class, scoutfs_srch_compact_client_send,
TP_PROTO(struct super_block *sb, struct scoutfs_srch_compact *sc),
TP_ARGS(sb, sc)
);
DEFINE_EVENT(scoutfs_srch_compact_class, scoutfs_srch_compact_client_recv,
TP_PROTO(struct super_block *sb, struct scoutfs_srch_compact *sc),
TP_ARGS(sb, sc)
);

#endif /* _TRACE_SCOUTFS_H */

/* This part must be outside protection */
@@ -148,6 +148,8 @@ struct server_info {
struct scoutfs_quorum_config qconf;
/* a running server maintains a private dirty super */
struct scoutfs_super_block dirty_super;

u64 finalize_sent_seq;
};

#define DECLARE_SERVER_INFO(sb, name) \
@@ -413,6 +415,27 @@ static void server_hold_commit(struct super_block *sb, struct commit_hold *hold)
wait_event(cusers->waitq, hold_commit(sb, server, cusers, hold));
}

/*
* Return the higher of the avail or freed used by the active commit
* since this holder joined the commit. This is *not* the amount used
* by the holder, we don't track per-holder alloc use.
*/
static u32 server_hold_alloc_used_since(struct super_block *sb, struct commit_hold *hold)
{
DECLARE_SERVER_INFO(sb, server);
u32 avail_used;
u32 freed_used;
u32 avail_now;
u32 freed_now;

scoutfs_alloc_meta_remaining(&server->alloc, &avail_now, &freed_now);

avail_used = hold->avail - avail_now;
freed_used = hold->freed - freed_now;

return max(avail_used, freed_used);
}

/*
* This is called while holding the commit and returns once the commit
* is successfully written. Many holders can all wait for all holders
@@ -938,22 +961,24 @@ static int find_log_trees_item(struct super_block *sb,
}

/*
* Find the next log_trees item from the key. Fills the caller's log_trees and sets
* the key past the returned log_trees for iteration. Returns 0 when done, > 0 for each
* item, and -errno on fatal errors.
* Find the log_trees item with the greatest nr for each rid. Fills the
* caller's log_trees and sets the key before the returned log_trees for
* the next iteration. Returns 0 when done, > 0 for each item, and
* -errno on fatal errors.
*/
static int for_each_lt(struct super_block *sb, struct scoutfs_btree_root *root,
struct scoutfs_key *key, struct scoutfs_log_trees *lt)
static int for_each_rid_last_lt(struct super_block *sb, struct scoutfs_btree_root *root,
struct scoutfs_key *key, struct scoutfs_log_trees *lt)
{
SCOUTFS_BTREE_ITEM_REF(iref);
int ret;

ret = scoutfs_btree_next(sb, root, key, &iref);
ret = scoutfs_btree_prev(sb, root, key, &iref);
if (ret == 0) {
if (iref.val_len == sizeof(struct scoutfs_log_trees)) {
memcpy(lt, iref.val, iref.val_len);
*key = *iref.key;
scoutfs_key_inc(key);
key->sklt_nr = 0;
scoutfs_key_dec(key);
ret = 1;
} else {
ret = -EIO;
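The reversed iteration that for_each_rid_last_lt() implements (search backwards from (U64_MAX, U64_MAX), then move the key to just below (rid, 0) so the next scoutfs_btree_prev() lands on the previous rid's greatest nr) amounts to visiting only the last log_trees item of each client. A tiny user-space sketch of that visiting order, with invented names and data:

```c
/*
 * Illustration of "greatest nr per rid" iteration over a sorted set,
 * analogous to for_each_rid_last_lt() above. Data and names are made up.
 */
#include <stdio.h>

struct lt_key {
	unsigned long long rid;
	unsigned long long nr;
};

int main(void)
{
	/* sorted by (rid, nr), as the logs_root btree keys are */
	const struct lt_key keys[] = {
		{ 1, 1 }, { 1, 2 }, { 1, 3 },
		{ 2, 1 },
		{ 3, 1 }, { 3, 2 },
	};
	int i = sizeof(keys) / sizeof(keys[0]) - 1;

	/* walk backwards; after a rid's last item, skip the rest of that rid */
	while (i >= 0) {
		unsigned long long rid = keys[i].rid;

		printf("rid %llu last nr %llu\n", rid, keys[i].nr);
		while (i >= 0 && keys[i].rid == rid)
			i--;	/* plays the role of searching from just below (rid, 0) */
	}
	return 0;
}
```

The array stands in for the sorted logs_root items; setting sklt_nr to 0 and decrementing the key in the kernel is what the inner skip loop does here.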
@@ -1048,21 +1073,13 @@ static int next_log_merge_item(struct super_block *sb,
* abandoned log btree finalized. If it takes too long each client has
* a chance to make forward progress before being asked to commit again.
*
* We're waiting on heavy state that is protected by mutexes and
* transaction machinery. It's tricky to recreate that state for
* lightweight condition tests that don't change task state. Instead of
* trying to get that right, particularly as we unwind after success or
* after timeouts, waiters use an unsatisfying poll. Short enough to
* not add terrible latency, given how heavy and infrequent this already
* is, and long enough to not melt the cpu. This could be tuned if it
* becomes a problem.
*
* This can end up finalizing a new empty log btree if a new mount
* happens to arrive at just the right time. That's fine, merging will
* ignore and tear down the empty input.
*/
#define FINALIZE_POLL_MS (11)
#define FINALIZE_TIMEOUT_MS (MSEC_PER_SEC / 2)
#define FINALIZE_POLL_MIN_DELAY_MS 5U
#define FINALIZE_POLL_MAX_DELAY_MS 100U
#define FINALIZE_POLL_DELAY_GROWTH_PCT 150U
static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_log_trees *lt,
u64 rid, struct commit_hold *hold)
{
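The constants above replace the fixed 11 ms poll with a delay that starts at 5 ms, grows by 150% per iteration, and caps at 100 ms. A quick sketch of that arithmetic (ignoring the work done between sleeps) shows how the delays add up against the 500 ms default wait timeout; the constants are taken from the diff, the loop itself is illustrative:

```c
/* Sketch of the FINALIZE_POLL_* backoff arithmetic from the hunk above. */
#include <stdio.h>

#define MIN_DELAY_MS	5U
#define MAX_DELAY_MS	100U
#define GROWTH_PCT	150U
#define TIMEOUT_MS	500U	/* DEFAULT_LOG_MERGE_WAIT_TIMEOUT_MS */

int main(void)
{
	unsigned int delay_ms = MIN_DELAY_MS;
	unsigned int slept = 0;
	int polls = 0;

	while (slept < TIMEOUT_MS) {
		slept += delay_ms;
		polls++;
		printf("poll %2d: slept %3u ms, total %3u ms\n",
		       polls, delay_ms, slept);
		delay_ms = delay_ms * GROWTH_PCT / 100;
		if (delay_ms > MAX_DELAY_MS)
			delay_ms = MAX_DELAY_MS;
	}
	return 0;
}
```

With integer math the delays run 5, 7, 10, 15, 22, 33, 49, 73 and then 100 ms, so roughly ten polls fit in the default timeout instead of the roughly 45 fixed 11 ms polls before.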
@@ -1070,8 +1087,10 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
struct scoutfs_log_merge_status stat;
struct scoutfs_log_merge_range rng;
struct scoutfs_mount_options opts;
struct scoutfs_log_trees each_lt;
struct scoutfs_log_trees fin;
unsigned int delay_ms;
unsigned long timeo;
bool saw_finalized;
bool others_active;
@@ -1079,10 +1098,14 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
bool ours_visible;
struct scoutfs_key key;
char *err_str = NULL;
ktime_t start;
int ret;
int err;

timeo = jiffies + msecs_to_jiffies(FINALIZE_TIMEOUT_MS);
scoutfs_options_read(sb, &opts);
timeo = jiffies + msecs_to_jiffies(opts.log_merge_wait_timeout_ms);
delay_ms = FINALIZE_POLL_MIN_DELAY_MS;
start = ktime_get_raw();

for (;;) {
/* nothing to do if there's already a merge in flight */
@@ -1099,8 +1122,13 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
saw_finalized = false;
others_active = false;
ours_visible = false;
scoutfs_key_init_log_trees(&key, 0, 0);
while ((ret = for_each_lt(sb, &super->logs_root, &key, &each_lt)) > 0) {
scoutfs_key_init_log_trees(&key, U64_MAX, U64_MAX);
while ((ret = for_each_rid_last_lt(sb, &super->logs_root, &key, &each_lt)) > 0) {

trace_scoutfs_server_finalize_items(sb, rid, le64_to_cpu(each_lt.rid),
le64_to_cpu(each_lt.nr),
le64_to_cpu(each_lt.flags),
le64_to_cpu(each_lt.get_trans_seq));

if ((le64_to_cpu(each_lt.flags) & SCOUTFS_LOG_TREES_FINALIZED))
saw_finalized = true;
@@ -1125,6 +1153,10 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
finalize_ours = (lt->item_root.height > 2) ||
(le32_to_cpu(lt->meta_avail.flags) & SCOUTFS_ALLOC_FLAG_LOW);

trace_scoutfs_server_finalize_decision(sb, rid, saw_finalized, others_active,
ours_visible, finalize_ours, delay_ms,
server->finalize_sent_seq);

/* done if we're not finalizing and there's no finalized */
if (!finalize_ours && !saw_finalized) {
ret = 0;
@@ -1132,12 +1164,13 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
}

/* send sync requests soon to give time to commit */
scoutfs_key_init_log_trees(&key, 0, 0);
scoutfs_key_init_log_trees(&key, U64_MAX, U64_MAX);
while (others_active &&
(ret = for_each_lt(sb, &super->logs_root, &key, &each_lt)) > 0) {
(ret = for_each_rid_last_lt(sb, &super->logs_root, &key, &each_lt)) > 0) {

if ((le64_to_cpu(each_lt.flags) & SCOUTFS_LOG_TREES_FINALIZED) ||
(le64_to_cpu(each_lt.rid) == rid))
(le64_to_cpu(each_lt.rid) == rid) ||
(le64_to_cpu(each_lt.get_trans_seq) <= server->finalize_sent_seq))
continue;

ret = scoutfs_net_submit_request_node(sb, server->conn,
@@ -1157,6 +1190,8 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
break;
}

server->finalize_sent_seq = scoutfs_server_seq(sb);

/* Finalize ours if it's visible to others */
if (ours_visible) {
fin = *lt;
@@ -1194,13 +1229,16 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
if (ret < 0)
err_str = "applying commit before waiting for finalized";

msleep(FINALIZE_POLL_MS);
msleep(delay_ms);
delay_ms = min(delay_ms * FINALIZE_POLL_DELAY_GROWTH_PCT / 100,
FINALIZE_POLL_MAX_DELAY_MS);

server_hold_commit(sb, hold);
mutex_lock(&server->logs_mutex);

/* done if we timed out */
if (time_after(jiffies, timeo)) {
scoutfs_inc_counter(sb, log_merge_wait_timeout);
ret = 0;
break;
}
@@ -1783,43 +1821,29 @@ out:
* Give the caller the last seq before outstanding client commits. All
* seqs up to and including this are stable, new client transactions can
* only have greater seqs.
*
* For each rid, only its greatest log trees nr can be an open commit.
* We look at the last log_trees item for each client rid and record its
* trans seq if it hasn't been committed.
*/
static int get_stable_trans_seq(struct super_block *sb, u64 *last_seq_ret)
{
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
DECLARE_SERVER_INFO(sb, server);
SCOUTFS_BTREE_ITEM_REF(iref);
struct scoutfs_log_trees *lt;
struct scoutfs_log_trees lt;
struct scoutfs_key key;
u64 last_seq = 0;
int ret;

last_seq = scoutfs_server_seq(sb) - 1;
scoutfs_key_init_log_trees(&key, 0, 0);

mutex_lock(&server->logs_mutex);

for (;; scoutfs_key_inc(&key)) {
ret = scoutfs_btree_next(sb, &super->logs_root, &key, &iref);
if (ret == 0) {
if (iref.val_len == sizeof(*lt)) {
lt = iref.val;
if ((le64_to_cpu(lt->get_trans_seq) >
le64_to_cpu(lt->commit_trans_seq)) &&
le64_to_cpu(lt->get_trans_seq) <= last_seq) {
last_seq = le64_to_cpu(lt->get_trans_seq) - 1;
}
key = *iref.key;
} else {
ret = -EIO;
}
scoutfs_btree_put_iref(&iref);
}
if (ret < 0) {
if (ret == -ENOENT) {
ret = 0;
break;
}
scoutfs_key_init_log_trees(&key, U64_MAX, U64_MAX);
while ((ret = for_each_rid_last_lt(sb, &super->logs_root, &key, &lt)) > 0) {
if ((le64_to_cpu(lt.get_trans_seq) > le64_to_cpu(lt.commit_trans_seq)) &&
le64_to_cpu(lt.get_trans_seq) <= last_seq) {
last_seq = le64_to_cpu(lt.get_trans_seq) - 1;
}
}

@@ -1966,9 +1990,7 @@ static int server_srch_get_compact(struct super_block *sb,
ret = scoutfs_srch_get_compact(sb, &server->alloc, &server->wri,
&super->srch_root, rid, sc);
mutex_unlock(&server->srch_mutex);
if (ret == 0 && sc->nr == 0)
ret = -ENOENT;
if (ret < 0)
if (ret < 0 || (ret == 0 && sc->nr == 0))
goto apply;

mutex_lock(&server->alloc_mutex);
@@ -2473,9 +2495,11 @@ static void server_log_merge_free_work(struct work_struct *work)

while (!server_is_stopping(server)) {

server_hold_commit(sb, &hold);
mutex_lock(&server->logs_mutex);
commit = true;
if (!commit) {
server_hold_commit(sb, &hold);
mutex_lock(&server->logs_mutex);
commit = true;
}

ret = next_log_merge_item(sb, &super->log_merge,
SCOUTFS_LOG_MERGE_FREEING_ZONE,
@@ -2522,12 +2546,14 @@ static void server_log_merge_free_work(struct work_struct *work)
/* freed blocks are in allocator, we *have* to update fr */
BUG_ON(ret < 0);

mutex_unlock(&server->logs_mutex);
ret = server_apply_commit(sb, &hold, ret);
commit = false;
if (ret < 0) {
err_str = "looping commit del/upd freeing item";
break;
if (server_hold_alloc_used_since(sb, &hold) >= COMMIT_HOLD_ALLOC_BUDGET / 2) {
mutex_unlock(&server->logs_mutex);
ret = server_apply_commit(sb, &hold, ret);
commit = false;
if (ret < 0) {
err_str = "looping commit del/upd freeing item";
break;
}
}
}

@@ -4300,6 +4326,7 @@ static void scoutfs_server_worker(struct work_struct *work)
scoutfs_info(sb, "server starting at "SIN_FMT, SIN_ARG(&sin));

scoutfs_block_writer_init(sb, &server->wri);
server->finalize_sent_seq = 0;

/* first make sure no other servers are still running */
ret = scoutfs_quorum_fence_leaders(sb, &server->qconf, server->term);
201 kmod/src/srch.c
@@ -30,6 +30,9 @@
#include "client.h"
#include "counters.h"
#include "scoutfs_trace.h"
#include "triggers.h"
#include "sysfs.h"
#include "msg.h"

/*
* This srch subsystem gives us a way to find inodes that have a given
@@ -68,10 +71,14 @@ struct srch_info {
atomic_t shutdown;
struct workqueue_struct *workq;
struct delayed_work compact_dwork;
struct scoutfs_sysfs_attrs ssa;
atomic_t compact_delay_ms;
};

#define DECLARE_SRCH_INFO(sb, name) \
struct srch_info *name = SCOUTFS_SB(sb)->srch_info
#define DECLARE_SRCH_INFO_KOBJ(kobj, name) \
DECLARE_SRCH_INFO(SCOUTFS_SYSFS_ATTRS_SB(kobj), name)

#define SRE_FMT "%016llx.%llu.%llu"
#define SRE_ARG(sre) \
@@ -520,6 +527,95 @@ out:
return ret;
}

/*
* Padded entries are encoded in pairs after an existing entry. All of
* the pairs cancel each other out by all readers (the second encoding
* looks like deletion) so they aren't visible to the first/last bounds of
* the block or file.
*/
static int append_padded_entry(struct scoutfs_srch_file *sfl, u64 blk,
struct scoutfs_srch_block *srb, struct scoutfs_srch_entry *sre)
{
int ret;

ret = encode_entry(srb->entries + le32_to_cpu(srb->entry_bytes),
sre, &srb->tail);
if (ret > 0) {
srb->tail = *sre;
le32_add_cpu(&srb->entry_nr, 1);
le32_add_cpu(&srb->entry_bytes, ret);
le64_add_cpu(&sfl->entries, 1);
ret = 0;
}

return ret;
}

/*
* This is called by a testing trigger to create a very specific case of
* encoded entry offsets. We want the last entry in the block to start
* precisely at the _SAFE_BYTES offset.
*
* This is called when there is a single existing entry in the block.
* We have the entire block to work with. We encode pairs of matching
* entries. This hides them from readers (both searches and merging) as
* they're interpreted as creation and deletion and are deleted. We use
* the existing hash value of the first entry in the block but then set
* the inode to an impossibly large number so it doesn't interfere with
* anything.
*
* To hit the specific offset we very carefully manage the amount of
* bytes of change between fields in the entry. We know that if we
* change all the bytes of the ino and id we end up with a 20 byte
* (2+8+8,2) encoding of the pair of entries. To have the last entry
* start at the _SAFE_POS offset we know that the final 20 byte pair
* encoding needs to end at 2 bytes (second entry encoding) after the
* _SAFE_POS offset.
*
* So as we encode pairs we watch the delta of our current offset from
* that desired final offset of 2 past _SAFE_POS. If we're a multiple
* of 20 away then we encode the full 20 byte pairs. If we're not, then
* we drop a byte to encode 19 bytes. That'll slowly change the offset
* to be a multiple of 20 again while encoding large entries.
*/
static void pad_entries_at_safe(struct scoutfs_srch_file *sfl, u64 blk,
struct scoutfs_srch_block *srb)
{
struct scoutfs_srch_entry sre;
u32 target;
s32 diff;
u64 hash;
u64 ino;
u64 id;
int ret;

hash = le64_to_cpu(srb->tail.hash);
ino = le64_to_cpu(srb->tail.ino) | (1ULL << 62);
id = le64_to_cpu(srb->tail.id);

target = SCOUTFS_SRCH_BLOCK_SAFE_BYTES + 2;

while ((diff = target - le32_to_cpu(srb->entry_bytes)) > 0) {
ino ^= 1ULL << (7 * 8);
if (diff % 20 == 0) {
id ^= 1ULL << (7 * 8);
} else {
id ^= 1ULL << (6 * 8);
}

sre.hash = cpu_to_le64(hash);
sre.ino = cpu_to_le64(ino);
sre.id = cpu_to_le64(id);

ret = append_padded_entry(sfl, blk, srb, &sre);
if (ret == 0)
ret = append_padded_entry(sfl, blk, srb, &sre);
BUG_ON(ret != 0);

diff = target - le32_to_cpu(srb->entry_bytes);
}
}
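The 20-byte/19-byte arithmetic in the comment above can be checked in isolation: each 19-byte pair moves the remaining distance one step closer to a multiple of 20, so as long as the gap to the target is at least a few hundred bytes (as it is when the trigger runs against a nearly empty block), the loop lands exactly on _SAFE_BYTES + 2. A stand-alone sketch with made-up offsets; the real entry encoding is not reproduced:

```c
/* Stand-alone check of the pad_entries_at_safe() offset arithmetic. */
#include <stdio.h>

int main(void)
{
	const int target = 5000;	/* stands in for SCOUTFS_SRCH_BLOCK_SAFE_BYTES + 2 */
	int start;

	for (start = 4600; start < 4620; start++) {
		int bytes = start;
		int pairs = 0;

		while (target - bytes > 0) {
			/* a pair normally encodes as 20 bytes; 19 realigns the residue */
			bytes += ((target - bytes) % 20 == 0) ? 20 : 19;
			pairs++;
		}
		printf("start %4d: %3d pairs, final %d (%s)\n", start, pairs,
		       bytes, bytes == target ? "exact" : "overshot");
	}
	return 0;
}
```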
/*
* The caller is dropping an ino/id because the tracking rbtree is full.
* This loses information so we can't return any entries at or after the
@@ -987,6 +1083,9 @@ int scoutfs_srch_rotate_log(struct super_block *sb,
struct scoutfs_key key;
int ret;

if (sfl->ref.blkno && !force && scoutfs_trigger(sb, SRCH_FORCE_LOG_ROTATE))
force = true;

if (sfl->ref.blkno == 0 ||
(!force && le64_to_cpu(sfl->blocks) < SCOUTFS_SRCH_LOG_BLOCK_LIMIT))
return 0;
@@ -1462,7 +1561,7 @@ static int kway_merge(struct super_block *sb,
struct scoutfs_block_writer *wri,
struct scoutfs_srch_file *sfl,
kway_get_t kway_get, kway_advance_t kway_adv,
void **args, int nr)
void **args, int nr, bool logs_input)
{
DECLARE_SRCH_INFO(sb, srinf);
struct scoutfs_srch_block *srb = NULL;
@@ -1567,6 +1666,15 @@ static int kway_merge(struct super_block *sb,
blk++;
}

/* end sorted block on _SAFE offset for testing */
if (bl && le32_to_cpu(srb->entry_nr) == 1 && logs_input &&
scoutfs_trigger(sb, SRCH_COMPACT_LOGS_PAD_SAFE)) {
pad_entries_at_safe(sfl, blk, srb);
scoutfs_block_put(sb, bl);
bl = NULL;
blk++;
}

scoutfs_inc_counter(sb, srch_compact_entry);

} else {
@@ -1609,6 +1717,8 @@ static int kway_merge(struct super_block *sb,
empty++;
ret = 0;
} else if (ret < 0) {
if (ret == -ENOANO) /* just testing trigger */
ret = 0;
goto out;
}

@@ -1816,7 +1926,7 @@ static int compact_logs(struct super_block *sb,
}

ret = kway_merge(sb, alloc, wri, &sc->out, kway_get_page, kway_adv_page,
args, nr_pages);
args, nr_pages, true);
if (ret < 0)
goto out;

@@ -1874,12 +1984,18 @@ static int kway_get_reader(struct super_block *sb,
srb = rdr->bl->data;

if (rdr->pos > SCOUTFS_SRCH_BLOCK_SAFE_BYTES ||
rdr->skip >= SCOUTFS_SRCH_BLOCK_SAFE_BYTES ||
rdr->skip > SCOUTFS_SRCH_BLOCK_SAFE_BYTES ||
rdr->skip >= le32_to_cpu(srb->entry_bytes)) {
/* XXX inconsistency */
return -EIO;
}

if (rdr->decoded_bytes == 0 && rdr->pos == SCOUTFS_SRCH_BLOCK_SAFE_BYTES &&
scoutfs_trigger(sb, SRCH_MERGE_STOP_SAFE)) {
/* only used in testing */
return -ENOANO;
}

/* decode entry, possibly skipping start of the block */
while (rdr->decoded_bytes == 0 || rdr->pos < rdr->skip) {
ret = decode_entry(srb->entries + rdr->pos,
@@ -1969,7 +2085,7 @@ static int compact_sorted(struct super_block *sb,
}

ret = kway_merge(sb, alloc, wri, &sc->out, kway_get_reader,
kway_adv_reader, args, nr);
kway_adv_reader, args, nr, false);

sc->flags |= SCOUTFS_SRCH_COMPACT_FLAG_DONE;
for (i = 0; i < nr; i++) {
@@ -2098,8 +2214,15 @@ static int delete_files(struct super_block *sb, struct scoutfs_alloc *alloc,
return ret;
}

/* wait 10s between compact attempts on error, immediate after success */
#define SRCH_COMPACT_DELAY_MS (10 * MSEC_PER_SEC)
static void queue_compact_work(struct srch_info *srinf, bool immediate)
{
unsigned long delay;

if (!atomic_read(&srinf->shutdown)) {
delay = immediate ? 0 : msecs_to_jiffies(atomic_read(&srinf->compact_delay_ms));
queue_delayed_work(srinf->workq, &srinf->compact_dwork, delay);
}
}

/*
* Get a compaction operation from the server, sort the entries from the
@@ -2127,7 +2250,6 @@ static void scoutfs_srch_compact_worker(struct work_struct *work)
struct super_block *sb = srinf->sb;
struct scoutfs_block_writer wri;
struct scoutfs_alloc alloc;
unsigned long delay;
int ret;
int err;

@@ -2140,6 +2262,8 @@ static void scoutfs_srch_compact_worker(struct work_struct *work)
scoutfs_block_writer_init(sb, &wri);

ret = scoutfs_client_srch_get_compact(sb, sc);
if (ret >= 0)
trace_scoutfs_srch_compact_client_recv(sb, sc);
if (ret < 0 || sc->nr == 0)
goto out;

@@ -2168,6 +2292,7 @@ commit:
sc->meta_freed = alloc.freed;
sc->flags |= ret < 0 ? SCOUTFS_SRCH_COMPACT_FLAG_ERROR : 0;

trace_scoutfs_srch_compact_client_send(sb, sc);
err = scoutfs_client_srch_commit_compact(sb, sc);
if (err < 0 && ret == 0)
ret = err;
@@ -2178,14 +2303,56 @@ out:
scoutfs_inc_counter(sb, srch_compact_error);

scoutfs_block_writer_forget_all(sb, &wri);
if (!atomic_read(&srinf->shutdown)) {
delay = ret == 0 ? 0 : msecs_to_jiffies(SRCH_COMPACT_DELAY_MS);
queue_delayed_work(srinf->workq, &srinf->compact_dwork, delay);
}
queue_compact_work(srinf, sc->nr > 0 && ret == 0);

kfree(sc);
}

static ssize_t compact_delay_ms_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
	DECLARE_SRCH_INFO_KOBJ(kobj, srinf);

	return snprintf(buf, PAGE_SIZE, "%u", atomic_read(&srinf->compact_delay_ms));
}

#define MIN_COMPACT_DELAY_MS MSEC_PER_SEC
#define DEF_COMPACT_DELAY_MS (10 * MSEC_PER_SEC)
#define MAX_COMPACT_DELAY_MS (60 * MSEC_PER_SEC)

static ssize_t compact_delay_ms_store(struct kobject *kobj, struct kobj_attribute *attr,
				      const char *buf, size_t count)
{
	struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
	DECLARE_SRCH_INFO(sb, srinf);
	char nullterm[30]; /* more than enough for octal -U64_MAX */
	u64 val;
	int len;
	int ret;

	len = min(count, sizeof(nullterm) - 1);
	memcpy(nullterm, buf, len);
	nullterm[len] = '\0';

	ret = kstrtoll(nullterm, 0, &val);
	if (ret < 0 || val < MIN_COMPACT_DELAY_MS || val > MAX_COMPACT_DELAY_MS) {
		scoutfs_err(sb, "invalid compact_delay_ms value, must be between %lu and %lu",
			    MIN_COMPACT_DELAY_MS, MAX_COMPACT_DELAY_MS);
		return -EINVAL;
	}

	atomic_set(&srinf->compact_delay_ms, val);
	cancel_delayed_work(&srinf->compact_dwork);
	queue_compact_work(srinf, false);

	return count;
}
SCOUTFS_ATTR_RW(compact_delay_ms);

static struct attribute *srch_attrs[] = {
	SCOUTFS_ATTR_PTR(compact_delay_ms),
	NULL,
};

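The compact_delay_ms attribute is a per-mount sysfs tunable bounded by the MIN/MAX values above. A minimal sketch of poking it from a shell, assuming a mount's scoutfs sysfs directory; the path component before srch/ varies per mount and is only illustrative here (the test added later in this series resolves it with the harness helper t_sysfs_path instead of hard-coding it):

    # how long the srch compaction worker waits between attempts
    srch_dir=/sys/fs/scoutfs/<mount-id>/srch     # illustrative path
    cat $srch_dir/compact_delay_ms               # default is 10000 (10s)
    echo 1000 > $srch_dir/compact_delay_ms       # accepted range is 1000..60000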
void scoutfs_srch_destroy(struct super_block *sb)
|
||||
{
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
@@ -2202,6 +2369,8 @@ void scoutfs_srch_destroy(struct super_block *sb)
|
||||
destroy_workqueue(srinf->workq);
|
||||
}
|
||||
|
||||
scoutfs_sysfs_destroy_attrs(sb, &srinf->ssa);
|
||||
|
||||
kfree(srinf);
|
||||
sbi->srch_info = NULL;
|
||||
}
|
||||
@@ -2219,8 +2388,15 @@ int scoutfs_srch_setup(struct super_block *sb)
|
||||
srinf->sb = sb;
|
||||
atomic_set(&srinf->shutdown, 0);
|
||||
INIT_DELAYED_WORK(&srinf->compact_dwork, scoutfs_srch_compact_worker);
|
||||
scoutfs_sysfs_init_attrs(sb, &srinf->ssa);
|
||||
atomic_set(&srinf->compact_delay_ms, DEF_COMPACT_DELAY_MS);
|
||||
|
||||
sbi->srch_info = srinf;
|
||||
|
||||
ret = scoutfs_sysfs_create_attrs(sb, &srinf->ssa, srch_attrs, "srch");
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
srinf->workq = alloc_workqueue("scoutfs_srch_compact",
|
||||
WQ_NON_REENTRANT | WQ_UNBOUND |
|
||||
WQ_HIGHPRI, 0);
|
||||
@@ -2229,8 +2405,7 @@ int scoutfs_srch_setup(struct super_block *sb)
|
||||
goto out;
|
||||
}
|
||||
|
||||
queue_delayed_work(srinf->workq, &srinf->compact_dwork,
|
||||
msecs_to_jiffies(SRCH_COMPACT_DELAY_MS));
|
||||
queue_compact_work(srinf, false);
|
||||
|
||||
ret = 0;
|
||||
out:
|
||||
|
||||
@@ -39,6 +39,9 @@ struct scoutfs_triggers {

static char *names[] = {
	[SCOUTFS_TRIGGER_BLOCK_REMOVE_STALE] = "block_remove_stale",
	[SCOUTFS_TRIGGER_SRCH_COMPACT_LOGS_PAD_SAFE] = "srch_compact_logs_pad_safe",
	[SCOUTFS_TRIGGER_SRCH_FORCE_LOG_ROTATE] = "srch_force_log_rotate",
	[SCOUTFS_TRIGGER_SRCH_MERGE_STOP_SAFE] = "srch_merge_stop_safe",
	[SCOUTFS_TRIGGER_STATFS_LOCK_PURGE] = "statfs_lock_purge",
};


@@ -3,6 +3,9 @@

enum scoutfs_trigger {
	SCOUTFS_TRIGGER_BLOCK_REMOVE_STALE,
	SCOUTFS_TRIGGER_SRCH_COMPACT_LOGS_PAD_SAFE,
	SCOUTFS_TRIGGER_SRCH_FORCE_LOG_ROTATE,
	SCOUTFS_TRIGGER_SRCH_MERGE_STOP_SAFE,
	SCOUTFS_TRIGGER_STATFS_LOCK_PURGE,
	SCOUTFS_TRIGGER_NR,
};

@@ -25,8 +25,9 @@ All options can be seen by running with -h.

This script is built to test multi-node systems on one host by using
different mounts of the same devices.  The script creates a fake block
device in front of each fs block device for each mount that will be
tested.  Currently it will create free loop devices and will mount on
/mnt/test.[0-9].
tested.  It will create predictable device mapper devices and mount
them on /mnt/test.N.  These static device names and mount paths limit
the script to a single execution per host.

All tests will be run by default.  Particular tests can be included or
excluded by providing test name regular expressions with the -I and -E

@@ -104,8 +105,8 @@ used during the test.

| Variable        | Description         | Origin          | Example           |
| --------------- | ------------------- | --------------- | ----------------- |
| T\_MB[0-9]      | per-mount meta bdev | created per run | /dev/loop0        |
| T\_DB[0-9]      | per-mount data bdev | created per run | /dev/loop1        |
| T\_MB[0-9]      | per-mount meta bdev | created per run | /dev/mapper/\_scoutfs\_test\_meta\_[0-9] |
| T\_DB[0-9]      | per-mount data bdev | created per run | /dev/mapper/\_scoutfs\_test\_data\_[0-9] |
| T\_D[0-9]       | per-mount test dir  | made for test   | /mnt/test.[0-9]/t |
| T\_META\_DEVICE | main FS meta bdev   | -M              | /dev/vda          |
| T\_DATA\_DEVICE | main FS data bdev   | -D              | /dev/vdb          |
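For reference, the per-mount devices the script now sets up are plain linear device-mapper targets over the main devices, roughly as in the sketch below. The device names are placeholders; the run script later in this diff derives the table from blockdev --getsz in the same way:

    # one linear dm target covering the whole meta device, named per mount index
    echo "0 $(blockdev --getsz /dev/vda) linear /dev/vda 0" > dmtable.meta
    dmsetup create _scoutfs_test_meta_0 --table "$(cat dmtable.meta)"
    ls -l /dev/mapper/_scoutfs_test_meta_0
    dmsetup remove _scoutfs_test_meta_0    # unmount_all removes these during cleanup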
@@ -6,6 +6,61 @@ t_filter_fs()
|
||||
-e 's@Device: [a-fA-F0-9]*h/[0-9]*d@Device: 0h/0d@g'
|
||||
}
|
||||
|
||||
#
|
||||
# We can hit a spurious kasan warning that was fixed upstream:
|
||||
#
|
||||
# e504e74cc3a2 x86/unwind/orc: Disable KASAN checking in the ORC unwinder, part 2
|
||||
#
|
||||
# KASAN can get mad when the unwinder doesn't find ORC metadata and
|
||||
# wanders up without using frames and hits the KASAN stack red zones.
|
||||
# We can ignore these messages.
|
||||
#
|
||||
# They're bracketed by:
|
||||
# [ 2687.690127] ==================================================================
|
||||
# [ 2687.691366] BUG: KASAN: stack-out-of-bounds in get_reg+0x1bc/0x230
|
||||
# ...
|
||||
# [ 2687.706220] ==================================================================
|
||||
# [ 2687.707284] Disabling lock debugging due to kernel taint
|
||||
#
|
||||
# That final lock debugging message may not be included.
|
||||
#
|
||||
ignore_harmless_unwind_kasan_stack_oob()
|
||||
{
|
||||
awk '
|
||||
BEGIN {
|
||||
in_soob = 0
|
||||
soob_nr = 0
|
||||
}
|
||||
( !in_soob && $0 ~ /==================================================================/ ) {
|
||||
in_soob = 1
|
||||
soob_nr = NR
|
||||
saved = $0
|
||||
}
|
||||
( in_soob == 1 && NR == (soob_nr + 1) ) {
|
||||
if (match($0, /KASAN: stack-out-of-bounds in get_reg/) != 0) {
|
||||
in_soob = 2
|
||||
} else {
|
||||
in_soob = 0
|
||||
print saved
|
||||
}
|
||||
saved=""
|
||||
}
|
||||
( in_soob == 2 && $0 ~ /==================================================================/ ) {
|
||||
in_soob = 3
|
||||
soob_nr = NR
|
||||
}
|
||||
( in_soob == 3 && NR > soob_nr && $0 !~ /Disabling lock debugging/ ) {
|
||||
in_soob = 0
|
||||
}
|
||||
( !in_soob ) { print $0 }
|
||||
END {
|
||||
if (saved) {
|
||||
print saved
|
||||
}
|
||||
}
|
||||
'
|
||||
}
|
||||
|
||||
#
|
||||
# Filter out expected messages. Putting messages here implies that
|
||||
# tests aren't relying on messages to discover failures.. they're
|
||||
@@ -86,10 +141,12 @@ t_filter_dmesg()
|
||||
re="$re|scoutfs .* critical transaction commit failure.*"
|
||||
|
||||
# change-devices causes loop device resizing
|
||||
re="$re|loop: module loaded"
|
||||
re="$re|loop[0-9].* detected capacity change from.*"
|
||||
|
||||
# ignore systemd-journal rotating
|
||||
re="$re|systemd-journald.*"
|
||||
|
||||
egrep -v "($re)"
|
||||
egrep -v "($re)" | \
|
||||
ignore_harmless_unwind_kasan_stack_oob
|
||||
}
|
||||
|
||||
@@ -265,6 +265,15 @@ t_trigger_get() {
	cat "$(t_trigger_path "$nr")/$which"
}

t_trigger_set() {
	local which="$1"
	local nr="$2"
	local val="$3"
	local path=$(t_trigger_path "$nr")

	echo "$val" > "$path/$which"
}

t_trigger_show() {
	local which="$1"
	local string="$2"
@@ -276,9 +285,8 @@ t_trigger_show() {
t_trigger_arm_silent() {
	local which="$1"
	local nr="$2"
	local path=$(t_trigger_path "$nr")

	echo 1 > "$path/$which"
	t_trigger_set "$which" "$nr" 1
}

t_trigger_arm() {
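The new t_trigger_set helper sits underneath the existing arm/get helpers, which the new srch test later in this series uses in the usual arm, run, verify pattern. A sketch, assuming mount number 0:

    nr=0
    t_trigger_arm srch_force_log_rotate $nr      # writes 1 and prints that it is armed
    # ... run the workload that should consume the trigger ...
    test "$(t_trigger_get srch_force_log_rotate $nr)" == "0" || \
            t_fail "srch_force_log_rotate didn't trigger"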
@@ -1,3 +1,4 @@
|
||||
== measure initial createmany
|
||||
== measure initial createmany
|
||||
== measure two concurrent createmany runs
|
||||
== cleanup
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
== setting longer hung task timeout
|
||||
== creating fragmented extents
|
||||
== unlink file with moved extents to free extents per block
|
||||
== cleanup
|
||||
|
||||
37
tests/golden/srch-safe-merge-pos
Normal file
37
tests/golden/srch-safe-merge-pos
Normal file
@@ -0,0 +1,37 @@
|
||||
== initialize per-mount values
|
||||
== arm compaction triggers
|
||||
trigger srch_compact_logs_pad_safe armed: 1
|
||||
trigger srch_merge_stop_safe armed: 1
|
||||
trigger srch_compact_logs_pad_safe armed: 1
|
||||
trigger srch_merge_stop_safe armed: 1
|
||||
trigger srch_compact_logs_pad_safe armed: 1
|
||||
trigger srch_merge_stop_safe armed: 1
|
||||
trigger srch_compact_logs_pad_safe armed: 1
|
||||
trigger srch_merge_stop_safe armed: 1
|
||||
trigger srch_compact_logs_pad_safe armed: 1
|
||||
trigger srch_merge_stop_safe armed: 1
|
||||
== compact more often
|
||||
== create padded sorted inputs by forcing log rotation
|
||||
trigger srch_force_log_rotate armed: 1
|
||||
trigger srch_force_log_rotate armed: 1
|
||||
trigger srch_force_log_rotate armed: 1
|
||||
trigger srch_force_log_rotate armed: 1
|
||||
trigger srch_compact_logs_pad_safe armed: 1
|
||||
trigger srch_force_log_rotate armed: 1
|
||||
trigger srch_force_log_rotate armed: 1
|
||||
trigger srch_force_log_rotate armed: 1
|
||||
trigger srch_force_log_rotate armed: 1
|
||||
trigger srch_compact_logs_pad_safe armed: 1
|
||||
trigger srch_force_log_rotate armed: 1
|
||||
trigger srch_force_log_rotate armed: 1
|
||||
trigger srch_force_log_rotate armed: 1
|
||||
trigger srch_force_log_rotate armed: 1
|
||||
trigger srch_compact_logs_pad_safe armed: 1
|
||||
trigger srch_force_log_rotate armed: 1
|
||||
trigger srch_force_log_rotate armed: 1
|
||||
trigger srch_force_log_rotate armed: 1
|
||||
trigger srch_force_log_rotate armed: 1
|
||||
trigger srch_compact_logs_pad_safe armed: 1
|
||||
== compaction of padded should stop at safe
|
||||
== verify no compaction errors
|
||||
== cleanup
|
||||
@@ -326,16 +326,10 @@ unmount_all() {
|
||||
cmd wait $p
|
||||
done
|
||||
|
||||
# delete all temp meta devices
|
||||
for dev in $(losetup --associated "$T_META_DEVICE" | cut -d : -f 1); do
|
||||
if [ -e "$dev" ]; then
|
||||
cmd losetup -d "$dev"
|
||||
fi
|
||||
done
|
||||
# delete all temp data devices
|
||||
for dev in $(losetup --associated "$T_DATA_DEVICE" | cut -d : -f 1); do
|
||||
if [ -e "$dev" ]; then
|
||||
cmd losetup -d "$dev"
|
||||
# delete all temp devices
|
||||
for dev in /dev/mapper/_scoutfs_test_*; do
|
||||
if [ -b "$dev" ]; then
|
||||
cmd dmsetup remove $dev
|
||||
fi
|
||||
done
|
||||
}
|
||||
@@ -434,6 +428,12 @@ $T_UTILS/fenced/scoutfs-fenced > "$T_FENCED_LOG" 2>&1 &
|
||||
fenced_pid=$!
|
||||
fenced_log "started fenced pid $fenced_pid in the background"
|
||||
|
||||
# setup dm tables
|
||||
echo "0 $(blockdev --getsz $T_META_DEVICE) linear $T_META_DEVICE 0" > \
|
||||
$T_RESULTS/dmtable.meta
|
||||
echo "0 $(blockdev --getsz $T_DATA_DEVICE) linear $T_DATA_DEVICE 0" > \
|
||||
$T_RESULTS/dmtable.data
|
||||
|
||||
#
|
||||
# mount concurrently so that a quorum is present to elect the leader and
|
||||
# start a server.
|
||||
@@ -442,10 +442,13 @@ msg "mounting $T_NR_MOUNTS mounts on meta $T_META_DEVICE data $T_DATA_DEVICE"
|
||||
pids=""
|
||||
for i in $(seq 0 $((T_NR_MOUNTS - 1))); do
|
||||
|
||||
meta_dev=$(losetup --find --show $T_META_DEVICE)
|
||||
test -b "$meta_dev" || die "failed to create temp device $meta_dev"
|
||||
data_dev=$(losetup --find --show $T_DATA_DEVICE)
|
||||
test -b "$data_dev" || die "failed to create temp device $data_dev"
|
||||
name="_scoutfs_test_meta_$i"
|
||||
cmd dmsetup create "$name" --table "$(cat $T_RESULTS/dmtable.meta)"
|
||||
meta_dev="/dev/mapper/$name"
|
||||
|
||||
name="_scoutfs_test_data_$i"
|
||||
cmd dmsetup create "$name" --table "$(cat $T_RESULTS/dmtable.data)"
|
||||
data_dev="/dev/mapper/$name"
|
||||
|
||||
dir="/mnt/test.$i"
|
||||
test -d "$dir" || cmd mkdir -p "$dir"
|
||||
|
||||
@@ -14,6 +14,7 @@ offline-extent-waiting.sh
|
||||
move-blocks.sh
|
||||
large-fragmented-free.sh
|
||||
enospc.sh
|
||||
srch-safe-merge-pos.sh
|
||||
srch-basic-functionality.sh
|
||||
simple-xattr-unit.sh
|
||||
totl-xattr-tag.sh
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
#include <unistd.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <stdarg.h>
|
||||
#include <errno.h>
|
||||
#include <string.h>
|
||||
#include <sys/stat.h>
|
||||
@@ -35,10 +36,10 @@ struct opts {
|
||||
unsigned int dry_run:1,
|
||||
ls_output:1,
|
||||
quiet:1,
|
||||
user_xattr:1,
|
||||
same_srch_xattr:1,
|
||||
group_srch_xattr:1,
|
||||
unique_srch_xattr:1;
|
||||
xattr_set:1,
|
||||
xattr_file:1,
|
||||
xattr_group:1;
|
||||
char *xattr_name;
|
||||
};
|
||||
|
||||
struct stats {
|
||||
@@ -149,12 +150,31 @@ static void free_dir(struct dir *dir)
|
||||
free(dir);
|
||||
}
|
||||
|
||||
static size_t snprintf_off(void *buf, size_t sz, size_t off, char *fmt, ...)
|
||||
{
|
||||
va_list ap;
|
||||
int ret;
|
||||
|
||||
if (off >= sz)
|
||||
return sz;
|
||||
|
||||
va_start(ap, fmt);
|
||||
ret = vsnprintf(buf + off, sz - off, fmt, ap);
|
||||
va_end(ap);
|
||||
|
||||
if (ret <= 0)
|
||||
return sz;
|
||||
|
||||
return off + ret;
|
||||
}
|
||||
|
||||
static void create_dir(struct dir *dir, struct opts *opts,
|
||||
struct stats *stats)
|
||||
{
|
||||
struct str_list *s;
|
||||
char name[100];
|
||||
char name[256]; /* max len and null term */
|
||||
char val = 'v';
|
||||
size_t off;
|
||||
int rc;
|
||||
int i;
|
||||
|
||||
@@ -175,29 +195,21 @@ static void create_dir(struct dir *dir, struct opts *opts,
|
||||
rc = mknod(s->str, S_IFREG | 0644, 0);
|
||||
error_exit(rc, "mknod %s failed"ERRF, s->str, ERRA);
|
||||
|
||||
rc = 0;
|
||||
if (rc == 0 && opts->user_xattr) {
|
||||
strcpy(name, "user.scoutfs_bcp");
|
||||
rc = setxattr(s->str, name, &val, 1, 0);
|
||||
}
|
||||
if (rc == 0 && opts->same_srch_xattr) {
|
||||
strcpy(name, "scoutfs.srch.scoutfs_bcp");
|
||||
rc = setxattr(s->str, name, &val, 1, 0);
|
||||
}
|
||||
if (rc == 0 && opts->group_srch_xattr) {
|
||||
snprintf(name, sizeof(name),
|
||||
"scoutfs.srch.scoutfs_bcp.group.%lu",
|
||||
stats->files / 10000);
|
||||
rc = setxattr(s->str, name, &val, 1, 0);
|
||||
}
|
||||
if (rc == 0 && opts->unique_srch_xattr) {
|
||||
snprintf(name, sizeof(name),
|
||||
"scoutfs.srch.scoutfs_bcp.unique.%lu",
|
||||
stats->files);
|
||||
if (opts->xattr_set) {
|
||||
off = snprintf_off(name, sizeof(name), 0, "%s", opts->xattr_name);
|
||||
if (opts->xattr_file)
|
||||
off = snprintf_off(name, sizeof(name), off,
|
||||
"-f-%lu", stats->files);
|
||||
if (opts->xattr_group)
|
||||
off = snprintf_off(name, sizeof(name), off,
|
||||
"-g-%lu", stats->files / 10000);
|
||||
|
||||
error_exit(off >= sizeof(name), "xattr name longer than 255 bytes");
|
||||
|
||||
rc = setxattr(s->str, name, &val, 1, 0);
|
||||
error_exit(rc, "setxattr %s %s failed"ERRF, s->str, name, ERRA);
|
||||
}
|
||||
|
||||
error_exit(rc, "setxattr %s %s failed"ERRF, s->str, name, ERRA);
|
||||
|
||||
stats->files++;
|
||||
rate_banner(opts, stats);
|
||||
@@ -365,11 +377,10 @@ static void usage(void)
	       " -d DIR | create all files in DIR top level directory\n"
	       " -n | dry run, only parse, don't create any files\n"
	       " -q | quiet, don't regularly print rates\n"
	       " -F | append \"-f-NR\" file nr to xattr name, requires -X\n"
	       " -G | append \"-g-NR\" file nr/10000 to xattr name, requires -X\n"
	       " -L | parse ls output; only reg, skip meta, paths at ./\n"
	       " -X | set the same user. xattr name in all files\n"
	       " -S | set the same .srch. xattr name in all files\n"
	       " -G | set a .srch. xattr name shared by groups of files\n"
	       " -U | set a unique .srch. xattr name in all files\n");
	       " -X NAM | set named xattr in all files\n");
}

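With the reworked options, bulk_create_paths reads file names on standard input, creates them under -d, and sets one named xattr per file, with -F and -G appending per-file and per-group suffixes. A usage sketch along the lines of the updated tests; the xattr name and directory are just examples:

    # create 1000 files under $T_D0/dir, each carrying a unique .srch. xattr
    seq -f "f-%.20g" 1 1000 | \
            src/bulk_create_paths -X scoutfs.srch.example -F -d "$T_D0/dir" > /dev/null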
int main(int argc, char **argv)
|
||||
@@ -386,7 +397,7 @@ int main(int argc, char **argv)
|
||||
|
||||
memset(&opts, 0, sizeof(opts));
|
||||
|
||||
while ((c = getopt(argc, argv, "d:nqLXSGU")) != -1) {
|
||||
while ((c = getopt(argc, argv, "d:nqFGLX:")) != -1) {
|
||||
switch(c) {
|
||||
case 'd':
|
||||
top_dir = strdup(optarg);
|
||||
@@ -397,20 +408,19 @@ int main(int argc, char **argv)
|
||||
case 'q':
|
||||
opts.quiet = 1;
|
||||
break;
|
||||
case 'F':
|
||||
opts.xattr_file = 1;
|
||||
break;
|
||||
case 'G':
|
||||
opts.xattr_group = 1;
|
||||
break;
|
||||
case 'L':
|
||||
opts.ls_output = 1;
|
||||
break;
|
||||
case 'X':
|
||||
opts.user_xattr = 1;
|
||||
break;
|
||||
case 'S':
|
||||
opts.same_srch_xattr = 1;
|
||||
break;
|
||||
case 'G':
|
||||
opts.group_srch_xattr = 1;
|
||||
break;
|
||||
case 'U':
|
||||
opts.unique_srch_xattr = 1;
|
||||
opts.xattr_set = 1;
|
||||
opts.xattr_name = strdup(optarg);
|
||||
error_exit(!opts.xattr_name, "error allocating xattr name");
|
||||
break;
|
||||
case '?':
|
||||
printf("Unknown option '%c'\n", optopt);
|
||||
@@ -419,6 +429,11 @@ int main(int argc, char **argv)
|
||||
}
|
||||
}
|
||||
|
||||
error_exit(opts.xattr_file && !opts.xattr_set,
|
||||
"must specify xattr -X when appending file nr with -F");
|
||||
error_exit(opts.xattr_group && !opts.xattr_set,
|
||||
"must specify xattr -X when appending file nr with -G");
|
||||
|
||||
if (!opts.dry_run) {
|
||||
error_exit(!top_dir,
|
||||
"must specify top level directory with -d");
|
||||
|
||||
@@ -7,9 +7,11 @@ t_require_mounts 2
|
||||
|
||||
COUNT=50000
|
||||
|
||||
# Prep dirs for test. Each mount needs to make their own parent dir for
|
||||
# the createmany run, otherwise both dirs will end up in the same inode
|
||||
# group, causing updates to bounce that lock around.
|
||||
#
|
||||
# Prep dirs for test. We have per-directory inode number allocators so
|
||||
# by putting each createmany in a per-mount dir they get their own inode
|
||||
# number region and cluster locks.
|
||||
#
|
||||
echo "== measure initial createmany"
|
||||
mkdir -p $T_D0/dir/0
|
||||
mkdir $T_D1/dir/1
|
||||
@@ -17,18 +19,20 @@ mkdir $T_D1/dir/1
|
||||
echo "== measure initial createmany"
|
||||
START=$SECONDS
|
||||
createmany -o "$T_D0/file_" $COUNT >> $T_TMP.full
|
||||
sync
|
||||
SINGLE=$((SECONDS - START))
|
||||
echo single $SINGLE >> $T_TMP.full
|
||||
|
||||
echo "== measure two concurrent createmany runs"
|
||||
START=$SECONDS
|
||||
createmany -o $T_D0/dir/0/file $COUNT > /dev/null &
|
||||
(cd $T_D0/dir/0; createmany -o ./file_ $COUNT > /dev/null) &
|
||||
pids="$!"
|
||||
createmany -o $T_D1/dir/1/file $COUNT > /dev/null &
|
||||
(cd $T_D1/dir/1; createmany -o ./file_ $COUNT > /dev/null) &
|
||||
pids="$pids $!"
|
||||
for p in $pids; do
|
||||
wait $p
|
||||
done
|
||||
sync
|
||||
BOTH=$((SECONDS - START))
|
||||
echo both $BOTH >> $T_TMP.full
|
||||
|
||||
@@ -41,7 +45,10 @@ echo both $BOTH >> $T_TMP.full
|
||||
# synchronized operation.
|
||||
FACTOR=200
|
||||
if [ "$BOTH" -gt $(($SINGLE*$FACTOR)) ]; then
|
||||
echo "both createmany took $BOTH sec, more than $FACTOR x single $SINGLE sec"
|
||||
t_fail "both createmany took $BOTH sec, more than $FACTOR x single $SINGLE sec"
|
||||
fi
|
||||
|
||||
echo "== cleanup"
|
||||
find $T_D0/dir -delete
|
||||
|
||||
t_pass
|
||||
|
||||
@@ -10,6 +10,30 @@ EXTENTS_PER_BTREE_BLOCK=600
|
||||
EXTENTS_PER_LIST_BLOCK=8192
|
||||
FREED_EXTENTS=$((EXTENTS_PER_BTREE_BLOCK * EXTENTS_PER_LIST_BLOCK))
|
||||
|
||||
#
|
||||
# This test specifically creates a pathologically sparse file that will
|
||||
# be as expensive as possible to free. This is usually fine on
|
||||
# dedicated or reasonable hardware, but trying to run this in
|
||||
# virtualized debug kernels can take a very long time. This test is
|
||||
# about making sure that the server doesn't fail, not that the platform
|
||||
# can handle the scale of work that our btree formats happen to require
|
||||
# while execution is bogged down with use-after-free memory reference
|
||||
# tracking. So we give the test a lot more breathing room before
|
||||
# deciding that its hung.
|
||||
#
|
||||
echo "== setting longer hung task timeout"
|
||||
if [ -w /proc/sys/kernel/hung_task_timeout_secs ]; then
|
||||
secs=$(cat /proc/sys/kernel/hung_task_timeout_secs)
|
||||
test "$secs" -gt 0 || \
|
||||
t_fail "confusing value '$secs' from /proc/sys/kernel/hung_task_timeout_secs"
|
||||
restore_hung_task_timeout()
|
||||
{
|
||||
echo "$secs" > /proc/sys/kernel/hung_task_timeout_secs
|
||||
}
|
||||
trap restore_hung_task_timeout EXIT
|
||||
echo "$((secs * 5))" > /proc/sys/kernel/hung_task_timeout_secs
|
||||
fi
|
||||
|
||||
echo "== creating fragmented extents"
|
||||
fragmented_data_extents $FREED_EXTENTS $EXTENTS_PER_BTREE_BLOCK "$T_D0/alloc" "$T_D0/move"
|
||||
|
||||
|
||||
@@ -9,6 +9,7 @@ LOG=340000
|
||||
LIM=1000000
|
||||
|
||||
SEQF="%.20g"
|
||||
SXA="scoutfs.srch.test-srch-basic-functionality"
|
||||
|
||||
t_require_commands touch rm setfattr scoutfs find_xattrs
|
||||
|
||||
@@ -27,20 +28,20 @@ diff_srch_find()
|
||||
|
||||
echo "== create new xattrs"
|
||||
touch "$T_D0/"{create,update}
|
||||
setfattr -n scoutfs.srch.test -v 1 "$T_D0/"{create,update} 2>&1 | t_filter_fs
|
||||
diff_srch_find scoutfs.srch.test
|
||||
setfattr -n $SXA -v 1 "$T_D0/"{create,update} 2>&1 | t_filter_fs
|
||||
diff_srch_find $SXA
|
||||
|
||||
echo "== update existing xattr"
|
||||
setfattr -n scoutfs.srch.test -v 2 "$T_D0/update" 2>&1 | t_filter_fs
|
||||
diff_srch_find scoutfs.srch.test
|
||||
setfattr -n $SXA -v 2 "$T_D0/update" 2>&1 | t_filter_fs
|
||||
diff_srch_find $SXA
|
||||
|
||||
echo "== remove an xattr"
|
||||
setfattr -x scoutfs.srch.test "$T_D0/create" 2>&1 | t_filter_fs
|
||||
diff_srch_find scoutfs.srch.test
|
||||
setfattr -x $SXA "$T_D0/create" 2>&1 | t_filter_fs
|
||||
diff_srch_find $SXA
|
||||
|
||||
echo "== remove xattr with files"
|
||||
rm -f "$T_D0/"{create,update}
|
||||
diff_srch_find scoutfs.srch.test
|
||||
diff_srch_find $SXA
|
||||
|
||||
echo "== trigger small log merges by rotating single block with unmount"
|
||||
sv=$(t_server_nr)
|
||||
@@ -56,7 +57,7 @@ while [ "$i" -lt "8" ]; do
|
||||
|
||||
eval path="\$T_D${nr}/single-block-$i"
|
||||
touch "$path"
|
||||
setfattr -n scoutfs.srch.single-block-logs -v $i "$path"
|
||||
setfattr -n $SXA -v $i "$path"
|
||||
t_umount $nr
|
||||
t_mount $nr
|
||||
|
||||
@@ -65,51 +66,51 @@ while [ "$i" -lt "8" ]; do
|
||||
done
|
||||
# wait for srch compaction worker delay
|
||||
sleep 10
|
||||
rm -rf "$T_D0/single-block-*"
|
||||
find "$T_D0" -type f -name 'single-block-*' -delete
|
||||
|
||||
echo "== create entries in current log"
|
||||
DIR="$T_D0/dir"
|
||||
NR=$((LOG / 4))
|
||||
mkdir -p "$DIR"
|
||||
seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -S -d "$DIR" > /dev/null
|
||||
diff_srch_find scoutfs.srch.scoutfs_bcp
|
||||
seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -X $SXA -d "$DIR" > /dev/null
|
||||
diff_srch_find $SXA
|
||||
|
||||
echo "== delete small fraction"
|
||||
seq -f "$DIR/f-$SEQF" 1 7 $NR | xargs setfattr -x scoutfs.srch.scoutfs_bcp
|
||||
diff_srch_find scoutfs.srch.scoutfs_bcp
|
||||
seq -f "$DIR/f-$SEQF" 1 7 $NR | xargs setfattr -x $SXA
|
||||
diff_srch_find $SXA
|
||||
|
||||
echo "== remove files"
|
||||
rm -rf "$DIR"
|
||||
diff_srch_find scoutfs.srch.scoutfs_bcp
|
||||
diff_srch_find $SXA
|
||||
|
||||
echo "== create entries that exceed one log"
|
||||
NR=$((LOG * 3 / 2))
|
||||
mkdir -p "$DIR"
|
||||
seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -S -d "$DIR" > /dev/null
|
||||
diff_srch_find scoutfs.srch.scoutfs_bcp
|
||||
seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -X $SXA -d "$DIR" > /dev/null
|
||||
diff_srch_find $SXA
|
||||
|
||||
echo "== delete fractions in phases"
|
||||
for i in $(seq 1 3); do
|
||||
seq -f "$DIR/f-$SEQF" $i 3 $NR | xargs setfattr -x scoutfs.srch.scoutfs_bcp
|
||||
diff_srch_find scoutfs.srch.scoutfs_bcp
|
||||
seq -f "$DIR/f-$SEQF" $i 3 $NR | xargs setfattr -x $SXA
|
||||
diff_srch_find $SXA
|
||||
done
|
||||
|
||||
echo "== remove files"
|
||||
rm -rf "$DIR"
|
||||
diff_srch_find scoutfs.srch.scoutfs_bcp
|
||||
diff_srch_find $SXA
|
||||
|
||||
echo "== create entries for exceed search entry limit"
|
||||
NR=$((LIM * 3 / 2))
|
||||
mkdir -p "$DIR"
|
||||
seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -S -d "$DIR" > /dev/null
|
||||
diff_srch_find scoutfs.srch.scoutfs_bcp
|
||||
seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -X $SXA -d "$DIR" > /dev/null
|
||||
diff_srch_find $SXA
|
||||
|
||||
echo "== delete half"
|
||||
seq -f "$DIR/f-$SEQF" 1 2 $NR | xargs setfattr -x scoutfs.srch.scoutfs_bcp
|
||||
diff_srch_find scoutfs.srch.scoutfs_bcp
|
||||
seq -f "$DIR/f-$SEQF" 1 2 $NR | xargs setfattr -x $SXA
|
||||
diff_srch_find $SXA
|
||||
|
||||
echo "== entirely remove third batch"
|
||||
rm -rf "$DIR"
|
||||
diff_srch_find scoutfs.srch.scoutfs_bcp
|
||||
diff_srch_find $SXA
|
||||
|
||||
t_pass
|
||||
|
||||
90
tests/tests/srch-safe-merge-pos.sh
Normal file
90
tests/tests/srch-safe-merge-pos.sh
Normal file
@@ -0,0 +1,90 @@
|
||||
#
|
||||
# There was a bug where srch file compaction could get stuck if a
|
||||
# partial compaction finished at the specific _SAFE_BYTES offset in a
|
||||
# block. Resuming from that position would return an error and
|
||||
# compaction would stop making forward progress.
|
||||
#
|
||||
# We use triggers to pad the output of log compaction to end on the safe
|
||||
# offset and then cause compaction of those padded inputs to stop at the
|
||||
# safe offset. Continuation will either succeed or return errors.
|
||||
#
|
||||
|
||||
# forcing rotation, so just a few
|
||||
NR=10
|
||||
SEQF="%.20g"
|
||||
COMPACT_NR=4
|
||||
|
||||
echo "== initialize per-mount values"
|
||||
declare -a err
|
||||
declare -a compact_delay
|
||||
for nr in $(t_fs_nrs); do
|
||||
err[$nr]=$(t_counter srch_compact_error $nr)
|
||||
compact_delay[$nr]=$(cat $(t_sysfs_path $nr)/srch/compact_delay_ms)
|
||||
done
|
||||
restore_compact_delay()
|
||||
{
|
||||
for nr in $(t_fs_nrs); do
|
||||
echo ${compact_delay[$nr]} > $(t_sysfs_path $nr)/srch/compact_delay_ms
|
||||
done
|
||||
}
|
||||
trap restore_compact_delay EXIT
|
||||
|
||||
echo "== arm compaction triggers"
|
||||
for nr in $(t_fs_nrs); do
|
||||
t_trigger_arm srch_compact_logs_pad_safe $nr
|
||||
t_trigger_arm srch_merge_stop_safe $nr
|
||||
done
|
||||
|
||||
echo "== compact more often"
|
||||
for nr in $(t_fs_nrs); do
|
||||
echo 1000 > $(t_sysfs_path $nr)/srch/compact_delay_ms
|
||||
done
|
||||
|
||||
echo "== create padded sorted inputs by forcing log rotation"
|
||||
sv=$(t_server_nr)
|
||||
for i in $(seq 1 $COMPACT_NR); do
|
||||
for j in $(seq 1 $COMPACT_NR); do
|
||||
t_trigger_arm srch_force_log_rotate $sv
|
||||
|
||||
seq -f "f-$i-$j-$SEQF" 1 10 | \
|
||||
bulk_create_paths -X "scoutfs.srch.t-srch-safe-merge-pos" -d "$T_D0" > \
|
||||
/dev/null
|
||||
sync
|
||||
|
||||
test "$(t_trigger_get srch_force_log_rotate $sv)" == "0" || \
|
||||
t_fail "srch_force_log_rotate didn't trigger"
|
||||
done
|
||||
|
||||
padded=0
|
||||
while test $padded == 0 && sleep .5; do
|
||||
for nr in $(t_fs_nrs); do
|
||||
if [ "$(t_trigger_get srch_compact_logs_pad_safe $nr)" == "0" ]; then
|
||||
t_trigger_arm srch_compact_logs_pad_safe $nr
|
||||
padded=1
|
||||
break
|
||||
fi
|
||||
test "$(t_counter srch_compact_error $nr)" == "${err[$nr]}" || \
|
||||
t_fail "srch_compact_error counter increased on mount $nr"
|
||||
done
|
||||
done
|
||||
done
|
||||
|
||||
echo "== compaction of padded should stop at safe"
|
||||
sleep 2
|
||||
for nr in $(t_fs_nrs); do
|
||||
if [ "$(t_trigger_get srch_merge_stop_safe $nr)" == "0" ]; then
|
||||
break
|
||||
fi
|
||||
done
|
||||
|
||||
echo "== verify no compaction errors"
|
||||
sleep 2
|
||||
for nr in $(t_fs_nrs); do
|
||||
test "$(t_counter srch_compact_error $nr)" == "${err[$nr]}" || \
|
||||
t_fail "srch_compact_error counter increased on mount $nr"
|
||||
done
|
||||
|
||||
echo "== cleanup"
|
||||
find "$T_D0" -type f -name 'f-*' -delete
|
||||
|
||||
t_pass
|
||||
@@ -7,7 +7,7 @@ FMTIOC_H := format.h ioctl.h
FMTIOC_KMOD := $(addprefix ../kmod/src/,$(FMTIOC_H))

CFLAGS := -Wall -O2 -Werror -D_FILE_OFFSET_BITS=64 -g -msse4.2 \
	-fno-strict-aliasing \
	-I src/ -fno-strict-aliasing \
	-DSCOUTFS_FORMAT_HASH=0x$(SCOUTFS_FORMAT_HASH)LLU

ifneq ($(wildcard $(firstword $(FMTIOC_KMOD))),)
@@ -15,8 +15,9 @@ CFLAGS += -I../kmod/src
endif

BIN := src/scoutfs
OBJ := $(patsubst %.c,%.o,$(wildcard src/*.c))
DEPS := $(wildcard */*.d)
OBJ_DIRS := src src/check
OBJ := $(foreach dir,$(OBJ_DIRS),$(patsubst %.c,%.o,$(wildcard $(dir)/*.c)))
DEPS := $(foreach dir,$(OBJ_DIRS),$(wildcard $(dir)/*.d))

all: $(BIN)

@@ -55,6 +55,19 @@ with initial sparse regions (perhaps by multiple threads writing to
different regions) and wasted space isn't an issue (perhaps because the
file population contains few small files).
.TP
.B log_merge_wait_timeout_ms=<number>
This option sets the amount of time, in milliseconds, that log merge
creation can wait before timing out.  This setting is per-mount, only
changes the behavior of that mount, and only affects the server when it
is running in that mount.
.sp
This determines how long it may take for mounts to synchronize
committing their log trees to create a log merge operation.  Setting it
too high can create long latencies in the event that a mount takes a
long time to commit its log.  Setting it too low can result in the
creation of excessive numbers of log trees that are never merged.  The
default is 500 and it cannot be less than 100 or greater than 60000.
.TP
.B metadev_path=<device>
The metadev_path option specifies the path to the block device that
contains the filesystem's metadata.
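A hedged example of the new mount option in use; the device paths and mount point are placeholders, and the value simply has to fall within the documented 100..60000 range:

    # wait up to one second for mounts to commit their logs for a merge
    mount -t scoutfs -o metadev_path=/dev/vda,log_merge_wait_timeout_ms=1000 \
            /dev/vdb /mnt/scoutfs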
@@ -10,6 +10,11 @@
 * Just a quick simple native bitmap.
 */

int test_bit(unsigned long *bits, u64 nr)
{
	return !!(bits[nr / BITS_PER_LONG] & (1UL << (nr & (BITS_PER_LONG - 1))));
}

void set_bit(unsigned long *bits, u64 nr)
{
	bits[nr / BITS_PER_LONG] |= 1UL << (nr & (BITS_PER_LONG - 1));

@@ -1,6 +1,7 @@
#ifndef _BITMAP_H_
#define _BITMAP_H_

int test_bit(unsigned long *bits, u64 nr);
void set_bit(unsigned long *bits, u64 nr);
void clear_bit(unsigned long *bits, u64 nr);
u64 find_next_set_bit(unsigned long *start, u64 from, u64 total);

159
utils/src/check/alloc.c
Normal file
159
utils/src/check/alloc.c
Normal file
@@ -0,0 +1,159 @@
|
||||
#include <unistd.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdbool.h>
|
||||
#include <sys/mman.h>
|
||||
#include <errno.h>
|
||||
|
||||
#include "sparse.h"
|
||||
#include "util.h"
|
||||
#include "format.h"
|
||||
#include "bitmap.h"
|
||||
#include "key.h"
|
||||
|
||||
#include "alloc.h"
|
||||
#include "block.h"
|
||||
#include "btree.h"
|
||||
#include "extent.h"
|
||||
#include "iter.h"
|
||||
#include "sns.h"
|
||||
|
||||
/*
|
||||
* We check the list blocks serially.
|
||||
*
|
||||
* XXX:
|
||||
* - compare ref seqs
|
||||
* - detect cycles?
|
||||
*/
|
||||
int alloc_list_meta_iter(struct scoutfs_alloc_list_head *lhead, extent_cb_t cb, void *cb_arg)
|
||||
{
|
||||
struct scoutfs_alloc_list_block *lblk;
|
||||
struct scoutfs_block_ref ref;
|
||||
struct block *blk = NULL;
|
||||
u64 blkno;
|
||||
int ret;
|
||||
|
||||
ref = lhead->ref;
|
||||
|
||||
while (ref.blkno) {
|
||||
blkno = le64_to_cpu(ref.blkno);
|
||||
|
||||
ret = cb(blkno, 1, cb_arg);
|
||||
if (ret < 0) {
|
||||
ret = xlate_iter_errno(ret);
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = block_get(&blk, blkno, 0);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
lblk = block_buf(blk);
|
||||
/* XXX verify block */
|
||||
/* XXX sort? maybe */
|
||||
|
||||
ref = lblk->next;
|
||||
|
||||
block_put(&blk);
|
||||
}
|
||||
|
||||
ret = 0;
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
int alloc_root_meta_iter(struct scoutfs_alloc_root *root, extent_cb_t cb, void *cb_arg)
|
||||
{
|
||||
return btree_meta_iter(&root->root, cb, cb_arg);
|
||||
}
|
||||
|
||||
int alloc_list_extent_iter(struct scoutfs_alloc_list_head *lhead, extent_cb_t cb, void *cb_arg)
|
||||
{
|
||||
struct scoutfs_alloc_list_block *lblk;
|
||||
struct scoutfs_block_ref ref;
|
||||
struct block *blk = NULL;
|
||||
u64 blkno;
|
||||
int ret;
|
||||
int i;
|
||||
|
||||
ref = lhead->ref;
|
||||
|
||||
while (ref.blkno) {
|
||||
blkno = le64_to_cpu(ref.blkno);
|
||||
|
||||
ret = block_get(&blk, blkno, 0);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
sns_push("alloc_list_block", blkno, 0);
|
||||
|
||||
lblk = block_buf(blk);
|
||||
/* XXX verify block */
|
||||
/* XXX sort? maybe */
|
||||
|
||||
ret = 0;
|
||||
for (i = 0; i < le32_to_cpu(lblk->nr); i++) {
|
||||
blkno = le64_to_cpu(lblk->blknos[le32_to_cpu(lblk->start) + i]);
|
||||
|
||||
ret = cb(blkno, 1, cb_arg);
|
||||
if (ret < 0)
|
||||
break;
|
||||
}
|
||||
|
||||
ref = lblk->next;
|
||||
|
||||
block_put(&blk);
|
||||
sns_pop();
|
||||
if (ret < 0) {
|
||||
ret = xlate_iter_errno(ret);
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
ret = 0;
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static bool valid_free_extent_key(struct scoutfs_key *key)
|
||||
{
|
||||
return (key->sk_zone == SCOUTFS_FREE_EXTENT_BLKNO_ZONE ||
|
||||
key->sk_zone == SCOUTFS_FREE_EXTENT_ORDER_ZONE) &&
|
||||
(!key->_sk_fourth && !key->sk_type &&
|
||||
(key->sk_zone == SCOUTFS_FREE_EXTENT_ORDER_ZONE || !key->_sk_third));
|
||||
}
|
||||
|
||||
static int free_item_cb(struct scoutfs_key *key, void *val, u16 val_len, void *cb_arg)
|
||||
{
|
||||
struct extent_cb_arg_t *ecba = cb_arg;
|
||||
u64 start;
|
||||
u64 len;
|
||||
|
||||
/* XXX not sure these eios are what we want */
|
||||
|
||||
if (val_len != 0)
|
||||
return -EIO;
|
||||
|
||||
if (!valid_free_extent_key(key))
|
||||
return -EIO;
|
||||
|
||||
if (key->sk_zone == SCOUTFS_FREE_EXTENT_ORDER_ZONE)
|
||||
return -ECHECK_ITER_DONE;
|
||||
|
||||
start = le64_to_cpu(key->skfb_end) - le64_to_cpu(key->skfb_len) + 1;
|
||||
len = le64_to_cpu(key->skfb_len);
|
||||
|
||||
return ecba->cb(start, len, ecba->cb_arg);
|
||||
}
|
||||
|
||||
/*
|
||||
* Call the callback with each of the primary BLKNO free extents stored
|
||||
* in item in the given alloc root. It doesn't visit the secondary
|
||||
* ORDER extents.
|
||||
*/
|
||||
int alloc_root_extent_iter(struct scoutfs_alloc_root *root, extent_cb_t cb, void *cb_arg)
|
||||
{
|
||||
struct extent_cb_arg_t ecba = { .cb = cb, .cb_arg = cb_arg };
|
||||
|
||||
return btree_item_iter(&root->root, free_item_cb, &ecba);
|
||||
}
|
||||
12
utils/src/check/alloc.h
Normal file
12
utils/src/check/alloc.h
Normal file
@@ -0,0 +1,12 @@
|
||||
#ifndef _SCOUTFS_UTILS_CHECK_ALLOC_H
|
||||
#define _SCOUTFS_UTILS_CHECK_ALLOC_H
|
||||
|
||||
#include "extent.h"
|
||||
|
||||
int alloc_list_meta_iter(struct scoutfs_alloc_list_head *lhead, extent_cb_t cb, void *cb_arg);
|
||||
int alloc_root_meta_iter(struct scoutfs_alloc_root *root, extent_cb_t cb, void *cb_arg);
|
||||
|
||||
int alloc_list_extent_iter(struct scoutfs_alloc_list_head *lhead, extent_cb_t cb, void *cb_arg);
|
||||
int alloc_root_extent_iter(struct scoutfs_alloc_root *root, extent_cb_t cb, void *cb_arg);
|
||||
|
||||
#endif
|
||||
564
utils/src/check/block.c
Normal file
564
utils/src/check/block.c
Normal file
@@ -0,0 +1,564 @@
|
||||
#define _ISOC11_SOURCE /* aligned_alloc */
|
||||
#define _DEFAULT_SOURCE /* syscall() */
|
||||
#include <stdlib.h>
|
||||
#include <unistd.h>
|
||||
#include <stdbool.h>
|
||||
#include <stdio.h>
|
||||
#include <errno.h>
|
||||
#include <sys/syscall.h>
|
||||
#include <linux/aio_abi.h>
|
||||
|
||||
#include "sparse.h"
|
||||
#include "util.h"
|
||||
#include "format.h"
|
||||
#include "list.h"
|
||||
#include "cmp.h"
|
||||
#include "hash.h"
|
||||
|
||||
#include "block.h"
|
||||
#include "debug.h"
|
||||
#include "eno.h"
|
||||
|
||||
static struct block_data {
|
||||
struct list_head *hash_lists;
|
||||
size_t hash_nr;
|
||||
|
||||
struct list_head active_head;
|
||||
struct list_head inactive_head;
|
||||
struct list_head dirty_list;
|
||||
size_t nr_active;
|
||||
size_t nr_inactive;
|
||||
size_t nr_dirty;
|
||||
|
||||
int meta_fd;
|
||||
size_t max_cached;
|
||||
size_t nr_events;
|
||||
|
||||
aio_context_t ctx;
|
||||
struct iocb *iocbs;
|
||||
struct iocb **iocbps;
|
||||
struct io_event *events;
|
||||
} global_bdat;
|
||||
|
||||
struct block {
|
||||
struct list_head hash_head;
|
||||
struct list_head lru_head;
|
||||
struct list_head dirty_head;
|
||||
struct list_head submit_head;
|
||||
unsigned long refcount;
|
||||
unsigned long uptodate:1,
|
||||
active:1;
|
||||
u64 blkno;
|
||||
void *buf;
|
||||
size_t size;
|
||||
};
|
||||
|
||||
#define BLK_FMT \
|
||||
"blkno %llu rc %ld d %u a %u"
|
||||
#define BLK_ARG(blk) \
|
||||
(blk)->blkno, (blk)->refcount, !list_empty(&(blk)->dirty_head), blk->active
|
||||
#define debug_blk(blk, fmt, args...) \
|
||||
debug(fmt " " BLK_FMT, ##args, BLK_ARG(blk))
|
||||
|
||||
/*
|
||||
* This just allocates and initialzies the block. The caller is
|
||||
* responsible for putting it on the appropriate initial lists and
|
||||
* managing refcounts.
|
||||
*/
|
||||
static struct block *alloc_block(struct block_data *bdat, u64 blkno, size_t size)
|
||||
{
|
||||
struct block *blk;
|
||||
|
||||
blk = calloc(1, sizeof(struct block));
|
||||
if (blk) {
|
||||
blk->buf = aligned_alloc(4096, size); /* XXX static alignment :/ */
|
||||
if (!blk->buf) {
|
||||
free(blk);
|
||||
blk = NULL;
|
||||
} else {
|
||||
INIT_LIST_HEAD(&blk->hash_head);
|
||||
INIT_LIST_HEAD(&blk->lru_head);
|
||||
INIT_LIST_HEAD(&blk->dirty_head);
|
||||
INIT_LIST_HEAD(&blk->submit_head);
|
||||
blk->blkno = blkno;
|
||||
blk->size = size;
|
||||
}
|
||||
}
|
||||
|
||||
return blk;
|
||||
}
|
||||
|
||||
static void free_block(struct block_data *bdat, struct block *blk)
|
||||
{
|
||||
debug_blk(blk, "free");
|
||||
|
||||
if (!list_empty(&blk->lru_head)) {
|
||||
if (blk->active)
|
||||
bdat->nr_active--;
|
||||
else
|
||||
bdat->nr_inactive--;
|
||||
list_del(&blk->lru_head);
|
||||
}
|
||||
|
||||
if (!list_empty(&blk->dirty_head)) {
|
||||
bdat->nr_dirty--;
|
||||
list_del(&blk->dirty_head);
|
||||
}
|
||||
|
||||
if (!list_empty(&blk->hash_head))
|
||||
list_del(&blk->hash_head);
|
||||
|
||||
if (!list_empty(&blk->submit_head))
|
||||
list_del(&blk->submit_head);
|
||||
|
||||
free(blk->buf);
|
||||
free(blk);
|
||||
}
|
||||
|
||||
static bool blk_is_dirty(struct block *blk)
|
||||
{
|
||||
return !list_empty(&blk->dirty_head);
|
||||
}
|
||||
|
||||
/*
|
||||
* Rebalance the cache.
|
||||
*
|
||||
* First we shrink the cache to limit it to max_cached blocks.
|
||||
* Logically, we walk from oldest to newest in the inactive list and
|
||||
* then in the active list. Since these lists are physically one
|
||||
* list_head list we achieve this with a reverse walk starting from the
|
||||
* active head.
|
||||
*
|
||||
* Then we rebalnace the size of the two lists. The constraint is that
|
||||
* we don't let the active list grow larger than the inactive list. We
|
||||
* move blocks from the oldest tail of the active list to the newest
|
||||
* head of the inactive list.
|
||||
*
|
||||
* <- [active head] <-> [ .. active list .. ] <-> [inactive head] <-> [ .. inactive list .. ] ->
|
||||
*/
|
||||
static void rebalance_cache(struct block_data *bdat)
|
||||
{
|
||||
struct block *blk;
|
||||
struct block *blk_;
|
||||
|
||||
list_for_each_entry_safe_reverse(blk, blk_, &bdat->active_head, lru_head) {
|
||||
if ((bdat->nr_active + bdat->nr_inactive) < bdat->max_cached)
|
||||
break;
|
||||
|
||||
if (&blk->lru_head == &bdat->inactive_head || blk->refcount > 0 ||
|
||||
blk_is_dirty(blk))
|
||||
continue;
|
||||
|
||||
free_block(bdat, blk);
|
||||
}
|
||||
|
||||
list_for_each_entry_safe_reverse(blk, blk_, &bdat->inactive_head, lru_head) {
|
||||
if (bdat->nr_active <= bdat->nr_inactive || &blk->lru_head == &bdat->active_head)
|
||||
break;
|
||||
|
||||
list_move(&blk->lru_head, &bdat->inactive_head);
|
||||
blk->active = 0;
|
||||
bdat->nr_active--;
|
||||
bdat->nr_inactive++;
|
||||
}
|
||||
}
|
||||
|
||||
static void make_active(struct block_data *bdat, struct block *blk)
|
||||
{
|
||||
if (!blk->active) {
|
||||
if (!list_empty(&blk->lru_head)) {
|
||||
list_move(&blk->lru_head, &bdat->active_head);
|
||||
bdat->nr_inactive--;
|
||||
} else {
|
||||
list_add(&blk->lru_head, &bdat->active_head);
|
||||
}
|
||||
|
||||
blk->active = 1;
|
||||
bdat->nr_active++;
|
||||
}
|
||||
}
|
||||
|
||||
static int compar_iocbp(const void *A, const void *B)
|
||||
{
|
||||
struct iocb *a = *(struct iocb **)A;
|
||||
struct iocb *b = *(struct iocb **)B;
|
||||
|
||||
return scoutfs_cmp(a->aio_offset, b->aio_offset);
|
||||
}
|
||||
|
||||
static int submit_and_wait(struct block_data *bdat, struct list_head *list)
|
||||
{
|
||||
struct io_event *event;
|
||||
struct iocb *iocb;
|
||||
struct block *blk;
|
||||
int ret;
|
||||
int err;
|
||||
int nr;
|
||||
int i;
|
||||
|
||||
err = 0;
|
||||
nr = 0;
|
||||
list_for_each_entry(blk, list, submit_head) {
|
||||
iocb = &bdat->iocbs[nr];
|
||||
bdat->iocbps[nr] = iocb;
|
||||
|
||||
memset(iocb, 0, sizeof(struct iocb));
|
||||
|
||||
iocb->aio_data = (intptr_t)blk;
|
||||
iocb->aio_lio_opcode = blk_is_dirty(blk) ? IOCB_CMD_PWRITE : IOCB_CMD_PREAD;
|
||||
iocb->aio_fildes = bdat->meta_fd;
|
||||
iocb->aio_buf = (intptr_t)blk->buf;
|
||||
iocb->aio_nbytes = blk->size;
|
||||
iocb->aio_offset = blk->blkno * blk->size;
|
||||
|
||||
nr++;
|
||||
|
||||
debug_blk(blk, "submit");
|
||||
|
||||
if ((nr < bdat->nr_events) && blk->submit_head.next != list)
|
||||
continue;
|
||||
|
||||
qsort(bdat->iocbps, nr, sizeof(bdat->iocbps[0]), compar_iocbp);
|
||||
|
||||
ret = syscall(__NR_io_submit, bdat->ctx, nr, bdat->iocbps);
|
||||
if (ret != nr) {
|
||||
if (ret >= 0)
|
||||
errno = EIO;
|
||||
ret = -errno;
|
||||
printf("fatal system error submitting async IO: "ENO_FMT"\n",
|
||||
ENO_ARG(-ret));
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = syscall(__NR_io_getevents, bdat->ctx, nr, nr, bdat->events, NULL);
|
||||
if (ret != nr) {
|
||||
if (ret >= 0)
|
||||
errno = EIO;
|
||||
ret = -errno;
|
||||
printf("fatal system error getting IO events: "ENO_FMT"\n",
|
||||
ENO_ARG(-ret));
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = 0;
|
||||
for (i = 0; i < nr; i++) {
|
||||
event = &bdat->events[i];
|
||||
iocb = (struct iocb *)(intptr_t)event->obj;
|
||||
blk = (struct block *)(intptr_t)event->data;
|
||||
|
||||
debug_blk(blk, "complete res %lld", (long long)event->res);
|
||||
|
||||
if (event->res >= 0 && event->res != blk->size)
|
||||
event->res = -EIO;
|
||||
|
||||
/* io errors are fatal */
|
||||
if (event->res < 0) {
|
||||
ret = event->res;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (iocb->aio_lio_opcode == IOCB_CMD_PREAD) {
|
||||
blk->uptodate = 1;
|
||||
} else {
|
||||
list_del_init(&blk->dirty_head);
|
||||
bdat->nr_dirty--;
|
||||
}
|
||||
}
|
||||
nr = 0;
|
||||
}
|
||||
|
||||
ret = 0;
|
||||
out:
|
||||
return ret ?: err;
|
||||
}
|
||||
|
||||
static void inc_refcount(struct block *blk)
|
||||
{
|
||||
blk->refcount++;
|
||||
}
|
||||
|
||||
void block_put(struct block **blkp)
|
||||
{
|
||||
struct block_data *bdat = &global_bdat;
|
||||
struct block *blk = *blkp;
|
||||
|
||||
if (blk) {
|
||||
blk->refcount--;
|
||||
*blkp = NULL;
|
||||
|
||||
rebalance_cache(bdat);
|
||||
}
|
||||
}
|
||||
|
||||
static struct list_head *hash_bucket(struct block_data *bdat, u64 blkno)
|
||||
{
|
||||
u32 hash = scoutfs_hash32(&blkno, sizeof(blkno));
|
||||
|
||||
return &bdat->hash_lists[hash % bdat->hash_nr];
|
||||
}
|
||||
|
||||
static struct block *get_or_alloc(struct block_data *bdat, u64 blkno, int bf)
|
||||
{
|
||||
struct list_head *bucket = hash_bucket(bdat, blkno);
|
||||
struct block *search;
|
||||
struct block *blk;
|
||||
size_t size;
|
||||
|
||||
size = (bf & BF_SM) ? SCOUTFS_BLOCK_SM_SIZE : SCOUTFS_BLOCK_LG_SIZE;
|
||||
|
||||
blk = NULL;
|
||||
list_for_each_entry(search, bucket, hash_head) {
|
||||
if (search->blkno && blkno && search->size == size) {
|
||||
blk = search;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!blk) {
|
||||
blk = alloc_block(bdat, blkno, size);
|
||||
if (blk) {
|
||||
list_add(&blk->hash_head, bucket);
|
||||
list_add(&blk->lru_head, &bdat->inactive_head);
|
||||
bdat->nr_inactive++;
|
||||
}
|
||||
}
|
||||
if (blk)
|
||||
inc_refcount(blk);
|
||||
|
||||
return blk;
|
||||
}
|
||||
|
||||
/*
|
||||
* Get a block.
|
||||
*
|
||||
* The caller holds a refcount to the block while it's in use that
|
||||
* prevents it from being removed from the cache. It must be dropped
|
||||
* with block_put();
|
||||
*/
|
||||
int block_get(struct block **blk_ret, u64 blkno, int bf)
|
||||
{
|
||||
struct block_data *bdat = &global_bdat;
|
||||
struct block *blk;
|
||||
LIST_HEAD(list);
|
||||
int ret;
|
||||
|
||||
blk = get_or_alloc(bdat, blkno, bf);
|
||||
if (!blk) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if ((bf & BF_ZERO)) {
|
||||
memset(blk->buf, 0, blk->size);
|
||||
blk->uptodate = 1;
|
||||
}
|
||||
|
||||
if (bf & BF_OVERWRITE)
|
||||
blk->uptodate = 1;
|
||||
|
||||
if (!blk->uptodate) {
|
||||
list_add(&blk->submit_head, &list);
|
||||
ret = submit_and_wait(bdat, &list);
|
||||
list_del_init(&blk->submit_head);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
}
|
||||
|
||||
if ((bf & BF_DIRTY) && !blk_is_dirty(blk)) {
|
||||
list_add_tail(&bdat->dirty_list, &blk->dirty_head);
|
||||
bdat->nr_dirty++;
|
||||
}
|
||||
|
||||
make_active(bdat, blk);
|
||||
|
||||
rebalance_cache(bdat);
|
||||
ret = 0;
|
||||
out:
|
||||
if (ret < 0)
|
||||
block_put(&blk);
|
||||
*blk_ret = blk;
|
||||
return ret;
|
||||
}
|
||||
|
||||
void *block_buf(struct block *blk)
|
||||
{
|
||||
return blk->buf;
|
||||
}
|
||||
|
||||
size_t block_size(struct block *blk)
|
||||
{
|
||||
return blk->size;
|
||||
}
|
||||
|
||||
/*
|
||||
* Drop the block from the cache, regardless of if it was free or not.
|
||||
* This is used to avoid writing blocks which were dirtied but then
|
||||
* later freed.
|
||||
*
|
||||
* The block is immediately freed and can't be referenced after this
|
||||
* returns.
|
||||
*/
|
||||
void block_drop(struct block **blkp)
|
||||
{
|
||||
struct block_data *bdat = &global_bdat;
|
||||
|
||||
free_block(bdat, *blkp);
|
||||
*blkp = NULL;
|
||||
rebalance_cache(bdat);
|
||||
}
|
||||
|
||||
/*
|
||||
* This doesn't quite work for mixing large and small blocks, but that's
|
||||
* fine, we never do that.
|
||||
*/
|
||||
static int compar_u64(const void *A, const void *B)
|
||||
{
|
||||
u64 a = *((u64 *)A);
|
||||
u64 b = *((u64 *)B);
|
||||
|
||||
return scoutfs_cmp(a, b);
|
||||
}
|
||||
|
||||
/*
|
||||
* This read-ahead is synchronous and errors are ignored. If any of the
|
||||
* blknos aren't present in the cache then we issue concurrent reads for
|
||||
* them and wait. Any existing cached blocks will be left as is.
|
||||
*
|
||||
* We might be trying to read a lot more than the number of events so we
|
||||
* sort the caller's blknos before iterating over them rather than
|
||||
* relying on submission sorting the blocks in each submitted set.
|
||||
*/
|
||||
void block_readahead(u64 *blknos, size_t nr)
|
||||
{
|
||||
struct block_data *bdat = &global_bdat;
|
||||
struct block *blk;
|
||||
struct block *blk_;
|
||||
LIST_HEAD(list);
|
||||
size_t i;
|
||||
|
||||
if (nr == 0)
|
||||
return;
|
||||
|
||||
qsort(blknos, nr, sizeof(blknos[0]), compar_u64);
|
||||
|
||||
for (i = 0; i < nr; i++) {
|
||||
blk = get_or_alloc(bdat, blknos[i], 0);
|
||||
if (blk) {
|
||||
if (!blk->uptodate)
|
||||
list_add_tail(&blk->submit_head, &list);
|
||||
else
|
||||
block_put(&blk);
|
||||
}
|
||||
}
|
||||
|
||||
(void)submit_and_wait(bdat, &list);
|
||||
|
||||
list_for_each_entry_safe(blk, blk_, &list, submit_head) {
|
||||
list_del_init(&blk->submit_head);
|
||||
block_put(&blk);
|
||||
}
|
||||
|
||||
rebalance_cache(bdat);
|
||||
}
|
||||
|
||||
/*
|
||||
* The caller's block changes form a consistent transaction. If the amount of dirty
|
||||
* blocks is large enough we issue a write.
|
||||
*/
|
||||
int block_try_commit(bool force)
|
||||
{
|
||||
struct block_data *bdat = &global_bdat;
|
||||
struct block *blk;
|
||||
struct block *blk_;
|
||||
LIST_HEAD(list);
|
||||
int ret;
|
||||
|
||||
if (!force && bdat->nr_dirty < bdat->nr_events)
|
||||
return 0;
|
||||
|
||||
list_for_each_entry(blk, &bdat->dirty_list, dirty_head) {
|
||||
list_add_tail(&blk->submit_head, &list);
|
||||
inc_refcount(blk);
|
||||
}
|
||||
|
||||
ret = submit_and_wait(bdat, &list);
|
||||
|
||||
list_for_each_entry_safe(blk, blk_, &list, submit_head) {
|
||||
list_del_init(&blk->submit_head);
|
||||
block_put(&blk);
|
||||
}
|
||||
|
||||
if (ret < 0) {
|
||||
printf("error writing dirty transaction blocks\n");
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = block_get(&blk, SCOUTFS_SUPER_BLKNO, BF_SM | BF_OVERWRITE | BF_DIRTY);
|
||||
if (ret == 0) {
|
||||
list_add(&blk->submit_head, &list);
|
||||
ret = submit_and_wait(bdat, &list);
|
||||
list_del_init(&blk->submit_head);
|
||||
block_put(&blk);
|
||||
} else {
|
||||
ret = -ENOMEM;
|
||||
}
|
||||
if (ret < 0)
|
||||
printf("error writing super block to commit transaction\n");
|
||||
|
||||
out:
|
||||
rebalance_cache(bdat);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int block_setup(int meta_fd, size_t max_cached_bytes, size_t max_dirty_bytes)
|
||||
{
|
||||
struct block_data *bdat = &global_bdat;
|
||||
size_t i;
|
||||
int ret;
|
||||
|
||||
bdat->max_cached = DIV_ROUND_UP(max_cached_bytes, SCOUTFS_BLOCK_LG_SIZE);
|
||||
bdat->hash_nr = bdat->max_cached / 4;
|
||||
bdat->nr_events = DIV_ROUND_UP(max_dirty_bytes, SCOUTFS_BLOCK_LG_SIZE);
|
||||
|
||||
bdat->iocbs = calloc(bdat->nr_events, sizeof(bdat->iocbs[0]));
|
||||
bdat->iocbps = calloc(bdat->nr_events, sizeof(bdat->iocbps[0]));
|
||||
bdat->events = calloc(bdat->nr_events, sizeof(bdat->events[0]));
|
||||
bdat->hash_lists = calloc(bdat->hash_nr, sizeof(bdat->hash_lists[0]));
|
||||
if (!bdat->iocbs || !bdat->iocbps || !bdat->events || !bdat->hash_lists) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
INIT_LIST_HEAD(&bdat->active_head);
|
||||
INIT_LIST_HEAD(&bdat->inactive_head);
|
||||
INIT_LIST_HEAD(&bdat->dirty_list);
|
||||
bdat->meta_fd = meta_fd;
|
||||
list_add(&bdat->inactive_head, &bdat->active_head);
|
||||
|
||||
for (i = 0; i < bdat->hash_nr; i++)
|
||||
INIT_LIST_HEAD(&bdat->hash_lists[i]);
|
||||
|
||||
ret = syscall(__NR_io_setup, bdat->nr_events, &bdat->ctx);
|
||||
|
||||
out:
|
||||
if (ret < 0) {
|
||||
free(bdat->iocbs);
|
||||
free(bdat->iocbps);
|
||||
free(bdat->events);
|
||||
free(bdat->hash_lists);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
void block_shutdown(void)
|
||||
{
|
||||
struct block_data *bdat = &global_bdat;
|
||||
|
||||
syscall(SYS_io_destroy, bdat->ctx);
|
||||
|
||||
free(bdat->iocbs);
|
||||
free(bdat->iocbps);
|
||||
free(bdat->events);
|
||||
free(bdat->hash_lists);
|
||||
}
|
||||
32
utils/src/check/block.h
Normal file
32
utils/src/check/block.h
Normal file
@@ -0,0 +1,32 @@
|
||||
#ifndef _SCOUTFS_UTILS_CHECK_BLOCK_H_
|
||||
#define _SCOUTFS_UTILS_CHECK_BLOCK_H_
|
||||
|
||||
#include <unistd.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
struct block;
|
||||
|
||||
#include "sparse.h"
|
||||
|
||||
/* block flags passed to block_get() */
|
||||
enum {
|
||||
BF_ZERO = (1 << 0), /* zero contents buf as block is returned */
|
||||
BF_DIRTY = (1 << 1), /* block will be written with transaction */
|
||||
BF_SM = (1 << 2), /* small 4k block instead of large 64k block */
|
||||
BF_OVERWRITE = (1 << 3), /* caller will overwrite contents, don't read */
|
||||
};
|
||||
|
||||
int block_get(struct block **blk_ret, u64 blkno, int bf);
|
||||
void block_put(struct block **blkp);
|
||||
|
||||
void *block_buf(struct block *blk);
|
||||
size_t block_size(struct block *blk);
|
||||
void block_drop(struct block **blkp);
|
||||
|
||||
void block_readahead(u64 *blknos, size_t nr);
|
||||
int block_try_commit(bool force);
|
||||
|
||||
int block_setup(int meta_fd, size_t max_cached_bytes, size_t max_dirty_bytes);
|
||||
void block_shutdown(void);
|
||||
|
||||
#endif
|
||||
209
utils/src/check/btree.c
Normal file
209
utils/src/check/btree.c
Normal file
@@ -0,0 +1,209 @@
|
||||
#include <unistd.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdbool.h>
|
||||
#include <errno.h>
|
||||
|
||||
#include "sparse.h"
|
||||
#include "util.h"
|
||||
#include "format.h"
|
||||
#include "key.h"
|
||||
#include "avl.h"
|
||||
|
||||
#include "block.h"
|
||||
#include "btree.h"
|
||||
#include "extent.h"
|
||||
#include "iter.h"
|
||||
#include "sns.h"
|
||||
#include "meta.h"
|
||||
#include "problem.h"
|
||||
|
||||
static inline void *item_val(struct scoutfs_btree_block *bt, struct scoutfs_btree_item *item)
|
||||
{
|
||||
return (void *)bt + le16_to_cpu(item->val_off);
|
||||
}
|
||||
|
||||
static void readahead_refs(struct scoutfs_btree_block *bt)
|
||||
{
|
||||
struct scoutfs_btree_item *item;
|
||||
struct scoutfs_avl_node *node;
|
||||
struct scoutfs_block_ref *ref;
|
||||
u64 *blknos;
|
||||
u64 blkno;
|
||||
u16 valid = 0;
|
||||
u16 nr = le16_to_cpu(bt->nr_items);
|
||||
int i;
|
||||
|
||||
blknos = calloc(nr, sizeof(blknos[0]));
|
||||
if (!blknos)
|
||||
return;
|
||||
|
||||
node = avl_first(&bt->item_root);
|
||||
|
||||
for (i = 0; i < nr; i++) {
|
||||
item = container_of(node, struct scoutfs_btree_item, node);
|
||||
ref = item_val(bt, item);
|
||||
blkno = le64_to_cpu(ref->blkno);
|
||||
|
||||
if (valid_meta_blkno(blkno))
|
||||
blknos[valid++] = blkno;
|
||||
|
||||
node = avl_next(&bt->item_root, &item->node);
|
||||
}
|
||||
|
||||
if (valid > 0)
|
||||
block_readahead(blknos, valid);
|
||||
free(blknos);
|
||||
}
|
||||
|
||||
/*
|
||||
* Call the callback on the referenced block. Then if the block
|
||||
* contains references, read it and recurse into all of its references.
|
||||
*/
|
||||
static int btree_ref_meta_iter(struct scoutfs_block_ref *ref, unsigned level, extent_cb_t cb,
|
||||
void *cb_arg)
|
||||
{
|
||||
struct scoutfs_btree_item *item;
|
||||
struct scoutfs_btree_block *bt;
|
||||
struct scoutfs_avl_node *node;
|
||||
struct block *blk = NULL;
|
||||
u64 blkno;
|
||||
int ret;
|
||||
int i;
|
||||
|
||||
blkno = le64_to_cpu(ref->blkno);
|
||||
if (!blkno)
|
||||
return 0;
|
||||
|
||||
ret = cb(blkno, 1, cb_arg);
|
||||
if (ret < 0) {
|
||||
ret = xlate_iter_errno(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (level == 0)
|
||||
return 0;
|
||||
|
||||
ret = block_get(&blk, blkno, 0);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
sns_push("btree_parent", blkno, 0);
|
||||
|
||||
bt = block_buf(blk);
|
||||
|
||||
/* XXX integrate verification with block cache */
|
||||
if (bt->level != level) {
|
||||
problem(PB_BTREE_BLOCK_BAD_LEVEL, "expected %u level %u", level, bt->level);
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* read-ahead last level of parents */
|
||||
if (level == 2)
|
||||
readahead_refs(bt);
|
||||
|
||||
node = avl_first(&bt->item_root);
|
||||
|
||||
for (i = 0; i < le16_to_cpu(bt->nr_items); i++) {
|
||||
item = container_of(node, struct scoutfs_btree_item, node);
|
||||
ref = item_val(bt, item);
|
||||
|
||||
ret = btree_ref_meta_iter(ref, level - 1, cb, cb_arg);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
node = avl_next(&bt->item_root, &item->node);
|
||||
}
|
||||
|
||||
ret = 0;
|
||||
out:
|
||||
block_put(&blk);
|
||||
sns_pop();
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int btree_meta_iter(struct scoutfs_btree_root *root, extent_cb_t cb, void *cb_arg)
|
||||
{
|
||||
/* XXX check root */
|
||||
if (root->height == 0)
|
||||
return 0;
|
||||
|
||||
return btree_ref_meta_iter(&root->ref, root->height - 1, cb, cb_arg);
|
||||
}
|
||||
|
||||
static int btree_ref_item_iter(struct scoutfs_block_ref *ref, unsigned level,
|
||||
btree_item_cb_t cb, void *cb_arg)
|
||||
{
|
||||
struct scoutfs_btree_item *item;
|
||||
struct scoutfs_btree_block *bt;
|
||||
struct scoutfs_avl_node *node;
|
||||
struct block *blk = NULL;
|
||||
u64 blkno;
|
||||
int ret;
|
||||
int i;
|
||||
|
||||
blkno = le64_to_cpu(ref->blkno);
|
||||
if (!blkno)
|
||||
return 0;
|
||||
|
||||
ret = block_get(&blk, blkno, 0);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
if (level)
|
||||
sns_push("btree_parent", blkno, 0);
|
||||
else
|
||||
sns_push("btree_leaf", blkno, 0);
|
||||
|
||||
bt = block_buf(blk);
|
||||
|
||||
/* XXX integrate verification with block cache */
|
||||
if (bt->level != level) {
|
||||
problem(PB_BTREE_BLOCK_BAD_LEVEL, "expected %u level %u", level, bt->level);
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* read-ahead leaves that contain items */
|
||||
if (level == 1)
|
||||
readahead_refs(bt);
|
||||
|
||||
node = avl_first(&bt->item_root);
|
||||
|
||||
for (i = 0; i < le16_to_cpu(bt->nr_items); i++) {
|
||||
item = container_of(node, struct scoutfs_btree_item, node);
|
||||
|
||||
if (level) {
|
||||
ref = item_val(bt, item);
|
||||
ret = btree_ref_item_iter(ref, level - 1, cb, cb_arg);
|
||||
} else {
|
||||
ret = cb(&item->key, item_val(bt, item),
|
||||
le16_to_cpu(item->val_len), cb_arg);
|
||||
debug("free item key "SK_FMT" ret %d", SK_ARG(&item->key), ret);
|
||||
}
|
||||
if (ret < 0) {
|
||||
ret = xlate_iter_errno(ret);
|
||||
goto out;
|
||||
}
|
||||
|
||||
node = avl_next(&bt->item_root, &item->node);
|
||||
}
|
||||
|
||||
ret = 0;
|
||||
out:
|
||||
block_put(&blk);
|
||||
sns_pop();
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int btree_item_iter(struct scoutfs_btree_root *root, btree_item_cb_t cb, void *cb_arg)
|
||||
{
|
||||
/* XXX check root */
|
||||
if (root->height == 0)
|
||||
return 0;
|
||||
|
||||
return btree_ref_item_iter(&root->ref, root->height - 1, cb, cb_arg);
|
||||
}
|
||||
14
utils/src/check/btree.h
Normal file
@@ -0,0 +1,14 @@
|
||||
#ifndef _SCOUTFS_UTILS_CHECK_BTREE_H_
|
||||
#define _SCOUTFS_UTILS_CHECK_BTREE_H_
|
||||
|
||||
#include "util.h"
|
||||
#include "format.h"
|
||||
|
||||
#include "extent.h"
|
||||
|
||||
typedef int (*btree_item_cb_t)(struct scoutfs_key *key, void *val, u16 val_len, void *cb_arg);
|
||||
|
||||
int btree_meta_iter(struct scoutfs_btree_root *root, extent_cb_t cb, void *cb_arg);
|
||||
int btree_item_iter(struct scoutfs_btree_root *root, btree_item_cb_t cb, void *cb_arg);
|
||||
|
||||
#endif
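
A rough usage sketch of the item iterator, not part of the change; the count_item_cb() callback and count_items() wrapper are hypothetical names:

/* illustrative only: count leaf items in a btree with btree_item_iter() */
static int count_item_cb(struct scoutfs_key *key, void *val, u16 val_len, void *cb_arg)
{
	u64 *nr = cb_arg;

	(*nr)++;
	return 0;
}

static int count_items(struct scoutfs_btree_root *root, u64 *nr)
{
	*nr = 0;
	return btree_item_iter(root, count_item_cb, nr);
}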
|
||||
149
utils/src/check/check.c
Normal file
@@ -0,0 +1,149 @@
|
||||
#define _GNU_SOURCE /* O_DIRECT */
|
||||
#include <unistd.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/ioctl.h>
|
||||
#include <fcntl.h>
|
||||
#include <errno.h>
|
||||
#include <string.h>
|
||||
#include <assert.h>
|
||||
#include <stdbool.h>
|
||||
#include <argp.h>
|
||||
|
||||
#include "sparse.h"
|
||||
#include "parse.h"
|
||||
#include "util.h"
|
||||
#include "format.h"
|
||||
#include "ioctl.h"
|
||||
#include "cmd.h"
|
||||
#include "dev.h"
|
||||
|
||||
#include "alloc.h"
|
||||
#include "block.h"
|
||||
#include "debug.h"
|
||||
#include "meta.h"
|
||||
#include "super.h"
|
||||
|
||||
struct check_args {
|
||||
char *meta_device;
|
||||
char *data_device;
|
||||
char *debug_path;
|
||||
};
|
||||
|
||||
static int do_check(struct check_args *args)
|
||||
{
|
||||
int debug_fd = -1;
|
||||
int meta_fd = -1;
|
||||
int data_fd = -1;
|
||||
int ret;
|
||||
|
||||
if (args->debug_path) {
|
||||
debug_fd = open(args->debug_path, O_WRONLY | O_CREAT | O_TRUNC, 0644);
|
||||
if (debug_fd < 0) {
|
||||
ret = -errno;
|
||||
fprintf(stderr, "error opening debug output file '%s': %s (%d)\n",
|
||||
args->debug_path, strerror(errno), errno);
|
||||
goto out;
|
||||
}
|
||||
|
||||
debug_enable(debug_fd);
|
||||
}
|
||||
|
||||
meta_fd = open(args->meta_device, O_DIRECT | O_RDWR | O_EXCL);
|
||||
if (meta_fd < 0) {
|
||||
ret = -errno;
|
||||
fprintf(stderr, "failed to open meta device '%s': %s (%d)\n",
|
||||
args->meta_device, strerror(errno), errno);
|
||||
goto out;
|
||||
}
|
||||
|
||||
data_fd = open(args->data_device, O_DIRECT | O_RDWR | O_EXCL);
|
||||
if (data_fd < 0) {
|
||||
ret = -errno;
|
||||
fprintf(stderr, "failed to open data device '%s': %s (%d)\n",
|
||||
args->data_device, strerror(errno), errno);
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = block_setup(meta_fd, 128 * 1024 * 1024, 32 * 1024 * 1024);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
ret = check_supers() ?:
|
||||
check_meta_alloc();
|
||||
out:
|
||||
/* and tear it all down */
|
||||
block_shutdown();
|
||||
super_shutdown();
|
||||
debug_disable();
|
||||
|
||||
if (meta_fd >= 0)
|
||||
close(meta_fd);
|
||||
if (data_fd >= 0)
|
||||
close(data_fd);
|
||||
if (debug_fd >= 0)
|
||||
close(debug_fd);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int parse_opt(int key, char *arg, struct argp_state *state)
|
||||
{
|
||||
struct check_args *args = state->input;
|
||||
|
||||
switch (key) {
|
||||
case 'd':
|
||||
args->debug_path = strdup_or_error(state, arg);
|
||||
break;
|
||||
case 'e':
|
||||
case ARGP_KEY_ARG:
|
||||
if (!args->meta_device)
|
||||
args->meta_device = strdup_or_error(state, arg);
|
||||
else if (!args->data_device)
|
||||
args->data_device = strdup_or_error(state, arg);
|
||||
else
|
||||
argp_error(state, "more than two device arguments given");
|
||||
break;
|
||||
case ARGP_KEY_FINI:
|
||||
if (!args->meta_device)
|
||||
argp_error(state, "no metadata device argument given");
|
||||
if (!args->data_device)
|
||||
argp_error(state, "no data device argument given");
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct argp_option options[] = {
|
||||
{ "debug", 'd', "FILE_PATH", 0, "Path to debug output file, will be created or truncated"},
|
||||
{ NULL }
|
||||
};
|
||||
|
||||
static struct argp argp = {
|
||||
options,
|
||||
parse_opt,
|
||||
"META-DEVICE DATA-DEVICE",
|
||||
"Check filesystem consistency"
|
||||
};
|
||||
|
||||
static int check_cmd(int argc, char **argv)
|
||||
{
|
||||
struct check_args check_args = {NULL};
|
||||
int ret;
|
||||
|
||||
ret = argp_parse(&argp, argc, argv, 0, NULL, &check_args);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
return do_check(&check_args);
|
||||
}
|
||||
|
||||
static void __attribute__((constructor)) check_ctor(void)
|
||||
{
|
||||
cmd_register_argp("check", &argp, GROUP_CORE, check_cmd);
|
||||
}
|
||||
16
utils/src/check/debug.c
Normal file
@@ -0,0 +1,16 @@
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "debug.h"
|
||||
|
||||
int debug_fd = -1;
|
||||
|
||||
void debug_enable(int fd)
|
||||
{
|
||||
debug_fd = fd;
|
||||
}
|
||||
|
||||
void debug_disable(void)
|
||||
{
|
||||
if (debug_fd >= 0)
|
||||
debug_fd = -1;
|
||||
}
|
||||
17
utils/src/check/debug.h
Normal file
@@ -0,0 +1,17 @@
|
||||
#ifndef _SCOUTFS_UTILS_CHECK_DEBUG_H_
|
||||
#define _SCOUTFS_UTILS_CHECK_DEBUG_H_
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#define debug(fmt, args...) \
|
||||
do { \
|
||||
if (debug_fd >= 0) \
|
||||
dprintf(debug_fd, fmt"\n", ##args); \
|
||||
} while (0)
|
||||
|
||||
extern int debug_fd;
|
||||
|
||||
void debug_enable(int fd);
|
||||
void debug_disable(void);
|
||||
|
||||
#endif
|
||||
9
utils/src/check/eno.h
Normal file
@@ -0,0 +1,9 @@
|
||||
#ifndef _SCOUTFS_UTILS_CHECK_ENO_H_
|
||||
#define _SCOUTFS_UTILS_CHECK_ENO_H_
|
||||
|
||||
#include <errno.h>
|
||||
|
||||
#define ENO_FMT "%d (%s)"
|
||||
#define ENO_ARG(eno) eno, strerror(eno)
|
||||
|
||||
#endif
|
||||
312
utils/src/check/extent.c
Normal file
@@ -0,0 +1,312 @@
|
||||
#include <unistd.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdbool.h>
|
||||
#include <errno.h>
|
||||
|
||||
#include "util.h"
|
||||
#include "lk_rbtree_wrapper.h"
|
||||
|
||||
#include "debug.h"
|
||||
#include "extent.h"
|
||||
|
||||
/*
|
||||
* In-memory extent management in rbtree nodes.
|
||||
*/
|
||||
|
||||
bool extents_overlap(u64 a_start, u64 a_len, u64 b_start, u64 b_len)
|
||||
{
|
||||
u64 a_end = a_start + a_len;
|
||||
u64 b_end = b_start + b_len;
|
||||
|
||||
return !((a_end <= b_start) || (b_end <= a_start));
|
||||
}
|
||||
|
||||
static int ext_contains(struct extent_node *ext, u64 start, u64 len)
|
||||
{
|
||||
return ext->start <= start && ext->start + ext->len >= start + len;
|
||||
}
|
||||
|
||||
/*
|
||||
* True if the given extent is bisected by the given range; there's
|
||||
* leftover pieces of the extent remain on both the left and right sides of the
|
||||
* range.
|
||||
*/
|
||||
static int ext_bisected(struct extent_node *ext, u64 start, u64 len)
|
||||
{
|
||||
return ext->start < start && ext->start + ext->len > start + len;
|
||||
}
|
||||
|
||||
static struct extent_node *ext_from_rbnode(struct rb_node *rbnode)
|
||||
{
|
||||
return rbnode ? container_of(rbnode, struct extent_node, rbnode) : NULL;
|
||||
}
|
||||
|
||||
static struct extent_node *next_ext(struct extent_node *ext)
|
||||
{
|
||||
return ext ? ext_from_rbnode(rb_next(&ext->rbnode)) : NULL;
|
||||
}
|
||||
|
||||
static struct extent_node *prev_ext(struct extent_node *ext)
|
||||
{
|
||||
return ext ? ext_from_rbnode(rb_prev(&ext->rbnode)) : NULL;
|
||||
}
|
||||
|
||||
struct walk_results {
|
||||
unsigned bisect_to_leaf:1;
|
||||
struct extent_node *found;
|
||||
struct extent_node *next;
|
||||
struct rb_node *parent;
|
||||
struct rb_node **node;
|
||||
};
|
||||
|
||||
static void walk_extents(struct extent_root *root, u64 start, u64 len, struct walk_results *wlk)
|
||||
{
|
||||
struct rb_node **node = &root->rbroot.rb_node;
|
||||
struct extent_node *ext;
|
||||
u64 end = start + len;
|
||||
int cmp;
|
||||
|
||||
wlk->found = NULL;
|
||||
wlk->next = NULL;
|
||||
wlk->parent = NULL;
|
||||
|
||||
while (*node) {
|
||||
wlk->parent = *node;
|
||||
ext = ext_from_rbnode(*node);
|
||||
cmp = end <= ext->start ? -1 :
|
||||
start >= ext->start + ext->len ? 1 : 0;
|
||||
|
||||
if (cmp < 0) {
|
||||
node = &ext->rbnode.rb_left;
|
||||
wlk->next = ext;
|
||||
} else if (cmp > 0) {
|
||||
node = &ext->rbnode.rb_right;
|
||||
} else {
|
||||
wlk->found = ext;
|
||||
if (!(wlk->bisect_to_leaf && ext_bisected(ext, start, len)))
|
||||
break;
|
||||
/* walk right so we can insert greater right from bisection */
|
||||
node = &ext->rbnode.rb_right;
|
||||
}
|
||||
}
|
||||
|
||||
wlk->node = node;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return an extent that overlaps with the given range.
|
||||
*/
|
||||
int extent_lookup(struct extent_root *root, u64 start, u64 len, struct extent_node *found)
|
||||
{
|
||||
struct walk_results wlk = { 0, };
|
||||
int ret;
|
||||
|
||||
walk_extents(root, start, len, &wlk);
|
||||
if (wlk.found) {
|
||||
memset(found, 0, sizeof(struct extent_node));
|
||||
found->start = wlk.found->start;
|
||||
found->len = wlk.found->len;
|
||||
ret = 0;
|
||||
} else {
|
||||
ret = -ENOENT;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Callers can iterate through direct node references and are entirely
|
||||
* responsible for consistency when doing so.
|
||||
*/
|
||||
struct extent_node *extent_first(struct extent_root *root)
|
||||
{
|
||||
struct walk_results wlk = { 0, };
|
||||
|
||||
walk_extents(root, 0, 1, &wlk);
|
||||
|
||||
return wlk.found ?: wlk.next;
|
||||
}
|
||||
|
||||
struct extent_node *extent_next(struct extent_node *ext)
|
||||
{
|
||||
return next_ext(ext);
|
||||
}
|
||||
|
||||
struct extent_node *extent_prev(struct extent_node *ext)
|
||||
{
|
||||
return prev_ext(ext);
|
||||
}
|
||||
|
||||
/*
|
||||
* Insert a new extent into the tree. We can extend existing nodes,
|
||||
* merge with neighbours, or remove existing extents entirely if we
|
||||
* insert a range that fully spans existing nodes.
|
||||
*/
|
||||
static int walk_insert(struct extent_root *root, u64 start, u64 len, int found_err)
|
||||
{
|
||||
struct walk_results wlk = { 0, };
|
||||
struct extent_node *ext;
|
||||
struct extent_node *nei;
|
||||
int ret;
|
||||
|
||||
walk_extents(root, start, len, &wlk);
|
||||
|
||||
ext = wlk.found;
|
||||
if (ext && found_err) {
|
||||
ret = found_err;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (!ext) {
|
||||
ext = malloc(sizeof(struct extent_node));
|
||||
if (!ext) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
ext->start = start;
|
||||
ext->len = len;
|
||||
|
||||
rb_link_node(&ext->rbnode, wlk.parent, wlk.node);
|
||||
rb_insert_color(&ext->rbnode, &root->rbroot);
|
||||
}
|
||||
|
||||
/* start by expanding an existing extent if our range is larger */
|
||||
if (start < ext->start) {
|
||||
ext->len += ext->start - start;
|
||||
ext->start = start;
|
||||
}
|
||||
if (ext->start + ext->len < start + len)
|
||||
ext->len += (start + len) - (ext->start + ext->len);
|
||||
|
||||
/* drop any fully spanned neighbors, possibly merging with a final adjacent one */
|
||||
|
||||
while ((nei = prev_ext(ext))) {
|
||||
if (nei->start + nei->len < ext->start)
|
||||
break;
|
||||
|
||||
if (nei->start < ext->start) {
|
||||
ext->len += ext->start - nei->start;
|
||||
ext->start = nei->start;
|
||||
}
|
||||
|
||||
rb_erase(&nei->rbnode, &root->rbroot);
|
||||
free(nei);
|
||||
}
|
||||
|
||||
while ((nei = next_ext(ext))) {
|
||||
if (ext->start + ext->len < nei->start)
|
||||
break;
|
||||
|
||||
if (ext->start + ext->len < nei->start + nei->len)
|
||||
ext->len += (nei->start + nei->len) - (ext->start + ext->len);
|
||||
|
||||
rb_erase(&nei->rbnode, &root->rbroot);
|
||||
free(nei);
|
||||
}
|
||||
|
||||
ret = 0;
|
||||
out:
|
||||
debug("start %llu len %llu ret %d", start, len, ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Insert a new extent. The specified extent must not overlap with any
|
||||
* existing extents or -EEXIST is returned.
|
||||
*/
|
||||
int extent_insert_new(struct extent_root *root, u64 start, u64 len)
|
||||
{
|
||||
return walk_insert(root, start, len, -EEXIST);
|
||||
}
|
||||
|
||||
/*
|
||||
* Insert an extent, extending any existing extents that may overlap.
|
||||
*/
|
||||
int extent_insert_extend(struct extent_root *root, u64 start, u64 len)
|
||||
{
|
||||
return walk_insert(root, start, len, 0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Remove the specified extent from an existing node. The given extent must be fully
|
||||
* contained in a single node or -ENOENT is returned.
|
||||
*/
|
||||
int extent_remove(struct extent_root *root, u64 start, u64 len)
|
||||
{
|
||||
struct extent_node *ext;
|
||||
struct extent_node *ins;
|
||||
struct walk_results wlk = {
|
||||
.bisect_to_leaf = 1,
|
||||
};
|
||||
int ret;
|
||||
|
||||
walk_extents(root, start, len, &wlk);
|
||||
|
||||
if (!(ext = wlk.found) || !ext_contains(ext, start, len)) {
|
||||
ret = -ENOENT;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (ext_bisected(ext, start, len)) {
|
||||
debug("found bisected start %llu len %llu", ext->start, ext->len);
|
||||
ins = malloc(sizeof(struct extent_node));
|
||||
if (!ins) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
ins->start = start + len;
|
||||
ins->len = (ext->start + ext->len) - ins->start;
|
||||
|
||||
rb_link_node(&ins->rbnode, wlk.parent, wlk.node);
|
||||
rb_insert_color(&ins->rbnode, &root->rbroot);
|
||||
}
|
||||
|
||||
if (start > ext->start) {
|
||||
ext->len = start - ext->start;
|
||||
} else if (len < ext->len) {
|
||||
ext->start += len;
|
||||
ext->len -= len;
|
||||
} else {
|
||||
rb_erase(&ext->rbnode, &root->rbroot);
|
||||
}
|
||||
|
||||
ret = 0;
|
||||
out:
|
||||
debug("start %llu len %llu ret %d", start, len, ret);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
void extent_root_init(struct extent_root *root)
|
||||
{
|
||||
root->rbroot = RB_ROOT;
|
||||
root->total = 0;
|
||||
}
|
||||
|
||||
void extent_root_free(struct extent_root *root)
|
||||
{
|
||||
struct extent_node *ext;
|
||||
struct rb_node *node;
|
||||
struct rb_node *tmp;
|
||||
|
||||
for (node = rb_first(&root->rbroot); node && ((tmp = rb_next(node)), 1); node = tmp) {
|
||||
ext = rb_entry(node, struct extent_node, rbnode);
|
||||
rb_erase(&ext->rbnode, &root->rbroot);
|
||||
free(ext);
|
||||
}
|
||||
}
|
||||
|
||||
void extent_root_print(struct extent_root *root)
|
||||
{
|
||||
struct extent_node *ext;
|
||||
struct rb_node *node;
|
||||
struct rb_node *tmp;
|
||||
|
||||
for (node = rb_first(&root->rbroot); node && ((tmp = rb_next(node)), 1); node = tmp) {
|
||||
ext = rb_entry(node, struct extent_node, rbnode);
|
||||
debug(" start %llu len %llu", ext->start, ext->len);
|
||||
}
|
||||
}
|
||||
38
utils/src/check/extent.h
Normal file
@@ -0,0 +1,38 @@
|
||||
#ifndef _SCOUTFS_UTILS_CHECK_EXTENT_H_
|
||||
#define _SCOUTFS_UTILS_CHECK_EXTENT_H_
|
||||
|
||||
#include "lk_rbtree_wrapper.h"
|
||||
|
||||
struct extent_root {
|
||||
struct rb_root rbroot;
|
||||
u64 total;
|
||||
};
|
||||
|
||||
struct extent_node {
|
||||
struct rb_node rbnode;
|
||||
u64 start;
|
||||
u64 len;
|
||||
};
|
||||
|
||||
typedef int (*extent_cb_t)(u64 start, u64 len, void *arg);
|
||||
|
||||
struct extent_cb_arg_t {
|
||||
extent_cb_t cb;
|
||||
void *cb_arg;
|
||||
};
|
||||
|
||||
bool extents_overlap(u64 a_start, u64 a_len, u64 b_start, u64 b_len);
|
||||
|
||||
int extent_lookup(struct extent_root *root, u64 start, u64 len, struct extent_node *found);
|
||||
struct extent_node *extent_first(struct extent_root *root);
|
||||
struct extent_node *extent_next(struct extent_node *ext);
|
||||
struct extent_node *extent_prev(struct extent_node *ext);
|
||||
int extent_insert_new(struct extent_root *root, u64 start, u64 len);
|
||||
int extent_insert_extend(struct extent_root *root, u64 start, u64 len);
|
||||
int extent_remove(struct extent_root *root, u64 start, u64 len);
|
||||
|
||||
void extent_root_init(struct extent_root *root);
|
||||
void extent_root_free(struct extent_root *root);
|
||||
void extent_root_print(struct extent_root *root);
|
||||
|
||||
#endif
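
A short sketch of how the insert/extend/remove semantics combine, not part of the change; the extent_example() helper and the chosen ranges are made up for illustration:

/* illustrative only: exercise the extent API on a scratch root */
static int extent_example(void)
{
	struct extent_root root;
	int ret;

	extent_root_init(&root);

	ret = extent_insert_new(&root, 10, 5) ?:	/* tree holds [10,15) */
	      extent_insert_extend(&root, 12, 10) ?:	/* overlap merges into [10,22) */
	      extent_remove(&root, 14, 2);		/* bisects into [10,14) and [16,22) */

	extent_root_free(&root);
	return ret;
}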
|
||||
540
utils/src/check/image.c
Normal file
@@ -0,0 +1,540 @@
|
||||
#define _GNU_SOURCE /* O_DIRECT */
|
||||
#include <unistd.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <fcntl.h>
|
||||
#include <errno.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
#include <argp.h>
|
||||
|
||||
#include "sparse.h"
|
||||
#include "bitmap.h"
|
||||
#include "parse.h"
|
||||
#include "util.h"
|
||||
#include "format.h"
|
||||
#include "crc.h"
|
||||
#include "cmd.h"
|
||||
#include "dev.h"
|
||||
|
||||
#include "alloc.h"
|
||||
#include "block.h"
|
||||
#include "btree.h"
|
||||
#include "log_trees.h"
|
||||
#include "super.h"
|
||||
|
||||
/* the largest value representable in off_t */
|
||||
#define OFF_MAX (off_t)((u64)((off_t)~0ULL) >> 1)
|
||||
|
||||
#define SCOUTFS_META_IMAGE_HEADER_MAGIC 0x8aee00d098fa60c5ULL
|
||||
#define SCOUTFS_META_IMAGE_BLOCK_HEADER_MAGIC 0x70bd5e9269effd86ULL
|
||||
|
||||
struct scoutfs_meta_image_header {
|
||||
__le64 magic;
|
||||
__le64 total_bytes;
|
||||
__le32 version;
|
||||
} __packed;
|
||||
|
||||
struct scoutfs_meta_image_block_header {
|
||||
__le64 magic;
|
||||
__le64 offset;
|
||||
__le32 size;
|
||||
__le32 crc;
|
||||
} __packed;
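
The image stream is one scoutfs_meta_image_header followed by a block header plus payload per referenced block. A small sketch of the resulting stream size, assuming every record carries a full 64k block as the reader below produces; image_stream_bytes() is a hypothetical helper:

/* illustrative only: stream size for nr records that each carry a full 64k block */
static u64 image_stream_bytes(u64 nr)
{
	return sizeof(struct scoutfs_meta_image_header) +
	       nr * (sizeof(struct scoutfs_meta_image_block_header) + SCOUTFS_BLOCK_LG_SIZE);
}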
|
||||
|
||||
struct image_args {
|
||||
char *meta_device;
|
||||
bool is_read;
|
||||
bool show_header;
|
||||
u64 ra_window;
|
||||
};
|
||||
|
||||
struct block_bitmaps {
|
||||
unsigned long *bits;
|
||||
u64 size;
|
||||
u64 count;
|
||||
};
|
||||
|
||||
#define errf(fmt, args...) \
|
||||
dprintf(STDERR_FILENO, fmt, ##args)
|
||||
|
||||
static int set_meta_bit(u64 start, u64 len, void *arg)
|
||||
{
|
||||
struct block_bitmaps *bm = arg;
|
||||
int ret;
|
||||
|
||||
if (len != 1) {
|
||||
ret = -EINVAL;
|
||||
} else {
|
||||
if (!test_bit(bm->bits, start)) {
|
||||
set_bit(bm->bits, start);
|
||||
bm->count++;
|
||||
}
|
||||
ret = 0;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int get_ref_bits(struct block_bitmaps *bm)
|
||||
{
|
||||
struct scoutfs_super_block *super = global_super;
|
||||
int ret;
|
||||
u64 i;
|
||||
|
||||
/*
|
||||
* There are almost no small blocks we need to read, so we read
|
||||
* them as the large blocks that contain them to simplify the
|
||||
* block reading process.
|
||||
*/
|
||||
set_meta_bit(SCOUTFS_SUPER_BLKNO >> SCOUTFS_BLOCK_SM_LG_SHIFT, 1, bm);
|
||||
|
||||
for (i = 0; i < SCOUTFS_QUORUM_BLOCKS; i++)
|
||||
set_meta_bit((SCOUTFS_QUORUM_BLKNO + i) >> SCOUTFS_BLOCK_SM_LG_SHIFT, 1, bm);
|
||||
|
||||
ret = alloc_root_meta_iter(&super->meta_alloc[0], set_meta_bit, bm) ?:
|
||||
alloc_root_meta_iter(&super->meta_alloc[1], set_meta_bit, bm) ?:
|
||||
alloc_root_meta_iter(&super->data_alloc, set_meta_bit, bm) ?:
|
||||
alloc_list_meta_iter(&super->server_meta_avail[0], set_meta_bit, bm) ?:
|
||||
alloc_list_meta_iter(&super->server_meta_avail[1], set_meta_bit, bm) ?:
|
||||
alloc_list_meta_iter(&super->server_meta_freed[0], set_meta_bit, bm) ?:
|
||||
alloc_list_meta_iter(&super->server_meta_freed[1], set_meta_bit, bm) ?:
|
||||
btree_meta_iter(&super->fs_root, set_meta_bit, bm) ?:
|
||||
btree_meta_iter(&super->logs_root, set_meta_bit, bm) ?:
|
||||
btree_meta_iter(&super->log_merge, set_meta_bit, bm) ?:
|
||||
btree_meta_iter(&super->mounted_clients, set_meta_bit, bm) ?:
|
||||
btree_meta_iter(&super->srch_root, set_meta_bit, bm) ?:
|
||||
log_trees_meta_iter(set_meta_bit, bm);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Note that this temporarily modifies the header that it's given.
|
||||
*/
|
||||
static __le32 calc_crc(struct scoutfs_meta_image_block_header *bh, void *buf, size_t size)
|
||||
{
|
||||
__le32 saved = bh->crc;
|
||||
u32 crc = ~0;
|
||||
|
||||
bh->crc = 0;
|
||||
crc = crc32c(crc, bh, sizeof(*bh));
|
||||
crc = crc32c(crc, buf, size);
|
||||
bh->crc = saved;
|
||||
|
||||
return cpu_to_le32(crc);
|
||||
}
|
||||
|
||||
static void printf_header(struct scoutfs_meta_image_header *hdr)
|
||||
{
|
||||
errf("magic: 0x%016llx\n"
|
||||
"total_bytes: %llu\n"
|
||||
"version: %u\n",
|
||||
le64_to_cpu(hdr->magic),
|
||||
le64_to_cpu(hdr->total_bytes),
|
||||
le32_to_cpu(hdr->version));
|
||||
}
|
||||
|
||||
typedef ssize_t (*rw_func_t)(int fd, void *buf, size_t count, off_t offset);
|
||||
|
||||
static inline ssize_t rw_read(int fd, void *buf, size_t count, off_t offset)
|
||||
{
|
||||
return read(fd, buf, count);
|
||||
}
|
||||
|
||||
static inline ssize_t rw_pread(int fd, void *buf, size_t count, off_t offset)
|
||||
{
|
||||
return pread(fd, buf, count, offset);
|
||||
}
|
||||
|
||||
static inline ssize_t rw_write(int fd, void *buf, size_t count, off_t offset)
|
||||
{
|
||||
return write(fd, buf, count);
|
||||
}
|
||||
|
||||
static inline ssize_t rw_pwrite(int fd, void *buf, size_t count, off_t offset)
|
||||
{
|
||||
return pwrite(fd, buf, count, offset);
|
||||
}
|
||||
|
||||
static int rw_full_count(rw_func_t func, u64 *tot, int fd, void *buf, size_t count, off_t offset)
|
||||
{
|
||||
ssize_t sret;
|
||||
|
||||
while (count > 0) {
|
||||
sret = func(fd, buf, count, offset);
|
||||
if (sret <= 0 || sret > count) {
|
||||
if (sret < 0)
|
||||
return -errno;
|
||||
else
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
if (tot)
|
||||
*tot += sret;
|
||||
buf += sret;
|
||||
count -= sret;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int read_image(struct image_args *args, int fd, struct block_bitmaps *bm)
|
||||
{
|
||||
struct scoutfs_meta_image_block_header bh;
|
||||
struct scoutfs_meta_image_header hdr;
|
||||
u64 opening;
|
||||
void *buf;
|
||||
off_t off;
|
||||
u64 bit;
|
||||
u64 ra;
|
||||
int ret;
|
||||
|
||||
buf = malloc(SCOUTFS_BLOCK_LG_SIZE);
|
||||
if (!buf) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
hdr.magic = cpu_to_le64(SCOUTFS_META_IMAGE_HEADER_MAGIC);
|
||||
hdr.total_bytes = cpu_to_le64(sizeof(hdr) +
|
||||
(bm->count * (SCOUTFS_BLOCK_LG_SIZE + sizeof(bh))));
|
||||
hdr.version = cpu_to_le32(1);
|
||||
|
||||
if (args->show_header) {
|
||||
printf_header(&hdr);
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = rw_full_count(rw_write, NULL, STDOUT_FILENO, &hdr, sizeof(hdr), 0);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
opening = args->ra_window;
|
||||
ra = 0;
|
||||
bit = 0;
|
||||
|
||||
for (bit = 0; (bit = find_next_set_bit(bm->bits, bit, bm->size)) < bm->size; bit++) {
|
||||
|
||||
/* readahead to open the full window, then a block at a time */
|
||||
do {
|
||||
ra = find_next_set_bit(bm->bits, ra, bm->size);
|
||||
if (ra < bm->size) {
|
||||
off = ra << SCOUTFS_BLOCK_LG_SHIFT;
|
||||
posix_fadvise(fd, off, SCOUTFS_BLOCK_LG_SIZE, POSIX_FADV_WILLNEED);
|
||||
ra++;
|
||||
if (opening)
|
||||
opening -= min(opening, SCOUTFS_BLOCK_LG_SIZE);
|
||||
} else {
/* no set bits remain to read ahead; stop instead of spinning */
break;
}
|
||||
} while (opening > 0);
|
||||
|
||||
off = bit << SCOUTFS_BLOCK_LG_SHIFT;
|
||||
ret = rw_full_count(rw_pread, NULL, fd, buf, SCOUTFS_BLOCK_LG_SIZE, off);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
/*
|
||||
* Might as well try to drop the pages we've used to
|
||||
* reduce memory pressure on our read-ahead pages that
|
||||
* are waiting.
|
||||
*/
|
||||
posix_fadvise(fd, off, SCOUTFS_BLOCK_LG_SIZE, POSIX_FADV_DONTNEED);
|
||||
|
||||
bh.magic = cpu_to_le64(SCOUTFS_META_IMAGE_BLOCK_HEADER_MAGIC);
|
||||
bh.offset = cpu_to_le64(off);
|
||||
bh.size = cpu_to_le32(SCOUTFS_BLOCK_LG_SIZE);
|
||||
bh.crc = calc_crc(&bh, buf, SCOUTFS_BLOCK_LG_SIZE);
|
||||
|
||||
ret = rw_full_count(rw_write, NULL, STDOUT_FILENO, &bh, sizeof(bh), 0) ?:
|
||||
rw_full_count(rw_write, NULL, STDOUT_FILENO, buf, SCOUTFS_BLOCK_LG_SIZE, 0);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
}
|
||||
|
||||
out:
|
||||
free(buf);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int invalid_header(struct scoutfs_meta_image_header *hdr)
|
||||
{
|
||||
if (le64_to_cpu(hdr->magic) != SCOUTFS_META_IMAGE_HEADER_MAGIC) {
|
||||
errf("bad image header magic 0x%016llx (!= expected %016llx)\n",
|
||||
le64_to_cpu(hdr->magic), SCOUTFS_META_IMAGE_HEADER_MAGIC);
|
||||
|
||||
} else if (le32_to_cpu(hdr->version) != 1) {
|
||||
errf("unknown image header version %u\n", le32_to_cpu(hdr->version));
|
||||
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
/*
|
||||
* Doesn't catch offset+size overflowing, presumes pwrite() will return
|
||||
* an error.
|
||||
*/
|
||||
static int invalid_block_header(struct scoutfs_meta_image_block_header *bh)
|
||||
{
|
||||
if (le64_to_cpu(bh->magic) != SCOUTFS_META_IMAGE_BLOCK_HEADER_MAGIC) {
|
||||
errf("bad block header magic 0x%016llx (!= expected %016llx)\n",
|
||||
le64_to_cpu(bh->magic), SCOUTFS_META_IMAGE_BLOCK_HEADER_MAGIC);
|
||||
|
||||
} else if (le32_to_cpu(bh->size) == 0) {
|
||||
errf("invalid block header size %u\n", le32_to_cpu(bh->size));
|
||||
|
||||
} else if (le32_to_cpu(bh->size) > SIZE_MAX) {
|
||||
errf("block header size %u too large for size_t (> %zu)\n",
|
||||
le32_to_cpu(bh->size), (size_t)SIZE_MAX);
|
||||
|
||||
} else if (le64_to_cpu(bh->offset) > OFF_MAX) {
|
||||
errf("block header offset %llu too large for off_t (> %llu)\n",
|
||||
le64_to_cpu(bh->offset), (u64)OFF_MAX);
|
||||
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
static int write_image(struct image_args *args, int fd, struct block_bitmaps *bm)
|
||||
{
|
||||
struct scoutfs_meta_image_block_header bh;
|
||||
struct scoutfs_meta_image_header hdr;
|
||||
size_t writeback_batch = (2 * 1024 * 1024);
|
||||
size_t buf_size;
|
||||
size_t dirty;
|
||||
size_t size;
|
||||
off_t first;
|
||||
off_t last;
|
||||
off_t off;
|
||||
__le32 calc;
|
||||
void *buf;
|
||||
u64 tot;
|
||||
int ret;
|
||||
|
||||
tot = 0;
|
||||
|
||||
ret = rw_full_count(rw_read, &tot, STDIN_FILENO, &hdr, sizeof(hdr), 0);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
if (args->show_header) {
|
||||
printf_header(&hdr);
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = invalid_header(&hdr);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
dirty = 0;
|
||||
first = OFF_MAX;
|
||||
last = 0;
|
||||
buf = NULL;
|
||||
buf_size = 0;
|
||||
|
||||
while (tot < le64_to_cpu(hdr.total_bytes)) {
|
||||
|
||||
ret = rw_full_count(rw_read, &tot, STDIN_FILENO, &bh, sizeof(bh), 0);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
ret = invalid_block_header(&bh);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
size = le32_to_cpu(bh.size);
|
||||
if (buf_size < size) {
|
||||
buf = realloc(buf, size);
|
||||
if (!buf) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
buf_size = size;
|
||||
}
|
||||
|
||||
ret = rw_full_count(rw_read, &tot, STDIN_FILENO, buf, size, 0);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
calc = calc_crc(&bh, buf, size);
|
||||
if (calc != bh.crc) {
|
||||
errf("crc err");
|
||||
ret = -EIO;
|
||||
goto out;
|
||||
}
|
||||
|
||||
off = le64_to_cpu(bh.offset);
|
||||
|
||||
ret = rw_full_count(rw_pwrite, NULL, fd, buf, size, off);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
dirty += size;
|
||||
first = min(first, off);
|
||||
last = max(last, off);
|
||||
if (dirty >= writeback_batch) {
|
||||
posix_fadvise(fd, first, (last - first) + size, POSIX_FADV_DONTNEED);
|
||||
dirty = 0;
|
||||
first = OFF_MAX;
|
||||
last = 0;
|
||||
}
|
||||
}
|
||||
|
||||
ret = fsync(fd);
|
||||
if (ret < 0) {
|
||||
ret = -errno;
|
||||
goto out;
|
||||
}
|
||||
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int do_image(struct image_args *args)
|
||||
{
|
||||
struct block_bitmaps bm = { .bits = NULL };
|
||||
int meta_fd = -1;
|
||||
u64 dev_size;
|
||||
mode_t mode;
|
||||
int ret;
|
||||
|
||||
mode = args->is_read ? O_RDONLY : O_RDWR;
|
||||
|
||||
meta_fd = open(args->meta_device, mode);
|
||||
if (meta_fd < 0) {
|
||||
ret = -errno;
|
||||
errf("failed to open meta device '%s': %s (%d)\n",
|
||||
args->meta_device, strerror(errno), errno);
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (args->is_read) {
|
||||
ret = flush_device(meta_fd);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
ret = get_device_size(args->meta_device, meta_fd, &dev_size);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
bm.size = DIV_ROUND_UP(dev_size, SCOUTFS_BLOCK_LG_SIZE);
|
||||
bm.bits = calloc(1, round_up(bm.size, BITS_PER_LONG) / 8);
|
||||
if (!bm.bits) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = block_setup(meta_fd, 128 * 1024 * 1024, 32 * 1024 * 1024) ?:
|
||||
check_supers() ?:
|
||||
get_ref_bits(&bm) ?:
|
||||
read_image(args, meta_fd, &bm);
|
||||
block_shutdown();
|
||||
} else {
|
||||
ret = write_image(args, meta_fd, &bm);
|
||||
}
|
||||
out:
|
||||
free(bm.bits);
|
||||
|
||||
if (meta_fd >= 0)
|
||||
close(meta_fd);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int parse_opt(int key, char *arg, struct argp_state *state)
|
||||
{
|
||||
struct image_args *args = state->input;
|
||||
int ret;
|
||||
|
||||
switch (key) {
|
||||
case 'h':
|
||||
args->show_header = true;
|
||||
break;
|
||||
case 'r':
|
||||
ret = parse_u64(arg, &args->ra_window);
|
||||
if (ret)
|
||||
argp_error(state, "readahead winddoe parse error");
|
||||
break;
|
||||
case ARGP_KEY_ARG:
|
||||
if (!args->meta_device)
|
||||
args->meta_device = strdup_or_error(state, arg);
|
||||
else
|
||||
argp_error(state, "more than two device arguments given");
|
||||
break;
|
||||
case ARGP_KEY_FINI:
|
||||
if (!args->meta_device)
|
||||
argp_error(state, "no metadata device argument given");
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct argp_option options[] = {
|
||||
{ "show-header", 'h', NULL, 0, "Print image header and exit without processing stream" },
|
||||
{ "readahead", 'r', "NR", 0, "Maintain read-ahead window of NR blocks" },
|
||||
{ NULL }
|
||||
};
|
||||
|
||||
static struct argp read_image_argp = {
|
||||
options,
|
||||
parse_opt,
|
||||
"META-DEVICE",
|
||||
"Read metadata image stream from metadata device file"
|
||||
};
|
||||
|
||||
#define DEFAULT_RA_WINDOW (512 * 1024)
|
||||
|
||||
static int read_image_cmd(int argc, char **argv)
|
||||
{
|
||||
struct image_args image_args = {
|
||||
.is_read = true,
|
||||
.ra_window = DEFAULT_RA_WINDOW,
|
||||
};
|
||||
int ret;
|
||||
|
||||
ret = argp_parse(&read_image_argp, argc, argv, 0, NULL, &image_args);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
return do_image(&image_args);
|
||||
}
|
||||
|
||||
static struct argp write_image_argp = {
|
||||
options,
|
||||
parse_opt,
|
||||
"META-DEVICE",
|
||||
"Write metadata image stream to metadata device file"
|
||||
};
|
||||
|
||||
static int write_image_cmd(int argc, char **argv)
|
||||
{
|
||||
struct image_args image_args = {
|
||||
.is_read = false,
|
||||
.ra_window = DEFAULT_RA_WINDOW,
|
||||
};
|
||||
int ret;
|
||||
|
||||
ret = argp_parse(&write_image_argp, argc, argv, 0, NULL, &image_args);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
return do_image(&image_args);
|
||||
}
|
||||
|
||||
static void __attribute__((constructor)) image_ctor(void)
|
||||
{
|
||||
cmd_register_argp("read-metadata-image", &read_image_argp, GROUP_CORE, read_image_cmd);
|
||||
cmd_register_argp("write-metadata-image", &write_image_argp, GROUP_CORE, write_image_cmd);
|
||||
}
|
||||
15
utils/src/check/iter.h
Normal file
@@ -0,0 +1,15 @@
|
||||
#ifndef _SCOUTFS_UTILS_CHECK_ITER_H_
|
||||
#define _SCOUTFS_UTILS_CHECK_ITER_H_
|
||||
|
||||
/*
|
||||
* Callbacks can return a weird -errno that we'll never use to indicate
|
||||
* that iteration can stop and return 0 for success.
|
||||
*/
|
||||
#define ECHECK_ITER_DONE EL2HLT
|
||||
|
||||
static inline int xlate_iter_errno(int ret)
|
||||
{
|
||||
return ret == -ECHECK_ITER_DONE ? 0 : ret;
|
||||
}
|
||||
|
||||
#endif
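
A hypothetical callback showing the early-stop protocol, not part of the change: the callback returns -ECHECK_ITER_DONE once it has seen enough, and the iterators run the result through xlate_iter_errno() so the walk still returns 0:

/* hypothetical callback: stop the walk once the target block is found */
static int find_blkno_cb(u64 start, u64 len, void *cb_arg)
{
	u64 *target = cb_arg;

	if (*target >= start && *target < start + len)
		return -ECHECK_ITER_DONE;

	return 0;
}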
|
||||
98
utils/src/check/log_trees.c
Normal file
@@ -0,0 +1,98 @@
|
||||
#include <unistd.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <errno.h>
|
||||
|
||||
#include "sparse.h"
|
||||
#include "util.h"
|
||||
#include "format.h"
|
||||
#include "key.h"
|
||||
|
||||
#include "alloc.h"
|
||||
#include "btree.h"
|
||||
#include "debug.h"
|
||||
#include "extent.h"
|
||||
#include "iter.h"
|
||||
#include "sns.h"
|
||||
#include "log_trees.h"
|
||||
#include "super.h"
|
||||
|
||||
struct iter_args {
|
||||
extent_cb_t cb;
|
||||
void *cb_arg;
|
||||
};
|
||||
|
||||
static int lt_meta_iter(struct scoutfs_key *key, void *val, u16 val_len, void *cb_arg)
|
||||
{
|
||||
struct iter_args *ia = cb_arg;
|
||||
struct scoutfs_log_trees *lt;
|
||||
int ret;
|
||||
|
||||
if (val_len != sizeof(struct scoutfs_log_trees))
|
||||
; /* XXX */
|
||||
|
||||
lt = val;
|
||||
|
||||
sns_push("log_trees", le64_to_cpu(lt->rid), le64_to_cpu(lt->nr));
|
||||
|
||||
debug("lt rid 0x%16llx nr %llu", le64_to_cpu(lt->rid), le64_to_cpu(lt->nr));
|
||||
|
||||
sns_push("meta_avail", 0, 0);
|
||||
ret = alloc_list_meta_iter(<->meta_avail, ia->cb, ia->cb_arg);
|
||||
sns_pop();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
sns_push("meta_freed", 0, 0);
|
||||
ret = alloc_list_meta_iter(<->meta_freed, ia->cb, ia->cb_arg);
|
||||
sns_pop();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
sns_push("item_root", 0, 0);
|
||||
ret = btree_meta_iter(<->item_root, ia->cb, ia->cb_arg);
|
||||
sns_pop();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
if (lt->bloom_ref.blkno) {
|
||||
sns_push("bloom_ref", 0, 0);
|
||||
ret = ia->cb(le64_to_cpu(lt->bloom_ref.blkno), 1, ia->cb_arg);
|
||||
sns_pop();
|
||||
if (ret < 0) {
|
||||
ret = xlate_iter_errno(ret);
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
sns_push("data_avail", 0, 0);
|
||||
ret = alloc_root_meta_iter(<->data_avail, ia->cb, ia->cb_arg);
|
||||
sns_pop();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
sns_push("data_freed", 0, 0);
|
||||
ret = alloc_root_meta_iter(<->data_freed, ia->cb, ia->cb_arg);
|
||||
sns_pop();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
ret = 0;
|
||||
out:
|
||||
sns_pop();
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Call the caller's callback with the extents of all the metadata block references contained
|
||||
* in log btrees. We walk the logs_root btree items and walk all the metadata structures
|
||||
* they reference.
|
||||
*/
|
||||
int log_trees_meta_iter(extent_cb_t cb, void *cb_arg)
|
||||
{
|
||||
struct scoutfs_super_block *super = global_super;
|
||||
struct iter_args ia = { .cb = cb, .cb_arg = cb_arg };
|
||||
|
||||
return btree_item_iter(&super->logs_root, lt_meta_iter, &ia);
|
||||
}
|
||||
8
utils/src/check/log_trees.h
Normal file
@@ -0,0 +1,8 @@
|
||||
#ifndef _SCOUTFS_UTILS_CHECK_LOG_TREES_H_
|
||||
#define _SCOUTFS_UTILS_CHECK_LOG_TREES_H_
|
||||
|
||||
#include "extent.h"
|
||||
|
||||
int log_trees_meta_iter(extent_cb_t cb, void *cb_arg);
|
||||
|
||||
#endif
|
||||
367
utils/src/check/meta.c
Normal file
@@ -0,0 +1,367 @@
|
||||
#include <unistd.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdbool.h>
|
||||
#include <sys/mman.h>
|
||||
#include <errno.h>
|
||||
|
||||
#include "sparse.h"
|
||||
#include "util.h"
|
||||
#include "format.h"
|
||||
#include "bitmap.h"
|
||||
#include "key.h"
|
||||
|
||||
#include "alloc.h"
|
||||
#include "btree.h"
|
||||
#include "debug.h"
|
||||
#include "extent.h"
|
||||
#include "sns.h"
|
||||
#include "log_trees.h"
|
||||
#include "meta.h"
|
||||
#include "problem.h"
|
||||
#include "super.h"
|
||||
|
||||
static struct meta_data {
|
||||
struct extent_root meta_refed;
|
||||
struct extent_root meta_free;
|
||||
struct {
|
||||
u64 ref_blocks;
|
||||
u64 free_extents;
|
||||
u64 free_blocks;
|
||||
} stats;
|
||||
} global_mdat;
|
||||
|
||||
bool valid_meta_blkno(u64 blkno)
|
||||
{
|
||||
u64 tot = le64_to_cpu(global_super->total_meta_blocks);
|
||||
|
||||
return blkno >= SCOUTFS_META_DEV_START_BLKNO && blkno < tot;
|
||||
}
|
||||
|
||||
static bool valid_meta_extent(u64 start, u64 len)
|
||||
{
|
||||
u64 tot = le64_to_cpu(global_super->total_meta_blocks);
|
||||
bool valid;
|
||||
|
||||
valid = len > 0 &&
|
||||
start >= SCOUTFS_META_DEV_START_BLKNO &&
|
||||
start < tot &&
|
||||
len <= tot &&
|
||||
((start + len) <= tot) &&
|
||||
((start + len) > start);
|
||||
|
||||
debug("start %llu len %llu valid %u", start, len, !!valid);
|
||||
|
||||
if (!valid)
|
||||
problem(PB_META_EXTENT_INVALID, "start %llu len %llu", start, len);
|
||||
|
||||
return valid;
|
||||
}
|
||||
|
||||
/*
|
||||
* Track references to individual metadata blocks. This uses the extent
|
||||
* callback type but is only ever called for single block references.
|
||||
* Any reference to a block that has already been referenced is
|
||||
* considered invalid and is ignored. Later repair will resolve
|
||||
* duplicate references.
|
||||
*/
|
||||
static int insert_meta_ref(u64 start, u64 len, void *arg)
|
||||
{
|
||||
struct meta_data *mdat = &global_mdat;
|
||||
struct extent_root *root = arg;
|
||||
int ret = 0;
|
||||
|
||||
/* this is tracking single metadata block references */
|
||||
if (len != 1) {
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (valid_meta_blkno(start)) {
|
||||
ret = extent_insert_new(root, start, len);
|
||||
if (ret == 0)
|
||||
mdat->stats.ref_blocks++;
|
||||
else if (ret == -EEXIST)
|
||||
problem(PB_META_REF_OVERLAPS_EXISTING, "blkno %llu", start);
|
||||
}
|
||||
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int insert_meta_free(u64 start, u64 len, void *arg)
|
||||
{
|
||||
struct meta_data *mdat = &global_mdat;
|
||||
struct extent_root *root = arg;
|
||||
int ret = 0;
|
||||
|
||||
if (valid_meta_extent(start, len)) {
|
||||
ret = extent_insert_new(root, start, len);
|
||||
if (ret == 0) {
|
||||
mdat->stats.free_extents++;
|
||||
mdat->stats.free_blocks++;
|
||||
|
||||
} else if (ret == -EEXIST) {
|
||||
problem(PB_META_FREE_OVERLAPS_EXISTING,
|
||||
"start %llu llen %llu", start, len);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Walk all metadata references in the system. This walk doesn't need
|
||||
* to read metadata that doesn't contain any metadata references so it
|
||||
* can skip the bulk of metadata blocks. This gives us the set of
|
||||
* referenced metadata blocks which we can then use to repair metadata
|
||||
* allocator structures.
|
||||
*/
|
||||
static int get_meta_refs(void)
|
||||
{
|
||||
struct meta_data *mdat = &global_mdat;
|
||||
struct scoutfs_super_block *super = global_super;
|
||||
int ret;
|
||||
|
||||
extent_root_init(&mdat->meta_refed);
|
||||
|
||||
/* XXX record reserved blocks around super as referenced */
|
||||
|
||||
sns_push("meta_alloc", 0, 0);
|
||||
ret = alloc_root_meta_iter(&super->meta_alloc[0], insert_meta_ref, &mdat->meta_refed);
|
||||
sns_pop();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
sns_push("meta_alloc", 1, 0);
|
||||
ret = alloc_root_meta_iter(&super->meta_alloc[1], insert_meta_ref, &mdat->meta_refed);
|
||||
sns_pop();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
sns_push("data_alloc", 1, 0);
|
||||
ret = alloc_root_meta_iter(&super->data_alloc, insert_meta_ref, &mdat->meta_refed);
|
||||
sns_pop();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
sns_push("server_meta_avail", 0, 0);
|
||||
ret = alloc_list_meta_iter(&super->server_meta_avail[0],
|
||||
insert_meta_ref, &mdat->meta_refed);
|
||||
sns_pop();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
sns_push("server_meta_avail", 1, 0);
|
||||
ret = alloc_list_meta_iter(&super->server_meta_avail[1],
|
||||
insert_meta_ref, &mdat->meta_refed);
|
||||
sns_pop();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
sns_push("server_meta_freed", 0, 0);
|
||||
ret = alloc_list_meta_iter(&super->server_meta_freed[0],
|
||||
insert_meta_ref, &mdat->meta_refed);
|
||||
sns_pop();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
sns_push("server_meta_freed", 1, 0);
|
||||
ret = alloc_list_meta_iter(&super->server_meta_freed[1],
|
||||
insert_meta_ref, &mdat->meta_refed);
|
||||
sns_pop();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
sns_push("fs_root", 0, 0);
|
||||
ret = btree_meta_iter(&super->fs_root, insert_meta_ref, &mdat->meta_refed);
|
||||
sns_pop();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
sns_push("logs_root", 0, 0);
|
||||
ret = btree_meta_iter(&super->logs_root, insert_meta_ref, &mdat->meta_refed);
|
||||
sns_pop();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
sns_push("log_merge", 0, 0);
|
||||
ret = btree_meta_iter(&super->log_merge, insert_meta_ref, &mdat->meta_refed);
|
||||
sns_pop();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
sns_push("mounted_clients", 0, 0);
|
||||
ret = btree_meta_iter(&super->mounted_clients, insert_meta_ref, &mdat->meta_refed);
|
||||
sns_pop();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
sns_push("srch_root", 0, 0);
|
||||
ret = btree_meta_iter(&super->srch_root, insert_meta_ref, &mdat->meta_refed);
|
||||
sns_pop();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
ret = log_trees_meta_iter(insert_meta_ref, &mdat->meta_refed);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
printf("found %llu referenced metadata blocks\n", mdat->stats.ref_blocks);
|
||||
ret = 0;
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int get_meta_free(void)
|
||||
{
|
||||
struct meta_data *mdat = &global_mdat;
|
||||
struct scoutfs_super_block *super = global_super;
|
||||
int ret;
|
||||
|
||||
extent_root_init(&mdat->meta_free);
|
||||
|
||||
sns_push("meta_alloc", 0, 0);
|
||||
ret = alloc_root_extent_iter(&super->meta_alloc[0], insert_meta_free, &mdat->meta_free);
|
||||
sns_pop();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
sns_push("meta_alloc", 1, 0);
|
||||
ret = alloc_root_extent_iter(&super->meta_alloc[1], insert_meta_free, &mdat->meta_free);
|
||||
sns_pop();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
sns_push("server_meta_avail", 0, 0);
|
||||
ret = alloc_list_extent_iter(&super->server_meta_avail[0],
|
||||
insert_meta_free, &mdat->meta_free);
|
||||
sns_pop();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
sns_push("server_meta_avail", 1, 0);
|
||||
ret = alloc_list_extent_iter(&super->server_meta_avail[1],
|
||||
insert_meta_free, &mdat->meta_free);
|
||||
sns_pop();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
sns_push("server_meta_freed", 0, 0);
|
||||
ret = alloc_list_extent_iter(&super->server_meta_freed[0],
|
||||
insert_meta_free, &mdat->meta_free);
|
||||
sns_pop();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
sns_push("server_meta_freed", 1, 0);
|
||||
ret = alloc_list_extent_iter(&super->server_meta_freed[1],
|
||||
insert_meta_free, &mdat->meta_free);
|
||||
sns_pop();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
printf("found %llu free metadata blocks in %llu extents\n",
|
||||
mdat->stats.free_blocks, mdat->stats.free_extents);
|
||||
ret = 0;
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* All the space between referenced blocks must be recorded in the free
|
||||
* extents. The free extent walk didn't check that the extents
|
||||
* overlapped with references, so we do that here. Remember that metadata
|
||||
* block references were merged into extents, so the refed extents
|
||||
* aren't necessarily all a single block.
|
||||
*/
|
||||
static int compare_refs_and_free(void)
|
||||
{
|
||||
struct meta_data *mdat = &global_mdat;
|
||||
struct extent_node *ref;
|
||||
struct extent_node *free;
|
||||
struct extent_node *next;
|
||||
struct extent_node *prev;
|
||||
u64 expect;
|
||||
u64 start;
|
||||
u64 end;
|
||||
|
||||
expect = 0;
|
||||
ref = extent_first(&mdat->meta_refed);
|
||||
free = extent_first(&mdat->meta_free);
|
||||
while (ref || free) {
|
||||
|
||||
printf("exp %llu ref %llu.%llu free %llu.%llu\n",
|
||||
expect, ref ? ref->start : 0, ref ? ref->len : 0,
|
||||
free ? free->start : 0, free ? free->len : 0);
|
||||
|
||||
/* referenced marked free, remove ref from free and continue from same point */
|
||||
if (ref && free && extents_overlap(ref->start, ref->len, free->start, free->len)) {
|
||||
printf("ref extent %llu.%llu overlaps free %llu %llu\n",
|
||||
ref->start, ref->len, free->start, free->len);
|
||||
|
||||
start = max(ref->start, free->start);
|
||||
end = min(ref->start + ref->len, free->start + free->len);
|
||||
|
||||
prev = extent_prev(free);
|
||||
|
||||
extent_remove(&mdat->meta_free, start, end - start);
|
||||
|
||||
if (prev)
|
||||
free = extent_next(prev);
|
||||
else
|
||||
free = extent_first(&mdat->meta_free);
|
||||
continue;
|
||||
}
|
||||
|
||||
/* see which extent starts earlier */
|
||||
if (!free || (ref && ref->start <= free->start))
|
||||
next = ref;
|
||||
else
|
||||
next = free;
|
||||
|
||||
/* untracked region before next extent */
|
||||
if (expect < next->start) {
|
||||
printf("missing free extent %llu.%llu\n", expect, next->start - expect);
|
||||
expect = next->start;
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
/* didn't overlap, advance past next extent */
|
||||
expect = next->start + next->len;
|
||||
if (next == ref)
|
||||
ref = extent_next(ref);
|
||||
else
|
||||
free = extent_next(free);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Check the metadata allocators by comparing the set of referenced
|
||||
* blocks with the set of free blocks that are stored in free btree
|
||||
* items and alloc list blocks.
|
||||
*/
|
||||
int check_meta_alloc(void)
|
||||
{
|
||||
int ret;
|
||||
|
||||
ret = get_meta_refs();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
ret = get_meta_free();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
ret = compare_refs_and_free();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
ret = 0;
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
9
utils/src/check/meta.h
Normal file
@@ -0,0 +1,9 @@
|
||||
#ifndef _SCOUTFS_UTILS_CHECK_META_H_
|
||||
#define _SCOUTFS_UTILS_CHECK_META_H_
|
||||
|
||||
bool valid_meta_blkno(u64 blkno);
|
||||
|
||||
int check_meta_alloc(void);
|
||||
|
||||
#endif
|
||||
|
||||
23
utils/src/check/padding.c
Normal file
@@ -0,0 +1,23 @@
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "util.h"
|
||||
#include "padding.h"
|
||||
|
||||
bool padding_is_zeros(const void *data, size_t sz)
|
||||
{
|
||||
static char zeros[32] = {0,};
|
||||
const size_t batch = array_size(zeros);
|
||||
|
||||
while (sz >= batch) {
|
||||
if (memcmp(data, zeros, batch))
|
||||
return false;
|
||||
data += batch;
|
||||
sz -= batch;
|
||||
}
|
||||
|
||||
if (sz > 0 && memcmp(data, zeros, sz))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
6
utils/src/check/padding.h
Normal file
@@ -0,0 +1,6 @@
|
||||
#ifndef _SCOUTFS_UTILS_CHECK_PADDING_H_
|
||||
#define _SCOUTFS_UTILS_CHECK_PADDING_H_
|
||||
|
||||
bool padding_is_zeros(const void *data, size_t sz);
|
||||
|
||||
#endif
|
||||
23
utils/src/check/problem.c
Normal file
@@ -0,0 +1,23 @@
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "problem.h"
|
||||
|
||||
#if 0
|
||||
#define PROB_STR(pb) [pb] = #pb
|
||||
static char *prob_strs[] = {
|
||||
PROB_STR(PB_META_EXTENT_INVALID),
|
||||
PROB_STR(PB_META_EXTENT_OVERLAPS_EXISTING),
|
||||
};
|
||||
#endif
|
||||
|
||||
static struct problem_data {
|
||||
uint64_t counts[PB__NR];
|
||||
} global_pdat;
|
||||
|
||||
void problem_record(prob_t pb)
|
||||
{
|
||||
struct problem_data *pdat = &global_pdat;
|
||||
|
||||
pdat->counts[pb]++;
|
||||
}
|
||||
23
utils/src/check/problem.h
Normal file
@@ -0,0 +1,23 @@
|
||||
#ifndef _SCOUTFS_UTILS_CHECK_PROBLEM_H_
|
||||
#define _SCOUTFS_UTILS_CHECK_PROBLEM_H_
|
||||
|
||||
#include "debug.h"
|
||||
#include "sns.h"
|
||||
|
||||
typedef enum {
|
||||
PB_META_EXTENT_INVALID,
|
||||
PB_META_REF_OVERLAPS_EXISTING,
|
||||
PB_META_FREE_OVERLAPS_EXISTING,
|
||||
PB_BTREE_BLOCK_BAD_LEVEL,
|
||||
PB__NR,
|
||||
} prob_t;
|
||||
|
||||
#define problem(pb, fmt, ...) \
|
||||
do { \
|
||||
debug("problem found: "#pb": %s: "fmt, sns_str(), __VA_ARGS__); \
|
||||
problem_record(pb); \
|
||||
} while (0)
|
||||
|
||||
void problem_record(prob_t pb);
|
||||
|
||||
#endif
|
||||
118
utils/src/check/sns.c
Normal file
@@ -0,0 +1,118 @@
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "sns.h"
|
||||
|
||||
/*
|
||||
* This "str num stack" is used to describe our location in metadata at
|
||||
* any given time.
|
||||
*
|
||||
* As we descend into structures we push a string describing them,
|
||||
* perhaps with associated numbers. Pushing and popping is very cheap
|
||||
* and only rarely do we format the stack into a string, as an arbitrary
|
||||
* example:
|
||||
* super.fs_root.btree_parent:1231.btree_leaf:3231
|
||||
*/
|
||||
|
||||
#define SNS_MAX_DEPTH 1000
|
||||
#define SNS_STR_SIZE (SNS_MAX_DEPTH * (SNS_MAX_STR_LEN + ((1 + 16) * 2) + 1) + 1)
|
||||
|
||||
static struct sns_data {
|
||||
unsigned int depth;
|
||||
|
||||
struct sns_entry {
|
||||
char *str;
|
||||
size_t len;
|
||||
u64 a;
|
||||
u64 b;
|
||||
} ents[SNS_MAX_DEPTH];
|
||||
|
||||
char str[SNS_STR_SIZE];
|
||||
|
||||
} global_lsdat;
|
||||
|
||||
void _sns_push(char *str, size_t len, u64 a, u64 b)
|
||||
{
|
||||
struct sns_data *lsdat = &global_lsdat;
|
||||
|
||||
if (lsdat->depth < SNS_MAX_DEPTH) {
|
||||
lsdat->ents[lsdat->depth++] = (struct sns_entry) {
|
||||
.str = str,
|
||||
.len = len,
|
||||
.a = a,
|
||||
.b = b,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
void sns_pop(void)
|
||||
{
|
||||
struct sns_data *lsdat = &global_lsdat;
|
||||
|
||||
if (lsdat->depth > 0)
|
||||
lsdat->depth--;
|
||||
}
|
||||
|
||||
static char *append_str(char *pos, char *str, size_t len)
|
||||
{
|
||||
memcpy(pos, str, len);
|
||||
return pos + len;
|
||||
}
|
||||
|
||||
/*
|
||||
* This is not called for x = 0 so we don't need to emit an initial 0.
|
||||
* We could handle x == 0 by using do {} while instead of while {}.
|
||||
*/
|
||||
static char *append_u64x(char *pos, u64 x)
|
||||
{
|
||||
static char hex[] = "0123456789abcdef";
char tmp[16];
int n = 0;

/* gather nibbles least-significant first, then emit them most-significant first */
while (x) {
tmp[n++] = hex[x & 0xf];
x >>= 4;
}
while (n > 0)
*pos++ = tmp[--n];
|
||||
|
||||
return pos;
|
||||
}
|
||||
|
||||
static char *append_char(char *pos, char c)
|
||||
{
|
||||
*(pos++) = c;
|
||||
return pos;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return a pointer to a null terminated string that describes the
|
||||
* current location stack. The string buffer is global.
|
||||
*/
|
||||
char *sns_str(void)
|
||||
{
|
||||
struct sns_data *lsdat = &global_lsdat;
|
||||
struct sns_entry *ent;
|
||||
char *pos;
|
||||
int i;
|
||||
|
||||
pos = lsdat->str;
|
||||
for (i = 0; i < lsdat->depth; i++) {
|
||||
ent = &lsdat->ents[i];
|
||||
|
||||
if (i)
|
||||
pos = append_char(pos, '.');
|
||||
|
||||
pos = append_str(pos, ent->str, ent->len);
|
||||
|
||||
if (ent->a) {
|
||||
pos = append_char(pos, ':');
|
||||
pos = append_u64x(pos, ent->a);
|
||||
}
|
||||
|
||||
if (ent->b) {
|
||||
pos = append_char(pos, ':');
|
||||
pos = append_u64x(pos, ent->b);
|
||||
}
|
||||
}
|
||||
|
||||
*pos = '\0';
|
||||
|
||||
return lsdat->str;
|
||||
}
|
||||
utils/src/check/sns.h (new file, 20 lines)
@@ -0,0 +1,20 @@
#ifndef _SCOUTFS_UTILS_CHECK_SNS_H_
#define _SCOUTFS_UTILS_CHECK_SNS_H_

#include <assert.h>

#include "sparse.h"

#define SNS_MAX_STR_LEN 20

#define sns_push(str, a, b)						\
	do {								\
		build_assert(sizeof(str) - 1 <= SNS_MAX_STR_LEN);	\
		_sns_push((str), sizeof(str) - 1, a, b);		\
	} while (0)

void _sns_push(char *str, size_t len, u64 a, u64 b);
void sns_pop(void);
char *sns_str(void);

#endif
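Usage sketch (not part of this diff) of how the stack and the problem macro compose;
the block header, blkno, seq and expected_level names are hypothetical:

	/* hypothetical walk into a btree leaf at blkno with a given seq */
	sns_push("btree_leaf", blkno, seq);

	if (hdr->level != expected_level)
		problem(PB_BTREE_BLOCK_BAD_LEVEL, "level %u expected %u",
			hdr->level, expected_level);

	sns_pop();

With nested pushes the sns_str() text in the problem message reads like the dotted
path shown in the sns.c comment above.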
utils/src/check/super.c (new file, 57 lines)
@@ -0,0 +1,57 @@
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

#include "sparse.h"
#include "util.h"
#include "format.h"

#include "block.h"
#include "super.h"

/*
 * After we check the super blocks we provide a global buffer to track
 * the current super block.  It is referenced to get static information
 * about the system and is also modified and written as part of
 * transactions.
 */
struct scoutfs_super_block *global_super;

/*
 * After checking the supers we save a copy of the super in a global
 * buffer that's used by other modules to track the current super.  It
 * can be modified and written during commits.
 */
int check_supers(void)
{
	struct scoutfs_super_block *super = NULL;
	struct block *blk = NULL;
	int ret;

	global_super = malloc(sizeof(struct scoutfs_super_block));
	if (!global_super) {
		printf("error allocating super block buffer\n");
		ret = -ENOMEM;
		goto out;
	}

	ret = block_get(&blk, SCOUTFS_SUPER_BLKNO, BF_SM);
	if (ret < 0) {
		printf("error reading super block\n");
		goto out;
	}

	super = block_buf(blk);

	memcpy(global_super, super, sizeof(struct scoutfs_super_block));
	ret = 0;
out:
	block_put(&blk);

	return ret;
}

void super_shutdown(void)
{
	free(global_super);
}
utils/src/check/super.h (new file, 9 lines)
@@ -0,0 +1,9 @@
#ifndef _SCOUTFS_UTILS_CHECK_SUPER_H_
#define _SCOUTFS_UTILS_CHECK_SUPER_H_

extern struct scoutfs_super_block *global_super;

int check_supers(void);
void super_shutdown(void);

#endif
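A minimal sketch of the intended calling sequence (the surrounding check driver is
assumed and not part of this diff):

	int ret;

	ret = check_supers();
	if (ret < 0)
		return ret;

	/* global_super now holds a private copy of the on-disk super block
	 * that later passes can read and, during commits, modify */

	/* ... other check passes run here ... */

	super_shutdown();
	return 0;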
@@ -156,6 +156,16 @@ static inline void list_move_tail(struct list_head *list,
	list_add_tail(list, head);
}

/**
 * list_is_head - tests whether @list is the list @head
 * @list: the entry to test
 * @head: the head of the list
 */
static inline int list_is_head(const struct list_head *list, const struct list_head *head)
{
	return list == head;
}

/**
 * list_empty - tests whether a list is empty
 * @head: the list to test.
@@ -242,6 +252,15 @@ static inline void list_splice_init(struct list_head *list,
	for (pos = (head)->next, n = pos->next; pos != (head); \
	     pos = n, n = pos->next)

/**
 * list_entry_is_head - test if the entry points to the head of the list
 * @pos:	the type * to cursor
 * @head:	the head for your list.
 * @member:	the name of the list_head within the struct.
 */
#define list_entry_is_head(pos, head, member)				\
	(&pos->member == (head))

/**
 * list_for_each_entry - iterate over list of given type
 * @pos:	the type * to use as a loop counter.
@@ -307,4 +326,28 @@ static inline void list_splice_init(struct list_head *list,
#define list_next_entry(pos, member) \
	list_entry((pos)->member.next, typeof(*(pos)), member)

/**
 * list_prev_entry - get the prev element in list
 * @pos:	the type * to cursor
 * @member:	the name of the list_head within the struct.
 */
#define list_prev_entry(pos, member) \
	list_entry((pos)->member.prev, typeof(*(pos)), member)

/**
 * list_for_each_entry_safe_reverse - iterate backwards over list safe against removal
 * @pos:	the type * to use as a loop cursor.
 * @n:		another type * to use as temporary storage
 * @head:	the head for your list.
 * @member:	the name of the list_head within the struct.
 *
 * Iterate backwards over list of given type, safe against removal
 * of list entry.
 */
#define list_for_each_entry_safe_reverse(pos, n, head, member)		\
	for (pos = list_last_entry(head, typeof(*pos), member),	\
	     n = list_prev_entry(pos, member);				\
	     !list_entry_is_head(pos, head, member);			\
	     pos = n, n = list_prev_entry(n, member))

#endif
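Usage sketch (not part of this diff) for the new reverse iterator; the entry type and
the list_del()/free() teardown are illustrative, assuming the usual kernel-style list
primitives elsewhere in this header:

	struct ent {
		struct list_head entry;
		int nr;
	};

	struct ent *pos, *tmp;

	/* tear the list down from the tail, safe against removing pos */
	list_for_each_entry_safe_reverse(pos, tmp, &ent_list, entry) {
		list_del(&pos->entry);
		free(pos);
	}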
utils/src/lk_rbtree_wrapper.h (new file, 24 lines)
@@ -0,0 +1,24 @@
#ifndef _LK_RBTREE_WRAPPER_H_
#define _LK_RBTREE_WRAPPER_H_

/*
 * We're using this lame hack to build and use the kernel's rbtree in
 * userspace.  We drop the kernel's rbtree*[ch] implementation in and
 * use them with this wrapper.  We only have to remove the kernel
 * includes from the imported files.
 */

#include <stdbool.h>
#include "util.h"

#define rcu_assign_pointer(a, b) do { a = b; } while (0)
#define READ_ONCE(a) ({ a; })
#define WRITE_ONCE(a, b) do { a = b; } while (0)
#define unlikely(a) ({ a; })
#define EXPORT_SYMBOL(a) /* nop */

#include "rbtree_types.h"
#include "rbtree.h"
#include "rbtree_augmented.h"

#endif
@@ -609,6 +609,8 @@ static int print_alloc_list_block(int fd, char *str, struct scoutfs_block_ref *r
	u64 blkno;
	u64 start;
	u64 len;
	u64 st;
	u64 nr;
	int wid;
	int ret;
	int i;
@@ -627,27 +629,37 @@ static int print_alloc_list_block(int fd, char *str, struct scoutfs_block_ref *r
	       AL_REF_A(&lblk->next), le32_to_cpu(lblk->start),
	       le32_to_cpu(lblk->nr));

	if (lblk->nr) {
		wid = printf(" exts: ");
		start = 0;
		len = 0;
		for (i = 0; i < le32_to_cpu(lblk->nr); i++) {
			if (len == 0)
				start = le64_to_cpu(lblk->blknos[i]);
			len++;

			if (i == (le32_to_cpu(lblk->nr) - 1) ||
			    start + len != le64_to_cpu(lblk->blknos[i + 1])) {
				if (wid >= 72)
					wid = printf("\n ");

				wid += printf("%llu,%llu ", start, len);
				len = 0;
			}
		}
		printf("\n");
	st = le32_to_cpu(lblk->start);
	nr = le32_to_cpu(lblk->nr);
	if (st >= SCOUTFS_ALLOC_LIST_MAX_BLOCKS ||
	    nr > SCOUTFS_ALLOC_LIST_MAX_BLOCKS ||
	    (st + nr) > SCOUTFS_ALLOC_LIST_MAX_BLOCKS) {
		printf(" (invalid start and nr fields)\n");
		goto out;
	}

	if (lblk->nr == 0)
		goto out;

	wid = printf(" exts: ");
	start = 0;
	len = 0;
	for (i = 0; i < nr; i++) {
		if (len == 0)
			start = le64_to_cpu(lblk->blknos[st + i]);
		len++;

		if (i == (nr - 1) || (start + len) != le64_to_cpu(lblk->blknos[st + i + 1])) {
			if (wid >= 72)
				wid = printf("\n ");

			wid += printf("%llu,%llu ", start, len);
			len = 0;
		}
	}
	printf("\n");

out:
	next = lblk->next;
	free(lblk);
	return print_alloc_list_block(fd, str, &next);

utils/src/rbtree.c (new file, 629 lines)
@@ -0,0 +1,629 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
  Red Black Trees
  (C) 1999  Andrea Arcangeli <andrea@suse.de>
  (C) 2002  David Woodhouse <dwmw2@infradead.org>
  (C) 2012  Michel Lespinasse <walken@google.com>


  linux/lib/rbtree.c
*/

#include "lk_rbtree_wrapper.h"

/*
 * red-black trees properties:  https://en.wikipedia.org/wiki/Rbtree
 *
 *  1) A node is either red or black
 *  2) The root is black
 *  3) All leaves (NULL) are black
 *  4) Both children of every red node are black
 *  5) Every simple path from root to leaves contains the same number
 *     of black nodes.
 *
 *  4 and 5 give the O(log n) guarantee, since 4 implies you cannot have two
 *  consecutive red nodes in a path and every red node is therefore followed by
 *  a black. So if B is the number of black nodes on every simple path (as per
 *  5), then the longest possible path due to 4 is 2B.
 *
 *  We shall indicate color with case, where black nodes are uppercase and red
 *  nodes will be lowercase. Unknown color nodes shall be drawn as red within
 *  parentheses and have some accompanying text comment.
 */

/*
 * Notes on lockless lookups:
 *
 * All stores to the tree structure (rb_left and rb_right) must be done using
 * WRITE_ONCE(). And we must not inadvertently cause (temporary) loops in the
 * tree structure as seen in program order.
 *
 * These two requirements will allow lockless iteration of the tree -- not
 * correct iteration mind you, tree rotations are not atomic so a lookup might
 * miss entire subtrees.
 *
 * But they do guarantee that any such traversal will only see valid elements
 * and that it will indeed complete -- does not get stuck in a loop.
 *
 * It also guarantees that if the lookup returns an element it is the 'correct'
 * one. But not returning an element does _NOT_ mean it's not present.
 *
 * NOTE:
 *
 * Stores to __rb_parent_color are not important for simple lookups so those
 * are left undone as of now. Nor did I check for loops involving parent
 * pointers.
 */

static inline void rb_set_black(struct rb_node *rb)
{
	rb->__rb_parent_color |= RB_BLACK;
}

static inline struct rb_node *rb_red_parent(struct rb_node *red)
{
	return (struct rb_node *)red->__rb_parent_color;
}

/*
 * Helper function for rotations:
 * - old's parent and color get assigned to new
 * - old gets assigned new as a parent and 'color' as a color.
 */
static inline void
__rb_rotate_set_parents(struct rb_node *old, struct rb_node *new,
			struct rb_root *root, int color)
{
	struct rb_node *parent = rb_parent(old);
	new->__rb_parent_color = old->__rb_parent_color;
	rb_set_parent_color(old, new, color);
	__rb_change_child(old, new, parent, root);
}

static __always_inline void
__rb_insert(struct rb_node *node, struct rb_root *root,
	    void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
{
	struct rb_node *parent = rb_red_parent(node), *gparent, *tmp;

	while (true) {
		/*
		 * Loop invariant: node is red.
		 */
		if (unlikely(!parent)) {
			/*
			 * The inserted node is root. Either this is the
			 * first node, or we recursed at Case 1 below and
			 * are no longer violating 4).
			 */
			rb_set_parent_color(node, NULL, RB_BLACK);
			break;
		}

		/*
		 * If there is a black parent, we are done.
		 * Otherwise, take some corrective action as,
		 * per 4), we don't want a red root or two
		 * consecutive red nodes.
		 */
		if(rb_is_black(parent))
			break;

		gparent = rb_red_parent(parent);

		tmp = gparent->rb_right;
		if (parent != tmp) {	/* parent == gparent->rb_left */
			if (tmp && rb_is_red(tmp)) {
				/*
				 * Case 1 - node's uncle is red (color flips).
				 *
				 *       G            g
				 *      / \          / \
				 *     p   u  -->   P   U
				 *    /            /
				 *   n            n
				 *
				 * However, since g's parent might be red, and
				 * 4) does not allow this, we need to recurse
				 * at g.
				 */
				rb_set_parent_color(tmp, gparent, RB_BLACK);
				rb_set_parent_color(parent, gparent, RB_BLACK);
				node = gparent;
				parent = rb_parent(node);
				rb_set_parent_color(node, parent, RB_RED);
				continue;
			}

			tmp = parent->rb_right;
			if (node == tmp) {
				/*
				 * Case 2 - node's uncle is black and node is
				 * the parent's right child (left rotate at parent).
				 *
				 *      G             G
				 *     / \           / \
				 *    p   U  -->    n   U
				 *     \           /
				 *      n         p
				 *
				 * This still leaves us in violation of 4), the
				 * continuation into Case 3 will fix that.
				 */
				tmp = node->rb_left;
				WRITE_ONCE(parent->rb_right, tmp);
				WRITE_ONCE(node->rb_left, parent);
				if (tmp)
					rb_set_parent_color(tmp, parent,
							    RB_BLACK);
				rb_set_parent_color(parent, node, RB_RED);
				augment_rotate(parent, node);
				parent = node;
				tmp = node->rb_right;
			}

			/*
			 * Case 3 - node's uncle is black and node is
			 * the parent's left child (right rotate at gparent).
			 *
			 *        G           P
			 *       / \         / \
			 *      p   U  -->  n   g
			 *     /                 \
			 *    n                   U
			 */
			WRITE_ONCE(gparent->rb_left, tmp); /* == parent->rb_right */
			WRITE_ONCE(parent->rb_right, gparent);
			if (tmp)
				rb_set_parent_color(tmp, gparent, RB_BLACK);
			__rb_rotate_set_parents(gparent, parent, root, RB_RED);
			augment_rotate(gparent, parent);
			break;
		} else {
			tmp = gparent->rb_left;
			if (tmp && rb_is_red(tmp)) {
				/* Case 1 - color flips */
				rb_set_parent_color(tmp, gparent, RB_BLACK);
				rb_set_parent_color(parent, gparent, RB_BLACK);
				node = gparent;
				parent = rb_parent(node);
				rb_set_parent_color(node, parent, RB_RED);
				continue;
			}

			tmp = parent->rb_left;
			if (node == tmp) {
				/* Case 2 - right rotate at parent */
				tmp = node->rb_right;
				WRITE_ONCE(parent->rb_left, tmp);
				WRITE_ONCE(node->rb_right, parent);
				if (tmp)
					rb_set_parent_color(tmp, parent,
							    RB_BLACK);
				rb_set_parent_color(parent, node, RB_RED);
				augment_rotate(parent, node);
				parent = node;
				tmp = node->rb_left;
			}

			/* Case 3 - left rotate at gparent */
			WRITE_ONCE(gparent->rb_right, tmp); /* == parent->rb_left */
			WRITE_ONCE(parent->rb_left, gparent);
			if (tmp)
				rb_set_parent_color(tmp, gparent, RB_BLACK);
			__rb_rotate_set_parents(gparent, parent, root, RB_RED);
			augment_rotate(gparent, parent);
			break;
		}
	}
}

/*
 * Inline version for rb_erase() use - we want to be able to inline
 * and eliminate the dummy_rotate callback there
 */
static __always_inline void
____rb_erase_color(struct rb_node *parent, struct rb_root *root,
	void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
{
	struct rb_node *node = NULL, *sibling, *tmp1, *tmp2;

	while (true) {
		/*
		 * Loop invariants:
		 * - node is black (or NULL on first iteration)
		 * - node is not the root (parent is not NULL)
		 * - All leaf paths going through parent and node have a
		 *   black node count that is 1 lower than other leaf paths.
		 */
		sibling = parent->rb_right;
		if (node != sibling) {	/* node == parent->rb_left */
			if (rb_is_red(sibling)) {
				/*
				 * Case 1 - left rotate at parent
				 *
				 *     P               S
				 *    / \             / \
				 *   N   s    -->    p   Sr
				 *      / \         / \
				 *     Sl  Sr      N   Sl
				 */
				tmp1 = sibling->rb_left;
				WRITE_ONCE(parent->rb_right, tmp1);
				WRITE_ONCE(sibling->rb_left, parent);
				rb_set_parent_color(tmp1, parent, RB_BLACK);
				__rb_rotate_set_parents(parent, sibling, root,
							RB_RED);
				augment_rotate(parent, sibling);
				sibling = tmp1;
			}
			tmp1 = sibling->rb_right;
			if (!tmp1 || rb_is_black(tmp1)) {
				tmp2 = sibling->rb_left;
				if (!tmp2 || rb_is_black(tmp2)) {
					/*
					 * Case 2 - sibling color flip
					 * (p could be either color here)
					 *
					 *    (p)           (p)
					 *    / \           / \
					 *   N   S    -->  N   s
					 *      / \           / \
					 *     Sl  Sr        Sl  Sr
					 *
					 * This leaves us violating 5) which
					 * can be fixed by flipping p to black
					 * if it was red, or by recursing at p.
					 * p is red when coming from Case 1.
					 */
					rb_set_parent_color(sibling, parent,
							    RB_RED);
					if (rb_is_red(parent))
						rb_set_black(parent);
					else {
						node = parent;
						parent = rb_parent(node);
						if (parent)
							continue;
					}
					break;
				}
				/*
				 * Case 3 - right rotate at sibling
				 * (p could be either color here)
				 *
				 *   (p)           (p)
				 *   / \           / \
				 *  N   S    -->  N   sl
				 *     / \             \
				 *    sl  Sr            S
				 *                       \
				 *                        Sr
				 *
				 * Note: p might be red, and then both
				 * p and sl are red after rotation(which
				 * breaks property 4). This is fixed in
				 * Case 4 (in __rb_rotate_set_parents()
				 *         which set sl the color of p
				 *         and set p RB_BLACK)
				 *
				 *   (p)            (sl)
				 *   / \            /  \
				 *  N   sl   -->   P    S
				 *       \        /      \
				 *        S      N        Sr
				 *         \
				 *          Sr
				 */
				tmp1 = tmp2->rb_right;
				WRITE_ONCE(sibling->rb_left, tmp1);
				WRITE_ONCE(tmp2->rb_right, sibling);
				WRITE_ONCE(parent->rb_right, tmp2);
				if (tmp1)
					rb_set_parent_color(tmp1, sibling,
							    RB_BLACK);
				augment_rotate(sibling, tmp2);
				tmp1 = sibling;
				sibling = tmp2;
			}
			/*
			 * Case 4 - left rotate at parent + color flips
			 * (p and sl could be either color here.
			 *  After rotation, p becomes black, s acquires
			 *  p's color, and sl keeps its color)
			 *
			 *      (p)             (s)
			 *      / \             / \
			 *     N   S     -->   P   Sr
			 *        / \         / \
			 *      (sl) sr      N  (sl)
			 */
			tmp2 = sibling->rb_left;
			WRITE_ONCE(parent->rb_right, tmp2);
			WRITE_ONCE(sibling->rb_left, parent);
			rb_set_parent_color(tmp1, sibling, RB_BLACK);
			if (tmp2)
				rb_set_parent(tmp2, parent);
			__rb_rotate_set_parents(parent, sibling, root,
						RB_BLACK);
			augment_rotate(parent, sibling);
			break;
		} else {
			sibling = parent->rb_left;
			if (rb_is_red(sibling)) {
				/* Case 1 - right rotate at parent */
				tmp1 = sibling->rb_right;
				WRITE_ONCE(parent->rb_left, tmp1);
				WRITE_ONCE(sibling->rb_right, parent);
				rb_set_parent_color(tmp1, parent, RB_BLACK);
				__rb_rotate_set_parents(parent, sibling, root,
							RB_RED);
				augment_rotate(parent, sibling);
				sibling = tmp1;
			}
			tmp1 = sibling->rb_left;
			if (!tmp1 || rb_is_black(tmp1)) {
				tmp2 = sibling->rb_right;
				if (!tmp2 || rb_is_black(tmp2)) {
					/* Case 2 - sibling color flip */
					rb_set_parent_color(sibling, parent,
							    RB_RED);
					if (rb_is_red(parent))
						rb_set_black(parent);
					else {
						node = parent;
						parent = rb_parent(node);
						if (parent)
							continue;
					}
					break;
				}
				/* Case 3 - left rotate at sibling */
				tmp1 = tmp2->rb_left;
				WRITE_ONCE(sibling->rb_right, tmp1);
				WRITE_ONCE(tmp2->rb_left, sibling);
				WRITE_ONCE(parent->rb_left, tmp2);
				if (tmp1)
					rb_set_parent_color(tmp1, sibling,
							    RB_BLACK);
				augment_rotate(sibling, tmp2);
				tmp1 = sibling;
				sibling = tmp2;
			}
			/* Case 4 - right rotate at parent + color flips */
			tmp2 = sibling->rb_right;
			WRITE_ONCE(parent->rb_left, tmp2);
			WRITE_ONCE(sibling->rb_right, parent);
			rb_set_parent_color(tmp1, sibling, RB_BLACK);
			if (tmp2)
				rb_set_parent(tmp2, parent);
			__rb_rotate_set_parents(parent, sibling, root,
						RB_BLACK);
			augment_rotate(parent, sibling);
			break;
		}
	}
}
/* Non-inline version for rb_erase_augmented() use */
void __rb_erase_color(struct rb_node *parent, struct rb_root *root,
	void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
{
	____rb_erase_color(parent, root, augment_rotate);
}
EXPORT_SYMBOL(__rb_erase_color);

/*
 * Non-augmented rbtree manipulation functions.
 *
 * We use dummy augmented callbacks here, and have the compiler optimize them
 * out of the rb_insert_color() and rb_erase() function definitions.
 */

static inline void dummy_propagate(struct rb_node *node, struct rb_node *stop) {}
static inline void dummy_copy(struct rb_node *old, struct rb_node *new) {}
static inline void dummy_rotate(struct rb_node *old, struct rb_node *new) {}

static const struct rb_augment_callbacks dummy_callbacks = {
	.propagate = dummy_propagate,
	.copy = dummy_copy,
	.rotate = dummy_rotate
};

void rb_insert_color(struct rb_node *node, struct rb_root *root)
{
	__rb_insert(node, root, dummy_rotate);
}
EXPORT_SYMBOL(rb_insert_color);

void rb_erase(struct rb_node *node, struct rb_root *root)
{
	struct rb_node *rebalance;
	rebalance = __rb_erase_augmented(node, root, &dummy_callbacks);
	if (rebalance)
		____rb_erase_color(rebalance, root, dummy_rotate);
}
EXPORT_SYMBOL(rb_erase);

/*
 * Augmented rbtree manipulation functions.
 *
 * This instantiates the same __always_inline functions as in the non-augmented
 * case, but this time with user-defined callbacks.
 */

void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
	void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
{
	__rb_insert(node, root, augment_rotate);
}
EXPORT_SYMBOL(__rb_insert_augmented);

/*
 * This function returns the first node (in sort order) of the tree.
 */
struct rb_node *rb_first(const struct rb_root *root)
{
	struct rb_node *n;

	n = root->rb_node;
	if (!n)
		return NULL;
	while (n->rb_left)
		n = n->rb_left;
	return n;
}
EXPORT_SYMBOL(rb_first);

struct rb_node *rb_last(const struct rb_root *root)
{
	struct rb_node *n;

	n = root->rb_node;
	if (!n)
		return NULL;
	while (n->rb_right)
		n = n->rb_right;
	return n;
}
EXPORT_SYMBOL(rb_last);

struct rb_node *rb_next(const struct rb_node *node)
{
	struct rb_node *parent;

	if (RB_EMPTY_NODE(node))
		return NULL;

	/*
	 * If we have a right-hand child, go down and then left as far
	 * as we can.
	 */
	if (node->rb_right) {
		node = node->rb_right;
		while (node->rb_left)
			node = node->rb_left;
		return (struct rb_node *)node;
	}

	/*
	 * No right-hand children. Everything down and left is smaller than us,
	 * so any 'next' node must be in the general direction of our parent.
	 * Go up the tree; any time the ancestor is a right-hand child of its
	 * parent, keep going up. First time it's a left-hand child of its
	 * parent, said parent is our 'next' node.
	 */
	while ((parent = rb_parent(node)) && node == parent->rb_right)
		node = parent;

	return parent;
}
EXPORT_SYMBOL(rb_next);

struct rb_node *rb_prev(const struct rb_node *node)
{
	struct rb_node *parent;

	if (RB_EMPTY_NODE(node))
		return NULL;

	/*
	 * If we have a left-hand child, go down and then right as far
	 * as we can.
	 */
	if (node->rb_left) {
		node = node->rb_left;
		while (node->rb_right)
			node = node->rb_right;
		return (struct rb_node *)node;
	}

	/*
	 * No left-hand children. Go up till we find an ancestor which
	 * is a right-hand child of its parent.
	 */
	while ((parent = rb_parent(node)) && node == parent->rb_left)
		node = parent;

	return parent;
}
EXPORT_SYMBOL(rb_prev);

void rb_replace_node(struct rb_node *victim, struct rb_node *new,
		     struct rb_root *root)
{
	struct rb_node *parent = rb_parent(victim);

	/* Copy the pointers/colour from the victim to the replacement */
	*new = *victim;

	/* Set the surrounding nodes to point to the replacement */
	if (victim->rb_left)
		rb_set_parent(victim->rb_left, new);
	if (victim->rb_right)
		rb_set_parent(victim->rb_right, new);
	__rb_change_child(victim, new, parent, root);
}
EXPORT_SYMBOL(rb_replace_node);

void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new,
			 struct rb_root *root)
{
	struct rb_node *parent = rb_parent(victim);

	/* Copy the pointers/colour from the victim to the replacement */
	*new = *victim;

	/* Set the surrounding nodes to point to the replacement */
	if (victim->rb_left)
		rb_set_parent(victim->rb_left, new);
	if (victim->rb_right)
		rb_set_parent(victim->rb_right, new);

	/* Set the parent's pointer to the new node last after an RCU barrier
	 * so that the pointers onwards are seen to be set correctly when doing
	 * an RCU walk over the tree.
	 */
	__rb_change_child_rcu(victim, new, parent, root);
}
EXPORT_SYMBOL(rb_replace_node_rcu);

static struct rb_node *rb_left_deepest_node(const struct rb_node *node)
{
	for (;;) {
		if (node->rb_left)
			node = node->rb_left;
		else if (node->rb_right)
			node = node->rb_right;
		else
			return (struct rb_node *)node;
	}
}

struct rb_node *rb_next_postorder(const struct rb_node *node)
{
	const struct rb_node *parent;
	if (!node)
		return NULL;
	parent = rb_parent(node);

	/* If we're sitting on node, we've already seen our children */
	if (parent && node == parent->rb_left && parent->rb_right) {
		/* If we are the parent's left node, go to the parent's right
		 * node then all the way down to the left */
		return rb_left_deepest_node(parent->rb_right);
	} else
		/* Otherwise we are the parent's right node, and the parent
		 * should be next */
		return (struct rb_node *)parent;
}
EXPORT_SYMBOL(rb_next_postorder);

struct rb_node *rb_first_postorder(const struct rb_root *root)
{
	if (!root->rb_node)
		return NULL;

	return rb_left_deepest_node(root->rb_node);
}
EXPORT_SYMBOL(rb_first_postorder);
utils/src/rbtree.h (new file, 328 lines)
@@ -0,0 +1,328 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
  Red Black Trees
  (C) 1999  Andrea Arcangeli <andrea@suse.de>


  linux/include/linux/rbtree.h

  To use rbtrees you'll have to implement your own insert and search cores.
  This will avoid us to use callbacks and to drop drammatically performances.
  I know it's not the cleaner way, but in C (not in C++) to get
  performances and genericity...

  See Documentation/core-api/rbtree.rst for documentation and samples.
*/

#ifndef _LINUX_RBTREE_H
#define _LINUX_RBTREE_H

#define rb_parent(r)   ((struct rb_node *)((r)->__rb_parent_color & ~3))

#define rb_entry(ptr, type, member) container_of(ptr, type, member)

#define RB_EMPTY_ROOT(root)  (READ_ONCE((root)->rb_node) == NULL)

/* 'empty' nodes are nodes that are known not to be inserted in an rbtree */
#define RB_EMPTY_NODE(node)  \
	((node)->__rb_parent_color == (unsigned long)(node))
#define RB_CLEAR_NODE(node)  \
	((node)->__rb_parent_color = (unsigned long)(node))


extern void rb_insert_color(struct rb_node *, struct rb_root *);
extern void rb_erase(struct rb_node *, struct rb_root *);


/* Find logical next and previous nodes in a tree */
extern struct rb_node *rb_next(const struct rb_node *);
extern struct rb_node *rb_prev(const struct rb_node *);
extern struct rb_node *rb_first(const struct rb_root *);
extern struct rb_node *rb_last(const struct rb_root *);

/* Postorder iteration - always visit the parent after its children */
extern struct rb_node *rb_first_postorder(const struct rb_root *);
extern struct rb_node *rb_next_postorder(const struct rb_node *);

/* Fast replacement of a single node without remove/rebalance/add/rebalance */
extern void rb_replace_node(struct rb_node *victim, struct rb_node *new,
			    struct rb_root *root);
extern void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new,
				struct rb_root *root);

static inline void rb_link_node(struct rb_node *node, struct rb_node *parent,
				struct rb_node **rb_link)
{
	node->__rb_parent_color = (unsigned long)parent;
	node->rb_left = node->rb_right = NULL;

	*rb_link = node;
}

static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent,
				    struct rb_node **rb_link)
{
	node->__rb_parent_color = (unsigned long)parent;
	node->rb_left = node->rb_right = NULL;

	rcu_assign_pointer(*rb_link, node);
}

#define rb_entry_safe(ptr, type, member) \
	({ typeof(ptr) ____ptr = (ptr); \
	   ____ptr ? rb_entry(____ptr, type, member) : NULL; \
	})

/**
 * rbtree_postorder_for_each_entry_safe - iterate in post-order over rb_root of
 * given type allowing the backing memory of @pos to be invalidated
 *
 * @pos:	the 'type *' to use as a loop cursor.
 * @n:		another 'type *' to use as temporary storage
 * @root:	'rb_root *' of the rbtree.
 * @field:	the name of the rb_node field within 'type'.
 *
 * rbtree_postorder_for_each_entry_safe() provides a similar guarantee as
 * list_for_each_entry_safe() and allows the iteration to continue independent
 * of changes to @pos by the body of the loop.
 *
 * Note, however, that it cannot handle other modifications that re-order the
 * rbtree it is iterating over. This includes calling rb_erase() on @pos, as
 * rb_erase() may rebalance the tree, causing us to miss some nodes.
 */
#define rbtree_postorder_for_each_entry_safe(pos, n, root, field) \
	for (pos = rb_entry_safe(rb_first_postorder(root), typeof(*pos), field); \
	     pos && ({ n = rb_entry_safe(rb_next_postorder(&pos->field), \
			typeof(*pos), field); 1; }); \
	     pos = n)

/* Same as rb_first(), but O(1) */
#define rb_first_cached(root) (root)->rb_leftmost

static inline void rb_insert_color_cached(struct rb_node *node,
					  struct rb_root_cached *root,
					  bool leftmost)
{
	if (leftmost)
		root->rb_leftmost = node;
	rb_insert_color(node, &root->rb_root);
}


static inline struct rb_node *
rb_erase_cached(struct rb_node *node, struct rb_root_cached *root)
{
	struct rb_node *leftmost = NULL;

	if (root->rb_leftmost == node)
		leftmost = root->rb_leftmost = rb_next(node);

	rb_erase(node, &root->rb_root);

	return leftmost;
}

static inline void rb_replace_node_cached(struct rb_node *victim,
					  struct rb_node *new,
					  struct rb_root_cached *root)
{
	if (root->rb_leftmost == victim)
		root->rb_leftmost = new;
	rb_replace_node(victim, new, &root->rb_root);
}

/*
 * The below helper functions use 2 operators with 3 different
 * calling conventions. The operators are related like:
 *
 *	comp(a->key,b) < 0  := less(a,b)
 *	comp(a->key,b) > 0  := less(b,a)
 *	comp(a->key,b) == 0 := !less(a,b) && !less(b,a)
 *
 * If these operators define a partial order on the elements we make no
 * guarantee on which of the elements matching the key is found. See
 * rb_find().
 *
 * The reason for this is to allow the find() interface without requiring an
 * on-stack dummy object, which might not be feasible due to object size.
 */

/**
 * rb_add_cached() - insert @node into the leftmost cached tree @tree
 * @node: node to insert
 * @tree: leftmost cached tree to insert @node into
 * @less: operator defining the (partial) node order
 *
 * Returns @node when it is the new leftmost, or NULL.
 */
static __always_inline struct rb_node *
rb_add_cached(struct rb_node *node, struct rb_root_cached *tree,
	      bool (*less)(struct rb_node *, const struct rb_node *))
{
	struct rb_node **link = &tree->rb_root.rb_node;
	struct rb_node *parent = NULL;
	bool leftmost = true;

	while (*link) {
		parent = *link;
		if (less(node, parent)) {
			link = &parent->rb_left;
		} else {
			link = &parent->rb_right;
			leftmost = false;
		}
	}

	rb_link_node(node, parent, link);
	rb_insert_color_cached(node, tree, leftmost);

	return leftmost ? node : NULL;
}

/**
 * rb_add() - insert @node into @tree
 * @node: node to insert
 * @tree: tree to insert @node into
 * @less: operator defining the (partial) node order
 */
static __always_inline void
rb_add(struct rb_node *node, struct rb_root *tree,
       bool (*less)(struct rb_node *, const struct rb_node *))
{
	struct rb_node **link = &tree->rb_node;
	struct rb_node *parent = NULL;

	while (*link) {
		parent = *link;
		if (less(node, parent))
			link = &parent->rb_left;
		else
			link = &parent->rb_right;
	}

	rb_link_node(node, parent, link);
	rb_insert_color(node, tree);
}

/**
 * rb_find_add() - find equivalent @node in @tree, or add @node
 * @node: node to look-for / insert
 * @tree: tree to search / modify
 * @cmp: operator defining the node order
 *
 * Returns the rb_node matching @node, or NULL when no match is found and @node
 * is inserted.
 */
static __always_inline struct rb_node *
rb_find_add(struct rb_node *node, struct rb_root *tree,
	    int (*cmp)(struct rb_node *, const struct rb_node *))
{
	struct rb_node **link = &tree->rb_node;
	struct rb_node *parent = NULL;
	int c;

	while (*link) {
		parent = *link;
		c = cmp(node, parent);

		if (c < 0)
			link = &parent->rb_left;
		else if (c > 0)
			link = &parent->rb_right;
		else
			return parent;
	}

	rb_link_node(node, parent, link);
	rb_insert_color(node, tree);
	return NULL;
}

/**
 * rb_find() - find @key in tree @tree
 * @key: key to match
 * @tree: tree to search
 * @cmp: operator defining the node order
 *
 * Returns the rb_node matching @key or NULL.
 */
static __always_inline struct rb_node *
rb_find(const void *key, const struct rb_root *tree,
	int (*cmp)(const void *key, const struct rb_node *))
{
	struct rb_node *node = tree->rb_node;

	while (node) {
		int c = cmp(key, node);

		if (c < 0)
			node = node->rb_left;
		else if (c > 0)
			node = node->rb_right;
		else
			return node;
	}

	return NULL;
}

/**
 * rb_find_first() - find the first @key in @tree
 * @key: key to match
 * @tree: tree to search
 * @cmp: operator defining node order
 *
 * Returns the leftmost node matching @key, or NULL.
 */
static __always_inline struct rb_node *
rb_find_first(const void *key, const struct rb_root *tree,
	      int (*cmp)(const void *key, const struct rb_node *))
{
	struct rb_node *node = tree->rb_node;
	struct rb_node *match = NULL;

	while (node) {
		int c = cmp(key, node);

		if (c <= 0) {
			if (!c)
				match = node;
			node = node->rb_left;
		} else if (c > 0) {
			node = node->rb_right;
		}
	}

	return match;
}

/**
 * rb_next_match() - find the next @key in @tree
 * @key: key to match
 * @tree: tree to search
 * @cmp: operator defining node order
 *
 * Returns the next node matching @key, or NULL.
 */
static __always_inline struct rb_node *
rb_next_match(const void *key, struct rb_node *node,
	      int (*cmp)(const void *key, const struct rb_node *))
{
	node = rb_next(node);
	if (node && cmp(key, node))
		node = NULL;
	return node;
}

/**
 * rb_for_each() - iterates a subtree matching @key
 * @node: iterator
 * @key: key to match
 * @tree: tree to search
 * @cmp: operator defining node order
 */
#define rb_for_each(node, key, tree, cmp) \
	for ((node) = rb_find_first((key), (tree), (cmp)); \
	     (node); (node) = rb_next_match((key), (node), (cmp)))

#endif	/* _LINUX_RBTREE_H */
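Usage sketch (not part of this diff) of the two comparator conventions described above,
as they might appear in userspace check code; struct item and its id field are made up
for illustration:

	struct item {
		struct rb_node node;
		u64 id;
	};

	static bool item_less(struct rb_node *a, const struct rb_node *b)
	{
		return rb_entry(a, struct item, node)->id <
		       rb_entry(b, struct item, node)->id;
	}

	static int item_cmp_key(const void *key, const struct rb_node *n)
	{
		u64 want = *(const u64 *)key;
		u64 have = rb_entry(n, struct item, node)->id;

		return want < have ? -1 : want > have ? 1 : 0;
	}

	/* insert with the less() form, look up with the cmp(key, node) form */
	rb_add(&it->node, &root, item_less);

	u64 key = 7;
	struct rb_node *found = rb_find(&key, &root, item_cmp_key);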
utils/src/rbtree_augmented.h (new file, 313 lines)
@@ -0,0 +1,313 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
  Red Black Trees
  (C) 1999  Andrea Arcangeli <andrea@suse.de>
  (C) 2002  David Woodhouse <dwmw2@infradead.org>
  (C) 2012  Michel Lespinasse <walken@google.com>


  linux/include/linux/rbtree_augmented.h
*/

#ifndef _LINUX_RBTREE_AUGMENTED_H
#define _LINUX_RBTREE_AUGMENTED_H

/*
 * Please note - only struct rb_augment_callbacks and the prototypes for
 * rb_insert_augmented() and rb_erase_augmented() are intended to be public.
 * The rest are implementation details you are not expected to depend on.
 *
 * See Documentation/core-api/rbtree.rst for documentation and samples.
 */

struct rb_augment_callbacks {
	void (*propagate)(struct rb_node *node, struct rb_node *stop);
	void (*copy)(struct rb_node *old, struct rb_node *new);
	void (*rotate)(struct rb_node *old, struct rb_node *new);
};

extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
	void (*augment_rotate)(struct rb_node *old, struct rb_node *new));

/*
 * Fixup the rbtree and update the augmented information when rebalancing.
 *
 * On insertion, the user must update the augmented information on the path
 * leading to the inserted node, then call rb_link_node() as usual and
 * rb_insert_augmented() instead of the usual rb_insert_color() call.
 * If rb_insert_augmented() rebalances the rbtree, it will callback into
 * a user provided function to update the augmented information on the
 * affected subtrees.
 */
static inline void
rb_insert_augmented(struct rb_node *node, struct rb_root *root,
		    const struct rb_augment_callbacks *augment)
{
	__rb_insert_augmented(node, root, augment->rotate);
}

static inline void
rb_insert_augmented_cached(struct rb_node *node,
			   struct rb_root_cached *root, bool newleft,
			   const struct rb_augment_callbacks *augment)
{
	if (newleft)
		root->rb_leftmost = node;
	rb_insert_augmented(node, &root->rb_root, augment);
}

/*
 * Template for declaring augmented rbtree callbacks (generic case)
 *
 * RBSTATIC:    'static' or empty
 * RBNAME:      name of the rb_augment_callbacks structure
 * RBSTRUCT:    struct type of the tree nodes
 * RBFIELD:     name of struct rb_node field within RBSTRUCT
 * RBAUGMENTED: name of field within RBSTRUCT holding data for subtree
 * RBCOMPUTE:   name of function that recomputes the RBAUGMENTED data
 */

#define RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME,				\
			     RBSTRUCT, RBFIELD, RBAUGMENTED, RBCOMPUTE)	\
static inline void							\
RBNAME ## _propagate(struct rb_node *rb, struct rb_node *stop)		\
{									\
	while (rb != stop) {						\
		RBSTRUCT *node = rb_entry(rb, RBSTRUCT, RBFIELD);	\
		if (RBCOMPUTE(node, true))				\
			break;						\
		rb = rb_parent(&node->RBFIELD);				\
	}								\
}									\
static inline void							\
RBNAME ## _copy(struct rb_node *rb_old, struct rb_node *rb_new)		\
{									\
	RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD);		\
	RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD);		\
	new->RBAUGMENTED = old->RBAUGMENTED;				\
}									\
static void								\
RBNAME ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new)	\
{									\
	RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD);		\
	RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD);		\
	new->RBAUGMENTED = old->RBAUGMENTED;				\
	RBCOMPUTE(old, false);						\
}									\
RBSTATIC const struct rb_augment_callbacks RBNAME = {			\
	.propagate = RBNAME ## _propagate,				\
	.copy = RBNAME ## _copy,					\
	.rotate = RBNAME ## _rotate					\
};

/*
 * Template for declaring augmented rbtree callbacks,
 * computing RBAUGMENTED scalar as max(RBCOMPUTE(node)) for all subtree nodes.
 *
 * RBSTATIC:    'static' or empty
 * RBNAME:      name of the rb_augment_callbacks structure
 * RBSTRUCT:    struct type of the tree nodes
 * RBFIELD:     name of struct rb_node field within RBSTRUCT
 * RBTYPE:      type of the RBAUGMENTED field
 * RBAUGMENTED: name of RBTYPE field within RBSTRUCT holding data for subtree
 * RBCOMPUTE:   name of function that returns the per-node RBTYPE scalar
 */

#define RB_DECLARE_CALLBACKS_MAX(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD,	      \
				 RBTYPE, RBAUGMENTED, RBCOMPUTE)	      \
static inline bool RBNAME ## _compute_max(RBSTRUCT *node, bool exit)	      \
{									      \
	RBSTRUCT *child;						      \
	RBTYPE max = RBCOMPUTE(node);					      \
	if (node->RBFIELD.rb_left) {					      \
		child = rb_entry(node->RBFIELD.rb_left, RBSTRUCT, RBFIELD);  \
		if (child->RBAUGMENTED > max)				      \
			max = child->RBAUGMENTED;			      \
	}								      \
	if (node->RBFIELD.rb_right) {					      \
		child = rb_entry(node->RBFIELD.rb_right, RBSTRUCT, RBFIELD); \
		if (child->RBAUGMENTED > max)				      \
			max = child->RBAUGMENTED;			      \
	}								      \
	if (exit && node->RBAUGMENTED == max)				      \
		return true;						      \
	node->RBAUGMENTED = max;					      \
	return false;							      \
}									      \
RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME,					      \
		     RBSTRUCT, RBFIELD, RBAUGMENTED, RBNAME ## _compute_max)


#define RB_RED		0
#define RB_BLACK	1

#define __rb_parent(pc)    ((struct rb_node *)(pc & ~3))

#define __rb_color(pc)     ((pc) & 1)
#define __rb_is_black(pc)  __rb_color(pc)
#define __rb_is_red(pc)    (!__rb_color(pc))
#define rb_color(rb)       __rb_color((rb)->__rb_parent_color)
#define rb_is_red(rb)      __rb_is_red((rb)->__rb_parent_color)
#define rb_is_black(rb)    __rb_is_black((rb)->__rb_parent_color)

static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p)
{
	rb->__rb_parent_color = rb_color(rb) | (unsigned long)p;
}

static inline void rb_set_parent_color(struct rb_node *rb,
				       struct rb_node *p, int color)
{
	rb->__rb_parent_color = (unsigned long)p | color;
}

static inline void
__rb_change_child(struct rb_node *old, struct rb_node *new,
		  struct rb_node *parent, struct rb_root *root)
{
	if (parent) {
		if (parent->rb_left == old)
			WRITE_ONCE(parent->rb_left, new);
		else
			WRITE_ONCE(parent->rb_right, new);
	} else
		WRITE_ONCE(root->rb_node, new);
}

static inline void
__rb_change_child_rcu(struct rb_node *old, struct rb_node *new,
		      struct rb_node *parent, struct rb_root *root)
{
	if (parent) {
		if (parent->rb_left == old)
			rcu_assign_pointer(parent->rb_left, new);
		else
			rcu_assign_pointer(parent->rb_right, new);
	} else
		rcu_assign_pointer(root->rb_node, new);
}

extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root,
	void (*augment_rotate)(struct rb_node *old, struct rb_node *new));

static __always_inline struct rb_node *
__rb_erase_augmented(struct rb_node *node, struct rb_root *root,
		     const struct rb_augment_callbacks *augment)
{
	struct rb_node *child = node->rb_right;
	struct rb_node *tmp = node->rb_left;
	struct rb_node *parent, *rebalance;
	unsigned long pc;

	if (!tmp) {
		/*
		 * Case 1: node to erase has no more than 1 child (easy!)
		 *
		 * Note that if there is one child it must be red due to 5)
		 * and node must be black due to 4). We adjust colors locally
		 * so as to bypass __rb_erase_color() later on.
		 */
		pc = node->__rb_parent_color;
		parent = __rb_parent(pc);
		__rb_change_child(node, child, parent, root);
		if (child) {
			child->__rb_parent_color = pc;
			rebalance = NULL;
		} else
			rebalance = __rb_is_black(pc) ? parent : NULL;
		tmp = parent;
	} else if (!child) {
		/* Still case 1, but this time the child is node->rb_left */
		tmp->__rb_parent_color = pc = node->__rb_parent_color;
		parent = __rb_parent(pc);
		__rb_change_child(node, tmp, parent, root);
		rebalance = NULL;
		tmp = parent;
	} else {
		struct rb_node *successor = child, *child2;

		tmp = child->rb_left;
		if (!tmp) {
			/*
			 * Case 2: node's successor is its right child
			 *
			 *    (n)          (s)
			 *    / \          / \
			 *  (x) (s)  ->  (x) (c)
			 *        \
			 *        (c)
			 */
			parent = successor;
			child2 = successor->rb_right;

			augment->copy(node, successor);
		} else {
			/*
			 * Case 3: node's successor is leftmost under
			 * node's right child subtree
			 *
			 *    (n)          (s)
			 *    / \          / \
			 *  (x) (y)  ->  (x) (y)
			 *      /            /
			 *    (p)          (p)
			 *    /            /
			 *  (s)          (c)
			 *    \
			 *    (c)
			 */
			do {
				parent = successor;
				successor = tmp;
				tmp = tmp->rb_left;
			} while (tmp);
			child2 = successor->rb_right;
			WRITE_ONCE(parent->rb_left, child2);
			WRITE_ONCE(successor->rb_right, child);
			rb_set_parent(child, successor);

			augment->copy(node, successor);
			augment->propagate(parent, successor);
		}

		tmp = node->rb_left;
		WRITE_ONCE(successor->rb_left, tmp);
		rb_set_parent(tmp, successor);

		pc = node->__rb_parent_color;
		tmp = __rb_parent(pc);
		__rb_change_child(node, successor, tmp, root);

		if (child2) {
			rb_set_parent_color(child2, parent, RB_BLACK);
			rebalance = NULL;
		} else {
			rebalance = rb_is_black(successor) ? parent : NULL;
		}
		successor->__rb_parent_color = pc;
		tmp = successor;
	}

	augment->propagate(tmp, NULL);
	return rebalance;
}

static __always_inline void
rb_erase_augmented(struct rb_node *node, struct rb_root *root,
		   const struct rb_augment_callbacks *augment)
{
	struct rb_node *rebalance = __rb_erase_augmented(node, root, augment);
	if (rebalance)
		__rb_erase_color(rebalance, root, augment->rotate);
}

static __always_inline void
rb_erase_augmented_cached(struct rb_node *node, struct rb_root_cached *root,
			  const struct rb_augment_callbacks *augment)
{
	if (root->rb_leftmost == node)
		root->rb_leftmost = rb_next(node);
	rb_erase_augmented(node, &root->rb_root, augment);
}

#endif	/* _LINUX_RBTREE_AUGMENTED_H */
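Usage sketch (not part of this diff) of the augmented template; struct anode and its
fields are illustrative, and the descent manually maintains subtree_max on the way down
as the comment above rb_insert_augmented() requires:

	struct anode {
		struct rb_node rb;
		u64 val;		/* per-node scalar */
		u64 subtree_max;	/* max of val over this node's subtree */
	};

	#define ANODE_VAL(n) ((n)->val)

	RB_DECLARE_CALLBACKS_MAX(static, anode_cb, struct anode, rb,
				 u64, subtree_max, ANODE_VAL)

	static void anode_insert(struct anode *node, struct rb_root *root)
	{
		struct rb_node **link = &root->rb_node;
		struct rb_node *parent = NULL;
		struct anode *cur;

		node->subtree_max = node->val;
		while (*link) {
			parent = *link;
			cur = rb_entry(parent, struct anode, rb);
			/* keep the augmented max correct along the descent */
			if (cur->subtree_max < node->val)
				cur->subtree_max = node->val;
			if (node->val < cur->val)
				link = &parent->rb_left;
			else
				link = &parent->rb_right;
		}

		rb_link_node(&node->rb, parent, link);
		rb_insert_augmented(&node->rb, root, &anode_cb);
	}

Erase would use rb_erase_augmented(&node->rb, root, &anode_cb) so the generated
callbacks keep subtree_max consistent through rebalancing.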
utils/src/rbtree_types.h (new file, 34 lines)
@@ -0,0 +1,34 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _LINUX_RBTREE_TYPES_H
#define _LINUX_RBTREE_TYPES_H

struct rb_node {
	unsigned long  __rb_parent_color;
	struct rb_node *rb_right;
	struct rb_node *rb_left;
} __attribute__((aligned(sizeof(long))));
/* The alignment might seem pointless, but allegedly CRIS needs it */

struct rb_root {
	struct rb_node *rb_node;
};

/*
 * Leftmost-cached rbtrees.
 *
 * We do not cache the rightmost node based on footprint
 * size vs number of potential users that could benefit
 * from O(1) rb_last(). Just not worth it, users that want
 * this feature can always implement the logic explicitly.
 * Furthermore, users that want to cache both pointers may
 * find it a bit asymmetric, but that's ok.
 */
struct rb_root_cached {
	struct rb_root rb_root;
	struct rb_node *rb_leftmost;
};

#define RB_ROOT (struct rb_root) { NULL, }
#define RB_ROOT_CACHED (struct rb_root_cached) { {NULL, }, NULL }

#endif