Compare commits

..

1 Commits

Author SHA1 Message Date
Zach Brown a7ed6bf242 Add force to prepare-empty-data-device
Signed-off-by: Zach Brown <zab@versity.com>
2023-11-02 18:05:51 -07:00
50 changed files with 397 additions and 5449 deletions
-26
View File
@@ -1,32 +1,6 @@
Versity ScoutFS Release Notes
=============================
---
v1.19
\
*Jan 30, 2024*
Added the log\_merge\_wait\_timeout\_ms mount option to set the timeout
for creating log merge operations. The previous timeout, now the
default, was too short for some systems and was resulting in consistent
timeouts which created an excessive number of log trees waiting to be
merged.
Improved performance of many in-mount server operations when there are a
large number of log trees waiting to be merged.
---
v1.18
\
*Nov 7, 2023*
Fixed a bug where background srch file compaction could stop making
forward progress if a partial compaction operation was committed at a
specific byte offset in a block. This would cause srch file searches to
be progressively more expensive over time. Once this fix is running
background compaction will resume, bringing the cost of searches back
down.
---
v1.17
\
+3 -9
View File
@@ -12,22 +12,17 @@ else
SP = @:
endif
SCOUTFS_GIT_DESCRIBE ?= \
SCOUTFS_GIT_DESCRIBE := \
$(shell git describe --all --abbrev=6 --long 2>/dev/null || \
echo no-git)
ESCAPED_GIT_DESCRIBE := \
$(shell echo $(SCOUTFS_GIT_DESCRIBE) |sed -e 's/\//\\\//g')
RPM_GITHASH ?= $(shell git rev-parse --short HEAD)
SCOUTFS_ARGS := SCOUTFS_GIT_DESCRIBE=$(SCOUTFS_GIT_DESCRIBE) \
RPM_GITHASH=$(RPM_GITHASH) \
CONFIG_SCOUTFS_FS=m -C $(SK_KSRC) M=$(CURDIR)/src \
EXTRA_CFLAGS="-Werror"
# - We use the git describe from tags to set up the RPM versioning
RPM_VERSION := $(shell git describe --long --tags | awk -F '-' '{gsub(/^v/,""); print $$1}')
RPM_GITHASH := $(shell git rev-parse --short HEAD)
TARFILE = scoutfs-kmod-$(RPM_VERSION).tar
@@ -46,8 +41,7 @@ modules_install:
%.spec: %.spec.in .FORCE
sed -e 's/@@VERSION@@/$(RPM_VERSION)/g' \
-e 's/@@GITHASH@@/$(RPM_GITHASH)/g' \
-e 's/@@GITDESCRIBE@@/$(ESCAPED_GIT_DESCRIBE)/g' < $< > $@+
-e 's/@@GITHASH@@/$(RPM_GITHASH)/g' < $< > $@+
mv $@+ $@
+2 -14
View File
@@ -1,7 +1,6 @@
%define kmod_name scoutfs
%define kmod_version @@VERSION@@
%define kmod_git_hash @@GITHASH@@
%define kmod_git_describe @@GITDESCRIBE@@
%define pkg_date %(date +%%Y%%m%%d)
# Disable the building of the debug package(s).
@@ -76,7 +75,7 @@ echo "Building for kernel: %{kernel_version} flavors: '%{flavors_to_build}'"
for flavor in %flavors_to_build; do
rm -rf obj/$flavor
cp -r source obj/$flavor
make RPM_GITHASH=%{kmod_git_hash} SCOUTFS_GIT_DESCRIBE=%{kmod_git_describe} SK_KSRC=%{kernel_source $flavor} -C obj/$flavor module
make SK_KSRC=%{kernel_source $flavor} -C obj/$flavor module
done
%install
@@ -98,21 +97,10 @@ find %{buildroot} -type f -name \*.ko -exec %{__chmod} u+x \{\} \;
/lib/modules
%post
echo /lib/modules/%{kversion}/%{install_mod_dir}/scoutfs.ko | weak-modules --add-modules --no-initramfs
weak-modules --add-kernel --no-initramfs
depmod -a
%endif
%clean
rm -rf %{buildroot}
%preun
# stash our modules for postun cleanup
SCOUTFS_RPM_NAME=$(rpm -q %{name} | grep "%{version}-%{release}")
rpm -ql $SCOUTFS_RPM_NAME | grep '\.ko$' > /var/run/%{name}-modules-%{version}-%{release} || true
%postun
if [ -x /sbin/weak-modules ]; then
cat /var/run/%{name}-modules-%{version}-%{release} | /sbin/weak-modules --remove-modules --no-initramfs
fi
rm /var/run/%{name}-modules-%{version}-%{release} || true
+185 -260
View File
@@ -2029,253 +2029,187 @@ int scoutfs_btree_rebalance(struct super_block *sb,
key, SCOUTFS_BTREE_MAX_VAL_LEN, NULL, NULL, NULL);
}
struct merged_range {
struct scoutfs_key start;
struct scoutfs_key end;
struct rb_root root;
int size;
};
struct merged_item {
struct merge_pos {
struct rb_node node;
struct scoutfs_key key;
struct scoutfs_btree_root *root;
struct scoutfs_block *bl;
struct scoutfs_btree_block *bt;
struct scoutfs_avl_node *avl;
struct scoutfs_key *key;
u64 seq;
u8 flags;
unsigned int val_len;
u8 val[0];
u8 *val;
};
static inline struct merged_item *mitem_container(struct rb_node *node)
static struct merge_pos *first_mpos(struct rb_root *root)
{
return node ? container_of(node, struct merged_item, node) : NULL;
}
static inline struct merged_item *first_mitem(struct rb_root *root)
{
return mitem_container(rb_first(root));
}
static inline struct merged_item *last_mitem(struct rb_root *root)
{
return mitem_container(rb_last(root));
}
static inline struct merged_item *next_mitem(struct merged_item *mitem)
{
return mitem_container(mitem ? rb_next(&mitem->node) : NULL);
}
static inline struct merged_item *prev_mitem(struct merged_item *mitem)
{
return mitem_container(mitem ? rb_prev(&mitem->node) : NULL);
}
static struct merged_item *find_mitem(struct rb_root *root, struct scoutfs_key *key,
struct rb_node **parent_ret, struct rb_node ***link_ret)
{
struct rb_node **node = &root->rb_node;
struct rb_node *parent = NULL;
struct merged_item *mitem;
int cmp;
while (*node) {
parent = *node;
mitem = container_of(*node, struct merged_item, node);
cmp = scoutfs_key_compare(key, &mitem->key);
if (cmp < 0) {
node = &(*node)->rb_left;
} else if (cmp > 0) {
node = &(*node)->rb_right;
} else {
*parent_ret = NULL;
*link_ret = NULL;
return mitem;
}
}
*parent_ret = parent;
*link_ret = node;
struct rb_node *node = rb_first(root);
if (node)
return container_of(node, struct merge_pos, node);
return NULL;
}
static void insert_mitem(struct merged_range *rng, struct merged_item *mitem,
struct rb_node *parent, struct rb_node **link)
static struct merge_pos *next_mpos(struct merge_pos *mpos)
{
rb_link_node(&mitem->node, parent, link);
rb_insert_color(&mitem->node, &rng->root);
rng->size += item_len_bytes(mitem->val_len);
struct rb_node *node;
if (mpos && (node = rb_next(&mpos->node)))
return container_of(node, struct merge_pos, node);
else
return NULL;
}
static void replace_mitem(struct merged_range *rng, struct merged_item *victim,
struct merged_item *new)
static void free_mpos(struct super_block *sb, struct merge_pos *mpos)
{
rb_replace_node(&victim->node, &new->node, &rng->root);
RB_CLEAR_NODE(&victim->node);
rng->size -= item_len_bytes(victim->val_len);
rng->size += item_len_bytes(new->val_len);
scoutfs_block_put(sb, mpos->bl);
kfree(mpos);
}
static void free_mitem(struct merged_range *rng, struct merged_item *mitem)
static void insert_mpos(struct rb_root *pos_root, struct merge_pos *ins)
{
if (IS_ERR_OR_NULL(mitem))
return;
struct rb_node **node = &pos_root->rb_node;
struct rb_node *parent = NULL;
struct merge_pos *mpos;
int cmp;
if (!RB_EMPTY_NODE(&mitem->node)) {
rng->size -= item_len_bytes(mitem->val_len);
rb_erase(&mitem->node, &rng->root);
parent = NULL;
while (*node) {
parent = *node;
mpos = container_of(*node, struct merge_pos, node);
/* sort merge items by key then newest to oldest */
cmp = scoutfs_key_compare(ins->key, mpos->key) ?:
-scoutfs_cmp(ins->seq, mpos->seq);
if (cmp < 0)
node = &(*node)->rb_left;
else
node = &(*node)->rb_right;
}
kfree(mitem);
}
static void trim_range_size(struct merged_range *rng, int merge_window)
{
struct merged_item *mitem;
struct merged_item *tmp;
mitem = last_mitem(&rng->root);
while (mitem && rng->size > merge_window) {
rng->end = mitem->key;
scoutfs_key_dec(&rng->end);
tmp = mitem;
mitem = prev_mitem(mitem);
free_mitem(rng, tmp);
}
}
static void trim_range_end(struct merged_range *rng)
{
struct merged_item *mitem;
struct merged_item *tmp;
mitem = last_mitem(&rng->root);
while (mitem && scoutfs_key_compare(&mitem->key, &rng->end) > 0) {
tmp = mitem;
mitem = prev_mitem(mitem);
free_mitem(rng, tmp);
}
rb_link_node(&ins->node, parent, node);
rb_insert_color(&ins->node, pos_root);
}
/*
* Record and combine logged items from log roots for merging with the
* writable destination root. The caller is responsible for trimming
* the range if it gets too large or if the key range shrinks.
* Find the next item in the merge_pos root in the caller's range and
* insert it into the rbtree sorted by key and version so that merging
* can find the next newest item at the front of the rbtree. We free
* the mpos on error or if there are no more items in the range.
*/
static int merge_read_item(struct super_block *sb, struct scoutfs_key *key, u64 seq, u8 flags,
void *val, int val_len, void *arg)
static int reset_mpos(struct super_block *sb, struct rb_root *pos_root, struct merge_pos *mpos,
struct scoutfs_key *start, struct scoutfs_key *end)
{
struct merged_range *rng = arg;
struct merged_item *mitem;
struct merged_item *found;
struct rb_node *parent;
struct rb_node **link;
int ret;
struct scoutfs_btree_item *item;
struct scoutfs_avl_node *next;
struct btree_walk_key_range kr;
struct scoutfs_key walk_key;
int ret = 0;
found = find_mitem(&rng->root, key, &parent, &link);
if (found) {
ret = scoutfs_forest_combine_deltas(key, found->val, found->val_len, val, val_len);
if (ret < 0)
goto out;
if (ret > 0) {
if (ret == SCOUTFS_DELTA_COMBINED) {
scoutfs_inc_counter(sb, btree_merge_delta_combined);
} else if (ret == SCOUTFS_DELTA_COMBINED_NULL) {
scoutfs_inc_counter(sb, btree_merge_delta_null);
free_mitem(rng, found);
}
ret = 0;
goto out;
}
if (found->seq >= seq) {
ret = 0;
goto out;
}
/* always erase before freeing or inserting */
if (!RB_EMPTY_NODE(&mpos->node)) {
rb_erase(&mpos->node, pos_root);
RB_CLEAR_NODE(&mpos->node);
}
mitem = kmalloc(offsetof(struct merged_item, val[val_len]), GFP_NOFS);
if (!mitem) {
ret = -ENOMEM;
/*
* advance to next item via the avl tree. The caller's pos is
* only ever incremented past the last key so we can use next to
* iterate rather than using search to skip past multiple items.
*/
if (mpos->avl)
mpos->avl = scoutfs_avl_next(&mpos->bt->item_root, mpos->avl);
/* find the next leaf with the key if we run out of items */
walk_key = *start;
while (!mpos->avl && !scoutfs_key_is_zeros(&walk_key)) {
scoutfs_block_put(sb, mpos->bl);
mpos->bl = NULL;
ret = btree_walk(sb, NULL, NULL, mpos->root, BTW_NEXT, &walk_key,
0, &mpos->bl, &kr, NULL);
if (ret < 0) {
if (ret == -ENOENT)
ret = 0;
free_mpos(sb, mpos);
goto out;
}
mpos->bt = mpos->bl->data;
mpos->avl = scoutfs_avl_search(&mpos->bt->item_root, cmp_key_item,
start, NULL, NULL, &next, NULL) ?: next;
if (mpos->avl == NULL)
walk_key = kr.iter_next;
}
/* see if we're out of items within the range */
item = node_item(mpos->avl);
if (!item || scoutfs_key_compare(item_key(item), end) > 0) {
free_mpos(sb, mpos);
ret = 0;
goto out;
}
mitem->key = *key;
mitem->seq = seq;
mitem->flags = flags;
mitem->val_len = val_len;
if (val_len)
memcpy(mitem->val, val, val_len);
if (found) {
replace_mitem(rng, found, mitem);
free_mitem(rng, found);
} else {
insert_mitem(rng, mitem, parent, link);
}
/* insert the next item within range at its version */
mpos->key = item_key(item);
mpos->seq = le64_to_cpu(item->seq);
mpos->flags = item->flags;
mpos->val_len = item_val_len(item);
mpos->val = item_val(mpos->bt, item);
insert_mpos(pos_root, mpos);
ret = 0;
out:
return ret;
}
/*
* Read a range of merged items. The caller has set the key bounds of
* the range. We read a merge window's worth of items from blocks in
* each input btree.
* The caller has reset all the merge positions for all the input log
* btree roots and wants the next logged item it should try and merge
* with the items in the fs_root.
*
* The caller can only use the smallest range that overlaps with all the
* blocks that we read. We start reading from the range's start key so
* it will always be present and we don't need to adjust it. The final
* block we read from each input might not cover the range's end so it
* needs to be adjusted.
*
* The end range can also shrink if we have to drop items because the
* items exceeded the merge window size.
* We look ahead in the logged item stream to see if we should merge any
* older logged delta items into one result for the caller. We also
* take this opportunity to skip and reset the mpos for any older
* versions of the first item.
*/
static int read_merged_range(struct super_block *sb, struct merged_range *rng,
struct list_head *inputs, int merge_window)
static int next_resolved_mpos(struct super_block *sb, struct rb_root *pos_root,
struct scoutfs_key *end, struct merge_pos **mpos_ret)
{
struct scoutfs_btree_root_head *rhead;
struct scoutfs_key start;
struct scoutfs_key end;
struct merge_pos *mpos;
struct merge_pos *next;
struct scoutfs_key key;
int ret = 0;
int i;
list_for_each_entry(rhead, inputs, head) {
key = rng->start;
while ((mpos = first_mpos(pos_root)) && (next = next_mpos(mpos)) &&
!scoutfs_key_compare(mpos->key, next->key)) {
for (i = 0; i < merge_window; i += SCOUTFS_BLOCK_LG_SIZE) {
start = key;
end = rng->end;
ret = scoutfs_btree_read_items(sb, &rhead->root, &key, &start, &end,
merge_read_item, rng);
ret = scoutfs_forest_combine_deltas(mpos->key, mpos->val, mpos->val_len,
next->val, next->val_len);
if (ret < 0)
break;
/* reset advances to the next item */
key = *mpos->key;
scoutfs_key_inc(&key);
/* always skip next combined or older version */
ret = reset_mpos(sb, pos_root, next, &key, end);
if (ret < 0)
break;
if (ret == SCOUTFS_DELTA_COMBINED) {
scoutfs_inc_counter(sb, btree_merge_delta_combined);
} else if (ret == SCOUTFS_DELTA_COMBINED_NULL) {
scoutfs_inc_counter(sb, btree_merge_delta_null);
/* if merging resulted in no info, skip current */
ret = reset_mpos(sb, pos_root, mpos, &key, end);
if (ret < 0)
goto out;
if (scoutfs_key_compare(&end, &rng->end) >= 0)
break;
key = end;
scoutfs_key_inc(&key);
}
if (scoutfs_key_compare(&end, &rng->end) < 0) {
rng->end = end;
trim_range_end(rng);
}
if (rng->size > merge_window)
trim_range_size(rng, merge_window);
}
trace_scoutfs_btree_merge_read_range(sb, &rng->start, &rng->end, rng->size);
ret = 0;
out:
*mpos_ret = mpos;
return ret;
}
@@ -2292,13 +2226,6 @@ out:
* to allocators running low or needing to join/split the parent.
* *next_ret is set to the next key which hasn't been merged so that the
* caller can retry with a new allocator and subtree.
*
* The number of input roots can be immense. The merge_window specifies
* the size of the set of merged items that we'll maintain as we iterate
* over all the input roots. Once we've merged items into the window
* from all the input roots the merged input items are then merged to
* the writable destination root. It may take multiple passes of
* windows of merged items to cover the input key range.
*/
int scoutfs_btree_merge(struct super_block *sb,
struct scoutfs_alloc *alloc,
@@ -2308,16 +2235,18 @@ int scoutfs_btree_merge(struct super_block *sb,
struct scoutfs_key *next_ret,
struct scoutfs_btree_root *root,
struct list_head *inputs,
bool subtree, int dirty_limit, int alloc_low, int merge_window)
bool subtree, int dirty_limit, int alloc_low)
{
struct scoutfs_btree_root_head *rhead;
struct rb_root pos_root = RB_ROOT;
struct scoutfs_btree_item *item;
struct scoutfs_btree_block *bt;
struct scoutfs_block *bl = NULL;
struct btree_walk_key_range kr;
struct scoutfs_avl_node *par;
struct merged_item *mitem;
struct merged_item *tmp;
struct merged_range rng;
struct scoutfs_key next;
struct merge_pos *mpos;
struct merge_pos *tmp;
int walk_val_len;
int walk_flags;
bool is_del;
@@ -2328,59 +2257,49 @@ int scoutfs_btree_merge(struct super_block *sb,
trace_scoutfs_btree_merge(sb, root, start, end);
scoutfs_inc_counter(sb, btree_merge);
list_for_each_entry(rhead, inputs, head) {
mpos = kzalloc(sizeof(*mpos), GFP_NOFS);
if (!mpos) {
ret = -ENOMEM;
goto out;
}
RB_CLEAR_NODE(&mpos->node);
mpos->root = &rhead->root;
ret = reset_mpos(sb, &pos_root, mpos, start, end);
if (ret < 0)
goto out;
}
walk_flags = BTW_DIRTY;
if (subtree)
walk_flags |= BTW_SUBTREE;
walk_val_len = 0;
rng.start = *start;
rng.end = *end;
rng.root = RB_ROOT;
rng.size = 0;
ret = read_merged_range(sb, &rng, inputs, merge_window);
if (ret < 0)
goto out;
for (;;) {
/* read next window as it empties (and it is possible to read an empty range) */
mitem = first_mitem(&rng.root);
if (!mitem) {
/* done if the read range hit the end */
if (scoutfs_key_compare(&rng.end, end) >= 0)
break;
/* read next batch of merged items */
rng.start = rng.end;
scoutfs_key_inc(&rng.start);
rng.end = *end;
ret = read_merged_range(sb, &rng, inputs, merge_window);
if (ret < 0)
break;
continue;
}
while ((ret = next_resolved_mpos(sb, &pos_root, end, &mpos)) == 0 && mpos) {
if (scoutfs_block_writer_dirty_bytes(sb, wri) >= dirty_limit) {
scoutfs_inc_counter(sb, btree_merge_dirty_limit);
ret = -ERANGE;
*next_ret = mitem->key;
*next_ret = *mpos->key;
goto out;
}
if (scoutfs_alloc_meta_low(sb, alloc, alloc_low)) {
scoutfs_inc_counter(sb, btree_merge_alloc_low);
ret = -ERANGE;
*next_ret = mitem->key;
*next_ret = *mpos->key;
goto out;
}
scoutfs_block_put(sb, bl);
bl = NULL;
ret = btree_walk(sb, alloc, wri, root, walk_flags,
&mitem->key, walk_val_len, &bl, &kr, NULL);
mpos->key, walk_val_len, &bl, &kr, NULL);
if (ret < 0) {
if (ret == -ERANGE)
*next_ret = mitem->key;
*next_ret = *mpos->key;
goto out;
}
bt = bl->data;
@@ -2392,21 +2311,22 @@ int scoutfs_btree_merge(struct super_block *sb,
continue;
}
while (mitem) {
while ((ret = next_resolved_mpos(sb, &pos_root, end, &mpos)) == 0 && mpos) {
/* walk to new leaf if we exceed parent ref key */
if (scoutfs_key_compare(&mitem->key, &kr.end) > 0)
if (scoutfs_key_compare(mpos->key, &kr.end) > 0)
break;
/* see if there's an existing item */
item = leaf_item_hash_search(sb, bt, &mitem->key);
is_del = !!(mitem->flags & SCOUTFS_ITEM_FLAG_DELETION);
item = leaf_item_hash_search(sb, bt, mpos->key);
is_del = !!(mpos->flags & SCOUTFS_ITEM_FLAG_DELETION);
/* see if we're merging delta items */
if (item && !is_del)
delta = scoutfs_forest_combine_deltas(&mitem->key,
delta = scoutfs_forest_combine_deltas(mpos->key,
item_val(bt, item),
item_val_len(item),
mitem->val, mitem->val_len);
mpos->val, mpos->val_len);
else
delta = 0;
if (delta < 0) {
@@ -2418,38 +2338,40 @@ int scoutfs_btree_merge(struct super_block *sb,
scoutfs_inc_counter(sb, btree_merge_delta_null);
}
trace_scoutfs_btree_merge_items(sb, &mitem->key, mitem->val_len,
trace_scoutfs_btree_merge_items(sb, mpos->root,
mpos->key, mpos->val_len,
item ? root : NULL,
item ? item_key(item) : NULL,
item ? item_val_len(item) : 0, is_del);
/* rewalk and split if ins/update needs room */
if (!is_del && !delta && !mid_free_item_room(bt, mitem->val_len)) {
if (!is_del && !delta && !mid_free_item_room(bt, mpos->val_len)) {
walk_flags |= BTW_INSERT;
walk_val_len = mitem->val_len;
walk_val_len = mpos->val_len;
break;
}
/* insert missing non-deletion merge items */
if (!item && !is_del) {
scoutfs_avl_search(&bt->item_root, cmp_key_item, &mitem->key,
scoutfs_avl_search(&bt->item_root,
cmp_key_item, mpos->key,
&cmp, &par, NULL, NULL);
create_item(bt, &mitem->key, mitem->seq, mitem->flags,
mitem->val, mitem->val_len, par, cmp);
create_item(bt, mpos->key, mpos->seq, mpos->flags,
mpos->val, mpos->val_len, par, cmp);
scoutfs_inc_counter(sb, btree_merge_insert);
}
/* update existing items */
if (item && !is_del && !delta) {
item->seq = cpu_to_le64(mitem->seq);
item->flags = mitem->flags;
update_item_value(bt, item, mitem->val, mitem->val_len);
item->seq = cpu_to_le64(mpos->seq);
item->flags = mpos->flags;
update_item_value(bt, item, mpos->val, mpos->val_len);
scoutfs_inc_counter(sb, btree_merge_update);
}
/* update combined delta item seq */
if (delta == SCOUTFS_DELTA_COMBINED) {
item->seq = cpu_to_le64(mitem->seq);
item->seq = cpu_to_le64(mpos->seq);
}
/*
@@ -2481,18 +2403,21 @@ int scoutfs_btree_merge(struct super_block *sb,
walk_flags &= ~(BTW_INSERT | BTW_DELETE);
walk_val_len = 0;
/* finished with this merged item */
tmp = mitem;
mitem = next_mitem(mitem);
free_mitem(&rng, tmp);
/* finished with this key, skip any older items */
next = *mpos->key;
scoutfs_key_inc(&next);
ret = reset_mpos(sb, &pos_root, mpos, &next, end);
if (ret < 0)
goto out;
}
}
ret = 0;
out:
scoutfs_block_put(sb, bl);
rbtree_postorder_for_each_entry_safe(mitem, tmp, &rng.root, node)
free_mitem(&rng, mitem);
rbtree_postorder_for_each_entry_safe(mpos, tmp, &pos_root, node) {
free_mpos(sb, mpos);
}
return ret;
}
+1 -1
View File
@@ -119,7 +119,7 @@ int scoutfs_btree_merge(struct super_block *sb,
struct scoutfs_key *next_ret,
struct scoutfs_btree_root *root,
struct list_head *input_list,
bool subtree, int dirty_limit, int alloc_low, int merge_window);
bool subtree, int dirty_limit, int alloc_low);
int scoutfs_btree_free_blocks(struct super_block *sb,
struct scoutfs_alloc *alloc,
-1
View File
@@ -145,7 +145,6 @@
EXPAND_COUNTER(lock_shrink_work) \
EXPAND_COUNTER(lock_unlock) \
EXPAND_COUNTER(lock_wait) \
EXPAND_COUNTER(log_merge_wait_timeout) \
EXPAND_COUNTER(net_dropped_response) \
EXPAND_COUNTER(net_send_bytes) \
EXPAND_COUNTER(net_send_error) \
+1 -2
View File
@@ -721,8 +721,7 @@ static void scoutfs_forest_log_merge_worker(struct work_struct *work)
ret = scoutfs_btree_merge(sb, &alloc, &wri, &req.start, &req.end,
&next, &comp.root, &inputs,
!!(req.flags & cpu_to_le64(SCOUTFS_LOG_MERGE_REQUEST_SUBTREE)),
SCOUTFS_LOG_MERGE_DIRTY_BYTE_LIMIT, 10,
(2 * 1024 * 1024));
SCOUTFS_LOG_MERGE_DIRTY_BYTE_LIMIT, 10);
if (ret == -ERANGE) {
comp.remain = next;
le64_add_cpu(&comp.flags, SCOUTFS_LOG_MERGE_COMP_REMAIN);
-68
View File
@@ -33,7 +33,6 @@ enum {
Opt_acl,
Opt_data_prealloc_blocks,
Opt_data_prealloc_contig_only,
Opt_log_merge_wait_timeout_ms,
Opt_metadev_path,
Opt_noacl,
Opt_orphan_scan_delay_ms,
@@ -46,7 +45,6 @@ static const match_table_t tokens = {
{Opt_acl, "acl"},
{Opt_data_prealloc_blocks, "data_prealloc_blocks=%s"},
{Opt_data_prealloc_contig_only, "data_prealloc_contig_only=%s"},
{Opt_log_merge_wait_timeout_ms, "log_merge_wait_timeout_ms=%s"},
{Opt_metadev_path, "metadev_path=%s"},
{Opt_noacl, "noacl"},
{Opt_orphan_scan_delay_ms, "orphan_scan_delay_ms=%s"},
@@ -115,10 +113,6 @@ static void free_options(struct scoutfs_mount_options *opts)
kfree(opts->metadev_path);
}
#define MIN_LOG_MERGE_WAIT_TIMEOUT_MS 100UL
#define DEFAULT_LOG_MERGE_WAIT_TIMEOUT_MS 500
#define MAX_LOG_MERGE_WAIT_TIMEOUT_MS (60 * MSEC_PER_SEC)
#define MIN_ORPHAN_SCAN_DELAY_MS 100UL
#define DEFAULT_ORPHAN_SCAN_DELAY_MS (10 * MSEC_PER_SEC)
#define MAX_ORPHAN_SCAN_DELAY_MS (60 * MSEC_PER_SEC)
@@ -132,27 +126,11 @@ static void init_default_options(struct scoutfs_mount_options *opts)
opts->data_prealloc_blocks = SCOUTFS_DATA_PREALLOC_DEFAULT_BLOCKS;
opts->data_prealloc_contig_only = 1;
opts->log_merge_wait_timeout_ms = DEFAULT_LOG_MERGE_WAIT_TIMEOUT_MS;
opts->orphan_scan_delay_ms = -1;
opts->quorum_heartbeat_timeout_ms = SCOUTFS_QUORUM_DEF_HB_TIMEO_MS;
opts->quorum_slot_nr = -1;
}
static int verify_log_merge_wait_timeout_ms(struct super_block *sb, int ret, int val)
{
if (ret < 0) {
scoutfs_err(sb, "failed to parse log_merge_wait_timeout_ms value");
return -EINVAL;
}
if (val < MIN_LOG_MERGE_WAIT_TIMEOUT_MS || val > MAX_LOG_MERGE_WAIT_TIMEOUT_MS) {
scoutfs_err(sb, "invalid log_merge_wait_timeout_ms value %d, must be between %lu and %lu",
val, MIN_LOG_MERGE_WAIT_TIMEOUT_MS, MAX_LOG_MERGE_WAIT_TIMEOUT_MS);
return -EINVAL;
}
return 0;
}
static int verify_quorum_heartbeat_timeout_ms(struct super_block *sb, int ret, u64 val)
{
if (ret < 0) {
@@ -218,14 +196,6 @@ static int parse_options(struct super_block *sb, char *options, struct scoutfs_m
opts->data_prealloc_contig_only = nr;
break;
case Opt_log_merge_wait_timeout_ms:
ret = match_int(args, &nr);
ret = verify_log_merge_wait_timeout_ms(sb, ret, nr);
if (ret < 0)
return ret;
opts->log_merge_wait_timeout_ms = nr;
break;
case Opt_metadev_path:
ret = parse_bdev_path(sb, &args[0], &opts->metadev_path);
if (ret < 0)
@@ -452,43 +422,6 @@ static ssize_t data_prealloc_contig_only_store(struct kobject *kobj, struct kobj
}
SCOUTFS_ATTR_RW(data_prealloc_contig_only);
static ssize_t log_merge_wait_timeout_ms_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
struct scoutfs_mount_options opts;
scoutfs_options_read(sb, &opts);
return snprintf(buf, PAGE_SIZE, "%u", opts.log_merge_wait_timeout_ms);
}
static ssize_t log_merge_wait_timeout_ms_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t count)
{
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
DECLARE_OPTIONS_INFO(sb, optinf);
char nullterm[30]; /* more than enough for octal -U64_MAX */
int val;
int len;
int ret;
len = min(count, sizeof(nullterm) - 1);
memcpy(nullterm, buf, len);
nullterm[len] = '\0';
ret = kstrtoint(nullterm, 0, &val);
ret = verify_log_merge_wait_timeout_ms(sb, ret, val);
if (ret == 0) {
write_seqlock(&optinf->seqlock);
optinf->opts.log_merge_wait_timeout_ms = val;
write_sequnlock(&optinf->seqlock);
ret = count;
}
return ret;
}
SCOUTFS_ATTR_RW(log_merge_wait_timeout_ms);
static ssize_t metadev_path_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
@@ -592,7 +525,6 @@ SCOUTFS_ATTR_RO(quorum_slot_nr);
static struct attribute *options_attrs[] = {
SCOUTFS_ATTR_PTR(data_prealloc_blocks),
SCOUTFS_ATTR_PTR(data_prealloc_contig_only),
SCOUTFS_ATTR_PTR(log_merge_wait_timeout_ms),
SCOUTFS_ATTR_PTR(metadev_path),
SCOUTFS_ATTR_PTR(orphan_scan_delay_ms),
SCOUTFS_ATTR_PTR(quorum_heartbeat_timeout_ms),
-1
View File
@@ -8,7 +8,6 @@
struct scoutfs_mount_options {
u64 data_prealloc_blocks;
bool data_prealloc_contig_only;
unsigned int log_merge_wait_timeout_ms;
char *metadev_path;
unsigned int orphan_scan_delay_ms;
int quorum_slot_nr;
+14 -169
View File
@@ -439,7 +439,6 @@ DECLARE_EVENT_CLASS(scoutfs_trans_hold_release_class,
SCSB_TRACE_ASSIGN(sb);
__entry->journal_info = (unsigned long)journal_info;
__entry->holders = holders;
__entry->ret = ret;
),
TP_printk(SCSBF" journal_info 0x%0lx holders %d ret %d",
@@ -1747,41 +1746,21 @@ TRACE_EVENT(scoutfs_btree_merge,
sk_trace_args(end))
);
TRACE_EVENT(scoutfs_btree_merge_read_range,
TP_PROTO(struct super_block *sb, struct scoutfs_key *start, struct scoutfs_key *end,
int size),
TP_ARGS(sb, start, end, size),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
sk_trace_define(start)
sk_trace_define(end)
__field(int, size)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
sk_trace_assign(start, start);
sk_trace_assign(end, end);
__entry->size = size;
),
TP_printk(SCSBF" start "SK_FMT" end "SK_FMT" size %d",
SCSB_TRACE_ARGS, sk_trace_args(start), sk_trace_args(end), __entry->size)
);
TRACE_EVENT(scoutfs_btree_merge_items,
TP_PROTO(struct super_block *sb,
struct scoutfs_btree_root *m_root,
struct scoutfs_key *m_key, int m_val_len,
struct scoutfs_btree_root *f_root,
struct scoutfs_key *f_key, int f_val_len,
int is_del),
TP_ARGS(sb, m_key, m_val_len, f_root, f_key, f_val_len, is_del),
TP_ARGS(sb, m_root, m_key, m_val_len, f_root, f_key, f_val_len, is_del),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, m_root_blkno)
__field(__u64, m_root_seq)
__field(__u8, m_root_height)
sk_trace_define(m_key)
__field(int, m_val_len)
__field(__u64, f_root_blkno)
@@ -1794,6 +1773,10 @@ TRACE_EVENT(scoutfs_btree_merge_items,
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->m_root_blkno = m_root ?
le64_to_cpu(m_root->ref.blkno) : 0;
__entry->m_root_seq = m_root ? le64_to_cpu(m_root->ref.seq) : 0;
__entry->m_root_height = m_root ? m_root->height : 0;
sk_trace_assign(m_key, m_key);
__entry->m_val_len = m_val_len;
__entry->f_root_blkno = f_root ?
@@ -1805,9 +1788,11 @@ TRACE_EVENT(scoutfs_btree_merge_items,
__entry->is_del = !!is_del;
),
TP_printk(SCSBF" merge item key "SK_FMT" val_len %d, fs item root blkno %llu seq %llu height %u key "SK_FMT" val_len %d, is_del %d",
SCSB_TRACE_ARGS, sk_trace_args(m_key), __entry->m_val_len,
__entry->f_root_blkno, __entry->f_root_seq, __entry->f_root_height,
TP_printk(SCSBF" merge item root blkno %llu seq %llu height %u key "SK_FMT" val_len %d, fs item root blkno %llu seq %llu height %u key "SK_FMT" val_len %d, is_del %d",
SCSB_TRACE_ARGS, __entry->m_root_blkno, __entry->m_root_seq,
__entry->m_root_height, sk_trace_args(m_key),
__entry->m_val_len, __entry->f_root_blkno,
__entry->f_root_seq, __entry->f_root_height,
sk_trace_args(f_key), __entry->f_val_len, __entry->is_del)
);
@@ -2090,71 +2075,6 @@ TRACE_EVENT(scoutfs_trans_seq_last,
SCSB_TRACE_ARGS, __entry->s_rid, __entry->trans_seq)
);
TRACE_EVENT(scoutfs_server_finalize_items,
TP_PROTO(struct super_block *sb, u64 rid, u64 item_rid, u64 item_nr, u64 item_flags,
u64 item_get_trans_seq),
TP_ARGS(sb, rid, item_rid, item_nr, item_flags, item_get_trans_seq),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, c_rid)
__field(__u64, item_rid)
__field(__u64, item_nr)
__field(__u64, item_flags)
__field(__u64, item_get_trans_seq)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->c_rid = rid;
__entry->item_rid = item_rid;
__entry->item_nr = item_nr;
__entry->item_flags = item_flags;
__entry->item_get_trans_seq = item_get_trans_seq;
),
TP_printk(SCSBF" rid %016llx item_rid %016llx item_nr %llu item_flags 0x%llx item_get_trans_seq %llu",
SCSB_TRACE_ARGS, __entry->c_rid, __entry->item_rid, __entry->item_nr,
__entry->item_flags, __entry->item_get_trans_seq)
);
TRACE_EVENT(scoutfs_server_finalize_decision,
TP_PROTO(struct super_block *sb, u64 rid, bool saw_finalized, bool others_active,
bool ours_visible, bool finalize_ours, unsigned int delay_ms,
u64 finalize_sent_seq),
TP_ARGS(sb, rid, saw_finalized, others_active, ours_visible, finalize_ours, delay_ms,
finalize_sent_seq),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, c_rid)
__field(bool, saw_finalized)
__field(bool, others_active)
__field(bool, ours_visible)
__field(bool, finalize_ours)
__field(unsigned int, delay_ms)
__field(__u64, finalize_sent_seq)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->c_rid = rid;
__entry->saw_finalized = saw_finalized;
__entry->others_active = others_active;
__entry->ours_visible = ours_visible;
__entry->finalize_ours = finalize_ours;
__entry->delay_ms = delay_ms;
__entry->finalize_sent_seq = finalize_sent_seq;
),
TP_printk(SCSBF" rid %016llx saw_finalized %u others_active %u ours_visible %u finalize_ours %u delay_ms %u finalize_sent_seq %llu",
SCSB_TRACE_ARGS, __entry->c_rid, __entry->saw_finalized, __entry->others_active,
__entry->ours_visible, __entry->finalize_ours, __entry->delay_ms,
__entry->finalize_sent_seq)
);
TRACE_EVENT(scoutfs_get_log_merge_status,
TP_PROTO(struct super_block *sb, u64 rid, struct scoutfs_key *next_range_key,
u64 nr_requests, u64 nr_complete, u64 seq),
@@ -2879,81 +2799,6 @@ TRACE_EVENT(scoutfs_omap_should_delete,
SCSB_TRACE_ARGS, __entry->ino, __entry->nlink, __entry->ret)
);
#define SSCF_FMT "[bo %llu bs %llu es %llu]"
#define SSCF_FIELDS(pref) \
__field(__u64, pref##_blkno) \
__field(__u64, pref##_blocks) \
__field(__u64, pref##_entries)
#define SSCF_ASSIGN(pref, sfl) \
__entry->pref##_blkno = le64_to_cpu((sfl)->ref.blkno); \
__entry->pref##_blocks = le64_to_cpu((sfl)->blocks); \
__entry->pref##_entries = le64_to_cpu((sfl)->entries);
#define SSCF_ENTRY_ARGS(pref) \
__entry->pref##_blkno, \
__entry->pref##_blocks, \
__entry->pref##_entries
DECLARE_EVENT_CLASS(scoutfs_srch_compact_class,
TP_PROTO(struct super_block *sb, struct scoutfs_srch_compact *sc),
TP_ARGS(sb, sc),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, id)
__field(__u8, nr)
__field(__u8, flags)
SSCF_FIELDS(out)
__field(__u64, in0_blk)
__field(__u64, in0_pos)
SSCF_FIELDS(in0)
__field(__u64, in1_blk)
__field(__u64, in1_pos)
SSCF_FIELDS(in1)
__field(__u64, in2_blk)
__field(__u64, in2_pos)
SSCF_FIELDS(in2)
__field(__u64, in3_blk)
__field(__u64, in3_pos)
SSCF_FIELDS(in3)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->id = le64_to_cpu(sc->id);
__entry->nr = sc->nr;
__entry->flags = sc->flags;
SSCF_ASSIGN(out, &sc->out)
__entry->in0_blk = le64_to_cpu(sc->in[0].blk);
__entry->in0_pos = le64_to_cpu(sc->in[0].pos);
SSCF_ASSIGN(in0, &sc->in[0].sfl)
__entry->in1_blk = le64_to_cpu(sc->in[0].blk);
__entry->in1_pos = le64_to_cpu(sc->in[0].pos);
SSCF_ASSIGN(in1, &sc->in[1].sfl)
__entry->in2_blk = le64_to_cpu(sc->in[0].blk);
__entry->in2_pos = le64_to_cpu(sc->in[0].pos);
SSCF_ASSIGN(in2, &sc->in[2].sfl)
__entry->in3_blk = le64_to_cpu(sc->in[0].blk);
__entry->in3_pos = le64_to_cpu(sc->in[0].pos);
SSCF_ASSIGN(in3, &sc->in[3].sfl)
),
TP_printk(SCSBF" id %llu nr %u flags 0x%x out "SSCF_FMT" in0 b %llu p %llu "SSCF_FMT" in1 b %llu p %llu "SSCF_FMT" in2 b %llu p %llu "SSCF_FMT" in3 b %llu p %llu "SSCF_FMT,
SCSB_TRACE_ARGS, __entry->id, __entry->nr, __entry->flags, SSCF_ENTRY_ARGS(out),
__entry->in0_blk, __entry->in0_pos, SSCF_ENTRY_ARGS(in0),
__entry->in1_blk, __entry->in1_pos, SSCF_ENTRY_ARGS(in1),
__entry->in2_blk, __entry->in2_pos, SSCF_ENTRY_ARGS(in2),
__entry->in3_blk, __entry->in3_pos, SSCF_ENTRY_ARGS(in3))
);
DEFINE_EVENT(scoutfs_srch_compact_class, scoutfs_srch_compact_client_send,
TP_PROTO(struct super_block *sb, struct scoutfs_srch_compact *sc),
TP_ARGS(sb, sc)
);
DEFINE_EVENT(scoutfs_srch_compact_class, scoutfs_srch_compact_client_recv,
TP_PROTO(struct super_block *sb, struct scoutfs_srch_compact *sc),
TP_ARGS(sb, sc)
);
#endif /* _TRACE_SCOUTFS_H */
/* This part must be outside protection */
+61 -88
View File
@@ -148,8 +148,6 @@ struct server_info {
struct scoutfs_quorum_config qconf;
/* a running server maintains a private dirty super */
struct scoutfs_super_block dirty_super;
u64 finalize_sent_seq;
};
#define DECLARE_SERVER_INFO(sb, name) \
@@ -415,27 +413,6 @@ static void server_hold_commit(struct super_block *sb, struct commit_hold *hold)
wait_event(cusers->waitq, hold_commit(sb, server, cusers, hold));
}
/*
* Return the higher of the avail or freed used by the active commit
* since this holder joined the commit. This is *not* the amount used
* by the holder, we don't track per-holder alloc use.
*/
static u32 server_hold_alloc_used_since(struct super_block *sb, struct commit_hold *hold)
{
DECLARE_SERVER_INFO(sb, server);
u32 avail_used;
u32 freed_used;
u32 avail_now;
u32 freed_now;
scoutfs_alloc_meta_remaining(&server->alloc, &avail_now, &freed_now);
avail_used = hold->avail - avail_now;
freed_used = hold->freed - freed_now;
return max(avail_used, freed_used);
}
/*
* This is called while holding the commit and returns once the commit
* is successfully written. Many holders can all wait for all holders
@@ -961,24 +938,22 @@ static int find_log_trees_item(struct super_block *sb,
}
/*
* Find the log_trees item with the greatest nr for each rid. Fills the
* caller's log_trees and sets the key before the returned log_trees for
* the next iteration. Returns 0 when done, > 0 for each item, and
* -errno on fatal errors.
* Find the next log_trees item from the key. Fills the caller's log_trees and sets
* the key past the returned log_trees for iteration. Returns 0 when done, > 0 for each
* item, and -errno on fatal errors.
*/
static int for_each_rid_last_lt(struct super_block *sb, struct scoutfs_btree_root *root,
struct scoutfs_key *key, struct scoutfs_log_trees *lt)
static int for_each_lt(struct super_block *sb, struct scoutfs_btree_root *root,
struct scoutfs_key *key, struct scoutfs_log_trees *lt)
{
SCOUTFS_BTREE_ITEM_REF(iref);
int ret;
ret = scoutfs_btree_prev(sb, root, key, &iref);
ret = scoutfs_btree_next(sb, root, key, &iref);
if (ret == 0) {
if (iref.val_len == sizeof(struct scoutfs_log_trees)) {
memcpy(lt, iref.val, iref.val_len);
*key = *iref.key;
key->sklt_nr = 0;
scoutfs_key_dec(key);
scoutfs_key_inc(key);
ret = 1;
} else {
ret = -EIO;
@@ -1073,13 +1048,21 @@ static int next_log_merge_item(struct super_block *sb,
* abandoned log btree finalized. If it takes too long each client has
* a change to make forward progress before being asked to commit again.
*
* We're waiting on heavy state that is protected by mutexes and
* transaction machinery. It's tricky to recreate that state for
* lightweight condition tests that don't change task state. Instead of
* trying to get that right, particularly as we unwind after success or
* after timeouts, waiters use an unsatisfying poll. Short enough to
* not add terrible latency, given how heavy and infrequent this already
* is, and long enough to not melt the cpu. This could be tuned if it
* becomes a problem.
*
* This can end up finalizing a new empty log btree if a new mount
* happens to arrive at just the right time. That's fine, merging will
* ignore and tear down the empty input.
*/
#define FINALIZE_POLL_MIN_DELAY_MS 5U
#define FINALIZE_POLL_MAX_DELAY_MS 100U
#define FINALIZE_POLL_DELAY_GROWTH_PCT 150U
#define FINALIZE_POLL_MS (11)
#define FINALIZE_TIMEOUT_MS (MSEC_PER_SEC / 2)
static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_log_trees *lt,
u64 rid, struct commit_hold *hold)
{
@@ -1087,10 +1070,8 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
struct scoutfs_log_merge_status stat;
struct scoutfs_log_merge_range rng;
struct scoutfs_mount_options opts;
struct scoutfs_log_trees each_lt;
struct scoutfs_log_trees fin;
unsigned int delay_ms;
unsigned long timeo;
bool saw_finalized;
bool others_active;
@@ -1098,14 +1079,10 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
bool ours_visible;
struct scoutfs_key key;
char *err_str = NULL;
ktime_t start;
int ret;
int err;
scoutfs_options_read(sb, &opts);
timeo = jiffies + msecs_to_jiffies(opts.log_merge_wait_timeout_ms);
delay_ms = FINALIZE_POLL_MIN_DELAY_MS;
start = ktime_get_raw();
timeo = jiffies + msecs_to_jiffies(FINALIZE_TIMEOUT_MS);
for (;;) {
/* nothing to do if there's already a merge in flight */
@@ -1122,13 +1099,8 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
saw_finalized = false;
others_active = false;
ours_visible = false;
scoutfs_key_init_log_trees(&key, U64_MAX, U64_MAX);
while ((ret = for_each_rid_last_lt(sb, &super->logs_root, &key, &each_lt)) > 0) {
trace_scoutfs_server_finalize_items(sb, rid, le64_to_cpu(each_lt.rid),
le64_to_cpu(each_lt.nr),
le64_to_cpu(each_lt.flags),
le64_to_cpu(each_lt.get_trans_seq));
scoutfs_key_init_log_trees(&key, 0, 0);
while ((ret = for_each_lt(sb, &super->logs_root, &key, &each_lt)) > 0) {
if ((le64_to_cpu(each_lt.flags) & SCOUTFS_LOG_TREES_FINALIZED))
saw_finalized = true;
@@ -1153,10 +1125,6 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
finalize_ours = (lt->item_root.height > 2) ||
(le32_to_cpu(lt->meta_avail.flags) & SCOUTFS_ALLOC_FLAG_LOW);
trace_scoutfs_server_finalize_decision(sb, rid, saw_finalized, others_active,
ours_visible, finalize_ours, delay_ms,
server->finalize_sent_seq);
/* done if we're not finalizing and there's no finalized */
if (!finalize_ours && !saw_finalized) {
ret = 0;
@@ -1164,13 +1132,12 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
}
/* send sync requests soon to give time to commit */
scoutfs_key_init_log_trees(&key, U64_MAX, U64_MAX);
scoutfs_key_init_log_trees(&key, 0, 0);
while (others_active &&
(ret = for_each_rid_last_lt(sb, &super->logs_root, &key, &each_lt)) > 0) {
(ret = for_each_lt(sb, &super->logs_root, &key, &each_lt)) > 0) {
if ((le64_to_cpu(each_lt.flags) & SCOUTFS_LOG_TREES_FINALIZED) ||
(le64_to_cpu(each_lt.rid) == rid) ||
(le64_to_cpu(each_lt.get_trans_seq) <= server->finalize_sent_seq))
(le64_to_cpu(each_lt.rid) == rid))
continue;
ret = scoutfs_net_submit_request_node(sb, server->conn,
@@ -1190,8 +1157,6 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
break;
}
server->finalize_sent_seq = scoutfs_server_seq(sb);
/* Finalize ours if it's visible to others */
if (ours_visible) {
fin = *lt;
@@ -1229,16 +1194,13 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
if (ret < 0)
err_str = "applying commit before waiting for finalized";
msleep(delay_ms);
delay_ms = min(delay_ms * FINALIZE_POLL_DELAY_GROWTH_PCT / 100,
FINALIZE_POLL_MAX_DELAY_MS);
msleep(FINALIZE_POLL_MS);
server_hold_commit(sb, hold);
mutex_lock(&server->logs_mutex);
/* done if we timed out */
if (time_after(jiffies, timeo)) {
scoutfs_inc_counter(sb, log_merge_wait_timeout);
ret = 0;
break;
}
@@ -1821,29 +1783,43 @@ out:
* Give the caller the last seq before outstanding client commits. All
* seqs up to and including this are stable, new client transactions can
* only have greater seqs.
*
* For each rid, only its greatest log trees nr can be an open commit.
* We look at the last log_trees item for each client rid and record its
* trans seq if it hasn't been committed.
*/
static int get_stable_trans_seq(struct super_block *sb, u64 *last_seq_ret)
{
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
DECLARE_SERVER_INFO(sb, server);
struct scoutfs_log_trees lt;
SCOUTFS_BTREE_ITEM_REF(iref);
struct scoutfs_log_trees *lt;
struct scoutfs_key key;
u64 last_seq = 0;
int ret;
last_seq = scoutfs_server_seq(sb) - 1;
scoutfs_key_init_log_trees(&key, 0, 0);
mutex_lock(&server->logs_mutex);
scoutfs_key_init_log_trees(&key, U64_MAX, U64_MAX);
while ((ret = for_each_rid_last_lt(sb, &super->logs_root, &key, &lt)) > 0) {
if ((le64_to_cpu(lt.get_trans_seq) > le64_to_cpu(lt.commit_trans_seq)) &&
le64_to_cpu(lt.get_trans_seq) <= last_seq) {
last_seq = le64_to_cpu(lt.get_trans_seq) - 1;
for (;; scoutfs_key_inc(&key)) {
ret = scoutfs_btree_next(sb, &super->logs_root, &key, &iref);
if (ret == 0) {
if (iref.val_len == sizeof(*lt)) {
lt = iref.val;
if ((le64_to_cpu(lt->get_trans_seq) >
le64_to_cpu(lt->commit_trans_seq)) &&
le64_to_cpu(lt->get_trans_seq) <= last_seq) {
last_seq = le64_to_cpu(lt->get_trans_seq) - 1;
}
key = *iref.key;
} else {
ret = -EIO;
}
scoutfs_btree_put_iref(&iref);
}
if (ret < 0) {
if (ret == -ENOENT) {
ret = 0;
break;
}
}
}
@@ -1990,7 +1966,9 @@ static int server_srch_get_compact(struct super_block *sb,
ret = scoutfs_srch_get_compact(sb, &server->alloc, &server->wri,
&super->srch_root, rid, sc);
mutex_unlock(&server->srch_mutex);
if (ret < 0 || (ret == 0 && sc->nr == 0))
if (ret == 0 && sc->nr == 0)
ret = -ENOENT;
if (ret < 0)
goto apply;
mutex_lock(&server->alloc_mutex);
@@ -2495,11 +2473,9 @@ static void server_log_merge_free_work(struct work_struct *work)
while (!server_is_stopping(server)) {
if (!commit) {
server_hold_commit(sb, &hold);
mutex_lock(&server->logs_mutex);
commit = true;
}
server_hold_commit(sb, &hold);
mutex_lock(&server->logs_mutex);
commit = true;
ret = next_log_merge_item(sb, &super->log_merge,
SCOUTFS_LOG_MERGE_FREEING_ZONE,
@@ -2546,14 +2522,12 @@ static void server_log_merge_free_work(struct work_struct *work)
/* freed blocks are in allocator, we *have* to update fr */
BUG_ON(ret < 0);
if (server_hold_alloc_used_since(sb, &hold) >= COMMIT_HOLD_ALLOC_BUDGET / 2) {
mutex_unlock(&server->logs_mutex);
ret = server_apply_commit(sb, &hold, ret);
commit = false;
if (ret < 0) {
err_str = "looping commit del/upd freeing item";
break;
}
mutex_unlock(&server->logs_mutex);
ret = server_apply_commit(sb, &hold, ret);
commit = false;
if (ret < 0) {
err_str = "looping commit del/upd freeing item";
break;
}
}
@@ -4326,7 +4300,6 @@ static void scoutfs_server_worker(struct work_struct *work)
scoutfs_info(sb, "server starting at "SIN_FMT, SIN_ARG(&sin));
scoutfs_block_writer_init(sb, &server->wri);
server->finalize_sent_seq = 0;
/* first make sure no other servers are still running */
ret = scoutfs_quorum_fence_leaders(sb, &server->qconf, server->term);
+13 -188
View File
@@ -30,9 +30,6 @@
#include "client.h"
#include "counters.h"
#include "scoutfs_trace.h"
#include "triggers.h"
#include "sysfs.h"
#include "msg.h"
/*
* This srch subsystem gives us a way to find inodes that have a given
@@ -71,14 +68,10 @@ struct srch_info {
atomic_t shutdown;
struct workqueue_struct *workq;
struct delayed_work compact_dwork;
struct scoutfs_sysfs_attrs ssa;
atomic_t compact_delay_ms;
};
#define DECLARE_SRCH_INFO(sb, name) \
struct srch_info *name = SCOUTFS_SB(sb)->srch_info
#define DECLARE_SRCH_INFO_KOBJ(kobj, name) \
DECLARE_SRCH_INFO(SCOUTFS_SYSFS_ATTRS_SB(kobj), name)
#define SRE_FMT "%016llx.%llu.%llu"
#define SRE_ARG(sre) \
@@ -527,95 +520,6 @@ out:
return ret;
}
/*
* Padded entries are encoded in pairs after an existing entry. All of
* the pairs cancel each other out by all readers (the second encoding
* looks like deletion) so they aren't visible to the first/last bounds of
* the block or file.
*/
static int append_padded_entry(struct scoutfs_srch_file *sfl, u64 blk,
struct scoutfs_srch_block *srb, struct scoutfs_srch_entry *sre)
{
int ret;
ret = encode_entry(srb->entries + le32_to_cpu(srb->entry_bytes),
sre, &srb->tail);
if (ret > 0) {
srb->tail = *sre;
le32_add_cpu(&srb->entry_nr, 1);
le32_add_cpu(&srb->entry_bytes, ret);
le64_add_cpu(&sfl->entries, 1);
ret = 0;
}
return ret;
}
/*
* This is called by a testing trigger to create a very specific case of
* encoded entry offsets. We want the last entry in the block to start
* precisely at the _SAFE_BYTES offset.
*
* This is called when there is a single existing entry in the block.
* We have the entire block to work with. We encode pairs of matching
* entries. This hides them from readers (both searches and merging) as
* they're interpreted as creation and deletion and are deleted. We use
* the existing hash value of the first entry in the block but then set
* the inode to an impossibly large number so it doesn't interfere with
* anything.
*
* To hit the specific offset we very carefully manage the amount of
* bytes of change between fields in the entry. We know that if we
* change all the byte of the ino and id we end up with a 20 byte
* (2+8+8,2) encoding of the pair of entries. To have the last entry
* start at the _SAFE_POS offset we know that the final 20 byte pair
* encoding needs to end at 2 bytes (second entry encoding) after the
* _SAFE_POS offset.
*
* So as we encode pairs we watch the delta of our current offset from
* that desired final offset of 2 past _SAFE_POS. If we're a multiple
* of 20 away then we encode the full 20 byte pairs. If we're not, then
* we drop a byte to encode 19 bytes. That'll slowly change the offset
* to be a multiple of 20 again while encoding large entries.
*/
static void pad_entries_at_safe(struct scoutfs_srch_file *sfl, u64 blk,
struct scoutfs_srch_block *srb)
{
struct scoutfs_srch_entry sre;
u32 target;
s32 diff;
u64 hash;
u64 ino;
u64 id;
int ret;
hash = le64_to_cpu(srb->tail.hash);
ino = le64_to_cpu(srb->tail.ino) | (1ULL << 62);
id = le64_to_cpu(srb->tail.id);
target = SCOUTFS_SRCH_BLOCK_SAFE_BYTES + 2;
while ((diff = target - le32_to_cpu(srb->entry_bytes)) > 0) {
ino ^= 1ULL << (7 * 8);
if (diff % 20 == 0) {
id ^= 1ULL << (7 * 8);
} else {
id ^= 1ULL << (6 * 8);
}
sre.hash = cpu_to_le64(hash);
sre.ino = cpu_to_le64(ino);
sre.id = cpu_to_le64(id);
ret = append_padded_entry(sfl, blk, srb, &sre);
if (ret == 0)
ret = append_padded_entry(sfl, blk, srb, &sre);
BUG_ON(ret != 0);
diff = target - le32_to_cpu(srb->entry_bytes);
}
}
/*
* The caller is dropping an ino/id because the tracking rbtree is full.
* This loses information so we can't return any entries at or after the
@@ -1083,9 +987,6 @@ int scoutfs_srch_rotate_log(struct super_block *sb,
struct scoutfs_key key;
int ret;
if (sfl->ref.blkno && !force && scoutfs_trigger(sb, SRCH_FORCE_LOG_ROTATE))
force = true;
if (sfl->ref.blkno == 0 ||
(!force && le64_to_cpu(sfl->blocks) < SCOUTFS_SRCH_LOG_BLOCK_LIMIT))
return 0;
@@ -1561,7 +1462,7 @@ static int kway_merge(struct super_block *sb,
struct scoutfs_block_writer *wri,
struct scoutfs_srch_file *sfl,
kway_get_t kway_get, kway_advance_t kway_adv,
void **args, int nr, bool logs_input)
void **args, int nr)
{
DECLARE_SRCH_INFO(sb, srinf);
struct scoutfs_srch_block *srb = NULL;
@@ -1666,15 +1567,6 @@ static int kway_merge(struct super_block *sb,
blk++;
}
/* end sorted block on _SAFE offset for testing */
if (bl && le32_to_cpu(srb->entry_nr) == 1 && logs_input &&
scoutfs_trigger(sb, SRCH_COMPACT_LOGS_PAD_SAFE)) {
pad_entries_at_safe(sfl, blk, srb);
scoutfs_block_put(sb, bl);
bl = NULL;
blk++;
}
scoutfs_inc_counter(sb, srch_compact_entry);
} else {
@@ -1717,8 +1609,6 @@ static int kway_merge(struct super_block *sb,
empty++;
ret = 0;
} else if (ret < 0) {
if (ret == -ENOANO) /* just testing trigger */
ret = 0;
goto out;
}
@@ -1926,7 +1816,7 @@ static int compact_logs(struct super_block *sb,
}
ret = kway_merge(sb, alloc, wri, &sc->out, kway_get_page, kway_adv_page,
args, nr_pages, true);
args, nr_pages);
if (ret < 0)
goto out;
@@ -1984,18 +1874,12 @@ static int kway_get_reader(struct super_block *sb,
srb = rdr->bl->data;
if (rdr->pos > SCOUTFS_SRCH_BLOCK_SAFE_BYTES ||
rdr->skip > SCOUTFS_SRCH_BLOCK_SAFE_BYTES ||
rdr->skip >= SCOUTFS_SRCH_BLOCK_SAFE_BYTES ||
rdr->skip >= le32_to_cpu(srb->entry_bytes)) {
/* XXX inconsistency */
return -EIO;
}
if (rdr->decoded_bytes == 0 && rdr->pos == SCOUTFS_SRCH_BLOCK_SAFE_BYTES &&
scoutfs_trigger(sb, SRCH_MERGE_STOP_SAFE)) {
/* only used in testing */
return -ENOANO;
}
/* decode entry, possibly skipping start of the block */
while (rdr->decoded_bytes == 0 || rdr->pos < rdr->skip) {
ret = decode_entry(srb->entries + rdr->pos,
@@ -2085,7 +1969,7 @@ static int compact_sorted(struct super_block *sb,
}
ret = kway_merge(sb, alloc, wri, &sc->out, kway_get_reader,
kway_adv_reader, args, nr, false);
kway_adv_reader, args, nr);
sc->flags |= SCOUTFS_SRCH_COMPACT_FLAG_DONE;
for (i = 0; i < nr; i++) {
@@ -2214,15 +2098,8 @@ static int delete_files(struct super_block *sb, struct scoutfs_alloc *alloc,
return ret;
}
static void queue_compact_work(struct srch_info *srinf, bool immediate)
{
unsigned long delay;
if (!atomic_read(&srinf->shutdown)) {
delay = immediate ? 0 : msecs_to_jiffies(atomic_read(&srinf->compact_delay_ms));
queue_delayed_work(srinf->workq, &srinf->compact_dwork, delay);
}
}
/* wait 10s between compact attempts on error, immediate after success */
#define SRCH_COMPACT_DELAY_MS (10 * MSEC_PER_SEC)
/*
* Get a compaction operation from the server, sort the entries from the
@@ -2250,6 +2127,7 @@ static void scoutfs_srch_compact_worker(struct work_struct *work)
struct super_block *sb = srinf->sb;
struct scoutfs_block_writer wri;
struct scoutfs_alloc alloc;
unsigned long delay;
int ret;
int err;
@@ -2262,8 +2140,6 @@ static void scoutfs_srch_compact_worker(struct work_struct *work)
scoutfs_block_writer_init(sb, &wri);
ret = scoutfs_client_srch_get_compact(sb, sc);
if (ret >= 0)
trace_scoutfs_srch_compact_client_recv(sb, sc);
if (ret < 0 || sc->nr == 0)
goto out;
@@ -2292,7 +2168,6 @@ commit:
sc->meta_freed = alloc.freed;
sc->flags |= ret < 0 ? SCOUTFS_SRCH_COMPACT_FLAG_ERROR : 0;
trace_scoutfs_srch_compact_client_send(sb, sc);
err = scoutfs_client_srch_commit_compact(sb, sc);
if (err < 0 && ret == 0)
ret = err;
@@ -2303,56 +2178,14 @@ out:
scoutfs_inc_counter(sb, srch_compact_error);
scoutfs_block_writer_forget_all(sb, &wri);
queue_compact_work(srinf, sc->nr > 0 && ret == 0);
if (!atomic_read(&srinf->shutdown)) {
delay = ret == 0 ? 0 : msecs_to_jiffies(SRCH_COMPACT_DELAY_MS);
queue_delayed_work(srinf->workq, &srinf->compact_dwork, delay);
}
kfree(sc);
}
static ssize_t compact_delay_ms_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
DECLARE_SRCH_INFO_KOBJ(kobj, srinf);
return snprintf(buf, PAGE_SIZE, "%u", atomic_read(&srinf->compact_delay_ms));
}
#define MIN_COMPACT_DELAY_MS MSEC_PER_SEC
#define DEF_COMPACT_DELAY_MS (10 * MSEC_PER_SEC)
#define MAX_COMPACT_DELAY_MS (60 * MSEC_PER_SEC)
static ssize_t compact_delay_ms_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t count)
{
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
DECLARE_SRCH_INFO(sb, srinf);
char nullterm[30]; /* more than enough for octal -U64_MAX */
u64 val;
int len;
int ret;
len = min(count, sizeof(nullterm) - 1);
memcpy(nullterm, buf, len);
nullterm[len] = '\0';
ret = kstrtoll(nullterm, 0, &val);
if (ret < 0 || val < MIN_COMPACT_DELAY_MS || val > MAX_COMPACT_DELAY_MS) {
scoutfs_err(sb, "invalid compact_delay_ms value, must be between %lu and %lu",
MIN_COMPACT_DELAY_MS, MAX_COMPACT_DELAY_MS);
return -EINVAL;
}
atomic_set(&srinf->compact_delay_ms, val);
cancel_delayed_work(&srinf->compact_dwork);
queue_compact_work(srinf, false);
return count;
}
SCOUTFS_ATTR_RW(compact_delay_ms);
static struct attribute *srch_attrs[] = {
SCOUTFS_ATTR_PTR(compact_delay_ms),
NULL,
};
void scoutfs_srch_destroy(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
@@ -2369,8 +2202,6 @@ void scoutfs_srch_destroy(struct super_block *sb)
destroy_workqueue(srinf->workq);
}
scoutfs_sysfs_destroy_attrs(sb, &srinf->ssa);
kfree(srinf);
sbi->srch_info = NULL;
}
@@ -2388,15 +2219,8 @@ int scoutfs_srch_setup(struct super_block *sb)
srinf->sb = sb;
atomic_set(&srinf->shutdown, 0);
INIT_DELAYED_WORK(&srinf->compact_dwork, scoutfs_srch_compact_worker);
scoutfs_sysfs_init_attrs(sb, &srinf->ssa);
atomic_set(&srinf->compact_delay_ms, DEF_COMPACT_DELAY_MS);
sbi->srch_info = srinf;
ret = scoutfs_sysfs_create_attrs(sb, &srinf->ssa, srch_attrs, "srch");
if (ret < 0)
goto out;
srinf->workq = alloc_workqueue("scoutfs_srch_compact",
WQ_NON_REENTRANT | WQ_UNBOUND |
WQ_HIGHPRI, 0);
@@ -2405,7 +2229,8 @@ int scoutfs_srch_setup(struct super_block *sb)
goto out;
}
queue_compact_work(srinf, false);
queue_delayed_work(srinf->workq, &srinf->compact_dwork,
msecs_to_jiffies(SRCH_COMPACT_DELAY_MS));
ret = 0;
out:
-3
View File
@@ -39,9 +39,6 @@ struct scoutfs_triggers {
static char *names[] = {
[SCOUTFS_TRIGGER_BLOCK_REMOVE_STALE] = "block_remove_stale",
[SCOUTFS_TRIGGER_SRCH_COMPACT_LOGS_PAD_SAFE] = "srch_compact_logs_pad_safe",
[SCOUTFS_TRIGGER_SRCH_FORCE_LOG_ROTATE] = "srch_force_log_rotate",
[SCOUTFS_TRIGGER_SRCH_MERGE_STOP_SAFE] = "srch_merge_stop_safe",
[SCOUTFS_TRIGGER_STATFS_LOCK_PURGE] = "statfs_lock_purge",
};
-3
View File
@@ -3,9 +3,6 @@
enum scoutfs_trigger {
SCOUTFS_TRIGGER_BLOCK_REMOVE_STALE,
SCOUTFS_TRIGGER_SRCH_COMPACT_LOGS_PAD_SAFE,
SCOUTFS_TRIGGER_SRCH_FORCE_LOG_ROTATE,
SCOUTFS_TRIGGER_SRCH_MERGE_STOP_SAFE,
SCOUTFS_TRIGGER_STATFS_LOCK_PURGE,
SCOUTFS_TRIGGER_NR,
};
+2 -6
View File
@@ -12,8 +12,7 @@ BIN := src/createmany \
src/find_xattrs \
src/create_xattr_loop \
src/fragmented_data_extents \
src/o_tmpfile_umask \
src/parallel_restore
src/o_tmpfile_umask
DEPS := $(wildcard src/*.d)
@@ -23,11 +22,8 @@ ifneq ($(DEPS),)
-include $(DEPS)
endif
src/parallel_restore_cflags := ../utils/src/scoutfs_parallel_restore.a -lm
$(BIN): %: %.c Makefile
gcc $(CFLAGS) -MD -MP -MF $*.d $< -o $@ $($(@)_cflags)
gcc $(CFLAGS) -MD -MP -MF $*.d $< -o $@
.PHONY: clean
clean:
+4 -5
View File
@@ -25,9 +25,8 @@ All options can be seen by running with -h.
This script is built to test multi-node systems on one host by using
different mounts of the same devices. The script creates a fake block
device in front of each fs block device for each mount that will be
tested. It will create predictable device mapper devices and mounts
them on /mnt/test.N. These static device names and mount paths limit
the script to a single execution per host.
tested. Currently it will create free loop devices and will mount on
/mnt/test.[0-9].
All tests will be run by default. Particular tests can be included or
excluded by providing test name regular expressions with the -I and -E
@@ -105,8 +104,8 @@ used during the test.
| Variable | Description | Origin | Example |
| ---------------- | ------------------- | --------------- | ----------------- |
| T\_MB[0-9] | per-mount meta bdev | created per run | /dev/mapper/\_scoutfs\_test\_meta\_[0-9] |
| T\_DB[0-9] | per-mount data bdev | created per run | /dev/mapper/\_scoutfs\_test\_data\_[0-9] |
| T\_MB[0-9] | per-mount meta bdev | created per run | /dev/loop0 |
| T\_DB[0-9] | per-mount data bdev | created per run | /dev/loop1 |
| T\_D[0-9] | per-mount test dir | made for test | /mnt/test.[0-9]/t |
| T\_META\_DEVICE | main FS meta bdev | -M | /dev/vda |
| T\_DATA\_DEVICE | main FS data bdev | -D | /dev/vdb |
+1 -58
View File
@@ -6,61 +6,6 @@ t_filter_fs()
-e 's@Device: [a-fA-F0-9]*h/[0-9]*d@Device: 0h/0d@g'
}
#
# We can hit a spurious kasan warning that was fixed upstream:
#
# e504e74cc3a2 x86/unwind/orc: Disable KASAN checking in the ORC unwinder, part 2
#
# KASAN can get mad when the unwinder doesn't find ORC metadata and
# wanders up without using frames and hits the KASAN stack red zones.
# We can ignore these messages.
#
# They're bracketed by:
# [ 2687.690127] ==================================================================
# [ 2687.691366] BUG: KASAN: stack-out-of-bounds in get_reg+0x1bc/0x230
# ...
# [ 2687.706220] ==================================================================
# [ 2687.707284] Disabling lock debugging due to kernel taint
#
# That final lock debugging message may not be included.
#
ignore_harmless_unwind_kasan_stack_oob()
{
awk '
BEGIN {
in_soob = 0
soob_nr = 0
}
( !in_soob && $0 ~ /==================================================================/ ) {
in_soob = 1
soob_nr = NR
saved = $0
}
( in_soob == 1 && NR == (soob_nr + 1) ) {
if (match($0, /KASAN: stack-out-of-bounds in get_reg/) != 0) {
in_soob = 2
} else {
in_soob = 0
print saved
}
saved=""
}
( in_soob == 2 && $0 ~ /==================================================================/ ) {
in_soob = 3
soob_nr = NR
}
( in_soob == 3 && NR > soob_nr && $0 !~ /Disabling lock debugging/ ) {
in_soob = 0
}
( !in_soob ) { print $0 }
END {
if (saved) {
print saved
}
}
'
}
#
# Filter out expected messages. Putting messages here implies that
# tests aren't relying on messages to discover failures.. they're
@@ -141,12 +86,10 @@ t_filter_dmesg()
re="$re|scoutfs .* critical transaction commit failure.*"
# change-devices causes loop device resizing
re="$re|loop: module loaded"
re="$re|loop[0-9].* detected capacity change from.*"
# ignore systemd-journal rotating
re="$re|systemd-journald.*"
egrep -v "($re)" | \
ignore_harmless_unwind_kasan_stack_oob
egrep -v "($re)"
}
+2 -10
View File
@@ -265,15 +265,6 @@ t_trigger_get() {
cat "$(t_trigger_path "$nr")/$which"
}
t_trigger_set() {
local which="$1"
local nr="$2"
local val="$3"
local path=$(t_trigger_path "$nr")
echo "$val" > "$path/$which"
}
t_trigger_show() {
local which="$1"
local string="$2"
@@ -285,8 +276,9 @@ t_trigger_show() {
t_trigger_arm_silent() {
local which="$1"
local nr="$2"
local path=$(t_trigger_path "$nr")
t_trigger_set "$which" "$nr" 1
echo 1 > "$path/$which"
}
t_trigger_arm() {
-1
View File
@@ -1,4 +1,3 @@
== measure initial createmany
== measure initial createmany
== measure two concurrent createmany runs
== cleanup
-1
View File
@@ -1,4 +1,3 @@
== setting longer hung task timeout
== creating fragmented extents
== unlink file with moved extents to free extents per block
== cleanup
-37
View File
@@ -1,37 +0,0 @@
== initialize per-mount values
== arm compaction triggers
trigger srch_compact_logs_pad_safe armed: 1
trigger srch_merge_stop_safe armed: 1
trigger srch_compact_logs_pad_safe armed: 1
trigger srch_merge_stop_safe armed: 1
trigger srch_compact_logs_pad_safe armed: 1
trigger srch_merge_stop_safe armed: 1
trigger srch_compact_logs_pad_safe armed: 1
trigger srch_merge_stop_safe armed: 1
trigger srch_compact_logs_pad_safe armed: 1
trigger srch_merge_stop_safe armed: 1
== compact more often
== create padded sorted inputs by forcing log rotation
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_compact_logs_pad_safe armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_compact_logs_pad_safe armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_compact_logs_pad_safe armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_compact_logs_pad_safe armed: 1
== compaction of padded should stop at safe
== verify no compaction errors
== cleanup
+14 -17
View File
@@ -326,10 +326,16 @@ unmount_all() {
cmd wait $p
done
# delete all temp devices
for dev in /dev/mapper/_scoutfs_test_*; do
if [ -b "$dev" ]; then
cmd dmsetup remove $dev
# delete all temp meta devices
for dev in $(losetup --associated "$T_META_DEVICE" | cut -d : -f 1); do
if [ -e "$dev" ]; then
cmd losetup -d "$dev"
fi
done
# delete all temp data devices
for dev in $(losetup --associated "$T_DATA_DEVICE" | cut -d : -f 1); do
if [ -e "$dev" ]; then
cmd losetup -d "$dev"
fi
done
}
@@ -428,12 +434,6 @@ $T_UTILS/fenced/scoutfs-fenced > "$T_FENCED_LOG" 2>&1 &
fenced_pid=$!
fenced_log "started fenced pid $fenced_pid in the background"
# setup dm tables
echo "0 $(blockdev --getsz $T_META_DEVICE) linear $T_META_DEVICE 0" > \
$T_RESULTS/dmtable.meta
echo "0 $(blockdev --getsz $T_DATA_DEVICE) linear $T_DATA_DEVICE 0" > \
$T_RESULTS/dmtable.data
#
# mount concurrently so that a quorum is present to elect the leader and
# start a server.
@@ -442,13 +442,10 @@ msg "mounting $T_NR_MOUNTS mounts on meta $T_META_DEVICE data $T_DATA_DEVICE"
pids=""
for i in $(seq 0 $((T_NR_MOUNTS - 1))); do
name="_scoutfs_test_meta_$i"
cmd dmsetup create "$name" --table "$(cat $T_RESULTS/dmtable.meta)"
meta_dev="/dev/mapper/$name"
name="_scoutfs_test_data_$i"
cmd dmsetup create "$name" --table "$(cat $T_RESULTS/dmtable.data)"
data_dev="/dev/mapper/$name"
meta_dev=$(losetup --find --show $T_META_DEVICE)
test -b "$meta_dev" || die "failed to create temp device $meta_dev"
data_dev=$(losetup --find --show $T_DATA_DEVICE)
test -b "$data_dev" || die "failed to create temp device $data_dev"
dir="/mnt/test.$i"
test -d "$dir" || cmd mkdir -p "$dir"
-1
View File
@@ -14,7 +14,6 @@ offline-extent-waiting.sh
move-blocks.sh
large-fragmented-free.sh
enospc.sh
srch-safe-merge-pos.sh
srch-basic-functionality.sh
simple-xattr-unit.sh
totl-xattr-tag.sh
+40 -55
View File
@@ -1,7 +1,6 @@
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include <stdarg.h>
#include <errno.h>
#include <string.h>
#include <sys/stat.h>
@@ -36,10 +35,10 @@ struct opts {
unsigned int dry_run:1,
ls_output:1,
quiet:1,
xattr_set:1,
xattr_file:1,
xattr_group:1;
char *xattr_name;
user_xattr:1,
same_srch_xattr:1,
group_srch_xattr:1,
unique_srch_xattr:1;
};
struct stats {
@@ -150,31 +149,12 @@ static void free_dir(struct dir *dir)
free(dir);
}
static size_t snprintf_off(void *buf, size_t sz, size_t off, char *fmt, ...)
{
va_list ap;
int ret;
if (off >= sz)
return sz;
va_start(ap, fmt);
ret = vsnprintf(buf + off, sz - off, fmt, ap);
va_end(ap);
if (ret <= 0)
return sz;
return off + ret;
}
static void create_dir(struct dir *dir, struct opts *opts,
struct stats *stats)
{
struct str_list *s;
char name[256]; /* max len and null term */
char name[100];
char val = 'v';
size_t off;
int rc;
int i;
@@ -195,21 +175,29 @@ static void create_dir(struct dir *dir, struct opts *opts,
rc = mknod(s->str, S_IFREG | 0644, 0);
error_exit(rc, "mknod %s failed"ERRF, s->str, ERRA);
if (opts->xattr_set) {
off = snprintf_off(name, sizeof(name), 0, "%s", opts->xattr_name);
if (opts->xattr_file)
off = snprintf_off(name, sizeof(name), off,
"-f-%lu", stats->files);
if (opts->xattr_group)
off = snprintf_off(name, sizeof(name), off,
"-g-%lu", stats->files / 10000);
error_exit(off >= sizeof(name), "xattr name longer than 255 bytes");
rc = 0;
if (rc == 0 && opts->user_xattr) {
strcpy(name, "user.scoutfs_bcp");
rc = setxattr(s->str, name, &val, 1, 0);
}
if (rc == 0 && opts->same_srch_xattr) {
strcpy(name, "scoutfs.srch.scoutfs_bcp");
rc = setxattr(s->str, name, &val, 1, 0);
}
if (rc == 0 && opts->group_srch_xattr) {
snprintf(name, sizeof(name),
"scoutfs.srch.scoutfs_bcp.group.%lu",
stats->files / 10000);
rc = setxattr(s->str, name, &val, 1, 0);
}
if (rc == 0 && opts->unique_srch_xattr) {
snprintf(name, sizeof(name),
"scoutfs.srch.scoutfs_bcp.unique.%lu",
stats->files);
rc = setxattr(s->str, name, &val, 1, 0);
error_exit(rc, "setxattr %s %s failed"ERRF, s->str, name, ERRA);
}
error_exit(rc, "setxattr %s %s failed"ERRF, s->str, name, ERRA);
stats->files++;
rate_banner(opts, stats);
@@ -377,10 +365,11 @@ static void usage(void)
" -d DIR | create all files in DIR top level directory\n"
" -n | dry run, only parse, don't create any files\n"
" -q | quiet, don't regularly print rates\n"
" -F | append \"-f-NR\" file nr to xattr name, requires -X\n"
" -G | append \"-g-NR\" file nr/10000 to xattr name, requires -X\n"
" -L | parse ls output; only reg, skip meta, paths at ./\n"
" -X NAM | set named xattr in all files\n");
" -X | set the same user. xattr name in all files\n"
" -S | set the same .srch. xattr name in all files\n"
" -G | set a .srch. xattr name shared by groups of files\n"
" -U | set a unique .srch. xattr name in all files\n");
}
int main(int argc, char **argv)
@@ -397,7 +386,7 @@ int main(int argc, char **argv)
memset(&opts, 0, sizeof(opts));
while ((c = getopt(argc, argv, "d:nqFGLX:")) != -1) {
while ((c = getopt(argc, argv, "d:nqLXSGU")) != -1) {
switch(c) {
case 'd':
top_dir = strdup(optarg);
@@ -408,19 +397,20 @@ int main(int argc, char **argv)
case 'q':
opts.quiet = 1;
break;
case 'F':
opts.xattr_file = 1;
break;
case 'G':
opts.xattr_group = 1;
break;
case 'L':
opts.ls_output = 1;
break;
case 'X':
opts.xattr_set = 1;
opts.xattr_name = strdup(optarg);
error_exit(!opts.xattr_name, "error allocating xattr name");
opts.user_xattr = 1;
break;
case 'S':
opts.same_srch_xattr = 1;
break;
case 'G':
opts.group_srch_xattr = 1;
break;
case 'U':
opts.unique_srch_xattr = 1;
break;
case '?':
printf("Unknown option '%c'\n", optopt);
@@ -429,11 +419,6 @@ int main(int argc, char **argv)
}
}
error_exit(opts.xattr_file && !opts.xattr_set,
"must specify xattr -X when appending file nr with -F");
error_exit(opts.xattr_group && !opts.xattr_set,
"must specify xattr -X when appending file nr with -G");
if (!opts.dry_run) {
error_exit(!top_dir,
"must specify top level directory with -d");
-782
View File
@@ -1,782 +0,0 @@
#define _GNU_SOURCE /* O_DIRECT */
#include <stdlib.h>
#include <stdio.h>
#include <stdbool.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/xattr.h>
#include <ctype.h>
#include <string.h>
#include <errno.h>
#include <limits.h>
#include <time.h>
#include <sys/prctl.h>
#include <signal.h>
#include <sys/socket.h>
#include "../../utils/src/sparse.h"
#include "../../utils/src/util.h"
#include "../../utils/src/list.h"
#include "../../utils/src/parse.h"
#include "../../kmod/src/format.h"
#include "../../utils/src/parallel_restore.h"
/*
* XXX:
* - add a nice description of what's going on
* - mention allocator contention
* - test child process dying handling
* - root dir entry name length is wrong
*/
#define ERRF " errno %d (%s)"
#define ERRA errno, strerror(errno)
#define error_exit(cond, fmt, args...) \
do { \
if (cond) { \
printf("error: "fmt"\n", ##args); \
exit(1); \
} \
} while (0)
#define dprintf(fmt, args...) \
do { \
if (0) \
printf(fmt, ##args); \
} while (0)
#define REG_MODE (S_IFREG | 0644)
#define DIR_MODE (S_IFDIR | 0755)
struct opts {
unsigned long long buf_size;
unsigned long long write_batch;
unsigned long long low_dirs;
unsigned long long high_dirs;
unsigned long long low_files;
unsigned long long high_files;
char *meta_path;
unsigned long long total_files;
bool read_only;
unsigned long long seed;
unsigned long long nr_writers;
};
static void usage(void)
{
printf("usage:\n"
" -b NR | threads write blocks in batches files (100000)\n"
" -d LOW:HIGH | range of subdirs per directory (5:10)\n"
" -f LOW:HIGH | range of files per directory (10:20)\n"
" -m PATH | path to metadata device\n"
" -n NR | total number of files to create (100)\n"
" -r | read-only, all work except writing, measure cpu cost\n"
" -s NR | randomization seed (random)\n"
" -w NR | number of writing processes to fork (online cpus)\n"
);
}
static size_t write_bufs(struct opts *opts, struct scoutfs_parallel_restore_writer *wri,
void *buf, size_t buf_size, int dev_fd)
{
size_t total = 0;
size_t count;
off_t off;
int ret;
do {
ret = scoutfs_parallel_restore_write_buf(wri, buf, buf_size, &off, &count);
error_exit(ret, "write buf %d", ret);
if (count > 0) {
if (!opts->read_only)
ret = pwrite(dev_fd, buf, count, off);
else
ret = count;
error_exit(ret != count, "pwrite count %zu ret %d", count, ret);
total += ret;
}
} while (count > 0);
return total;
}
struct gen_inode {
struct scoutfs_parallel_restore_inode inode;
struct scoutfs_parallel_restore_xattr **xattrs;
u64 nr_xattrs;
struct scoutfs_parallel_restore_entry **entries;
u64 nr_files;
u64 nr_entries;
};
static void free_gino(struct gen_inode *gino)
{
u64 i;
if (gino) {
if (gino->entries) {
for (i = 0; i < gino->nr_entries; i++)
free(gino->entries[i]);
free(gino->entries);
}
if (gino->xattrs) {
for (i = 0; i < gino->nr_xattrs; i++)
free(gino->xattrs[i]);
free(gino->xattrs);
}
free(gino);
}
}
static struct scoutfs_parallel_restore_xattr *
generate_xattr(struct opts *opts, u64 ino, u64 pos, char *name, int name_len, void *value,
int value_len)
{
struct scoutfs_parallel_restore_xattr *xattr;
xattr = malloc(sizeof(struct scoutfs_parallel_restore_xattr) + name_len + value_len);
error_exit(!xattr, "error allocating generated xattr");
*xattr = (struct scoutfs_parallel_restore_xattr) {
.ino = ino,
.pos = pos,
.name_len = name_len,
.value_len = value_len,
};
xattr->name = (void *)(xattr + 1);
xattr->value = (void *)(xattr->name + name_len);
memcpy(xattr->name, name, name_len);
if (value_len)
memcpy(xattr->value, value, value_len);
return xattr;
}
static struct gen_inode *generate_inode(struct opts *opts, u64 ino, mode_t mode)
{
struct gen_inode *gino;
struct timespec now;
clock_gettime(CLOCK_REALTIME, &now);
gino = calloc(1, sizeof(struct gen_inode));
error_exit(!gino, "failure allocating generated inode");
gino->inode = (struct scoutfs_parallel_restore_inode) {
.ino = ino,
.mode = mode,
.atime = now,
.ctime = now,
.mtime = now,
.crtime = now,
};
/*
* hacky creation of a bunch of xattrs for now.
*/
if ((mode & S_IFMT) == S_IFREG) {
#define NV(n, v) { n, sizeof(n) - 1, v, sizeof(v) - 1, }
struct name_val {
char *name;
int len;
char *value;
int value_len;
} nv[] = {
NV("scoutfs.hide.totl.acct.8314611887310466424.2.0", "1"),
NV("scoutfs.hide.srch.sam_vol_E01001L6_4", ""),
NV("scoutfs.hide.sam_reqcopies", ""),
NV("scoutfs.hide.sam_copy_2", ""),
NV("scoutfs.hide.totl.acct.F01030L6.8314611887310466424.7.30", "1"),
NV("scoutfs.hide.sam_copy_1", ""),
NV("scoutfs.hide.srch.sam_vol_F01030L6_4", ""),
NV("scoutfs.hide.srch.sam_release_cand", ""),
NV("scoutfs.hide.sam_restime", ""),
NV("scoutfs.hide.sam_uuid", ""),
NV("scoutfs.hide.totl.acct.8314611887310466424.3.0", "1"),
NV("scoutfs.hide.srch.sam_vol_F01030L6", ""),
NV("scoutfs.hide.srch.sam_uuid_865939b7-24d6-472f-b85c-7ce7afeb813a", ""),
NV("scoutfs.hide.srch.sam_vol_E01001L6", ""),
NV("scoutfs.hide.totl.acct.E01001L6.8314611887310466424.7.1", "1"),
NV("scoutfs.hide.totl.acct.8314611887310466424.4.0", "1"),
NV("scoutfs.hide.totl.acct.8314611887310466424.11.0", "1"),
NV("scoutfs.hide.totl.acct.8314611887310466424.1.0", "1"),
};
unsigned int nr = array_size(nv);
int i;
gino->xattrs = calloc(nr, sizeof(struct scoutfs_parallel_restore_xattr *));
for (i = 0; i < nr; i++)
gino->xattrs[i] = generate_xattr(opts, ino, i, nv[i].name, nv[i].len,
nv[i].value, nv[i].value_len);
gino->nr_xattrs = nr;
gino->inode.nr_xattrs = nr;
}
return gino;
}
static struct scoutfs_parallel_restore_entry *
generate_entry(struct opts *opts, char *prefix, u64 nr, u64 dir_ino, u64 pos, u64 ino, mode_t mode)
{
struct scoutfs_parallel_restore_entry *entry;
char buf[PATH_MAX];
int bytes;
bytes = snprintf(buf, sizeof(buf), "%s-%llu", prefix, nr);
entry = malloc(sizeof(struct scoutfs_parallel_restore_entry) + bytes);
error_exit(!entry, "error allocating generated entry");
*entry = (struct scoutfs_parallel_restore_entry) {
.dir_ino = dir_ino,
.pos = pos,
.ino = ino,
.mode = mode,
.name = (void *)(entry + 1),
.name_len = bytes,
};
memcpy(entry->name, buf, bytes);
return entry;
}
static u64 random64(void)
{
return ((u64)lrand48() << 32) | lrand48();
}
static u64 random_range(u64 low, u64 high)
{
return low + (random64() % (high - low + 1));
}
static struct gen_inode *generate_dir(struct opts *opts, u64 dir_ino, u64 ino_start, u64 ino_len,
bool no_dirs)
{
struct scoutfs_parallel_restore_entry *entry;
struct gen_inode *gino;
u64 nr_entries;
u64 nr_files;
u64 nr_dirs;
u64 ino;
char *prefix;
mode_t mode;
u64 i;
nr_dirs = no_dirs ? 0 : random_range(opts->low_dirs, opts->high_dirs);
nr_files = random_range(opts->low_files, opts->high_files);
if (1 + nr_dirs + nr_files > ino_len) {
nr_dirs = no_dirs ? 0 : (ino_len - 1) / 2;
nr_files = (ino_len - 1) - nr_dirs;
}
nr_entries = nr_dirs + nr_files;
gino = generate_inode(opts, dir_ino, DIR_MODE);
error_exit(!gino, "error allocating generated inode");
gino->inode.nr_subdirs = nr_dirs;
gino->nr_files = nr_files;
if (nr_entries) {
gino->entries = calloc(nr_entries, sizeof(struct scoutfs_parallel_restore_entry *));
error_exit(!gino->entries, "error allocating generated inode entries");
gino->nr_entries = nr_entries;
}
mode = DIR_MODE;
prefix = "dir";
for (i = 0; i < nr_entries; i++) {
if (i == nr_dirs) {
mode = REG_MODE;
prefix = "file";
}
ino = ino_start + i;
entry = generate_entry(opts, prefix, ino, gino->inode.ino,
SCOUTFS_DIRENT_FIRST_POS + i, ino, mode);
gino->entries[i] = entry;
gino->inode.total_entry_name_bytes += entry->name_len;
}
return gino;
}
/*
* Restore a generated inode. If it's a directory then we also restore
* all its entries. The caller is going to descend into subdir entries and generate
* those dir inodes. We have to generate and restore all non-dir inodes referenced
* by this inode's entries.
*/
static void restore_inode(struct opts *opts, struct scoutfs_parallel_restore_writer *wri,
struct gen_inode *gino)
{
struct gen_inode *nondir;
int ret;
u64 i;
ret = scoutfs_parallel_restore_add_inode(wri, &gino->inode);
error_exit(ret, "thread add root inode %d", ret);
for (i = 0; i < gino->nr_entries; i++) {
ret = scoutfs_parallel_restore_add_entry(wri, gino->entries[i]);
error_exit(ret, "thread add entry %d", ret);
/* caller only needs subdir entries, generate and free others */
if ((gino->entries[i]->mode & S_IFMT) != S_IFDIR) {
nondir = generate_inode(opts, gino->entries[i]->ino,
gino->entries[i]->mode);
restore_inode(opts, wri, nondir);
free_gino(nondir);
free(gino->entries[i]);
if (i != gino->nr_entries - 1)
gino->entries[i] = gino->entries[gino->nr_entries - 1];
gino->nr_entries--;
gino->nr_files--;
i--;
}
}
for (i = 0; i < gino->nr_xattrs; i++) {
ret = scoutfs_parallel_restore_add_xattr(wri, gino->xattrs[i]);
error_exit(ret, "thread add xattr %d", ret);
}
}
struct writer_args {
struct list_head head;
int dev_fd;
int pair_fd;
struct scoutfs_parallel_restore_slice slice;
u64 writer_nr;
u64 dir_height;
u64 ino_start;
u64 ino_len;
};
struct write_result {
struct scoutfs_parallel_restore_progress prog;
struct scoutfs_parallel_restore_slice slice;
__le64 files_created;
__le64 bytes_written;
};
static void write_bufs_and_send(struct opts *opts, struct scoutfs_parallel_restore_writer *wri,
void *buf, size_t buf_size, int dev_fd,
struct write_result *res, bool get_slice, int pair_fd)
{
size_t total;
int ret;
total = write_bufs(opts, wri, buf, buf_size, dev_fd);
le64_add_cpu(&res->bytes_written, total);
ret = scoutfs_parallel_restore_get_progress(wri, &res->prog);
error_exit(ret, "get prog %d", ret);
if (get_slice) {
ret = scoutfs_parallel_restore_get_slice(wri, &res->slice);
error_exit(ret, "thread get slice %d", ret);
}
ret = write(pair_fd, res, sizeof(struct write_result));
error_exit(ret != sizeof(struct write_result), "result send error");
memset(res, 0, sizeof(struct write_result));
}
/*
* Calculate the number of bytes in toplevel "dir-%llu" entry names for the given
* number of writers.
*/
static u64 topdir_entry_bytes(u64 nr_writers)
{
u64 bytes = (3 + 1) * nr_writers;
u64 limit;
u64 done;
u64 wid;
u64 nr;
for (done = 0, wid = 1, limit = 10; done < nr_writers; done += nr, wid++, limit *= 10) {
nr = min(limit - done, nr_writers - done);
bytes += nr * wid;
}
return bytes;
}
struct dir_pos {
struct gen_inode *gino;
u64 pos;
};
static void writer_proc(struct opts *opts, struct writer_args *args)
{
struct scoutfs_parallel_restore_writer *wri = NULL;
struct scoutfs_parallel_restore_entry *entry;
struct dir_pos *dirs = NULL;
struct write_result res;
struct gen_inode *gino;
void *buf = NULL;
u64 level;
u64 ino;
int ret;
memset(&res, 0, sizeof(res));
dirs = calloc(args->dir_height, sizeof(struct dir_pos));
error_exit(errno, "error allocating parent dirs "ERRF, ERRA);
errno = posix_memalign((void **)&buf, 4096, opts->buf_size);
error_exit(errno, "error allocating block buf "ERRF, ERRA);
ret = scoutfs_parallel_restore_create_writer(&wri);
error_exit(ret, "create writer %d", ret);
ret = scoutfs_parallel_restore_add_slice(wri, &args->slice);
error_exit(ret, "add slice %d", ret);
/* writer 0 creates the root dir */
if (args->writer_nr == 0) {
gino = generate_inode(opts, SCOUTFS_ROOT_INO, DIR_MODE);
gino->inode.nr_subdirs = opts->nr_writers;
gino->inode.total_entry_name_bytes = topdir_entry_bytes(opts->nr_writers);
ret = scoutfs_parallel_restore_add_inode(wri, &gino->inode);
error_exit(ret, "thread add root inode %d", ret);
free_gino(gino);
}
/* create root entry for our top level dir */
ino = args->ino_start++;
args->ino_len--;
entry = generate_entry(opts, "top", args->writer_nr,
SCOUTFS_ROOT_INO, SCOUTFS_DIRENT_FIRST_POS + args->writer_nr,
ino, DIR_MODE);
ret = scoutfs_parallel_restore_add_entry(wri, entry);
error_exit(ret, "thread top entry %d", ret);
free(entry);
level = args->dir_height - 1;
while (args->ino_len > 0 && level < args->dir_height) {
gino = dirs[level].gino;
/* generate and restore if we follow entries */
if (!gino) {
gino = generate_dir(opts, ino, args->ino_start, args->ino_len, level == 0);
args->ino_start += gino->nr_entries;
args->ino_len -= gino->nr_entries;
le64_add_cpu(&res.files_created, gino->nr_files);
restore_inode(opts, wri, gino);
dirs[level].gino = gino;
}
if (dirs[level].pos == gino->nr_entries) {
/* ascend if we're done with this dir */
dirs[level].gino = NULL;
dirs[level].pos = 0;
free_gino(gino);
level++;
} else {
/* otherwise descend into subdir entry */
ino = gino->entries[dirs[level].pos]->ino;
dirs[level].pos++;
level--;
}
/* do a partial write at batch intervals when there's still more to do */
if (le64_to_cpu(res.files_created) >= opts->write_batch && args->ino_len > 0)
write_bufs_and_send(opts, wri, buf, opts->buf_size, args->dev_fd,
&res, false, args->pair_fd);
}
write_bufs_and_send(opts, wri, buf, opts->buf_size, args->dev_fd,
&res, true, args->pair_fd);
scoutfs_parallel_restore_destroy_writer(&wri);
free(dirs);
free(buf);
}
static void fork_writer(struct opts *opts, struct writer_args *args)
{
pid_t parent = getpid();
pid_t pid;
int ret;
pid = fork();
error_exit(pid == -1, "fork error");
if (pid != 0)
return;
ret = prctl(PR_SET_PDEATHSIG, SIGHUP);
error_exit(ret < 0, "failed to set parent death sig");
printf("pid %u getpid() %u parent %u getppid() %u\n",
pid, getpid(), parent, getppid());
error_exit(getppid() != parent, "child parent already changed");
writer_proc(opts, args);
exit(0);
}
static int do_restore(struct opts *opts)
{
struct scoutfs_parallel_restore_writer *wri = NULL;
struct scoutfs_parallel_restore_slice *slices = NULL;
struct scoutfs_super_block *super = NULL;
struct write_result res;
struct writer_args *args;
struct timespec begin;
struct timespec end;
LIST_HEAD(writers);
u64 next_ino;
u64 ino_per;
u64 avg_dirs;
u64 avg_files;
u64 dir_height;
u64 tot_files;
u64 tot_bytes;
int pair[2] = {-1, -1};
float secs;
void *buf = NULL;
int dev_fd = -1;
int ret;
int i;
ret = socketpair(PF_LOCAL, SOCK_STREAM, 0, pair);
error_exit(ret, "socketpair error "ERRF, ERRA);
dev_fd = open(opts->meta_path, O_DIRECT | (opts->read_only ? O_RDONLY : (O_RDWR|O_EXCL)));
error_exit(dev_fd < 0, "error opening '%s': "ERRF, opts->meta_path, ERRA);
errno = posix_memalign((void **)&super, 4096, SCOUTFS_BLOCK_SM_SIZE) ?:
posix_memalign((void **)&buf, 4096, opts->buf_size);
error_exit(errno, "error allocating block bufs "ERRF, ERRA);
ret = pread(dev_fd, super, SCOUTFS_BLOCK_SM_SIZE,
SCOUTFS_SUPER_BLKNO << SCOUTFS_BLOCK_SM_SHIFT);
error_exit(ret != SCOUTFS_BLOCK_SM_SIZE, "error reading super, ret %d", ret);
ret = scoutfs_parallel_restore_create_writer(&wri);
error_exit(ret, "create writer %d", ret);
ret = scoutfs_parallel_restore_import_super(wri, super);
error_exit(ret, "import super %d", ret);
slices = calloc(1 + opts->nr_writers, sizeof(struct scoutfs_parallel_restore_slice));
error_exit(!slices, "alloc slices");
scoutfs_parallel_restore_init_slices(wri, slices, 1 + opts->nr_writers);
ret = scoutfs_parallel_restore_add_slice(wri, &slices[0]);
error_exit(ret, "add slices[0] %d", ret);
next_ino = (SCOUTFS_ROOT_INO | SCOUTFS_LOCK_INODE_GROUP_MASK) + 1;
ino_per = opts->total_files / opts->nr_writers;
avg_dirs = (opts->low_dirs + opts->high_dirs) / 2;
avg_files = (opts->low_files + opts->high_files) / 2;
dir_height = 1;
tot_files = avg_files * opts->nr_writers;
while (tot_files < opts->total_files) {
dir_height++;
tot_files *= avg_dirs;
}
dprintf("height %llu tot %llu total %llu\n", dir_height, tot_files, opts->total_files);
clock_gettime(CLOCK_MONOTONIC_RAW, &begin);
/* start each writing process */
for (i = 0; i < opts->nr_writers; i++) {
args = calloc(1, sizeof(struct writer_args));
error_exit(!args, "alloc writer args");
args->dev_fd = dev_fd;
args->pair_fd = pair[1];
args->slice = slices[1 + i];
args->writer_nr = i;
args->dir_height = dir_height;
args->ino_start = next_ino;
args->ino_len = ino_per;
list_add_tail(&args->head, &writers);
next_ino += ino_per;
fork_writer(opts, args);
}
/* read results and watch for writers to finish */
tot_files = 0;
tot_bytes = 0;
i = 0;
while (i < opts->nr_writers) {
ret = read(pair[0], &res, sizeof(struct write_result));
error_exit(ret != sizeof(struct write_result), "result read error");
ret = scoutfs_parallel_restore_add_progress(wri, &res.prog);
error_exit(ret, "add thr prog %d", ret);
if (res.slice.meta_len != 0) {
ret = scoutfs_parallel_restore_add_slice(wri, &res.slice);
error_exit(ret, "add thr slice %d", ret);
i++;
}
tot_files += le64_to_cpu(res.files_created);
tot_bytes += le64_to_cpu(res.bytes_written);
}
tot_bytes += write_bufs(opts, wri, buf, opts->buf_size, dev_fd);
ret = scoutfs_parallel_restore_export_super(wri, super);
error_exit(ret, "update super %d", ret);
if (!opts->read_only) {
ret = pwrite(dev_fd, super, SCOUTFS_BLOCK_SM_SIZE,
SCOUTFS_SUPER_BLKNO << SCOUTFS_BLOCK_SM_SHIFT);
error_exit(ret != SCOUTFS_BLOCK_SM_SIZE, "error writing super, ret %d", ret);
}
clock_gettime(CLOCK_MONOTONIC_RAW, &end);
scoutfs_parallel_restore_destroy_writer(&wri);
secs = ((float)end.tv_sec + ((float)end.tv_nsec/NSEC_PER_SEC)) -
((float)begin.tv_sec + ((float)begin.tv_nsec/NSEC_PER_SEC));
printf("created %llu files in %llu bytes and %f secs => %f bytes/file, %f files/sec\n",
tot_files, tot_bytes, secs,
(float)tot_bytes / tot_files, (float)tot_files / secs);
if (dev_fd >= 0)
close(dev_fd);
if (pair[0] >= 0)
close(pair[0]);
if (pair[1] >= 0)
close(pair[1]);
free(super);
free(slices);
free(buf);
return 0;
}
static int parse_low_high(char *str, u64 *low_ret, u64 *high_ret)
{
char *sep;
int ret = 0;
sep = index(str, ':');
if (sep) {
*sep = '\0';
ret = parse_u64(sep + 1, high_ret);
}
if (ret == 0)
ret = parse_u64(str, low_ret);
if (sep)
*sep = ':';
return ret;
}
int main(int argc, char **argv)
{
struct opts opts = {
.buf_size = (32 * 1024 * 1024),
.write_batch = 1000000,
.low_dirs = 5,
.high_dirs = 10,
.low_files = 10,
.high_files = 20,
.total_files = 100,
};
int ret;
int c;
opts.seed = random64();
opts.nr_writers = sysconf(_SC_NPROCESSORS_ONLN);
while ((c = getopt(argc, argv, "b:d:f:m:n:rs:w:")) != -1) {
switch(c) {
case 'b':
ret = parse_u64(optarg, &opts.write_batch);
error_exit(ret, "error parsing -b '%s'\n", optarg);
error_exit(opts.write_batch == 0, "-b can't be 0");
break;
case 'd':
ret = parse_low_high(optarg, &opts.low_dirs, &opts.high_dirs);
error_exit(ret, "error parsing -d '%s'\n", optarg);
break;
case 'f':
ret = parse_low_high(optarg, &opts.low_files, &opts.high_files);
error_exit(ret, "error parsing -f '%s'\n", optarg);
break;
case 'm':
opts.meta_path = strdup(optarg);
break;
case 'n':
ret = parse_u64(optarg, &opts.total_files);
error_exit(ret, "error parsing -n '%s'\n", optarg);
break;
case 'r':
opts.read_only = true;
break;
case 's':
ret = parse_u64(optarg, &opts.seed);
error_exit(ret, "error parsing -s '%s'\n", optarg);
break;
case 'w':
ret = parse_u64(optarg, &opts.nr_writers);
error_exit(ret, "error parsing -w '%s'\n", optarg);
break;
case '?':
printf("Unknown option '%c'\n", optopt);
usage();
exit(1);
}
}
error_exit(opts.low_dirs > opts.high_dirs, "LOW > HIGH in -d %llu:%llu",
opts.low_dirs, opts.high_dirs);
error_exit(opts.low_files > opts.high_files, "LOW > HIGH in -f %llu:%llu",
opts.low_files, opts.high_files);
error_exit(!opts.meta_path, "must specify metadata device path with -m");
printf("recreate with: -d %llu:%llu -f %llu:%llu -n %llu -s %llu -w %llu\n",
opts.low_dirs, opts.high_dirs, opts.low_files, opts.high_files,
opts.total_files, opts.seed, opts.nr_writers);
ret = do_restore(&opts);
free(opts.meta_path);
return ret == 0 ? 0 : 1;
}
+6 -13
View File
@@ -7,11 +7,9 @@ t_require_mounts 2
COUNT=50000
#
# Prep dirs for test. We have per-directory inode number allocators so
# by putting each createmany in a per-mount dir they get their own inode
# number region and cluster locks.
#
# Prep dirs for test. Each mount needs to make their own parent dir for
# the createmany run, otherwise both dirs will end up in the same inode
# group, causing updates to bounce that lock around.
echo "== measure initial createmany"
mkdir -p $T_D0/dir/0
mkdir $T_D1/dir/1
@@ -19,20 +17,18 @@ mkdir $T_D1/dir/1
echo "== measure initial createmany"
START=$SECONDS
createmany -o "$T_D0/file_" $COUNT >> $T_TMP.full
sync
SINGLE=$((SECONDS - START))
echo single $SINGLE >> $T_TMP.full
echo "== measure two concurrent createmany runs"
START=$SECONDS
(cd $T_D0/dir/0; createmany -o ./file_ $COUNT > /dev/null) &
createmany -o $T_D0/dir/0/file $COUNT > /dev/null &
pids="$!"
(cd $T_D1/dir/1; createmany -o ./file_ $COUNT > /dev/null) &
createmany -o $T_D1/dir/1/file $COUNT > /dev/null &
pids="$pids $!"
for p in $pids; do
wait $p
done
sync
BOTH=$((SECONDS - START))
echo both $BOTH >> $T_TMP.full
@@ -45,10 +41,7 @@ echo both $BOTH >> $T_TMP.full
# synchronized operation.
FACTOR=200
if [ "$BOTH" -gt $(($SINGLE*$FACTOR)) ]; then
t_fail "both createmany took $BOTH sec, more than $FACTOR x single $SINGLE sec"
echo "both createmany took $BOTH sec, more than $FACTOR x single $SINGLE sec"
fi
echo "== cleanup"
find $T_D0/dir -delete
t_pass
-24
View File
@@ -10,30 +10,6 @@ EXTENTS_PER_BTREE_BLOCK=600
EXTENTS_PER_LIST_BLOCK=8192
FREED_EXTENTS=$((EXTENTS_PER_BTREE_BLOCK * EXTENTS_PER_LIST_BLOCK))
#
# This test specifically creates a pathologically sparse file that will
# be as expensive as possible to free. This is usually fine on
# dedicated or reasonable hardware, but trying to run this in
# virtualized debug kernels can take a very long time. This test is
# about making sure that the server doesn't fail, not that the platform
# can handle the scale of work that our btree formats happen to require
# while execution is bogged down with use-after-free memory reference
# tracking. So we give the test a lot more breathing room before
# deciding that its hung.
#
echo "== setting longer hung task timeout"
if [ -w /proc/sys/kernel/hung_task_timeout_secs ]; then
secs=$(cat /proc/sys/kernel/hung_task_timeout_secs)
test "$secs" -gt 0 || \
t_fail "confusing value '$secs' from /proc/sys/kernel/hung_task_timeout_secs"
restore_hung_task_timeout()
{
echo "$secs" > /proc/sys/kernel/hung_task_timeout_secs
}
trap restore_hung_task_timeout EXIT
echo "$((secs * 5))" > /proc/sys/kernel/hung_task_timeout_secs
fi
echo "== creating fragmented extents"
fragmented_data_extents $FREED_EXTENTS $EXTENTS_PER_BTREE_BLOCK "$T_D0/alloc" "$T_D0/move"
+24 -25
View File
@@ -9,7 +9,6 @@ LOG=340000
LIM=1000000
SEQF="%.20g"
SXA="scoutfs.srch.test-srch-basic-functionality"
t_require_commands touch rm setfattr scoutfs find_xattrs
@@ -28,20 +27,20 @@ diff_srch_find()
echo "== create new xattrs"
touch "$T_D0/"{create,update}
setfattr -n $SXA -v 1 "$T_D0/"{create,update} 2>&1 | t_filter_fs
diff_srch_find $SXA
setfattr -n scoutfs.srch.test -v 1 "$T_D0/"{create,update} 2>&1 | t_filter_fs
diff_srch_find scoutfs.srch.test
echo "== update existing xattr"
setfattr -n $SXA -v 2 "$T_D0/update" 2>&1 | t_filter_fs
diff_srch_find $SXA
setfattr -n scoutfs.srch.test -v 2 "$T_D0/update" 2>&1 | t_filter_fs
diff_srch_find scoutfs.srch.test
echo "== remove an xattr"
setfattr -x $SXA "$T_D0/create" 2>&1 | t_filter_fs
diff_srch_find $SXA
setfattr -x scoutfs.srch.test "$T_D0/create" 2>&1 | t_filter_fs
diff_srch_find scoutfs.srch.test
echo "== remove xattr with files"
rm -f "$T_D0/"{create,update}
diff_srch_find $SXA
diff_srch_find scoutfs.srch.test
echo "== trigger small log merges by rotating single block with unmount"
sv=$(t_server_nr)
@@ -57,7 +56,7 @@ while [ "$i" -lt "8" ]; do
eval path="\$T_D${nr}/single-block-$i"
touch "$path"
setfattr -n $SXA -v $i "$path"
setfattr -n scoutfs.srch.single-block-logs -v $i "$path"
t_umount $nr
t_mount $nr
@@ -66,51 +65,51 @@ while [ "$i" -lt "8" ]; do
done
# wait for srch compaction worker delay
sleep 10
find "$T_D0" -type f -name 'single-block-*' -delete
rm -rf "$T_D0/single-block-*"
echo "== create entries in current log"
DIR="$T_D0/dir"
NR=$((LOG / 4))
mkdir -p "$DIR"
seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -X $SXA -d "$DIR" > /dev/null
diff_srch_find $SXA
seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -S -d "$DIR" > /dev/null
diff_srch_find scoutfs.srch.scoutfs_bcp
echo "== delete small fraction"
seq -f "$DIR/f-$SEQF" 1 7 $NR | xargs setfattr -x $SXA
diff_srch_find $SXA
seq -f "$DIR/f-$SEQF" 1 7 $NR | xargs setfattr -x scoutfs.srch.scoutfs_bcp
diff_srch_find scoutfs.srch.scoutfs_bcp
echo "== remove files"
rm -rf "$DIR"
diff_srch_find $SXA
diff_srch_find scoutfs.srch.scoutfs_bcp
echo "== create entries that exceed one log"
NR=$((LOG * 3 / 2))
mkdir -p "$DIR"
seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -X $SXA -d "$DIR" > /dev/null
diff_srch_find $SXA
seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -S -d "$DIR" > /dev/null
diff_srch_find scoutfs.srch.scoutfs_bcp
echo "== delete fractions in phases"
for i in $(seq 1 3); do
seq -f "$DIR/f-$SEQF" $i 3 $NR | xargs setfattr -x $SXA
diff_srch_find $SXA
seq -f "$DIR/f-$SEQF" $i 3 $NR | xargs setfattr -x scoutfs.srch.scoutfs_bcp
diff_srch_find scoutfs.srch.scoutfs_bcp
done
echo "== remove files"
rm -rf "$DIR"
diff_srch_find $SXA
diff_srch_find scoutfs.srch.scoutfs_bcp
echo "== create entries for exceed search entry limit"
NR=$((LIM * 3 / 2))
mkdir -p "$DIR"
seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -X $SXA -d "$DIR" > /dev/null
diff_srch_find $SXA
seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -S -d "$DIR" > /dev/null
diff_srch_find scoutfs.srch.scoutfs_bcp
echo "== delete half"
seq -f "$DIR/f-$SEQF" 1 2 $NR | xargs setfattr -x $SXA
diff_srch_find $SXA
seq -f "$DIR/f-$SEQF" 1 2 $NR | xargs setfattr -x scoutfs.srch.scoutfs_bcp
diff_srch_find scoutfs.srch.scoutfs_bcp
echo "== entirely remove third batch"
rm -rf "$DIR"
diff_srch_find $SXA
diff_srch_find scoutfs.srch.scoutfs_bcp
t_pass
-90
View File
@@ -1,90 +0,0 @@
#
# There was a bug where srch file compaction could get stuck if a
# partial compaction finished at the specific _SAFE_BYTES offset in a
# block. Resuming from that position would return an error and
# compaction would stop making forward progress.
#
# We use triggers to pad the output of log compaction to end on the safe
# offset and then cause compaction of those padded inputs to stop at the
# safe offset. Continuation will either succeed or return errors.
#
# forcing rotation, so just a few
NR=10
SEQF="%.20g"
COMPACT_NR=4
echo "== initialize per-mount values"
declare -a err
declare -a compact_delay
for nr in $(t_fs_nrs); do
err[$nr]=$(t_counter srch_compact_error $nr)
compact_delay[$nr]=$(cat $(t_sysfs_path $nr)/srch/compact_delay_ms)
done
restore_compact_delay()
{
for nr in $(t_fs_nrs); do
echo ${compact_delay[$nr]} > $(t_sysfs_path $nr)/srch/compact_delay_ms
done
}
trap restore_compact_delay EXIT
echo "== arm compaction triggers"
for nr in $(t_fs_nrs); do
t_trigger_arm srch_compact_logs_pad_safe $nr
t_trigger_arm srch_merge_stop_safe $nr
done
echo "== compact more often"
for nr in $(t_fs_nrs); do
echo 1000 > $(t_sysfs_path $nr)/srch/compact_delay_ms
done
echo "== create padded sorted inputs by forcing log rotation"
sv=$(t_server_nr)
for i in $(seq 1 $COMPACT_NR); do
for j in $(seq 1 $COMPACT_NR); do
t_trigger_arm srch_force_log_rotate $sv
seq -f "f-$i-$j-$SEQF" 1 10 | \
bulk_create_paths -X "scoutfs.srch.t-srch-safe-merge-pos" -d "$T_D0" > \
/dev/null
sync
test "$(t_trigger_get srch_force_log_rotate $sv)" == "0" || \
t_fail "srch_force_log_rotate didn't trigger"
done
padded=0
while test $padded == 0 && sleep .5; do
for nr in $(t_fs_nrs); do
if [ "$(t_trigger_get srch_compact_logs_pad_safe $nr)" == "0" ]; then
t_trigger_arm srch_compact_logs_pad_safe $nr
padded=1
break
fi
test "$(t_counter srch_compact_error $nr)" == "${err[$nr]}" || \
t_fail "srch_compact_error counter increased on mount $nr"
done
done
done
echo "== compaction of padded should stop at safe"
sleep 2
for nr in $(t_fs_nrs); do
if [ "$(t_trigger_get srch_merge_stop_safe $nr)" == "0" ]; then
break
fi
done
echo "== verify no compaction errors"
sleep 2
for nr in $(t_fs_nrs); do
test "$(t_counter srch_compact_error $nr)" == "${err[$nr]}" || \
t_fail "srch_compact_error counter increased on mount $nr"
done
echo "== cleanup"
find "$T_D0" -type f -name 'f-*' -delete
t_pass
+1 -7
View File
@@ -18,9 +18,7 @@ BIN := src/scoutfs
OBJ := $(patsubst %.c,%.o,$(wildcard src/*.c))
DEPS := $(wildcard */*.d)
AR := src/scoutfs_parallel_restore.a
all: $(BIN) $(AR)
all: $(BIN)
ifneq ($(DEPS),)
-include $(DEPS)
@@ -38,10 +36,6 @@ $(BIN): $(OBJ)
$(QU) [BIN $@]
$(VE)gcc -o $@ $^ -luuid -lm -lcrypto -lblkid
$(AR): $(OBJ)
$(QU) [AR $@]
$(VE)ar rcs $@ $^
%.o %.d: %.c Makefile sparse.sh
$(QU) [CC $<]
$(VE)gcc $(CFLAGS) -MD -MP -MF $*.d -c $< -o $*.o
-13
View File
@@ -55,19 +55,6 @@ with initial sparse regions (perhaps by multiple threads writing to
different regions) and wasted space isn't an issue (perhaps because the
file population contains few small files).
.TP
.B log_merge_wait_timeout_ms=<number>
This option sets the amount of time, in milliseconds, that log merge
creation can wait before timing out. This setting is per-mount, only
changes the behavior of that mount, and only affects the server when it
is running in that mount.
.sp
This determines how long it may take for mounts to synchronize
committing their log trees to create a log merge operation. Setting it
too high can create long latencies in the event that a mount takes a
long time to commit their log. Setting it too low can result in the
creation of excessive numbers of log trees that are never merged. The
default is 500 and it can not be less than 100 nor greater than 60000.
.TP
.B metadev_path=<device>
The metadev_path option specifies the path to the block device that
contains the filesystem's metadata.
-3
View File
@@ -54,8 +54,6 @@ cp man/*.8.gz $RPM_BUILD_ROOT%{_mandir}/man8/.
install -m 755 -D src/scoutfs $RPM_BUILD_ROOT%{_sbindir}/scoutfs
install -m 644 -D src/ioctl.h $RPM_BUILD_ROOT%{_includedir}/scoutfs/ioctl.h
install -m 644 -D src/format.h $RPM_BUILD_ROOT%{_includedir}/scoutfs/format.h
install -m 644 -D src/parallel_restore.h $RPM_BUILD_ROOT%{_includedir}/scoutfs/parallel_restore.h
install -m 644 -D src/scoutfs_parallel_restore.a $RPM_BUILD_ROOT%{_libdir}/scoutfs/libscoutfs_parallel_restore.a
install -m 755 -D fenced/scoutfs-fenced $RPM_BUILD_ROOT%{_libexecdir}/scoutfs-fenced/scoutfs-fenced
install -m 644 -D fenced/scoutfs-fenced.service $RPM_BUILD_ROOT%{_unitdir}/scoutfs-fenced.service
install -m 644 -D fenced/scoutfs-fenced.conf.example $RPM_BUILD_ROOT%{_sysconfdir}/scoutfs/scoutfs-fenced.conf.example
@@ -72,7 +70,6 @@ install -m 644 -D fenced/scoutfs-fenced.conf.example $RPM_BUILD_ROOT%{_sysconfdi
%files -n scoutfs-devel
%defattr(644,root,root,755)
%{_includedir}/scoutfs
%{_libdir}/scoutfs
%clean
rm -rf %{buildroot}
-20
View File
@@ -1,20 +0,0 @@
#include <errno.h>
#include "sparse.h"
#include "util.h"
#include "format.h"
#include "hash.h"
#include "bloom.h"
void calc_bloom_nrs(struct scoutfs_key *key, unsigned int *nrs)
{
u64 hash;
int i;
hash = scoutfs_hash64(key, sizeof(struct scoutfs_key));
for (i = 0; i < SCOUTFS_FOREST_BLOOM_NRS; i++) {
nrs[i] = (u32)hash % SCOUTFS_FOREST_BLOOM_BITS;
hash >>= SCOUTFS_FOREST_BLOOM_FUNC_BITS;
}
}
-6
View File
@@ -1,6 +0,0 @@
#ifndef _BLOOM_H_
#define _BLOOM_H_
void calc_bloom_nrs(struct scoutfs_key *key, unsigned int *nrs);
#endif
+2 -2
View File
@@ -8,7 +8,7 @@
#include "leaf_item_hash.h"
#include "btree.h"
void btree_init_block(struct scoutfs_btree_block *bt, int level)
static void init_block(struct scoutfs_btree_block *bt, int level)
{
int free;
@@ -33,7 +33,7 @@ void btree_init_root_single(struct scoutfs_btree_root *root,
memset(bt, 0, SCOUTFS_BLOCK_LG_SIZE);
btree_init_block(bt, 0);
init_block(bt, 0);
}
static void *alloc_val(struct scoutfs_btree_block *bt, int len)
-1
View File
@@ -1,7 +1,6 @@
#ifndef _BTREE_H_
#define _BTREE_H_
void btree_init_block(struct scoutfs_btree_block *bt, int level);
void btree_init_root_single(struct scoutfs_btree_root *root,
struct scoutfs_btree_block *bt,
u64 seq, u64 blkno);
-24
View File
@@ -1,24 +0,0 @@
#ifndef _LK_RBTREE_WRAPPER_H_
#define _LK_RBTREE_WRAPPER_H_
/*
* We're using this lame hack to build and use the kernel's rbtree in
* userspace. We drop the kernel's rbtree*[ch] implementation in and
* use them with this wrapper. We only have to remove the kernel
* includes from the imported files.
*/
#include <stdbool.h>
#include "util.h"
#define rcu_assign_pointer(a, b) do { a = b; } while (0)
#define READ_ONCE(a) ({ a; })
#define WRITE_ONCE(a, b) do { a = b; } while (0)
#define unlikely(a) ({ a; })
#define EXPORT_SYMBOL(a) /* nop */
#include "rbtree_types.h"
#include "rbtree.h"
#include "rbtree_augmented.h"
#endif
-24
View File
@@ -1,24 +0,0 @@
#include <unistd.h>
#include <sys/stat.h>
#include "sparse.h"
#include "util.h"
#include "format.h"
#include "mode_types.h"
unsigned int mode_to_type(mode_t mode)
{
#define S_SHIFT 12
static unsigned char mode_types[S_IFMT >> S_SHIFT] = {
[S_IFIFO >> S_SHIFT] = SCOUTFS_DT_FIFO,
[S_IFCHR >> S_SHIFT] = SCOUTFS_DT_CHR,
[S_IFDIR >> S_SHIFT] = SCOUTFS_DT_DIR,
[S_IFBLK >> S_SHIFT] = SCOUTFS_DT_BLK,
[S_IFREG >> S_SHIFT] = SCOUTFS_DT_REG,
[S_IFLNK >> S_SHIFT] = SCOUTFS_DT_LNK,
[S_IFSOCK >> S_SHIFT] = SCOUTFS_DT_SOCK,
};
return mode_types[(mode & S_IFMT) >> S_SHIFT];
#undef S_SHIFT
}
-6
View File
@@ -1,6 +0,0 @@
#ifndef _MODE_TYPES_H_
#define _MODE_TYPES_H_
unsigned int mode_to_type(mode_t mode);
#endif
-46
View File
@@ -1,46 +0,0 @@
#ifndef _SCOUTFS_NAME_HASH_H_
#define _SCOUTFS_NAME_HASH_H_
#include "hash.h"
/*
* Test a bit number as though an array of bytes is a large len-bit
* big-endian value. nr 0 is the LSB of the final byte, nr (len - 1) is
* the MSB of the first byte.
*/
static int test_be_bytes_bit(int nr, const char *bytes, int len)
{
return bytes[(len - 1 - nr) >> 3] & (1 << (nr & 7));
}
/*
* Generate a 32bit "fingerprint" of the name by extracting 32 evenly
* distributed bits from the name. The intent is to have the sort order
* of the fingerprints reflect the memcmp() sort order of the names
* while mapping large names down to small fs keys.
*
* Names that are smaller than 32bits are biased towards the high bits
* of the fingerprint so that most significant bits of the fingerprints
* consistently reflect the initial characters of the names.
*/
static inline u32 dirent_name_fingerprint(const char *name, unsigned int name_len)
{
int name_bits = name_len * 8;
int skip = max(name_bits / 32, 1);
u32 fp = 0;
int f;
int n;
for (f = 31, n = name_bits - 1; f >= 0 && n >= 0; f--, n -= skip)
fp |= !!test_be_bytes_bit(n, name, name_bits) << f;
return fp;
}
static inline u64 dirent_name_hash(const char *name, unsigned int name_len)
{
return scoutfs_hash32(name, name_len) |
((u64)dirent_name_fingerprint(name, name_len) << 32);
}
#endif
File diff suppressed because it is too large Load Diff
-103
View File
@@ -1,103 +0,0 @@
#ifndef _SCOUTFS_PARALLEL_RESTORE_H_
#define _SCOUTFS_PARALLEL_RESTORE_H_
#include <errno.h>
struct scoutfs_parallel_restore_progress {
struct scoutfs_btree_root fs_items;
struct scoutfs_btree_root root_items;
struct scoutfs_srch_file sfl;
struct scoutfs_block_ref bloom_ref;
__le64 inode_count;
__le64 max_ino;
};
struct scoutfs_parallel_restore_slice {
__le64 fsid;
__le64 meta_start;
__le64 meta_len;
};
struct scoutfs_parallel_restore_entry {
u64 dir_ino;
u64 pos;
u64 ino;
mode_t mode;
char *name;
unsigned int name_len;
};
struct scoutfs_parallel_restore_xattr {
u64 ino;
u64 pos;
char *name;
unsigned int name_len;
void *value;
unsigned int value_len;
};
struct scoutfs_parallel_restore_inode {
/* all inodes */
u64 ino;
u64 nr_xattrs;
u32 uid;
u32 gid;
u32 mode;
u32 rdev;
struct timespec atime;
struct timespec ctime;
struct timespec mtime;
struct timespec crtime;
/* regular files */
u64 data_version;
u64 size;
bool offline;
/* only used for directories */
u64 nr_subdirs;
u64 total_entry_name_bytes;
/* only used for symlnks */
char *target;
unsigned int target_len; /* not including null terminator */
};
typedef __typeof__(EINVAL) spr_err_t;
struct scoutfs_parallel_restore_writer;
spr_err_t scoutfs_parallel_restore_create_writer(struct scoutfs_parallel_restore_writer **wrip);
void scoutfs_parallel_restore_destroy_writer(struct scoutfs_parallel_restore_writer **wrip);
spr_err_t scoutfs_parallel_restore_init_slices(struct scoutfs_parallel_restore_writer *wri,
struct scoutfs_parallel_restore_slice *slices,
int nr);
spr_err_t scoutfs_parallel_restore_add_slice(struct scoutfs_parallel_restore_writer *wri,
struct scoutfs_parallel_restore_slice *slice);
spr_err_t scoutfs_parallel_restore_get_slice(struct scoutfs_parallel_restore_writer *wri,
struct scoutfs_parallel_restore_slice *slice);
spr_err_t scoutfs_parallel_restore_add_inode(struct scoutfs_parallel_restore_writer *wri,
struct scoutfs_parallel_restore_inode *inode);
spr_err_t scoutfs_parallel_restore_add_entry(struct scoutfs_parallel_restore_writer *wri,
struct scoutfs_parallel_restore_entry *entry);
spr_err_t scoutfs_parallel_restore_add_xattr(struct scoutfs_parallel_restore_writer *wri,
struct scoutfs_parallel_restore_xattr *xattr);
spr_err_t scoutfs_parallel_restore_get_progress(struct scoutfs_parallel_restore_writer *wri,
struct scoutfs_parallel_restore_progress *prog);
spr_err_t scoutfs_parallel_restore_add_progress(struct scoutfs_parallel_restore_writer *wri,
struct scoutfs_parallel_restore_progress *prog);
spr_err_t scoutfs_parallel_restore_write_buf(struct scoutfs_parallel_restore_writer *wri,
void *buf, size_t len, off_t *off_ret,
size_t *count_ret);
spr_err_t scoutfs_parallel_restore_import_super(struct scoutfs_parallel_restore_writer *wri,
struct scoutfs_super_block *super);
spr_err_t scoutfs_parallel_restore_export_super(struct scoutfs_parallel_restore_writer *wri,
struct scoutfs_super_block *super);
#endif
+21 -13
View File
@@ -38,6 +38,7 @@ struct prepare_empty_data_dev_args {
char *meta_device;
char *data_device;
bool check;
bool force;
};
static int do_prepare_empty_data_dev(struct prepare_empty_data_dev_args *args)
@@ -77,20 +78,22 @@ static int do_prepare_empty_data_dev(struct prepare_empty_data_dev_args *args)
goto out;
}
ret = meta_super_in_use(meta_fd, meta_super);
if (ret < 0) {
if (ret == -EBUSY)
fprintf(stderr, "The filesystem must be fully recovered and cleanly unmounted to determine if the data device is empty.\n");
goto out;
}
if (!args->force) {
ret = meta_super_in_use(meta_fd, meta_super);
if (ret < 0) {
if (ret == -EBUSY)
fprintf(stderr, "The filesystem must be fully recovered and cleanly unmounted to determine if the data device is empty.\n");
goto out;
}
in_use = (le64_to_cpu(meta_super->total_data_blocks) - SCOUTFS_DATA_DEV_START_BLKNO) -
le64_to_cpu(meta_super->data_alloc.total_len);
if (in_use) {
fprintf(stderr, "Data block allocator metadata shows "SIZE_FMT" data blocks used by files. They must be removed, truncated, or released before a new empty data device can be used.\n",
SIZE_ARGS(in_use, SCOUTFS_BLOCK_SM_SIZE));
ret = -EINVAL;
goto out;
in_use = (le64_to_cpu(meta_super->total_data_blocks) - SCOUTFS_DATA_DEV_START_BLKNO) -
le64_to_cpu(meta_super->data_alloc.total_len);
if (in_use) {
fprintf(stderr, "Data block allocator metadata shows "SIZE_FMT" data blocks used by files. They must be removed, truncated, or released before a new empty data device can be used.\n",
SIZE_ARGS(in_use, SCOUTFS_BLOCK_SM_SIZE));
ret = -EINVAL;
goto out;
}
}
if (args->data_device) {
@@ -193,6 +196,9 @@ static int parse_opt(int key, char *arg, struct argp_state *state)
case 'c':
args->check = true;
break;
case 'f':
args->force = true;
break;
case ARGP_KEY_ARG:
if (!args->meta_device)
args->meta_device = strdup_or_error(state, arg);
@@ -216,6 +222,7 @@ static int parse_opt(int key, char *arg, struct argp_state *state)
static struct argp_option options[] = {
{ "check", 'c', NULL, 0, "Only check for errors and do not write", },
{ "force", 'f', NULL, 0, "Do not check that super is in use, nor if blocks are in use",},
{ NULL }
};
@@ -230,6 +237,7 @@ static int prepare_empty_data_dev_cmd(int argc, char *argv[])
{
struct prepare_empty_data_dev_args prepare_empty_data_dev_args = {
.check = false,
.force = false,
};
int ret;
-629
View File
@@ -1,629 +0,0 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
Red Black Trees
(C) 1999 Andrea Arcangeli <andrea@suse.de>
(C) 2002 David Woodhouse <dwmw2@infradead.org>
(C) 2012 Michel Lespinasse <walken@google.com>
linux/lib/rbtree.c
*/
#include "lk_rbtree_wrapper.h"
/*
* red-black trees properties: https://en.wikipedia.org/wiki/Rbtree
*
* 1) A node is either red or black
* 2) The root is black
* 3) All leaves (NULL) are black
* 4) Both children of every red node are black
* 5) Every simple path from root to leaves contains the same number
* of black nodes.
*
* 4 and 5 give the O(log n) guarantee, since 4 implies you cannot have two
* consecutive red nodes in a path and every red node is therefore followed by
* a black. So if B is the number of black nodes on every simple path (as per
* 5), then the longest possible path due to 4 is 2B.
*
* We shall indicate color with case, where black nodes are uppercase and red
* nodes will be lowercase. Unknown color nodes shall be drawn as red within
* parentheses and have some accompanying text comment.
*/
/*
* Notes on lockless lookups:
*
* All stores to the tree structure (rb_left and rb_right) must be done using
* WRITE_ONCE(). And we must not inadvertently cause (temporary) loops in the
* tree structure as seen in program order.
*
* These two requirements will allow lockless iteration of the tree -- not
* correct iteration mind you, tree rotations are not atomic so a lookup might
* miss entire subtrees.
*
* But they do guarantee that any such traversal will only see valid elements
* and that it will indeed complete -- does not get stuck in a loop.
*
* It also guarantees that if the lookup returns an element it is the 'correct'
* one. But not returning an element does _NOT_ mean it's not present.
*
* NOTE:
*
* Stores to __rb_parent_color are not important for simple lookups so those
* are left undone as of now. Nor did I check for loops involving parent
* pointers.
*/
static inline void rb_set_black(struct rb_node *rb)
{
rb->__rb_parent_color |= RB_BLACK;
}
static inline struct rb_node *rb_red_parent(struct rb_node *red)
{
return (struct rb_node *)red->__rb_parent_color;
}
/*
* Helper function for rotations:
* - old's parent and color get assigned to new
* - old gets assigned new as a parent and 'color' as a color.
*/
static inline void
__rb_rotate_set_parents(struct rb_node *old, struct rb_node *new,
struct rb_root *root, int color)
{
struct rb_node *parent = rb_parent(old);
new->__rb_parent_color = old->__rb_parent_color;
rb_set_parent_color(old, new, color);
__rb_change_child(old, new, parent, root);
}
static __always_inline void
__rb_insert(struct rb_node *node, struct rb_root *root,
void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
{
struct rb_node *parent = rb_red_parent(node), *gparent, *tmp;
while (true) {
/*
* Loop invariant: node is red.
*/
if (unlikely(!parent)) {
/*
* The inserted node is root. Either this is the
* first node, or we recursed at Case 1 below and
* are no longer violating 4).
*/
rb_set_parent_color(node, NULL, RB_BLACK);
break;
}
/*
* If there is a black parent, we are done.
* Otherwise, take some corrective action as,
* per 4), we don't want a red root or two
* consecutive red nodes.
*/
if(rb_is_black(parent))
break;
gparent = rb_red_parent(parent);
tmp = gparent->rb_right;
if (parent != tmp) { /* parent == gparent->rb_left */
if (tmp && rb_is_red(tmp)) {
/*
* Case 1 - node's uncle is red (color flips).
*
* G g
* / \ / \
* p u --> P U
* / /
* n n
*
* However, since g's parent might be red, and
* 4) does not allow this, we need to recurse
* at g.
*/
rb_set_parent_color(tmp, gparent, RB_BLACK);
rb_set_parent_color(parent, gparent, RB_BLACK);
node = gparent;
parent = rb_parent(node);
rb_set_parent_color(node, parent, RB_RED);
continue;
}
tmp = parent->rb_right;
if (node == tmp) {
/*
* Case 2 - node's uncle is black and node is
* the parent's right child (left rotate at parent).
*
* G G
* / \ / \
* p U --> n U
* \ /
* n p
*
* This still leaves us in violation of 4), the
* continuation into Case 3 will fix that.
*/
tmp = node->rb_left;
WRITE_ONCE(parent->rb_right, tmp);
WRITE_ONCE(node->rb_left, parent);
if (tmp)
rb_set_parent_color(tmp, parent,
RB_BLACK);
rb_set_parent_color(parent, node, RB_RED);
augment_rotate(parent, node);
parent = node;
tmp = node->rb_right;
}
/*
* Case 3 - node's uncle is black and node is
* the parent's left child (right rotate at gparent).
*
* G P
* / \ / \
* p U --> n g
* / \
* n U
*/
WRITE_ONCE(gparent->rb_left, tmp); /* == parent->rb_right */
WRITE_ONCE(parent->rb_right, gparent);
if (tmp)
rb_set_parent_color(tmp, gparent, RB_BLACK);
__rb_rotate_set_parents(gparent, parent, root, RB_RED);
augment_rotate(gparent, parent);
break;
} else {
tmp = gparent->rb_left;
if (tmp && rb_is_red(tmp)) {
/* Case 1 - color flips */
rb_set_parent_color(tmp, gparent, RB_BLACK);
rb_set_parent_color(parent, gparent, RB_BLACK);
node = gparent;
parent = rb_parent(node);
rb_set_parent_color(node, parent, RB_RED);
continue;
}
tmp = parent->rb_left;
if (node == tmp) {
/* Case 2 - right rotate at parent */
tmp = node->rb_right;
WRITE_ONCE(parent->rb_left, tmp);
WRITE_ONCE(node->rb_right, parent);
if (tmp)
rb_set_parent_color(tmp, parent,
RB_BLACK);
rb_set_parent_color(parent, node, RB_RED);
augment_rotate(parent, node);
parent = node;
tmp = node->rb_left;
}
/* Case 3 - left rotate at gparent */
WRITE_ONCE(gparent->rb_right, tmp); /* == parent->rb_left */
WRITE_ONCE(parent->rb_left, gparent);
if (tmp)
rb_set_parent_color(tmp, gparent, RB_BLACK);
__rb_rotate_set_parents(gparent, parent, root, RB_RED);
augment_rotate(gparent, parent);
break;
}
}
}
/*
* Inline version for rb_erase() use - we want to be able to inline
* and eliminate the dummy_rotate callback there
*/
static __always_inline void
____rb_erase_color(struct rb_node *parent, struct rb_root *root,
void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
{
struct rb_node *node = NULL, *sibling, *tmp1, *tmp2;
while (true) {
/*
* Loop invariants:
* - node is black (or NULL on first iteration)
* - node is not the root (parent is not NULL)
* - All leaf paths going through parent and node have a
* black node count that is 1 lower than other leaf paths.
*/
sibling = parent->rb_right;
if (node != sibling) { /* node == parent->rb_left */
if (rb_is_red(sibling)) {
/*
* Case 1 - left rotate at parent
*
* P S
* / \ / \
* N s --> p Sr
* / \ / \
* Sl Sr N Sl
*/
tmp1 = sibling->rb_left;
WRITE_ONCE(parent->rb_right, tmp1);
WRITE_ONCE(sibling->rb_left, parent);
rb_set_parent_color(tmp1, parent, RB_BLACK);
__rb_rotate_set_parents(parent, sibling, root,
RB_RED);
augment_rotate(parent, sibling);
sibling = tmp1;
}
tmp1 = sibling->rb_right;
if (!tmp1 || rb_is_black(tmp1)) {
tmp2 = sibling->rb_left;
if (!tmp2 || rb_is_black(tmp2)) {
/*
* Case 2 - sibling color flip
* (p could be either color here)
*
* (p) (p)
* / \ / \
* N S --> N s
* / \ / \
* Sl Sr Sl Sr
*
* This leaves us violating 5) which
* can be fixed by flipping p to black
* if it was red, or by recursing at p.
* p is red when coming from Case 1.
*/
rb_set_parent_color(sibling, parent,
RB_RED);
if (rb_is_red(parent))
rb_set_black(parent);
else {
node = parent;
parent = rb_parent(node);
if (parent)
continue;
}
break;
}
/*
* Case 3 - right rotate at sibling
* (p could be either color here)
*
* (p) (p)
* / \ / \
* N S --> N sl
* / \ \
* sl Sr S
* \
* Sr
*
* Note: p might be red, and then both
* p and sl are red after rotation(which
* breaks property 4). This is fixed in
* Case 4 (in __rb_rotate_set_parents()
* which set sl the color of p
* and set p RB_BLACK)
*
* (p) (sl)
* / \ / \
* N sl --> P S
* \ / \
* S N Sr
* \
* Sr
*/
tmp1 = tmp2->rb_right;
WRITE_ONCE(sibling->rb_left, tmp1);
WRITE_ONCE(tmp2->rb_right, sibling);
WRITE_ONCE(parent->rb_right, tmp2);
if (tmp1)
rb_set_parent_color(tmp1, sibling,
RB_BLACK);
augment_rotate(sibling, tmp2);
tmp1 = sibling;
sibling = tmp2;
}
/*
* Case 4 - left rotate at parent + color flips
* (p and sl could be either color here.
* After rotation, p becomes black, s acquires
* p's color, and sl keeps its color)
*
* (p) (s)
* / \ / \
* N S --> P Sr
* / \ / \
* (sl) sr N (sl)
*/
tmp2 = sibling->rb_left;
WRITE_ONCE(parent->rb_right, tmp2);
WRITE_ONCE(sibling->rb_left, parent);
rb_set_parent_color(tmp1, sibling, RB_BLACK);
if (tmp2)
rb_set_parent(tmp2, parent);
__rb_rotate_set_parents(parent, sibling, root,
RB_BLACK);
augment_rotate(parent, sibling);
break;
} else {
sibling = parent->rb_left;
if (rb_is_red(sibling)) {
/* Case 1 - right rotate at parent */
tmp1 = sibling->rb_right;
WRITE_ONCE(parent->rb_left, tmp1);
WRITE_ONCE(sibling->rb_right, parent);
rb_set_parent_color(tmp1, parent, RB_BLACK);
__rb_rotate_set_parents(parent, sibling, root,
RB_RED);
augment_rotate(parent, sibling);
sibling = tmp1;
}
tmp1 = sibling->rb_left;
if (!tmp1 || rb_is_black(tmp1)) {
tmp2 = sibling->rb_right;
if (!tmp2 || rb_is_black(tmp2)) {
/* Case 2 - sibling color flip */
rb_set_parent_color(sibling, parent,
RB_RED);
if (rb_is_red(parent))
rb_set_black(parent);
else {
node = parent;
parent = rb_parent(node);
if (parent)
continue;
}
break;
}
/* Case 3 - left rotate at sibling */
tmp1 = tmp2->rb_left;
WRITE_ONCE(sibling->rb_right, tmp1);
WRITE_ONCE(tmp2->rb_left, sibling);
WRITE_ONCE(parent->rb_left, tmp2);
if (tmp1)
rb_set_parent_color(tmp1, sibling,
RB_BLACK);
augment_rotate(sibling, tmp2);
tmp1 = sibling;
sibling = tmp2;
}
/* Case 4 - right rotate at parent + color flips */
tmp2 = sibling->rb_right;
WRITE_ONCE(parent->rb_left, tmp2);
WRITE_ONCE(sibling->rb_right, parent);
rb_set_parent_color(tmp1, sibling, RB_BLACK);
if (tmp2)
rb_set_parent(tmp2, parent);
__rb_rotate_set_parents(parent, sibling, root,
RB_BLACK);
augment_rotate(parent, sibling);
break;
}
}
}
/* Non-inline version for rb_erase_augmented() use */
void __rb_erase_color(struct rb_node *parent, struct rb_root *root,
void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
{
____rb_erase_color(parent, root, augment_rotate);
}
EXPORT_SYMBOL(__rb_erase_color);
/*
* Non-augmented rbtree manipulation functions.
*
* We use dummy augmented callbacks here, and have the compiler optimize them
* out of the rb_insert_color() and rb_erase() function definitions.
*/
static inline void dummy_propagate(struct rb_node *node, struct rb_node *stop) {}
static inline void dummy_copy(struct rb_node *old, struct rb_node *new) {}
static inline void dummy_rotate(struct rb_node *old, struct rb_node *new) {}
static const struct rb_augment_callbacks dummy_callbacks = {
.propagate = dummy_propagate,
.copy = dummy_copy,
.rotate = dummy_rotate
};
void rb_insert_color(struct rb_node *node, struct rb_root *root)
{
__rb_insert(node, root, dummy_rotate);
}
EXPORT_SYMBOL(rb_insert_color);
void rb_erase(struct rb_node *node, struct rb_root *root)
{
struct rb_node *rebalance;
rebalance = __rb_erase_augmented(node, root, &dummy_callbacks);
if (rebalance)
____rb_erase_color(rebalance, root, dummy_rotate);
}
EXPORT_SYMBOL(rb_erase);
/*
* Augmented rbtree manipulation functions.
*
* This instantiates the same __always_inline functions as in the non-augmented
* case, but this time with user-defined callbacks.
*/
void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
{
__rb_insert(node, root, augment_rotate);
}
EXPORT_SYMBOL(__rb_insert_augmented);
/*
* This function returns the first node (in sort order) of the tree.
*/
struct rb_node *rb_first(const struct rb_root *root)
{
struct rb_node *n;
n = root->rb_node;
if (!n)
return NULL;
while (n->rb_left)
n = n->rb_left;
return n;
}
EXPORT_SYMBOL(rb_first);
struct rb_node *rb_last(const struct rb_root *root)
{
struct rb_node *n;
n = root->rb_node;
if (!n)
return NULL;
while (n->rb_right)
n = n->rb_right;
return n;
}
EXPORT_SYMBOL(rb_last);
struct rb_node *rb_next(const struct rb_node *node)
{
struct rb_node *parent;
if (RB_EMPTY_NODE(node))
return NULL;
/*
* If we have a right-hand child, go down and then left as far
* as we can.
*/
if (node->rb_right) {
node = node->rb_right;
while (node->rb_left)
node = node->rb_left;
return (struct rb_node *)node;
}
/*
* No right-hand children. Everything down and left is smaller than us,
* so any 'next' node must be in the general direction of our parent.
* Go up the tree; any time the ancestor is a right-hand child of its
* parent, keep going up. First time it's a left-hand child of its
* parent, said parent is our 'next' node.
*/
while ((parent = rb_parent(node)) && node == parent->rb_right)
node = parent;
return parent;
}
EXPORT_SYMBOL(rb_next);
struct rb_node *rb_prev(const struct rb_node *node)
{
struct rb_node *parent;
if (RB_EMPTY_NODE(node))
return NULL;
/*
* If we have a left-hand child, go down and then right as far
* as we can.
*/
if (node->rb_left) {
node = node->rb_left;
while (node->rb_right)
node = node->rb_right;
return (struct rb_node *)node;
}
/*
* No left-hand children. Go up till we find an ancestor which
* is a right-hand child of its parent.
*/
while ((parent = rb_parent(node)) && node == parent->rb_left)
node = parent;
return parent;
}
EXPORT_SYMBOL(rb_prev);
void rb_replace_node(struct rb_node *victim, struct rb_node *new,
struct rb_root *root)
{
struct rb_node *parent = rb_parent(victim);
/* Copy the pointers/colour from the victim to the replacement */
*new = *victim;
/* Set the surrounding nodes to point to the replacement */
if (victim->rb_left)
rb_set_parent(victim->rb_left, new);
if (victim->rb_right)
rb_set_parent(victim->rb_right, new);
__rb_change_child(victim, new, parent, root);
}
EXPORT_SYMBOL(rb_replace_node);
void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new,
struct rb_root *root)
{
struct rb_node *parent = rb_parent(victim);
/* Copy the pointers/colour from the victim to the replacement */
*new = *victim;
/* Set the surrounding nodes to point to the replacement */
if (victim->rb_left)
rb_set_parent(victim->rb_left, new);
if (victim->rb_right)
rb_set_parent(victim->rb_right, new);
/* Set the parent's pointer to the new node last after an RCU barrier
* so that the pointers onwards are seen to be set correctly when doing
* an RCU walk over the tree.
*/
__rb_change_child_rcu(victim, new, parent, root);
}
EXPORT_SYMBOL(rb_replace_node_rcu);
static struct rb_node *rb_left_deepest_node(const struct rb_node *node)
{
for (;;) {
if (node->rb_left)
node = node->rb_left;
else if (node->rb_right)
node = node->rb_right;
else
return (struct rb_node *)node;
}
}
struct rb_node *rb_next_postorder(const struct rb_node *node)
{
const struct rb_node *parent;
if (!node)
return NULL;
parent = rb_parent(node);
/* If we're sitting on node, we've already seen our children */
if (parent && node == parent->rb_left && parent->rb_right) {
/* If we are the parent's left node, go to the parent's right
* node then all the way down to the left */
return rb_left_deepest_node(parent->rb_right);
} else
/* Otherwise we are the parent's right node, and the parent
* should be next */
return (struct rb_node *)parent;
}
EXPORT_SYMBOL(rb_next_postorder);
struct rb_node *rb_first_postorder(const struct rb_root *root)
{
if (!root->rb_node)
return NULL;
return rb_left_deepest_node(root->rb_node);
}
EXPORT_SYMBOL(rb_first_postorder);
-328
View File
@@ -1,328 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
Red Black Trees
(C) 1999 Andrea Arcangeli <andrea@suse.de>
linux/include/linux/rbtree.h
To use rbtrees you'll have to implement your own insert and search cores.
This will avoid us to use callbacks and to drop drammatically performances.
I know it's not the cleaner way, but in C (not in C++) to get
performances and genericity...
See Documentation/core-api/rbtree.rst for documentation and samples.
*/
#ifndef _LINUX_RBTREE_H
#define _LINUX_RBTREE_H
#define rb_parent(r) ((struct rb_node *)((r)->__rb_parent_color & ~3))
#define rb_entry(ptr, type, member) container_of(ptr, type, member)
#define RB_EMPTY_ROOT(root) (READ_ONCE((root)->rb_node) == NULL)
/* 'empty' nodes are nodes that are known not to be inserted in an rbtree */
#define RB_EMPTY_NODE(node) \
((node)->__rb_parent_color == (unsigned long)(node))
#define RB_CLEAR_NODE(node) \
((node)->__rb_parent_color = (unsigned long)(node))
extern void rb_insert_color(struct rb_node *, struct rb_root *);
extern void rb_erase(struct rb_node *, struct rb_root *);
/* Find logical next and previous nodes in a tree */
extern struct rb_node *rb_next(const struct rb_node *);
extern struct rb_node *rb_prev(const struct rb_node *);
extern struct rb_node *rb_first(const struct rb_root *);
extern struct rb_node *rb_last(const struct rb_root *);
/* Postorder iteration - always visit the parent after its children */
extern struct rb_node *rb_first_postorder(const struct rb_root *);
extern struct rb_node *rb_next_postorder(const struct rb_node *);
/* Fast replacement of a single node without remove/rebalance/add/rebalance */
extern void rb_replace_node(struct rb_node *victim, struct rb_node *new,
struct rb_root *root);
extern void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new,
struct rb_root *root);
static inline void rb_link_node(struct rb_node *node, struct rb_node *parent,
struct rb_node **rb_link)
{
node->__rb_parent_color = (unsigned long)parent;
node->rb_left = node->rb_right = NULL;
*rb_link = node;
}
static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent,
struct rb_node **rb_link)
{
node->__rb_parent_color = (unsigned long)parent;
node->rb_left = node->rb_right = NULL;
rcu_assign_pointer(*rb_link, node);
}
#define rb_entry_safe(ptr, type, member) \
({ typeof(ptr) ____ptr = (ptr); \
____ptr ? rb_entry(____ptr, type, member) : NULL; \
})
/**
* rbtree_postorder_for_each_entry_safe - iterate in post-order over rb_root of
* given type allowing the backing memory of @pos to be invalidated
*
* @pos: the 'type *' to use as a loop cursor.
* @n: another 'type *' to use as temporary storage
* @root: 'rb_root *' of the rbtree.
* @field: the name of the rb_node field within 'type'.
*
* rbtree_postorder_for_each_entry_safe() provides a similar guarantee as
* list_for_each_entry_safe() and allows the iteration to continue independent
* of changes to @pos by the body of the loop.
*
* Note, however, that it cannot handle other modifications that re-order the
* rbtree it is iterating over. This includes calling rb_erase() on @pos, as
* rb_erase() may rebalance the tree, causing us to miss some nodes.
*/
#define rbtree_postorder_for_each_entry_safe(pos, n, root, field) \
for (pos = rb_entry_safe(rb_first_postorder(root), typeof(*pos), field); \
pos && ({ n = rb_entry_safe(rb_next_postorder(&pos->field), \
typeof(*pos), field); 1; }); \
pos = n)
/* Same as rb_first(), but O(1) */
#define rb_first_cached(root) (root)->rb_leftmost
static inline void rb_insert_color_cached(struct rb_node *node,
struct rb_root_cached *root,
bool leftmost)
{
if (leftmost)
root->rb_leftmost = node;
rb_insert_color(node, &root->rb_root);
}
static inline struct rb_node *
rb_erase_cached(struct rb_node *node, struct rb_root_cached *root)
{
struct rb_node *leftmost = NULL;
if (root->rb_leftmost == node)
leftmost = root->rb_leftmost = rb_next(node);
rb_erase(node, &root->rb_root);
return leftmost;
}
static inline void rb_replace_node_cached(struct rb_node *victim,
struct rb_node *new,
struct rb_root_cached *root)
{
if (root->rb_leftmost == victim)
root->rb_leftmost = new;
rb_replace_node(victim, new, &root->rb_root);
}
/*
* The below helper functions use 2 operators with 3 different
* calling conventions. The operators are related like:
*
* comp(a->key,b) < 0 := less(a,b)
* comp(a->key,b) > 0 := less(b,a)
* comp(a->key,b) == 0 := !less(a,b) && !less(b,a)
*
* If these operators define a partial order on the elements we make no
* guarantee on which of the elements matching the key is found. See
* rb_find().
*
* The reason for this is to allow the find() interface without requiring an
* on-stack dummy object, which might not be feasible due to object size.
*/
/**
* rb_add_cached() - insert @node into the leftmost cached tree @tree
* @node: node to insert
* @tree: leftmost cached tree to insert @node into
* @less: operator defining the (partial) node order
*
* Returns @node when it is the new leftmost, or NULL.
*/
static __always_inline struct rb_node *
rb_add_cached(struct rb_node *node, struct rb_root_cached *tree,
bool (*less)(struct rb_node *, const struct rb_node *))
{
struct rb_node **link = &tree->rb_root.rb_node;
struct rb_node *parent = NULL;
bool leftmost = true;
while (*link) {
parent = *link;
if (less(node, parent)) {
link = &parent->rb_left;
} else {
link = &parent->rb_right;
leftmost = false;
}
}
rb_link_node(node, parent, link);
rb_insert_color_cached(node, tree, leftmost);
return leftmost ? node : NULL;
}
/**
* rb_add() - insert @node into @tree
* @node: node to insert
* @tree: tree to insert @node into
* @less: operator defining the (partial) node order
*/
static __always_inline void
rb_add(struct rb_node *node, struct rb_root *tree,
bool (*less)(struct rb_node *, const struct rb_node *))
{
struct rb_node **link = &tree->rb_node;
struct rb_node *parent = NULL;
while (*link) {
parent = *link;
if (less(node, parent))
link = &parent->rb_left;
else
link = &parent->rb_right;
}
rb_link_node(node, parent, link);
rb_insert_color(node, tree);
}
/**
* rb_find_add() - find equivalent @node in @tree, or add @node
* @node: node to look-for / insert
* @tree: tree to search / modify
* @cmp: operator defining the node order
*
* Returns the rb_node matching @node, or NULL when no match is found and @node
* is inserted.
*/
static __always_inline struct rb_node *
rb_find_add(struct rb_node *node, struct rb_root *tree,
int (*cmp)(struct rb_node *, const struct rb_node *))
{
struct rb_node **link = &tree->rb_node;
struct rb_node *parent = NULL;
int c;
while (*link) {
parent = *link;
c = cmp(node, parent);
if (c < 0)
link = &parent->rb_left;
else if (c > 0)
link = &parent->rb_right;
else
return parent;
}
rb_link_node(node, parent, link);
rb_insert_color(node, tree);
return NULL;
}
/**
* rb_find() - find @key in tree @tree
* @key: key to match
* @tree: tree to search
* @cmp: operator defining the node order
*
* Returns the rb_node matching @key or NULL.
*/
static __always_inline struct rb_node *
rb_find(const void *key, const struct rb_root *tree,
int (*cmp)(const void *key, const struct rb_node *))
{
struct rb_node *node = tree->rb_node;
while (node) {
int c = cmp(key, node);
if (c < 0)
node = node->rb_left;
else if (c > 0)
node = node->rb_right;
else
return node;
}
return NULL;
}
/**
* rb_find_first() - find the first @key in @tree
* @key: key to match
* @tree: tree to search
* @cmp: operator defining node order
*
* Returns the leftmost node matching @key, or NULL.
*/
static __always_inline struct rb_node *
rb_find_first(const void *key, const struct rb_root *tree,
int (*cmp)(const void *key, const struct rb_node *))
{
struct rb_node *node = tree->rb_node;
struct rb_node *match = NULL;
while (node) {
int c = cmp(key, node);
if (c <= 0) {
if (!c)
match = node;
node = node->rb_left;
} else if (c > 0) {
node = node->rb_right;
}
}
return match;
}
/**
* rb_next_match() - find the next @key in @tree
* @key: key to match
* @tree: tree to search
* @cmp: operator defining node order
*
* Returns the next node matching @key, or NULL.
*/
static __always_inline struct rb_node *
rb_next_match(const void *key, struct rb_node *node,
int (*cmp)(const void *key, const struct rb_node *))
{
node = rb_next(node);
if (node && cmp(key, node))
node = NULL;
return node;
}
/**
* rb_for_each() - iterates a subtree matching @key
* @node: iterator
* @key: key to match
* @tree: tree to search
* @cmp: operator defining node order
*/
#define rb_for_each(node, key, tree, cmp) \
for ((node) = rb_find_first((key), (tree), (cmp)); \
(node); (node) = rb_next_match((key), (node), (cmp)))
#endif /* _LINUX_RBTREE_H */
-313
View File
@@ -1,313 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
Red Black Trees
(C) 1999 Andrea Arcangeli <andrea@suse.de>
(C) 2002 David Woodhouse <dwmw2@infradead.org>
(C) 2012 Michel Lespinasse <walken@google.com>
linux/include/linux/rbtree_augmented.h
*/
#ifndef _LINUX_RBTREE_AUGMENTED_H
#define _LINUX_RBTREE_AUGMENTED_H
/*
* Please note - only struct rb_augment_callbacks and the prototypes for
* rb_insert_augmented() and rb_erase_augmented() are intended to be public.
* The rest are implementation details you are not expected to depend on.
*
* See Documentation/core-api/rbtree.rst for documentation and samples.
*/
struct rb_augment_callbacks {
void (*propagate)(struct rb_node *node, struct rb_node *stop);
void (*copy)(struct rb_node *old, struct rb_node *new);
void (*rotate)(struct rb_node *old, struct rb_node *new);
};
extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
void (*augment_rotate)(struct rb_node *old, struct rb_node *new));
/*
* Fixup the rbtree and update the augmented information when rebalancing.
*
* On insertion, the user must update the augmented information on the path
* leading to the inserted node, then call rb_link_node() as usual and
* rb_insert_augmented() instead of the usual rb_insert_color() call.
* If rb_insert_augmented() rebalances the rbtree, it will callback into
* a user provided function to update the augmented information on the
* affected subtrees.
*/
static inline void
rb_insert_augmented(struct rb_node *node, struct rb_root *root,
const struct rb_augment_callbacks *augment)
{
__rb_insert_augmented(node, root, augment->rotate);
}
static inline void
rb_insert_augmented_cached(struct rb_node *node,
struct rb_root_cached *root, bool newleft,
const struct rb_augment_callbacks *augment)
{
if (newleft)
root->rb_leftmost = node;
rb_insert_augmented(node, &root->rb_root, augment);
}
/*
* Template for declaring augmented rbtree callbacks (generic case)
*
* RBSTATIC: 'static' or empty
* RBNAME: name of the rb_augment_callbacks structure
* RBSTRUCT: struct type of the tree nodes
* RBFIELD: name of struct rb_node field within RBSTRUCT
* RBAUGMENTED: name of field within RBSTRUCT holding data for subtree
* RBCOMPUTE: name of function that recomputes the RBAUGMENTED data
*/
#define RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \
RBSTRUCT, RBFIELD, RBAUGMENTED, RBCOMPUTE) \
static inline void \
RBNAME ## _propagate(struct rb_node *rb, struct rb_node *stop) \
{ \
while (rb != stop) { \
RBSTRUCT *node = rb_entry(rb, RBSTRUCT, RBFIELD); \
if (RBCOMPUTE(node, true)) \
break; \
rb = rb_parent(&node->RBFIELD); \
} \
} \
static inline void \
RBNAME ## _copy(struct rb_node *rb_old, struct rb_node *rb_new) \
{ \
RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD); \
RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \
new->RBAUGMENTED = old->RBAUGMENTED; \
} \
static void \
RBNAME ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \
{ \
RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD); \
RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \
new->RBAUGMENTED = old->RBAUGMENTED; \
RBCOMPUTE(old, false); \
} \
RBSTATIC const struct rb_augment_callbacks RBNAME = { \
.propagate = RBNAME ## _propagate, \
.copy = RBNAME ## _copy, \
.rotate = RBNAME ## _rotate \
};
/*
* Template for declaring augmented rbtree callbacks,
* computing RBAUGMENTED scalar as max(RBCOMPUTE(node)) for all subtree nodes.
*
* RBSTATIC: 'static' or empty
* RBNAME: name of the rb_augment_callbacks structure
* RBSTRUCT: struct type of the tree nodes
* RBFIELD: name of struct rb_node field within RBSTRUCT
* RBTYPE: type of the RBAUGMENTED field
* RBAUGMENTED: name of RBTYPE field within RBSTRUCT holding data for subtree
* RBCOMPUTE: name of function that returns the per-node RBTYPE scalar
*/
#define RB_DECLARE_CALLBACKS_MAX(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \
RBTYPE, RBAUGMENTED, RBCOMPUTE) \
static inline bool RBNAME ## _compute_max(RBSTRUCT *node, bool exit) \
{ \
RBSTRUCT *child; \
RBTYPE max = RBCOMPUTE(node); \
if (node->RBFIELD.rb_left) { \
child = rb_entry(node->RBFIELD.rb_left, RBSTRUCT, RBFIELD); \
if (child->RBAUGMENTED > max) \
max = child->RBAUGMENTED; \
} \
if (node->RBFIELD.rb_right) { \
child = rb_entry(node->RBFIELD.rb_right, RBSTRUCT, RBFIELD); \
if (child->RBAUGMENTED > max) \
max = child->RBAUGMENTED; \
} \
if (exit && node->RBAUGMENTED == max) \
return true; \
node->RBAUGMENTED = max; \
return false; \
} \
RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \
RBSTRUCT, RBFIELD, RBAUGMENTED, RBNAME ## _compute_max)
#define RB_RED 0
#define RB_BLACK 1
#define __rb_parent(pc) ((struct rb_node *)(pc & ~3))
#define __rb_color(pc) ((pc) & 1)
#define __rb_is_black(pc) __rb_color(pc)
#define __rb_is_red(pc) (!__rb_color(pc))
#define rb_color(rb) __rb_color((rb)->__rb_parent_color)
#define rb_is_red(rb) __rb_is_red((rb)->__rb_parent_color)
#define rb_is_black(rb) __rb_is_black((rb)->__rb_parent_color)
static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p)
{
rb->__rb_parent_color = rb_color(rb) | (unsigned long)p;
}
static inline void rb_set_parent_color(struct rb_node *rb,
struct rb_node *p, int color)
{
rb->__rb_parent_color = (unsigned long)p | color;
}
static inline void
__rb_change_child(struct rb_node *old, struct rb_node *new,
struct rb_node *parent, struct rb_root *root)
{
if (parent) {
if (parent->rb_left == old)
WRITE_ONCE(parent->rb_left, new);
else
WRITE_ONCE(parent->rb_right, new);
} else
WRITE_ONCE(root->rb_node, new);
}
static inline void
__rb_change_child_rcu(struct rb_node *old, struct rb_node *new,
struct rb_node *parent, struct rb_root *root)
{
if (parent) {
if (parent->rb_left == old)
rcu_assign_pointer(parent->rb_left, new);
else
rcu_assign_pointer(parent->rb_right, new);
} else
rcu_assign_pointer(root->rb_node, new);
}
extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root,
void (*augment_rotate)(struct rb_node *old, struct rb_node *new));
static __always_inline struct rb_node *
__rb_erase_augmented(struct rb_node *node, struct rb_root *root,
const struct rb_augment_callbacks *augment)
{
struct rb_node *child = node->rb_right;
struct rb_node *tmp = node->rb_left;
struct rb_node *parent, *rebalance;
unsigned long pc;
if (!tmp) {
/*
* Case 1: node to erase has no more than 1 child (easy!)
*
* Note that if there is one child it must be red due to 5)
* and node must be black due to 4). We adjust colors locally
* so as to bypass __rb_erase_color() later on.
*/
pc = node->__rb_parent_color;
parent = __rb_parent(pc);
__rb_change_child(node, child, parent, root);
if (child) {
child->__rb_parent_color = pc;
rebalance = NULL;
} else
rebalance = __rb_is_black(pc) ? parent : NULL;
tmp = parent;
} else if (!child) {
/* Still case 1, but this time the child is node->rb_left */
tmp->__rb_parent_color = pc = node->__rb_parent_color;
parent = __rb_parent(pc);
__rb_change_child(node, tmp, parent, root);
rebalance = NULL;
tmp = parent;
} else {
struct rb_node *successor = child, *child2;
tmp = child->rb_left;
if (!tmp) {
/*
* Case 2: node's successor is its right child
*
* (n) (s)
* / \ / \
* (x) (s) -> (x) (c)
* \
* (c)
*/
parent = successor;
child2 = successor->rb_right;
augment->copy(node, successor);
} else {
/*
* Case 3: node's successor is leftmost under
* node's right child subtree
*
* (n) (s)
* / \ / \
* (x) (y) -> (x) (y)
* / /
* (p) (p)
* / /
* (s) (c)
* \
* (c)
*/
do {
parent = successor;
successor = tmp;
tmp = tmp->rb_left;
} while (tmp);
child2 = successor->rb_right;
WRITE_ONCE(parent->rb_left, child2);
WRITE_ONCE(successor->rb_right, child);
rb_set_parent(child, successor);
augment->copy(node, successor);
augment->propagate(parent, successor);
}
tmp = node->rb_left;
WRITE_ONCE(successor->rb_left, tmp);
rb_set_parent(tmp, successor);
pc = node->__rb_parent_color;
tmp = __rb_parent(pc);
__rb_change_child(node, successor, tmp, root);
if (child2) {
rb_set_parent_color(child2, parent, RB_BLACK);
rebalance = NULL;
} else {
rebalance = rb_is_black(successor) ? parent : NULL;
}
successor->__rb_parent_color = pc;
tmp = successor;
}
augment->propagate(tmp, NULL);
return rebalance;
}
static __always_inline void
rb_erase_augmented(struct rb_node *node, struct rb_root *root,
const struct rb_augment_callbacks *augment)
{
struct rb_node *rebalance = __rb_erase_augmented(node, root, augment);
if (rebalance)
__rb_erase_color(rebalance, root, augment->rotate);
}
static __always_inline void
rb_erase_augmented_cached(struct rb_node *node, struct rb_root_cached *root,
const struct rb_augment_callbacks *augment)
{
if (root->rb_leftmost == node)
root->rb_leftmost = rb_next(node);
rb_erase_augmented(node, &root->rb_root, augment);
}
#endif /* _LINUX_RBTREE_AUGMENTED_H */
-34
View File
@@ -1,34 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _LINUX_RBTREE_TYPES_H
#define _LINUX_RBTREE_TYPES_H
struct rb_node {
unsigned long __rb_parent_color;
struct rb_node *rb_right;
struct rb_node *rb_left;
} __attribute__((aligned(sizeof(long))));
/* The alignment might seem pointless, but allegedly CRIS needs it */
struct rb_root {
struct rb_node *rb_node;
};
/*
* Leftmost-cached rbtrees.
*
* We do not cache the rightmost node based on footprint
* size vs number of potential users that could benefit
* from O(1) rb_last(). Just not worth it, users that want
* this feature can always implement the logic explicitly.
* Furthermore, users that want to cache both pointers may
* find it a bit asymmetric, but that's ok.
*/
struct rb_root_cached {
struct rb_root rb_root;
struct rb_node *rb_leftmost;
};
#define RB_ROOT (struct rb_root) { NULL, }
#define RB_ROOT_CACHED (struct rb_root_cached) { {NULL, }, NULL }
#endif
-34
View File
@@ -44,37 +44,3 @@ int srch_decode_entry(void *buf, struct scoutfs_srch_entry *sre,
return tot;
}
static int encode_u64(__le64 *buf, u64 val)
{
int bytes;
val = (val << 1) ^ ((s64)val >> 63); /* shift sign extend */
bytes = (fls64(val) + 7) >> 3;
put_unaligned_le64(val, buf);
return bytes;
}
int srch_encode_entry(void *buf, struct scoutfs_srch_entry *sre, struct scoutfs_srch_entry *prev)
{
u64 diffs[] = {
le64_to_cpu(sre->hash) - le64_to_cpu(prev->hash),
le64_to_cpu(sre->ino) - le64_to_cpu(prev->ino),
le64_to_cpu(sre->id) - le64_to_cpu(prev->id),
};
u16 lengths = 0;
int bytes;
int tot = 2;
int i;
for (i = 0; i < array_size(diffs); i++) {
bytes = encode_u64(buf + tot, diffs[i]);
lengths |= bytes << (i << 2);
tot += bytes;
}
put_unaligned_le16(lengths, buf);
return tot;
}
-1
View File
@@ -3,6 +3,5 @@
int srch_decode_entry(void *buf, struct scoutfs_srch_entry *sre,
struct scoutfs_srch_entry *prev);
int srch_encode_entry(void *buf, struct scoutfs_srch_entry *sre, struct scoutfs_srch_entry *prev);
#endif
-13
View File
@@ -69,8 +69,6 @@ do { \
#define container_of(ptr, type, memb) \
((type *)((void *)(ptr) - offsetof(type, memb)))
#define NSEC_PER_SEC 1000000000
#define BITS_PER_LONG (sizeof(long) * 8)
#define U8_MAX ((u8)~0ULL)
#define U16_MAX ((u16)~0ULL)
@@ -83,7 +81,6 @@ do { \
\
(_x == 0 ? 0 : 64 - __builtin_clzll(_x)); \
})
#define fls64(x) flsll(x)
#define ilog2(x) \
({ \
@@ -101,16 +98,6 @@ emit_get_unaligned_le(16)
emit_get_unaligned_le(32)
emit_get_unaligned_le(64)
#define emit_put_unaligned_le(nr) \
static inline void put_unaligned_le##nr(u##nr val, void *buf) \
{ \
__le##nr x = cpu_to_le##nr(val); \
memcpy(buf, &x, sizeof(x)); \
}
emit_put_unaligned_le(16)
emit_put_unaligned_le(32)
emit_put_unaligned_le(64)
/*
* return -1,0,+1 based on the memcmp comparison of the minimum of their
* two lengths. If their min shared bytes are equal but the lengths