mirror of
https://github.com/versity/scoutfs.git
synced 2026-06-09 21:22:36 +00:00
Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 5bea29a168 | |||
| 7a999f2657 |
+1
-15
@@ -2,15 +2,10 @@ Versity ScoutFS Release Notes
|
||||
=============================
|
||||
|
||||
---
|
||||
v1.2-rc
|
||||
v1.x
|
||||
\
|
||||
*TBD*
|
||||
|
||||
---
|
||||
v1.1
|
||||
\
|
||||
*Feb 4, 2022*
|
||||
|
||||
|
||||
* **Add scoutfs(1) change-quorum-config command**
|
||||
\
|
||||
@@ -19,15 +14,6 @@ v1.1
|
||||
unmounted. This can be used to change the mounts that will
|
||||
participate in quorum and the IP addresses they use.
|
||||
|
||||
* **Fix Rare Risk of Item Cache Corruption**
|
||||
\
|
||||
Code review found a rare potential source of item cache corruption.
|
||||
If this happened it would look as though deleted parts of the filesystem
|
||||
returned, but only at the time they were deleted. Old deleted items are
|
||||
not affected. This problem only affected the item cache, never
|
||||
persistent storage. Unmounting and remounting would drop the bad item
|
||||
cache and resync it with the correct persistent data.
|
||||
|
||||
---
|
||||
v1.0
|
||||
\
|
||||
|
||||
@@ -13,6 +13,7 @@ scoutfs-y += \
|
||||
block.o \
|
||||
btree.o \
|
||||
client.o \
|
||||
cwskip.o \
|
||||
counters.o \
|
||||
data.o \
|
||||
dir.o \
|
||||
|
||||
+20
-19
@@ -1875,12 +1875,11 @@ out:
|
||||
* set in btree items. They're only used for fs items written through
|
||||
* the item cache and forest of log btrees.
|
||||
*/
|
||||
int scoutfs_btree_insert_list(struct super_block *sb,
|
||||
struct scoutfs_alloc *alloc,
|
||||
struct scoutfs_block_writer *wri,
|
||||
struct scoutfs_btree_root *root,
|
||||
struct scoutfs_btree_item_list *lst)
|
||||
int scoutfs_btree_insert_list(struct super_block *sb, struct scoutfs_alloc *alloc,
|
||||
struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root,
|
||||
scoutfs_btree_item_iter_cb iter_cb, void *pos, void *arg)
|
||||
{
|
||||
struct scoutfs_btree_item_desc desc;
|
||||
struct scoutfs_btree_item *item;
|
||||
struct btree_walk_key_range kr;
|
||||
struct scoutfs_btree_block *bt;
|
||||
@@ -1889,44 +1888,46 @@ int scoutfs_btree_insert_list(struct super_block *sb,
|
||||
int cmp;
|
||||
int ret = 0;
|
||||
|
||||
while (lst) {
|
||||
pos = iter_cb(sb, &desc, pos, arg);
|
||||
|
||||
while (pos) {
|
||||
ret = btree_walk(sb, alloc, wri, root, BTW_DIRTY | BTW_INSERT,
|
||||
&lst->key, lst->val_len, &bl, &kr, NULL);
|
||||
desc.key, desc.val_len, &bl, &kr, NULL);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
bt = bl->data;
|
||||
|
||||
do {
|
||||
item = leaf_item_hash_search(sb, bt, &lst->key);
|
||||
item = leaf_item_hash_search(sb, bt, desc.key);
|
||||
if (item) {
|
||||
/* try to merge delta values, _NULL not deleted; merge will */
|
||||
ret = scoutfs_forest_combine_deltas(&lst->key,
|
||||
ret = scoutfs_forest_combine_deltas(desc.key,
|
||||
item_val(bt, item),
|
||||
item_val_len(item),
|
||||
lst->val, lst->val_len);
|
||||
desc.val, desc.val_len);
|
||||
if (ret < 0) {
|
||||
scoutfs_block_put(sb, bl);
|
||||
goto out;
|
||||
}
|
||||
|
||||
item->seq = cpu_to_le64(lst->seq);
|
||||
item->flags = lst->flags;
|
||||
item->seq = cpu_to_le64(desc.seq);
|
||||
item->flags = desc.flags;
|
||||
|
||||
if (ret == 0)
|
||||
update_item_value(bt, item, lst->val, lst->val_len);
|
||||
update_item_value(bt, item, desc.val, desc.val_len);
|
||||
else
|
||||
ret = 0;
|
||||
} else {
|
||||
scoutfs_avl_search(&bt->item_root,
|
||||
cmp_key_item, &lst->key,
|
||||
cmp_key_item, desc.key,
|
||||
&cmp, &par, NULL, NULL);
|
||||
create_item(bt, &lst->key, lst->seq, lst->flags, lst->val,
|
||||
lst->val_len, par, cmp);
|
||||
create_item(bt, desc.key, desc.seq, desc.flags, desc.val,
|
||||
desc.val_len, par, cmp);
|
||||
}
|
||||
|
||||
lst = lst->next;
|
||||
} while (lst && scoutfs_key_compare(&lst->key, &kr.end) <= 0 &&
|
||||
mid_free_item_room(bt, lst->val_len));
|
||||
pos = iter_cb(sb, &desc, pos, arg);
|
||||
} while (pos && scoutfs_key_compare(desc.key, &kr.end) <= 0 &&
|
||||
mid_free_item_room(bt, desc.val_len));
|
||||
|
||||
scoutfs_block_put(sb, bl);
|
||||
}
|
||||
|
||||
+17
-6
@@ -18,11 +18,24 @@ struct scoutfs_btree_item_ref {
|
||||
#define SCOUTFS_BTREE_ITEM_REF(name) \
|
||||
struct scoutfs_btree_item_ref name = {NULL,}
|
||||
|
||||
/* caller gives an item to the callback */
|
||||
/* btree gives an item to caller */
|
||||
typedef int (*scoutfs_btree_item_cb)(struct super_block *sb,
|
||||
struct scoutfs_key *key, u64 seq, u8 flags,
|
||||
void *val, int val_len, void *arg);
|
||||
|
||||
/*
 * Describes one item handed from an item iteration callback to the
 * btree.  scoutfs_btree_insert_list() passes a desc to its iter_cb and
 * then reads these fields to insert or update the item.
 */
struct scoutfs_btree_item_desc {
|
||||
/* key of the item, used for the btree walk and leaf search */
struct scoutfs_key *key;
|
||||
/* item value bytes, val_len bytes long */
void *val;
|
||||
/* stored in the btree item's seq as little-endian */
u64 seq;
|
||||
/* copied to the btree item's flags */
u8 flags;
|
||||
/* length of val in bytes */
unsigned val_len;
|
||||
};
|
||||
|
||||
/* btree iterates through items from caller */
|
||||
typedef void *(*scoutfs_btree_item_iter_cb)(struct super_block *sb,
|
||||
struct scoutfs_btree_item_desc *desc,
|
||||
void *pos, void *arg);
|
||||
|
||||
/* simple singly-linked list of items */
|
||||
struct scoutfs_btree_item_list {
|
||||
struct scoutfs_btree_item_list *next;
|
||||
@@ -78,11 +91,9 @@ int scoutfs_btree_read_items(struct super_block *sb,
|
||||
struct scoutfs_key *start,
|
||||
struct scoutfs_key *end,
|
||||
scoutfs_btree_item_cb cb, void *arg);
|
||||
int scoutfs_btree_insert_list(struct super_block *sb,
|
||||
struct scoutfs_alloc *alloc,
|
||||
struct scoutfs_block_writer *wri,
|
||||
struct scoutfs_btree_root *root,
|
||||
struct scoutfs_btree_item_list *lst);
|
||||
int scoutfs_btree_insert_list(struct super_block *sb, struct scoutfs_alloc *alloc,
|
||||
struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root,
|
||||
scoutfs_btree_item_iter_cb iter_cb, void *pos, void *arg);
|
||||
|
||||
int scoutfs_btree_parent_range(struct super_block *sb,
|
||||
struct scoutfs_btree_root *root,
|
||||
|
||||
+11
-20
@@ -90,36 +90,27 @@
|
||||
EXPAND_COUNTER(forest_read_items) \
|
||||
EXPAND_COUNTER(forest_roots_next_hint) \
|
||||
EXPAND_COUNTER(forest_set_bloom_bits) \
|
||||
EXPAND_COUNTER(item_alloc_bytes) \
|
||||
EXPAND_COUNTER(item_clear_dirty) \
|
||||
EXPAND_COUNTER(item_create) \
|
||||
EXPAND_COUNTER(item_delete) \
|
||||
EXPAND_COUNTER(item_delta) \
|
||||
EXPAND_COUNTER(item_delta_written) \
|
||||
EXPAND_COUNTER(item_dirty) \
|
||||
EXPAND_COUNTER(item_free_bytes) \
|
||||
EXPAND_COUNTER(item_invalidate) \
|
||||
EXPAND_COUNTER(item_invalidate_page) \
|
||||
EXPAND_COUNTER(item_invalidate_item) \
|
||||
EXPAND_COUNTER(item_lookup) \
|
||||
EXPAND_COUNTER(item_mark_dirty) \
|
||||
EXPAND_COUNTER(item_next) \
|
||||
EXPAND_COUNTER(item_page_accessed) \
|
||||
EXPAND_COUNTER(item_page_alloc) \
|
||||
EXPAND_COUNTER(item_page_clear_dirty) \
|
||||
EXPAND_COUNTER(item_page_compact) \
|
||||
EXPAND_COUNTER(item_page_free) \
|
||||
EXPAND_COUNTER(item_page_lru_add) \
|
||||
EXPAND_COUNTER(item_page_lru_remove) \
|
||||
EXPAND_COUNTER(item_page_mark_dirty) \
|
||||
EXPAND_COUNTER(item_page_rbtree_walk) \
|
||||
EXPAND_COUNTER(item_page_split) \
|
||||
EXPAND_COUNTER(item_pcpu_add_replaced) \
|
||||
EXPAND_COUNTER(item_pcpu_page_hit) \
|
||||
EXPAND_COUNTER(item_pcpu_page_miss) \
|
||||
EXPAND_COUNTER(item_pcpu_page_miss_keys) \
|
||||
EXPAND_COUNTER(item_read_pages_split) \
|
||||
EXPAND_COUNTER(item_shrink_page) \
|
||||
EXPAND_COUNTER(item_shrink_page_dirty) \
|
||||
EXPAND_COUNTER(item_shrink_page_reader) \
|
||||
EXPAND_COUNTER(item_shrink_page_trylock) \
|
||||
EXPAND_COUNTER(item_shrink) \
|
||||
EXPAND_COUNTER(item_shrink_all) \
|
||||
EXPAND_COUNTER(item_shrink_exhausted) \
|
||||
EXPAND_COUNTER(item_shrink_read_search) \
|
||||
EXPAND_COUNTER(item_shrink_removed) \
|
||||
EXPAND_COUNTER(item_shrink_searched) \
|
||||
EXPAND_COUNTER(item_shrink_skipped) \
|
||||
EXPAND_COUNTER(item_shrink_write_search) \
|
||||
EXPAND_COUNTER(item_update) \
|
||||
EXPAND_COUNTER(item_write_dirty) \
|
||||
EXPAND_COUNTER(lock_alloc) \
|
||||
|
||||
@@ -0,0 +1,584 @@
|
||||
/*
|
||||
* Copyright (C) 2021 Versity Software, Inc. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License v2 as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*/
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/rcupdate.h>
|
||||
#include <linux/random.h>
|
||||
|
||||
#include "cwskip.h"
|
||||
|
||||
/*
|
||||
* This skip list is built to allow concurrent modification and limit
|
||||
* contention to the region of the list around the modification. All
|
||||
* node references are protected by RCU. Each node has a write_seq
|
||||
* that works like a seqlock, the big differences are that we nest them
|
||||
* and use trylock to acquire them.
|
||||
*
|
||||
* Readers sample the write_seqs of nodes containing links as they
|
||||
* traverse them, verifying that the node hasn't been modified before
|
||||
* traversing to the node referenced by the link.
|
||||
*
|
||||
* Writers remember the seqs of all the nodes they traversed to end up
|
||||
* at their final node. They try to acquire the lock of all the nodes
|
||||
* needed to modify the list at a given height. Their trylocks will
|
||||
* fail if any of the nodes have changed since their traversal.
|
||||
*
|
||||
* The interface is built around references to adjacent pairs of nodes
|
||||
* and their sequence numbers. This lets readers and writers traverse
|
||||
* through their local region of the list until they hit contention and
|
||||
* must start over with a full search.
|
||||
*
|
||||
* The caller is responsible for allocating and freeing nodes. The
|
||||
* interface is built around caller's objects which each have embedded
|
||||
* nodes.
|
||||
*/
|
||||
|
||||
/*
|
||||
* node_off is the positive offset of the cwskip node within the
|
||||
* container structs stored in the list. The node_off is subtracted
|
||||
* from node pointers to give the caller a pointer to their stored
|
||||
* container struct.
|
||||
*/
|
||||
/*
 * Initialize an empty list.  cmp_fn compares a search key with a stored
 * container and node_off is the offset of the embedded node within the
 * caller's container struct.
 */
void scoutfs_cwskip_init_root(struct scoutfs_cwskip_root *root, scoutfs_cwskip_cmp_t cmp_fn,
			      unsigned long node_off)
{
	/*
	 * Clear the whole root, including the storage for the full-height
	 * root node.  sizeof(&root) would only clear a pointer's worth of
	 * bytes and leave the root links and write_seq uninitialized.
	 */
	memset(root, 0, sizeof(*root));
	root->cmp_fn = cmp_fn;
	root->node_off = node_off;
	/*
	 * The static root node can be the previous node at every level,
	 * writer_trylock() verifies that each prev is tall enough for the
	 * level it is recorded at.
	 */
	root->node.height = SCOUTFS_CWSKIP_MAX_HEIGHT;
}
|
||||
|
||||
/* This is completely racy and should be used accordingly. */
|
||||
bool scoutfs_cwskip_empty(struct scoutfs_cwskip_root *root)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < SCOUTFS_CWSKIP_MAX_HEIGHT; i++) {
|
||||
if (root->node.links[i] != NULL)
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return a random height between 1 and max height, inclusive. Using
|
||||
* ffs means that each greater height relies on all lower height bits
|
||||
* being clear and we get the height distribution we want: 1 = 1/2,
|
||||
* 2 = 1/4, 3 = 1/8, etc.
|
||||
*/
|
||||
int scoutfs_cwskip_rand_height(void)
|
||||
{
|
||||
return ffs(prandom_u32() | (1 << (SCOUTFS_CWSKIP_MAX_HEIGHT - 1)));
|
||||
}
|
||||
|
||||
static void *node_container(struct scoutfs_cwskip_root *root, struct scoutfs_cwskip_node *node)
|
||||
{
|
||||
return node ? (void *)((unsigned long)node - root->node_off) : NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Set the caller's containers for the given nodes. There isn't a
|
||||
* previous container when the previous node is the root's static
|
||||
* full-height node.
|
||||
*/
|
||||
static void set_containers(struct scoutfs_cwskip_root *root, struct scoutfs_cwskip_node *prev,
|
||||
struct scoutfs_cwskip_node *node, void **prev_cont, void **node_cont)
|
||||
{
|
||||
if (prev_cont)
|
||||
*prev_cont = (prev != &root->node) ? node_container(root, prev) : NULL;
|
||||
if (node_cont)
|
||||
*node_cont = node_container(root, node);
|
||||
}
|
||||
|
||||
static struct scoutfs_cwskip_node *node_read_begin(struct scoutfs_cwskip_node *node,
|
||||
unsigned int *seq)
|
||||
{
|
||||
if (node) {
|
||||
*seq = READ_ONCE(node->write_seq) & ~1U;
|
||||
smp_rmb();
|
||||
} else {
|
||||
*seq = 1; /* caller shouldn't use if we return null, being careful */
|
||||
}
|
||||
|
||||
return node;
|
||||
}
|
||||
|
||||
static bool node_read_retry(struct scoutfs_cwskip_node *node, unsigned int seq)
|
||||
{
|
||||
if (node) {
|
||||
smp_rmb();
|
||||
return READ_ONCE(node->write_seq) != seq;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* write_seq is only an int to reduce the size of nodes and full-height
|
||||
* seq arrays, it could be a long if archs have trouble with int
|
||||
* cmpxchg.
|
||||
*/
|
||||
static bool __node_trylock(struct scoutfs_cwskip_node *node, unsigned int seq)
|
||||
{
|
||||
if (seq & 1)
|
||||
return false;
|
||||
|
||||
return cmpxchg(&node->write_seq, seq, seq + 1) == seq;
|
||||
}
|
||||
|
||||
static bool node_trylock(struct scoutfs_cwskip_node *node, unsigned int seq)
|
||||
{
|
||||
bool locked = __node_trylock(node, seq);
|
||||
if (locked)
|
||||
smp_wmb();
|
||||
return locked;
|
||||
}
|
||||
|
||||
static void __node_unlock(struct scoutfs_cwskip_node *node)
|
||||
{
|
||||
node->write_seq++;
|
||||
}
|
||||
|
||||
static void node_unlock(struct scoutfs_cwskip_node *node)
|
||||
{
|
||||
__node_unlock(node);
|
||||
smp_wmb();
|
||||
}
|
||||
|
||||
/* return -1/1 to go left/right, never 0 */
|
||||
static int random_cmp(void *K, void *C)
{
	/* a single random bit picks the direction */
	return (prandom_u32() & 2) ? 1 : -1;
}
|
||||
|
||||
static void cwskip_search(struct scoutfs_cwskip_root *root, void *key, int *node_cmp,
|
||||
struct scoutfs_cwskip_reader *rd, struct scoutfs_cwskip_writer *wr,
|
||||
unsigned int *prev_seqs)
|
||||
{
|
||||
struct scoutfs_cwskip_node *prev;
|
||||
struct scoutfs_cwskip_node *node;
|
||||
scoutfs_cwskip_cmp_t cmp_fn;
|
||||
unsigned int prev_seq;
|
||||
unsigned int node_seq;
|
||||
int level;
|
||||
int cmp;
|
||||
|
||||
if (key == NULL)
|
||||
cmp_fn = random_cmp;
|
||||
|
||||
restart:
|
||||
prev = node_read_begin(&root->node, &prev_seq);
|
||||
node = NULL;
|
||||
node_seq = 1;
|
||||
cmp = -1;
|
||||
|
||||
level = SCOUTFS_CWSKIP_MAX_HEIGHT - 1;
|
||||
while (prev && level >= 0) {
|
||||
node = node_read_begin(prev->links[level], &node_seq);
|
||||
if (!node) {
|
||||
cmp = -1;
|
||||
level--;
|
||||
continue;
|
||||
}
|
||||
|
||||
cmp = cmp_fn(key, node_container(root, node));
|
||||
if (cmp > 0) {
|
||||
if (node_read_retry(prev, prev_seq))
|
||||
goto restart;
|
||||
prev = node;
|
||||
prev_seq = node_seq;
|
||||
node = NULL;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (wr) {
|
||||
wr->prevs[level] = prev;
|
||||
prev_seqs[level] = prev_seq;
|
||||
}
|
||||
|
||||
level--;
|
||||
}
|
||||
|
||||
rd->prev = prev;
|
||||
rd->prev_seq = prev_seq;
|
||||
rd->node = node;
|
||||
rd->node_seq = node_seq;
|
||||
*node_cmp = cmp;
|
||||
}
|
||||
|
||||
static void init_reader(struct scoutfs_cwskip_reader *rd, struct scoutfs_cwskip_root *root)
|
||||
{
|
||||
memset(rd, 0, sizeof(struct scoutfs_cwskip_reader));
|
||||
rd->root = root;
|
||||
}
|
||||
|
||||
/*
|
||||
* Find and returns nodes that surround the search key.
|
||||
*
|
||||
* Either prev or node can be null if there are no nodes before or after
|
||||
* the search key. *node_cmp is set to the final comparison of the key
|
||||
* and the returned node's container key, it will be 0 if an exact match
|
||||
* is found.
|
||||
*
|
||||
* This starts an RCU read critical section and is fully concurrent with
|
||||
* both other readers and writers. The nodes won't be freed until
|
||||
* after the section so it's always safe to reference them but their
|
||||
* contents might be nonsense if they're modified during the read.
|
||||
* Nothing learned from the list during the read section should have an
|
||||
* effect until after _read_valid has said it was OK.
|
||||
*
|
||||
* _read_valid can be called after referencing the nodes to see if they
|
||||
* were stable during the read. _read_next can be used to iterate
|
||||
* forward through the list without repeating the search. The caller
|
||||
* must always call a matching _read_end once they're done.
|
||||
*/
|
||||
void scoutfs_cwskip_read_begin(struct scoutfs_cwskip_root *root, void *key, void **prev_cont,
|
||||
void **node_cont, int *node_cmp, struct scoutfs_cwskip_reader *rd)
|
||||
__acquires(RCU) /* :/ */
|
||||
{
|
||||
init_reader(rd, root);
|
||||
|
||||
rcu_read_lock();
|
||||
cwskip_search(root, key, node_cmp, rd, NULL, NULL);
|
||||
set_containers(root, rd->prev, rd->node, prev_cont, node_cont);
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns true of the nodes referenced by the reader haven't been
|
||||
* modified and any references of them were consistent. Thsi does not
|
||||
* end the reader critical section and can be called multiple times.
|
||||
*/
|
||||
bool scoutfs_cwskip_read_valid(struct scoutfs_cwskip_reader *rd)
|
||||
{
|
||||
return !(node_read_retry(rd->prev, rd->prev_seq) &&
|
||||
node_read_retry(rd->node, rd->node_seq));
|
||||
}
|
||||
|
||||
/*
|
||||
* Advance from the current prev/node to the next pair of nodes in the
|
||||
* list. prev_cont is set to what node_cont was before the call.
|
||||
* node_cont is set to the next node after the current node_cont.
|
||||
*
|
||||
* This returns true if it found a next node and that its load of the
|
||||
* next pointer from node was valid and stable. Returning false means
|
||||
* that the caller should retry. There could be more items in the list.
|
||||
*/
|
||||
bool scoutfs_cwskip_read_next(struct scoutfs_cwskip_reader *rd, void **prev_cont, void **node_cont)
|
||||
{
|
||||
struct scoutfs_cwskip_node *next;
|
||||
unsigned int next_seq;
|
||||
bool valid_next;
|
||||
|
||||
next = rd->node ? node_read_begin(rd->node->links[0], &next_seq) : NULL;
|
||||
valid_next = scoutfs_cwskip_read_valid(rd) && next;
|
||||
if (valid_next) {
|
||||
rd->prev = rd->node;
|
||||
rd->prev_seq = rd->node_seq;
|
||||
rd->node = next;
|
||||
rd->node_seq = next_seq;
|
||||
|
||||
set_containers(rd->root, rd->prev, rd->node, prev_cont, node_cont);
|
||||
}
|
||||
|
||||
return valid_next;
|
||||
}
|
||||
|
||||
/*
|
||||
* End the critical section started with _read_begin.
|
||||
*/
|
||||
void scoutfs_cwskip_read_end(struct scoutfs_cwskip_reader *rd)
|
||||
__releases(RCU) /* :/ */
|
||||
{
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
/*
|
||||
* Higher locks are more likely to cause contention so we unlock them
|
||||
* first.
|
||||
*/
|
||||
static void writer_unlock(struct scoutfs_cwskip_writer *wr)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = wr->locked_height - 1; i >= 0; i--) {
|
||||
if (i == 0 || (wr->prevs[i - 1] != wr->prevs[i]))
|
||||
__node_unlock(wr->prevs[i]);
|
||||
}
|
||||
|
||||
if (wr->node_locked)
|
||||
__node_unlock(wr->node);
|
||||
|
||||
smp_wmb();
|
||||
|
||||
wr->locked_height = 0;
|
||||
wr->node_locked = false;
|
||||
}
|
||||
|
||||
/*
|
||||
* A search traversal has saved all the previous nodes at each level.
|
||||
*
|
||||
* We try to acquire the write_seq locks for all the prevs up to height
|
||||
* from the seqs that we read during the search. The search was
|
||||
* protected by read sections so the prevs represent a consistent
|
||||
* version of the list at some point in the past. If nodes have been
|
||||
* locked since we read them we won't be able to acquire the locks.
|
||||
* Nodes aren't re-inserted after removal so we shouldn't see nodes in
|
||||
* multiple places (which would deadlock).
|
||||
*
|
||||
* The same node can be in multiple prev slots. We're careful to only
|
||||
* try locking the lowest duplicate slot.
|
||||
*
|
||||
* We lock from the highest level down. This only matters when there's
|
||||
* contention. The higher nodes are more likely to see contention so
|
||||
* we want trylock to fail early to avoid useless locking churn on lower
|
||||
* nodes.
|
||||
*/
|
||||
static bool writer_trylock(struct scoutfs_cwskip_writer *wr, unsigned int *prev_seqs, int height)
|
||||
{
|
||||
int i;
|
||||
|
||||
if (WARN_ON_ONCE(wr->locked_height != 0) ||
|
||||
WARN_ON_ONCE(height < 1 || height > ARRAY_SIZE(wr->prevs)))
|
||||
return false;
|
||||
|
||||
for (i = height - 1; i >= 0; i--) {
|
||||
if ((i == 0 || wr->prevs[i - 1] != wr->prevs[i]) &&
|
||||
!__node_trylock(wr->prevs[i], prev_seqs[i]))
|
||||
break;
|
||||
wr->locked_height++;
|
||||
}
|
||||
|
||||
if (i < height) {
|
||||
writer_unlock(wr);
|
||||
return false;
|
||||
}
|
||||
|
||||
/* paranoid debugging verification */
|
||||
for (i = 0; i < wr->locked_height; i++) {
|
||||
BUG_ON(wr->prevs[i]->height <= i);
|
||||
BUG_ON(wr->node && i < wr->node->height && wr->prevs[i]->links[i] != wr->node);
|
||||
}
|
||||
|
||||
smp_mb();
|
||||
return true;
|
||||
}
|
||||
|
||||
static void init_writer(struct scoutfs_cwskip_writer *wr, struct scoutfs_cwskip_root *root)
|
||||
{
|
||||
memset(wr, 0, sizeof(struct scoutfs_cwskip_writer));
|
||||
wr->root = root;
|
||||
}
|
||||
|
||||
/*
|
||||
* Search for and return references to the two nodes that surround the
|
||||
* search key, with the nodes locked.
|
||||
*
|
||||
* Either node can be null if there are no nodes before or after the
|
||||
* search key. We still hold a lock on the static root node if the
|
||||
* search key falls before the first node in the list.
|
||||
*
|
||||
* If lock_height is 0 then the caller is saying that they just want to
|
||||
* lock the surrounding nodes and not modify their position in the list.
|
||||
* We only lock those two nodes. Any greater lock_height represents a
|
||||
* height that we need to lock so the caller can insert an allocated
|
||||
* node with that height.
|
||||
*
|
||||
* The caller can use the writer context to iterate through locked nodes
|
||||
* via the lowest level list that contains all nodes. If they hit a
|
||||
* node that's higher than the locked height in the writer then they
|
||||
* have to unlock and restart because we don't have the previous node
|
||||
* for that height. We set a min level that we lock to reduce the
|
||||
* possibility of hitting higher nodes and retrying.
|
||||
*/
|
||||
#define MIN_LOCKED_HEIGHT 4
|
||||
void scoutfs_cwskip_write_begin(struct scoutfs_cwskip_root *root, void *key, int lock_height,
|
||||
void **prev_cont, void **node_cont, int *node_cmp,
|
||||
struct scoutfs_cwskip_writer *wr)
|
||||
__acquires(RCU) /* :/ */
|
||||
{
|
||||
unsigned int prev_seqs[SCOUTFS_CWSKIP_MAX_HEIGHT];
|
||||
struct scoutfs_cwskip_reader rd;
|
||||
int node_height;
|
||||
int use_height;
|
||||
bool locked;
|
||||
|
||||
BUG_ON(WARN_ON_ONCE(lock_height < 0 || lock_height > SCOUTFS_CWSKIP_MAX_HEIGHT));
|
||||
|
||||
do {
|
||||
init_reader(&rd, root);
|
||||
init_writer(wr, root);
|
||||
|
||||
rcu_read_lock();
|
||||
cwskip_search(root, key, node_cmp, &rd, wr, NULL);
|
||||
|
||||
wr->node = rd.node;
|
||||
if (wr->node) {
|
||||
/* _trylock of prevs will issue barrier on success */
|
||||
if (!__node_trylock(wr->node, rd.node_seq)) {
|
||||
locked = false;
|
||||
continue;
|
||||
}
|
||||
wr->node_locked = true;
|
||||
node_height = wr->node->height;
|
||||
} else {
|
||||
node_height = 0;
|
||||
}
|
||||
|
||||
if (lock_height > 0)
|
||||
use_height = max3(MIN_LOCKED_HEIGHT, node_height, lock_height);
|
||||
else
|
||||
use_height = 1;
|
||||
|
||||
locked = writer_trylock(wr, prev_seqs, use_height);
|
||||
if (!locked)
|
||||
rcu_read_unlock();
|
||||
} while (!locked);
|
||||
|
||||
set_containers(root, wr->prevs[0], wr->node, prev_cont, node_cont);
|
||||
}
|
||||
|
||||
/*
|
||||
* Insert a new node between the writer's two locked nodes. The
|
||||
* inserting node is locked and replaces the existing node in the writer
|
||||
* which is unlocked.
|
||||
*
|
||||
* The next node may not exist. The previous nodes will always exist
|
||||
* though they may be the static root node.
|
||||
*
|
||||
* The inserting node is visible to readers the moment we store the
|
||||
* first link to it in previous nodes. We first lock it with a write
|
||||
* barrier so that any readers will retry if they visit it before all
|
||||
* its links are updated and it's unlocked.
|
||||
*
|
||||
* We don't unlock prevs that are higher than the inserting node. This
|
||||
* lets the caller continue iterating through nodes that are higher than
|
||||
* insertion but still under the locked height.
|
||||
*/
|
||||
void scoutfs_cwskip_write_insert(struct scoutfs_cwskip_writer *wr,
|
||||
struct scoutfs_cwskip_node *ins)
|
||||
{
|
||||
struct scoutfs_cwskip_node *node = wr->node;
|
||||
int i;
|
||||
|
||||
BUG_ON(ins->height > wr->locked_height);
|
||||
node_trylock(ins, ins->write_seq);
|
||||
|
||||
for (i = 0; i < ins->height; i++) {
|
||||
ins->links[i] = wr->prevs[i]->links[i];
|
||||
wr->prevs[i]->links[i] = ins;
|
||||
}
|
||||
|
||||
if (node)
|
||||
node_unlock(node);
|
||||
wr->node = ins;
|
||||
}
|
||||
|
||||
/*
|
||||
* Remove the node in the writer from the list.  The writer's node
|
||||
* pointer is not advanced because we don't want this to be able to fail
|
||||
* if trylock on the next node fails. The caller can call _write_next
|
||||
* on this writer and it will try and iterate from prevs[0].
|
||||
*
|
||||
* The caller's removal argument must be the node pointer in the writer.
|
||||
* This is redundant but meant to communicate to the caller that they're
|
||||
* responsible for the node after removing it (presumably queueing it
|
||||
* for freeing before _write_end leaves rcu).
|
||||
*
|
||||
* Readers can be traversing our node as we modify its pointers and can
|
||||
* read a temporarily inconsistent state. We have the node locked so
|
||||
* the reader will immediately retry once they check the seqs after
|
||||
* hitting our node that's being removed.
|
||||
*/
|
||||
void scoutfs_cwskip_write_remove(struct scoutfs_cwskip_writer *wr,
|
||||
struct scoutfs_cwskip_node *node)
|
||||
{
|
||||
int i;
|
||||
|
||||
BUG_ON(node != wr->node);
|
||||
BUG_ON(node->height > wr->locked_height);
|
||||
|
||||
for (i = 0; i < node->height; i++) {
|
||||
wr->prevs[i]->links[i] = node->links[i];
|
||||
node->links[i] = NULL;
|
||||
}
|
||||
|
||||
node_unlock(node);
|
||||
wr->node = NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Advance through the list by setting prevs to node and node to the
|
||||
* next node in the list after locking it. Returns true only if there
|
||||
* was a next node that we were able to lock. Returning false can mean
|
||||
* that we weren't able to lock the next node and the caller should
|
||||
* retry a full search.
|
||||
*
|
||||
* This may be called after _write_remove clears node so we try to
|
||||
* iterate from prev if there is no node.
|
||||
*
|
||||
* If lock_height is greater than zero then the caller needs at least
|
||||
* that lock_height to insert a node of that height. If locked_height
|
||||
* doesn't cover it then we return false so the caller can retry
|
||||
* _write_begin with the needed height.
|
||||
*
|
||||
* Like insertion, we don't unlock prevs higher than the height of the
|
||||
* next node. They're not strictly needed to modify the next node but
|
||||
* we want to keep them locked so the caller can continue to iterate
|
||||
* through nodes up to the locked height.
|
||||
*/
|
||||
bool scoutfs_cwskip_write_next(struct scoutfs_cwskip_writer *wr, int lock_height,
|
||||
void **prev_cont, void **node_cont)
|
||||
{
|
||||
struct scoutfs_cwskip_node *next;
|
||||
int i;
|
||||
|
||||
if (WARN_ON_ONCE(lock_height < 0 || lock_height > SCOUTFS_CWSKIP_MAX_HEIGHT))
|
||||
return false;
|
||||
|
||||
if (wr->node)
|
||||
next = rcu_dereference(wr->node->links[0]);
|
||||
else
|
||||
next = rcu_dereference(wr->prevs[0]->links[0]);
|
||||
|
||||
if (!next ||
|
||||
(lock_height > wr->locked_height) ||
|
||||
(lock_height > 0 && next->height > wr->locked_height) ||
|
||||
!__node_trylock(next, next->write_seq))
|
||||
return false;
|
||||
|
||||
if (!wr->node) {
|
||||
/* set next as missing node */
|
||||
wr->node = next;
|
||||
wr->node_locked = true;
|
||||
|
||||
} else {
|
||||
/* existing node becomes prevs for its height */
|
||||
__node_unlock(wr->prevs[0]);
|
||||
for (i = 0; i < wr->node->height; i++)
|
||||
wr->prevs[0] = wr->node;
|
||||
wr->node = next;
|
||||
}
|
||||
|
||||
smp_wmb(); /* next locked and prev unlocked */
|
||||
|
||||
set_containers(wr->root, wr->prevs[0], wr->node, prev_cont, node_cont);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void scoutfs_cwskip_write_end(struct scoutfs_cwskip_writer *wr)
|
||||
__releases(RCU) /* :/ */
|
||||
{
|
||||
writer_unlock(wr);
|
||||
rcu_read_unlock();
|
||||
}
|
||||
@@ -0,0 +1,68 @@
|
||||
#ifndef _SCOUTFS_CWSKIP_H_
|
||||
#define _SCOUTFS_CWSKIP_H_
|
||||
|
||||
/* A billion seems like a lot. */
|
||||
#define SCOUTFS_CWSKIP_MAX_HEIGHT 30
|
||||
|
||||
/*
 * A list node that callers embed in their container structs.
 */
struct scoutfs_cwskip_node {
|
||||
/* number of levels this node is linked at, links[0] to links[height - 1] */
int height;
|
||||
/* seqlock-like counter: sampled by readers, moved from even to odd by trylock */
unsigned int write_seq;
|
||||
/* next node at each level, level 0 is the list that contains all nodes */
struct scoutfs_cwskip_node *links[];
|
||||
};
|
||||
|
||||
#define SCOUTFS_CWSKIP_FULL_NODE_BYTES \
|
||||
offsetof(struct scoutfs_cwskip_node, links[SCOUTFS_CWSKIP_MAX_HEIGHT + 1])
|
||||
|
||||
typedef int (*scoutfs_cwskip_cmp_t)(void *K, void *C);
|
||||
|
||||
/*
 * The head of a list.  Set up with scoutfs_cwskip_init_root().
 */
struct scoutfs_cwskip_root {
|
||||
/* comparison callback stored by scoutfs_cwskip_init_root() */
scoutfs_cwskip_cmp_t cmp_fn;
|
||||
/* offset of the embedded node within the caller's container struct */
unsigned long node_off;
|
||||
/* the byte array gives the root node's flexible links[] array its storage */
union {
|
||||
struct scoutfs_cwskip_node node;
|
||||
__u8 __full_root_node[SCOUTFS_CWSKIP_FULL_NODE_BYTES];
|
||||
};
|
||||
};
|
||||
|
||||
/*
 * Reader context: an adjacent pair of nodes and the write_seq values
 * sampled from them, checked later by scoutfs_cwskip_read_valid().
 */
struct scoutfs_cwskip_reader {
|
||||
/* list being read */
struct scoutfs_cwskip_root *root;
|
||||
/* node before the position, may be the root's static node */
struct scoutfs_cwskip_node *prev;
|
||||
/* node at the position, null if there is no next node */
struct scoutfs_cwskip_node *node;
|
||||
/* write_seq sampled from prev */
unsigned int prev_seq;
|
||||
/* write_seq sampled from node */
unsigned int node_seq;
|
||||
};
|
||||
|
||||
/*
|
||||
* The full height prevs array makes these pretty enormous :/.
|
||||
*/
|
||||
/*
 * Writer context: the locked node at the position and the locked
 * previous nodes at each level up to locked_height.
 */
struct scoutfs_cwskip_writer {
|
||||
/* list being modified */
struct scoutfs_cwskip_root *root;
|
||||
/* true while the writer holds the lock on node */
bool node_locked;
|
||||
/* prevs[0] through prevs[locked_height - 1] are locked */
int locked_height;
|
||||
/* node at the position, null if there is none */
struct scoutfs_cwskip_node *node;
|
||||
/* previous node at each level, the same node can fill adjacent slots */
struct scoutfs_cwskip_node *prevs[SCOUTFS_CWSKIP_MAX_HEIGHT];
|
||||
};
|
||||
|
||||
void scoutfs_cwskip_init_root(struct scoutfs_cwskip_root *root, scoutfs_cwskip_cmp_t cmp_fn,
|
||||
unsigned long node_off);
|
||||
bool scoutfs_cwskip_empty(struct scoutfs_cwskip_root *root);
|
||||
int scoutfs_cwskip_rand_height(void);
|
||||
|
||||
void scoutfs_cwskip_read_begin(struct scoutfs_cwskip_root *root, void *key, void **prev_cont,
|
||||
void **node_cont, int *node_cmp, struct scoutfs_cwskip_reader *rd);
|
||||
bool scoutfs_cwskip_read_valid(struct scoutfs_cwskip_reader *rd);
|
||||
bool scoutfs_cwskip_read_next(struct scoutfs_cwskip_reader *rd, void **prev_cont, void **node_cont);
|
||||
void scoutfs_cwskip_read_end(struct scoutfs_cwskip_reader *rd);
|
||||
|
||||
void scoutfs_cwskip_write_begin(struct scoutfs_cwskip_root *root, void *key, int lock_height,
|
||||
void **prev_cont, void **node_cont, int *node_cmp,
|
||||
struct scoutfs_cwskip_writer *wr);
|
||||
void scoutfs_cwskip_write_insert(struct scoutfs_cwskip_writer *wr,
|
||||
struct scoutfs_cwskip_node *ins);
|
||||
void scoutfs_cwskip_write_remove(struct scoutfs_cwskip_writer *wr,
|
||||
struct scoutfs_cwskip_node *node);
|
||||
bool scoutfs_cwskip_write_next(struct scoutfs_cwskip_writer *wr, int lock_height,
|
||||
void **prev_cont, void **node_cont);
|
||||
void scoutfs_cwskip_write_end(struct scoutfs_cwskip_writer *wr);
|
||||
|
||||
#endif
|
||||
+1
-1
@@ -511,7 +511,7 @@ out:
|
||||
else if (ino == 0)
|
||||
inode = NULL;
|
||||
else
|
||||
inode = scoutfs_iget(sb, ino, 0, 0);
|
||||
inode = scoutfs_iget(sb, ino, 0);
|
||||
|
||||
/*
|
||||
* We can't splice dir aliases into the dcache. dir entries
|
||||
|
||||
+3
-3
@@ -81,7 +81,7 @@ static struct dentry *scoutfs_fh_to_dentry(struct super_block *sb,
|
||||
trace_scoutfs_fh_to_dentry(sb, fh_type, sfid);
|
||||
|
||||
if (scoutfs_valid_fileid(fh_type))
|
||||
inode = scoutfs_iget(sb, le64_to_cpu(sfid->ino), 0, SCOUTFS_IGF_LINKED);
|
||||
inode = scoutfs_iget(sb, le64_to_cpu(sfid->ino), 0);
|
||||
|
||||
return d_obtain_alias(inode);
|
||||
}
|
||||
@@ -100,7 +100,7 @@ static struct dentry *scoutfs_fh_to_parent(struct super_block *sb,
|
||||
|
||||
if (scoutfs_valid_fileid(fh_type) &&
|
||||
fh_type == FILEID_SCOUTFS_WITH_PARENT)
|
||||
inode = scoutfs_iget(sb, le64_to_cpu(sfid->parent_ino), 0, SCOUTFS_IGF_LINKED);
|
||||
inode = scoutfs_iget(sb, le64_to_cpu(sfid->parent_ino), 0);
|
||||
|
||||
return d_obtain_alias(inode);
|
||||
}
|
||||
@@ -123,7 +123,7 @@ static struct dentry *scoutfs_get_parent(struct dentry *child)
|
||||
scoutfs_dir_free_backref_path(sb, &list);
|
||||
trace_scoutfs_get_parent(sb, inode, ino);
|
||||
|
||||
inode = scoutfs_iget(sb, ino, 0, SCOUTFS_IGF_LINKED);
|
||||
inode = scoutfs_iget(sb, ino, 0);
|
||||
|
||||
return d_obtain_alias(inode);
|
||||
}
|
||||
|
||||
+3
-3
@@ -494,13 +494,13 @@ out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
int scoutfs_forest_insert_list(struct super_block *sb,
|
||||
struct scoutfs_btree_item_list *lst)
|
||||
int scoutfs_forest_insert_list(struct super_block *sb, scoutfs_btree_item_iter_cb cb,
|
||||
void *pos, void *arg)
|
||||
{
|
||||
DECLARE_FOREST_INFO(sb, finf);
|
||||
|
||||
return scoutfs_btree_insert_list(sb, finf->alloc, finf->wri,
|
||||
&finf->our_log.item_root, lst);
|
||||
&finf->our_log.item_root, cb, pos, arg);
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
+2
-2
@@ -29,8 +29,8 @@ void scoutfs_forest_set_max_seq(struct super_block *sb, u64 max_seq);
|
||||
int scoutfs_forest_get_max_seq(struct super_block *sb,
|
||||
struct scoutfs_super_block *super,
|
||||
u64 *seq);
|
||||
int scoutfs_forest_insert_list(struct super_block *sb,
|
||||
struct scoutfs_btree_item_list *lst);
|
||||
int scoutfs_forest_insert_list(struct super_block *sb, scoutfs_btree_item_iter_cb cb,
|
||||
void *pos, void *arg);
|
||||
int scoutfs_forest_srch_add(struct super_block *sb, u64 hash, u64 ino, u64 id);
|
||||
|
||||
void scoutfs_forest_inc_inode_count(struct super_block *sb);
|
||||
|
||||
+25
-36
@@ -276,7 +276,7 @@ static void load_inode(struct inode *inode, struct scoutfs_inode *cinode)
|
||||
set_item_info(si, cinode);
|
||||
}
|
||||
|
||||
void scoutfs_inode_init_key(struct scoutfs_key *key, u64 ino)
|
||||
static void init_inode_key(struct scoutfs_key *key, u64 ino)
|
||||
{
|
||||
*key = (struct scoutfs_key) {
|
||||
.sk_zone = SCOUTFS_FS_ZONE,
|
||||
@@ -296,7 +296,8 @@ void scoutfs_inode_init_key(struct scoutfs_key *key, u64 ino)
|
||||
* fields because they should have already had a locked refreshed inode
|
||||
* to be dereferencing its contents.
|
||||
*/
|
||||
int scoutfs_inode_refresh(struct inode *inode, struct scoutfs_lock *lock)
|
||||
int scoutfs_inode_refresh(struct inode *inode, struct scoutfs_lock *lock,
|
||||
int flags)
|
||||
{
|
||||
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
|
||||
struct super_block *sb = inode->i_sb;
|
||||
@@ -316,7 +317,7 @@ int scoutfs_inode_refresh(struct inode *inode, struct scoutfs_lock *lock)
|
||||
if (atomic64_read(&si->last_refreshed) == refresh_gen)
|
||||
return 0;
|
||||
|
||||
scoutfs_inode_init_key(&key, scoutfs_ino(inode));
|
||||
init_inode_key(&key, scoutfs_ino(inode));
|
||||
|
||||
mutex_lock(&si->item_mutex);
|
||||
if (atomic64_read(&si->last_refreshed) < refresh_gen) {
|
||||
@@ -696,20 +697,21 @@ struct inode *scoutfs_ilookup(struct super_block *sb, u64 ino)
|
||||
return ilookup5(sb, ino, scoutfs_iget_test, &ino);
|
||||
}
|
||||
|
||||
struct inode *scoutfs_iget(struct super_block *sb, u64 ino, int lkf, int igf)
|
||||
struct inode *scoutfs_iget(struct super_block *sb, u64 ino, int lkf)
|
||||
{
|
||||
struct scoutfs_lock *lock = NULL;
|
||||
struct scoutfs_inode_info *si;
|
||||
struct inode *inode = NULL;
|
||||
struct inode *inode;
|
||||
int ret;
|
||||
|
||||
ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_READ, lkf, ino, &lock);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
if (ret)
|
||||
return ERR_PTR(ret);
|
||||
|
||||
inode = iget5_locked(sb, ino, scoutfs_iget_test, scoutfs_iget_set, &ino);
|
||||
inode = iget5_locked(sb, ino, scoutfs_iget_test, scoutfs_iget_set,
|
||||
&ino);
|
||||
if (!inode) {
|
||||
ret = -ENOMEM;
|
||||
inode = ERR_PTR(-ENOMEM);
|
||||
goto out;
|
||||
}
|
||||
|
||||
@@ -719,33 +721,20 @@ struct inode *scoutfs_iget(struct super_block *sb, u64 ino, int lkf, int igf)
|
||||
atomic64_set(&si->last_refreshed, 0);
|
||||
inode->i_version = 0;
|
||||
|
||||
ret = scoutfs_inode_refresh(inode, lock);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
if ((igf & SCOUTFS_IGF_LINKED) && inode->i_nlink == 0) {
|
||||
ret = -ENOENT;
|
||||
goto out;
|
||||
ret = scoutfs_inode_refresh(inode, lock, 0);
|
||||
if (ret == 0)
|
||||
ret = scoutfs_omap_inc(sb, ino);
|
||||
if (ret) {
|
||||
iget_failed(inode);
|
||||
inode = ERR_PTR(ret);
|
||||
} else {
|
||||
set_inode_ops(inode);
|
||||
unlock_new_inode(inode);
|
||||
}
|
||||
|
||||
ret = scoutfs_omap_inc(sb, ino);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
set_inode_ops(inode);
|
||||
unlock_new_inode(inode);
|
||||
}
|
||||
|
||||
ret = 0;
|
||||
out:
|
||||
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ);
|
||||
|
||||
if (ret < 0) {
|
||||
if (inode)
|
||||
iget_failed(inode);
|
||||
inode = ERR_PTR(ret);
|
||||
}
|
||||
|
||||
return inode;
|
||||
}
|
||||
|
||||
@@ -814,7 +803,7 @@ int scoutfs_dirty_inode_item(struct inode *inode, struct scoutfs_lock *lock)
|
||||
|
||||
store_inode(&sinode, inode);
|
||||
|
||||
scoutfs_inode_init_key(&key, scoutfs_ino(inode));
|
||||
init_inode_key(&key, scoutfs_ino(inode));
|
||||
|
||||
ret = scoutfs_item_update(sb, &key, &sinode, sizeof(sinode), lock);
|
||||
if (!ret)
|
||||
@@ -1033,7 +1022,7 @@ void scoutfs_update_inode_item(struct inode *inode, struct scoutfs_lock *lock,
|
||||
ret = update_indices(sb, si, ino, inode->i_mode, &sinode, lock_list);
|
||||
BUG_ON(ret);
|
||||
|
||||
scoutfs_inode_init_key(&key, ino);
|
||||
init_inode_key(&key, ino);
|
||||
|
||||
err = scoutfs_item_update(sb, &key, &sinode, sizeof(sinode), lock);
|
||||
if (err) {
|
||||
@@ -1432,7 +1421,7 @@ struct inode *scoutfs_new_inode(struct super_block *sb, struct inode *dir,
|
||||
set_inode_ops(inode);
|
||||
|
||||
store_inode(&sinode, inode);
|
||||
scoutfs_inode_init_key(&key, scoutfs_ino(inode));
|
||||
init_inode_key(&key, scoutfs_ino(inode));
|
||||
|
||||
ret = scoutfs_omap_inc(sb, ino);
|
||||
if (ret < 0)
|
||||
@@ -1557,7 +1546,7 @@ static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_lo
|
||||
goto out;
|
||||
}
|
||||
|
||||
scoutfs_inode_init_key(&key, ino);
|
||||
init_inode_key(&key, ino);
|
||||
|
||||
ret = scoutfs_item_lookup_exact(sb, &key, &sinode, sizeof(sinode),
|
||||
lock);
|
||||
@@ -1866,7 +1855,7 @@ static void inode_orphan_scan_worker(struct work_struct *work)
|
||||
}
|
||||
|
||||
/* try to cache and evict the unused inode to delete it, can be racing */
|
||||
inode = scoutfs_iget(sb, ino, 0, 0);
|
||||
inode = scoutfs_iget(sb, ino, 0);
|
||||
if (IS_ERR(inode)) {
|
||||
ret = PTR_ERR(inode);
|
||||
if (ret == -ENOENT)
|
||||
|
||||
+3
-4
@@ -80,11 +80,9 @@ int scoutfs_drop_inode(struct inode *inode);
|
||||
void scoutfs_evict_inode(struct inode *inode);
|
||||
void scoutfs_inode_queue_iput(struct inode *inode);
|
||||
|
||||
#define SCOUTFS_IGF_LINKED (1 << 0) /* enoent if nlink == 0 */
|
||||
struct inode *scoutfs_iget(struct super_block *sb, u64 ino, int lkf, int igf);
|
||||
struct inode *scoutfs_iget(struct super_block *sb, u64 ino, int lkf);
|
||||
struct inode *scoutfs_ilookup(struct super_block *sb, u64 ino);
|
||||
|
||||
void scoutfs_inode_init_key(struct scoutfs_key *key, u64 ino);
|
||||
void scoutfs_inode_init_index_key(struct scoutfs_key *key, u8 type, u64 major,
|
||||
u32 minor, u64 ino);
|
||||
int scoutfs_inode_index_start(struct super_block *sb, u64 *seq);
|
||||
@@ -119,7 +117,8 @@ u64 scoutfs_inode_data_version(struct inode *inode);
|
||||
void scoutfs_inode_get_onoff(struct inode *inode, s64 *on, s64 *off);
|
||||
int scoutfs_complete_truncate(struct inode *inode, struct scoutfs_lock *lock);
|
||||
|
||||
int scoutfs_inode_refresh(struct inode *inode, struct scoutfs_lock *lock);
|
||||
int scoutfs_inode_refresh(struct inode *inode, struct scoutfs_lock *lock,
|
||||
int flags);
|
||||
int scoutfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
|
||||
struct kstat *stat);
|
||||
int scoutfs_setattr(struct dentry *dentry, struct iattr *attr);
|
||||
|
||||
@@ -1320,84 +1320,6 @@ out:
|
||||
return ret ?: count;
|
||||
}
|
||||
|
||||
static long scoutfs_ioc_get_allocated_inos(struct file *file, unsigned long arg)
|
||||
{
|
||||
struct super_block *sb = file_inode(file)->i_sb;
|
||||
struct scoutfs_ioctl_get_allocated_inos __user *ugai = (void __user *)arg;
|
||||
struct scoutfs_ioctl_get_allocated_inos gai;
|
||||
struct scoutfs_lock *lock = NULL;
|
||||
struct scoutfs_key key;
|
||||
struct scoutfs_key end;
|
||||
u64 __user *uinos;
|
||||
u64 bytes;
|
||||
u64 ino;
|
||||
int nr;
|
||||
int ret;
|
||||
|
||||
if (!(file->f_mode & FMODE_READ)) {
|
||||
ret = -EBADF;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (!capable(CAP_SYS_ADMIN)) {
|
||||
ret = -EPERM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (copy_from_user(&gai, ugai, sizeof(gai))) {
|
||||
ret = -EFAULT;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if ((gai.inos_ptr & (sizeof(__u64) - 1)) || (gai.inos_bytes < sizeof(__u64))) {
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
scoutfs_inode_init_key(&key, gai.start_ino);
|
||||
scoutfs_inode_init_key(&end, gai.start_ino | SCOUTFS_LOCK_INODE_GROUP_MASK);
|
||||
uinos = (void __user *)gai.inos_ptr;
|
||||
bytes = gai.inos_bytes;
|
||||
nr = 0;
|
||||
|
||||
ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_READ, 0, gai.start_ino, &lock);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
while (bytes >= sizeof(*uinos)) {
|
||||
|
||||
ret = scoutfs_item_next(sb, &key, &end, NULL, 0, lock);
|
||||
if (ret < 0) {
|
||||
if (ret == -ENOENT)
|
||||
ret = 0;
|
||||
break;
|
||||
}
|
||||
|
||||
if (key.sk_zone != SCOUTFS_FS_ZONE) {
|
||||
ret = 0;
|
||||
break;
|
||||
}
|
||||
|
||||
/* all fs items are owned by allocated inodes, and _first is always ino */
|
||||
ino = le64_to_cpu(key._sk_first);
|
||||
if (put_user(ino, uinos)) {
|
||||
ret = -EFAULT;
|
||||
break;
|
||||
}
|
||||
|
||||
uinos++;
|
||||
bytes -= sizeof(*uinos);
|
||||
if (++nr == INT_MAX)
|
||||
break;
|
||||
|
||||
scoutfs_inode_init_key(&key, ino + 1);
|
||||
}
|
||||
|
||||
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ);
|
||||
out:
|
||||
return ret ?: nr;
|
||||
}
|
||||
|
||||
long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
|
||||
{
|
||||
switch (cmd) {
|
||||
@@ -1431,8 +1353,6 @@ long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
|
||||
return scoutfs_ioc_resize_devices(file, arg);
|
||||
case SCOUTFS_IOC_READ_XATTR_TOTALS:
|
||||
return scoutfs_ioc_read_xattr_totals(file, arg);
|
||||
case SCOUTFS_IOC_GET_ALLOCATED_INOS:
|
||||
return scoutfs_ioc_get_allocated_inos(file, arg);
|
||||
}
|
||||
|
||||
return -ENOTTY;
|
||||
|
||||
@@ -520,43 +520,4 @@ struct scoutfs_ioctl_xattr_total {
|
||||
#define SCOUTFS_IOC_READ_XATTR_TOTALS \
|
||||
_IOW(SCOUTFS_IOCTL_MAGIC, 15, struct scoutfs_ioctl_read_xattr_totals)
|
||||
|
||||
/*
|
||||
* This fills the caller's inos array with inode numbers that are in use
|
||||
* after the start ino, within an internal inode group.
|
||||
*
|
||||
* This only makes a promise about the state of the inode numbers within
|
||||
* the first and last numbers returned by one call. At one time, all of
|
||||
* those inodes were still allocated. They could have changed before
|
||||
* the call returned. And any numbers outside of the first and last
|
||||
* (or single) are undefined.
|
||||
*
|
||||
* This doesn't iterate over all allocated inodes, it only probes a
|
||||
* single group that the start inode is within. This interface was
|
||||
* first introduced to support tests that needed to find out about a
|
||||
* specific inode, while having some other similarly niche uses. It is
|
||||
* unsuitable for a consistent iteration over all the inode numbers in
|
||||
* use.
|
||||
*
|
||||
* This test of inode items doesn't serialize with the inode lifetime
|
||||
* mechanism. It only tells you the numbers of inodes that were once
|
||||
* active in the system and haven't yet been fully deleted. The inode
|
||||
* numbers returned could have been in the process of being deleted and
|
||||
* were already unreachable even before the call started.
|
||||
*
|
||||
* @start_ino: the first inode number that could be returned
|
||||
* @inos_ptr: pointer to an aligned array of 64bit inode numbers
|
||||
* @inos_bytes: the number of bytes available in the inos_ptr array
|
||||
*
|
||||
* Returns errors or the count of inode numbers returned, quite possibly
|
||||
* including 0.
|
||||
*/
|
||||
struct scoutfs_ioctl_get_allocated_inos {
|
||||
__u64 start_ino;
|
||||
__u64 inos_ptr;
|
||||
__u64 inos_bytes;
|
||||
};
|
||||
|
||||
#define SCOUTFS_IOC_GET_ALLOCATED_INOS \
|
||||
_IOW(SCOUTFS_IOCTL_MAGIC, 16, struct scoutfs_ioctl_get_allocated_inos)
|
||||
|
||||
#endif
|
||||
|
||||
+1101
-1895
File diff suppressed because it is too large
Load Diff
+1
-1
@@ -26,7 +26,7 @@ int scoutfs_item_delete_force(struct super_block *sb,
|
||||
struct scoutfs_key *key,
|
||||
struct scoutfs_lock *lock);
|
||||
|
||||
u64 scoutfs_item_dirty_pages(struct super_block *sb);
|
||||
u64 scoutfs_item_dirty_bytes(struct super_block *sb);
|
||||
int scoutfs_item_write_dirty(struct super_block *sb);
|
||||
int scoutfs_item_write_done(struct super_block *sb);
|
||||
bool scoutfs_item_range_cached(struct super_block *sb,
|
||||
|
||||
+1
-1
@@ -1050,7 +1050,7 @@ int scoutfs_lock_inode(struct super_block *sb, enum scoutfs_lock_mode mode, int
|
||||
goto out;
|
||||
|
||||
if (flags & SCOUTFS_LKF_REFRESH_INODE) {
|
||||
ret = scoutfs_inode_refresh(inode, *lock);
|
||||
ret = scoutfs_inode_refresh(inode, *lock, flags);
|
||||
if (ret < 0) {
|
||||
scoutfs_unlock(sb, *lock, mode);
|
||||
*lock = NULL;
|
||||
|
||||
+55
-55
@@ -153,30 +153,30 @@ enum {
|
||||
*/
|
||||
static void add_client_entry(struct server_lock_node *snode,
|
||||
struct list_head *list,
|
||||
struct client_lock_entry *c_ent)
|
||||
struct client_lock_entry *clent)
|
||||
{
|
||||
WARN_ON_ONCE(!mutex_is_locked(&snode->mutex));
|
||||
|
||||
if (list_empty(&c_ent->head))
|
||||
list_add_tail(&c_ent->head, list);
|
||||
if (list_empty(&clent->head))
|
||||
list_add_tail(&clent->head, list);
|
||||
else
|
||||
list_move_tail(&c_ent->head, list);
|
||||
list_move_tail(&clent->head, list);
|
||||
|
||||
c_ent->on_list = list == &snode->granted ? OL_GRANTED :
|
||||
clent->on_list = list == &snode->granted ? OL_GRANTED :
|
||||
list == &snode->requested ? OL_REQUESTED :
|
||||
OL_INVALIDATED;
|
||||
}
|
||||
|
||||
static void free_client_entry(struct lock_server_info *inf,
|
||||
struct server_lock_node *snode,
|
||||
struct client_lock_entry *c_ent)
|
||||
struct client_lock_entry *clent)
|
||||
{
|
||||
WARN_ON_ONCE(!mutex_is_locked(&snode->mutex));
|
||||
|
||||
if (!list_empty(&c_ent->head))
|
||||
list_del_init(&c_ent->head);
|
||||
scoutfs_tseq_del(&inf->tseq_tree, &c_ent->tseq_entry);
|
||||
kfree(c_ent);
|
||||
if (!list_empty(&clent->head))
|
||||
list_del_init(&clent->head);
|
||||
scoutfs_tseq_del(&inf->tseq_tree, &clent->tseq_entry);
|
||||
kfree(clent);
|
||||
}
|
||||
|
||||
static bool invalid_mode(u8 mode)
|
||||
@@ -339,13 +339,13 @@ static struct client_lock_entry *find_entry(struct server_lock_node *snode,
|
||||
struct list_head *list,
|
||||
u64 rid)
|
||||
{
|
||||
struct client_lock_entry *c_ent;
|
||||
struct client_lock_entry *clent;
|
||||
|
||||
WARN_ON_ONCE(!mutex_is_locked(&snode->mutex));
|
||||
|
||||
list_for_each_entry(c_ent, list, head) {
|
||||
if (c_ent->rid == rid)
|
||||
return c_ent;
|
||||
list_for_each_entry(clent, list, head) {
|
||||
if (clent->rid == rid)
|
||||
return clent;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
@@ -364,7 +364,7 @@ int scoutfs_lock_server_request(struct super_block *sb, u64 rid,
|
||||
u64 net_id, struct scoutfs_net_lock *nl)
|
||||
{
|
||||
DECLARE_LOCK_SERVER_INFO(sb, inf);
|
||||
struct client_lock_entry *c_ent;
|
||||
struct client_lock_entry *clent;
|
||||
struct server_lock_node *snode;
|
||||
int ret;
|
||||
|
||||
@@ -376,29 +376,29 @@ int scoutfs_lock_server_request(struct super_block *sb, u64 rid,
|
||||
goto out;
|
||||
}
|
||||
|
||||
c_ent = kzalloc(sizeof(struct client_lock_entry), GFP_NOFS);
|
||||
if (!c_ent) {
|
||||
clent = kzalloc(sizeof(struct client_lock_entry), GFP_NOFS);
|
||||
if (!clent) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
INIT_LIST_HEAD(&c_ent->head);
|
||||
c_ent->rid = rid;
|
||||
c_ent->net_id = net_id;
|
||||
c_ent->mode = nl->new_mode;
|
||||
INIT_LIST_HEAD(&clent->head);
|
||||
clent->rid = rid;
|
||||
clent->net_id = net_id;
|
||||
clent->mode = nl->new_mode;
|
||||
|
||||
snode = alloc_server_lock(inf, &nl->key);
|
||||
if (snode == NULL) {
|
||||
kfree(c_ent);
|
||||
kfree(clent);
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
snode->stats[SLT_REQUEST]++;
|
||||
|
||||
c_ent->snode = snode;
|
||||
add_client_entry(snode, &snode->requested, c_ent);
|
||||
scoutfs_tseq_add(&inf->tseq_tree, &c_ent->tseq_entry);
|
||||
clent->snode = snode;
|
||||
add_client_entry(snode, &snode->requested, clent);
|
||||
scoutfs_tseq_add(&inf->tseq_tree, &clent->tseq_entry);
|
||||
|
||||
ret = process_waiting_requests(sb, snode);
|
||||
out:
|
||||
@@ -417,7 +417,7 @@ int scoutfs_lock_server_response(struct super_block *sb, u64 rid,
|
||||
struct scoutfs_net_lock *nl)
|
||||
{
|
||||
DECLARE_LOCK_SERVER_INFO(sb, inf);
|
||||
struct client_lock_entry *c_ent;
|
||||
struct client_lock_entry *clent;
|
||||
struct server_lock_node *snode;
|
||||
int ret;
|
||||
|
||||
@@ -438,18 +438,18 @@ int scoutfs_lock_server_response(struct super_block *sb, u64 rid,
|
||||
|
||||
snode->stats[SLT_RESPONSE]++;
|
||||
|
||||
c_ent = find_entry(snode, &snode->invalidated, rid);
|
||||
if (!c_ent) {
|
||||
clent = find_entry(snode, &snode->invalidated, rid);
|
||||
if (!clent) {
|
||||
put_server_lock(inf, snode);
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (nl->new_mode == SCOUTFS_LOCK_NULL) {
|
||||
free_client_entry(inf, snode, c_ent);
|
||||
free_client_entry(inf, snode, clent);
|
||||
} else {
|
||||
c_ent->mode = nl->new_mode;
|
||||
add_client_entry(snode, &snode->granted, c_ent);
|
||||
clent->mode = nl->new_mode;
|
||||
add_client_entry(snode, &snode->granted, clent);
|
||||
}
|
||||
|
||||
ret = process_waiting_requests(sb, snode);
|
||||
@@ -632,7 +632,7 @@ int scoutfs_lock_server_recover_response(struct super_block *sb, u64 rid,
|
||||
{
|
||||
DECLARE_LOCK_SERVER_INFO(sb, inf);
|
||||
struct client_lock_entry *existing;
|
||||
struct client_lock_entry *c_ent;
|
||||
struct client_lock_entry *clent;
|
||||
struct server_lock_node *snode;
|
||||
struct scoutfs_key key;
|
||||
int ret = 0;
|
||||
@@ -652,35 +652,35 @@ int scoutfs_lock_server_recover_response(struct super_block *sb, u64 rid,
|
||||
}
|
||||
|
||||
for (i = 0; i < le16_to_cpu(nlr->nr); i++) {
|
||||
c_ent = kzalloc(sizeof(struct client_lock_entry), GFP_NOFS);
|
||||
if (!c_ent) {
|
||||
clent = kzalloc(sizeof(struct client_lock_entry), GFP_NOFS);
|
||||
if (!clent) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
INIT_LIST_HEAD(&c_ent->head);
|
||||
c_ent->rid = rid;
|
||||
c_ent->net_id = 0;
|
||||
c_ent->mode = nlr->locks[i].new_mode;
|
||||
INIT_LIST_HEAD(&clent->head);
|
||||
clent->rid = rid;
|
||||
clent->net_id = 0;
|
||||
clent->mode = nlr->locks[i].new_mode;
|
||||
|
||||
snode = alloc_server_lock(inf, &nlr->locks[i].key);
|
||||
if (snode == NULL) {
|
||||
kfree(c_ent);
|
||||
kfree(clent);
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
existing = find_entry(snode, &snode->granted, rid);
|
||||
if (existing) {
|
||||
kfree(c_ent);
|
||||
kfree(clent);
|
||||
put_server_lock(inf, snode);
|
||||
ret = -EEXIST;
|
||||
goto out;
|
||||
}
|
||||
|
||||
c_ent->snode = snode;
|
||||
add_client_entry(snode, &snode->granted, c_ent);
|
||||
scoutfs_tseq_add(&inf->tseq_tree, &c_ent->tseq_entry);
|
||||
clent->snode = snode;
|
||||
add_client_entry(snode, &snode->granted, clent);
|
||||
scoutfs_tseq_add(&inf->tseq_tree, &clent->tseq_entry);
|
||||
|
||||
put_server_lock(inf, snode);
|
||||
|
||||
@@ -707,7 +707,7 @@ out:
|
||||
int scoutfs_lock_server_farewell(struct super_block *sb, u64 rid)
|
||||
{
|
||||
DECLARE_LOCK_SERVER_INFO(sb, inf);
|
||||
struct client_lock_entry *c_ent;
|
||||
struct client_lock_entry *clent;
|
||||
struct client_lock_entry *tmp;
|
||||
struct server_lock_node *snode;
|
||||
struct scoutfs_key key;
|
||||
@@ -724,9 +724,9 @@ int scoutfs_lock_server_farewell(struct super_block *sb, u64 rid)
|
||||
(list == &snode->requested) ? &snode->invalidated :
|
||||
NULL) {
|
||||
|
||||
list_for_each_entry_safe(c_ent, tmp, list, head) {
|
||||
if (c_ent->rid == rid) {
|
||||
free_client_entry(inf, snode, c_ent);
|
||||
list_for_each_entry_safe(clent, tmp, list, head) {
|
||||
if (clent->rid == rid) {
|
||||
free_client_entry(inf, snode, clent);
|
||||
freed = true;
|
||||
}
|
||||
}
|
||||
@@ -787,15 +787,15 @@ static char *lock_on_list_string(u8 on_list)
|
||||
static void lock_server_tseq_show(struct seq_file *m,
|
||||
struct scoutfs_tseq_entry *ent)
|
||||
{
|
||||
struct client_lock_entry *c_ent = container_of(ent,
|
||||
struct client_lock_entry *clent = container_of(ent,
|
||||
struct client_lock_entry,
|
||||
tseq_entry);
|
||||
struct server_lock_node *snode = c_ent->snode;
|
||||
struct server_lock_node *snode = clent->snode;
|
||||
|
||||
seq_printf(m, SK_FMT" %s %s rid %016llx net_id %llu\n",
|
||||
SK_ARG(&snode->key), lock_mode_string(c_ent->mode),
|
||||
lock_on_list_string(c_ent->on_list), c_ent->rid,
|
||||
c_ent->net_id);
|
||||
SK_ARG(&snode->key), lock_mode_string(clent->mode),
|
||||
lock_on_list_string(clent->on_list), clent->rid,
|
||||
clent->net_id);
|
||||
}
|
||||
|
||||
static void stats_tseq_show(struct seq_file *m, struct scoutfs_tseq_entry *ent)
|
||||
@@ -857,7 +857,7 @@ void scoutfs_lock_server_destroy(struct super_block *sb)
|
||||
DECLARE_LOCK_SERVER_INFO(sb, inf);
|
||||
struct server_lock_node *snode;
|
||||
struct server_lock_node *stmp;
|
||||
struct client_lock_entry *c_ent;
|
||||
struct client_lock_entry *clent;
|
||||
struct client_lock_entry *ctmp;
|
||||
LIST_HEAD(list);
|
||||
|
||||
@@ -873,8 +873,8 @@ void scoutfs_lock_server_destroy(struct super_block *sb)
|
||||
list_splice_init(&snode->invalidated, &list);
|
||||
|
||||
mutex_lock(&snode->mutex);
|
||||
list_for_each_entry_safe(c_ent, ctmp, &list, head) {
|
||||
free_client_entry(inf, snode, c_ent);
|
||||
list_for_each_entry_safe(clent, ctmp, &list, head) {
|
||||
free_client_entry(inf, snode, clent);
|
||||
}
|
||||
mutex_unlock(&snode->mutex);
|
||||
|
||||
|
||||
@@ -1772,6 +1772,23 @@ int scoutfs_net_response_node(struct super_block *sb,
|
||||
NULL, NULL, NULL);
|
||||
}
|
||||
|
||||
/*
|
||||
* The response function that was submitted with the request is not
|
||||
* called if the request is canceled here.
*
* Looks up a request on @conn with find_request(conn, cmd, id) and, if
* one is returned, hands it to complete_send().  Both steps run with
* conn->lock held.  If nothing is found the call does nothing.  There
* is no return value, so the caller is not told whether a request was
* found.  @sb is not referenced in the body.
|
||||
*/
|
||||
void scoutfs_net_cancel_request(struct super_block *sb,
|
||||
struct scoutfs_net_connection *conn,
|
||||
u8 cmd, u64 id)
|
||||
{
|
||||
struct message_send *msend;
|
||||
|
||||
/* the lookup and the completion happen under the same hold of conn->lock */
spin_lock(&conn->lock);
|
||||
msend = find_request(conn, cmd, id);
|
||||
/* NULL means find_request() found no match for cmd/id: nothing to cancel */
if (msend)
|
||||
complete_send(conn, msend);
|
||||
spin_unlock(&conn->lock);
|
||||
}
|
||||
|
||||
struct sync_request_completion {
|
||||
struct completion comp;
|
||||
void *resp;
|
||||
|
||||
@@ -134,6 +134,9 @@ int scoutfs_net_submit_request_node(struct super_block *sb,
|
||||
u64 rid, u8 cmd, void *arg, u16 arg_len,
|
||||
scoutfs_net_response_t resp_func,
|
||||
void *resp_data, u64 *id_ret);
|
||||
void scoutfs_net_cancel_request(struct super_block *sb,
|
||||
struct scoutfs_net_connection *conn,
|
||||
u8 cmd, u64 id);
|
||||
int scoutfs_net_sync_request(struct super_block *sb,
|
||||
struct scoutfs_net_connection *conn,
|
||||
u8 cmd, void *arg, unsigned arg_len,
|
||||
|
||||
+1
-1
@@ -262,7 +262,7 @@ void scoutfs_recov_shutdown(struct super_block *sb)
|
||||
recinf->timeout_fn = NULL;
|
||||
spin_unlock(&recinf->lock);
|
||||
|
||||
list_for_each_entry_safe(pend, tmp, &list, head) {
|
||||
list_for_each_entry_safe(pend, tmp, &recinf->pending, head) {
|
||||
list_del(&pend->head);
|
||||
kfree(pend);
|
||||
}
|
||||
|
||||
@@ -403,24 +403,24 @@ TRACE_EVENT(scoutfs_sync_fs,
|
||||
);
|
||||
|
||||
TRACE_EVENT(scoutfs_trans_write_func,
|
||||
TP_PROTO(struct super_block *sb, u64 dirty_block_bytes, u64 dirty_item_pages),
|
||||
TP_PROTO(struct super_block *sb, u64 dirty_block_bytes, u64 dirty_item_bytes),
|
||||
|
||||
TP_ARGS(sb, dirty_block_bytes, dirty_item_pages),
|
||||
TP_ARGS(sb, dirty_block_bytes, dirty_item_bytes),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
SCSB_TRACE_FIELDS
|
||||
__field(__u64, dirty_block_bytes)
|
||||
__field(__u64, dirty_item_pages)
|
||||
__field(__u64, dirty_item_bytes)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
SCSB_TRACE_ASSIGN(sb);
|
||||
__entry->dirty_block_bytes = dirty_block_bytes;
|
||||
__entry->dirty_item_pages = dirty_item_pages;
|
||||
__entry->dirty_item_bytes = dirty_item_bytes;
|
||||
),
|
||||
|
||||
TP_printk(SCSBF" dirty_block_bytes %llu dirty_item_pages %llu",
|
||||
SCSB_TRACE_ARGS, __entry->dirty_block_bytes, __entry->dirty_item_pages)
|
||||
TP_printk(SCSBF" dirty_block_bytes %llu dirty_item_bytes %llu",
|
||||
SCSB_TRACE_ARGS, __entry->dirty_block_bytes, __entry->dirty_item_bytes)
|
||||
);
|
||||
|
||||
DECLARE_EVENT_CLASS(scoutfs_trans_hold_release_class,
|
||||
|
||||
+10
-27
@@ -171,7 +171,7 @@ static bool test_shutting_down(struct server_info *server)
|
||||
static void set_shutting_down(struct server_info *server, bool val)
|
||||
{
|
||||
server->shutting_down = val;
|
||||
smp_wmb();
|
||||
smp_rmb();
|
||||
}
|
||||
|
||||
static void stop_server(struct server_info *server)
|
||||
@@ -2455,14 +2455,15 @@ static int server_commit_log_merge(struct super_block *sb,
|
||||
}
|
||||
|
||||
/* find the completion's original saved request */
|
||||
ret = next_log_merge_item(sb, &super->log_merge, SCOUTFS_LOG_MERGE_REQUEST_ZONE,
|
||||
rid, le64_to_cpu(comp->seq), &orig_req, sizeof(orig_req));
|
||||
if (ret == 0 && (comp->rid != orig_req.rid || comp->seq != orig_req.seq))
|
||||
ret = -ENOENT;
|
||||
ret = next_log_merge_item(sb, &super->log_merge,
|
||||
SCOUTFS_LOG_MERGE_REQUEST_ZONE,
|
||||
rid, le64_to_cpu(comp->seq),
|
||||
&orig_req, sizeof(orig_req));
|
||||
if (WARN_ON_ONCE(ret == 0 && (comp->rid != orig_req.rid ||
|
||||
comp->seq != orig_req.seq)))
|
||||
ret = -ENOENT; /* inconsistency */
|
||||
if (ret < 0) {
|
||||
/* ENOENT is expected for resent processed completion */
|
||||
if (ret != -ENOENT)
|
||||
err_str = "finding orig request";
|
||||
err_str = "finding orig request";
|
||||
goto out;
|
||||
}
|
||||
|
||||
@@ -2532,7 +2533,7 @@ static int server_commit_log_merge(struct super_block *sb,
|
||||
out:
|
||||
mutex_unlock(&server->logs_mutex);
|
||||
|
||||
if (ret < 0 && err_str)
|
||||
if (ret < 0)
|
||||
scoutfs_err(sb, "error %d committing log merge: %s", ret, err_str);
|
||||
|
||||
err = scoutfs_server_apply_commit(sb, ret);
|
||||
@@ -3448,18 +3449,6 @@ static void farewell_worker(struct work_struct *work)
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Responses that are ready to send can be further delayed by
|
||||
* moving them back to the reqs list.
|
||||
*/
|
||||
list_for_each_entry_safe(fw, tmp, &send, entry) {
|
||||
/* finish lock recovery before destroying locks, fenced if too long */
|
||||
if (scoutfs_recov_is_pending(sb, fw->rid, SCOUTFS_RECOV_LOCKS)) {
|
||||
list_move_tail(&fw->entry, &reqs);
|
||||
quo_reqs++;
|
||||
}
|
||||
}
|
||||
|
||||
/* clean up resources for mounts before sending responses */
|
||||
list_for_each_entry_safe(fw, tmp, &send, entry) {
|
||||
ret = reclaim_rid(sb, fw->rid);
|
||||
@@ -3638,14 +3627,8 @@ static void finished_recovery(struct super_block *sb)
|
||||
|
||||
void scoutfs_server_recov_finish(struct super_block *sb, u64 rid, int which)
|
||||
{
|
||||
DECLARE_SERVER_INFO(sb, server);
|
||||
|
||||
if (scoutfs_recov_finish(sb, rid, which) > 0)
|
||||
finished_recovery(sb);
|
||||
|
||||
/* rid's farewell response might be sent after it finishes lock recov */
|
||||
if (which & SCOUTFS_RECOV_LOCKS)
|
||||
queue_farewell_work(server);
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
+1
-1
@@ -601,7 +601,7 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
|
||||
goto out;
|
||||
|
||||
/* this interruptible iget lets hung mount be aborted with ctl-c */
|
||||
inode = scoutfs_iget(sb, SCOUTFS_ROOT_INO, SCOUTFS_LKF_INTERRUPTIBLE, 0);
|
||||
inode = scoutfs_iget(sb, SCOUTFS_ROOT_INO, SCOUTFS_LKF_INTERRUPTIBLE);
|
||||
if (IS_ERR(inode)) {
|
||||
ret = PTR_ERR(inode);
|
||||
if (ret == -ERESTARTSYS)
|
||||
|
||||
+8
-6
@@ -207,7 +207,7 @@ void scoutfs_trans_write_func(struct work_struct *work)
|
||||
}
|
||||
|
||||
trace_scoutfs_trans_write_func(sb, scoutfs_block_writer_dirty_bytes(sb, &tri->wri),
|
||||
scoutfs_item_dirty_pages(sb));
|
||||
scoutfs_item_dirty_bytes(sb));
|
||||
|
||||
if (tri->deadline_expired)
|
||||
scoutfs_inc_counter(sb, trans_commit_timer);
|
||||
@@ -422,16 +422,18 @@ static void release_holders(struct super_block *sb)
|
||||
*/
|
||||
static bool commit_before_hold(struct super_block *sb, struct trans_info *tri)
|
||||
{
|
||||
u64 dirty_blocks = (scoutfs_item_dirty_bytes(sb) >> SCOUTFS_BLOCK_LG_SHIFT) + 1;
|
||||
|
||||
/*
|
||||
* In theory each dirty item page could be straddling two full
|
||||
* blocks, requiring 4 allocations for each item cache page.
|
||||
* That's much too conservative, typically many dirty item cache
|
||||
* pages that are near each other all land in one block. This
|
||||
* In theory each dirty item could be added to a full block that
|
||||
* has to split, requiring 2 meta block allocs for each dirty
|
||||
* item. That's much too conservative, typically many dirty
|
||||
* items that are near each other all land in one block. This
|
||||
* rough estimate is still so far beyond what typically happens
|
||||
* that it accounts for having to dirty parent blocks and
|
||||
* whatever dirtying is done during the transaction hold.
|
||||
*/
|
||||
if (scoutfs_alloc_meta_low(sb, &tri->alloc, scoutfs_item_dirty_pages(sb) * 2)) {
|
||||
if (scoutfs_alloc_meta_low(sb, &tri->alloc, dirty_blocks * 4)) {
|
||||
scoutfs_inc_counter(sb, trans_commit_dirty_meta_full);
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
== 60s of unmounting non-quorum clients during recovery
|
||||
+3
-4
@@ -227,9 +227,8 @@ test "$T_QUORUM" -le "$T_NR_MOUNTS" || \
|
||||
die "-q quorum mmembers must not be greater than -n mounts"
|
||||
|
||||
# top level paths
|
||||
T_TESTS=$(realpath "$(dirname $0)")
|
||||
T_KMOD=$(realpath "$T_TESTS/../kmod")
|
||||
T_UTILS=$(realpath "$T_TESTS/../utils")
|
||||
T_KMOD=$(realpath "$(dirname $0)/../kmod")
|
||||
T_UTILS=$(realpath "$T_KMOD/../utils")
|
||||
|
||||
test -d "$T_KMOD" || die "kmod/ repo dir $T_KMOD not directory"
|
||||
test -d "$T_UTILS" || die "utils/ repo dir $T_UTILS not directory"
|
||||
@@ -383,7 +382,7 @@ cmd grep . /sys/kernel/debug/tracing/options/trace_printk \
|
||||
conf="$T_RESULTS/scoutfs-fencd.conf"
|
||||
cat > $conf << EOF
|
||||
SCOUTFS_FENCED_DELAY=1
|
||||
SCOUTFS_FENCED_RUN=$T_TESTS/fenced-local-force-unmount.sh
|
||||
SCOUTFS_FENCED_RUN=$T_UTILS/fenced/local-force-unmount
|
||||
SCOUTFS_FENCED_RUN_ARGS=""
|
||||
EOF
|
||||
export SCOUTFS_FENCED_CONFIG_FILE="$conf"
|
||||
|
||||
@@ -33,7 +33,6 @@ resize-devices.sh
|
||||
fence-and-reclaim.sh
|
||||
orphan-inodes.sh
|
||||
mount-unmount-race.sh
|
||||
client-unmount-recovery.sh
|
||||
createmany-parallel-mounts.sh
|
||||
archive-light-cycle.sh
|
||||
block-stale-reads.sh
|
||||
|
||||
@@ -1,61 +0,0 @@
|
||||
#
# Unmount the server and the non-quorum clients as recovery is replaying
# to a remaining server, then check that every rid found in the server's
# lock debugfs files still belongs to a mounted client.
#

majority_nr=$(t_majority_count)
quorum_nr=$T_QUORUM

test "$quorum_nr" == "$majority_nr" && \
	t_skip "all quorum members make up majority, need more mounts to unmount"

test "$T_NR_MOUNTS" -lt "$T_QUORUM" && \
	t_skip "Need enough non-quorum clients to unmount"

# every mount starts out mounted
for i in $(t_fs_nrs); do
	mounted[$i]=1
done

LENGTH=60
echo "== ${LENGTH}s of unmounting non-quorum clients during recovery"
END=$((SECONDS + LENGTH))
while [ "$SECONDS" -lt "$END" ]; do
	sv=$(t_server_nr)
	rid=$(t_mount_rid $sv)
	echo "sv $sv rid $rid" >> "$T_TMP.log"
	sync
	t_umount $sv &

	# mounts numbered at or past the quorum count are treated as the
	# non-quorum clients; unmount them in parallel with the server
	for i in $(t_fs_nrs); do
		if [ "$i" -ge "$quorum_nr" ]; then
			t_umount $i &
			# record the background job's pid; it was previously
			# logged from a variable that was never assigned
			pid=$!
			echo "umount $i pid $pid quo $quorum_nr" \
				>> $T_TMP.log
			mounted[$i]=0
		fi
	done

	# let all the background unmounts finish
	wait

	# bring the server and every client we unmounted back
	t_mount $sv &
	for i in $(t_fs_nrs); do
		if [ "${mounted[$i]}" == 0 ]; then
			t_mount $i &
		fi
	done

	wait

	# rids of everything that is currently mounted
	declare RID_LIST=$(cat /sys/fs/scoutfs/*/rid | sort -u)
	read -a rid_arr <<< $RID_LIST

	# fifth field of each server_locks line, presumably the rid that
	# holds the lock -- confirm against the debugfs file format
	declare LOCK_LIST=$(cut -d' ' -f 5 /sys/kernel/debug/scoutfs/*/server_locks | sort -u)
	read -a lock_arr <<< $LOCK_LIST

	# fail if any value from the lock files isn't a mounted rid
	for i in "${lock_arr[@]}"; do
		if [[ ! " ${rid_arr[*]} " =~ " $i " ]]; then
			t_fail "RID($i): exists when not mounted"
		fi
	done
done

t_pass
|
||||
@@ -26,8 +26,7 @@ inode_exists()
|
||||
{
|
||||
local ino="$1"
|
||||
|
||||
scoutfs get-allocated-inos -i "$ino" -s -p "$T_M0" > $T_TMP.inos.log 2>&1
|
||||
test "$?" == 0 -a "$(head -1 $T_TMP.inos.log)" == "$ino"
|
||||
handle_cat "$T_M0" "$ino" > "$T_TMP.handle_cat.log" 2>&1
|
||||
}
|
||||
|
||||
echo "== test our inode existance function"
|
||||
|
||||
@@ -1,6 +1,3 @@
|
||||
# delay, in seconds, between each check for pending fence requests.
|
||||
SCOUTFS_FENCED_DELAY=1
|
||||
# path to executable to run to service fence request
|
||||
#SCOUTFS_FENCED_RUN=
|
||||
# arguments to pass to binary
|
||||
SCOUTFS_FENCED_RUN=/usr/libexec/scoutfs-fenced/run/local-force-unmount
|
||||
SCOUTFS_FENCED_RUN_ARGS=""
|
||||
|
||||
@@ -617,33 +617,6 @@ command is used first.
|
||||
.RE
|
||||
.PD
|
||||
|
||||
.TP
|
||||
.BI "get-allocated-inos [-i|--ino INO] [-s|--single] [-p|--path PATH]"
|
||||
.sp
|
||||
This debugging command prints allocated inode numbers. It only prints
|
||||
inodes
|
||||
found in the group that contains the starting inode. The printed inode
|
||||
numbers aren't necessarily reachable. They could be anywhere in the
|
||||
process from being unlinked to finally deleted when their items
|
||||
were found.
|
||||
.RS 1.0i
|
||||
.PD 0
|
||||
.TP
|
||||
.sp
|
||||
.B "-i, --ino INO"
|
||||
The first 64bit inode number which could be printed.
|
||||
.TP
|
||||
.B "-s, --single"
|
||||
Only print the single starting inode when it is allocated, all other allocated
|
||||
inode numbers will be ignored.
|
||||
.TP
|
||||
.B "-p, --path PATH"
|
||||
A path within a ScoutFS filesystem.
|
||||
.RE
|
||||
.PD
|
||||
|
||||
.TP
|
||||
|
||||
.SH SEE ALSO
|
||||
.BR scoutfs (5),
|
||||
.BR xattr (7),
|
||||
|
||||
@@ -55,6 +55,7 @@ install -m 755 -D src/scoutfs $RPM_BUILD_ROOT%{_sbindir}/scoutfs
|
||||
install -m 644 -D src/ioctl.h $RPM_BUILD_ROOT%{_includedir}/scoutfs/ioctl.h
|
||||
install -m 644 -D src/format.h $RPM_BUILD_ROOT%{_includedir}/scoutfs/format.h
|
||||
install -m 755 -D fenced/scoutfs-fenced $RPM_BUILD_ROOT%{_libexecdir}/scoutfs-fenced/scoutfs-fenced
|
||||
install -m 755 -D fenced/local-force-unmount $RPM_BUILD_ROOT%{_libexecdir}/scoutfs-fenced/run/local-force-unmount
|
||||
install -m 644 -D fenced/scoutfs-fenced.service $RPM_BUILD_ROOT%{_unitdir}/scoutfs-fenced.service
|
||||
install -m 644 -D fenced/scoutfs-fenced.conf.example $RPM_BUILD_ROOT%{_sysconfdir}/scoutfs/scoutfs-fenced.conf.example
|
||||
|
||||
|
||||
@@ -1,137 +0,0 @@
|
||||
#include <unistd.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdbool.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/ioctl.h>
|
||||
#include <fcntl.h>
|
||||
#include <errno.h>
|
||||
#include <string.h>
|
||||
#include <argp.h>
|
||||
|
||||
#include "sparse.h"
|
||||
#include "parse.h"
|
||||
#include "util.h"
|
||||
#include "format.h"
|
||||
#include "ioctl.h"
|
||||
#include "cmd.h"
|
||||
|
||||
/*
 * Parsed command line state for the get-allocated-inos command.  It is
 * filled in by parse_opt() and read by do_get_allocated_inos().
 */
struct get_allocated_inos_args {
|
||||
/* -p/--path: path that is opened to issue the ioctl on */
char *path;
|
||||
/* -i/--ino: starting inode number passed to the ioctl */
u64 ino;
|
||||
/* set once -i was parsed; parsing fails at the end without it */
bool have_ino;
|
||||
/* -s/--single: only report the starting inode itself */
bool single;
|
||||
};
|
||||
|
||||
static int do_get_allocated_inos(struct get_allocated_inos_args *args)
|
||||
{
|
||||
struct scoutfs_ioctl_get_allocated_inos gai;
|
||||
u64 *inos = NULL;
|
||||
int fd = -1;
|
||||
u64 bytes;
|
||||
int ret;
|
||||
int i;
|
||||
|
||||
if (args->single)
|
||||
bytes = sizeof(*inos);
|
||||
else
|
||||
bytes = SCOUTFS_LOCK_INODE_GROUP_NR * sizeof(*inos);
|
||||
|
||||
inos = malloc(bytes);
|
||||
if (!inos) {
|
||||
fprintf(stderr, "inode number array allocation failed\n");
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
fd = get_path(args->path, O_RDONLY);
|
||||
if (fd < 0)
|
||||
return fd;
|
||||
|
||||
memset(&gai, 0, sizeof(gai));
|
||||
gai.start_ino = args->ino;
|
||||
gai.inos_ptr = (unsigned long)inos;
|
||||
gai.inos_bytes = bytes;
|
||||
|
||||
ret = ioctl(fd, SCOUTFS_IOC_GET_ALLOCATED_INOS, &gai);
|
||||
if (ret < 0) {
|
||||
ret = -errno;
|
||||
fprintf(stderr, "get_allocated_inos ioctl failed: "
|
||||
"%s (%d)\n", strerror(errno), errno);
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (args->single && ret > 0 && inos[0] != args->ino)
|
||||
ret = 0;
|
||||
|
||||
for (i = 0; i < ret; i++)
|
||||
printf("%llu\n", inos[i]);
|
||||
|
||||
ret = 0;
|
||||
out:
|
||||
if (fd >= 0)
|
||||
close(fd);
|
||||
free(inos);
|
||||
|
||||
return ret;
|
||||
};
|
||||
|
||||
static int parse_opt(int key, char *arg, struct argp_state *state)
|
||||
{
|
||||
struct get_allocated_inos_args *args = state->input;
|
||||
int ret;
|
||||
|
||||
switch (key) {
|
||||
case 'i':
|
||||
ret = parse_u64(arg, &args->ino);
|
||||
if (ret)
|
||||
return ret;
|
||||
args->have_ino = true;
|
||||
case 'p':
|
||||
args->path = strdup_or_error(state, arg);
|
||||
break;
|
||||
case 's':
|
||||
args->single = true;
|
||||
break;
|
||||
case ARGP_KEY_FINI:
|
||||
if (!args->have_ino)
|
||||
argp_error(state, "must provide --ino starting inode number option");
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Options accepted by get-allocated-inos.  The keys are the characters
 * that parse_opt() switches on; the table ends with an all-zero entry.
 */
static struct argp_option options[] = {
	{
		.name = "ino",
		.key = 'i',
		.arg = "NUMBER",
		.flags = 0,
		.doc = "Start from 64bit inode number (required)",
	},
	{
		.name = "path",
		.key = 'p',
		.arg = "PATH",
		.flags = 0,
		.doc = "Path to ScoutFS filesystem",
	},
	{
		.name = "single",
		.key = 's',
		.arg = NULL,
		.flags = 0,
		.doc = "Only print single specific inode number argument",
	},
	{ .name = NULL }
};
|
||||
|
||||
static struct argp argp = {
|
||||
options,
|
||||
parse_opt,
|
||||
NULL,
|
||||
"Print allocated inode numbers from starting inode number"
|
||||
};
|
||||
|
||||
static int get_allocated_inos_cmd(int argc, char **argv)
|
||||
{
|
||||
|
||||
struct get_allocated_inos_args get_allocated_inos_args = {NULL};
|
||||
int ret;
|
||||
|
||||
ret = argp_parse(&argp, argc, argv, 0, NULL, &get_allocated_inos_args);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
return do_get_allocated_inos(&get_allocated_inos_args);
|
||||
}
|
||||
|
||||
/*
 * Register the "get-allocated-inos" command in the debug group.  The
 * constructor attribute makes this run automatically before main().
 */
static void __attribute__((constructor)) get_allocated_inos_ctor(void)
|
||||
{
|
||||
cmd_register_argp("get-allocated-inos", &argp, GROUP_DEBUG, get_allocated_inos_cmd);
|
||||
}
|
||||
@@ -278,8 +278,6 @@ static int print_log_trees_item(struct scoutfs_key *key, u64 seq, u8 flags, void
|
||||
" data_freed: "ALCROOT_F"\n"
|
||||
" srch_file: "SRF_FMT"\n"
|
||||
" inode_count_delta: %lld\n"
|
||||
" get_trans_seq: %lld\n"
|
||||
" commit_trans_seq: %lld\n"
|
||||
" max_item_seq: %llu\n"
|
||||
" finalize_seq: %llu\n"
|
||||
" rid: %016llx\n"
|
||||
@@ -298,8 +296,6 @@ static int print_log_trees_item(struct scoutfs_key *key, u64 seq, u8 flags, void
|
||||
ALCROOT_A(<->data_freed),
|
||||
SRF_A(<->srch_file),
|
||||
le64_to_cpu(lt->inode_count_delta),
|
||||
le64_to_cpu(lt->get_trans_seq),
|
||||
le64_to_cpu(lt->commit_trans_seq),
|
||||
le64_to_cpu(lt->max_item_seq),
|
||||
le64_to_cpu(lt->finalize_seq),
|
||||
le64_to_cpu(lt->rid),
|
||||
|
||||
Reference in New Issue
Block a user